nltk-3.7/.gitattributes
* text=auto

nltk-3.7/.github/workflows/ci.yaml
name: ci-workflow

# run workflow for these events
on: [push, pull_request, workflow_dispatch]

env:
  CORENLP: /home/runner/third/stanford-corenlp
  CORENLP_MODELS: /home/runner/third/stanford-corenlp
  STANFORD_PARSER: /home/runner/third/stanford-parser
  STANFORD_MODELS: /home/runner/third/stanford-postagger
  STANFORD_POSTAGGER: /home/runner/third/stanford-postagger
  SENNA: /home/runner/third/senna
  PROVER9: /home/runner/third/prover9/bin
  MEGAM: /home/runner/third/megam
  # TADM requires `libtaopetsc.so` from PETSc v2.3.3, and likely has more
  # tricky to install requirements, so we don't run tests for it.
  # TADM: /home/runner/third/tadm/bin
  MALT_PARSER: /home/runner/third/maltparser

jobs:
  pre-commit:
    name: Run pre-commit
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - uses: actions/setup-python@v2
      - uses: pre-commit/action@v2.0.0

  cache_nltk_data:
    name: Cache nltk_data
    needs: pre-commit
    strategy:
      matrix:
        os: [ubuntu-latest, windows-latest]
    runs-on: ${{ matrix.os }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v2

      - name: Cache nltk data
        uses: actions/cache@v2
        id: restore-cache
        with:
          path: ~/nltk_data
          key: nltk_data_${{ secrets.CACHE_VERSION }}

      - name: Download nltk data packages on cache miss
        run: |
          pip install regex # dependencies needed to download nltk data
          python -c "import nltk; from pathlib import Path; path = Path('~/nltk_data').expanduser(); path.mkdir(exist_ok=True); nltk.download('all', download_dir=path)"
        shell: bash
        if: steps.restore-cache.outputs.cache-hit != 'true'

  cache_third_party:
    name: Cache third party tools
    needs: pre-commit
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v2

      - name: Cache third party tools
        uses: actions/cache@v2
        id: restore-cache
        with:
          path: ~/third
          key: third_${{ secrets.CACHE_VERSION }}

      - name: Download third party data
        run: |
          chmod +x ./tools/github_actions/third-party.sh
          ./tools/github_actions/third-party.sh
        if: steps.restore-cache.outputs.cache-hit != 'true'

  test:
    name: Python ${{ matrix.python-version }} on ${{ matrix.os }}
    needs: [cache_nltk_data, cache_third_party]
    strategy:
      matrix:
        python-version: ['3.7', '3.8', '3.9', '3.10']
        os: [ubuntu-latest, macos-latest, windows-latest]
      fail-fast: false
    runs-on: ${{ matrix.os }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v2

      - name: Setup python
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}

      - name: Set up JDK 16
        uses: actions/setup-java@v1
        with:
          java-version: 16
        if: runner.os == 'Linux'

      - name: Cache dependencies
        uses: actions/cache@v2
        id: restore-cache
        with:
          path: ${{ env.pythonLocation }}
          key: python-dependencies-${{ matrix.os }}-${{ matrix.python-version }}-${{ hashFiles('requirements-ci.txt') }}-${{ env.pythonLocation }}

      - name: Install dependencies on cache miss
        run: |
          pip install --no-cache-dir --upgrade pip
          pip install --no-cache-dir --upgrade --requirement requirements-ci.txt
        if: steps.restore-cache.outputs.cache-hit != 'true'

      - name: Use cached nltk data
        uses: actions/cache@v2
        with:
          path: ~/nltk_data
          key: nltk_data_${{ secrets.CACHE_VERSION }}

      - name: Use cached third party tools
        uses: actions/cache@v2
        with:
          path: ~/third
          key: third_${{ secrets.CACHE_VERSION }}
        if: runner.os == 'Linux'

      - name: Run pytest
        shell: bash
        run: |
          pytest --numprocesses auto -rsx --doctest-modules nltk/test
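For orientation, the `test` job above boils down to a few commands. A rough local equivalent, assuming you run it from the root of an `nltk` checkout with a supported Python version (the third-party tools pointed to by the `env` block, such as CoreNLP, Senna and Prover9, are not installed here, so the handful of tests that need them may fail or be skipped):

```
pip install --upgrade pip
pip install --upgrade --requirement requirements-ci.txt
python -m nltk.downloader all    # populate ~/nltk_data, as the cache_nltk_data job does
pytest --numprocesses auto -rsx --doctest-modules nltk/test
```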
nltk-3.7/.gitignore
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*.class
*.jar
*.egg
build/
dist/
nltk.egg-info/
web/_build

# Test artifacts and coverage reports
*.tox
*.errs
.hypothesis
.noseids
.coverage*
nltk/test/*.html
nltk/test/tweets*
model.crf.tagger
brown.embedding
pylintoutput
nosetests.xml
nosetests_scrubbed.xml
coverage.xml

# editor temporary files
*.*.sw[op]
.idea
*~

# git mergetools backups
*.orig

# emacs backups
*#

# spell-check backups
*.bak

# automatically built files for website
web/api/*.rst
web/howto/*.rst

# iPython notebooks
.ipynb_checkpoints

# pyenv files
.python-version

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

#vscode
.vscode/

# Direnv
.envrc

# Mypy
.mypy_cache

nltk-3.7/.pre-commit-config.yaml
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.0.1
    hooks:
      - id: fix-byte-order-marker
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: requirements-txt-fixer
  - repo: https://github.com/asottile/pyupgrade
    rev: v2.23.3
    hooks:
      - id: pyupgrade
        args: ["--py37-plus"]
  - repo: https://github.com/ambv/black
    rev: 21.7b0
    hooks:
      - id: black
  - repo: local
    hooks:
      - id: isort
        name: isort
        entry: isort
        require_serial: true
        language: python
        language_version: python3
        types_or: [cython, pyi, python]
        args: ['--filter-files']
        minimum_pre_commit_version: '2.9.2'
        additional_dependencies: ['isort==5.10.1']

nltk-3.7/AUTHORS.md
# Natural Language Toolkit (NLTK) Authors

## Original Authors

- Steven Bird
- Edward Loper
- Ewan Klein

## Contributors

- Tom Aarsen - Rami Al-Rfou' - Mark Amery - Greg Aumann - Ivan Barria - Ingolf Becker - Yonatan Becker
- Paul Bedaride - Steven Bethard - Robert Berwick - Dan Blanchard - Nathan Bodenstab - Alexander Böhm
- Francis Bond - Paul Bone - Jordan Boyd-Graber - Daniel Blanchard - Phil Blunsom - Lars Buitinck
- Cristian Capdevila - Steve Cassidy - Chen-Fu Chiang - Dmitry Chichkov - Jinyoung Choi - Andrew Clausen
- Lucas Champollion - Graham Christensen - Trevor Cohn - David Coles - Tom Conroy - Claude Coulombe
- Lucas Cooper - Robin Cooper - Chris Crowner - James Curran - Arthur Darcet - Dariel Dato-on
- Selina Dennis - Leon Derczynski - Alexis Dimitriadis - Nikhil Dinesh - Liang Dong - David Doukhan
- Rebecca Dridan - Pablo Duboue - Long Duong - Christian Federmann - Campion Fellin - Michelle Fullwood
- Dan Garrette - Maciej Gawinecki - Jean Mark Gawron - Sumukh Ghodke - Yoav Goldberg - Michael Wayne Goodman
- Dougal Graham - Brent Gray - Simon Greenhill - Clark Grubb - Eduardo Pereira Habkost - Masato Hagiwara
- Lauri Hallila - Michael Hansen - Yurie Hara - Will Hardy - Tyler Hartley - Peter Hawkins
- Saimadhav Heblikar - Fredrik Hedman - Helder - Michael Heilman - Ofer Helman - Christopher Hench
- Bruce Hill - Amy Holland - Kristy Hollingshead - Marcus Huderle - Baden Hughes - Nancy Ide
- Rebecca Ingram - Edward Ivanovic - Thomas Jakobsen -
Nick Johnson - Eric Kafe - Piotr Kasprzyk - Angelos Katharopoulos - Sudharshan Kaushik - Chris Koenig - Mikhail Korobov - Denis Krusko - Ilia Kurenkov - Stefano Lattarini - Pierre-François Laquerre - Stefano Lattarini - Haejoong Lee - Jackson Lee - Max Leonov - Chris Liechti - Hyuckin David Lim - Tom Lippincott - Peter Ljunglöf - Alex Louden - Joseph Lynch - Nitin Madnani - Felipe Madrigal - Bjørn Mæland - Dean Malmgren - Christopher Maloof - Rob Malouf - Iker Manterola - Carl de Marcken - Mitch Marcus - Torsten Marek - Robert Marshall - Marius Mather - Duncan McGreggor - David McClosky - Xinfan Meng - Dmitrijs Milajevs - Margaret Mitchell - Tomonori Nagano - Jason Narad - Shari A’aidil Nasruddin - Lance Nathan - Morten Neergaard - David Nemeskey - Eric Nichols - Joel Nothman - Alireza Nourian - Alexander Oleynikov - Pierpaolo Pantone - Ted Pedersen - Jacob Perkins - Alberto Planas - Ondrej Platek - Alessandro Presta - Qi Liu - Martin Thorsen Ranang - Michael Recachinas - Brandon Rhodes - Joshua Ritterman - Will Roberts - Stuart Robinson - Carlos Rodriguez - Lorenzo Rubio - Alex Rudnick - Jussi Salmela - Geoffrey Sampson - Kepa Sarasola - Kevin Scannell - Nathan Schneider - Rico Sennrich - Thomas Skardal - Eric Smith - Lynn Soe - Rob Speer - Peter Spiller - Richard Sproat - Ceri Stagg - Peter Stahl - Oliver Steele - Thomas Stieglmaier - Jan Strunk - Liling Tan - Claire Taylor - Louis Tiao - Steven Tomcavage - Tiago Tresoldi - Marcus Uneson - Yu Usami - Petro Verkhogliad - Peter Wang - Zhe Wang - Charlotte Wilson - Chuck Wooters - Steven Xu - Beracah Yankama - Lei Ye (叶磊) - Patrick Ye - Geraldine Sim Wei Ying - Jason Yoder - Thomas Zieglier - 0ssifrage - ducki13 - kiwipi - lade - isnowfy - onesandzeros - pquentin - wvanlint - Álvaro Justen - bjut-hz - Sergio Oller - Will Monroe - Elijah Rippeth - Emil Manukyan - Casper Lehmann-Strøm - Andrew Giel - Tanin Na Nakorn - Linghao Zhang - Colin Carroll - Heguang Miao - Hannah Aizenman (story645) - George Berry - Adam Nelson - J Richard Snape - Alex Constantin - Tsolak Ghukasyan - Prasasto Adi - Safwan Kamarrudin - Arthur Tilley - Vilhjalmur Thorsteinsson - Jaehoon Hwang - Chintan Shah - sbagan - Zicheng Xu - Albert Au Yeung - Shenjian Zhao - Deng Wang - Ali Abdullah - Stoytcho Stoytchev - Lakhdar Benzahia - Kheireddine Abainia - Yibin Lin - Artiem Krinitsyn - Björn Mattsson - Oleg Chislov - Pavan Gururaj Joshi - Ethan Hill - Vivek Lakshmanan - Somnath Rakshit - Anlan Du - Pulkit Maloo - Brandon M. Burroughs - John Stewart - Iaroslav Tymchenko - Aleš Tamchyna - Tim Gianitsos - Philippe Partarrieu - Andrew Owen Martin - Adrian Ellis - Nat Quayle Nelson - Yanpeng Zhao - Matan Rak - Nick Ulle - Uday Krishna - Osman Zubair - Viresh Gupta - Ondřej Cífka - Iris X. Zhou - Devashish Lal - Gerhard Kremer - Nicolas Darr - Hervé Nicol - Alexandre H. T. 
Dias - Daksh Shah - Jacob Weightman - Bonifacio de Oliveira - Armins Bagrats Stepanjans
- Vassilis Palassopoulos - Ram Rachum - Or Sharir - Denali Molitor - Jacob Moorman - Cory Nezin
- Matt Chaput - Danny Sepler - Akshita Bhagia - Pratap Yadav - Hiroki Teranishi - Ruben Cartuyvels
- Dalton Pearson - Robby Horvath - Gavish Poddar - Saibo Geng - Ahmet Yildirim - Yuta Nakamura
- Adam Hawley - Panagiotis Simakis

## Others whose work we've taken and included in NLTK, but who didn't directly contribute it:

### Contributors to the Porter Stemmer

- Martin Porter
- Vivake Gupta
- Barry Wilkins
- Hiranmay Ghosh
- Chris Emerson

### Authors of snowball arabic stemmer algorithm

- Assem Chelli
- Abdelkrim Aries
- Lakhdar Benzahia

nltk-3.7/CITATION.cff
cff-version: 1.2.0
title: >-
  Natural Language ToolKit (NLTK)
message: >-
  Please cite this software using the metadata from 'preferred-citation'.
type: software
authors:
  - name: "NLTK Team"
    email: "nltk.team@gmail.com"
repository-code: "https://github.com/nltk/nltk"
url: "https://www.nltk.org"
license: Apache-2.0
keywords:
  - "NLP"
  - "CL"
  - "natural language processing"
  - "computational linguistics"
  - "parsing"
  - "tagging"
  - "tokenizing"
  - "syntax"
  - "linguistics"
  - "language"
  - "natural language"
  - "text analytics"
preferred-citation:
  title: >-
    Natural Language Processing with Python: Analyzing Text with the Natural Language Toolkit
  type: book
  authors:
    - given-names: Steven
      family-names: Bird
      orcid: https://orcid.org/0000-0003-3782-7733
    - given-names: Ewan
      family-names: Klein
      orcid: https://orcid.org/0000-0002-0520-8447
    - given-names: Edward
      family-names: Loper
  year: 2009
  publisher:
    name: "O'Reilly Media, Inc."

nltk-3.7/CONTRIBUTING.md
# Contributing to NLTK

Hi! Thanks for your interest in contributing to [NLTK](https://www.nltk.org/). :-)
You'll be joining a [long list of contributors](https://github.com/nltk/nltk/blob/develop/AUTHORS.md).
In this document we'll try to summarize everything that you need to know to do a good job.

## Code and Issues

We use [GitHub](https://www.github.com/) to host our code repositories and issues. The
[NLTK organization on GitHub](https://github.com/nltk) has many repositories, so we can better
manage issues and development. The most important are:

- [nltk/nltk](https://github.com/nltk/nltk/), the main repository with code related to the library;
- [nltk/nltk_data](https://github.com/nltk/nltk_data), repository with data related to corpora,
  taggers and other useful data that are not shipped by default with the library, which can be
  downloaded by `nltk.downloader`;
- [nltk/nltk.github.com](https://github.com/nltk/nltk.github.com), NLTK website with information
  about the library, documentation, links for downloading the NLTK Book, etc.;
- [nltk/nltk_book](https://github.com/nltk/nltk_book), source code for the NLTK Book.

## Development priorities

NLTK consists of the functionality that the Python/NLP community is motivated to contribute.
Some priority areas for development are listed in the
[NLTK Wiki](https://github.com/nltk/nltk/wiki#development).

## Git and our Branching model

### Git

We use [Git](https://git-scm.com/) as our
[version control system](https://en.wikipedia.org/wiki/Revision_control), so the best way to
contribute is to learn how to use it and put your changes on a Git repository. There's plenty of
documentation about Git -- you can start with the [Pro Git book](https://git-scm.com/book/).

### Setting up a Development Environment

To set up your local development environment for contributing to the main repository
[nltk/nltk](https://github.com/nltk/nltk/):

- Fork the [nltk/nltk](https://github.com/nltk/nltk/) repository on GitHub to your account;
- Clone your forked repository locally
  (`git clone https://github.com/<your-github-username>/nltk.git`);
- Run `cd nltk` to get to the root directory of the `nltk` code base;
- Install the dependencies (`pip install -r pip-req.txt`);
- Install the [pre-commit](https://pre-commit.com) hooks (`pre-commit install`);
- Download the datasets for running tests (`python -m nltk.downloader all`);
- Create a remote link from your local repository to the upstream `nltk/nltk` on GitHub
  (`git remote add upstream https://github.com/nltk/nltk.git`) -- you will need to use this
  `upstream` link when updating your local repository with all the latest contributions.
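Collected into one sequence, the setup steps above look roughly like this; replace `<your-github-username>` with your own GitHub account name:

```
git clone https://github.com/<your-github-username>/nltk.git
cd nltk
pip install -r pip-req.txt
pre-commit install
python -m nltk.downloader all
git remote add upstream https://github.com/nltk/nltk.git
```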
### GitHub Pull requests

We use the famous [gitflow](https://nvie.com/posts/a-successful-git-branching-model/) to manage our
branches. Summary of our git branching model (the full command sequence is collected in the sketch
after the Tips below):

- Go to the `develop` branch (`git checkout develop`);
- Get all the latest work from the upstream `nltk/nltk` repository (`git pull upstream develop`);
- Create a new branch off of `develop` with a descriptive name (for example:
  `feature/portuguese-sentiment-analysis`, `hotfix/bug-on-downloader`). You can do it by switching
  to the `develop` branch (`git checkout develop`) and then creating a new branch
  (`git checkout -b name-of-the-new-branch`);
- Do many small commits on that branch locally (`git add files-changed`,
  `git commit -m "Add some change"`);
- Run the tests to make sure nothing breaks (`tox -e py37` if you are on Python 3.7);
- Add your name to the `AUTHORS.md` file as a contributor;
- Push to your fork on GitHub (with the same name as your local branch:
  `git push origin branch-name`);
- Create a pull request using the GitHub Web interface (asking us to pull the changes from your new
  branch and merge them into our `develop` branch);
- Wait for comments.

### Tips

- Write [helpful commit messages](https://robots.thoughtbot.com/5-useful-tips-for-a-better-commit-message).
- Anything in the `develop` branch should be deployable (no failing tests).
- Never use `git add .`: it can add unwanted files;
- Avoid using `git commit -a` unless you know what you're doing;
- Check every change with `git diff` before adding it to the index (stage area) and with
  `git diff --cached` before committing;
- Make sure you add your name to our [list of contributors](https://github.com/nltk/nltk/blob/develop/AUTHORS.md);
- If you have push access to the main repository, please do not commit directly to `develop`: your
  access should be used only to accept pull requests; if you want to make a new feature, you should
  use the same process as other developers so your code will be reviewed.
- See [RELEASE-HOWTO.txt](RELEASE-HOWTO.txt) for everything you need before creating a new NLTK release.
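As referenced above, the branching model condenses to the following commands; the branch name is just the example used earlier, and `tox -e py37` assumes you are on Python 3.7:

```
git checkout develop
git pull upstream develop
git checkout -b feature/portuguese-sentiment-analysis   # descriptive example name
git add <files-changed>
git commit -m "Add some change"
tox -e py37                                             # run the tests before pushing
git push origin feature/portuguese-sentiment-analysis
```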
## Code Guidelines

- Use [PEP8](https://www.python.org/dev/peps/pep-0008/);
- Write tests for your new features (please see the "Tests" topic below);
- Always remember that [commented code is dead code](https://www.codinghorror.com/blog/2008/07/coding-without-comments.html);
- Name identifiers (variables, classes, functions, module names) with readable names
  (`x` is always wrong);
- When manipulating strings, we prefer either
  [f-string formatting](https://docs.python.org/3/tutorial/inputoutput.html#formatted-string-literals)
  (`f'{a} = {b}'`) or
  [new-style formatting](https://docs.python.org/library/string.html#format-string-syntax)
  (`'{} = {}'.format(a, b)`), instead of the old-style formatting (`'%s = %s' % (a, b)`);
- All `#TODO` comments should be turned into issues (use our
  [GitHub issue system](https://github.com/nltk/nltk/issues));
- Run all tests before pushing (just execute `tox`) so you will know if your changes broke something.

See also our [developer's guide](https://github.com/nltk/nltk/wiki/Developers-Guide).

## Tests

You should write tests for every feature you add or bug you solve in the code. Having automated
tests for every line of our code lets us make big changes without worries: there will always be
tests to verify if the changes introduced bugs or missing features. If we don't have tests we will
be blind, and every change will come with some fear of breaking something.

For a better design of your code, we recommend using a technique called
[test-driven development](https://en.wikipedia.org/wiki/Test-driven_development), where you write
your tests **before** writing the actual code that implements the desired feature.

You can use `pytest` to run your tests, no matter which type of test it is:

```
cd nltk/test
pytest util.doctest                 # doctest
pytest unit/translate/test_nist.py  # unittest
pytest                              # all tests
```

## Continuous Integration

**Deprecated:** NLTK uses [Cloudbees](https://nltk.ci.cloudbees.com/) for continuous integration.

**Deprecated:** NLTK uses [Travis](https://travis-ci.org/nltk/nltk/) for continuous integration.

NLTK uses [GitHub Actions](https://github.com/nltk/nltk/actions) for continuous integration.
See [here](https://docs.github.com/en/actions) for GitHub's documentation.

The [`.github/workflows/ci.yaml`](https://github.com/nltk/nltk/blob/develop/.github/workflows/ci.yaml) file configures the CI:

- `on:` section
  - ensures that this CI is run on code pushes, pull requests, or through the GitHub website via a button.
- The `cache_nltk_data` job
  - performs these steps:
    - Downloads the `nltk` source code.
    - Loads `nltk_data` via cache.
    - Otherwise, downloads all the data packages through `nltk.download('all')`.
- The `test` job
  - tests against the supported Python versions (`3.7`, `3.8`, `3.9`, `3.10`).
  - tests on `ubuntu-latest`, `macos-latest` and `windows-latest`.
  - relies on the `cache_nltk_data` job to ensure that `nltk_data` is available.
  - performs these steps:
    - Downloads the `nltk` source code.
    - Sets up Python using whatever version is being checked in the current execution.
    - Loads module dependencies via cache.
    - Otherwise, installs dependencies via `pip install -U -r requirements-ci.txt`.
    - Loads the cached `nltk_data` prepared by `cache_nltk_data`.
    - Runs `pytest --numprocesses auto -rsx nltk/test`.
- The `pre-commit` job
  - performs these steps:
    - Downloads the `nltk` source code.
    - Runs pre-commit on all files in the repository. (Similar to `pre-commit run --all-files`.)
    - Fails if any hooks performed a change.
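The checks that the `pre-commit` job performs can be reproduced locally before pushing; this assumes the `pre-commit` tool is already installed (it is set up in the development-environment steps above) and uses the hooks defined in `.pre-commit-config.yaml`:

```
pre-commit install            # one-time hook setup, as in the setup steps
pre-commit run --all-files    # what the CI pre-commit job effectively runs
```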
#### To test with `tox` locally

First, set up a new virtual environment, see https://docs.python-guide.org/dev/virtualenvs/.
Then run `tox -e py37`.

For example, using `pipenv`:

```
git clone https://github.com/nltk/nltk.git
cd nltk
pipenv install -r pip-req.txt
pipenv install tox
tox -e py37
```

# Discussion

We have three mailing lists on Google Groups:

- [nltk][nltk-announce], for announcements only;
- [nltk-users][nltk-users], for general discussion and user questions;
- [nltk-dev][nltk-dev], for people interested in NLTK development.

Please feel free to contact us through the [nltk-dev][nltk-dev] mailing list if you have any
questions or suggestions. Every contribution is very welcome!

Happy hacking! (;

[nltk-announce]: https://groups.google.com/forum/#!forum/nltk
[nltk-dev]: https://groups.google.com/forum/#!forum/nltk-dev
[nltk-users]: https://groups.google.com/forum/#!forum/nltk-users

nltk-3.7/ChangeLog
Version 3.7 2022-02-09
* Improve and update the NLTK team page on nltk.org (#2855, #2941)
* Drop support for Python 3.6, support Python 3.10 (#2920)

Version 3.6.7 2021-12-28
* Resolve IndexError in `sent_tokenize` and `word_tokenize` (#2922)

Version 3.6.6 2021-12-21
* Refactor `gensim.doctest` to work for gensim 4.0.0 and up (#2914)
* Add Precision, Recall, F-measure, Confusion Matrix to Taggers (#2862)
* Added warnings if .zip files exist without any corresponding .csv files. (#2908)
* Fix `FileNotFoundError` when the `download_dir` is a non-existing nested folder (#2910)
* Rename omw to omw-1.4 (#2907)
* Resolve ReDoS opportunity by fixing incorrectly specified regex (#2906)
* Support OMW 1.4 (#2899)
* Deprecate Tree get and set node methods (#2900)
* Fix broken inaugural test case (#2903)
* Use Multilingual Wordnet Data from OMW with newer Wordnet versions (#2889)
* Keep NLTKs "tokenize" module working with pathlib (#2896)
* Make prettyprinter to be more readable (#2893)
* Update links to the nltk book (#2895)
* Add `CITATION.cff` to nltk (#2880)
* Resolve serious ReDoS in PunktSentenceTokenizer (#2869)
* Delete old CI config files (#2881)
* Improve Tokenize documentation + add TokenizerI as superclass for TweetTokenizer (#2878)
* Fix expected value for BLEU score doctest after changes from #2572
* Add multi Bleu functionality and tests (#2793)
* Deprecate 'return_str' parameter in NLTKWordTokenizer and TreebankWordTokenizer (#2883)
* Allow empty string in CFG's + more (#2888)
* Partition `tree.py` module into `tree` package + pickle fix (#2863)
* Fix several TreebankWordTokenizer and NLTKWordTokenizer bugs (#2877)
* Rewind Wordnet data file after each lookup (#2868)
* Correct __init__ call for SyntaxCorpusReader subclasses (#2872)
* Documentation fixes (#2873)
* Fix levenstein distance for duplicated letters (#2849)
* Support alternative Wordnet versions (#2860)
* Remove hundreds of formatting warnings for nltk.org (#2859)
* Modernize `nltk.org/howto` pages (#2856)
* Fix Bleu Score smoothing function from taking log(0) (#2839)
* Update third party tools to newer versions and removing MaltParser fixed version (#2832)
* Fix TypeError: _pretty() takes 1 positional argument but 2 were given in sem/drt.py (#2854)
* Replace `http` with `https` in most URLs (#2852)

Thanks to the following contributors to 3.6.6
Adam Hawley, BatMrE, Danny Sepler, Eric Kafe, Gavish Poddar, Panagiotis Simakis, RnDevelover,
Robby Horvath, Tom Aarsen, Yuta Nakamura, Mohaned Mashaly

Version 3.6.5 2021-10-11
* modernised nltk.org website *
addressed LGTM.com issues * support ZWJ sequences emoji and skin tone modifer emoji in TweetTokenizer * METEOR evaluation now requires pre-tokenized input * Code linting and type hinting * implement get_refs function for DrtLambdaExpression * Enable automated CoreNLP, Senna, Prover9/Mace4, Megam, MaltParser CI tests * specify minimum regex version that supports regex.Pattern * avoid re.Pattern and regex.Pattern which fail for Python 3.6, 3.7 Thanks to the following contributors to 3.6.5 Tom Aarsen, Saibo Geng, Mohaned Mashaly, Dimitri Papadopoulos, Danny Sepler, Ahmet Yildirim, RnDevelover, yutanakamura Version 3.6.4 2021-10-01 * deprecate `nltk.usage(obj)` in favor of `help(obj)` * resolve ReDoS vulnerability in Corpus Reader * solidify performance tests * improve phone number recognition in tweet tokenizer * refactored CISTEM stemmer for German * identify NLTK Team as the author * replace travis badge with github actions badge * add SECURITY.md Thanks to the following contributors to 3.6.4 Tom Aarsen, Mohaned Mashaly, Dimitri Papadopoulos Orfanos, purificant, Danny Sepler Version 3.6.3 2021-09-19 * Dropped support for Python 3.5 * Run CI tests on Windows, too * Moved from Travis CI to GitHub Actions * Code and comment cleanups * Visualize WordNet relation graphs using Graphviz * Fixed large error in METEOR score * Apply isort, pyupgrade, black, added as pre-commit hooks * Prevent debug_decisions in Punkt from throwing IndexError * Resolved ZeroDivisionError in RIBES with dissimilar sentences * Initialize WordNet IC total counts with smoothing value * Fixed AttributeError for Arabic ARLSTem2 stemmer * Many fixes and improvements to lm language model package * Fix bug in nltk.metrics.aline, C_skip = -10 * Improvements to TweetTokenizer * Optional show arg for FreqDist.plot, ConditionalFreqDist.plot * edit_distance now computes Damerau-Levenshtein edit-distance Thanks to the following contributors to 3.6.3 Tom Aarsen, Abhijnan Bajpai, Michael Wayne Goodman, Michał Górny, Maarten ter Huurne, Manu Joseph, Eric Kafe, Ilia Kurenkov, Daniel Loney, Rob Malouf, Mohaned Mashaly, purificant, Danny Sepler, Anthony Sottile Version 3.6.2 2021-04-20 * move test code to nltk/test * clean up some doctests * fix bug in NgramAssocMeasures (order preserving fix) * fixes for compatibility with Pypy 7.3.4 Thanks to the following contributors to 3.6.2 Ruben Cartuyvels, Rob Malouf, Dalton Pearson, Danny Sepler Version 3.6 2021-04-07 * add support for Python 3.9 * add Tree.fromlist * compute Minimum Spanning Tree of unweighted graph using BFS * fix bug with infinite loop in Wordnet closure and tree * fix bug in calculating BLEU using smoothing method 4 * Wordnet synset similarities work for all pos * new Arabic light stemmer (ARLSTem2) * new syllable tokenizer (LegalitySyllableTokenizer) * remove nose in favor of pytest * misc bug fixes, code cleanups, test cleanups, efficiency improvements Thanks to the following contributors to 3.6: Tom Aarsen, K Abainia, Akshita Bhagia, Andrew Bird, Thomas Bird, Tom Conroy, CubieDev, Christopher Hench, Andrew Jorgensen, Eric Kafe, Ilia Kurenkov, Yeting Li, Joseph Manu, Marius Mather, Denali Molitor, Jacob Moorman, Philippe Ombredanne, Vassilis Palassopoulos, Ram Rachum, Danny Sepler, Or Sharir, Brad Solomon, Hiroki Teranishi, Constantin Weisser, Pratap Yadav, Louis Yang Version 3.5 2020-04-13 * add support for Python 3.8 * drop support for Python 2 * create NLTK's own Tokenizer class distinct from the Treebank reference tokeniser * update Vader sentiment analyser * fix JSON 
serialization of some PoS taggers * minor improvements in grammar.CFG, Vader, pl196x corpus reader, StringTokenizer * change implementation <= and >= for FreqDist so they are partial orders * make FreqDist iterable * correctly handle Penn Treebank trees with a unlabeled branching top node. Thanks to the following contributors to 3.5: Nicolas Darr, Gerhard Kremer, Liling Tan, Christopher Hench, Alexandre Dias, Hervé Nicol, Pierpaolo Pantone, Bonifacio de Oliveira, Maciej Gawinecki, BLKSerene, hoefling, alvations, pyfisch, srhrshr Version 3.4.5 2019-08-20 * Fixed security bug in downloader: Zip slip vulnerability - for the unlikely situation where a user configures their downloader to use a compromised server https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-14751) Thanks to the following contributors to 3.4.5: Mike Salvatore Version 3.4.4 2019-07-04 * fix bug in plot function (probability.py) * add improved PanLex Swadesh corpus reader Thanks to the following contributors to 3.4.4: Devashish Lal, Liling Tan Version 3.4.3 2019-06-07 * add Text.generate() * add QuadgramAssocMeasures * add SSP to tokenizers * return confidence of best tag from AveragedPerceptron * make plot methods return Axes objects * don't require list arguments to PositiveNaiveBayesClassifier.train * fix Tree classes to work with native Python copy library * fix inconsistency for NomBank * fix random seeding in LanguageModel.generate * fix ConditionalFreqDist mutation on tabulate/plot call * fix broken links in documentation * fix misc Wordnet issues * update installation instructions Thanks to the following contributors to 3.4.3: alvations, Bharat123rox, cifkao, drewmiller, free-variation, henchc irisxzhou, nick-ulle, ppartarr, simonepri, yigitsever, zhaoyanpeng Version 3.4.1 2019-04-17 * add chomsky_normal_form for CFGs * add meteor score * add minimum edit/Levenshtein distance based alignment function * allow access to collocation list via text.collocation_list() * support corenlp server options * drop support for Python 3.4 * other minor fixes Thanks to the following contributors to 3.4.1: Adrian Ellis, Andrew Martin, Ayush Kaushal, BLKSerene, Bharat Raghunathan, Franklin Chen, KMiNT21 Kevin Brown, Liling Tan, Matan Rak, Nat Quayle Nelson, Osman Zubair, Purificant, Uday Krishna, Viresh Gupta Version 3.4 2018-11-17 * Support Python 3.7 * Language Modeling incl Kneser-Ney, Witten-Bell, Good-Turing * Cistem Stemmer for German * Support Russian National Corpus incl POS tag model * Decouple sentiment and twitter packages * Minor extensions for WordNet * K-alpha * Fix warning messages for corenlp * Comprehensive code cleanups * Many other minor fixes * Switch continuous integration from Jenkins to Travis Special thanks to Ilia Kurenkov (Language Model package), Liling Tan (Python 3.7, Travis-CI), and purificant (code cleanups). Thanks also to: Afshin Sadeghi, Ales Tamchyna, Alok Debnath, aquatiko, Coykto, Denis Kataev, dnc1994, Fabian Howard, Frankie Robertson, Iaroslav Tymchenko, Jayakrishna Sahit, LBenzahia, Leonie Weißweiler, Linghao Zhang, Rohit Kumar, sahitpj, Tim Gianitsos, vagrant, 53X Version 3.3 2018-05-06 * Support Python 3.6 * New interface to CoreNLP * Support synset retrieval by sense key * Minor fixes to CoNLL Corpus Reader, AlignedSent * Fixed minor inconsistencies in APIs and API documentation * Better conformance to PEP8 * Drop moses.py (incompatible license) Special thanks to Liling Tan for leading our transition to Python 3.6. 
Thanks to other contributors listed here: https://github.com/nltk/nltk/blob/develop/AUTHORS.md Version 3.2.5 2017-09-24 * Arabic stemmers (ARLSTem, Snowball) * NIST MT evaluation metric and added NIST international_tokenize * Moses tokenizer * Document Russian tagger * Fix to Stanford segmenter * Improve treebank detokenizer, VerbNet, Vader * Misc code and documentation cleanups * Implement fixes suggested by LGTM Thanks to the following contributors to 3.2.5: Ali Abdullah, Lakhdar Benzahia, Henry Elder, Campion Fellin, Tsolak Ghukasyan, Thanh Ha, Jean Helie, Nelson Liu, Nathan Schneider, Chintan Shah, Fábio Silva, Liling Tan, Ziyao Wei, Zicheng Xu, Albert Au Yeung, AbdealiJK, porqupine, sbagan, xprogramer Version 3.2.4 2017-05-21 * remove load-time dependency on Python requests library * add support for Arabic in StanfordSegmenter * fix MosesDetokenizer on irregular quote tokens Thanks to the following contributors to 3.2.4: Alex Constantin, Hatem Nassrat, Liling Tan Version 3.2.3 2017-05-16 * new interface to Stanford CoreNLP Web API * improved Lancaster stemmer with customizable rules from Whoosh * improved Treebank tokenizer * improved support for GLEU score * adopt new Abstract base class style * support custom tab files for extending WordNet * make synset_from_pos_and_offset a public method * make non-English WordNet lemma lookups case-insensitive * speed up TnT tagger * speed up FreqDist and ConditionalFreqDist * support additional quotes in TreebankWordTokenizer * clean up Tk's postscript output * drop explicit support for corpora not distributed with NLTK to streamline testing * allow iterator in perceptron tagger training * allow for curly bracket quantifiers in chunk.regexp.CHUNK_TAG_PATTERN * new corpus reader for MWA subset of PPDB * improved testing framework Thanks to the following contributors to 3.2.3: Mark Amery, Carl Bolz, Abdelhak Bougouffa, Matt Chaput, Michael Goodman, Jaehoon Hwang, Naoya Kanai, Jackson Lee, Christian Meyer, Dmitrijs Milajevs, Adam Nelson, Pierpaolo Pantone, Liling Tan, Vilhjalmur Thorsteinsson, Arthur Tilley, jmhutch, Yorwba, eromoe and others Version 3.2.2 2016-12-31 * added Kondrak's Aline algorithm * added ChrF and GLEU MT evaluation metrics * added Russian pos tagger model * added Moses detokenizer * rewrite Porter Stemmer * rewrite FrameNet corpus reader (adds frame parameter to fes(), lus(), exemplars() see https://www.nltk.org/howto/framenet.html) * updated FrameNet Corpus to version 1.7 * fixes to stanford_segmenter.py, SentiText, CoNLL Corpus Reader * fixes to BLEU, naivebayes, Krippendorff's alpha, Punkt * fixes to tests for TransitionParser, Senna, edit distance * fixes to Moses Tokenizer and Detokenizer * improved TweetTokenizer * strip trailing whitespace when splitting sentences * handle inverted exclamation mark in ToktokTokenizer * resolved some issues with Python 3.5 support * improvements to testing framework * clean up dependencies Thanks to the following contributors to 3.2.2: Prasasto Adi, Mark Amery, Geoff Bacon, George Berry, Colin Carroll, Alexis Dimitriadis, Nicholas Fabina, German Ferrero, Tsolak Ghukasyan, Hyuckin David Lim, Naoya Kanai, Greg Kondrak, Igor Korolev, Tim Leslie, Rob Malouf, Heguang Miao, Dmitrijs Milajevs, Adam Nelson, Dennis O'Brien, Qi Liu, Pierpaolo Pantone, Andy Reagan, Mike Recachinas, Nathan Schneider, Jānis Šlapiņš, Richard Snape, Liling Tan, Marcus Uneson, Linghao Zhang, drevicko, SaintNazaire Version 3.2.1 2016-04-09 * Support for CCG semantics, Stanford segmenter, VADER lexicon * Fixes to BLEU 
score calculation, CHILDES corpus reader * Other miscellaneous fixes Thanks to the following contributors to 3.2.1: Andrew Giel, Casper Lehmann-Strøm, David Madl, Tanin Na Nakorn, Guilherme Nardari, Philippe Ombredanne, Nathan Schneider, Liling Tan, Josiah Wang, venticello Version 3.2 2016-03-03 * Fixes for Python 3.5 * Code cleanups now Python 2.6 is no longer supported * Improvements to documentation * Comprehensive use of os.path for platform-specific path handling * Support for PanLex * Support for third party download locations for NLTK data * Fix bugs in IBM method 3 smoothing and BLEU calculation * Support smoothing for BLEU score and corpus-level BLEU * Support RIBES score * Improvements to TweetTokenizer * Updates for Stanford API * Add mathematical operators to ConditionalFreqDist * Fix bug in sentiwordnet for adjectives * Merged internal implementations of Trie Thanks to the following contributors to 3.2: Santiago Castro, Jihun Choi, Graham Christensen, Andrew Drozdov, Long Duong, Kyriakos Georgiou, Michael Wayne Goodman, Clark Grubb, Tah Wei Hoon, David Kamholz, Ewan Klein, Reed Loden, Rob Malouf, Philippe Ombredanne, Josh Owen, Pierpaolo Pantone, Mike Recachinas, Elijah Rippeth, Thomas Stieglmaier, Liling Tan, Philip Tzou, Pratap Vardhan. Version 3.1 2015-10-15 * Fixes for Python 3.5 (drop support for capturing groups in regexp tokenizer) * Drop support for Python 2.6 * Adopt perceptron tagger for new default POS tagger nltk.pos_tag * Stanford Neural Dependency Parser wrapper * Sentiment analysis package incl VADER * Improvements to twitter package * Multi word expression tokenizer * Support for everygram and skipgram * consistent evaluation metric interfaces, putting reference before hypothesis * new nltk.translate module, incorporating the old align module * implement stack decoder * clean up Alignment interface * CorpusReader method to support access to license and citation * Multext East Corpus and MTECorpusReader * include six module to streamline installation on MS Windows Thanks to the following contributors to 3.1: Le Tuan Anh, Petra Barancikova, Alexander Böhm, Francis Bond, Long Duong, Anna Garbar, Matthew Honnibal, Tah Wei Hoon, Ewan Klein, Rob Malouf, Dmitrijs Milajevs, Will Monroe, Sergio Oller, Pierpaolo Pantone, Jacob Perkins, Lorenzo Rubio, Thomas Stieglmaier, Liling Tan, Pratap Vardhan Version 3.0.5 2015-09-05 * rewritten IBM models, and new IBM Model 4 and 5 implementations * new Twitter package * stabilized MaltParser API * improved regex tagger * improved documentation on contributing * minor improvements to documentation and testing Thanks to the following contributors to 3.0.5: Álvaro Justen, Dmitrijs Milajevs, Ewan Klein, Heran Lin, Justin Hammar, Liling Tan, Long Duong, Lorenzo Rubio, Pierpaolo Pantone, Tah Wei Hoon Version 3.0.4 2015-07-13 * minor bug fixes and enhancements Thanks to the following contributors to 3.0.4: Nicola Bova, Santiago Castro, Len Remmerswaal, Keith Suderman, kabayan55, pln-fing-udelar (NLP Group, Instituto de Computación, Facultad de Ingeniería, Universidad de la República, Uruguay). 
Version 3.0.3 2015-06-12 * bug fixes (Stanford NER, Boxer, Snowball, treebank tokenizer, dependency graph, KneserNey, BLEU) * code clean-ups * default POS tagger permits tagset to be specified * gensim illustration * tgrep implementation * added PanLex Swadesh corpora * visualisation for aligned bitext * support for Google App Engine * POSTagger renamed StanfordPOSTagger, NERTagger renamed StanfordNERTagger Thanks to the following contributors to 3.0.3: Long Duong, Pedro Fialho, Dan Garrette, Helder, Saimadhav Heblikar, Chris Inskip, David Kamholz, Dmitrijs Milajevs, Smitha Milli, Tom Mortimer-Jones, Avital Pekker, Jonathan Pool, Sam Raker, Will Roberts, Dmitry Sadovnychyi, Nathan Schneider, Anirudh W Version 3.0.2 2015-03-13 * make pretty-printing method names consistent * improvements to Portuguese stemmer * transition-based dependency parsers * dependency graph visualisation for ipython notebook * interfaces for Senna, BLLIP, python-crfsuite * NKJP corpus reader * code clean ups, minor bug fixes Thanks to the following contributors to 3.0.2: Long Duong, Saimadhav Heblikar, Helder, Mikhail Korobov, Denis Krusko, Alex Louden, Felipe Madrigal, David McClosky, Dmitrijs Milajevs, Ondrej Platek, Nathan Schneider, Dávid Márk Nemeskey, 0ssifrage, ducki13, kiwipi. Version 3.0.1 2015-01-12 * fix setup.py for new version of setuptools Version 3.0.0 2014-09-07 * minor bugfixes * added phrase extraction code by Liling Tan and Fredrik Hedman Thanks to the following contributors to 3.0.0: Mark Amery, Ivan Barria, Ingolf Becker, Francis Bond, Lars Buitinck, Cristian Capdevila, Arthur Darcet, Michelle Fullwood, Dan Garrette, Dougal Graham, Dan Garrette, Dougal Graham, Lauri Hallila, Tyler Hartley, Fredrik Hedman, Ofer Helman, Bruce Hill, Marcus Huderle, Nancy Ide, Nick Johnson, Angelos Katharopoulos, Ewan Klein, Mikhail Korobov, Chris Liechti, Peter Ljunglof, Joseph Lynch, Haejoong Lee, Peter Ljunglöf, Dean Malmgren, Rob Malouf, Thorsten Marek, Dmitrijs Milajevs, Shari A’aidil Nasruddin, Lance Nathan, Joel Nothman, Alireza Nourian, Alexander Oleynikov, Ted Pedersen, Jacob Perkins, Will Roberts, Alex Rudnick, Nathan Schneider, Geraldine Sim Wei Ying, Lynn Soe, Liling Tan, Louis Tiao, Marcus Uneson, Yu Usami, Steven Xu, Zhe Wang, Chuck Wooters, lade, isnowfy, onesandzeros, pquentin, wvanlint Version 3.0b2 2014-08-21 * minor bugfixes and clean-ups * renamed remaining parse_ methods to read_ or load_, cf issue #656 * added Paice's method of evaluating stemming algorithms Thanks to the following contributors to 3.0.0b2: Lars Buitinck, Cristian Capdevila, Lauri Hallila, Ofer Helman, Dmitrijs Milajevs, lade, Liling Tan, Steven Xu Version 3.0.0b1 2014-07-11 * Added SentiWordNet corpus and corpus reader * Fixed support for 10-column dependency file format * Changed Tree initialization to use fromstring Thanks to the following contributors to 3.0b1: Mark Amery, Ivan Barria, Ingolf Becker, Francis Bond, Lars Buitinck, Arthur Darcet, Michelle Fullwood, Dan Garrette, Dougal Graham, Dan Garrette, Dougal Graham, Tyler Hartley, Ofer Helman, Bruce Hill, Marcus Huderle, Nancy Ide, Nick Johnson, Angelos Katharopoulos, Ewan Klein, Mikhail Korobov, Chris Liechti, Peter Ljunglof, Joseph Lynch, Haejoong Lee, Peter Ljunglöf, Dean Malmgren, Rob Malouf, Thorsten Marek, Dmitrijs Milajevs, Shari A’aidil Nasruddin, Lance Nathan, Joel Nothman, Alireza Nourian, Alexander Oleynikov, Ted Pedersen, Jacob Perkins, Will Roberts, Alex Rudnick, Nathan Schneider, Geraldine Sim Wei Ying, Lynn Soe, Liling Tan, Louis Tiao, Marcus Uneson, Yu 
Usami, Steven Xu, Zhe Wang, Chuck Wooters, isnowfy, onesandzeros, pquentin, wvanlint Version 3.0a4 2014-05-25 * IBM Models 1-3, BLEU, Gale-Church aligner * Lesk algorithm for WSD * Open Multilingual WordNet * New implementation of Brill Tagger * Extend BNCCorpusReader to parse the whole BNC * MASC Tagged Corpus and corpus reader * Interface to Stanford Parser * Code speed-ups and clean-ups * API standardisation, including fromstring method for many objects * Improved regression testing setup * Removed PyYAML dependency Thanks to the following contributors to 3.0a4: Ivan Barria, Ingolf Becker, Francis Bond, Arthur Darcet, Dan Garrette, Ofer Helman, Dougal Graham, Nancy Ide, Ewan Klein, Mikhail Korobov, Chris Liechti, Peter Ljunglof, Joseph Lynch, Rob Malouf, Thorsten Marek, Dmitrijs Milajevs, Shari A’aidil Nasruddin, Lance Nathan, Joel Nothman, Jacob Perkins, Lynn Soe, Liling Tan, Louis Tiao, Marcus Uneson, Steven Xu, Geraldine Sim Wei Ying Version 3.0a3 2013-11-02 * support for FrameNet contributed by Chuck Wooters * support for Universal Declaration of Human Rights Corpus (udhr2) * major API changes: - Tree.node -> Tree.label() / Tree.set_label() - Chunk parser: top_node -> root_label; chunk_node -> chunk_label - WordNet properties are now access methods, e.g. Synset.definition -> Synset.definition() - relextract: show_raw_rtuple() -> rtuple(), show_clause() -> clause() * bugfix in texttiling * replaced simplify_tags with support for universal tagset (simplify_tags=True -> tagset='universal') * Punkt default behavior changed to realign sentence boundaries after trailing parenthesis and quotes * deprecated classify.svm (use scikit-learn instead) * various efficiency improvements Thanks to the following contributors to 3.0a3: Lars Buitinck, Marcus Huderle, Nick Johnson, Dougal Graham, Ewan Klein, Mikhail Korobov, Haejoong Lee, Peter Ljunglöf, Dean Malmgren, Lance Nathan, Alexander Oleynikov, Nathan Schneider, Chuck Wooters, Yu Usami, Steven Xu, pquentin, wvanlint Version 3.0a2 2013-07-12 * speed improvements in word_tokenize, GAAClusterer, TnT tagger, Baum Welch, HMM tagger * small improvements in collocation finders, probability, modelling, Porter Stemmer * bugfix in lowest common hypernyn calculation (used in path similarity measures) * code cleanups, docstring cleanups, demo fixes Thanks to the following contributors to 3.0a2: Mark Amery, Lars Buitinck, Michelle Fullwood, Dan Garrette, Dougal Graham, Tyler Hartley, Bruce Hill, Angelos Katharopoulos, Mikhail Korobov, Rob Malouf, Joel Nothman, Ted Pedersen, Will Roberts, Alex Rudnick, Steven Xu, isnowfy, onesandzeros Version 3.0a1 2013-02-14 * reinstated tkinter support (Haejoong Lee) Version 3.0a0 2013-01-14 * alpha release of first version to support Python 2.6, 2.7, and 3. 
Version 2.0.4 2012-11-07 * minor bugfix (removed numpy dependency) Version 2.0.3 2012-09-24 * fixed corpus/reader/util.py to support Python 2.5 * make MaltParser safe to use in parallel * fixed bug in inter-annotator agreement * updates to various doctests (nltk/test) * minor bugfixes Thanks to the following contributors to 2.0.3: Robin Cooper, Pablo Duboue, Christian Federmann, Dan Garrette, Ewan Klein, Pierre-François Laquerre, Max Leonov, Peter Ljunglöf, Nitin Madnani, Ceri Stagg Version 2.0.2 2012-07-05 * improvements to PropBank, NomBank, and SemCor corpus readers * interface to full Penn Treebank Corpus V3 (corpus.ptb) * made wordnet.lemmas case-insensitive * more flexible padding in model.ngram * minor bugfixes and documentation enhancements * better support for automated testing Thanks to the following contributors to 2.0.2: Daniel Blanchard, Mikhail Korobov, Nitin Madnani, Duncan McGreggor, Morten Neergaard, Nathan Schneider, Rico Sennrich. Version 2.0.1 2012-05-15 * moved NLTK to GitHub: https://github.com/nltk * set up integration testing: https://jenkins.shiningpanda.com/nltk/ (Morten Neergaard) * converted documentation to Sphinx format: https://www.nltk.org/api/nltk.html * dozens of minor enhancements and bugfixes: https://github.com/nltk/nltk/commits/ * dozens of fixes for conformance with PEP-8 * dozens of fixes to ensure operation with Python 2.5 * added interface to Lin's Dependency Thesaurus (Dan Blanchard) * added interface to scikit-learn classifiers (Lars Buitinck) * added segmentation evaluation measures (David Doukhan) Thanks to the following contributors to 2.0.1 (since 2.0b9, July 2010): Rami Al-Rfou', Yonatan Becker, Steven Bethard, Daniel Blanchard, Lars Buitinck, David Coles, Lucas Cooper, David Doukhan, Dan Garrette, Masato Hagiwara, Michael Hansen, Michael Heilman, Rebecca Ingram, Sudharshan Kaushik, Mikhail Korobov, Peter Ljunglof, Nitin Madnani, Rob Malouf, Tomonori Nagano, Morten Neergaard, David Nemeskey, Joel Nothman, Jacob Perkins, Alessandro Presta, Alex Rudnick, Nathan Schneider, Stefano Lattarini, Peter Stahl, Jason Yoder Version 2.0.1 (rc1) 2011-04-11 NLTK: * added interface to the Stanford POS Tagger * updates to sem.Boxer, sem.drt.DRS * allow unicode strings in grammars * allow non-string features in classifiers * modifications to HunposTagger * issues with DRS printing * fixed bigram collocation finder for window_size > 2 * doctest paths no longer presume unix-style pathname separators * fixed issue with NLTK's tokenize module colliding with the Python tokenize module * fixed issue with stemming Unicode strings * changed ViterbiParser.nbest_parse to parse * ChaSen and KNBC Japanese corpus readers * preserve case in concordance display * fixed bug in simplification of Brown tags * a version of IBM Model 1 as described in Koehn 2010 * new class AlignedSent for aligned sentence data and evaluation metrics * new nltk.util.set_proxy to allow easy configuration of HTTP proxy * improvements to downloader user interface to catch URL and HTTP errors * added CHILDES corpus reader * created special exception hierarchy for Prover9 errors * significant changes to the underlying code of the boxer interface * path-based wordnet similarity metrics use a fake root node for verbs, following the Perl version * added ability to handle multi-sentence discourses in Boxer * added the 'english' Snowball stemmer * simplifications and corrections of Earley Chart Parser rules * several changes to the feature chart parsers for correct unification * bugfixes: FreqDist.plot, 
FreqDist.max, NgramModel.entropy, CategorizedCorpusReader, DecisionTreeClassifier * removal of Python >2.4 language features for 2.4 compatibility * removal of deprecated functions and associated warnings * added semantic domains to wordnet corpus reader * changed wordnet similarity functions to include instance hyponyms * updated to use latest version of Boxer Data: * JEITA Public Morphologically Tagged Corpus (in ChaSen format) * KNB Annotated corpus of Japanese blog posts * Fixed some minor bugs in alvey.fcfg, and added number of parse trees in alvey_sentences.txt * added more comtrans data Documentation: * minor fixes to documentation * NLTK Japanese book (chapter 12) by Masato Hagiwara NLTK-Contrib: * Viethen and Dale referring expression algorithms Version 2.0b9 2010-07-25 NLTK: * many code and documentation cleanups * Added port of Snowball stemmers * Fixed loading of pickled tokenizers (issue 556) * DecisionTreeClassifier now handles unknown features (issue 570) * Added error messages to LogicParser * Replaced max_models with end_size to prevent Mace from hanging * Added interface to Boxer * Added nltk.corpus.semcor to give access to SemCor 3.0 corpus (issue 530) * Added support for integer- and float-valued features in maxent classifiers * Permit NgramModels to be pickled * Added Sourced Strings (see test/sourcedstring.doctest for details) * Fixed bugs in with Good-Turing and Simple Good-Turing Estimation (issue 26) * Add support for span tokenization, aka standoff annotation of segmentation (incl Punkt) * allow unicode nodes in Tree.productions() * Fixed WordNet's morphy to be consistent with the original implementation, taking the shortest returned form instead of an arbitrary one (issues 427, 487) * Fixed bug in MaxentClassifier * Accepted bugfixes for YCOE corpus reader (issue 435) * Added test to _cumulative_frequencies() to correctly handle the case when no arguments are supplied * Added a TaggerI interface to the HunPos open-source tagger * Return 0, not None, when no count is present for a lemma in WordNet * fixed pretty-printing of unicode leaves * More efficient calculation of the leftcorner relation for left corner parsers * Added two functions for graph calculations: transitive closure and inversion. 
* FreqDist.pop() and FreqDist.popitems() now invalidate the caches (issue 511) Data: * Added SemCor 3.0 corpus (Brown Corpus tagged with WordNet synsets) * Added LanguageID corpus (trigram counts for 451 languages) * Added grammar for a^n b^n c^n NLTK-Contrib: * minor updates Thanks to the following contributors to 2.0b9: Steven Bethard, Francis Bond, Dmitry Chichkov, Liang Dong, Dan Garrette, Simon Greenhill, Bjorn Maeland, Rob Malouf, Joel Nothman, Jacob Perkins, Alberto Planas, Alex Rudnick, Geoffrey Sampson, Kevin Scannell, Richard Sproat Version 2.0b8 2010-02-05 NLTK: * fixed copyright and license statements * removed PyYAML, and added dependency to installers and download instructions * updated to LogicParser, DRT (Dan Garrette) * WordNet similarity metrics return None instead of -1 when they fail to find a path (Steve Bethard) * shortest_path_distance uses instance hypernyms (Jordan Boyd-Graber) * clean_html improved (Bjorn Maeland) * batch_parse, batch_interpret and batch_evaluate functions allow grammar or grammar filename as argument * more Portuguese examples (portuguese_en.doctest, examples/pt.py) NLTK-Contrib: * Aligner implementations (Christopher Crowner, Torsten Marek) * ScriptTranscriber package (Richard Sproat and Kristy Hollingshead) Book: * updates for second printing, correcting errata https://nltk.googlecode.com/svn/trunk/nltk/doc/book/errata.txt Data: * added Europarl sample, with 10 docs for each of 11 langs (Nitin Madnani) * added SMULTRON sample corpus (Torsten Marek, Martin Volk) Version 2.0b7 2009-11-09 NLTK: * minor bugfixes and enhancements: data loader, inference package, FreqDist, Punkt * added Portuguese example module, similar to nltk.book for English (examples/pt.py) * added all_lemma_names() method to WordNet corpus reader * added update() and __add__() extensions to FreqDist (enhances alignment with Python 3.0 counters) * reimplemented clean_html * added test-suite runner for automatic/manual regression testing NLTK-Data: * updated Punkt models for sentence segmentation * added corpus of the works of Machado de Assis (Brazilian Portuguese) Book: * Added translation of preface into Portuguese, contributed by Tiago Tresoldi. Version 2.0b6 2009-09-20 NLTK: * minor fixes for Python 2.4 compatibility * added words() method to XML corpus reader * minor bugfixes and code clean-ups * fixed downloader to put data in %APPDATA% on Windows Data: * Updated Punkt models * Fixed utf8 encoding issues with UDHR and Stopwords Corpora * Renamed CoNLL "cat" files to "esp" (different language) * Added Alvey NLT feature-based grammar * Added Polish PL196x corpus Version 2.0b5 2009-07-19 NLTK: * minor bugfixes (incl FreqDist, Python eggs) * added reader for Europarl Corpora (contributed by Nitin Madnani) * added reader for IPI PAN Polish Corpus (contributed by Konrad Goluchowski) * fixed data.py so that it doesn't generate a warning for Windows Python 2.6 NLTK-Contrib: * updated Praat reader (contributed by Margaret Mitchell) Version 2.0b4 2009-07-10 NLTK: * switched to Apache License, Version 2.0 * minor bugfixes in semantics and inference packages * support for Python eggs * fixed stale regression tests Data: * added NomBank 1.0 * uppercased feature names in some grammars Version 2.0b3 2009-06-25 NLTK: * several bugfixes * added nombank corpus reader (Paul Bedaride) Version 2.0b2 2009-06-15 NLTK: * minor bugfixes and optimizations for parsers, updated some doctests * added bottom-up filtered left corner parsers, LeftCornerChartParser and IncrementalLeftCornerChartParser. 
* fixed dispersion plot bug which prevented empty plots Version 2.0b1 2009-06-09 NLTK: * major refactor
of chart parser code and improved API (Peter Ljunglöf) * added new bottom-up left-corner chart parser
strategy * misc bugfixes (ChunkScore, chart rules, chatbots, jcn-similarity) * improved efficiency of
"import nltk" using lazy module imports * moved CCG package and ISRI Arabic stemmer from NLTK-Contrib
into core NLTK * misc code cleanups Contrib: * moved out of the main NLTK distribution into a separate
distribution Book: * Ongoing polishing ahead of print publication Version 0.9.9 2009-05-06 NLTK:
* Finalized API for NLTK 2.0 and the book, incl dozens of small fixes * Names of the form nltk.foo.Bar
now available as nltk.Bar for significant functionality; in some cases the name was modified (using old
names will produce a deprecation warning) * Bugfixes in downloader, WordNet * Expanded functionality in
DecisionTree * Bigram collocations extended for discontiguous bigrams * Translation toy
nltk.misc.babelfish * New module nltk.help giving access to tagset documentation * Fix imports so that
NLTK builds without Tkinter (Bjorn Maeland) Data: * new maxent NE chunker model * updated grammar
packages for the book * data for new tagsets collection, documenting several tagsets * added lolcat
translation to the Genesis collection Contrib (work in progress): * Updates to coreference package
(Joseph Frazee) * New ISRI Arabic stemmer (Hosam Algasaier) * Updates to Toolbox package (Greg Aumann)
Book: * Substantial editorial corrections ahead of final submission Version 0.9.8 2009-02-18 NLTK:
* New off-the-shelf tokenizer, POS tagger, and named-entity tagger * New metrics package with
inter-annotator agreement scores, distance metrics, rank correlation * New collocations package
(Joel Nothman) * Many clean-ups to WordNet package (Steven Bethard, Jordan Boyd-Graber) * Moved old
pywordnet-based WordNet package to nltk_contrib * WordNet browser (Paul Bone) * New interface to
dependency treebank corpora * Moved MinimalSet class into nltk.misc package * Put NLTK applications in
new nltk.app package * Many other improvements incl semantics package, toolbox, MaltParser * Misc
changes to many API names in preparation for 1.0, old names deprecated * Most classes now available in
the top-level namespace * Work on Python egg distribution (Brandon Rhodes) * Removed deprecated code
remaining from 0.8.* versions * Fixes for Python 2.4 compatibility Data: * Corrected identifiers in
Dependency Treebank corpus * Basque and Catalan Dependency Treebanks (CoNLL 2007) * PE08 Parser
Evaluation data * New models for POS tagger and named-entity tagger Book: * Substantial editorial
corrections Version 0.9.7 2008-12-19 NLTK: * fixed problems with accessing zipped corpora * improved
design and efficiency of grammars and chart parsers including new bottom-up combine strategy and a
redesigned Earley strategy (Peter Ljunglof) * fixed bugs in smoothed probability distributions and added
regression tests (Peter Ljunglof) * improvements to Punkt (Joel Nothman) * improvements to text
classifiers * simple word-overlap RTE classifier Data: * A new package of large grammars (Peter
Ljunglof) * A small gazetteer corpus and corpus reader * Organized example grammars into separate
packages * Childrens' stories added to gutenberg package Contrib (work in progress): * fixes and
demonstration for named-entity feature extractors in nltk_contrib.coref Book: * extensive changes
throughout, including new chapter 5 on classification and
substantially revised chapter 11 on managing linguistic data Version 0.9.6 2008-12-07 NLTK: * new WordNet corpus reader (contributed by Steven Bethard) * incorporated dependency parsers into NLTK (was NLTK-Contrib) (contributed by Jason Narad) * moved nltk/cfg.py to nltk/grammar.py and incorporated dependency grammars * improved efficiency of unification algorithm * various enhancements to the semantics package * added plot() and tabulate() methods to FreqDist and ConditionalFreqDist * FreqDist.keys() and list(FreqDist) provide keys reverse-sorted by value, to avoid the confusion caused by FreqDist.sorted() * new downloader module to support interactive data download: nltk.download() run using "python -m nltk.downloader all" * fixed WordNet bug that caused min_depth() to sometimes give incorrect result * added nltk.util.Index as a wrapper around defaultdict(list) plus a functional-style initializer * fixed bug in Earley chart parser that caused it to break * added basic TnT tagger nltk.tag.tnt * new corpus reader for CoNLL dependency format (contributed by Kepa Sarasola and Iker Manterola) * misc other bugfixes Contrib (work in progress): * TIGERSearch implementation by Torsten Marek * extensions to hole and glue semantics modules by Dan Garrette * new coreference package by Joseph Frazee * MapReduce interface by Xinfan Meng Data: * Corpora are stored in compressed format if this will not compromise speed of access * Swadesh Corpus of comparative wordlists in 23 languages * Split grammar collection into separate packages * New Basque and Spanish grammar samples (contributed by Kepa Sarasola and Iker Manterola) * Brown Corpus sections now have meaningful names (e.g. 'a' is now 'news') * Fixed bug that forced users to manually unzip the WordNet corpus * New dependency-parsed version of Treebank corpus sample * Added movie script "Monty Python and the Holy Grail" to webtext corpus * Replaced words corpus data with a much larger list of English words * New URL for list of available NLTK corpora https://nltk.googlecode.com/svn/trunk/nltk_data/index.xml Book: * complete rewrite of first three chapters to make the book accessible to a wider audience * new chapter on data-intensive language processing * extensive reworking of most chapters * Dropped subsection numbering; moved exercises to end of chapters Distributions: * created Portfile to support Mac installation Version 0.9.5 2008-08-27 NLTK: * text module with support for concordancing, text generation, plotting * book module * Major reworking of the automated theorem proving modules (Dan Garrette) * draw.dispersion now uses pylab * draw.concordance GUI tool * nltk.data supports for reading corpora and other data files from within zipfiles * trees can be constructed from strings with Tree(s) (cf Tree.parse(s)) Contrib (work in progress): * many updates to student projects - nltk_contrib.agreement (Thomas Lippincott) - nltk_contrib.coref (Joseph Frazee) - nltk_contrib.depparser (Jason Narad) - nltk_contrib.fuf (Petro Verkhogliad) - nltk_contrib.hadoop (Xinfan Meng) * clean-ups: deleted stale files; moved some packages to misc Data * Cleaned up Gutenberg text corpora * added Moby Dick; removed redundant copy of Blake songs. * more tagger models * renamed to nltk_data to facilitate installation * stored each corpus as a zip file for quicker installation and access, and to solve a problem with the Propbank corpus including a file with an illegal name for MSWindows (con.xml). 
Book: * changed filenames to chNN format * reworked opening chapters (work in progress) Distributions: * fixed problem with mac installer that arose when Python binary couldn't be found * removed dependency of NLTK on nltk_data so that NLTK code can be installed before the data Version 0.9.4 2008-08-01 NLTK: - Expanded semantics package for first order logic, linear logic, glue semantics, DRT, LFG (Dan Garrette) - new WordSense class in wordnet.synset supporting access to synsets from sense keys and accessing sense counts (Joel Nothman) - interface to Mallet's linear chain CRF implementation (nltk.tag.crf) - misc bugfixes incl Punkt, synsets, maxent - improved support for chunkers incl flexible chunk corpus reader, new rule type: ChunkRuleWithContext - new GUI for pos-tagged concordancing nltk.draw.pos_concordance - new GUI for developing regexp chunkers nltk.draw.rechunkparser - added bio_sents() and bio_words() methods to ConllChunkCorpusReader in conll.py to allow reading (word, tag, chunk_typ) tuples off of CoNLL-2000 corpus. Also modified ConllChunkCorpusView to support these changes. - feature structures support values with custom unification methods - new flag on tagged corpus readers to use simplified tagsets - new package for ngram language modeling with Katz backoff nltk.model - added classes for single-parented and multi-parented trees that automatically maintain parent pointers (nltk.tree.ParentedTree and nltk.tree.MultiParentedTree) - new WordNet browser GUI (Jussi Salmela, Paul Bone) - improved support for lazy sequences - added generate() method to probability distributions - more flexible parser for converting bracketed strings to trees - made fixes to docstrings to improve API documentation Contrib (work in progress) - new NLG package, FUF/SURGE (Petro Verkhogliad) - new dependency parser package (Jason Narad) - new Coreference package, incl support for ACE-2, MUC-6 and MUC-7 corpora (Joseph Frazee) - CCG Parser (Graeme Gange) - first order resolution theorem prover (Dan Garrette) Data: - Nnw NPS Chat Corpus and corpus reader (nltk.corpus.nps_chat) - ConllCorpusReader can now be used to read CoNLL 2004 and 2005 corpora. - Implemented HMM-based Treebank POS tagger and phrase chunker for nltk_contrib.coref in api.py. Pickled versions of these objects are checked in in data/taggers and data/chunkers. 
Book: - misc corrections in response to feedback from readers Version 0.9.3 2008-06-03 NLTK: - modified WordNet similarity code to use pre-built information content files - new classifier-based tagger, BNC corpus reader - improved unicode support for corpus readers - improved interfaces to Weka, Prover9/Mace4 - new support for using MEGAM and SciPy to train maxent classifiers - rewrite of Punkt sentence segmenter (Joel Nothman) - bugfixes for WordNet information content module (Jordan Boyd-Graber) - code clean-ups throughout Book: - miscellaneous fixes in response to feedback from readers Contrib: - implementation of incremental algorithm for generating referring expressions (contributed by Margaret Mitchell) - refactoring WordNet browser (Paul Bone) Corpora: - included WordNet information content files Version 0.9.2 2008-03-04 NLTK: - new theorem-prover and model-checker module nltk.inference, including interface to Prover9/Mace4 (Dan Garrette, Ewan Klein) - bugfix in Reuters corpus reader that causes Python to complain about too many open files - VerbNet and PropBank corpus readers Data: - VerbNet Corpus version 2.1: hierarchical, verb lexicon linked to WordNet - PropBank Corpus: predicate-argument structures, as stand-off annotation of Penn Treebank Contrib: - New work on WordNet browser, incorporating a client-server model (Jussi Salmela) Distributions: - Mac OS 10.5 distribution Version 0.9.1 2008-01-24 NLTK: - new interface for text categorization corpora - new corpus readers: RTE, Movie Reviews, Question Classification, Brown Corpus - bugfix in ConcatenatedCorpusView that caused iteration to fail if it didn't start from the beginning of the corpus Data: - Question classification data, included with permission of Li & Roth - Reuters 21578 Corpus, ApteMod version, from CPAN - Movie Reviews corpus (sentiment polarity), included with permission of Lillian Lee - Corpus for Recognising Textual Entailment (RTE) Challenges 1, 2 and 3 - Brown Corpus (reverted to original file structure: ca01-cr09) - Penn Treebank corpus sample (simplified implementation, new readers treebank_raw and treebank_chunk) - Minor redesign of corpus readers, to use filenames instead of "items" to identify parts of a corpus Contrib: - theorem_prover: Prover9, tableau, MaltParser, Mace4, glue semantics, docs (Dan Garrette, Ewan Klein) - drt: improved drawing, conversion to FOL (Dan Garrette) - gluesemantics: GUI demonstration, abstracted LFG code, documentation (Dan Garrette) - readability: various text readability scores (Thomas Jakobsen, Thomas Skardal) - toolbox: code to normalize toolbox databases (Greg Aumann) Book: - many improvements in early chapters in response to reader feedback - updates for revised corpus readers - moved unicode section to chapter 3 - work on engineering.txt (not included in 0.9.1) Distributions: - Fixed installation for Mac OS 10.5 (Joshua Ritterman) - Generalize doctest_driver to work with doc_contrib Version 0.9 2007-10-12 NLTK: - New naming of packages and modules, and more functions imported into top-level nltk namespace, e.g. nltk.chunk.Regexp -> nltk.RegexpParser, nltk.tokenize.Line -> nltk.LineTokenizer, nltk.stem.Porter -> nltk.PorterStemmer, nltk.parse.ShiftReduce -> nltk.ShiftReduceParser - processing class names changed from verbs to nouns, e.g. StemI -> StemmerI, ParseI -> ParserI, ChunkParseI -> ChunkParserI, ClassifyI -> ClassifierI - all tokenizers are now available as subclasses of TokenizeI, selected tokenizers are also available as functions, e.g. 
wordpunct_tokenize() - rewritten ngram tagger code, collapsed lookup tagger with unigram tagger - improved tagger API, permitting training in the initializer - new system for deprecating code so that users are notified of name changes. - support for reading feature cfgs to parallel reading cfgs (parse_featcfg()) - text classifier package, maxent (GIS, IIS), naive Bayes, decision trees, weka support - more consistent tree printing - wordnet's morphy stemmer now accessible via stemmer package - RSLP Portuguese stemmer (originally developed by Viviane Moreira Orengo, reimplemented by Tiago Tresoldi) - promoted ieer_rels.py to the sem package - improvements to WordNet package (Jussi Salmela) - more regression tests, and support for checking coverage of tests - miscellaneous bugfixes - remove numpy dependency Data: - new corpus reader implementation, refactored syntax corpus readers - new data package: corpora, grammars, tokenizers, stemmers, samples - CESS-ESP Spanish Treebank and corpus reader - CESS-CAT Catalan Treebank and corpus reader - Alpino Dutch Treebank and corpus reader - MacMorpho POS-tagged Brazilian Portuguese news text and corpus reader - trained model for Portuguese sentence segmenter - Floresta Portuguese Treebank version 7.4 and corpus reader - TIMIT player audio support Contrib: - BioReader (contributed by Carlos Rodriguez) - TnT tagger (contributed by Sam Huston) - wordnet browser (contributed by Jussi Salmela, requires wxpython) - lpath interpreter (contributed by Haejoong Lee) - timex -- regular expression-based temporal expression tagger Book: - polishing of early chapters - introductions to parts 1, 2, 3 - improvements in book processing software (xrefs, avm & gloss formatting, javascript clipboard) - updates to book organization, chapter contents - corrections throughout suggested by readers (acknowledged in preface) - more consistent use of US spelling throughout - all examples redone to work with single import statement: "import nltk" - reordered chapters: 5->7->8->9->11->12->5 * language engineering in part 1 to broaden the appeal of the earlier part of the book and to talk more about evaluation and baselines at an earlier stage * concentrate the partial and full parsing material in part 2, and remove the specialized feature-grammar material into part 3 Distributions: - streamlined mac installation (Joshua Ritterman) - included mac distribution with ISO image Version 0.8 2007-07-01 Code: - changed nltk.__init__ imports to explicitly import names from top-level modules - changed corpus.util to use the 'rb' flag for opening files, to fix problems reading corpora under MSWindows - updated stale examples in engineering.txt - extended feature structure interface to permit chained features, e.g. 
fs['F','G'] - further misc improvements to test code plus some bugfixes Tutorials: - rewritten opening section of tagging chapter - reorganized some exercises Version 0.8b2 2007-06-26 Code (major): - new corpus package, obsoleting old corpora package - supports caching, slicing, corpus search path - more flexible API - global updates so all NLTK modules use new corpus package - moved nltk/contrib to separate top-level package nltk_contrib - changed wordpunct tokenizer to use \w instead of a-zA-Z0-9 as this will be more robust for languages other than English, with implications for many corpus readers that use it - known bug: certain re-entrant structures in featstruct - known bug: when the LHS of an edge contains an ApplicationExpression, variable values in the RHS bindings aren't copied over when the fundamental rule applies - known bug: HMM tagger is broken Tutorials: - global updates to NLTK and docs - ongoing polishing Corpora: - treebank sample reverted to published multi-file structure Contrib: - DRT and Glue Semantics code (nltk_contrib.drt, nltk_contrib.gluesemantics, by Dan Garrette) Version 0.8b1 2007-06-18 Code (major): - changed package name to nltk - import all top-level modules into nltk, reducing need for import statements - reorganization of sub-package structures to simplify imports - new featstruct module, unifying old featurelite and featurestructure modules - FreqDist now inherits from dict, fd.count(sample) becomes fd[sample] - FreqDist initializer permits: fd = FreqDist(len(token) for token in text) - made numpy optional Code (minor): - changed GrammarFile initializer to accept filename - consistent tree display format - fixed loading process for WordNet and TIMIT that prevented code installation if data not installed - taken more care with unicode types - incorporated pcfg code into cfg module - moved cfg, tree, featstruct to top level - new filebroker module to make handling of example grammar files more transparent - more corpus readers (webtext, abc) - added cfg.covers() to check that a grammar covers a sentence - simple text-based wordnet browser - known bug: parse/featurechart.py uses incorrect apply() function Corpora: - csv data file to document NLTK corpora Contrib: - added Glue semantics code (contrib.glue, by Dan Garrette) - Punkt sentence segmenter port (contrib.punkt, by Willy) - added LPath interpreter (contrib.lpath, by Haejoong Lee) - extensive work on classifiers (contrib.classifier*, Sumukh Ghodke) Tutorials: - polishing on parts I, II - more illustrations, data plots, summaries, exercises - continuing to make prose more accessible to non-linguistic audience - new default import that all chapters presume: from nltk.book import * Distributions: - updated to latest version of numpy - removed WordNet installation instructions as WordNet is now included in corpus distribution - added pylab (matplotlib) Version 0.7.5 2007-05-16 Code: - improved WordNet and WordNet-Similarity interface - the Lancaster Stemmer (contributed by Steven Tomcavage) Corpora: - Web text samples - BioCreAtIvE-PPI - a corpus for protein-protein interactions - Switchboard Telephone Speech Corpus Sample (via Talkbank) - CMU Problem Reports Corpus sample - CONLL2002 POS+NER data - Patient Information Leaflet corpus - WordNet 3.0 data files - English wordlists: basic English, frequent words Tutorials: - more improvements to text and images Version 0.7.4 2007-05-01 Code: - Indian POS tagged corpus reader: corpora.indian - Sinica Treebank corpus reader: corpora.sinica_treebank - new web 
corpus reader corpora.web - tag package now supports pickling - added function to utilities.py to guess character encoding Corpora: - Rotokas texts from Stuart Robinson - POS-tagged corpora for several Indian languages (Bangla, Hindi, Marathi, Telugu) from A Kumaran Tutorials: - Substantial work on Part II of book on structured programming, parsing and grammar - More bibliographic citations - Improvements in typesetting, cross references - Redimensioned images and tables for better use of page space - Moved project list to wiki Contrib: - validation of toolbox entries using chunking - improved classifiers Distribution: - updated for Python 2.5.1, Numpy 1.0.2 Version 0.7.3 2007-04-02 * Code: - made chunk.Regexp.parse() more flexible about its input - developed new syntax for PCFG grammars, e.g. A -> B C [0.3] | D [0.7] - fixed CFG parser to support grammars with slash categories - moved beta classify package from main NLTK to contrib - Brill taggers loaded correctly - misc bugfixes * Corpora: - Shakespeare XML corpus sample and corpus reader * Tutorials: - improvements to prose, exercises, plots, images - expanded and reorganized tutorial on structured programming - formatting improvements for Python listings - improved plots (using pylab) - categorization of problems by difficulty Contrib: - more work on kimmo lexicon and grammar - more work on classifiers Version 0.7.2 2007-03-01 * Code: - simple feature detectors (detect module) - fixed problem when token generators are passed to a parser (parse package) - fixed bug in Grammar.productions() (identified by Lucas Champollion and Mitch Marcus) - fixed import bug in category.GrammarFile.earley_parser - added utilities.OrderedDict - initial port of old NLTK classifier package (by Sam Huston) - UDHR corpus reader * Corpora: - added UDHR corpus (Universal Declaration of Human Rights) with 10k text samples in 300+ languages * Tutorials: - improved images - improved book formatting, including new support for: - javascript to copy program examples to clipboard in HTML version, - bibliography, chapter cross-references, colorization, index, table-of-contents * Contrib: - new Kimmo system: contrib.mit.six863.kimmo (Rob Speer) - fixes for: contrib.fsa (Rob Speer) - demonstration of text classifiers trained on UDHR corpus for language identification: contrib.langid (Sam Huston) - new Lambek calculus system: contrib.lambek - new tree implementation based on elementtree: contrib.tree Version 0.7.1 2007-01-14 * Code: - bugfixes (HMM, WordNet) Version 0.7 2006-12-22 * Code: - bugfixes, including fixed bug in Brown corpus reader - cleaned up wordnet 2.1 interface code and similarity measures - support for full Penn treebank format contributed by Yoav Goldberg * Tutorials: - expanded tutorials on advanced parsing and structured programming - checked all doctest code - improved images for chart parsing Version 0.7b1 2006-12-06 * Code: - expanded semantic interpretation package - new high-level chunking interface, with cascaded chunking - split chunking code into new chunk package - updated wordnet package to support version 2.1 of Wordnet. - prototyped basic wordnet similarity measures (path distance, Wu + Palmer and Leacock + Chodorow, Resnik similarity measures.) 
- bugfixes (tag.Window, tag.ngram) - more doctests * Contrib: - toolbox language settings module * Tutorials: - rewrite of chunking chapter, switched from Treebank to CoNLL format as main focus, simplified evaluation framework, added ngram chunking section - substantial updates throughout (esp programming and semantics chapters) * Corpora: - Chat-80 Prolog data files provided as corpora, plus corpus reader Version 0.7a2 2006-11-13 * Code: - more doctest - code to read Chat-80 data - HMM bugfix * Tutorials: - continued updates and polishing * Corpora: - toolbox MDF sample data Version 0.7a1 2006-10-29 * Code: - new toolbox module (Greg Aumann) - new semantics package (Ewan Klein) - bugfixes * Tutorials - substantial revision, especially in preface, introduction, words, and semantics chapters. Version 0.6.6 2006-10-06 * Code: - bugfixes (probability, shoebox, draw) * Contrib: - new work on shoebox package (Stuart Robinson) * Tutorials: - continual expansion and revision, especially on introduction to programming, advanced programming and the feature-based grammar chapters. Version 0.6.5 2006-07-09 * Code: - improvements to shoebox module (Stuart Robinson, Greg Aumann) - incorporated feature-based parsing into core NLTK-Lite - corpus reader for Sinica treebank sample - new stemmer package * Contrib: - hole semantics implementation (Peter Wang) - Incorporating yaml - new work on feature structures, unification, lambda calculus - new work on shoebox package (Stuart Robinson, Greg Aumann) * Corpora: - Sinica treebank sample * Tutorials: - expanded discussion throughout, incl: left-recursion, trees, grammars, feature-based grammar, agreement, unification, PCFGs, baseline performance, exercises, improved display of trees Version 0.6.4 2006-04-20 * Code: - corpus readers for Senseval 2 and TIMIT - clusterer (ported from old NLTK) - support for cascaded chunkers - bugfix suggested by Brent Payne - new SortedDict class for regression testing * Contrib: - CombinedTagger tagger and marshalling taggers, contributed by Tiago Tresoldi * Corpora: - new: Senseval 2, TIMIT sample * Tutorials: - major revisions to programming, words, tagging, chunking, and parsing tutorials - many new exercises - formatting improvements, including colorized program examples - fixed problem with testing on training data, reported by Jason Baldridge Version 0.6.3 2006-03-09 * switch to new style classes * repair FSA model sufficiently for Kimmo module to work * port of MIT Kimmo morphological analyzer; still needs lots of code clean-up and inline docs * expanded support for shoebox format, developed with Stuart Robinson * fixed bug in indexing CFG productions, for empty right-hand-sides * efficiency improvements, suggested by Martin Ranang * replaced classeq with isinstance, for efficiency improvement, as suggested by Martin Ranang * bugfixes in chunk eval * simplified call to draw_trees * names, stopwords corpora Version 0.6.2 2006-01-29 * Peter Spiller's concordancer * Will Hardy's implementation of Penton's paradigm visualization system * corpus readers for presidential speeches * removed NLTK dependency * generalized CFG terminals to permit full range of characters * used fully qualified names in demo code, for portability * bugfixes from Yoav Goldberg, Eduardo Pereira Habkost * fixed obscure quoting bug in tree displays and conversions * simplified demo code, fixed import bug nltk-3.7/LICENSE.txt000066400000000000000000000261361420073152400141640ustar00rootroot00000000000000 Apache License Version 2.0, January 2004 
http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. 
This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
nltk-3.7/MANIFEST.in000066400000000000000000000002461420073152400140710ustar00rootroot00000000000000include *.md include *.txt include *.ini include setup.* include ChangeLog include Makefile include MANIFEST.in graft nltk graft tools graft web global-exclude *~ nltk-3.7/Makefile000066400000000000000000000035101420073152400137700ustar00rootroot00000000000000# Natural Language Toolkit: source Makefile # # Copyright (C) 2001-2022 NLTK Project # Author: Steven Bird # Edward Loper # URL: # For license information, see LICENSE.TXT PYTHON = python VERSION = $(shell $(PYTHON) -c 'import nltk; print(nltk.__version__)' | sed '/^Warning: */d') NLTK_URL = $(shell $(PYTHON) -c 'import nltk; print(nltk.__url__)' | sed '/^Warning: */d') .PHONY: all clean clean_code all: dist ######################################################################## # TESTING ######################################################################## DOCTEST_FILES = nltk/test/*.doctest DOCTEST_CODE_FILES = nltk/*.py nltk/*/*.py doctest: pytest $(DOCTEST_FILES) doctest_code: pytest $(DOCTEST_CODE_FILES) demotest: find nltk -name "*.py"\ -and -not -path *misc* \ -and -not -name brown_ic.py \ -exec echo ==== '{}' ==== \; -exec python '{}' \; ######################################################################## # DISTRIBUTIONS ######################################################################## dist: zipdist # twine only permits one source distribution #gztardist: clean_code # $(PYTHON) setup.py -q sdist --format=gztar zipdist: clean_code $(PYTHON) setup.py -q sdist --format=zip bdist_wheel windist: clean_code $(PYTHON) setup.py -q bdist --format=wininst --plat-name=win32 ######################################################################## # CLEAN ######################################################################## clean: clean_code rm -rf build web/_build iso dist api MANIFEST nltk-$(VERSION) nltk.egg-info clean_code: rm -f `find nltk -name '*.pyc'` rm -f `find nltk -name '*.pyo'` rm -f `find . -name '*~'` rm -rf `find . -name '__pycache__'` rm -f MANIFEST # regenerate manifest from MANIFEST.in nltk-3.7/README.md000066400000000000000000000033051420073152400136110ustar00rootroot00000000000000# Natural Language Toolkit (NLTK) [![PyPI](https://img.shields.io/pypi/v/nltk.svg)](https://pypi.python.org/pypi/nltk) ![CI](https://github.com/nltk/nltk/actions/workflows/ci.yaml/badge.svg?branch=develop) NLTK -- the Natural Language Toolkit -- is a suite of open source Python modules, data sets, and tutorials supporting research and development in Natural Language Processing. NLTK requires Python version 3.7, 3.8, 3.9 or 3.10. For documentation, please visit [nltk.org](https://www.nltk.org/). ## Contributing Do you want to contribute to NLTK development? Great! Please read [CONTRIBUTING.md](CONTRIBUTING.md) for more details. See also [how to contribute to NLTK](https://www.nltk.org/contribute.html). ## Donate Have you found the toolkit helpful? Please support NLTK development by donating to the project via PayPal, using the link on the NLTK homepage. ## Citing If you publish work that uses NLTK, please cite the NLTK book, as follows: Bird, Steven, Edward Loper and Ewan Klein (2009). Natural Language Processing with Python. O'Reilly Media Inc. ## Copyright Copyright (C) 2001-2022 NLTK Project For license information, see [LICENSE.txt](LICENSE.txt). [AUTHORS.md](AUTHORS.md) contains a list of everyone who has contributed to NLTK. ### Redistributing - NLTK source code is distributed under the Apache 2.0 License. 
- NLTK documentation is distributed under the Creative Commons Attribution-Noncommercial-No Derivative Works 3.0 United States license. - NLTK corpora are provided under the terms given in the README file for each corpus; all are redistributable and available for non-commercial use. - NLTK may be freely redistributed, subject to the provisions of these licenses. nltk-3.7/RELEASE-HOWTO.txt000066400000000000000000000054721420073152400150600ustar00rootroot00000000000000Building an NLTK distribution ---------------------------------- 1. Testing - Check no errors are reported in our continuous integration service: https://github.com/nltk/nltk/actions - Optionally test demonstration code locally make demotest - Optionally test individual modules: tox-3.8 -e py38 nltk.package.module - Check the data index is up-to-date: cd ../nltk_data; make; push 2. Update Version Number and ChangeLog - Update version number edit nltk/VERSION and web/conf.py (version and release) - Check web/install.rst mentions latest version of Python - Check setup.py lists correct range of Python versions - Add a new entry to the news page in nltk/web/news.rst - Update the ChangeLog (for nltk, nltk_data) git log "$(git describe --tags --abbrev=0)..HEAD" --oneline edit ChangeLog 3. Build Documentation - Check the copyright year is correct and update if necessary e.g. ./tools/global_replace.py 2001-2022 2001-2022 check web/conf.py copyright line - Check that installation instructions are up-to-date (including the range of Python versions that are supported) edit web/install.rst setup.py - Ensure that nltk_theme is installed and updated pip install -U nltk_theme - Rebuild the API docs sphinx-build -E ./web ./build - Publish them cd ../nltk.github.com; git pull (begin with current docs repo) cp -r ../nltk/build/* . git add . git commit -m "updates for version 3.X.Y" git push origin master 4. Create a new version - Tag this version: cd ../nltk git tag -a 3.X.Y -m "version 3.X.Y" git push --tags verify that it shows up here: https://github.com/nltk/nltk/releases This is important for the website, as the footer will link to the tag with the version from web/conf.py. 5. Release - Make the distributions make clean; make dist; ls dist/ - Upload the distributions python -m twine upload dist/* - Check upload https://pypi.python.org/pypi/nltk 6. Announce - Post announcement to NLTK the mailing lists: nltk-dev (for beta releases) nltk-users (for final releases) nltk twitter account 7. Optionally update repo version - we don't want builds from the repository to have the same release number e.g. after release X.Y.4, update repository version to X.Y.5a (alpha) @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ @@@ BOOK BUILD @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ The build requires docutils, pdflatex, python imaging library, epydoc, cdrtools, ImageMagick 1. Check out a clean copy of the subversion repository (or make clean) and install locally with sudo python setup.py install; make clean 2. 
make doc (slow; see doc/ for the results) and commit nltk-3.7/SECURITY.md000066400000000000000000000001501420073152400141160ustar00rootroot00000000000000# Security Policy ## Reporting a Vulnerability Please report security issues to `nltk.team@gmail.com` nltk-3.7/nltk/000077500000000000000000000000001420073152400133015ustar00rootroot00000000000000nltk-3.7/nltk/VERSION000066400000000000000000000000041420073152400143430ustar00rootroot000000000000003.7 nltk-3.7/nltk/__init__.py000066400000000000000000000140641420073152400154170ustar00rootroot00000000000000# Natural Language Toolkit (NLTK) # # Copyright (C) 2001-2022 NLTK Project # Authors: Steven Bird # Edward Loper # URL: # For license information, see LICENSE.TXT """ The Natural Language Toolkit (NLTK) is an open source Python library for Natural Language Processing. A free online book is available. (If you use the library for academic research, please cite the book.) Steven Bird, Ewan Klein, and Edward Loper (2009). Natural Language Processing with Python. O'Reilly Media Inc. https://www.nltk.org/book/ isort:skip_file """ import os # ////////////////////////////////////////////////////// # Metadata # ////////////////////////////////////////////////////// # Version. For each new release, the version number should be updated # in the file VERSION. try: # If a VERSION file exists, use it! version_file = os.path.join(os.path.dirname(__file__), "VERSION") with open(version_file) as infile: __version__ = infile.read().strip() except NameError: __version__ = "unknown (running code interactively?)" except OSError as ex: __version__ = "unknown (%s)" % ex if __doc__ is not None: # fix for the ``python -OO`` __doc__ += "\n@version: " + __version__ # Copyright notice __copyright__ = """\ Copyright (C) 2001-2022 NLTK Project. Distributed and Licensed under the Apache License, Version 2.0, which is included by reference. """ __license__ = "Apache License, Version 2.0" # Description of the toolkit, keywords, and the project's primary URL. __longdescr__ = """\ The Natural Language Toolkit (NLTK) is a Python package for natural language processing. NLTK requires Python 3.7, 3.8, 3.9 or 3.10.""" __keywords__ = [ "NLP", "CL", "natural language processing", "computational linguistics", "parsing", "tagging", "tokenizing", "syntax", "linguistics", "language", "natural language", "text analytics", ] __url__ = "https://www.nltk.org/" # Maintainer, contributors, etc. __maintainer__ = "NLTK Team" __maintainer_email__ = "nltk.team@gmail.com" __author__ = __maintainer__ __author_email__ = __maintainer_email__ # "Trove" classifiers for Python Package Index. 
__classifiers__ = [ "Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", "Intended Audience :: Education", "Intended Audience :: Information Technology", "Intended Audience :: Science/Research", "License :: OSI Approved :: Apache Software License", "Operating System :: OS Independent", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Topic :: Scientific/Engineering", "Topic :: Scientific/Engineering :: Artificial Intelligence", "Topic :: Scientific/Engineering :: Human Machine Interfaces", "Topic :: Scientific/Engineering :: Information Analysis", "Topic :: Text Processing", "Topic :: Text Processing :: Filters", "Topic :: Text Processing :: General", "Topic :: Text Processing :: Indexing", "Topic :: Text Processing :: Linguistic", ] from nltk.internals import config_java # support numpy from pypy try: import numpypy except ImportError: pass # Override missing methods on environments where it cannot be used like GAE. import subprocess if not hasattr(subprocess, "PIPE"): def _fake_PIPE(*args, **kwargs): raise NotImplementedError("subprocess.PIPE is not supported.") subprocess.PIPE = _fake_PIPE if not hasattr(subprocess, "Popen"): def _fake_Popen(*args, **kwargs): raise NotImplementedError("subprocess.Popen is not supported.") subprocess.Popen = _fake_Popen ########################################################### # TOP-LEVEL MODULES ########################################################### # Import top-level functionality into top-level namespace from nltk.collocations import * from nltk.decorators import decorator, memoize from nltk.featstruct import * from nltk.grammar import * from nltk.probability import * from nltk.text import * from nltk.util import * from nltk.jsontags import * ########################################################### # PACKAGES ########################################################### from nltk.chunk import * from nltk.classify import * from nltk.inference import * from nltk.metrics import * from nltk.parse import * from nltk.tag import * from nltk.tokenize import * from nltk.translate import * from nltk.tree import * from nltk.sem import * from nltk.stem import * # Packages which can be lazily imported # (a) we don't import * # (b) they're slow to import or have run-time dependencies # that can safely fail at run time from nltk import lazyimport app = lazyimport.LazyModule("nltk.app", locals(), globals()) chat = lazyimport.LazyModule("nltk.chat", locals(), globals()) corpus = lazyimport.LazyModule("nltk.corpus", locals(), globals()) draw = lazyimport.LazyModule("nltk.draw", locals(), globals()) toolbox = lazyimport.LazyModule("nltk.toolbox", locals(), globals()) # Optional loading try: import numpy except ImportError: pass else: from nltk import cluster from nltk.downloader import download, download_shell try: import tkinter except ImportError: pass else: try: from nltk.downloader import download_gui except RuntimeError as e: import warnings warnings.warn( "Corpus downloader GUI not loaded " "(RuntimeError during import: %s)" % str(e) ) # explicitly import all top-level modules (ensuring # they override the same names inadvertently imported # from a subpackage) from nltk import ccg, chunk, classify, collocations from nltk import data, featstruct, grammar, help, inference, metrics from nltk import misc, parse, probability, sem, stem, wsd from nltk import tag, tbl, text, tokenize, translate, tree, util # FIXME: 
override any accidentally imported demo, see https://github.com/nltk/nltk/issues/2116 def demo(): print("To run the demo code for a module, type nltk.module.demo()") nltk-3.7/nltk/app/000077500000000000000000000000001420073152400140615ustar00rootroot00000000000000nltk-3.7/nltk/app/__init__.py000066400000000000000000000027731420073152400162030ustar00rootroot00000000000000# Natural Language Toolkit: Applications package # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # Steven Bird # URL: # For license information, see LICENSE.TXT """ Interactive NLTK Applications: chartparser: Chart Parser chunkparser: Regular-Expression Chunk Parser collocations: Find collocations in text concordance: Part-of-speech concordancer nemo: Finding (and Replacing) Nemo regular expression tool rdparser: Recursive Descent Parser srparser: Shift-Reduce Parser wordnet: WordNet Browser """ # Import Tkinter-based modules if Tkinter is installed try: import tkinter except ImportError: import warnings warnings.warn("nltk.app package not loaded (please install Tkinter library).") else: from nltk.app.chartparser_app import app as chartparser from nltk.app.chunkparser_app import app as chunkparser from nltk.app.collocations_app import app as collocations from nltk.app.concordance_app import app as concordance from nltk.app.nemo_app import app as nemo from nltk.app.rdparser_app import app as rdparser from nltk.app.srparser_app import app as srparser from nltk.app.wordnet_app import app as wordnet try: from matplotlib import pylab except ImportError: import warnings warnings.warn("nltk.app.wordfreq not loaded (requires the matplotlib library).") else: from nltk.app.wordfreq_app import app as wordfreq nltk-3.7/nltk/app/chartparser_app.py000066400000000000000000002471721420073152400176260ustar00rootroot00000000000000# Natural Language Toolkit: Chart Parser Application # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # Jean Mark Gawron # Steven Bird # URL: # For license information, see LICENSE.TXT """ A graphical tool for exploring chart parsing. Chart parsing is a flexible parsing algorithm that uses a data structure called a "chart" to record hypotheses about syntactic constituents. Each hypothesis is represented by a single "edge" on the chart. A set of "chart rules" determine when new edges can be added to the chart. This set of rules controls the overall behavior of the parser (e.g. whether it parses top-down or bottom-up). The chart parsing tool demonstrates the process of parsing a single sentence, with a given grammar and lexicon. Its display is divided into three sections: the bottom section displays the chart; the middle section displays the sentence; and the top section displays the partial syntax tree corresponding to the selected edge. Buttons along the bottom of the window are used to control the execution of the algorithm. The chart parsing tool allows for flexible control of the parsing algorithm. At each step of the algorithm, you can select which rule or strategy you wish to apply. This allows you to experiment with mixing different strategies (e.g. top-down and bottom-up). You can exercise fine-grained control over the algorithm by selecting which edge you wish to apply a rule to. """ # At some point, we should rewrite this tool to use the new canvas # widget system. 
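# Illustrative sketch (a hypothetical helper; it is not called anywhere in this
# module): the GUI below steps through the same chart machinery that
# nltk.parse.chart exposes programmatically, so a non-interactive parse looks
# roughly like the following.
def _headless_chart_parsing_example():
    """A minimal sketch, assuming the ``CFG`` / ``ChartParser`` API of
    ``nltk.grammar`` and ``nltk.parse.chart``."""
    from nltk.grammar import CFG
    from nltk.parse.chart import ChartParser

    grammar = CFG.fromstring(
        """
        S -> NP VP
        NP -> Det N | 'John'
        VP -> V NP
        Det -> 'the'
        N -> 'dog'
        V -> 'saw'
        """
    )
    parser = ChartParser(grammar)
    # Each complete parse recorded in the chart is yielded as a Tree.
    for tree in parser.parse("John saw the dog".split()):
        print(tree)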
import os.path import pickle from tkinter import ( Button, Canvas, Checkbutton, Frame, IntVar, Label, Menu, Scrollbar, Tk, Toplevel, ) from tkinter.filedialog import askopenfilename, asksaveasfilename from tkinter.font import Font from tkinter.messagebox import showerror, showinfo from nltk.draw import CFGEditor, TreeSegmentWidget, tree_to_treesegment from nltk.draw.util import ( CanvasFrame, ColorizedList, EntryDialog, MutableOptionMenu, ShowText, SymbolWidget, ) from nltk.grammar import CFG, Nonterminal from nltk.parse.chart import ( BottomUpPredictCombineRule, BottomUpPredictRule, Chart, LeafEdge, LeafInitRule, SingleEdgeFundamentalRule, SteppingChartParser, TopDownInitRule, TopDownPredictRule, TreeEdge, ) from nltk.tree import Tree from nltk.util import in_idle # Known bug: ChartView doesn't handle edges generated by epsilon # productions (e.g., [Production: PP -> ]) very well. ####################################################################### # Edge List ####################################################################### class EdgeList(ColorizedList): ARROW = SymbolWidget.SYMBOLS["rightarrow"] def _init_colortags(self, textwidget, options): textwidget.tag_config("terminal", foreground="#006000") textwidget.tag_config("arrow", font="symbol", underline="0") textwidget.tag_config("dot", foreground="#000000") textwidget.tag_config( "nonterminal", foreground="blue", font=("helvetica", -12, "bold") ) def _item_repr(self, item): contents = [] contents.append(("%s\t" % item.lhs(), "nonterminal")) contents.append((self.ARROW, "arrow")) for i, elt in enumerate(item.rhs()): if i == item.dot(): contents.append((" *", "dot")) if isinstance(elt, Nonterminal): contents.append((" %s" % elt.symbol(), "nonterminal")) else: contents.append((" %r" % elt, "terminal")) if item.is_complete(): contents.append((" *", "dot")) return contents ####################################################################### # Chart Matrix View ####################################################################### class ChartMatrixView: """ A view of a chart that displays the contents of the corresponding matrix. 
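    Each cell (i, j) of the matrix corresponds to the set of edges whose span
    is (i, j): clicking a cell selects it, and ``update()`` then lists those
    edges in the edge list shown beneath the matrix.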
""" def __init__( self, parent, chart, toplevel=True, title="Chart Matrix", show_numedges=False ): self._chart = chart self._cells = [] self._marks = [] self._selected_cell = None if toplevel: self._root = Toplevel(parent) self._root.title(title) self._root.bind("", self.destroy) self._init_quit(self._root) else: self._root = Frame(parent) self._init_matrix(self._root) self._init_list(self._root) if show_numedges: self._init_numedges(self._root) else: self._numedges_label = None self._callbacks = {} self._num_edges = 0 self.draw() def _init_quit(self, root): quit = Button(root, text="Quit", command=self.destroy) quit.pack(side="bottom", expand=0, fill="none") def _init_matrix(self, root): cframe = Frame(root, border=2, relief="sunken") cframe.pack(expand=0, fill="none", padx=1, pady=3, side="top") self._canvas = Canvas(cframe, width=200, height=200, background="white") self._canvas.pack(expand=0, fill="none") def _init_numedges(self, root): self._numedges_label = Label(root, text="0 edges") self._numedges_label.pack(expand=0, fill="none", side="top") def _init_list(self, root): self._list = EdgeList(root, [], width=20, height=5) self._list.pack(side="top", expand=1, fill="both", pady=3) def cb(edge, self=self): self._fire_callbacks("select", edge) self._list.add_callback("select", cb) self._list.focus() def destroy(self, *e): if self._root is None: return try: self._root.destroy() except: pass self._root = None def set_chart(self, chart): if chart is not self._chart: self._chart = chart self._num_edges = 0 self.draw() def update(self): if self._root is None: return # Count the edges in each cell N = len(self._cells) cell_edges = [[0 for i in range(N)] for j in range(N)] for edge in self._chart: cell_edges[edge.start()][edge.end()] += 1 # Color the cells correspondingly. for i in range(N): for j in range(i, N): if cell_edges[i][j] == 0: color = "gray20" else: color = "#00{:02x}{:02x}".format( min(255, 50 + 128 * cell_edges[i][j] / 10), max(0, 128 - 128 * cell_edges[i][j] / 10), ) cell_tag = self._cells[i][j] self._canvas.itemconfig(cell_tag, fill=color) if (i, j) == self._selected_cell: self._canvas.itemconfig(cell_tag, outline="#00ffff", width=3) self._canvas.tag_raise(cell_tag) else: self._canvas.itemconfig(cell_tag, outline="black", width=1) # Update the edge list. edges = list(self._chart.select(span=self._selected_cell)) self._list.set(edges) # Update our edge count. self._num_edges = self._chart.num_edges() if self._numedges_label is not None: self._numedges_label["text"] = "%d edges" % self._num_edges def activate(self): self._canvas.itemconfig("inactivebox", state="hidden") self.update() def inactivate(self): self._canvas.itemconfig("inactivebox", state="normal") self.update() def add_callback(self, event, func): self._callbacks.setdefault(event, {})[func] = 1 def remove_callback(self, event, func=None): if func is None: del self._callbacks[event] else: try: del self._callbacks[event][func] except: pass def _fire_callbacks(self, event, *args): if event not in self._callbacks: return for cb_func in list(self._callbacks[event].keys()): cb_func(*args) def select_cell(self, i, j): if self._root is None: return # If the cell is already selected (and the chart contents # haven't changed), then do nothing. if (i, j) == self._selected_cell and self._chart.num_edges() == self._num_edges: return self._selected_cell = (i, j) self.update() # Fire the callback. 
self._fire_callbacks("select_cell", i, j) def deselect_cell(self): if self._root is None: return self._selected_cell = None self._list.set([]) self.update() def _click_cell(self, i, j): if self._selected_cell == (i, j): self.deselect_cell() else: self.select_cell(i, j) def view_edge(self, edge): self.select_cell(*edge.span()) self._list.view(edge) def mark_edge(self, edge): if self._root is None: return self.select_cell(*edge.span()) self._list.mark(edge) def unmark_edge(self, edge=None): if self._root is None: return self._list.unmark(edge) def markonly_edge(self, edge): if self._root is None: return self.select_cell(*edge.span()) self._list.markonly(edge) def draw(self): if self._root is None: return LEFT_MARGIN = BOT_MARGIN = 15 TOP_MARGIN = 5 c = self._canvas c.delete("all") N = self._chart.num_leaves() + 1 dx = (int(c["width"]) - LEFT_MARGIN) / N dy = (int(c["height"]) - TOP_MARGIN - BOT_MARGIN) / N c.delete("all") # Labels and dotted lines for i in range(N): c.create_text( LEFT_MARGIN - 2, i * dy + dy / 2 + TOP_MARGIN, text=repr(i), anchor="e" ) c.create_text( i * dx + dx / 2 + LEFT_MARGIN, N * dy + TOP_MARGIN + 1, text=repr(i), anchor="n", ) c.create_line( LEFT_MARGIN, dy * (i + 1) + TOP_MARGIN, dx * N + LEFT_MARGIN, dy * (i + 1) + TOP_MARGIN, dash=".", ) c.create_line( dx * i + LEFT_MARGIN, TOP_MARGIN, dx * i + LEFT_MARGIN, dy * N + TOP_MARGIN, dash=".", ) # A box around the whole thing c.create_rectangle( LEFT_MARGIN, TOP_MARGIN, LEFT_MARGIN + dx * N, dy * N + TOP_MARGIN, width=2 ) # Cells self._cells = [[None for i in range(N)] for j in range(N)] for i in range(N): for j in range(i, N): t = c.create_rectangle( j * dx + LEFT_MARGIN, i * dy + TOP_MARGIN, (j + 1) * dx + LEFT_MARGIN, (i + 1) * dy + TOP_MARGIN, fill="gray20", ) self._cells[i][j] = t def cb(event, self=self, i=i, j=j): self._click_cell(i, j) c.tag_bind(t, "", cb) # Inactive box xmax, ymax = int(c["width"]), int(c["height"]) t = c.create_rectangle( -100, -100, xmax + 100, ymax + 100, fill="gray50", state="hidden", tag="inactivebox", ) c.tag_lower(t) # Update the cells. self.update() def pack(self, *args, **kwargs): self._root.pack(*args, **kwargs) ####################################################################### # Chart Results View ####################################################################### class ChartResultsView: def __init__(self, parent, chart, grammar, toplevel=True): self._chart = chart self._grammar = grammar self._trees = [] self._y = 10 self._treewidgets = [] self._selection = None self._selectbox = None if toplevel: self._root = Toplevel(parent) self._root.title("Chart Parser Application: Results") self._root.bind("", self.destroy) else: self._root = Frame(parent) # Buttons if toplevel: buttons = Frame(self._root) buttons.pack(side="bottom", expand=0, fill="x") Button(buttons, text="Quit", command=self.destroy).pack(side="right") Button(buttons, text="Print All", command=self.print_all).pack(side="left") Button(buttons, text="Print Selection", command=self.print_selection).pack( side="left" ) # Canvas frame. self._cframe = CanvasFrame(self._root, closeenough=20) self._cframe.pack(side="top", expand=1, fill="both") # Initial update self.update() def update(self, edge=None): if self._root is None: return # If the edge isn't a parse edge, do nothing. 
if edge is not None: if edge.lhs() != self._grammar.start(): return if edge.span() != (0, self._chart.num_leaves()): return for parse in self._chart.parses(self._grammar.start()): if parse not in self._trees: self._add(parse) def _add(self, parse): # Add it to self._trees. self._trees.append(parse) # Create a widget for it. c = self._cframe.canvas() treewidget = tree_to_treesegment(c, parse) # Add it to the canvas frame. self._treewidgets.append(treewidget) self._cframe.add_widget(treewidget, 10, self._y) # Register callbacks. treewidget.bind_click(self._click) # Update y. self._y = treewidget.bbox()[3] + 10 def _click(self, widget): c = self._cframe.canvas() if self._selection is not None: c.delete(self._selectbox) self._selection = widget (x1, y1, x2, y2) = widget.bbox() self._selectbox = c.create_rectangle(x1, y1, x2, y2, width=2, outline="#088") def _color(self, treewidget, color): treewidget.label()["color"] = color for child in treewidget.subtrees(): if isinstance(child, TreeSegmentWidget): self._color(child, color) else: child["color"] = color def print_all(self, *e): if self._root is None: return self._cframe.print_to_file() def print_selection(self, *e): if self._root is None: return if self._selection is None: showerror("Print Error", "No tree selected") else: c = self._cframe.canvas() for widget in self._treewidgets: if widget is not self._selection: self._cframe.destroy_widget(widget) c.delete(self._selectbox) (x1, y1, x2, y2) = self._selection.bbox() self._selection.move(10 - x1, 10 - y1) c["scrollregion"] = f"0 0 {x2 - x1 + 20} {y2 - y1 + 20}" self._cframe.print_to_file() # Restore our state. self._treewidgets = [self._selection] self.clear() self.update() def clear(self): if self._root is None: return for treewidget in self._treewidgets: self._cframe.destroy_widget(treewidget) self._trees = [] self._treewidgets = [] if self._selection is not None: self._cframe.canvas().delete(self._selectbox) self._selection = None self._y = 10 def set_chart(self, chart): self.clear() self._chart = chart self.update() def set_grammar(self, grammar): self.clear() self._grammar = grammar self.update() def destroy(self, *e): if self._root is None: return try: self._root.destroy() except: pass self._root = None def pack(self, *args, **kwargs): self._root.pack(*args, **kwargs) ####################################################################### # Chart Comparer ####################################################################### class ChartComparer: """ :ivar _root: The root window :ivar _charts: A dictionary mapping names to charts. When charts are loaded, they are added to this dictionary. :ivar _left_chart: The left ``Chart``. :ivar _left_name: The name ``_left_chart`` (derived from filename) :ivar _left_matrix: The ``ChartMatrixView`` for ``_left_chart`` :ivar _left_selector: The drop-down ``MutableOptionsMenu`` used to select ``_left_chart``. :ivar _right_chart: The right ``Chart``. :ivar _right_name: The name ``_right_chart`` (derived from filename) :ivar _right_matrix: The ``ChartMatrixView`` for ``_right_chart`` :ivar _right_selector: The drop-down ``MutableOptionsMenu`` used to select ``_right_chart``. :ivar _out_chart: The out ``Chart``. :ivar _out_name: The name ``_out_chart`` (derived from filename) :ivar _out_matrix: The ``ChartMatrixView`` for ``_out_chart`` :ivar _out_label: The label for ``_out_chart``. :ivar _op_label: A Label containing the most recent operation. 
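    Typical use (illustrative only; the pickled-chart filenames below are
    hypothetical)::

        ChartComparer('chart1.pickle', 'chart2.pickle').mainloop()

    Each filename is loaded with ``load_chart()``, and ``mainloop()`` runs the
    Tk event loop until the comparison window is closed.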
""" _OPSYMBOL = { "-": "-", "and": SymbolWidget.SYMBOLS["intersection"], "or": SymbolWidget.SYMBOLS["union"], } def __init__(self, *chart_filenames): # This chart is displayed when we don't have a value (eg # before any chart is loaded). faketok = [""] * 8 self._emptychart = Chart(faketok) # The left & right charts start out empty. self._left_name = "None" self._right_name = "None" self._left_chart = self._emptychart self._right_chart = self._emptychart # The charts that have been loaded. self._charts = {"None": self._emptychart} # The output chart. self._out_chart = self._emptychart # The most recent operation self._operator = None # Set up the root window. self._root = Tk() self._root.title("Chart Comparison") self._root.bind("", self.destroy) self._root.bind("", self.destroy) # Initialize all widgets, etc. self._init_menubar(self._root) self._init_chartviews(self._root) self._init_divider(self._root) self._init_buttons(self._root) self._init_bindings(self._root) # Load any specified charts. for filename in chart_filenames: self.load_chart(filename) def destroy(self, *e): if self._root is None: return try: self._root.destroy() except: pass self._root = None def mainloop(self, *args, **kwargs): return self._root.mainloop(*args, **kwargs) # //////////////////////////////////////////////////////////// # Initialization # //////////////////////////////////////////////////////////// def _init_menubar(self, root): menubar = Menu(root) # File menu filemenu = Menu(menubar, tearoff=0) filemenu.add_command( label="Load Chart", accelerator="Ctrl-o", underline=0, command=self.load_chart_dialog, ) filemenu.add_command( label="Save Output", accelerator="Ctrl-s", underline=0, command=self.save_chart_dialog, ) filemenu.add_separator() filemenu.add_command( label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x" ) menubar.add_cascade(label="File", underline=0, menu=filemenu) # Compare menu opmenu = Menu(menubar, tearoff=0) opmenu.add_command( label="Intersection", command=self._intersection, accelerator="+" ) opmenu.add_command(label="Union", command=self._union, accelerator="*") opmenu.add_command( label="Difference", command=self._difference, accelerator="-" ) opmenu.add_separator() opmenu.add_command(label="Swap Charts", command=self._swapcharts) menubar.add_cascade(label="Compare", underline=0, menu=opmenu) # Add the menu self._root.config(menu=menubar) def _init_divider(self, root): divider = Frame(root, border=2, relief="sunken") divider.pack(side="top", fill="x", ipady=2) def _init_chartviews(self, root): opfont = ("symbol", -36) # Font for operator. eqfont = ("helvetica", -36) # Font for equals sign. frame = Frame(root, background="#c0c0c0") frame.pack(side="top", expand=1, fill="both") # The left matrix. cv1_frame = Frame(frame, border=3, relief="groove") cv1_frame.pack(side="left", padx=8, pady=7, expand=1, fill="both") self._left_selector = MutableOptionMenu( cv1_frame, list(self._charts.keys()), command=self._select_left ) self._left_selector.pack(side="top", pady=5, fill="x") self._left_matrix = ChartMatrixView( cv1_frame, self._emptychart, toplevel=False, show_numedges=True ) self._left_matrix.pack(side="bottom", padx=5, pady=5, expand=1, fill="both") self._left_matrix.add_callback("select", self.select_edge) self._left_matrix.add_callback("select_cell", self.select_cell) self._left_matrix.inactivate() # The operator. self._op_label = Label( frame, text=" ", width=3, background="#c0c0c0", font=opfont ) self._op_label.pack(side="left", padx=5, pady=5) # The right matrix. 
cv2_frame = Frame(frame, border=3, relief="groove") cv2_frame.pack(side="left", padx=8, pady=7, expand=1, fill="both") self._right_selector = MutableOptionMenu( cv2_frame, list(self._charts.keys()), command=self._select_right ) self._right_selector.pack(side="top", pady=5, fill="x") self._right_matrix = ChartMatrixView( cv2_frame, self._emptychart, toplevel=False, show_numedges=True ) self._right_matrix.pack(side="bottom", padx=5, pady=5, expand=1, fill="both") self._right_matrix.add_callback("select", self.select_edge) self._right_matrix.add_callback("select_cell", self.select_cell) self._right_matrix.inactivate() # The equals sign Label(frame, text="=", width=3, background="#c0c0c0", font=eqfont).pack( side="left", padx=5, pady=5 ) # The output matrix. out_frame = Frame(frame, border=3, relief="groove") out_frame.pack(side="left", padx=8, pady=7, expand=1, fill="both") self._out_label = Label(out_frame, text="Output") self._out_label.pack(side="top", pady=9) self._out_matrix = ChartMatrixView( out_frame, self._emptychart, toplevel=False, show_numedges=True ) self._out_matrix.pack(side="bottom", padx=5, pady=5, expand=1, fill="both") self._out_matrix.add_callback("select", self.select_edge) self._out_matrix.add_callback("select_cell", self.select_cell) self._out_matrix.inactivate() def _init_buttons(self, root): buttons = Frame(root) buttons.pack(side="bottom", pady=5, fill="x", expand=0) Button(buttons, text="Intersection", command=self._intersection).pack( side="left" ) Button(buttons, text="Union", command=self._union).pack(side="left") Button(buttons, text="Difference", command=self._difference).pack(side="left") Frame(buttons, width=20).pack(side="left") Button(buttons, text="Swap Charts", command=self._swapcharts).pack(side="left") Button(buttons, text="Detach Output", command=self._detach_out).pack( side="right" ) def _init_bindings(self, root): # root.bind('', self.save_chart) root.bind("", self.load_chart_dialog) # root.bind('', self.reset) # //////////////////////////////////////////////////////////// # Input Handling # //////////////////////////////////////////////////////////// def _select_left(self, name): self._left_name = name self._left_chart = self._charts[name] self._left_matrix.set_chart(self._left_chart) if name == "None": self._left_matrix.inactivate() self._apply_op() def _select_right(self, name): self._right_name = name self._right_chart = self._charts[name] self._right_matrix.set_chart(self._right_chart) if name == "None": self._right_matrix.inactivate() self._apply_op() def _apply_op(self): if self._operator == "-": self._difference() elif self._operator == "or": self._union() elif self._operator == "and": self._intersection() # //////////////////////////////////////////////////////////// # File # //////////////////////////////////////////////////////////// CHART_FILE_TYPES = [("Pickle file", ".pickle"), ("All files", "*")] def save_chart_dialog(self, *args): filename = asksaveasfilename( filetypes=self.CHART_FILE_TYPES, defaultextension=".pickle" ) if not filename: return try: with open(filename, "wb") as outfile: pickle.dump(self._out_chart, outfile) except Exception as e: showerror("Error Saving Chart", f"Unable to open file: {filename!r}\n{e}") def load_chart_dialog(self, *args): filename = askopenfilename( filetypes=self.CHART_FILE_TYPES, defaultextension=".pickle" ) if not filename: return try: self.load_chart(filename) except Exception as e: showerror("Error Loading Chart", f"Unable to open file: {filename!r}\n{e}") def load_chart(self, filename): with 
open(filename, "rb") as infile: chart = pickle.load(infile) name = os.path.basename(filename) if name.endswith(".pickle"): name = name[:-7] if name.endswith(".chart"): name = name[:-6] self._charts[name] = chart self._left_selector.add(name) self._right_selector.add(name) # If either left_matrix or right_matrix is empty, then # display the new chart. if self._left_chart is self._emptychart: self._left_selector.set(name) elif self._right_chart is self._emptychart: self._right_selector.set(name) def _update_chartviews(self): self._left_matrix.update() self._right_matrix.update() self._out_matrix.update() # //////////////////////////////////////////////////////////// # Selection # //////////////////////////////////////////////////////////// def select_edge(self, edge): if edge in self._left_chart: self._left_matrix.markonly_edge(edge) else: self._left_matrix.unmark_edge() if edge in self._right_chart: self._right_matrix.markonly_edge(edge) else: self._right_matrix.unmark_edge() if edge in self._out_chart: self._out_matrix.markonly_edge(edge) else: self._out_matrix.unmark_edge() def select_cell(self, i, j): self._left_matrix.select_cell(i, j) self._right_matrix.select_cell(i, j) self._out_matrix.select_cell(i, j) # //////////////////////////////////////////////////////////// # Operations # //////////////////////////////////////////////////////////// def _difference(self): if not self._checkcompat(): return out_chart = Chart(self._left_chart.tokens()) for edge in self._left_chart: if edge not in self._right_chart: out_chart.insert(edge, []) self._update("-", out_chart) def _intersection(self): if not self._checkcompat(): return out_chart = Chart(self._left_chart.tokens()) for edge in self._left_chart: if edge in self._right_chart: out_chart.insert(edge, []) self._update("and", out_chart) def _union(self): if not self._checkcompat(): return out_chart = Chart(self._left_chart.tokens()) for edge in self._left_chart: out_chart.insert(edge, []) for edge in self._right_chart: out_chart.insert(edge, []) self._update("or", out_chart) def _swapcharts(self): left, right = self._left_name, self._right_name self._left_selector.set(right) self._right_selector.set(left) def _checkcompat(self): if ( self._left_chart.tokens() != self._right_chart.tokens() or self._left_chart.property_names() != self._right_chart.property_names() or self._left_chart == self._emptychart or self._right_chart == self._emptychart ): # Clear & inactivate the output chart. self._out_chart = self._emptychart self._out_matrix.set_chart(self._out_chart) self._out_matrix.inactivate() self._out_label["text"] = "Output" # Issue some other warning? return False else: return True def _update(self, operator, out_chart): self._operator = operator self._op_label["text"] = self._OPSYMBOL[operator] self._out_chart = out_chart self._out_matrix.set_chart(out_chart) self._out_label["text"] = "{} {} {}".format( self._left_name, self._operator, self._right_name, ) def _clear_out_chart(self): self._out_chart = self._emptychart self._out_matrix.set_chart(self._out_chart) self._op_label["text"] = " " self._out_matrix.inactivate() def _detach_out(self): ChartMatrixView(self._root, self._out_chart, title=self._out_label["text"]) ####################################################################### # Chart View ####################################################################### class ChartView: """ A component for viewing charts. This is used by ``ChartParserApp`` to allow students to interactively experiment with various chart parsing techniques. 
It is also used by ``Chart.draw()``. :ivar _chart: The chart that we are giving a view of. This chart may be modified; after it is modified, you should call ``update``. :ivar _sentence: The list of tokens that the chart spans. :ivar _root: The root window. :ivar _chart_canvas: The canvas we're using to display the chart itself. :ivar _tree_canvas: The canvas we're using to display the tree that each edge spans. May be None, if we're not displaying trees. :ivar _sentence_canvas: The canvas we're using to display the sentence text. May be None, if we're not displaying the sentence text. :ivar _edgetags: A dictionary mapping from edges to the tags of the canvas elements (lines, etc) used to display that edge. The values of this dictionary have the form ``(linetag, rhstag1, dottag, rhstag2, lhstag)``. :ivar _treetags: A list of all the tags that make up the tree; used to erase the tree (without erasing the loclines). :ivar _chart_height: The height of the chart canvas. :ivar _sentence_height: The height of the sentence canvas. :ivar _tree_height: The height of the tree :ivar _text_height: The height of a text string (in the normal font). :ivar _edgelevels: A list of edges at each level of the chart (the top level is the 0th element). This list is used to remember where edges should be drawn; and to make sure that no edges are overlapping on the chart view. :ivar _unitsize: Pixel size of one unit (from the location). This is determined by the span of the chart's location, and the width of the chart display canvas. :ivar _fontsize: The current font size :ivar _marks: A dictionary from edges to marks. Marks are strings, specifying colors (e.g. 'green'). """ _LEAF_SPACING = 10 _MARGIN = 10 _TREE_LEVEL_SIZE = 12 _CHART_LEVEL_SIZE = 40 def __init__(self, chart, root=None, **kw): """ Construct a new ``Chart`` display. """ # Process keyword args. draw_tree = kw.get("draw_tree", 0) draw_sentence = kw.get("draw_sentence", 1) self._fontsize = kw.get("fontsize", -12) # The chart! self._chart = chart # Callback functions self._callbacks = {} # Keep track of drawn edges self._edgelevels = [] self._edgetags = {} # Keep track of which edges are marked. self._marks = {} # These are used to keep track of the set of tree tokens # currently displayed in the tree canvas. self._treetoks = [] self._treetoks_edge = None self._treetoks_index = 0 # Keep track of the tags used to draw the tree self._tree_tags = [] # Put multiple edges on each level? self._compact = 0 # If they didn't provide a main window, then set one up. if root is None: top = Tk() top.title("Chart View") def destroy1(e, top=top): top.destroy() def destroy2(top=top): top.destroy() top.bind("q", destroy1) b = Button(top, text="Done", command=destroy2) b.pack(side="bottom") self._root = top else: self._root = root # Create some fonts. self._init_fonts(root) # Create the chart canvas. (self._chart_sb, self._chart_canvas) = self._sb_canvas(self._root) self._chart_canvas["height"] = 300 self._chart_canvas["closeenough"] = 15 # Create the sentence canvas. if draw_sentence: cframe = Frame(self._root, relief="sunk", border=2) cframe.pack(fill="both", side="bottom") self._sentence_canvas = Canvas(cframe, height=50) self._sentence_canvas["background"] = "#e0e0e0" self._sentence_canvas.pack(fill="both") # self._sentence_canvas['height'] = self._sentence_height else: self._sentence_canvas = None # Create the tree canvas. 
if draw_tree: (sb, canvas) = self._sb_canvas(self._root, "n", "x") (self._tree_sb, self._tree_canvas) = (sb, canvas) self._tree_canvas["height"] = 200 else: self._tree_canvas = None # Do some analysis to figure out how big the window should be self._analyze() self.draw() self._resize() self._grow() # Set up the configure callback, which will be called whenever # the window is resized. self._chart_canvas.bind("", self._configure) def _init_fonts(self, root): self._boldfont = Font(family="helvetica", weight="bold", size=self._fontsize) self._font = Font(family="helvetica", size=self._fontsize) # See: self._sysfont = Font(font=Button()["font"]) root.option_add("*Font", self._sysfont) def _sb_canvas(self, root, expand="y", fill="both", side="bottom"): """ Helper for __init__: construct a canvas with a scrollbar. """ cframe = Frame(root, relief="sunk", border=2) cframe.pack(fill=fill, expand=expand, side=side) canvas = Canvas(cframe, background="#e0e0e0") # Give the canvas a scrollbar. sb = Scrollbar(cframe, orient="vertical") sb.pack(side="right", fill="y") canvas.pack(side="left", fill=fill, expand="yes") # Connect the scrollbars to the canvas. sb["command"] = canvas.yview canvas["yscrollcommand"] = sb.set return (sb, canvas) def scroll_up(self, *e): self._chart_canvas.yview("scroll", -1, "units") def scroll_down(self, *e): self._chart_canvas.yview("scroll", 1, "units") def page_up(self, *e): self._chart_canvas.yview("scroll", -1, "pages") def page_down(self, *e): self._chart_canvas.yview("scroll", 1, "pages") def _grow(self): """ Grow the window, if necessary """ # Grow, if need-be N = self._chart.num_leaves() width = max( int(self._chart_canvas["width"]), N * self._unitsize + ChartView._MARGIN * 2 ) # It won't resize without the second (height) line, but I # don't understand why not. self._chart_canvas.configure(width=width) self._chart_canvas.configure(height=self._chart_canvas["height"]) self._unitsize = (width - 2 * ChartView._MARGIN) / N # Reset the height for the sentence window. if self._sentence_canvas is not None: self._sentence_canvas["height"] = self._sentence_height def set_font_size(self, size): self._font.configure(size=-abs(size)) self._boldfont.configure(size=-abs(size)) self._sysfont.configure(size=-abs(size)) self._analyze() self._grow() self.draw() def get_font_size(self): return abs(self._fontsize) def _configure(self, e): """ The configure callback. This is called whenever the window is resized. It is also called when the window is first mapped. It figures out the unit size, and redraws the contents of each canvas. """ N = self._chart.num_leaves() self._unitsize = (e.width - 2 * ChartView._MARGIN) / N self.draw() def update(self, chart=None): """ Draw any edges that have not been drawn. This is typically called when a after modifies the canvas that a CanvasView is displaying. ``update`` will cause any edges that have been added to the chart to be drawn. If update is given a ``chart`` argument, then it will replace the current chart with the given chart. """ if chart is not None: self._chart = chart self._edgelevels = [] self._marks = {} self._analyze() self._grow() self.draw() self.erase_tree() self._resize() else: for edge in self._chart: if edge not in self._edgetags: self._add_edge(edge) self._resize() def _edge_conflict(self, edge, lvl): """ Return True if the given edge overlaps with any edge on the given level. This is used by _add_edge to figure out what level a new edge should be added to. 
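For example, an edge spanning ``(0, 3)`` conflicts with one spanning ``(2, 5)``, but not with one spanning ``(3, 5)``; two zero-width edges conflict only if they occupy the same index.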
""" (s1, e1) = edge.span() for otheredge in self._edgelevels[lvl]: (s2, e2) = otheredge.span() if (s1 <= s2 < e1) or (s2 <= s1 < e2) or (s1 == s2 == e1 == e2): return True return False def _analyze_edge(self, edge): """ Given a new edge, recalculate: - _text_height - _unitsize (if the edge text is too big for the current _unitsize, then increase _unitsize) """ c = self._chart_canvas if isinstance(edge, TreeEdge): lhs = edge.lhs() rhselts = [] for elt in edge.rhs(): if isinstance(elt, Nonterminal): rhselts.append(str(elt.symbol())) else: rhselts.append(repr(elt)) rhs = " ".join(rhselts) else: lhs = edge.lhs() rhs = "" for s in (lhs, rhs): tag = c.create_text( 0, 0, text=s, font=self._boldfont, anchor="nw", justify="left" ) bbox = c.bbox(tag) c.delete(tag) width = bbox[2] # + ChartView._LEAF_SPACING edgelen = max(edge.length(), 1) self._unitsize = max(self._unitsize, width / edgelen) self._text_height = max(self._text_height, bbox[3] - bbox[1]) def _add_edge(self, edge, minlvl=0): """ Add a single edge to the ChartView: - Call analyze_edge to recalculate display parameters - Find an available level - Call _draw_edge """ # Do NOT show leaf edges in the chart. if isinstance(edge, LeafEdge): return if edge in self._edgetags: return self._analyze_edge(edge) self._grow() if not self._compact: self._edgelevels.append([edge]) lvl = len(self._edgelevels) - 1 self._draw_edge(edge, lvl) self._resize() return # Figure out what level to draw the edge on. lvl = 0 while True: # If this level doesn't exist yet, create it. while lvl >= len(self._edgelevels): self._edgelevels.append([]) self._resize() # Check if we can fit the edge in this level. if lvl >= minlvl and not self._edge_conflict(edge, lvl): # Go ahead and draw it. self._edgelevels[lvl].append(edge) break # Try the next level. lvl += 1 self._draw_edge(edge, lvl) def view_edge(self, edge): level = None for i in range(len(self._edgelevels)): if edge in self._edgelevels[i]: level = i break if level is None: return # Try to view the new edge.. y = (level + 1) * self._chart_level_size dy = self._text_height + 10 self._chart_canvas.yview("moveto", 1.0) if self._chart_height != 0: self._chart_canvas.yview("moveto", (y - dy) / self._chart_height) def _draw_edge(self, edge, lvl): """ Draw a single edge on the ChartView. """ c = self._chart_canvas # Draw the arrow. x1 = edge.start() * self._unitsize + ChartView._MARGIN x2 = edge.end() * self._unitsize + ChartView._MARGIN if x2 == x1: x2 += max(4, self._unitsize / 5) y = (lvl + 1) * self._chart_level_size linetag = c.create_line(x1, y, x2, y, arrow="last", width=3) # Draw a label for the edge. if isinstance(edge, TreeEdge): rhs = [] for elt in edge.rhs(): if isinstance(elt, Nonterminal): rhs.append(str(elt.symbol())) else: rhs.append(repr(elt)) pos = edge.dot() else: rhs = [] pos = 0 rhs1 = " ".join(rhs[:pos]) rhs2 = " ".join(rhs[pos:]) rhstag1 = c.create_text(x1 + 3, y, text=rhs1, font=self._font, anchor="nw") dotx = c.bbox(rhstag1)[2] + 6 doty = (c.bbox(rhstag1)[1] + c.bbox(rhstag1)[3]) / 2 dottag = c.create_oval(dotx - 2, doty - 2, dotx + 2, doty + 2) rhstag2 = c.create_text(dotx + 6, y, text=rhs2, font=self._font, anchor="nw") lhstag = c.create_text( (x1 + x2) / 2, y, text=str(edge.lhs()), anchor="s", font=self._boldfont ) # Keep track of the edge's tags. self._edgetags[edge] = (linetag, rhstag1, dottag, rhstag2, lhstag) # Register a callback for clicking on the edge. 
def cb(event, self=self, edge=edge): self._fire_callbacks("select", edge) c.tag_bind(rhstag1, "", cb) c.tag_bind(rhstag2, "", cb) c.tag_bind(linetag, "", cb) c.tag_bind(dottag, "", cb) c.tag_bind(lhstag, "", cb) self._color_edge(edge) def _color_edge(self, edge, linecolor=None, textcolor=None): """ Color in an edge with the given colors. If no colors are specified, use intelligent defaults (dependent on selection, etc.) """ if edge not in self._edgetags: return c = self._chart_canvas if linecolor is not None and textcolor is not None: if edge in self._marks: linecolor = self._marks[edge] tags = self._edgetags[edge] c.itemconfig(tags[0], fill=linecolor) c.itemconfig(tags[1], fill=textcolor) c.itemconfig(tags[2], fill=textcolor, outline=textcolor) c.itemconfig(tags[3], fill=textcolor) c.itemconfig(tags[4], fill=textcolor) return else: N = self._chart.num_leaves() if edge in self._marks: self._color_edge(self._marks[edge]) if edge.is_complete() and edge.span() == (0, N): self._color_edge(edge, "#084", "#042") elif isinstance(edge, LeafEdge): self._color_edge(edge, "#48c", "#246") else: self._color_edge(edge, "#00f", "#008") def mark_edge(self, edge, mark="#0df"): """ Mark an edge """ self._marks[edge] = mark self._color_edge(edge) def unmark_edge(self, edge=None): """ Unmark an edge (or all edges) """ if edge is None: old_marked_edges = list(self._marks.keys()) self._marks = {} for edge in old_marked_edges: self._color_edge(edge) else: del self._marks[edge] self._color_edge(edge) def markonly_edge(self, edge, mark="#0df"): self.unmark_edge() self.mark_edge(edge, mark) def _analyze(self): """ Analyze the sentence string, to figure out how big a unit needs to be, How big the tree should be, etc. """ # Figure out the text height and the unit size. unitsize = 70 # min unitsize text_height = 0 c = self._chart_canvas # Check against all tokens for leaf in self._chart.leaves(): tag = c.create_text( 0, 0, text=repr(leaf), font=self._font, anchor="nw", justify="left" ) bbox = c.bbox(tag) c.delete(tag) width = bbox[2] + ChartView._LEAF_SPACING unitsize = max(width, unitsize) text_height = max(text_height, bbox[3] - bbox[1]) self._unitsize = unitsize self._text_height = text_height self._sentence_height = self._text_height + 2 * ChartView._MARGIN # Check against edges. for edge in self._chart.edges(): self._analyze_edge(edge) # Size of chart levels self._chart_level_size = self._text_height * 2 # Default tree size.. self._tree_height = 3 * (ChartView._TREE_LEVEL_SIZE + self._text_height) # Resize the scrollregions. self._resize() def _resize(self): """ Update the scroll-regions for each canvas. This ensures that everything is within a scroll-region, so the user can use the scrollbars to view the entire display. This does *not* resize the window. """ c = self._chart_canvas # Reset the chart scroll region width = self._chart.num_leaves() * self._unitsize + ChartView._MARGIN * 2 levels = len(self._edgelevels) self._chart_height = (levels + 2) * self._chart_level_size c["scrollregion"] = (0, 0, width, self._chart_height) # Reset the tree scroll region if self._tree_canvas: self._tree_canvas["scrollregion"] = (0, 0, width, self._tree_height) def _draw_loclines(self): """ Draw location lines. These are vertical gridlines used to show where each location unit is. 
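One line is drawn for every position from 0 through ``num_leaves()``, on the chart canvas and (when present) on the tree and sentence canvases; each position is labelled with its index on the chart canvas, and even-numbered positions are drawn in a darker gray (``gray60``) than odd-numbered ones (``gray80``).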
""" BOTTOM = 50000 c1 = self._tree_canvas c2 = self._sentence_canvas c3 = self._chart_canvas margin = ChartView._MARGIN self._loclines = [] for i in range(0, self._chart.num_leaves() + 1): x = i * self._unitsize + margin if c1: t1 = c1.create_line(x, 0, x, BOTTOM) c1.tag_lower(t1) if c2: t2 = c2.create_line(x, 0, x, self._sentence_height) c2.tag_lower(t2) t3 = c3.create_line(x, 0, x, BOTTOM) c3.tag_lower(t3) t4 = c3.create_text(x + 2, 0, text=repr(i), anchor="nw", font=self._font) c3.tag_lower(t4) # if i % 4 == 0: # if c1: c1.itemconfig(t1, width=2, fill='gray60') # if c2: c2.itemconfig(t2, width=2, fill='gray60') # c3.itemconfig(t3, width=2, fill='gray60') if i % 2 == 0: if c1: c1.itemconfig(t1, fill="gray60") if c2: c2.itemconfig(t2, fill="gray60") c3.itemconfig(t3, fill="gray60") else: if c1: c1.itemconfig(t1, fill="gray80") if c2: c2.itemconfig(t2, fill="gray80") c3.itemconfig(t3, fill="gray80") def _draw_sentence(self): """Draw the sentence string.""" if self._chart.num_leaves() == 0: return c = self._sentence_canvas margin = ChartView._MARGIN y = ChartView._MARGIN for i, leaf in enumerate(self._chart.leaves()): x1 = i * self._unitsize + margin x2 = x1 + self._unitsize x = (x1 + x2) / 2 tag = c.create_text( x, y, text=repr(leaf), font=self._font, anchor="n", justify="left" ) bbox = c.bbox(tag) rt = c.create_rectangle( x1 + 2, bbox[1] - (ChartView._LEAF_SPACING / 2), x2 - 2, bbox[3] + (ChartView._LEAF_SPACING / 2), fill="#f0f0f0", outline="#f0f0f0", ) c.tag_lower(rt) def erase_tree(self): for tag in self._tree_tags: self._tree_canvas.delete(tag) self._treetoks = [] self._treetoks_edge = None self._treetoks_index = 0 def draw_tree(self, edge=None): if edge is None and self._treetoks_edge is None: return if edge is None: edge = self._treetoks_edge # If it's a new edge, then get a new list of treetoks. if self._treetoks_edge != edge: self._treetoks = [t for t in self._chart.trees(edge) if isinstance(t, Tree)] self._treetoks_edge = edge self._treetoks_index = 0 # Make sure there's something to draw. if len(self._treetoks) == 0: return # Erase the old tree. for tag in self._tree_tags: self._tree_canvas.delete(tag) # Draw the new tree. tree = self._treetoks[self._treetoks_index] self._draw_treetok(tree, edge.start()) # Show how many trees are available for the edge. self._draw_treecycle() # Update the scroll region. w = self._chart.num_leaves() * self._unitsize + 2 * ChartView._MARGIN h = tree.height() * (ChartView._TREE_LEVEL_SIZE + self._text_height) self._tree_canvas["scrollregion"] = (0, 0, w, h) def cycle_tree(self): self._treetoks_index = (self._treetoks_index + 1) % len(self._treetoks) self.draw_tree(self._treetoks_edge) def _draw_treecycle(self): if len(self._treetoks) <= 1: return # Draw the label. label = "%d Trees" % len(self._treetoks) c = self._tree_canvas margin = ChartView._MARGIN right = self._chart.num_leaves() * self._unitsize + margin - 2 tag = c.create_text(right, 2, anchor="ne", text=label, font=self._boldfont) self._tree_tags.append(tag) _, _, _, y = c.bbox(tag) # Draw the triangles. for i in range(len(self._treetoks)): x = right - 20 * (len(self._treetoks) - i - 1) if i == self._treetoks_index: fill = "#084" else: fill = "#fff" tag = c.create_polygon( x, y + 10, x - 5, y, x - 10, y + 10, fill=fill, outline="black" ) self._tree_tags.append(tag) # Set up a callback: show the tree if they click on its # triangle. 
def cb(event, self=self, i=i): self._treetoks_index = i self.draw_tree() c.tag_bind(tag, "", cb) def _draw_treetok(self, treetok, index, depth=0): """ :param index: The index of the first leaf in the tree. :return: The index of the first leaf after the tree. """ c = self._tree_canvas margin = ChartView._MARGIN # Draw the children child_xs = [] for child in treetok: if isinstance(child, Tree): child_x, index = self._draw_treetok(child, index, depth + 1) child_xs.append(child_x) else: child_xs.append((2 * index + 1) * self._unitsize / 2 + margin) index += 1 # If we have children, then get the node's x by averaging their # node x's. Otherwise, make room for ourselves. if child_xs: nodex = sum(child_xs) / len(child_xs) else: # [XX] breaks for null productions. nodex = (2 * index + 1) * self._unitsize / 2 + margin index += 1 # Draw the node nodey = depth * (ChartView._TREE_LEVEL_SIZE + self._text_height) tag = c.create_text( nodex, nodey, anchor="n", justify="center", text=str(treetok.label()), fill="#042", font=self._boldfont, ) self._tree_tags.append(tag) # Draw lines to the children. childy = nodey + ChartView._TREE_LEVEL_SIZE + self._text_height for childx, child in zip(child_xs, treetok): if isinstance(child, Tree) and child: # A "real" tree token: tag = c.create_line( nodex, nodey + self._text_height, childx, childy, width=2, fill="#084", ) self._tree_tags.append(tag) if isinstance(child, Tree) and not child: # An unexpanded tree token: tag = c.create_line( nodex, nodey + self._text_height, childx, childy, width=2, fill="#048", dash="2 3", ) self._tree_tags.append(tag) if not isinstance(child, Tree): # A leaf: tag = c.create_line( nodex, nodey + self._text_height, childx, 10000, width=2, fill="#084", ) self._tree_tags.append(tag) return nodex, index def draw(self): """ Draw everything (from scratch). """ if self._tree_canvas: self._tree_canvas.delete("all") self.draw_tree() if self._sentence_canvas: self._sentence_canvas.delete("all") self._draw_sentence() self._chart_canvas.delete("all") self._edgetags = {} # Redraw any edges we erased. for lvl in range(len(self._edgelevels)): for edge in self._edgelevels[lvl]: self._draw_edge(edge, lvl) for edge in self._chart: self._add_edge(edge) self._draw_loclines() def add_callback(self, event, func): self._callbacks.setdefault(event, {})[func] = 1 def remove_callback(self, event, func=None): if func is None: del self._callbacks[event] else: try: del self._callbacks[event][func] except: pass def _fire_callbacks(self, event, *args): if event not in self._callbacks: return for cb_func in list(self._callbacks[event].keys()): cb_func(*args) ####################################################################### # Edge Rules ####################################################################### # These version of the chart rules only apply to a specific edge. # This lets the user select an edge, and then apply a rule. class EdgeRule: """ To create an edge rule, make an empty base class that uses EdgeRule as the first base class, and the basic rule as the second base class. (Order matters!) 
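For example, the edge-restricted version of ``TopDownPredictRule``, defined just below, is simply::

    class TopDownPredictEdgeRule(EdgeRule, TopDownPredictRule):
        pass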
""" def __init__(self, edge): super = self.__class__.__bases__[1] self._edge = edge self.NUM_EDGES = super.NUM_EDGES - 1 def apply(self, chart, grammar, *edges): super = self.__class__.__bases__[1] edges += (self._edge,) yield from super.apply(self, chart, grammar, *edges) def __str__(self): super = self.__class__.__bases__[1] return super.__str__(self) class TopDownPredictEdgeRule(EdgeRule, TopDownPredictRule): pass class BottomUpEdgeRule(EdgeRule, BottomUpPredictRule): pass class BottomUpLeftCornerEdgeRule(EdgeRule, BottomUpPredictCombineRule): pass class FundamentalEdgeRule(EdgeRule, SingleEdgeFundamentalRule): pass ####################################################################### # Chart Parser Application ####################################################################### class ChartParserApp: def __init__(self, grammar, tokens, title="Chart Parser Application"): # Initialize the parser self._init_parser(grammar, tokens) self._root = None try: # Create the root window. self._root = Tk() self._root.title(title) self._root.bind("", self.destroy) # Set up some frames. frame3 = Frame(self._root) frame2 = Frame(self._root) frame1 = Frame(self._root) frame3.pack(side="bottom", fill="none") frame2.pack(side="bottom", fill="x") frame1.pack(side="bottom", fill="both", expand=1) self._init_fonts(self._root) self._init_animation() self._init_chartview(frame1) self._init_rulelabel(frame2) self._init_buttons(frame3) self._init_menubar() self._matrix = None self._results = None # Set up keyboard bindings. self._init_bindings() except: print("Error creating Tree View") self.destroy() raise def destroy(self, *args): if self._root is None: return self._root.destroy() self._root = None def mainloop(self, *args, **kwargs): """ Enter the Tkinter mainloop. This function must be called if this demo is created from a non-interactive program (e.g. from a secript); otherwise, the demo will close as soon as the script completes. """ if in_idle(): return self._root.mainloop(*args, **kwargs) # //////////////////////////////////////////////////////////// # Initialization Helpers # //////////////////////////////////////////////////////////// def _init_parser(self, grammar, tokens): self._grammar = grammar self._tokens = tokens self._reset_parser() def _reset_parser(self): self._cp = SteppingChartParser(self._grammar) self._cp.initialize(self._tokens) self._chart = self._cp.chart() # Insert LeafEdges before the parsing starts. for _new_edge in LeafInitRule().apply(self._chart, self._grammar): pass # The step iterator -- use this to generate new edges self._cpstep = self._cp.step() # The currently selected edge self._selection = None def _init_fonts(self, root): # See: self._sysfont = Font(font=Button()["font"]) root.option_add("*Font", self._sysfont) # TWhat's our font size (default=same as sysfont) self._size = IntVar(root) self._size.set(self._sysfont.cget("size")) self._boldfont = Font(family="helvetica", weight="bold", size=self._size.get()) self._font = Font(family="helvetica", size=self._size.get()) def _init_animation(self): # Are we stepping? (default=yes) self._step = IntVar(self._root) self._step.set(1) # What's our animation speed (default=fast) self._animate = IntVar(self._root) self._animate.set(3) # Default speed = fast # Are we currently animating? 
self._animating = 0 def _init_chartview(self, parent): self._cv = ChartView(self._chart, parent, draw_tree=1, draw_sentence=1) self._cv.add_callback("select", self._click_cv_edge) def _init_rulelabel(self, parent): ruletxt = "Last edge generated by:" self._rulelabel1 = Label(parent, text=ruletxt, font=self._boldfont) self._rulelabel2 = Label( parent, width=40, relief="groove", anchor="w", font=self._boldfont ) self._rulelabel1.pack(side="left") self._rulelabel2.pack(side="left") step = Checkbutton(parent, variable=self._step, text="Step") step.pack(side="right") def _init_buttons(self, parent): frame1 = Frame(parent) frame2 = Frame(parent) frame1.pack(side="bottom", fill="x") frame2.pack(side="top", fill="none") Button( frame1, text="Reset\nParser", background="#90c0d0", foreground="black", command=self.reset, ).pack(side="right") # Button(frame1, text='Pause', # background='#90c0d0', foreground='black', # command=self.pause).pack(side='left') Button( frame1, text="Top Down\nStrategy", background="#90c0d0", foreground="black", command=self.top_down_strategy, ).pack(side="left") Button( frame1, text="Bottom Up\nStrategy", background="#90c0d0", foreground="black", command=self.bottom_up_strategy, ).pack(side="left") Button( frame1, text="Bottom Up\nLeft-Corner Strategy", background="#90c0d0", foreground="black", command=self.bottom_up_leftcorner_strategy, ).pack(side="left") Button( frame2, text="Top Down Init\nRule", background="#90f090", foreground="black", command=self.top_down_init, ).pack(side="left") Button( frame2, text="Top Down Predict\nRule", background="#90f090", foreground="black", command=self.top_down_predict, ).pack(side="left") Frame(frame2, width=20).pack(side="left") Button( frame2, text="Bottom Up Predict\nRule", background="#90f090", foreground="black", command=self.bottom_up, ).pack(side="left") Frame(frame2, width=20).pack(side="left") Button( frame2, text="Bottom Up Left-Corner\nPredict Rule", background="#90f090", foreground="black", command=self.bottom_up_leftcorner, ).pack(side="left") Frame(frame2, width=20).pack(side="left") Button( frame2, text="Fundamental\nRule", background="#90f090", foreground="black", command=self.fundamental, ).pack(side="left") def _init_bindings(self): self._root.bind("", self._cv.scroll_up) self._root.bind("", self._cv.scroll_down) self._root.bind("", self._cv.page_up) self._root.bind("", self._cv.page_down) self._root.bind("", self.destroy) self._root.bind("", self.destroy) self._root.bind("", self.help) self._root.bind("", self.save_chart) self._root.bind("", self.load_chart) self._root.bind("", self.reset) self._root.bind("t", self.top_down_strategy) self._root.bind("b", self.bottom_up_strategy) self._root.bind("c", self.bottom_up_leftcorner_strategy) self._root.bind("", self._stop_animation) self._root.bind("", self.edit_grammar) self._root.bind("", self.edit_sentence) # Animation speed control self._root.bind("-", lambda e, a=self._animate: a.set(1)) self._root.bind("=", lambda e, a=self._animate: a.set(2)) self._root.bind("+", lambda e, a=self._animate: a.set(3)) # Step control self._root.bind("s", lambda e, s=self._step: s.set(not s.get())) def _init_menubar(self): menubar = Menu(self._root) filemenu = Menu(menubar, tearoff=0) filemenu.add_command( label="Save Chart", underline=0, command=self.save_chart, accelerator="Ctrl-s", ) filemenu.add_command( label="Load Chart", underline=0, command=self.load_chart, accelerator="Ctrl-o", ) filemenu.add_command( label="Reset Chart", underline=0, command=self.reset, accelerator="Ctrl-r" ) 
filemenu.add_separator() filemenu.add_command(label="Save Grammar", command=self.save_grammar) filemenu.add_command(label="Load Grammar", command=self.load_grammar) filemenu.add_separator() filemenu.add_command( label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x" ) menubar.add_cascade(label="File", underline=0, menu=filemenu) editmenu = Menu(menubar, tearoff=0) editmenu.add_command( label="Edit Grammar", underline=5, command=self.edit_grammar, accelerator="Ctrl-g", ) editmenu.add_command( label="Edit Text", underline=5, command=self.edit_sentence, accelerator="Ctrl-t", ) menubar.add_cascade(label="Edit", underline=0, menu=editmenu) viewmenu = Menu(menubar, tearoff=0) viewmenu.add_command( label="Chart Matrix", underline=6, command=self.view_matrix ) viewmenu.add_command(label="Results", underline=0, command=self.view_results) menubar.add_cascade(label="View", underline=0, menu=viewmenu) rulemenu = Menu(menubar, tearoff=0) rulemenu.add_command( label="Top Down Strategy", underline=0, command=self.top_down_strategy, accelerator="t", ) rulemenu.add_command( label="Bottom Up Strategy", underline=0, command=self.bottom_up_strategy, accelerator="b", ) rulemenu.add_command( label="Bottom Up Left-Corner Strategy", underline=0, command=self.bottom_up_leftcorner_strategy, accelerator="c", ) rulemenu.add_separator() rulemenu.add_command(label="Bottom Up Rule", command=self.bottom_up) rulemenu.add_command( label="Bottom Up Left-Corner Rule", command=self.bottom_up_leftcorner ) rulemenu.add_command(label="Top Down Init Rule", command=self.top_down_init) rulemenu.add_command( label="Top Down Predict Rule", command=self.top_down_predict ) rulemenu.add_command(label="Fundamental Rule", command=self.fundamental) menubar.add_cascade(label="Apply", underline=0, menu=rulemenu) animatemenu = Menu(menubar, tearoff=0) animatemenu.add_checkbutton( label="Step", underline=0, variable=self._step, accelerator="s" ) animatemenu.add_separator() animatemenu.add_radiobutton( label="No Animation", underline=0, variable=self._animate, value=0 ) animatemenu.add_radiobutton( label="Slow Animation", underline=0, variable=self._animate, value=1, accelerator="-", ) animatemenu.add_radiobutton( label="Normal Animation", underline=0, variable=self._animate, value=2, accelerator="=", ) animatemenu.add_radiobutton( label="Fast Animation", underline=0, variable=self._animate, value=3, accelerator="+", ) menubar.add_cascade(label="Animate", underline=1, menu=animatemenu) zoommenu = Menu(menubar, tearoff=0) zoommenu.add_radiobutton( label="Tiny", variable=self._size, underline=0, value=10, command=self.resize, ) zoommenu.add_radiobutton( label="Small", variable=self._size, underline=0, value=12, command=self.resize, ) zoommenu.add_radiobutton( label="Medium", variable=self._size, underline=0, value=14, command=self.resize, ) zoommenu.add_radiobutton( label="Large", variable=self._size, underline=0, value=18, command=self.resize, ) zoommenu.add_radiobutton( label="Huge", variable=self._size, underline=0, value=24, command=self.resize, ) menubar.add_cascade(label="Zoom", underline=0, menu=zoommenu) helpmenu = Menu(menubar, tearoff=0) helpmenu.add_command(label="About", underline=0, command=self.about) helpmenu.add_command( label="Instructions", underline=0, command=self.help, accelerator="F1" ) menubar.add_cascade(label="Help", underline=0, menu=helpmenu) self._root.config(menu=menubar) # //////////////////////////////////////////////////////////// # Selection Handling # 
//////////////////////////////////////////////////////////// def _click_cv_edge(self, edge): if edge != self._selection: # Clicking on a new edge selects it. self._select_edge(edge) else: # Repeated clicks on one edge cycle its trees. self._cv.cycle_tree() # [XX] this can get confused if animation is running # faster than the callbacks... def _select_matrix_edge(self, edge): self._select_edge(edge) self._cv.view_edge(edge) def _select_edge(self, edge): self._selection = edge # Update the chart view. self._cv.markonly_edge(edge, "#f00") self._cv.draw_tree(edge) # Update the matrix view. if self._matrix: self._matrix.markonly_edge(edge) if self._matrix: self._matrix.view_edge(edge) def _deselect_edge(self): self._selection = None # Update the chart view. self._cv.unmark_edge() self._cv.erase_tree() # Update the matrix view if self._matrix: self._matrix.unmark_edge() def _show_new_edge(self, edge): self._display_rule(self._cp.current_chartrule()) # Update the chart view. self._cv.update() self._cv.draw_tree(edge) self._cv.markonly_edge(edge, "#0df") self._cv.view_edge(edge) # Update the matrix view. if self._matrix: self._matrix.update() if self._matrix: self._matrix.markonly_edge(edge) if self._matrix: self._matrix.view_edge(edge) # Update the results view. if self._results: self._results.update(edge) # //////////////////////////////////////////////////////////// # Help/usage # //////////////////////////////////////////////////////////// def help(self, *e): self._animating = 0 # The default font's not very legible; try using 'fixed' instead. try: ShowText( self._root, "Help: Chart Parser Application", (__doc__ or "").strip(), width=75, font="fixed", ) except: ShowText( self._root, "Help: Chart Parser Application", (__doc__ or "").strip(), width=75, ) def about(self, *e): ABOUT = "NLTK Chart Parser Application\n" + "Written by Edward Loper" showinfo("About: Chart Parser Application", ABOUT) # //////////////////////////////////////////////////////////// # File Menu # //////////////////////////////////////////////////////////// CHART_FILE_TYPES = [("Pickle file", ".pickle"), ("All files", "*")] GRAMMAR_FILE_TYPES = [ ("Plaintext grammar file", ".cfg"), ("Pickle file", ".pickle"), ("All files", "*"), ] def load_chart(self, *args): "Load a chart from a pickle file" filename = askopenfilename( filetypes=self.CHART_FILE_TYPES, defaultextension=".pickle" ) if not filename: return try: with open(filename, "rb") as infile: chart = pickle.load(infile) self._chart = chart self._cv.update(chart) if self._matrix: self._matrix.set_chart(chart) if self._matrix: self._matrix.deselect_cell() if self._results: self._results.set_chart(chart) self._cp.set_chart(chart) except Exception as e: raise showerror("Error Loading Chart", "Unable to open file: %r" % filename) def save_chart(self, *args): "Save a chart to a pickle file" filename = asksaveasfilename( filetypes=self.CHART_FILE_TYPES, defaultextension=".pickle" ) if not filename: return try: with open(filename, "wb") as outfile: pickle.dump(self._chart, outfile) except Exception as e: raise showerror("Error Saving Chart", "Unable to open file: %r" % filename) def load_grammar(self, *args): "Load a grammar from a pickle file" filename = askopenfilename( filetypes=self.GRAMMAR_FILE_TYPES, defaultextension=".cfg" ) if not filename: return try: if filename.endswith(".pickle"): with open(filename, "rb") as infile: grammar = pickle.load(infile) else: with open(filename) as infile: grammar = CFG.fromstring(infile.read()) self.set_grammar(grammar) except Exception as 
e: showerror("Error Loading Grammar", "Unable to open file: %r" % filename) def save_grammar(self, *args): filename = asksaveasfilename( filetypes=self.GRAMMAR_FILE_TYPES, defaultextension=".cfg" ) if not filename: return try: if filename.endswith(".pickle"): with open(filename, "wb") as outfile: pickle.dump((self._chart, self._tokens), outfile) else: with open(filename, "w") as outfile: prods = self._grammar.productions() start = [p for p in prods if p.lhs() == self._grammar.start()] rest = [p for p in prods if p.lhs() != self._grammar.start()] for prod in start: outfile.write("%s\n" % prod) for prod in rest: outfile.write("%s\n" % prod) except Exception as e: showerror("Error Saving Grammar", "Unable to open file: %r" % filename) def reset(self, *args): self._animating = 0 self._reset_parser() self._cv.update(self._chart) if self._matrix: self._matrix.set_chart(self._chart) if self._matrix: self._matrix.deselect_cell() if self._results: self._results.set_chart(self._chart) # //////////////////////////////////////////////////////////// # Edit # //////////////////////////////////////////////////////////// def edit_grammar(self, *e): CFGEditor(self._root, self._grammar, self.set_grammar) def set_grammar(self, grammar): self._grammar = grammar self._cp.set_grammar(grammar) if self._results: self._results.set_grammar(grammar) def edit_sentence(self, *e): sentence = " ".join(self._tokens) title = "Edit Text" instr = "Enter a new sentence to parse." EntryDialog(self._root, sentence, instr, self.set_sentence, title) def set_sentence(self, sentence): self._tokens = list(sentence.split()) self.reset() # //////////////////////////////////////////////////////////// # View Menu # //////////////////////////////////////////////////////////// def view_matrix(self, *e): if self._matrix is not None: self._matrix.destroy() self._matrix = ChartMatrixView(self._root, self._chart) self._matrix.add_callback("select", self._select_matrix_edge) def view_results(self, *e): if self._results is not None: self._results.destroy() self._results = ChartResultsView(self._root, self._chart, self._grammar) # //////////////////////////////////////////////////////////// # Zoom Menu # //////////////////////////////////////////////////////////// def resize(self): self._animating = 0 self.set_font_size(self._size.get()) def set_font_size(self, size): self._cv.set_font_size(size) self._font.configure(size=-abs(size)) self._boldfont.configure(size=-abs(size)) self._sysfont.configure(size=-abs(size)) def get_font_size(self): return abs(self._size.get()) # //////////////////////////////////////////////////////////// # Parsing # //////////////////////////////////////////////////////////// def apply_strategy(self, strategy, edge_strategy=None): # If we're animating, then stop. if self._animating: self._animating = 0 return # Clear the rule display & mark. self._display_rule(None) # self._cv.unmark_edge() if self._step.get(): selection = self._selection if (selection is not None) and (edge_strategy is not None): # Apply the given strategy to the selected edge. self._cp.set_strategy([edge_strategy(selection)]) newedge = self._apply_strategy() # If it failed, then clear the selection. 
if newedge is None: self._cv.unmark_edge() self._selection = None else: self._cp.set_strategy(strategy) self._apply_strategy() else: self._cp.set_strategy(strategy) if self._animate.get(): self._animating = 1 self._animate_strategy() else: for edge in self._cpstep: if edge is None: break self._cv.update() if self._matrix: self._matrix.update() if self._results: self._results.update() def _stop_animation(self, *e): self._animating = 0 def _animate_strategy(self, speed=1): if self._animating == 0: return if self._apply_strategy() is not None: if self._animate.get() == 0 or self._step.get() == 1: return if self._animate.get() == 1: self._root.after(3000, self._animate_strategy) elif self._animate.get() == 2: self._root.after(1000, self._animate_strategy) else: self._root.after(20, self._animate_strategy) def _apply_strategy(self): new_edge = next(self._cpstep) if new_edge is not None: self._show_new_edge(new_edge) return new_edge def _display_rule(self, rule): if rule is None: self._rulelabel2["text"] = "" else: name = str(rule) self._rulelabel2["text"] = name size = self._cv.get_font_size() # //////////////////////////////////////////////////////////// # Parsing Strategies # //////////////////////////////////////////////////////////// # Basic rules: _TD_INIT = [TopDownInitRule()] _TD_PREDICT = [TopDownPredictRule()] _BU_RULE = [BottomUpPredictRule()] _BU_LC_RULE = [BottomUpPredictCombineRule()] _FUNDAMENTAL = [SingleEdgeFundamentalRule()] # Complete strategies: _TD_STRATEGY = _TD_INIT + _TD_PREDICT + _FUNDAMENTAL _BU_STRATEGY = _BU_RULE + _FUNDAMENTAL _BU_LC_STRATEGY = _BU_LC_RULE + _FUNDAMENTAL # Button callback functions: def top_down_init(self, *e): self.apply_strategy(self._TD_INIT, None) def top_down_predict(self, *e): self.apply_strategy(self._TD_PREDICT, TopDownPredictEdgeRule) def bottom_up(self, *e): self.apply_strategy(self._BU_RULE, BottomUpEdgeRule) def bottom_up_leftcorner(self, *e): self.apply_strategy(self._BU_LC_RULE, BottomUpLeftCornerEdgeRule) def fundamental(self, *e): self.apply_strategy(self._FUNDAMENTAL, FundamentalEdgeRule) def bottom_up_strategy(self, *e): self.apply_strategy(self._BU_STRATEGY, BottomUpEdgeRule) def bottom_up_leftcorner_strategy(self, *e): self.apply_strategy(self._BU_LC_STRATEGY, BottomUpLeftCornerEdgeRule) def top_down_strategy(self, *e): self.apply_strategy(self._TD_STRATEGY, TopDownPredictEdgeRule) def app(): grammar = CFG.fromstring( """ # Grammatical productions. S -> NP VP VP -> VP PP | V NP | V NP -> Det N | NP PP PP -> P NP # Lexical productions. 
NP -> 'John' | 'I' Det -> 'the' | 'my' | 'a' N -> 'dog' | 'cookie' | 'table' | 'cake' | 'fork' V -> 'ate' | 'saw' P -> 'on' | 'under' | 'with' """ ) sent = "John ate the cake on the table with a fork" sent = "John ate the cake on the table" tokens = list(sent.split()) print("grammar= (") for rule in grammar.productions(): print((" ", repr(rule) + ",")) print(")") print("tokens = %r" % tokens) print('Calling "ChartParserApp(grammar, tokens)"...') ChartParserApp(grammar, tokens).mainloop() if __name__ == "__main__": app() # Chart comparer: # charts = ['/tmp/earley.pickle', # '/tmp/topdown.pickle', # '/tmp/bottomup.pickle'] # ChartComparer(*charts).mainloop() # import profile # profile.run('demo2()', '/tmp/profile.out') # import pstats # p = pstats.Stats('/tmp/profile.out') # p.strip_dirs().sort_stats('time', 'cum').print_stats(60) # p.strip_dirs().sort_stats('cum', 'time').print_stats(60) __all__ = ["app"] nltk-3.7/nltk/app/chunkparser_app.py000066400000000000000000001567661420073152400176450ustar00rootroot00000000000000# Natural Language Toolkit: Regexp Chunk Parser Application # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # URL: # For license information, see LICENSE.TXT """ A graphical tool for exploring the regular expression based chunk parser ``nltk.chunk.RegexpChunkParser``. """ # Todo: Add a way to select the development set from the menubar. This # might just need to be a selection box (conll vs treebank etc) plus # configuration parameters to select what's being chunked (eg VP vs NP) # and what part of the data is being used as the development set. import random import re import textwrap import time from tkinter import ( Button, Canvas, Checkbutton, Frame, IntVar, Label, Menu, Scrollbar, Text, Tk, ) from tkinter.filedialog import askopenfilename, asksaveasfilename from tkinter.font import Font from nltk.chunk import ChunkScore, RegexpChunkParser from nltk.chunk.regexp import RegexpChunkRule from nltk.corpus import conll2000, treebank_chunk from nltk.draw.util import ShowText from nltk.tree import Tree from nltk.util import in_idle class RegexpChunkApp: """ A graphical tool for exploring the regular expression based chunk parser ``nltk.chunk.RegexpChunkParser``. See ``HELP`` for instructional text. """ ##///////////////////////////////////////////////////////////////// ## Help Text ##///////////////////////////////////////////////////////////////// #: A dictionary mapping from part of speech tags to descriptions, #: which is used in the help text. (This should probably live with #: the conll and/or treebank corpus instead.) TAGSET = { "CC": "Coordinating conjunction", "PRP$": "Possessive pronoun", "CD": "Cardinal number", "RB": "Adverb", "DT": "Determiner", "RBR": "Adverb, comparative", "EX": "Existential there", "RBS": "Adverb, superlative", "FW": "Foreign word", "RP": "Particle", "JJ": "Adjective", "TO": "to", "JJR": "Adjective, comparative", "UH": "Interjection", "JJS": "Adjective, superlative", "VB": "Verb, base form", "LS": "List item marker", "VBD": "Verb, past tense", "MD": "Modal", "NNS": "Noun, plural", "NN": "Noun, singular or masps", "VBN": "Verb, past participle", "VBZ": "Verb,3rd ps. sing. 
present", "NNP": "Proper noun, singular", "NNPS": "Proper noun plural", "WDT": "wh-determiner", "PDT": "Predeterminer", "WP": "wh-pronoun", "POS": "Possessive ending", "WP$": "Possessive wh-pronoun", "PRP": "Personal pronoun", "WRB": "wh-adverb", "(": "open parenthesis", ")": "close parenthesis", "``": "open quote", ",": "comma", "''": "close quote", ".": "period", "#": "pound sign (currency marker)", "$": "dollar sign (currency marker)", "IN": "Preposition/subord. conjunction", "SYM": "Symbol (mathematical or scientific)", "VBG": "Verb, gerund/present participle", "VBP": "Verb, non-3rd ps. sing. present", ":": "colon", } #: Contents for the help box. This is a list of tuples, one for #: each help page, where each tuple has four elements: #: - A title (displayed as a tab) #: - A string description of tabstops (see Tkinter.Text for details) #: - The text contents for the help page. You can use expressions #: like ... to colorize the text; see ``HELP_AUTOTAG`` #: for a list of tags you can use for colorizing. HELP = [ ( "Help", "20", "Welcome to the regular expression chunk-parser grammar editor. " "You can use this editor to develop and test chunk parser grammars " "based on NLTK's RegexpChunkParser class.\n\n" # Help box. "Use this box ('Help') to learn more about the editor; click on the " "tabs for help on specific topics:" "\n" "Rules: grammar rule types\n" "Regexps: regular expression syntax\n" "Tags: part of speech tags\n\n" # Grammar. "Use the upper-left box ('Grammar') to edit your grammar. " "Each line of your grammar specifies a single 'rule', " "which performs an action such as creating a chunk or merging " "two chunks.\n\n" # Dev set. "The lower-left box ('Development Set') runs your grammar on the " "development set, and displays the results. " "Your grammar's chunks are highlighted, and " "the correct (gold standard) chunks are " "underlined. If they " "match, they are displayed in green; otherwise, " "they are displayed in red. The box displays a single " "sentence from the development set at a time; use the scrollbar or " "the next/previous buttons view additional sentences.\n\n" # Performance "The lower-right box ('Evaluation') tracks the performance of " "your grammar on the development set. The 'precision' axis " "indicates how many of your grammar's chunks are correct; and " "the 'recall' axis indicates how many of the gold standard " "chunks your system generated. Typically, you should try to " "design a grammar that scores high on both metrics. The " "exact precision and recall of the current grammar, as well " "as their harmonic mean (the 'f-score'), are displayed in " "the status bar at the bottom of the window.", ), ( "Rules", "10", "

<h1>{...regexp...}</h1>
" "\nChunk rule: creates new chunks from words matching " "regexp.\n\n" "

<h1>}...regexp...{</h1>
" "\nStrip rule: removes words matching regexp from existing " "chunks.\n\n" "

<h1>...regexp1...}{...regexp2...</h1>
" "\nSplit rule: splits chunks that match regexp1 followed by " "regexp2 in two.\n\n" "

<h1>...regexp...{}...regexp...</h1>
" "\nMerge rule: joins consecutive chunks that match regexp1 " "and regexp2\n", ), ( "Regexps", "10 60", # "Regular Expression Syntax Summary:\n\n" "

<h1>Pattern\t\tMatches...</h1>
\n" "" "\t<T>\ta word with tag T " "(where T may be a regexp).\n" "\tx?\tan optional x\n" "\tx+\ta sequence of 1 or more x's\n" "\tx*\ta sequence of 0 or more x's\n" "\tx|y\tx or y\n" "\t.\tmatches any character\n" "\t(x)\tTreats x as a group\n" "\t# x...\tTreats x... " "(to the end of the line) as a comment\n" "\t\\C\tmatches character C " "(useful when C is a special character " "like + or #)\n" "" "\n

Examples:

\n" "" "\t\n" '\t\tMatches "cow/NN"\n' '\t\tMatches "green/NN"\n' "\t\n" '\t\tMatches "eating/VBG"\n' '\t\tMatches "ate/VBD"\n' "\t
\n" '\t\tMatches "on/IN the/DT car/NN"\n' "\t?\n" '\t\tMatches "ran/VBD"\n' '\t\tMatches "slowly/RB ate/VBD"\n' r"\t<\#> # This is a comment...\n" '\t\tMatches "#/# 100/CD"\n' "", ), ( "Tags", "10 60", "

Part of Speech Tags:

\n" + "" + "<>" + "\n", # this gets auto-substituted w/ self.TAGSET ), ] HELP_AUTOTAG = [ ("red", dict(foreground="#a00")), ("green", dict(foreground="#080")), ("highlight", dict(background="#ddd")), ("underline", dict(underline=True)), ("h1", dict(underline=True)), ("indent", dict(lmargin1=20, lmargin2=20)), ("hangindent", dict(lmargin1=0, lmargin2=60)), ("var", dict(foreground="#88f")), ("regexp", dict(foreground="#ba7")), ("match", dict(foreground="#6a6")), ] ##///////////////////////////////////////////////////////////////// ## Config Parameters ##///////////////////////////////////////////////////////////////// _EVAL_DELAY = 1 """If the user has not pressed any key for this amount of time (in seconds), and the current grammar has not been evaluated, then the eval demon will evaluate it.""" _EVAL_CHUNK = 15 """The number of sentences that should be evaluated by the eval demon each time it runs.""" _EVAL_FREQ = 0.2 """The frequency (in seconds) at which the eval demon is run""" _EVAL_DEMON_MIN = 0.02 """The minimum amount of time that the eval demon should take each time it runs -- if it takes less than this time, _EVAL_CHUNK will be modified upwards.""" _EVAL_DEMON_MAX = 0.04 """The maximum amount of time that the eval demon should take each time it runs -- if it takes more than this time, _EVAL_CHUNK will be modified downwards.""" _GRAMMARBOX_PARAMS = dict( width=40, height=12, background="#efe", highlightbackground="#efe", highlightthickness=1, relief="groove", border=2, wrap="word", ) _HELPBOX_PARAMS = dict( width=15, height=15, background="#efe", highlightbackground="#efe", foreground="#555", highlightthickness=1, relief="groove", border=2, wrap="word", ) _DEVSETBOX_PARAMS = dict( width=70, height=10, background="#eef", highlightbackground="#eef", highlightthickness=1, relief="groove", border=2, wrap="word", tabs=(30,), ) _STATUS_PARAMS = dict(background="#9bb", relief="groove", border=2) _FONT_PARAMS = dict(family="helvetica", size=-20) _FRAME_PARAMS = dict(background="#777", padx=2, pady=2, border=3) _EVALBOX_PARAMS = dict( background="#eef", highlightbackground="#eef", highlightthickness=1, relief="groove", border=2, width=300, height=280, ) _BUTTON_PARAMS = dict( background="#777", activebackground="#777", highlightbackground="#777" ) _HELPTAB_BG_COLOR = "#aba" _HELPTAB_FG_COLOR = "#efe" _HELPTAB_FG_PARAMS = dict(background="#efe") _HELPTAB_BG_PARAMS = dict(background="#aba") _HELPTAB_SPACER = 6 def normalize_grammar(self, grammar): # Strip comments grammar = re.sub(r"((\\.|[^#])*)(#.*)?", r"\1", grammar) # Normalize whitespace grammar = re.sub(" +", " ", grammar) grammar = re.sub(r"\n\s+", r"\n", grammar) grammar = grammar.strip() # [xx] Hack: automatically backslash $! grammar = re.sub(r"([^\\])\$", r"\1\\$", grammar) return grammar def __init__( self, devset_name="conll2000", devset=None, grammar="", chunk_label="NP", tagset=None, ): """ :param devset_name: The name of the development set; used for display & for save files. If either the name 'treebank' or the name 'conll2000' is used, and devset is None, then devset will be set automatically. :param devset: A list of chunked sentences :param grammar: The initial grammar to display. :param tagset: Dictionary from tags to string descriptions, used for the help page. Defaults to ``self.TAGSET``. 
""" self._chunk_label = chunk_label if tagset is None: tagset = self.TAGSET self.tagset = tagset # Named development sets: if devset is None: if devset_name == "conll2000": devset = conll2000.chunked_sents("train.txt") # [:100] elif devset == "treebank": devset = treebank_chunk.chunked_sents() # [:100] else: raise ValueError("Unknown development set %s" % devset_name) self.chunker = None """The chunker built from the grammar string""" self.grammar = grammar """The unparsed grammar string""" self.normalized_grammar = None """A normalized version of ``self.grammar``.""" self.grammar_changed = 0 """The last time() that the grammar was changed.""" self.devset = devset """The development set -- a list of chunked sentences.""" self.devset_name = devset_name """The name of the development set (for save files).""" self.devset_index = -1 """The index into the development set of the first instance that's currently being viewed.""" self._last_keypress = 0 """The time() when a key was most recently pressed""" self._history = [] """A list of (grammar, precision, recall, fscore) tuples for grammars that the user has already tried.""" self._history_index = 0 """When the user is scrolling through previous grammars, this is used to keep track of which grammar they're looking at.""" self._eval_grammar = None """The grammar that is being currently evaluated by the eval demon.""" self._eval_normalized_grammar = None """A normalized copy of ``_eval_grammar``.""" self._eval_index = 0 """The index of the next sentence in the development set that should be looked at by the eval demon.""" self._eval_score = ChunkScore(chunk_label=chunk_label) """The ``ChunkScore`` object that's used to keep track of the score of the current grammar on the development set.""" # Set up the main window. top = self.top = Tk() top.geometry("+50+50") top.title("Regexp Chunk Parser App") top.bind("", self.destroy) # Variable that restricts how much of the devset we look at. self._devset_size = IntVar(top) self._devset_size.set(100) # Set up all the tkinter widgets self._init_fonts(top) self._init_widgets(top) self._init_bindings(top) self._init_menubar(top) self.grammarbox.focus() # If a grammar was given, then display it. 
if grammar: self.grammarbox.insert("end", grammar + "\n") self.grammarbox.mark_set("insert", "1.0") # Display the first item in the development set self.show_devset(0) self.update() def _init_bindings(self, top): top.bind("", self._devset_next) top.bind("", self._devset_prev) top.bind("", self.toggle_show_trace) top.bind("", self.update) top.bind("", lambda e: self.save_grammar()) top.bind("", lambda e: self.load_grammar()) self.grammarbox.bind("", self.toggle_show_trace) self.grammarbox.bind("", self._devset_next) self.grammarbox.bind("", self._devset_prev) # Redraw the eval graph when the window size changes self.evalbox.bind("", self._eval_plot) def _init_fonts(self, top): # TWhat's our font size (default=same as sysfont) self._size = IntVar(top) self._size.set(20) self._font = Font(family="helvetica", size=-self._size.get()) self._smallfont = Font( family="helvetica", size=-(int(self._size.get() * 14 // 20)) ) def _init_menubar(self, parent): menubar = Menu(parent) filemenu = Menu(menubar, tearoff=0) filemenu.add_command(label="Reset Application", underline=0, command=self.reset) filemenu.add_command( label="Save Current Grammar", underline=0, accelerator="Ctrl-s", command=self.save_grammar, ) filemenu.add_command( label="Load Grammar", underline=0, accelerator="Ctrl-o", command=self.load_grammar, ) filemenu.add_command( label="Save Grammar History", underline=13, command=self.save_history ) filemenu.add_command( label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-q" ) menubar.add_cascade(label="File", underline=0, menu=filemenu) viewmenu = Menu(menubar, tearoff=0) viewmenu.add_radiobutton( label="Tiny", variable=self._size, underline=0, value=10, command=self.resize, ) viewmenu.add_radiobutton( label="Small", variable=self._size, underline=0, value=16, command=self.resize, ) viewmenu.add_radiobutton( label="Medium", variable=self._size, underline=0, value=20, command=self.resize, ) viewmenu.add_radiobutton( label="Large", variable=self._size, underline=0, value=24, command=self.resize, ) viewmenu.add_radiobutton( label="Huge", variable=self._size, underline=0, value=34, command=self.resize, ) menubar.add_cascade(label="View", underline=0, menu=viewmenu) devsetmenu = Menu(menubar, tearoff=0) devsetmenu.add_radiobutton( label="50 sentences", variable=self._devset_size, value=50, command=self.set_devset_size, ) devsetmenu.add_radiobutton( label="100 sentences", variable=self._devset_size, value=100, command=self.set_devset_size, ) devsetmenu.add_radiobutton( label="200 sentences", variable=self._devset_size, value=200, command=self.set_devset_size, ) devsetmenu.add_radiobutton( label="500 sentences", variable=self._devset_size, value=500, command=self.set_devset_size, ) menubar.add_cascade(label="Development-Set", underline=0, menu=devsetmenu) helpmenu = Menu(menubar, tearoff=0) helpmenu.add_command(label="About", underline=0, command=self.about) menubar.add_cascade(label="Help", underline=0, menu=helpmenu) parent.config(menu=menubar) def toggle_show_trace(self, *e): if self._showing_trace: self.show_devset() else: self.show_trace() return "break" _SCALE_N = 5 # center on the last 5 examples. _DRAW_LINES = False def _eval_plot(self, *e, **config): width = config.get("width", self.evalbox.winfo_width()) height = config.get("height", self.evalbox.winfo_height()) # Clear the canvas self.evalbox.delete("all") # Draw the precision & recall labels. 
tag = self.evalbox.create_text( 10, height // 2 - 10, justify="left", anchor="w", text="Precision" ) left, right = self.evalbox.bbox(tag)[2] + 5, width - 10 tag = self.evalbox.create_text( left + (width - left) // 2, height - 10, anchor="s", text="Recall", justify="center", ) top, bot = 10, self.evalbox.bbox(tag)[1] - 10 # Draw masks for clipping the plot. bg = self._EVALBOX_PARAMS["background"] self.evalbox.lower( self.evalbox.create_rectangle(0, 0, left - 1, 5000, fill=bg, outline=bg) ) self.evalbox.lower( self.evalbox.create_rectangle(0, bot + 1, 5000, 5000, fill=bg, outline=bg) ) # Calculate the plot's scale. if self._autoscale.get() and len(self._history) > 1: max_precision = max_recall = 0 min_precision = min_recall = 1 for i in range(1, min(len(self._history), self._SCALE_N + 1)): grammar, precision, recall, fmeasure = self._history[-i] min_precision = min(precision, min_precision) min_recall = min(recall, min_recall) max_precision = max(precision, max_precision) max_recall = max(recall, max_recall) # if max_precision-min_precision > max_recall-min_recall: # min_recall -= (max_precision-min_precision)/2 # max_recall += (max_precision-min_precision)/2 # else: # min_precision -= (max_recall-min_recall)/2 # max_precision += (max_recall-min_recall)/2 # if min_recall < 0: # max_recall -= min_recall # min_recall = 0 # if min_precision < 0: # max_precision -= min_precision # min_precision = 0 min_precision = max(min_precision - 0.01, 0) min_recall = max(min_recall - 0.01, 0) max_precision = min(max_precision + 0.01, 1) max_recall = min(max_recall + 0.01, 1) else: min_precision = min_recall = 0 max_precision = max_recall = 1 # Draw the axis lines & grid lines for i in range(11): x = left + (right - left) * ( (i / 10.0 - min_recall) / (max_recall - min_recall) ) y = bot - (bot - top) * ( (i / 10.0 - min_precision) / (max_precision - min_precision) ) if left < x < right: self.evalbox.create_line(x, top, x, bot, fill="#888") if top < y < bot: self.evalbox.create_line(left, y, right, y, fill="#888") self.evalbox.create_line(left, top, left, bot) self.evalbox.create_line(left, bot, right, bot) # Display the plot's scale self.evalbox.create_text( left - 3, bot, justify="right", anchor="se", text="%d%%" % (100 * min_precision), ) self.evalbox.create_text( left - 3, top, justify="right", anchor="ne", text="%d%%" % (100 * max_precision), ) self.evalbox.create_text( left, bot + 3, justify="center", anchor="nw", text="%d%%" % (100 * min_recall), ) self.evalbox.create_text( right, bot + 3, justify="center", anchor="ne", text="%d%%" % (100 * max_recall), ) # Display the scores. 
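        # Each (precision, recall) point in the history is mapped linearly into
        # the plot rectangle: e.g. with min_recall=0, max_recall=1, left=50 and
        # right=250 (illustrative numbers only), a recall of 0.75 lands at
        # x = 50 + (250 - 50) * 0.75 = 200.  Precision is mapped the same way
        # onto the y axis, inverted because canvas y coordinates grow downwards.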
prev_x = prev_y = None for i, (_, precision, recall, fscore) in enumerate(self._history): x = left + (right - left) * ( (recall - min_recall) / (max_recall - min_recall) ) y = bot - (bot - top) * ( (precision - min_precision) / (max_precision - min_precision) ) if i == self._history_index: self.evalbox.create_oval( x - 2, y - 2, x + 2, y + 2, fill="#0f0", outline="#000" ) self.status["text"] = ( "Precision: %.2f%%\t" % (precision * 100) + "Recall: %.2f%%\t" % (recall * 100) + "F-score: %.2f%%" % (fscore * 100) ) else: self.evalbox.lower( self.evalbox.create_oval( x - 2, y - 2, x + 2, y + 2, fill="#afa", outline="#8c8" ) ) if prev_x is not None and self._eval_lines.get(): self.evalbox.lower( self.evalbox.create_line(prev_x, prev_y, x, y, fill="#8c8") ) prev_x, prev_y = x, y _eval_demon_running = False def _eval_demon(self): if self.top is None: return if self.chunker is None: self._eval_demon_running = False return # Note our starting time. t0 = time.time() # If are still typing, then wait for them to finish. if ( time.time() - self._last_keypress < self._EVAL_DELAY and self.normalized_grammar != self._eval_normalized_grammar ): self._eval_demon_running = True return self.top.after(int(self._EVAL_FREQ * 1000), self._eval_demon) # If the grammar changed, restart the evaluation. if self.normalized_grammar != self._eval_normalized_grammar: # Check if we've seen this grammar already. If so, then # just use the old evaluation values. for (g, p, r, f) in self._history: if self.normalized_grammar == self.normalize_grammar(g): self._history.append((g, p, r, f)) self._history_index = len(self._history) - 1 self._eval_plot() self._eval_demon_running = False self._eval_normalized_grammar = None return self._eval_index = 0 self._eval_score = ChunkScore(chunk_label=self._chunk_label) self._eval_grammar = self.grammar self._eval_normalized_grammar = self.normalized_grammar # If the grammar is empty, the don't bother evaluating it, or # recording it in history -- the score will just be 0. if self.normalized_grammar.strip() == "": # self._eval_index = self._devset_size.get() self._eval_demon_running = False return # Score the next set of examples for gold in self.devset[ self._eval_index : min( self._eval_index + self._EVAL_CHUNK, self._devset_size.get() ) ]: guess = self._chunkparse(gold.leaves()) self._eval_score.score(gold, guess) # update our index in the devset. self._eval_index += self._EVAL_CHUNK # Check if we're done if self._eval_index >= self._devset_size.get(): self._history.append( ( self._eval_grammar, self._eval_score.precision(), self._eval_score.recall(), self._eval_score.f_measure(), ) ) self._history_index = len(self._history) - 1 self._eval_plot() self._eval_demon_running = False self._eval_normalized_grammar = None else: progress = 100 * self._eval_index / self._devset_size.get() self.status["text"] = "Evaluating on Development Set (%d%%)" % progress self._eval_demon_running = True self._adaptively_modify_eval_chunk(time.time() - t0) self.top.after(int(self._EVAL_FREQ * 1000), self._eval_demon) def _adaptively_modify_eval_chunk(self, t): """ Modify _EVAL_CHUNK to try to keep the amount of time that the eval demon takes between _EVAL_DEMON_MIN and _EVAL_DEMON_MAX. :param t: The amount of time that the eval demon took. 
""" if t > self._EVAL_DEMON_MAX and self._EVAL_CHUNK > 5: self._EVAL_CHUNK = min( self._EVAL_CHUNK - 1, max( int(self._EVAL_CHUNK * (self._EVAL_DEMON_MAX / t)), self._EVAL_CHUNK - 10, ), ) elif t < self._EVAL_DEMON_MIN: self._EVAL_CHUNK = max( self._EVAL_CHUNK + 1, min( int(self._EVAL_CHUNK * (self._EVAL_DEMON_MIN / t)), self._EVAL_CHUNK + 10, ), ) def _init_widgets(self, top): frame0 = Frame(top, **self._FRAME_PARAMS) frame0.grid_columnconfigure(0, weight=4) frame0.grid_columnconfigure(3, weight=2) frame0.grid_rowconfigure(1, weight=1) frame0.grid_rowconfigure(5, weight=1) # The grammar self.grammarbox = Text(frame0, font=self._font, **self._GRAMMARBOX_PARAMS) self.grammarlabel = Label( frame0, font=self._font, text="Grammar:", highlightcolor="black", background=self._GRAMMARBOX_PARAMS["background"], ) self.grammarlabel.grid(column=0, row=0, sticky="SW") self.grammarbox.grid(column=0, row=1, sticky="NEWS") # Scroll bar for grammar grammar_scrollbar = Scrollbar(frame0, command=self.grammarbox.yview) grammar_scrollbar.grid(column=1, row=1, sticky="NWS") self.grammarbox.config(yscrollcommand=grammar_scrollbar.set) # grammar buttons bg = self._FRAME_PARAMS["background"] frame3 = Frame(frame0, background=bg) frame3.grid(column=0, row=2, sticky="EW") Button( frame3, text="Prev Grammar", command=self._history_prev, **self._BUTTON_PARAMS, ).pack(side="left") Button( frame3, text="Next Grammar", command=self._history_next, **self._BUTTON_PARAMS, ).pack(side="left") # Help box self.helpbox = Text(frame0, font=self._smallfont, **self._HELPBOX_PARAMS) self.helpbox.grid(column=3, row=1, sticky="NEWS") self.helptabs = {} bg = self._FRAME_PARAMS["background"] helptab_frame = Frame(frame0, background=bg) helptab_frame.grid(column=3, row=0, sticky="SW") for i, (tab, tabstops, text) in enumerate(self.HELP): label = Label(helptab_frame, text=tab, font=self._smallfont) label.grid(column=i * 2, row=0, sticky="S") # help_frame.grid_columnconfigure(i, weight=1) # label.pack(side='left') label.bind("", lambda e, tab=tab: self.show_help(tab)) self.helptabs[tab] = label Frame( helptab_frame, height=1, width=self._HELPTAB_SPACER, background=bg ).grid(column=i * 2 + 1, row=0) self.helptabs[self.HELP[0][0]].configure(font=self._font) self.helpbox.tag_config("elide", elide=True) for (tag, params) in self.HELP_AUTOTAG: self.helpbox.tag_config("tag-%s" % tag, **params) self.show_help(self.HELP[0][0]) # Scroll bar for helpbox help_scrollbar = Scrollbar(frame0, command=self.helpbox.yview) self.helpbox.config(yscrollcommand=help_scrollbar.set) help_scrollbar.grid(column=4, row=1, sticky="NWS") # The dev set frame4 = Frame(frame0, background=self._FRAME_PARAMS["background"]) self.devsetbox = Text(frame4, font=self._font, **self._DEVSETBOX_PARAMS) self.devsetbox.pack(expand=True, fill="both") self.devsetlabel = Label( frame0, font=self._font, text="Development Set:", justify="right", background=self._DEVSETBOX_PARAMS["background"], ) self.devsetlabel.grid(column=0, row=4, sticky="SW") frame4.grid(column=0, row=5, sticky="NEWS") # dev set scrollbars self.devset_scroll = Scrollbar(frame0, command=self._devset_scroll) self.devset_scroll.grid(column=1, row=5, sticky="NWS") self.devset_xscroll = Scrollbar( frame4, command=self.devsetbox.xview, orient="horiz" ) self.devsetbox["xscrollcommand"] = self.devset_xscroll.set self.devset_xscroll.pack(side="bottom", fill="x") # dev set buttons bg = self._FRAME_PARAMS["background"] frame1 = Frame(frame0, background=bg) frame1.grid(column=0, row=7, sticky="EW") Button( frame1, text="Prev 
Example (Ctrl-p)", command=self._devset_prev, **self._BUTTON_PARAMS, ).pack(side="left") Button( frame1, text="Next Example (Ctrl-n)", command=self._devset_next, **self._BUTTON_PARAMS, ).pack(side="left") self.devset_button = Button( frame1, text="Show example", command=self.show_devset, state="disabled", **self._BUTTON_PARAMS, ) self.devset_button.pack(side="right") self.trace_button = Button( frame1, text="Show trace", command=self.show_trace, **self._BUTTON_PARAMS ) self.trace_button.pack(side="right") # evaluation box self.evalbox = Canvas(frame0, **self._EVALBOX_PARAMS) label = Label( frame0, font=self._font, text="Evaluation:", justify="right", background=self._EVALBOX_PARAMS["background"], ) label.grid(column=3, row=4, sticky="SW") self.evalbox.grid(column=3, row=5, sticky="NEWS", columnspan=2) # evaluation box buttons bg = self._FRAME_PARAMS["background"] frame2 = Frame(frame0, background=bg) frame2.grid(column=3, row=7, sticky="EW") self._autoscale = IntVar(self.top) self._autoscale.set(False) Checkbutton( frame2, variable=self._autoscale, command=self._eval_plot, text="Zoom", **self._BUTTON_PARAMS, ).pack(side="left") self._eval_lines = IntVar(self.top) self._eval_lines.set(False) Checkbutton( frame2, variable=self._eval_lines, command=self._eval_plot, text="Lines", **self._BUTTON_PARAMS, ).pack(side="left") Button(frame2, text="History", **self._BUTTON_PARAMS).pack(side="right") # The status label self.status = Label(frame0, font=self._font, **self._STATUS_PARAMS) self.status.grid(column=0, row=9, sticky="NEW", padx=3, pady=2, columnspan=5) # Help box & devset box can't be edited. self.helpbox["state"] = "disabled" self.devsetbox["state"] = "disabled" # Spacers bg = self._FRAME_PARAMS["background"] Frame(frame0, height=10, width=0, background=bg).grid(column=0, row=3) Frame(frame0, height=0, width=10, background=bg).grid(column=2, row=0) Frame(frame0, height=6, width=0, background=bg).grid(column=0, row=8) # pack the frame. 
frame0.pack(fill="both", expand=True) # Set up colors for the devset box self.devsetbox.tag_config("true-pos", background="#afa", underline="True") self.devsetbox.tag_config("false-neg", underline="True", foreground="#800") self.devsetbox.tag_config("false-pos", background="#faa") self.devsetbox.tag_config("trace", foreground="#666", wrap="none") self.devsetbox.tag_config("wrapindent", lmargin2=30, wrap="none") self.devsetbox.tag_config("error", foreground="#800") # And for the grammarbox self.grammarbox.tag_config("error", background="#fec") self.grammarbox.tag_config("comment", foreground="#840") self.grammarbox.tag_config("angle", foreground="#00f") self.grammarbox.tag_config("brace", foreground="#0a0") self.grammarbox.tag_config("hangindent", lmargin1=0, lmargin2=40) _showing_trace = False def show_trace(self, *e): self._showing_trace = True self.trace_button["state"] = "disabled" self.devset_button["state"] = "normal" self.devsetbox["state"] = "normal" # self.devsetbox['wrap'] = 'none' self.devsetbox.delete("1.0", "end") self.devsetlabel["text"] = "Development Set (%d/%d)" % ( (self.devset_index + 1, self._devset_size.get()) ) if self.chunker is None: self.devsetbox.insert("1.0", "Trace: waiting for a valid grammar.") self.devsetbox.tag_add("error", "1.0", "end") return # can't do anything more gold_tree = self.devset[self.devset_index] rules = self.chunker.rules() # Calculate the tag sequence tagseq = "\t" charnum = [1] for wordnum, (word, pos) in enumerate(gold_tree.leaves()): tagseq += "%s " % pos charnum.append(len(tagseq)) self.charnum = { (i, j): charnum[j] for i in range(len(rules) + 1) for j in range(len(charnum)) } self.linenum = {i: i * 2 + 2 for i in range(len(rules) + 1)} for i in range(len(rules) + 1): if i == 0: self.devsetbox.insert("end", "Start:\n") self.devsetbox.tag_add("trace", "end -2c linestart", "end -2c") else: self.devsetbox.insert("end", "Apply %s:\n" % rules[i - 1]) self.devsetbox.tag_add("trace", "end -2c linestart", "end -2c") # Display the tag sequence. self.devsetbox.insert("end", tagseq + "\n") self.devsetbox.tag_add("wrapindent", "end -2c linestart", "end -2c") # Run a partial parser, and extract gold & test chunks chunker = RegexpChunkParser(rules[:i]) test_tree = self._chunkparse(gold_tree.leaves()) gold_chunks = self._chunks(gold_tree) test_chunks = self._chunks(test_tree) # Compare them. for chunk in gold_chunks.intersection(test_chunks): self._color_chunk(i, chunk, "true-pos") for chunk in gold_chunks - test_chunks: self._color_chunk(i, chunk, "false-neg") for chunk in test_chunks - gold_chunks: self._color_chunk(i, chunk, "false-pos") self.devsetbox.insert("end", "Finished.\n") self.devsetbox.tag_add("trace", "end -2c linestart", "end -2c") # This is a hack, because the x-scrollbar isn't updating its # position right -- I'm not sure what the underlying cause is # though. 
(This is on OS X w/ python 2.5) self.top.after(100, self.devset_xscroll.set, 0, 0.3) def show_help(self, tab): self.helpbox["state"] = "normal" self.helpbox.delete("1.0", "end") for (name, tabstops, text) in self.HELP: if name == tab: text = text.replace( "<>", "\n".join( "\t%s\t%s" % item for item in sorted( list(self.tagset.items()), key=lambda t_w: re.match(r"\w+", t_w[0]) and (0, t_w[0]) or (1, t_w[0]), ) ), ) self.helptabs[name].config(**self._HELPTAB_FG_PARAMS) self.helpbox.config(tabs=tabstops) self.helpbox.insert("1.0", text + "\n" * 20) C = "1.0 + %d chars" for (tag, params) in self.HELP_AUTOTAG: pattern = f"(?s)(<{tag}>)(.*?)()" for m in re.finditer(pattern, text): self.helpbox.tag_add("elide", C % m.start(1), C % m.end(1)) self.helpbox.tag_add( "tag-%s" % tag, C % m.start(2), C % m.end(2) ) self.helpbox.tag_add("elide", C % m.start(3), C % m.end(3)) else: self.helptabs[name].config(**self._HELPTAB_BG_PARAMS) self.helpbox["state"] = "disabled" def _history_prev(self, *e): self._view_history(self._history_index - 1) return "break" def _history_next(self, *e): self._view_history(self._history_index + 1) return "break" def _view_history(self, index): # Bounds & sanity checking: index = max(0, min(len(self._history) - 1, index)) if not self._history: return # Already viewing the requested history item? if index == self._history_index: return # Show the requested grammar. It will get added to _history # only if they edit it (causing self.update() to get run.) self.grammarbox["state"] = "normal" self.grammarbox.delete("1.0", "end") self.grammarbox.insert("end", self._history[index][0]) self.grammarbox.mark_set("insert", "1.0") self._history_index = index self._syntax_highlight_grammar(self._history[index][0]) # Record the normalized grammar & regenerate the chunker. self.normalized_grammar = self.normalize_grammar(self._history[index][0]) if self.normalized_grammar: rules = [ RegexpChunkRule.fromstring(line) for line in self.normalized_grammar.split("\n") ] else: rules = [] self.chunker = RegexpChunkParser(rules) # Show the score. self._eval_plot() # Update the devset box self._highlight_devset() if self._showing_trace: self.show_trace() # Update the grammar label if self._history_index < len(self._history) - 1: self.grammarlabel["text"] = "Grammar {}/{}:".format( self._history_index + 1, len(self._history), ) else: self.grammarlabel["text"] = "Grammar:" def _devset_next(self, *e): self._devset_scroll("scroll", 1, "page") return "break" def _devset_prev(self, *e): self._devset_scroll("scroll", -1, "page") return "break" def destroy(self, *e): if self.top is None: return self.top.destroy() self.top = None def _devset_scroll(self, command, *args): N = 1 # size of a page -- one sentence. showing_trace = self._showing_trace if command == "scroll" and args[1].startswith("unit"): self.show_devset(self.devset_index + int(args[0])) elif command == "scroll" and args[1].startswith("page"): self.show_devset(self.devset_index + N * int(args[0])) elif command == "moveto": self.show_devset(int(float(args[0]) * self._devset_size.get())) else: assert 0, f"bad scroll command {command} {args}" if showing_trace: self.show_trace() def show_devset(self, index=None): if index is None: index = self.devset_index # Bounds checking index = min(max(0, index), self._devset_size.get() - 1) if index == self.devset_index and not self._showing_trace: return self.devset_index = index self._showing_trace = False self.trace_button["state"] = "normal" self.devset_button["state"] = "disabled" # Clear the text box. 
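        # The redraw below also rebuilds self.linenum and self.charnum, which
        # map (sentence, word) positions to line.column indices in the Text
        # widget; _color_chunk() and _highlight_devset() rely on these to tag
        # chunk spans as true/false positives and false negatives.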
self.devsetbox["state"] = "normal" self.devsetbox["wrap"] = "word" self.devsetbox.delete("1.0", "end") self.devsetlabel["text"] = "Development Set (%d/%d)" % ( (self.devset_index + 1, self._devset_size.get()) ) # Add the sentences sample = self.devset[self.devset_index : self.devset_index + 1] self.charnum = {} self.linenum = {0: 1} for sentnum, sent in enumerate(sample): linestr = "" for wordnum, (word, pos) in enumerate(sent.leaves()): self.charnum[sentnum, wordnum] = len(linestr) linestr += f"{word}/{pos} " self.charnum[sentnum, wordnum + 1] = len(linestr) self.devsetbox.insert("end", linestr[:-1] + "\n\n") # Highlight chunks in the dev set if self.chunker is not None: self._highlight_devset() self.devsetbox["state"] = "disabled" # Update the scrollbar first = self.devset_index / self._devset_size.get() last = (self.devset_index + 2) / self._devset_size.get() self.devset_scroll.set(first, last) def _chunks(self, tree): chunks = set() wordnum = 0 for child in tree: if isinstance(child, Tree): if child.label() == self._chunk_label: chunks.add((wordnum, wordnum + len(child))) wordnum += len(child) else: wordnum += 1 return chunks def _syntax_highlight_grammar(self, grammar): if self.top is None: return self.grammarbox.tag_remove("comment", "1.0", "end") self.grammarbox.tag_remove("angle", "1.0", "end") self.grammarbox.tag_remove("brace", "1.0", "end") self.grammarbox.tag_add("hangindent", "1.0", "end") for lineno, line in enumerate(grammar.split("\n")): if not line.strip(): continue m = re.match(r"(\\.|[^#])*(#.*)?", line) comment_start = None if m.group(2): comment_start = m.start(2) s = "%d.%d" % (lineno + 1, m.start(2)) e = "%d.%d" % (lineno + 1, m.end(2)) self.grammarbox.tag_add("comment", s, e) for m in re.finditer("[<>{}]", line): if comment_start is not None and m.start() >= comment_start: break s = "%d.%d" % (lineno + 1, m.start()) e = "%d.%d" % (lineno + 1, m.end()) if m.group() in "<>": self.grammarbox.tag_add("angle", s, e) else: self.grammarbox.tag_add("brace", s, e) def _grammarcheck(self, grammar): if self.top is None: return self.grammarbox.tag_remove("error", "1.0", "end") self._grammarcheck_errs = [] for lineno, line in enumerate(grammar.split("\n")): line = re.sub(r"((\\.|[^#])*)(#.*)?", r"\1", line) line = line.strip() if line: try: RegexpChunkRule.fromstring(line) except ValueError as e: self.grammarbox.tag_add( "error", "%s.0" % (lineno + 1), "%s.0 lineend" % (lineno + 1) ) self.status["text"] = "" def update(self, *event): # Record when update was called (for grammarcheck) if event: self._last_keypress = time.time() # Read the grammar from the Text box. self.grammar = grammar = self.grammarbox.get("1.0", "end") # If the grammar hasn't changed, do nothing: normalized_grammar = self.normalize_grammar(grammar) if normalized_grammar == self.normalized_grammar: return else: self.normalized_grammar = normalized_grammar # If the grammar has changed, and we're looking at history, # then stop looking at history. if self._history_index < len(self._history) - 1: self.grammarlabel["text"] = "Grammar:" self._syntax_highlight_grammar(grammar) # The grammar has changed; try parsing it. If it doesn't # parse, do nothing. (flag error location?) try: # Note: the normalized grammar has no blank lines. if normalized_grammar: rules = [ RegexpChunkRule.fromstring(line) for line in normalized_grammar.split("\n") ] else: rules = [] except ValueError as e: # Use the un-normalized grammar for error highlighting. 
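            # (normalize_grammar() strips comments and blank lines, so line
            # numbers in the normalized copy no longer match the text widget;
            # _grammarcheck() therefore re-parses the raw text line by line and
            # marks the rule that RegexpChunkRule.fromstring() rejects with a
            # ValueError -- for instance, a rule with unbalanced braces.)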
self._grammarcheck(grammar) self.chunker = None return self.chunker = RegexpChunkParser(rules) self.grammarbox.tag_remove("error", "1.0", "end") self.grammar_changed = time.time() # Display the results if self._showing_trace: self.show_trace() else: self._highlight_devset() # Start the eval demon if not self._eval_demon_running: self._eval_demon() def _highlight_devset(self, sample=None): if sample is None: sample = self.devset[self.devset_index : self.devset_index + 1] self.devsetbox.tag_remove("true-pos", "1.0", "end") self.devsetbox.tag_remove("false-neg", "1.0", "end") self.devsetbox.tag_remove("false-pos", "1.0", "end") # Run the grammar on the test cases. for sentnum, gold_tree in enumerate(sample): # Run the chunk parser test_tree = self._chunkparse(gold_tree.leaves()) # Extract gold & test chunks gold_chunks = self._chunks(gold_tree) test_chunks = self._chunks(test_tree) # Compare them. for chunk in gold_chunks.intersection(test_chunks): self._color_chunk(sentnum, chunk, "true-pos") for chunk in gold_chunks - test_chunks: self._color_chunk(sentnum, chunk, "false-neg") for chunk in test_chunks - gold_chunks: self._color_chunk(sentnum, chunk, "false-pos") def _chunkparse(self, words): try: return self.chunker.parse(words) except (ValueError, IndexError) as e: # There's an error somewhere in the grammar, but we're not sure # exactly where, so just mark the whole grammar as bad. # E.g., this is caused by: "({})" self.grammarbox.tag_add("error", "1.0", "end") # Treat it as tagging nothing: return words def _color_chunk(self, sentnum, chunk, tag): start, end = chunk self.devsetbox.tag_add( tag, f"{self.linenum[sentnum]}.{self.charnum[sentnum, start]}", f"{self.linenum[sentnum]}.{self.charnum[sentnum, end] - 1}", ) def reset(self): # Clear various variables self.chunker = None self.grammar = None self.normalized_grammar = None self.grammar_changed = 0 self._history = [] self._history_index = 0 # Update the on-screen display. 
self.grammarbox.delete("1.0", "end") self.show_devset(0) self.update() # self._eval_plot() SAVE_GRAMMAR_TEMPLATE = ( "# Regexp Chunk Parsing Grammar\n" "# Saved %(date)s\n" "#\n" "# Development set: %(devset)s\n" "# Precision: %(precision)s\n" "# Recall: %(recall)s\n" "# F-score: %(fscore)s\n\n" "%(grammar)s\n" ) def save_grammar(self, filename=None): if not filename: ftypes = [("Chunk Gramamr", ".chunk"), ("All files", "*")] filename = asksaveasfilename(filetypes=ftypes, defaultextension=".chunk") if not filename: return if self._history and self.normalized_grammar == self.normalize_grammar( self._history[-1][0] ): precision, recall, fscore = ( "%.2f%%" % (100 * v) for v in self._history[-1][1:] ) elif self.chunker is None: precision = recall = fscore = "Grammar not well formed" else: precision = recall = fscore = "Not finished evaluation yet" with open(filename, "w") as outfile: outfile.write( self.SAVE_GRAMMAR_TEMPLATE % dict( date=time.ctime(), devset=self.devset_name, precision=precision, recall=recall, fscore=fscore, grammar=self.grammar.strip(), ) ) def load_grammar(self, filename=None): if not filename: ftypes = [("Chunk Gramamr", ".chunk"), ("All files", "*")] filename = askopenfilename(filetypes=ftypes, defaultextension=".chunk") if not filename: return self.grammarbox.delete("1.0", "end") self.update() with open(filename) as infile: grammar = infile.read() grammar = re.sub( r"^\# Regexp Chunk Parsing Grammar[\s\S]*" "F-score:.*\n", "", grammar ).lstrip() self.grammarbox.insert("1.0", grammar) self.update() def save_history(self, filename=None): if not filename: ftypes = [("Chunk Gramamr History", ".txt"), ("All files", "*")] filename = asksaveasfilename(filetypes=ftypes, defaultextension=".txt") if not filename: return with open(filename, "w") as outfile: outfile.write("# Regexp Chunk Parsing Grammar History\n") outfile.write("# Saved %s\n" % time.ctime()) outfile.write("# Development set: %s\n" % self.devset_name) for i, (g, p, r, f) in enumerate(self._history): hdr = ( "Grammar %d/%d (precision=%.2f%%, recall=%.2f%%, " "fscore=%.2f%%)" % (i + 1, len(self._history), p * 100, r * 100, f * 100) ) outfile.write("\n%s\n" % hdr) outfile.write("".join(" %s\n" % line for line in g.strip().split())) if not ( self._history and self.normalized_grammar == self.normalize_grammar(self._history[-1][0]) ): if self.chunker is None: outfile.write("\nCurrent Grammar (not well-formed)\n") else: outfile.write("\nCurrent Grammar (not evaluated)\n") outfile.write( "".join(" %s\n" % line for line in self.grammar.strip().split()) ) def about(self, *e): ABOUT = "NLTK RegExp Chunk Parser Application\n" + "Written by Edward Loper" TITLE = "About: Regular Expression Chunk Parser Application" try: from tkinter.messagebox import Message Message(message=ABOUT, title=TITLE).show() except: ShowText(self.top, TITLE, ABOUT) def set_devset_size(self, size=None): if size is not None: self._devset_size.set(size) self._devset_size.set(min(len(self.devset), self._devset_size.get())) self.show_devset(1) self.show_devset(0) # what about history? Evaluated at diff dev set sizes! def resize(self, size=None): if size is not None: self._size.set(size) size = self._size.get() self._font.configure(size=-(abs(size))) self._smallfont.configure(size=min(-10, -(abs(size)) * 14 // 20)) def mainloop(self, *args, **kwargs): """ Enter the Tkinter mainloop. This function must be called if this demo is created from a non-interactive program (e.g. from a secript); otherwise, the demo will close as soon as the script completes. 
""" if in_idle(): return self.top.mainloop(*args, **kwargs) def app(): RegexpChunkApp().mainloop() if __name__ == "__main__": app() __all__ = ["app"] nltk-3.7/nltk/app/collocations_app.py000066400000000000000000000336221420073152400177720ustar00rootroot00000000000000# Natural Language Toolkit: Collocations Application # Much of the GUI code is imported from concordance.py; We intend to merge these tools together # Copyright (C) 2001-2022 NLTK Project # Author: Sumukh Ghodke # URL: # For license information, see LICENSE.TXT # import queue as q import threading from tkinter import ( END, LEFT, SUNKEN, Button, Frame, IntVar, Label, Menu, OptionMenu, Scrollbar, StringVar, Text, Tk, ) from tkinter.font import Font from nltk.corpus import ( alpino, brown, cess_cat, cess_esp, floresta, indian, mac_morpho, machado, nps_chat, sinica_treebank, treebank, ) from nltk.probability import FreqDist from nltk.util import in_idle CORPUS_LOADED_EVENT = "<>" ERROR_LOADING_CORPUS_EVENT = "<>" POLL_INTERVAL = 100 _DEFAULT = "English: Brown Corpus (Humor)" _CORPORA = { "Catalan: CESS-CAT Corpus": lambda: cess_cat.words(), "English: Brown Corpus": lambda: brown.words(), "English: Brown Corpus (Press)": lambda: brown.words( categories=["news", "editorial", "reviews"] ), "English: Brown Corpus (Religion)": lambda: brown.words(categories="religion"), "English: Brown Corpus (Learned)": lambda: brown.words(categories="learned"), "English: Brown Corpus (Science Fiction)": lambda: brown.words( categories="science_fiction" ), "English: Brown Corpus (Romance)": lambda: brown.words(categories="romance"), "English: Brown Corpus (Humor)": lambda: brown.words(categories="humor"), "English: NPS Chat Corpus": lambda: nps_chat.words(), "English: Wall Street Journal Corpus": lambda: treebank.words(), "Chinese: Sinica Corpus": lambda: sinica_treebank.words(), "Dutch: Alpino Corpus": lambda: alpino.words(), "Hindi: Indian Languages Corpus": lambda: indian.words(files="hindi.pos"), "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.words(), "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.words(), "Portuguese: Machado Corpus (Brazil)": lambda: machado.words(), "Spanish: CESS-ESP Corpus": lambda: cess_esp.words(), } class CollocationsView: _BACKGROUND_COLOUR = "#FFF" # white def __init__(self): self.queue = q.Queue() self.model = CollocationsModel(self.queue) self.top = Tk() self._init_top(self.top) self._init_menubar() self._init_widgets(self.top) self.load_corpus(self.model.DEFAULT_CORPUS) self.after = self.top.after(POLL_INTERVAL, self._poll) def _init_top(self, top): top.geometry("550x650+50+50") top.title("NLTK Collocations List") top.bind("", self.destroy) top.protocol("WM_DELETE_WINDOW", self.destroy) top.minsize(550, 650) def _init_widgets(self, parent): self.main_frame = Frame( parent, dict(background=self._BACKGROUND_COLOUR, padx=1, pady=1, border=1) ) self._init_corpus_select(self.main_frame) self._init_results_box(self.main_frame) self._init_paging(self.main_frame) self._init_status(self.main_frame) self.main_frame.pack(fill="both", expand=True) def _init_corpus_select(self, parent): innerframe = Frame(parent, background=self._BACKGROUND_COLOUR) self.var = StringVar(innerframe) self.var.set(self.model.DEFAULT_CORPUS) Label( innerframe, justify=LEFT, text=" Corpus: ", background=self._BACKGROUND_COLOUR, padx=2, pady=1, border=0, ).pack(side="left") other_corpora = list(self.model.CORPORA.keys()).remove( self.model.DEFAULT_CORPUS ) om = OptionMenu( innerframe, self.var, self.model.DEFAULT_CORPUS, 
command=self.corpus_selected, *self.model.non_default_corpora() ) om["borderwidth"] = 0 om["highlightthickness"] = 1 om.pack(side="left") innerframe.pack(side="top", fill="x", anchor="n") def _init_status(self, parent): self.status = Label( parent, justify=LEFT, relief=SUNKEN, background=self._BACKGROUND_COLOUR, border=0, padx=1, pady=0, ) self.status.pack(side="top", anchor="sw") def _init_menubar(self): self._result_size = IntVar(self.top) menubar = Menu(self.top) filemenu = Menu(menubar, tearoff=0, borderwidth=0) filemenu.add_command( label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-q" ) menubar.add_cascade(label="File", underline=0, menu=filemenu) editmenu = Menu(menubar, tearoff=0) rescntmenu = Menu(editmenu, tearoff=0) rescntmenu.add_radiobutton( label="20", variable=self._result_size, underline=0, value=20, command=self.set_result_size, ) rescntmenu.add_radiobutton( label="50", variable=self._result_size, underline=0, value=50, command=self.set_result_size, ) rescntmenu.add_radiobutton( label="100", variable=self._result_size, underline=0, value=100, command=self.set_result_size, ) rescntmenu.invoke(1) editmenu.add_cascade(label="Result Count", underline=0, menu=rescntmenu) menubar.add_cascade(label="Edit", underline=0, menu=editmenu) self.top.config(menu=menubar) def set_result_size(self, **kwargs): self.model.result_count = self._result_size.get() def _init_results_box(self, parent): innerframe = Frame(parent) i1 = Frame(innerframe) i2 = Frame(innerframe) vscrollbar = Scrollbar(i1, borderwidth=1) hscrollbar = Scrollbar(i2, borderwidth=1, orient="horiz") self.results_box = Text( i1, font=Font(family="courier", size="16"), state="disabled", borderwidth=1, yscrollcommand=vscrollbar.set, xscrollcommand=hscrollbar.set, wrap="none", width="40", height="20", exportselection=1, ) self.results_box.pack(side="left", fill="both", expand=True) vscrollbar.pack(side="left", fill="y", anchor="e") vscrollbar.config(command=self.results_box.yview) hscrollbar.pack(side="left", fill="x", expand=True, anchor="w") hscrollbar.config(command=self.results_box.xview) # there is no other way of avoiding the overlap of scrollbars while using pack layout manager!!! 
Label(i2, text=" ", background=self._BACKGROUND_COLOUR).pack( side="left", anchor="e" ) i1.pack(side="top", fill="both", expand=True, anchor="n") i2.pack(side="bottom", fill="x", anchor="s") innerframe.pack(side="top", fill="both", expand=True) def _init_paging(self, parent): innerframe = Frame(parent, background=self._BACKGROUND_COLOUR) self.prev = prev = Button( innerframe, text="Previous", command=self.previous, width="10", borderwidth=1, highlightthickness=1, state="disabled", ) prev.pack(side="left", anchor="center") self.next = next = Button( innerframe, text="Next", command=self.__next__, width="10", borderwidth=1, highlightthickness=1, state="disabled", ) next.pack(side="right", anchor="center") innerframe.pack(side="top", fill="y") self.reset_current_page() def reset_current_page(self): self.current_page = -1 def _poll(self): try: event = self.queue.get(block=False) except q.Empty: pass else: if event == CORPUS_LOADED_EVENT: self.handle_corpus_loaded(event) elif event == ERROR_LOADING_CORPUS_EVENT: self.handle_error_loading_corpus(event) self.after = self.top.after(POLL_INTERVAL, self._poll) def handle_error_loading_corpus(self, event): self.status["text"] = "Error in loading " + self.var.get() self.unfreeze_editable() self.clear_results_box() self.freeze_editable() self.reset_current_page() def handle_corpus_loaded(self, event): self.status["text"] = self.var.get() + " is loaded" self.unfreeze_editable() self.clear_results_box() self.reset_current_page() # self.next() collocations = self.model.next(self.current_page + 1) self.write_results(collocations) self.current_page += 1 def corpus_selected(self, *args): new_selection = self.var.get() self.load_corpus(new_selection) def previous(self): self.freeze_editable() collocations = self.model.prev(self.current_page - 1) self.current_page = self.current_page - 1 self.clear_results_box() self.write_results(collocations) self.unfreeze_editable() def __next__(self): self.freeze_editable() collocations = self.model.next(self.current_page + 1) self.clear_results_box() self.write_results(collocations) self.current_page += 1 self.unfreeze_editable() def load_corpus(self, selection): if self.model.selected_corpus != selection: self.status["text"] = "Loading " + selection + "..." 
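            # The corpus is loaded on a background thread (CollocationsModel.LoadCorpus);
            # the paging buttons stay disabled until that thread posts
            # CORPUS_LOADED_EVENT (or ERROR_LOADING_CORPUS_EVENT) onto the shared
            # queue, which _poll() checks every POLL_INTERVAL milliseconds.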
self.freeze_editable() self.model.load_corpus(selection) def freeze_editable(self): self.prev["state"] = "disabled" self.next["state"] = "disabled" def clear_results_box(self): self.results_box["state"] = "normal" self.results_box.delete("1.0", END) self.results_box["state"] = "disabled" def fire_event(self, event): # Firing an event so that rendering of widgets happen in the mainloop thread self.top.event_generate(event, when="tail") def destroy(self, *e): if self.top is None: return self.top.after_cancel(self.after) self.top.destroy() self.top = None def mainloop(self, *args, **kwargs): if in_idle(): return self.top.mainloop(*args, **kwargs) def unfreeze_editable(self): self.set_paging_button_states() def set_paging_button_states(self): if self.current_page == -1 or self.current_page == 0: self.prev["state"] = "disabled" else: self.prev["state"] = "normal" if self.model.is_last_page(self.current_page): self.next["state"] = "disabled" else: self.next["state"] = "normal" def write_results(self, results): self.results_box["state"] = "normal" row = 1 for each in results: self.results_box.insert(str(row) + ".0", each[0] + " " + each[1] + "\n") row += 1 self.results_box["state"] = "disabled" class CollocationsModel: def __init__(self, queue): self.result_count = None self.selected_corpus = None self.collocations = None self.CORPORA = _CORPORA self.DEFAULT_CORPUS = _DEFAULT self.queue = queue self.reset_results() def reset_results(self): self.result_pages = [] self.results_returned = 0 def load_corpus(self, name): self.selected_corpus = name self.collocations = None runner_thread = self.LoadCorpus(name, self) runner_thread.start() self.reset_results() def non_default_corpora(self): copy = [] copy.extend(list(self.CORPORA.keys())) copy.remove(self.DEFAULT_CORPUS) copy.sort() return copy def is_last_page(self, number): if number < len(self.result_pages): return False return self.results_returned + ( number - len(self.result_pages) ) * self.result_count >= len(self.collocations) def next(self, page): if (len(self.result_pages) - 1) < page: for i in range(page - (len(self.result_pages) - 1)): self.result_pages.append( self.collocations[ self.results_returned : self.results_returned + self.result_count ] ) self.results_returned += self.result_count return self.result_pages[page] def prev(self, page): if page == -1: return [] return self.result_pages[page] class LoadCorpus(threading.Thread): def __init__(self, name, model): threading.Thread.__init__(self) self.model, self.name = model, name def run(self): try: words = self.model.CORPORA[self.name]() from operator import itemgetter text = [w for w in words if len(w) > 2] fd = FreqDist(tuple(text[i : i + 2]) for i in range(len(text) - 1)) vocab = FreqDist(text) scored = [ ((w1, w2), fd[(w1, w2)] ** 3 / (vocab[w1] * vocab[w2])) for w1, w2 in fd ] scored.sort(key=itemgetter(1), reverse=True) self.model.collocations = list(map(itemgetter(0), scored)) self.model.queue.put(CORPUS_LOADED_EVENT) except Exception as e: print(e) self.model.queue.put(ERROR_LOADING_CORPUS_EVENT) # def collocations(): # colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations[:num]] def app(): c = CollocationsView() c.mainloop() if __name__ == "__main__": app() __all__ = ["app"] nltk-3.7/nltk/app/concordance_app.py000077500000000000000000000571551420073152400175710ustar00rootroot00000000000000# Natural Language Toolkit: Concordance Application # # Copyright (C) 2001-2022 NLTK Project # Author: Sumukh Ghodke # URL: # For license information, see LICENSE.TXT import queue as 
q import re import threading from tkinter import ( END, LEFT, SUNKEN, Button, Entry, Frame, IntVar, Label, Menu, OptionMenu, Scrollbar, StringVar, Text, Tk, ) from tkinter.font import Font from nltk.corpus import ( alpino, brown, cess_cat, cess_esp, floresta, indian, mac_morpho, nps_chat, sinica_treebank, treebank, ) from nltk.draw.util import ShowText from nltk.util import in_idle WORD_OR_TAG = "[^/ ]+" BOUNDARY = r"\b" CORPUS_LOADED_EVENT = "<>" SEARCH_TERMINATED_EVENT = "<>" SEARCH_ERROR_EVENT = "<>" ERROR_LOADING_CORPUS_EVENT = "<>" POLL_INTERVAL = 50 # NB All corpora must be specified in a lambda expression so as not to be # loaded when the module is imported. _DEFAULT = "English: Brown Corpus (Humor, simplified)" _CORPORA = { "Catalan: CESS-CAT Corpus (simplified)": lambda: cess_cat.tagged_sents( tagset="universal" ), "English: Brown Corpus": lambda: brown.tagged_sents(), "English: Brown Corpus (simplified)": lambda: brown.tagged_sents( tagset="universal" ), "English: Brown Corpus (Press, simplified)": lambda: brown.tagged_sents( categories=["news", "editorial", "reviews"], tagset="universal" ), "English: Brown Corpus (Religion, simplified)": lambda: brown.tagged_sents( categories="religion", tagset="universal" ), "English: Brown Corpus (Learned, simplified)": lambda: brown.tagged_sents( categories="learned", tagset="universal" ), "English: Brown Corpus (Science Fiction, simplified)": lambda: brown.tagged_sents( categories="science_fiction", tagset="universal" ), "English: Brown Corpus (Romance, simplified)": lambda: brown.tagged_sents( categories="romance", tagset="universal" ), "English: Brown Corpus (Humor, simplified)": lambda: brown.tagged_sents( categories="humor", tagset="universal" ), "English: NPS Chat Corpus": lambda: nps_chat.tagged_posts(), "English: NPS Chat Corpus (simplified)": lambda: nps_chat.tagged_posts( tagset="universal" ), "English: Wall Street Journal Corpus": lambda: treebank.tagged_sents(), "English: Wall Street Journal Corpus (simplified)": lambda: treebank.tagged_sents( tagset="universal" ), "Chinese: Sinica Corpus": lambda: sinica_treebank.tagged_sents(), "Chinese: Sinica Corpus (simplified)": lambda: sinica_treebank.tagged_sents( tagset="universal" ), "Dutch: Alpino Corpus": lambda: alpino.tagged_sents(), "Dutch: Alpino Corpus (simplified)": lambda: alpino.tagged_sents( tagset="universal" ), "Hindi: Indian Languages Corpus": lambda: indian.tagged_sents(files="hindi.pos"), "Hindi: Indian Languages Corpus (simplified)": lambda: indian.tagged_sents( files="hindi.pos", tagset="universal" ), "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.tagged_sents(), "Portuguese: Floresta Corpus (Portugal, simplified)": lambda: floresta.tagged_sents( tagset="universal" ), "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.tagged_sents(), "Portuguese: MAC-MORPHO Corpus (Brazil, simplified)": lambda: mac_morpho.tagged_sents( tagset="universal" ), "Spanish: CESS-ESP Corpus (simplified)": lambda: cess_esp.tagged_sents( tagset="universal" ), } class ConcordanceSearchView: _BACKGROUND_COLOUR = "#FFF" # white # Colour of highlighted results _HIGHLIGHT_WORD_COLOUR = "#F00" # red _HIGHLIGHT_WORD_TAG = "HL_WRD_TAG" _HIGHLIGHT_LABEL_COLOUR = "#C0C0C0" # dark grey _HIGHLIGHT_LABEL_TAG = "HL_LBL_TAG" # Percentage of text left of the scrollbar position _FRACTION_LEFT_TEXT = 0.30 def __init__(self): self.queue = q.Queue() self.model = ConcordanceSearchModel(self.queue) self.top = Tk() self._init_top(self.top) self._init_menubar() self._init_widgets(self.top) 
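        # load_corpus() starts a background thread that flattens each tagged
        # sentence into a single "word/TAG word/TAG ..." string (see
        # ConcordanceSearchModel.LoadCorpus); searches are then ordinary regex
        # matches over those strings, built by SearchCorpus.processed_query().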
self.load_corpus(self.model.DEFAULT_CORPUS) self.after = self.top.after(POLL_INTERVAL, self._poll) def _init_top(self, top): top.geometry("950x680+50+50") top.title("NLTK Concordance Search") top.bind("", self.destroy) top.protocol("WM_DELETE_WINDOW", self.destroy) top.minsize(950, 680) def _init_widgets(self, parent): self.main_frame = Frame( parent, dict(background=self._BACKGROUND_COLOUR, padx=1, pady=1, border=1) ) self._init_corpus_select(self.main_frame) self._init_query_box(self.main_frame) self._init_results_box(self.main_frame) self._init_paging(self.main_frame) self._init_status(self.main_frame) self.main_frame.pack(fill="both", expand=True) def _init_menubar(self): self._result_size = IntVar(self.top) self._cntx_bf_len = IntVar(self.top) self._cntx_af_len = IntVar(self.top) menubar = Menu(self.top) filemenu = Menu(menubar, tearoff=0, borderwidth=0) filemenu.add_command( label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-q" ) menubar.add_cascade(label="File", underline=0, menu=filemenu) editmenu = Menu(menubar, tearoff=0) rescntmenu = Menu(editmenu, tearoff=0) rescntmenu.add_radiobutton( label="20", variable=self._result_size, underline=0, value=20, command=self.set_result_size, ) rescntmenu.add_radiobutton( label="50", variable=self._result_size, underline=0, value=50, command=self.set_result_size, ) rescntmenu.add_radiobutton( label="100", variable=self._result_size, underline=0, value=100, command=self.set_result_size, ) rescntmenu.invoke(1) editmenu.add_cascade(label="Result Count", underline=0, menu=rescntmenu) cntxmenu = Menu(editmenu, tearoff=0) cntxbfmenu = Menu(cntxmenu, tearoff=0) cntxbfmenu.add_radiobutton( label="60 characters", variable=self._cntx_bf_len, underline=0, value=60, command=self.set_cntx_bf_len, ) cntxbfmenu.add_radiobutton( label="80 characters", variable=self._cntx_bf_len, underline=0, value=80, command=self.set_cntx_bf_len, ) cntxbfmenu.add_radiobutton( label="100 characters", variable=self._cntx_bf_len, underline=0, value=100, command=self.set_cntx_bf_len, ) cntxbfmenu.invoke(1) cntxmenu.add_cascade(label="Before", underline=0, menu=cntxbfmenu) cntxafmenu = Menu(cntxmenu, tearoff=0) cntxafmenu.add_radiobutton( label="70 characters", variable=self._cntx_af_len, underline=0, value=70, command=self.set_cntx_af_len, ) cntxafmenu.add_radiobutton( label="90 characters", variable=self._cntx_af_len, underline=0, value=90, command=self.set_cntx_af_len, ) cntxafmenu.add_radiobutton( label="110 characters", variable=self._cntx_af_len, underline=0, value=110, command=self.set_cntx_af_len, ) cntxafmenu.invoke(1) cntxmenu.add_cascade(label="After", underline=0, menu=cntxafmenu) editmenu.add_cascade(label="Context", underline=0, menu=cntxmenu) menubar.add_cascade(label="Edit", underline=0, menu=editmenu) self.top.config(menu=menubar) def set_result_size(self, **kwargs): self.model.result_count = self._result_size.get() def set_cntx_af_len(self, **kwargs): self._char_after = self._cntx_af_len.get() def set_cntx_bf_len(self, **kwargs): self._char_before = self._cntx_bf_len.get() def _init_corpus_select(self, parent): innerframe = Frame(parent, background=self._BACKGROUND_COLOUR) self.var = StringVar(innerframe) self.var.set(self.model.DEFAULT_CORPUS) Label( innerframe, justify=LEFT, text=" Corpus: ", background=self._BACKGROUND_COLOUR, padx=2, pady=1, border=0, ).pack(side="left") other_corpora = list(self.model.CORPORA.keys()).remove( self.model.DEFAULT_CORPUS ) om = OptionMenu( innerframe, self.var, self.model.DEFAULT_CORPUS, 
command=self.corpus_selected, *self.model.non_default_corpora() ) om["borderwidth"] = 0 om["highlightthickness"] = 1 om.pack(side="left") innerframe.pack(side="top", fill="x", anchor="n") def _init_status(self, parent): self.status = Label( parent, justify=LEFT, relief=SUNKEN, background=self._BACKGROUND_COLOUR, border=0, padx=1, pady=0, ) self.status.pack(side="top", anchor="sw") def _init_query_box(self, parent): innerframe = Frame(parent, background=self._BACKGROUND_COLOUR) another = Frame(innerframe, background=self._BACKGROUND_COLOUR) self.query_box = Entry(another, width=60) self.query_box.pack(side="left", fill="x", pady=25, anchor="center") self.search_button = Button( another, text="Search", command=self.search, borderwidth=1, highlightthickness=1, ) self.search_button.pack(side="left", fill="x", pady=25, anchor="center") self.query_box.bind("", self.search_enter_keypress_handler) another.pack() innerframe.pack(side="top", fill="x", anchor="n") def search_enter_keypress_handler(self, *event): self.search() def _init_results_box(self, parent): innerframe = Frame(parent) i1 = Frame(innerframe) i2 = Frame(innerframe) vscrollbar = Scrollbar(i1, borderwidth=1) hscrollbar = Scrollbar(i2, borderwidth=1, orient="horiz") self.results_box = Text( i1, font=Font(family="courier", size="16"), state="disabled", borderwidth=1, yscrollcommand=vscrollbar.set, xscrollcommand=hscrollbar.set, wrap="none", width="40", height="20", exportselection=1, ) self.results_box.pack(side="left", fill="both", expand=True) self.results_box.tag_config( self._HIGHLIGHT_WORD_TAG, foreground=self._HIGHLIGHT_WORD_COLOUR ) self.results_box.tag_config( self._HIGHLIGHT_LABEL_TAG, foreground=self._HIGHLIGHT_LABEL_COLOUR ) vscrollbar.pack(side="left", fill="y", anchor="e") vscrollbar.config(command=self.results_box.yview) hscrollbar.pack(side="left", fill="x", expand=True, anchor="w") hscrollbar.config(command=self.results_box.xview) # there is no other way of avoiding the overlap of scrollbars while using pack layout manager!!! 
Label(i2, text=" ", background=self._BACKGROUND_COLOUR).pack( side="left", anchor="e" ) i1.pack(side="top", fill="both", expand=True, anchor="n") i2.pack(side="bottom", fill="x", anchor="s") innerframe.pack(side="top", fill="both", expand=True) def _init_paging(self, parent): innerframe = Frame(parent, background=self._BACKGROUND_COLOUR) self.prev = prev = Button( innerframe, text="Previous", command=self.previous, width="10", borderwidth=1, highlightthickness=1, state="disabled", ) prev.pack(side="left", anchor="center") self.next = next = Button( innerframe, text="Next", command=self.__next__, width="10", borderwidth=1, highlightthickness=1, state="disabled", ) next.pack(side="right", anchor="center") innerframe.pack(side="top", fill="y") self.current_page = 0 def previous(self): self.clear_results_box() self.freeze_editable() self.model.prev(self.current_page - 1) def __next__(self): self.clear_results_box() self.freeze_editable() self.model.next(self.current_page + 1) def about(self, *e): ABOUT = "NLTK Concordance Search Demo\n" TITLE = "About: NLTK Concordance Search Demo" try: from tkinter.messagebox import Message Message(message=ABOUT, title=TITLE, parent=self.main_frame).show() except: ShowText(self.top, TITLE, ABOUT) def _bind_event_handlers(self): self.top.bind(CORPUS_LOADED_EVENT, self.handle_corpus_loaded) self.top.bind(SEARCH_TERMINATED_EVENT, self.handle_search_terminated) self.top.bind(SEARCH_ERROR_EVENT, self.handle_search_error) self.top.bind(ERROR_LOADING_CORPUS_EVENT, self.handle_error_loading_corpus) def _poll(self): try: event = self.queue.get(block=False) except q.Empty: pass else: if event == CORPUS_LOADED_EVENT: self.handle_corpus_loaded(event) elif event == SEARCH_TERMINATED_EVENT: self.handle_search_terminated(event) elif event == SEARCH_ERROR_EVENT: self.handle_search_error(event) elif event == ERROR_LOADING_CORPUS_EVENT: self.handle_error_loading_corpus(event) self.after = self.top.after(POLL_INTERVAL, self._poll) def handle_error_loading_corpus(self, event): self.status["text"] = "Error in loading " + self.var.get() self.unfreeze_editable() self.clear_all() self.freeze_editable() def handle_corpus_loaded(self, event): self.status["text"] = self.var.get() + " is loaded" self.unfreeze_editable() self.clear_all() self.query_box.focus_set() def handle_search_terminated(self, event): # todo: refactor the model such that it is less state sensitive results = self.model.get_results() self.write_results(results) self.status["text"] = "" if len(results) == 0: self.status["text"] = "No results found for " + self.model.query else: self.current_page = self.model.last_requested_page self.unfreeze_editable() self.results_box.xview_moveto(self._FRACTION_LEFT_TEXT) def handle_search_error(self, event): self.status["text"] = "Error in query " + self.model.query self.unfreeze_editable() def corpus_selected(self, *args): new_selection = self.var.get() self.load_corpus(new_selection) def load_corpus(self, selection): if self.model.selected_corpus != selection: self.status["text"] = "Loading " + selection + "..." 
self.freeze_editable() self.model.load_corpus(selection) def search(self): self.current_page = 0 self.clear_results_box() self.model.reset_results() query = self.query_box.get() if len(query.strip()) == 0: return self.status["text"] = "Searching for " + query self.freeze_editable() self.model.search(query, self.current_page + 1) def write_results(self, results): self.results_box["state"] = "normal" row = 1 for each in results: sent, pos1, pos2 = each[0].strip(), each[1], each[2] if len(sent) != 0: if pos1 < self._char_before: sent, pos1, pos2 = self.pad(sent, pos1, pos2) sentence = sent[pos1 - self._char_before : pos1 + self._char_after] if not row == len(results): sentence += "\n" self.results_box.insert(str(row) + ".0", sentence) word_markers, label_markers = self.words_and_labels(sent, pos1, pos2) for marker in word_markers: self.results_box.tag_add( self._HIGHLIGHT_WORD_TAG, str(row) + "." + str(marker[0]), str(row) + "." + str(marker[1]), ) for marker in label_markers: self.results_box.tag_add( self._HIGHLIGHT_LABEL_TAG, str(row) + "." + str(marker[0]), str(row) + "." + str(marker[1]), ) row += 1 self.results_box["state"] = "disabled" def words_and_labels(self, sentence, pos1, pos2): search_exp = sentence[pos1:pos2] words, labels = [], [] labeled_words = search_exp.split(" ") index = 0 for each in labeled_words: if each == "": index += 1 else: word, label = each.split("/") words.append( (self._char_before + index, self._char_before + index + len(word)) ) index += len(word) + 1 labels.append( (self._char_before + index, self._char_before + index + len(label)) ) index += len(label) index += 1 return words, labels def pad(self, sent, hstart, hend): if hstart >= self._char_before: return sent, hstart, hend d = self._char_before - hstart sent = "".join([" "] * d) + sent return sent, hstart + d, hend + d def destroy(self, *e): if self.top is None: return self.top.after_cancel(self.after) self.top.destroy() self.top = None def clear_all(self): self.query_box.delete(0, END) self.model.reset_query() self.clear_results_box() def clear_results_box(self): self.results_box["state"] = "normal" self.results_box.delete("1.0", END) self.results_box["state"] = "disabled" def freeze_editable(self): self.query_box["state"] = "disabled" self.search_button["state"] = "disabled" self.prev["state"] = "disabled" self.next["state"] = "disabled" def unfreeze_editable(self): self.query_box["state"] = "normal" self.search_button["state"] = "normal" self.set_paging_button_states() def set_paging_button_states(self): if self.current_page == 0 or self.current_page == 1: self.prev["state"] = "disabled" else: self.prev["state"] = "normal" if self.model.has_more_pages(self.current_page): self.next["state"] = "normal" else: self.next["state"] = "disabled" def fire_event(self, event): # Firing an event so that rendering of widgets happen in the mainloop thread self.top.event_generate(event, when="tail") def mainloop(self, *args, **kwargs): if in_idle(): return self.top.mainloop(*args, **kwargs) class ConcordanceSearchModel: def __init__(self, queue): self.queue = queue self.CORPORA = _CORPORA self.DEFAULT_CORPUS = _DEFAULT self.selected_corpus = None self.reset_query() self.reset_results() self.result_count = None self.last_sent_searched = 0 def non_default_corpora(self): copy = [] copy.extend(list(self.CORPORA.keys())) copy.remove(self.DEFAULT_CORPUS) copy.sort() return copy def load_corpus(self, name): self.selected_corpus = name self.tagged_sents = [] runner_thread = self.LoadCorpus(name, self) runner_thread.start() 
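# Editorial aside (illustrative sketch, not part of this module): the concurrency
# pattern used by the view and model in this module, reduced to its core.  A worker
# thread never touches Tk widgets; it puts an event name on a queue.Queue, and the
# GUI polls that queue from the Tk event loop with after(), so every widget update
# happens on the main thread (compare the view's _poll() method above).  All names
# below are invented for the example.
def _queue_polling_sketch():
    import queue
    import threading
    import time
    import tkinter as tk

    DONE_EVENT = "done"
    root = tk.Tk()
    label = tk.Label(root, text="loading ...")
    label.pack()
    events = queue.Queue()

    def worker():
        time.sleep(1.0)         # stand-in for loading a corpus
        events.put(DONE_EVENT)  # notify the GUI thread via the queue

    def poll():
        try:
            event = events.get(block=False)
        except queue.Empty:
            pass
        else:
            if event == DONE_EVENT:
                label["text"] = "corpus loaded"
        root.after(50, poll)    # keep polling, like _poll() above

    threading.Thread(target=worker, daemon=True).start()
    poll()
    root.mainloop()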
def search(self, query, page): self.query = query self.last_requested_page = page self.SearchCorpus(self, page, self.result_count).start() def next(self, page): self.last_requested_page = page if len(self.results) < page: self.search(self.query, page) else: self.queue.put(SEARCH_TERMINATED_EVENT) def prev(self, page): self.last_requested_page = page self.queue.put(SEARCH_TERMINATED_EVENT) def reset_results(self): self.last_sent_searched = 0 self.results = [] self.last_page = None def reset_query(self): self.query = None def set_results(self, page, resultset): self.results.insert(page - 1, resultset) def get_results(self): return self.results[self.last_requested_page - 1] def has_more_pages(self, page): if self.results == [] or self.results[0] == []: return False if self.last_page is None: return True return page < self.last_page class LoadCorpus(threading.Thread): def __init__(self, name, model): threading.Thread.__init__(self) self.model, self.name = model, name def run(self): try: ts = self.model.CORPORA[self.name]() self.model.tagged_sents = [ " ".join(w + "/" + t for (w, t) in sent) for sent in ts ] self.model.queue.put(CORPUS_LOADED_EVENT) except Exception as e: print(e) self.model.queue.put(ERROR_LOADING_CORPUS_EVENT) class SearchCorpus(threading.Thread): def __init__(self, model, page, count): self.model, self.count, self.page = model, count, page threading.Thread.__init__(self) def run(self): q = self.processed_query() sent_pos, i, sent_count = [], 0, 0 for sent in self.model.tagged_sents[self.model.last_sent_searched :]: try: m = re.search(q, sent) except re.error: self.model.reset_results() self.model.queue.put(SEARCH_ERROR_EVENT) return if m: sent_pos.append((sent, m.start(), m.end())) i += 1 if i > self.count: self.model.last_sent_searched += sent_count - 1 break sent_count += 1 if self.count >= len(sent_pos): self.model.last_sent_searched += sent_count - 1 self.model.last_page = self.page self.model.set_results(self.page, sent_pos) else: self.model.set_results(self.page, sent_pos[:-1]) self.model.queue.put(SEARCH_TERMINATED_EVENT) def processed_query(self): new = [] for term in self.model.query.split(): term = re.sub(r"\.", r"[^/ ]", term) if re.match("[A-Z]+$", term): new.append(BOUNDARY + WORD_OR_TAG + "/" + term + BOUNDARY) elif "/" in term: new.append(BOUNDARY + term + BOUNDARY) else: new.append(BOUNDARY + term + "/" + WORD_OR_TAG + BOUNDARY) return " ".join(new) def app(): d = ConcordanceSearchView() d.mainloop() if __name__ == "__main__": app() __all__ = ["app"] nltk-3.7/nltk/app/nemo_app.py000077500000000000000000000275561420073152400162530ustar00rootroot00000000000000# Finding (and Replacing) Nemo, Version 1.1, Aristide Grange 2006/06/06 # https://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/496783 """ Finding (and Replacing) Nemo Instant Regular Expressions Created by Aristide Grange """ import itertools import re from tkinter import SEL_FIRST, SEL_LAST, Frame, Label, PhotoImage, Scrollbar, Text, Tk windowTitle = "Finding (and Replacing) Nemo" initialFind = r"n(.*?)e(.*?)m(.*?)o" initialRepl = r"M\1A\2K\3I" initialText = """\ Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 
""" images = { "FIND": "R0lGODlhMAAiAPcAMf/////37//35//n1v97Off///f/9/f37/fexvfOvfeEQvd7QvdrQvdrKfdaKfdSMfdSIe/v9+/v7+/v5+/n3u/e1u/Wxu/Gre+1lO+tnO+thO+Ua+97Y+97Oe97Me9rOe9rMe9jOe9jMe9jIe9aMefe5+fe3ufezuece+eEWudzQudaIedSIedKMedKIedCKedCId7e1t7Wzt7Oxt7Gvd69vd69rd61pd6ljN6UjN6Ue96EY95zY95rUt5rQt5jMd5SId5KIdbn59be3tbGztbGvda1rdaEa9Z7a9Z7WtZzQtZzOdZzMdZjMdZaQtZSOdZSMdZKMdZCKdZCGNY5Ic7W1s7Oxs7Gtc69xs69tc69rc6tpc6llM6clM6cjM6Ue86EY85zWs5rSs5SKc5KKc5KGMa1tcatrcalvcalnMaUpcZ7c8ZzMcZrUsZrOcZrMcZaQsZSOcZSMcZKMcZCKcZCGMYxIcYxGL3Gxr21tb21rb2lpb2crb2cjL2UnL2UlL2UhL2Ec717Wr17Ur1zWr1rMb1jUr1KMb1KIb1CIb0xGLWlrbWlpbWcnLWEe7V7c7VzY7VzUrVSKbVKMbVCMbVCIbU5KbUxIbUxEK2lta2lpa2clK2UjK2MnK2MlK2Ea617e61za61rY61rMa1jSq1aUq1aSq1SQq1KKa0xEKWlnKWcnKWUnKWUhKWMjKWEa6Vza6VrWqVjMaVaUqVaKaVSMaVCMaU5KaUxIaUxGJyclJyMe5yElJyEhJx7e5x7c5xrOZxaQpxSOZxKQpw5IZSMhJSEjJR7c5Rre5RrY5RrUpRSQpRSKZRCOZRCKZQxKZQxIYyEhIx7hIxza4xzY4xrc4xjUoxaa4xaUoxSSoxKQoxCMYw5GIR7c4Rzc4Rre4RjY4RjWoRaa4RSWoRSUoRSMYRKQoRCOYQ5KYQxIXtra3taY3taSntKOXtCMXtCKXNCMXM5MXMxIWtSUmtKSmtKQmtCOWs5MWs5KWs5IWNCKWMxIVIxKUIQCDkhGAAAACH+AS4ALAAAAAAwACIAAAj/AAEIHEiwoMGDCBMqXMiwoUOHMqxIeEiRoZVp7cpZ29WrF4WKIAd208dGAQEVbiTVChUjZMU9+pYQmPmBZpxgvVw+nDdKwQICNVcIXQEkTgKdDdUJ+/nggVAXK1xI3TEA6UIr2uJ8iBqka1cXXTlkqGoVYRZ7iLyqBSs0iiEtZQVKiDGxBI1u3NR6lUpGDKg8MSgEQCphU7Z22vhg0dILXRCpYLuSCcYJT4wqXASBQaBzU7klHxC127OHD7ZDJFpERqRt0x5OnwQpmZmCLEhrbgg4WIHO1RY+nbQ9WRGEDJlmnXwJ+9FBgXMCIzYMVijBBgYMFxIMqJBMSc0Ht7qh/+Gjpte2rnYsYeNlasWIBgQ6yCewIoPCCp/cyP/wgUGbXVu0QcADZNBDnh98gHMLGXYQUw02w61QU3wdbNWDbQVVIIhMMwFF1DaZiPLBAy7E04kafrjSizaK3LFNNc0AAYRQDsAHHQlJ2IDQJ2zE1+EKDjiAijShkECCC8Qgw4cr7ZgyzC2WaHPNLWWoNeNWPiRAw0QFWQFMhz8C+QQ20yAiVSrY+MGOJCsccsst2GCzoHFxxEGGC+8hgs0MB2kyCpgzrUDCbs1Es41UdtATHFFkWELMOtsoQsYcgvRRQw5RSDgGOjZMR1AvPQIq6KCo9AKOJWDd48owQlHR4DXEKP9iyRrK+DNNBTu4RwIPFeTAGUG7hAomkA84gEg1m6ADljy9PBKGGJY4ig0xlsTBRSn98FOFDUC8pwQOPkgHbCGAzhTkA850s0c7j6Hjix9+gBIrMXLeAccWXUCyiRBcBEECdEJ98KtAqtBCYQc/OvDENnl4gYpUxISCIjjzylkGGV9okYUVNogRhAOBuuAEhjG08wOgDYzAgA5bCjIoCe5uwUk80RKTTSppPREGGGCIISOQ9AXBg6cC6WIywvCpoMHAocRBwhP4bHLFLujYkV42xNxBRhAyGrc113EgYtRBerDDDHMoDCyQEL5sE083EkgwQyBhxGFHMM206DUixGxmE0wssbQjCQ4JCaFKFwgQTVAVVhQUwAVPIFJKrHfYYRwi6OCDzzuIJIFhXAD0EccPsYRiSyqKSDpFcWSMIcZRoBMkQyA2BGZDIKSYcggih8TRRg4VxM5QABVYYLxgwiev/PLMCxQQADs=", "find": "R0lGODlhMAAiAPQAMf////f39+/v7+fn597e3tbW1s7OzsbGxr29vbW1ta2traWlpZycnJSUlIyMjISEhHt7e3Nzc2tra2NjY1paWlJSUkpKSkJCQjk5OSkpKRgYGAAAAAAAAAAAAAAAAAAAACH+AS4ALAAAAAAwACIAAAX/ICCOZGmeaKquY2AGLiuvMCAUBuHWc48Kh0iFInEYCb4kSQCxPBiMxkMigRQEgJiSFVBYHNGG0RiZOHjblWAiiY4fkDhEYoBp06dAWfyAQyKAgAwDaHgnB0RwgYASgQ0IhDuGJDAIFhMRVFSLEX8QCJJ4AQM5AgQHTZqqjBAOCQQEkWkCDRMUFQsICQ4Vm5maEwwHOAsPDTpKMAsUDlO4CssTcb+2DAp8YGCyNFoCEsZwFQ3QDRTTVBRS0g1QbgsCd5QAAwgIBwYFAwStzQ8UEdCKVchky0yVBw7YuXkAKt4IAg74vXHVagqFBRgXSCAyYWAVCH0SNhDTitCJfSL5/4RbAPKPhQYYjVCYYAvCP0BxEDaD8CheAAHNwqh8MMGPSwgLeJWhwHSjqkYI+xg4MMCEgQjtRvZ7UAYCpghMF7CxONOWJkYR+rCpY4JlVpVxKDwYWEactKW9mhYRtqCTgwgWEMArERSK1j5q//6T8KXonFsShpiJkAECgQYVjykooCVA0JGHEWNiYCHThTFeb3UkoiCCBgwGEKQ1kuAJlhFwhA71h5SukwUM5qqeCSGBgicEWkfNiWSERtBad4JNIBaQBaQah1ToyGZBAnsIuIJs1qnqiAIVjIE2gnAB1T5x0icgzXT79ipgMOOEH6HBbREBMJCeGEY08IoLAkzB1YYFwjxwSUGSNULQJnNUwRYlCcyEkALIxECAP9cNMMABYpRhy3ZsSLDaR70oUAiABGCkAxowCGCAAfDYIQACXoElGRsdXWDBdg2Y90IWktDYGYAB9PWHP0PMdFZaF07SQgAFNDAMAQg0QA1UC8xoZQl22JGFPgWkOUCOL1pZQyhjxinnnCWEAAA7", "REPL": 
"R0lGODlhMAAjAPcAMf/////3//+lOf+UKf+MEPf///f39/f35/fv7/ecQvecOfecKfeUIfeUGPeUEPeUCPeMAO/37+/v9+/v3u/n3u/n1u+9jO+9c++1hO+ta++tY++tWu+tUu+tSu+lUu+lQu+lMe+UMe+UKe+UGO+UEO+UAO+MCOfv5+fvxufn7+fn5+fnzue9lOe9c+e1jOe1e+e1c+e1a+etWuetUuelQuecOeeUUueUCN7e597e3t7e1t7ezt7evd7Wzt7Oxt7Ovd7Otd7Opd7OnN7Gtd7Gpd69lN61hN6ta96lStbextberdbW3tbWztbWxtbOvdbOrda1hNalUtaECM7W1s7Ozs7Oxs7Otc7Gxs7Gvc69tc69rc69pc61jM6lc8bWlMbOvcbGxsbGpca9tca9pca1nMaMAL3OhL3Gtb21vb21tb2tpb2tnL2tlLW9tbW9pbW9e7W1pbWtjLWcKa21nK2tra2tnK2tlK2lpa2llK2ljK2le6WlnKWljKWUe6WUc6WUY5y1QpyclJycjJychJyUc5yMY5StY5SUe5SMhJSMe5SMc5SMWpSEa5SESoyUe4yMhIyEY4SlKYScWoSMe4SEe4SEa4R7c4R7Y3uMY3uEe3t7e3t7c3tza3tzY3trKXtjIXOcAHOUMXOEY3Nzc3NzWnNrSmulCGuUMWuMGGtzWmtrY2taMWtaGGOUOWOMAGNzUmNjWmNjSmNaUmNaQmNaOWNaIWNSCFqcAFpjUlpSMVpSIVpSEFpKKVKMAFJSUlJSSlJSMVJKMVJKGFJKAFI5CEqUAEqEAEpzQkpKIUpCQkpCGEpCAEo5EEoxAEJjOUJCOUJCAEI5IUIxADl7ADlaITlCOTkxMTkxKTkxEDkhADFzADFrGDE5OTExADEpEClrCCkxKSkpKSkpISkpACkhCCkhACkYACFzACFrACEhCCEYGBhjEBhjABghABgYCBgYABgQEBgQABAQABAIAAhjAAhSAAhKAAgIEAgICABaAABCAAAhAAAQAAAIAAAAAAAAACH+AS4ALAAAAAAwACMAAAj/AAEIHEiwoMGDCBMqXMiwocOHAA4cgEixIIIJO3JMmAjADIqKFU/8MHIkg5EgYXx4iaTkI0iHE6wE2TCggYILQayEAgXIy8uGCKz8sDCAQAMRG3iEcXULlJkJPwli3OFjh9UdYYLE6NBhA04UXHoVA2XoTZgfPKBWlOBDphAWOdfMcfMDLloeO3hIMjbWVCQ5Fn6E2UFxgpsgFjYIEBADrZU6luqEEfqjTqpt54z1uuWqTIcgWAk7PECGzIUQDRosDmxlUrVJkwQJkqVuX71v06YZcyUlROAdbnLAJKPFyAYFAhoMwFlnEh0rWkpz8raPHm7dqKKc/KFFkBUrVn1M/ziBcEIeLUEQI8/AYk0i9Be4sqjsrN66c9/OnbobhpR3HkIUoZ0WVnBE0AGLFKKFD0HAFUQe77HQgQI1hRBDEHMcY0899bBzihZuCPILJD8EccEGGzwAQhFaUHHQH82sUkgeNHISDBk8WCCCcsqFUEQWmOyzjz3sUGNNOO5Y48YOEgowAAQhnBScQV00k82V47jzjy9CXZBcjziFoco//4CDiSOyhPMPLkJZkEBqJmRQxA9uZGEQD8Ncmc044/zzDF2IZQBCCDYE8QMZz/iiCSx0neHGI7BIhhhNn+1gxRpokEcQAp7seWU7/PwTyxqG/iCEEVzQmUombnDRxRExzP9nBR2PCKLFD3UJwcMPa/SRqUGNWJmNOVn+M44ukMRB4KGcWDNLVhuUMEIJAlzwA3DJBHMJIXm4sQYhqyxCRQQGLSIsn1qac2UzysQSyzX/hLMGD0F0IMCODYAQBA9W/PKPOcRiw0wzwxTiokF9dLMnuv/Mo+fCZF7jBr0xbDDCACWEYKgb1vzjDp/jZNOMLX0IZxAKq2TZTjtaOjwOsXyG+s8sZJTIQsUdIGHoJPf8w487QI/TDSt5mGwQFZxc406o8HiDJchk/ltLHpSlJwSvz5DpTjvmuGNOM57koelBOaAhiCaaPBLL0wwbm003peRBnBZqJMJL1ECz/HXYYx/NdAIOOVCxQyLorswymU93o0wuwfAiTDNR/xz0MLXU0XdCE+UwSTRZAq2lsSATu+4wkGvt+TjNzPLrQyegAUku2Hij5cd8LhxyM8QIg4w18HgcdC6BTBFSDmfQqsovttveDcG7lFLHI75cE841sARCxeWsnxC4G9HADPK6ywzDCRqBo0EHHWhMgT1IJzziNci1N7PMKnSYfML96/90AiJKey/0KtbLX1QK0rrNnQ541xugQ7SHhkXBghN0SKACWRc4KlAhBwKcIOYymJCAAAA7", "repl": "R0lGODlhMAAjAPQAMf////f39+/v7+fn597e3tbW1s7OzsbGxr29vbW1ta2traWlpZycnJSUlIyMjISEhHt7e3Nzc2tra2NjY1paWlJSUkpKSkJCQjk5OTExMSkpKSEhIRgYGBAQEAgICAAAACH+AS4ALAAAAAAwACMAAAX/ICCOZGmeaKqubOu+gCDANBkIQ1EMQhAghFptYEAkEgjEwXBo7ISvweGgWCwUysPjwTgEoCafTySYIhYMxgLBjEQgCULvCw0QdAZdoVhUIJUFChISEAxYeQM1N1OMTAp+UwZ5eA4TEhFbDWYFdC4ECVMJjwl5BwsQa0umEhUVlhESDgqlBp0rAn5nVpBMDxeZDRQbHBgWFBSWDgtLBnFjKwRYCI9VqQsPs0YKEcMXFq0UEalFDWx4BAO2IwPjppAKDkrTWKYUGd7fEJJFEZpM00cOzCgh4EE8SaoWxKNixQooBRMyZMBwAYIRBhUgLDGS4MoBJeoANMhAgQsaCRZm/5lqaCUJhA4cNHjDoKEDBlJUHqkBlYBTiQUZNGjYMMxDhY3VWk6R4MEDBoMUak5AqoYBqANIBo4wcGGDUKIeLlzVZmWJggsVIkwAZaQSA3kdZzlKkIiEAAlDvW5oOkEBs488JTw44oeUIwdvVTFTUK7uiAAPgubt8GFDhQepqETAQCFU1UMGzlqAgFhUsAcCS0AO6lUDhw8xNRSbENGDhgWSHjWUe6ACbKITizmopZoBa6KvOwj9uuHDhwxyj3xekgDDhw5EvWKo0IB4iQLCOCC/njc7ZQ8UeGvza+ABZZgcxJNc4FO1gc0cOsCUrHevc8tdIMTIAhc4F198G2Qwwd8CBIQUAwEINABBBJUwR9R5wElgVRLwWODBBx4cGB8GEzDQIAo33CGJA8gh+JoH/clUgQU0YvDhdfmJdwEFC6Sjgg8yEPAABsPkh2F22cl2AQbn6QdTghTQ5eAJAQyQAAQV0MSBB9gRVZ4GE1mw5JZOAmiAVi1UWcAZDrDyZXYTeaOhA/bIVuIBPtKQ4h7ViYekUPdcEAEbzTzCRp5CADmAAwj+ORGPBcgwAAHo9ABGCYtm0ChwFHShlRiXhmHlkAcCiOeUodqQw5W0oXLAiamy4MOkjOyAaqxUymApDCEAADs=", } colors = ["#FF7B39", 
"#80F121"] emphColors = ["#DAFC33", "#F42548"] fieldParams = { "height": 3, "width": 70, "font": ("monaco", 14), "highlightthickness": 0, "borderwidth": 0, "background": "white", } textParams = { "bg": "#F7E0D4", "fg": "#2321F1", "highlightthickness": 0, "width": 1, "height": 10, "font": ("verdana", 16), "wrap": "word", } class Zone: def __init__(self, image, initialField, initialText): frm = Frame(root) frm.config(background="white") self.image = PhotoImage(format="gif", data=images[image.upper()]) self.imageDimmed = PhotoImage(format="gif", data=images[image]) self.img = Label(frm) self.img.config(borderwidth=0) self.img.pack(side="left") self.fld = Text(frm, **fieldParams) self.initScrollText(frm, self.fld, initialField) frm = Frame(root) self.txt = Text(frm, **textParams) self.initScrollText(frm, self.txt, initialText) for i in range(2): self.txt.tag_config(colors[i], background=colors[i]) self.txt.tag_config("emph" + colors[i], foreground=emphColors[i]) def initScrollText(self, frm, txt, contents): scl = Scrollbar(frm) scl.config(command=txt.yview) scl.pack(side="right", fill="y") txt.pack(side="left", expand=True, fill="x") txt.config(yscrollcommand=scl.set) txt.insert("1.0", contents) frm.pack(fill="x") Frame(height=2, bd=1, relief="ridge").pack(fill="x") def refresh(self): self.colorCycle = itertools.cycle(colors) try: self.substitute() self.img.config(image=self.image) except re.error: self.img.config(image=self.imageDimmed) class FindZone(Zone): def addTags(self, m): color = next(self.colorCycle) self.txt.tag_add(color, "1.0+%sc" % m.start(), "1.0+%sc" % m.end()) try: self.txt.tag_add( "emph" + color, "1.0+%sc" % m.start("emph"), "1.0+%sc" % m.end("emph") ) except: pass def substitute(self, *args): for color in colors: self.txt.tag_remove(color, "1.0", "end") self.txt.tag_remove("emph" + color, "1.0", "end") self.rex = re.compile("") # default value in case of malformed regexp self.rex = re.compile(self.fld.get("1.0", "end")[:-1], re.MULTILINE) try: re.compile("(?P%s)" % self.fld.get(SEL_FIRST, SEL_LAST)) self.rexSel = re.compile( "%s(?P%s)%s" % ( self.fld.get("1.0", SEL_FIRST), self.fld.get(SEL_FIRST, SEL_LAST), self.fld.get(SEL_LAST, "end")[:-1], ), re.MULTILINE, ) except: self.rexSel = self.rex self.rexSel.sub(self.addTags, self.txt.get("1.0", "end")) class ReplaceZone(Zone): def addTags(self, m): s = sz.rex.sub(self.repl, m.group()) self.txt.delete( "1.0+%sc" % (m.start() + self.diff), "1.0+%sc" % (m.end() + self.diff) ) self.txt.insert("1.0+%sc" % (m.start() + self.diff), s, next(self.colorCycle)) self.diff += len(s) - (m.end() - m.start()) def substitute(self): self.txt.delete("1.0", "end") self.txt.insert("1.0", sz.txt.get("1.0", "end")[:-1]) self.diff = 0 self.repl = rex0.sub(r"\\g<\1>", self.fld.get("1.0", "end")[:-1]) sz.rex.sub(self.addTags, sz.txt.get("1.0", "end")[:-1]) def launchRefresh(_): sz.fld.after_idle(sz.refresh) rz.fld.after_idle(rz.refresh) def app(): global root, sz, rz, rex0 root = Tk() root.resizable(height=False, width=True) root.title(windowTitle) root.minsize(width=250, height=0) sz = FindZone("find", initialFind, initialText) sz.fld.bind("", launchRefresh) sz.fld.bind("", launchRefresh) sz.fld.bind("", launchRefresh) sz.rexSel = re.compile("") rz = ReplaceZone("repl", initialRepl, "") rex0 = re.compile(r"(?", launchRefresh) launchRefresh(None) root.mainloop() if __name__ == "__main__": app() __all__ = ["app"] nltk-3.7/nltk/app/rdparser_app.py000066400000000000000000001075711420073152400171300ustar00rootroot00000000000000# Natural Language Toolkit: 
Recursive Descent Parser Application # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # URL: # For license information, see LICENSE.TXT """ A graphical tool for exploring the recursive descent parser. The recursive descent parser maintains a tree, which records the structure of the portion of the text that has been parsed. It uses CFG productions to expand the fringe of the tree, and matches its leaves against the text. Initially, the tree contains the start symbol ("S"). It is shown in the main canvas, to the right of the list of available expansions. The parser builds up a tree structure for the text using three operations: - "expand" uses a CFG production to add children to a node on the fringe of the tree. - "match" compares a leaf in the tree to a text token. - "backtrack" returns the tree to its state before the most recent expand or match operation. The parser maintains a list of tree locations called a "frontier" to remember which nodes have not yet been expanded and which leaves have not yet been matched against the text. The leftmost frontier node is shown in green, and the other frontier nodes are shown in blue. The parser always performs expand and match operations on the leftmost element of the frontier. You can control the parser's operation by using the "expand," "match," and "backtrack" buttons; or you can use the "step" button to let the parser automatically decide which operation to apply. The parser uses the following rules to decide which operation to apply: - If the leftmost frontier element is a token, try matching it. - If the leftmost frontier element is a node, try expanding it with the first untried expansion. - Otherwise, backtrack. The "expand" button applies the untried expansion whose CFG production is listed earliest in the grammar. To manually choose which expansion to apply, click on a CFG production from the list of available expansions, on the left side of the main window. The "autostep" button will let the parser continue applying applications to the tree until it reaches a complete parse. You can cancel an autostep in progress at any time by clicking on the "autostep" button again. Keyboard Shortcuts:: [Space]\t Perform the next expand, match, or backtrack operation [a]\t Step through operations until the next complete parse [e]\t Perform an expand operation [m]\t Perform a match operation [b]\t Perform a backtrack operation [Delete]\t Reset the parser [g]\t Show/hide available expansions list [h]\t Help [Ctrl-p]\t Print [q]\t Quit """ from tkinter import Button, Frame, IntVar, Label, Listbox, Menu, Scrollbar, Tk from tkinter.font import Font from nltk.draw import CFGEditor, TreeSegmentWidget, tree_to_treesegment from nltk.draw.util import CanvasFrame, EntryDialog, ShowText, TextWidget from nltk.parse import SteppingRecursiveDescentParser from nltk.tree import Tree from nltk.util import in_idle class RecursiveDescentApp: """ A graphical tool for exploring the recursive descent parser. The tool displays the parser's tree and the remaining text, and allows the user to control the parser's operation. In particular, the user can expand subtrees on the frontier, match tokens on the frontier against the text, and backtrack. A "step" button simply steps through the parsing process, performing the operations that ``RecursiveDescentParser`` would use. """ def __init__(self, grammar, sent, trace=0): self._sent = sent self._parser = SteppingRecursiveDescentParser(grammar, trace) # Set up the main window. 
self._top = Tk() self._top.title("Recursive Descent Parser Application") # Set up key bindings. self._init_bindings() # Initialize the fonts. self._init_fonts(self._top) # Animations. animating_lock is a lock to prevent the demo # from performing new operations while it's animating. self._animation_frames = IntVar(self._top) self._animation_frames.set(5) self._animating_lock = 0 self._autostep = 0 # The user can hide the grammar. self._show_grammar = IntVar(self._top) self._show_grammar.set(1) # Create the basic frames. self._init_menubar(self._top) self._init_buttons(self._top) self._init_feedback(self._top) self._init_grammar(self._top) self._init_canvas(self._top) # Initialize the parser. self._parser.initialize(self._sent) # Resize callback self._canvas.bind("", self._configure) ######################################### ## Initialization Helpers ######################################### def _init_fonts(self, root): # See: self._sysfont = Font(font=Button()["font"]) root.option_add("*Font", self._sysfont) # TWhat's our font size (default=same as sysfont) self._size = IntVar(root) self._size.set(self._sysfont.cget("size")) self._boldfont = Font(family="helvetica", weight="bold", size=self._size.get()) self._font = Font(family="helvetica", size=self._size.get()) if self._size.get() < 0: big = self._size.get() - 2 else: big = self._size.get() + 2 self._bigfont = Font(family="helvetica", weight="bold", size=big) def _init_grammar(self, parent): # Grammar view. self._prodframe = listframe = Frame(parent) self._prodframe.pack(fill="both", side="left", padx=2) self._prodlist_label = Label( self._prodframe, font=self._boldfont, text="Available Expansions" ) self._prodlist_label.pack() self._prodlist = Listbox( self._prodframe, selectmode="single", relief="groove", background="white", foreground="#909090", font=self._font, selectforeground="#004040", selectbackground="#c0f0c0", ) self._prodlist.pack(side="right", fill="both", expand=1) self._productions = list(self._parser.grammar().productions()) for production in self._productions: self._prodlist.insert("end", (" %s" % production)) self._prodlist.config(height=min(len(self._productions), 25)) # Add a scrollbar if there are more than 25 productions. if len(self._productions) > 25: listscroll = Scrollbar(self._prodframe, orient="vertical") self._prodlist.config(yscrollcommand=listscroll.set) listscroll.config(command=self._prodlist.yview) listscroll.pack(side="left", fill="y") # If they select a production, apply it. self._prodlist.bind("<>", self._prodlist_select) def _init_bindings(self): # Key bindings are a good thing. 
self._top.bind("", self.destroy) self._top.bind("", self.destroy) self._top.bind("", self.destroy) self._top.bind("e", self.expand) # self._top.bind('', self.expand) # self._top.bind('', self.expand) self._top.bind("m", self.match) self._top.bind("", self.match) self._top.bind("", self.match) self._top.bind("b", self.backtrack) self._top.bind("", self.backtrack) self._top.bind("", self.backtrack) self._top.bind("", self.backtrack) self._top.bind("", self.backtrack) self._top.bind("a", self.autostep) # self._top.bind('', self.autostep) self._top.bind("", self.autostep) self._top.bind("", self.cancel_autostep) self._top.bind("", self.step) self._top.bind("", self.reset) self._top.bind("", self.postscript) # self._top.bind('', self.help) # self._top.bind('', self.help) self._top.bind("", self.help) self._top.bind("", self.help) # self._top.bind('', self.toggle_grammar) # self._top.bind('', self.toggle_grammar) # self._top.bind('', self.toggle_grammar) self._top.bind("", self.edit_grammar) self._top.bind("", self.edit_sentence) def _init_buttons(self, parent): # Set up the frames. self._buttonframe = buttonframe = Frame(parent) buttonframe.pack(fill="none", side="bottom", padx=3, pady=2) Button( buttonframe, text="Step", background="#90c0d0", foreground="black", command=self.step, ).pack(side="left") Button( buttonframe, text="Autostep", background="#90c0d0", foreground="black", command=self.autostep, ).pack(side="left") Button( buttonframe, text="Expand", underline=0, background="#90f090", foreground="black", command=self.expand, ).pack(side="left") Button( buttonframe, text="Match", underline=0, background="#90f090", foreground="black", command=self.match, ).pack(side="left") Button( buttonframe, text="Backtrack", underline=0, background="#f0a0a0", foreground="black", command=self.backtrack, ).pack(side="left") # Replace autostep... 
# self._autostep_button = Button(buttonframe, text='Autostep', # underline=0, command=self.autostep) # self._autostep_button.pack(side='left') def _configure(self, event): self._autostep = 0 (x1, y1, x2, y2) = self._cframe.scrollregion() y2 = event.height - 6 self._canvas["scrollregion"] = "%d %d %d %d" % (x1, y1, x2, y2) self._redraw() def _init_feedback(self, parent): self._feedbackframe = feedbackframe = Frame(parent) feedbackframe.pack(fill="x", side="bottom", padx=3, pady=3) self._lastoper_label = Label( feedbackframe, text="Last Operation:", font=self._font ) self._lastoper_label.pack(side="left") lastoperframe = Frame(feedbackframe, relief="sunken", border=1) lastoperframe.pack(fill="x", side="right", expand=1, padx=5) self._lastoper1 = Label( lastoperframe, foreground="#007070", background="#f0f0f0", font=self._font ) self._lastoper2 = Label( lastoperframe, anchor="w", width=30, foreground="#004040", background="#f0f0f0", font=self._font, ) self._lastoper1.pack(side="left") self._lastoper2.pack(side="left", fill="x", expand=1) def _init_canvas(self, parent): self._cframe = CanvasFrame( parent, background="white", # width=525, height=250, closeenough=10, border=2, relief="sunken", ) self._cframe.pack(expand=1, fill="both", side="top", pady=2) canvas = self._canvas = self._cframe.canvas() # Initially, there's no tree or text self._tree = None self._textwidgets = [] self._textline = None def _init_menubar(self, parent): menubar = Menu(parent) filemenu = Menu(menubar, tearoff=0) filemenu.add_command( label="Reset Parser", underline=0, command=self.reset, accelerator="Del" ) filemenu.add_command( label="Print to Postscript", underline=0, command=self.postscript, accelerator="Ctrl-p", ) filemenu.add_command( label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x" ) menubar.add_cascade(label="File", underline=0, menu=filemenu) editmenu = Menu(menubar, tearoff=0) editmenu.add_command( label="Edit Grammar", underline=5, command=self.edit_grammar, accelerator="Ctrl-g", ) editmenu.add_command( label="Edit Text", underline=5, command=self.edit_sentence, accelerator="Ctrl-t", ) menubar.add_cascade(label="Edit", underline=0, menu=editmenu) rulemenu = Menu(menubar, tearoff=0) rulemenu.add_command( label="Step", underline=1, command=self.step, accelerator="Space" ) rulemenu.add_separator() rulemenu.add_command( label="Match", underline=0, command=self.match, accelerator="Ctrl-m" ) rulemenu.add_command( label="Expand", underline=0, command=self.expand, accelerator="Ctrl-e" ) rulemenu.add_separator() rulemenu.add_command( label="Backtrack", underline=0, command=self.backtrack, accelerator="Ctrl-b" ) menubar.add_cascade(label="Apply", underline=0, menu=rulemenu) viewmenu = Menu(menubar, tearoff=0) viewmenu.add_checkbutton( label="Show Grammar", underline=0, variable=self._show_grammar, command=self._toggle_grammar, ) viewmenu.add_separator() viewmenu.add_radiobutton( label="Tiny", variable=self._size, underline=0, value=10, command=self.resize, ) viewmenu.add_radiobutton( label="Small", variable=self._size, underline=0, value=12, command=self.resize, ) viewmenu.add_radiobutton( label="Medium", variable=self._size, underline=0, value=14, command=self.resize, ) viewmenu.add_radiobutton( label="Large", variable=self._size, underline=0, value=18, command=self.resize, ) viewmenu.add_radiobutton( label="Huge", variable=self._size, underline=0, value=24, command=self.resize, ) menubar.add_cascade(label="View", underline=0, menu=viewmenu) animatemenu = Menu(menubar, tearoff=0) 
animatemenu.add_radiobutton( label="No Animation", underline=0, variable=self._animation_frames, value=0 ) animatemenu.add_radiobutton( label="Slow Animation", underline=0, variable=self._animation_frames, value=10, accelerator="-", ) animatemenu.add_radiobutton( label="Normal Animation", underline=0, variable=self._animation_frames, value=5, accelerator="=", ) animatemenu.add_radiobutton( label="Fast Animation", underline=0, variable=self._animation_frames, value=2, accelerator="+", ) menubar.add_cascade(label="Animate", underline=1, menu=animatemenu) helpmenu = Menu(menubar, tearoff=0) helpmenu.add_command(label="About", underline=0, command=self.about) helpmenu.add_command( label="Instructions", underline=0, command=self.help, accelerator="F1" ) menubar.add_cascade(label="Help", underline=0, menu=helpmenu) parent.config(menu=menubar) ######################################### ## Helper ######################################### def _get(self, widget, treeloc): for i in treeloc: widget = widget.subtrees()[i] if isinstance(widget, TreeSegmentWidget): widget = widget.label() return widget ######################################### ## Main draw procedure ######################################### def _redraw(self): canvas = self._canvas # Delete the old tree, widgets, etc. if self._tree is not None: self._cframe.destroy_widget(self._tree) for twidget in self._textwidgets: self._cframe.destroy_widget(twidget) if self._textline is not None: self._canvas.delete(self._textline) # Draw the tree. helv = ("helvetica", -self._size.get()) bold = ("helvetica", -self._size.get(), "bold") attribs = { "tree_color": "#000000", "tree_width": 2, "node_font": bold, "leaf_font": helv, } tree = self._parser.tree() self._tree = tree_to_treesegment(canvas, tree, **attribs) self._cframe.add_widget(self._tree, 30, 5) # Draw the text. helv = ("helvetica", -self._size.get()) bottom = y = self._cframe.scrollregion()[3] self._textwidgets = [ TextWidget(canvas, word, font=self._font) for word in self._sent ] for twidget in self._textwidgets: self._cframe.add_widget(twidget, 0, 0) twidget.move(0, bottom - twidget.bbox()[3] - 5) y = min(y, twidget.bbox()[1]) # Draw a line over the text, to separate it from the tree. self._textline = canvas.create_line(-5000, y - 5, 5000, y - 5, dash=".") # Highlight appropriate nodes. self._highlight_nodes() self._highlight_prodlist() # Make sure the text lines up. self._position_text() def _redraw_quick(self): # This should be more-or-less sufficient after an animation. self._highlight_nodes() self._highlight_prodlist() self._position_text() def _highlight_nodes(self): # Highlight the list of nodes to be checked. bold = ("helvetica", -self._size.get(), "bold") for treeloc in self._parser.frontier()[:1]: self._get(self._tree, treeloc)["color"] = "#20a050" self._get(self._tree, treeloc)["font"] = bold for treeloc in self._parser.frontier()[1:]: self._get(self._tree, treeloc)["color"] = "#008080" def _highlight_prodlist(self): # Highlight the productions that can be expanded. # Boy, too bad tkinter doesn't implement Listbox.itemconfig; # that would be pretty useful here. 
self._prodlist.delete(0, "end") expandable = self._parser.expandable_productions() untried = self._parser.untried_expandable_productions() productions = self._productions for index in range(len(productions)): if productions[index] in expandable: if productions[index] in untried: self._prodlist.insert(index, " %s" % productions[index]) else: self._prodlist.insert(index, " %s (TRIED)" % productions[index]) self._prodlist.selection_set(index) else: self._prodlist.insert(index, " %s" % productions[index]) def _position_text(self): # Line up the text widgets that are matched against the tree numwords = len(self._sent) num_matched = numwords - len(self._parser.remaining_text()) leaves = self._tree_leaves()[:num_matched] xmax = self._tree.bbox()[0] for i in range(0, len(leaves)): widget = self._textwidgets[i] leaf = leaves[i] widget["color"] = "#006040" leaf["color"] = "#006040" widget.move(leaf.bbox()[0] - widget.bbox()[0], 0) xmax = widget.bbox()[2] + 10 # Line up the text widgets that are not matched against the tree. for i in range(len(leaves), numwords): widget = self._textwidgets[i] widget["color"] = "#a0a0a0" widget.move(xmax - widget.bbox()[0], 0) xmax = widget.bbox()[2] + 10 # If we have a complete parse, make everything green :) if self._parser.currently_complete(): for twidget in self._textwidgets: twidget["color"] = "#00a000" # Move the matched leaves down to the text. for i in range(0, len(leaves)): widget = self._textwidgets[i] leaf = leaves[i] dy = widget.bbox()[1] - leaf.bbox()[3] - 10.0 dy = max(dy, leaf.parent().label().bbox()[3] - leaf.bbox()[3] + 10) leaf.move(0, dy) def _tree_leaves(self, tree=None): if tree is None: tree = self._tree if isinstance(tree, TreeSegmentWidget): leaves = [] for child in tree.subtrees(): leaves += self._tree_leaves(child) return leaves else: return [tree] ######################################### ## Button Callbacks ######################################### def destroy(self, *e): self._autostep = 0 if self._top is None: return self._top.destroy() self._top = None def reset(self, *e): self._autostep = 0 self._parser.initialize(self._sent) self._lastoper1["text"] = "Reset Application" self._lastoper2["text"] = "" self._redraw() def autostep(self, *e): if self._animation_frames.get() == 0: self._animation_frames.set(2) if self._autostep: self._autostep = 0 else: self._autostep = 1 self._step() def cancel_autostep(self, *e): # self._autostep_button['text'] = 'Autostep' self._autostep = 0 # Make sure to stop auto-stepping if we get any user input. def step(self, *e): self._autostep = 0 self._step() def match(self, *e): self._autostep = 0 self._match() def expand(self, *e): self._autostep = 0 self._expand() def backtrack(self, *e): self._autostep = 0 self._backtrack() def _step(self): if self._animating_lock: return # Try expanding, matching, and backtracking (in that order) if self._expand(): pass elif self._parser.untried_match() and self._match(): pass elif self._backtrack(): pass else: self._lastoper1["text"] = "Finished" self._lastoper2["text"] = "" self._autostep = 0 # Check if we just completed a parse. 
if self._parser.currently_complete(): self._autostep = 0 self._lastoper2["text"] += " [COMPLETE PARSE]" def _expand(self, *e): if self._animating_lock: return old_frontier = self._parser.frontier() rv = self._parser.expand() if rv is not None: self._lastoper1["text"] = "Expand:" self._lastoper2["text"] = rv self._prodlist.selection_clear(0, "end") index = self._productions.index(rv) self._prodlist.selection_set(index) self._animate_expand(old_frontier[0]) return True else: self._lastoper1["text"] = "Expand:" self._lastoper2["text"] = "(all expansions tried)" return False def _match(self, *e): if self._animating_lock: return old_frontier = self._parser.frontier() rv = self._parser.match() if rv is not None: self._lastoper1["text"] = "Match:" self._lastoper2["text"] = rv self._animate_match(old_frontier[0]) return True else: self._lastoper1["text"] = "Match:" self._lastoper2["text"] = "(failed)" return False def _backtrack(self, *e): if self._animating_lock: return if self._parser.backtrack(): elt = self._parser.tree() for i in self._parser.frontier()[0]: elt = elt[i] self._lastoper1["text"] = "Backtrack" self._lastoper2["text"] = "" if isinstance(elt, Tree): self._animate_backtrack(self._parser.frontier()[0]) else: self._animate_match_backtrack(self._parser.frontier()[0]) return True else: self._autostep = 0 self._lastoper1["text"] = "Finished" self._lastoper2["text"] = "" return False def about(self, *e): ABOUT = ( "NLTK Recursive Descent Parser Application\n" + "Written by Edward Loper" ) TITLE = "About: Recursive Descent Parser Application" try: from tkinter.messagebox import Message Message(message=ABOUT, title=TITLE).show() except: ShowText(self._top, TITLE, ABOUT) def help(self, *e): self._autostep = 0 # The default font's not very legible; try using 'fixed' instead. try: ShowText( self._top, "Help: Recursive Descent Parser Application", (__doc__ or "").strip(), width=75, font="fixed", ) except: ShowText( self._top, "Help: Recursive Descent Parser Application", (__doc__ or "").strip(), width=75, ) def postscript(self, *e): self._autostep = 0 self._cframe.print_to_file() def mainloop(self, *args, **kwargs): """ Enter the Tkinter mainloop. This function must be called if this demo is created from a non-interactive program (e.g. from a secript); otherwise, the demo will close as soon as the script completes. 
""" if in_idle(): return self._top.mainloop(*args, **kwargs) def resize(self, size=None): if size is not None: self._size.set(size) size = self._size.get() self._font.configure(size=-(abs(size))) self._boldfont.configure(size=-(abs(size))) self._sysfont.configure(size=-(abs(size))) self._bigfont.configure(size=-(abs(size + 2))) self._redraw() ######################################### ## Expand Production Selection ######################################### def _toggle_grammar(self, *e): if self._show_grammar.get(): self._prodframe.pack( fill="both", side="left", padx=2, after=self._feedbackframe ) self._lastoper1["text"] = "Show Grammar" else: self._prodframe.pack_forget() self._lastoper1["text"] = "Hide Grammar" self._lastoper2["text"] = "" # def toggle_grammar(self, *e): # self._show_grammar = not self._show_grammar # if self._show_grammar: # self._prodframe.pack(fill='both', expand='y', side='left', # after=self._feedbackframe) # self._lastoper1['text'] = 'Show Grammar' # else: # self._prodframe.pack_forget() # self._lastoper1['text'] = 'Hide Grammar' # self._lastoper2['text'] = '' def _prodlist_select(self, event): selection = self._prodlist.curselection() if len(selection) != 1: return index = int(selection[0]) old_frontier = self._parser.frontier() production = self._parser.expand(self._productions[index]) if production: self._lastoper1["text"] = "Expand:" self._lastoper2["text"] = production self._prodlist.selection_clear(0, "end") self._prodlist.selection_set(index) self._animate_expand(old_frontier[0]) else: # Reset the production selections. self._prodlist.selection_clear(0, "end") for prod in self._parser.expandable_productions(): index = self._productions.index(prod) self._prodlist.selection_set(index) ######################################### ## Animation ######################################### def _animate_expand(self, treeloc): oldwidget = self._get(self._tree, treeloc) oldtree = oldwidget.parent() top = not isinstance(oldtree.parent(), TreeSegmentWidget) tree = self._parser.tree() for i in treeloc: tree = tree[i] widget = tree_to_treesegment( self._canvas, tree, node_font=self._boldfont, leaf_color="white", tree_width=2, tree_color="white", node_color="white", leaf_font=self._font, ) widget.label()["color"] = "#20a050" (oldx, oldy) = oldtree.label().bbox()[:2] (newx, newy) = widget.label().bbox()[:2] widget.move(oldx - newx, oldy - newy) if top: self._cframe.add_widget(widget, 0, 5) widget.move(30 - widget.label().bbox()[0], 0) self._tree = widget else: oldtree.parent().replace_child(oldtree, widget) # Move the children over so they don't overlap. # Line the children up in a strange way. if widget.subtrees(): dx = ( oldx + widget.label().width() / 2 - widget.subtrees()[0].bbox()[0] / 2 - widget.subtrees()[0].bbox()[2] / 2 ) for subtree in widget.subtrees(): subtree.move(dx, 0) self._makeroom(widget) if top: self._cframe.destroy_widget(oldtree) else: oldtree.destroy() colors = [ "gray%d" % (10 * int(10 * x / self._animation_frames.get())) for x in range(self._animation_frames.get(), 0, -1) ] # Move the text string down, if necessary. dy = widget.bbox()[3] + 30 - self._canvas.coords(self._textline)[1] if dy > 0: for twidget in self._textwidgets: twidget.move(0, dy) self._canvas.move(self._textline, 0, dy) self._animate_expand_frame(widget, colors) def _makeroom(self, treeseg): """ Make sure that no sibling tree bbox's overlap. 
""" parent = treeseg.parent() if not isinstance(parent, TreeSegmentWidget): return index = parent.subtrees().index(treeseg) # Handle siblings to the right rsiblings = parent.subtrees()[index + 1 :] if rsiblings: dx = treeseg.bbox()[2] - rsiblings[0].bbox()[0] + 10 for sibling in rsiblings: sibling.move(dx, 0) # Handle siblings to the left if index > 0: lsibling = parent.subtrees()[index - 1] dx = max(0, lsibling.bbox()[2] - treeseg.bbox()[0] + 10) treeseg.move(dx, 0) # Keep working up the tree. self._makeroom(parent) def _animate_expand_frame(self, widget, colors): if len(colors) > 0: self._animating_lock = 1 widget["color"] = colors[0] for subtree in widget.subtrees(): if isinstance(subtree, TreeSegmentWidget): subtree.label()["color"] = colors[0] else: subtree["color"] = colors[0] self._top.after(50, self._animate_expand_frame, widget, colors[1:]) else: widget["color"] = "black" for subtree in widget.subtrees(): if isinstance(subtree, TreeSegmentWidget): subtree.label()["color"] = "black" else: subtree["color"] = "black" self._redraw_quick() widget.label()["color"] = "black" self._animating_lock = 0 if self._autostep: self._step() def _animate_backtrack(self, treeloc): # Flash red first, if we're animating. if self._animation_frames.get() == 0: colors = [] else: colors = ["#a00000", "#000000", "#a00000"] colors += [ "gray%d" % (10 * int(10 * x / (self._animation_frames.get()))) for x in range(1, self._animation_frames.get() + 1) ] widgets = [self._get(self._tree, treeloc).parent()] for subtree in widgets[0].subtrees(): if isinstance(subtree, TreeSegmentWidget): widgets.append(subtree.label()) else: widgets.append(subtree) self._animate_backtrack_frame(widgets, colors) def _animate_backtrack_frame(self, widgets, colors): if len(colors) > 0: self._animating_lock = 1 for widget in widgets: widget["color"] = colors[0] self._top.after(50, self._animate_backtrack_frame, widgets, colors[1:]) else: for widget in widgets[0].subtrees(): widgets[0].remove_child(widget) widget.destroy() self._redraw_quick() self._animating_lock = 0 if self._autostep: self._step() def _animate_match_backtrack(self, treeloc): widget = self._get(self._tree, treeloc) node = widget.parent().label() dy = (node.bbox()[3] - widget.bbox()[1] + 14) / max( 1, self._animation_frames.get() ) self._animate_match_backtrack_frame(self._animation_frames.get(), widget, dy) def _animate_match(self, treeloc): widget = self._get(self._tree, treeloc) dy = (self._textwidgets[0].bbox()[1] - widget.bbox()[3] - 10.0) / max( 1, self._animation_frames.get() ) self._animate_match_frame(self._animation_frames.get(), widget, dy) def _animate_match_frame(self, frame, widget, dy): if frame > 0: self._animating_lock = 1 widget.move(0, dy) self._top.after(10, self._animate_match_frame, frame - 1, widget, dy) else: widget["color"] = "#006040" self._redraw_quick() self._animating_lock = 0 if self._autostep: self._step() def _animate_match_backtrack_frame(self, frame, widget, dy): if frame > 0: self._animating_lock = 1 widget.move(0, dy) self._top.after( 10, self._animate_match_backtrack_frame, frame - 1, widget, dy ) else: widget.parent().remove_child(widget) widget.destroy() self._animating_lock = 0 if self._autostep: self._step() def edit_grammar(self, *e): CFGEditor(self._top, self._parser.grammar(), self.set_grammar) def set_grammar(self, grammar): self._parser.set_grammar(grammar) self._productions = list(grammar.productions()) self._prodlist.delete(0, "end") for production in self._productions: self._prodlist.insert("end", (" %s" % production)) 
def edit_sentence(self, *e): sentence = " ".join(self._sent) title = "Edit Text" instr = "Enter a new sentence to parse." EntryDialog(self._top, sentence, instr, self.set_sentence, title) def set_sentence(self, sentence): self._sent = sentence.split() # [XX] use tagged? self.reset() def app(): """ Create a recursive descent parser demo, using a simple grammar and text. """ from nltk.grammar import CFG grammar = CFG.fromstring( """ # Grammatical productions. S -> NP VP NP -> Det N PP | Det N VP -> V NP PP | V NP | V PP -> P NP # Lexical productions. NP -> 'I' Det -> 'the' | 'a' N -> 'man' | 'park' | 'dog' | 'telescope' V -> 'ate' | 'saw' P -> 'in' | 'under' | 'with' """ ) sent = "the dog saw a man in the park".split() RecursiveDescentApp(grammar, sent).mainloop() if __name__ == "__main__": app() __all__ = ["app"] nltk-3.7/nltk/app/srparser_app.py000066400000000000000000001012701420073152400171350ustar00rootroot00000000000000# Natural Language Toolkit: Shift-Reduce Parser Application # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # URL: # For license information, see LICENSE.TXT """ A graphical tool for exploring the shift-reduce parser. The shift-reduce parser maintains a stack, which records the structure of the portion of the text that has been parsed. The stack is initially empty. Its contents are shown on the left side of the main canvas. On the right side of the main canvas is the remaining text. This is the portion of the text which has not yet been considered by the parser. The parser builds up a tree structure for the text using two operations: - "shift" moves the first token from the remaining text to the top of the stack. In the demo, the top of the stack is its right-hand side. - "reduce" uses a grammar production to combine the rightmost stack elements into a single tree token. You can control the parser's operation by using the "shift" and "reduce" buttons; or you can use the "step" button to let the parser automatically decide which operation to apply. The parser uses the following rules to decide which operation to apply: - Only shift if no reductions are available. - If multiple reductions are available, then apply the reduction whose CFG production is listed earliest in the grammar. The "reduce" button applies the reduction whose CFG production is listed earliest in the grammar. There are two ways to manually choose which reduction to apply: - Click on a CFG production from the list of available reductions, on the left side of the main window. The reduction based on that production will be applied to the top of the stack. - Click on one of the stack elements. A popup window will appear, containing all available reductions. Select one, and it will be applied to the top of the stack. Note that reductions can only be applied to the top of the stack. 
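As an aside (an illustrative sketch added editorially, with a toy grammar and
sentence made up for the example), the demo is a thin layer over
``nltk.parse.SteppingShiftReduceParser``, and the same shift and reduce
operations can be driven directly from Python::

    from nltk.grammar import CFG
    from nltk.parse import SteppingShiftReduceParser

    grammar = CFG.fromstring('''
        S  -> NP VP
        NP -> Det N | 'I'
        VP -> V NP
        Det -> 'the' | 'a'
        N  -> 'dog' | 'man'
        V  -> 'saw'
    ''')
    parser = SteppingShiftReduceParser(grammar)
    parser.initialize("the dog saw a man".split())
    # Same strategy as the "step" button: reduce when possible, else shift.
    while parser.reduce() is not None or parser.shift():
        pass
    for tree in parser.parses():   # non-empty only if the stack reduced to S
        print(tree)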
Keyboard Shortcuts:: [Space]\t Perform the next shift or reduce operation [s]\t Perform a shift operation [r]\t Perform a reduction operation [Ctrl-z]\t Undo most recent operation [Delete]\t Reset the parser [g]\t Show/hide available production list [Ctrl-a]\t Toggle animations [h]\t Help [Ctrl-p]\t Print [q]\t Quit """ from tkinter import Button, Frame, IntVar, Label, Listbox, Menu, Scrollbar, Tk from tkinter.font import Font from nltk.draw import CFGEditor, TreeSegmentWidget, tree_to_treesegment from nltk.draw.util import CanvasFrame, EntryDialog, ShowText, TextWidget from nltk.parse import SteppingShiftReduceParser from nltk.tree import Tree from nltk.util import in_idle """ Possible future improvements: - button/window to change and/or select text. Just pop up a window with an entry, and let them modify the text; and then retokenize it? Maybe give a warning if it contains tokens whose types are not in the grammar. - button/window to change and/or select grammar. Select from several alternative grammars? Or actually change the grammar? If the later, then I'd want to define nltk.draw.cfg, which would be responsible for that. """ class ShiftReduceApp: """ A graphical tool for exploring the shift-reduce parser. The tool displays the parser's stack and the remaining text, and allows the user to control the parser's operation. In particular, the user can shift tokens onto the stack, and can perform reductions on the top elements of the stack. A "step" button simply steps through the parsing process, performing the operations that ``nltk.parse.ShiftReduceParser`` would use. """ def __init__(self, grammar, sent, trace=0): self._sent = sent self._parser = SteppingShiftReduceParser(grammar, trace) # Set up the main window. self._top = Tk() self._top.title("Shift Reduce Parser Application") # Animations. animating_lock is a lock to prevent the demo # from performing new operations while it's animating. self._animating_lock = 0 self._animate = IntVar(self._top) self._animate.set(10) # = medium # The user can hide the grammar. self._show_grammar = IntVar(self._top) self._show_grammar.set(1) # Initialize fonts. self._init_fonts(self._top) # Set up key bindings. self._init_bindings() # Create the basic frames. self._init_menubar(self._top) self._init_buttons(self._top) self._init_feedback(self._top) self._init_grammar(self._top) self._init_canvas(self._top) # A popup menu for reducing. self._reduce_menu = Menu(self._canvas, tearoff=0) # Reset the demo, and set the feedback frame to empty. self.reset() self._lastoper1["text"] = "" ######################################### ## Initialization Helpers ######################################### def _init_fonts(self, root): # See: self._sysfont = Font(font=Button()["font"]) root.option_add("*Font", self._sysfont) # TWhat's our font size (default=same as sysfont) self._size = IntVar(root) self._size.set(self._sysfont.cget("size")) self._boldfont = Font(family="helvetica", weight="bold", size=self._size.get()) self._font = Font(family="helvetica", size=self._size.get()) def _init_grammar(self, parent): # Grammar view. 
self._prodframe = listframe = Frame(parent) self._prodframe.pack(fill="both", side="left", padx=2) self._prodlist_label = Label( self._prodframe, font=self._boldfont, text="Available Reductions" ) self._prodlist_label.pack() self._prodlist = Listbox( self._prodframe, selectmode="single", relief="groove", background="white", foreground="#909090", font=self._font, selectforeground="#004040", selectbackground="#c0f0c0", ) self._prodlist.pack(side="right", fill="both", expand=1) self._productions = list(self._parser.grammar().productions()) for production in self._productions: self._prodlist.insert("end", (" %s" % production)) self._prodlist.config(height=min(len(self._productions), 25)) # Add a scrollbar if there are more than 25 productions. if 1: # len(self._productions) > 25: listscroll = Scrollbar(self._prodframe, orient="vertical") self._prodlist.config(yscrollcommand=listscroll.set) listscroll.config(command=self._prodlist.yview) listscroll.pack(side="left", fill="y") # If they select a production, apply it. self._prodlist.bind("<>", self._prodlist_select) # When they hover over a production, highlight it. self._hover = -1 self._prodlist.bind("", self._highlight_hover) self._prodlist.bind("", self._clear_hover) def _init_bindings(self): # Quit self._top.bind("", self.destroy) self._top.bind("", self.destroy) self._top.bind("", self.destroy) self._top.bind("", self.destroy) # Ops (step, shift, reduce, undo) self._top.bind("", self.step) self._top.bind("", self.shift) self._top.bind("", self.shift) self._top.bind("", self.shift) self._top.bind("", self.reduce) self._top.bind("", self.reduce) self._top.bind("", self.reduce) self._top.bind("", self.reset) self._top.bind("", self.undo) self._top.bind("", self.undo) self._top.bind("", self.undo) self._top.bind("", self.undo) self._top.bind("", self.undo) # Misc self._top.bind("", self.postscript) self._top.bind("", self.help) self._top.bind("", self.help) self._top.bind("", self.edit_grammar) self._top.bind("", self.edit_sentence) # Animation speed control self._top.bind("-", lambda e, a=self._animate: a.set(20)) self._top.bind("=", lambda e, a=self._animate: a.set(10)) self._top.bind("+", lambda e, a=self._animate: a.set(4)) def _init_buttons(self, parent): # Set up the frames. 
self._buttonframe = buttonframe = Frame(parent) buttonframe.pack(fill="none", side="bottom") Button( buttonframe, text="Step", background="#90c0d0", foreground="black", command=self.step, ).pack(side="left") Button( buttonframe, text="Shift", underline=0, background="#90f090", foreground="black", command=self.shift, ).pack(side="left") Button( buttonframe, text="Reduce", underline=0, background="#90f090", foreground="black", command=self.reduce, ).pack(side="left") Button( buttonframe, text="Undo", underline=0, background="#f0a0a0", foreground="black", command=self.undo, ).pack(side="left") def _init_menubar(self, parent): menubar = Menu(parent) filemenu = Menu(menubar, tearoff=0) filemenu.add_command( label="Reset Parser", underline=0, command=self.reset, accelerator="Del" ) filemenu.add_command( label="Print to Postscript", underline=0, command=self.postscript, accelerator="Ctrl-p", ) filemenu.add_command( label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x" ) menubar.add_cascade(label="File", underline=0, menu=filemenu) editmenu = Menu(menubar, tearoff=0) editmenu.add_command( label="Edit Grammar", underline=5, command=self.edit_grammar, accelerator="Ctrl-g", ) editmenu.add_command( label="Edit Text", underline=5, command=self.edit_sentence, accelerator="Ctrl-t", ) menubar.add_cascade(label="Edit", underline=0, menu=editmenu) rulemenu = Menu(menubar, tearoff=0) rulemenu.add_command( label="Step", underline=1, command=self.step, accelerator="Space" ) rulemenu.add_separator() rulemenu.add_command( label="Shift", underline=0, command=self.shift, accelerator="Ctrl-s" ) rulemenu.add_command( label="Reduce", underline=0, command=self.reduce, accelerator="Ctrl-r" ) rulemenu.add_separator() rulemenu.add_command( label="Undo", underline=0, command=self.undo, accelerator="Ctrl-u" ) menubar.add_cascade(label="Apply", underline=0, menu=rulemenu) viewmenu = Menu(menubar, tearoff=0) viewmenu.add_checkbutton( label="Show Grammar", underline=0, variable=self._show_grammar, command=self._toggle_grammar, ) viewmenu.add_separator() viewmenu.add_radiobutton( label="Tiny", variable=self._size, underline=0, value=10, command=self.resize, ) viewmenu.add_radiobutton( label="Small", variable=self._size, underline=0, value=12, command=self.resize, ) viewmenu.add_radiobutton( label="Medium", variable=self._size, underline=0, value=14, command=self.resize, ) viewmenu.add_radiobutton( label="Large", variable=self._size, underline=0, value=18, command=self.resize, ) viewmenu.add_radiobutton( label="Huge", variable=self._size, underline=0, value=24, command=self.resize, ) menubar.add_cascade(label="View", underline=0, menu=viewmenu) animatemenu = Menu(menubar, tearoff=0) animatemenu.add_radiobutton( label="No Animation", underline=0, variable=self._animate, value=0 ) animatemenu.add_radiobutton( label="Slow Animation", underline=0, variable=self._animate, value=20, accelerator="-", ) animatemenu.add_radiobutton( label="Normal Animation", underline=0, variable=self._animate, value=10, accelerator="=", ) animatemenu.add_radiobutton( label="Fast Animation", underline=0, variable=self._animate, value=4, accelerator="+", ) menubar.add_cascade(label="Animate", underline=1, menu=animatemenu) helpmenu = Menu(menubar, tearoff=0) helpmenu.add_command(label="About", underline=0, command=self.about) helpmenu.add_command( label="Instructions", underline=0, command=self.help, accelerator="F1" ) menubar.add_cascade(label="Help", underline=0, menu=helpmenu) parent.config(menu=menubar) def _init_feedback(self, 
parent): self._feedbackframe = feedbackframe = Frame(parent) feedbackframe.pack(fill="x", side="bottom", padx=3, pady=3) self._lastoper_label = Label( feedbackframe, text="Last Operation:", font=self._font ) self._lastoper_label.pack(side="left") lastoperframe = Frame(feedbackframe, relief="sunken", border=1) lastoperframe.pack(fill="x", side="right", expand=1, padx=5) self._lastoper1 = Label( lastoperframe, foreground="#007070", background="#f0f0f0", font=self._font ) self._lastoper2 = Label( lastoperframe, anchor="w", width=30, foreground="#004040", background="#f0f0f0", font=self._font, ) self._lastoper1.pack(side="left") self._lastoper2.pack(side="left", fill="x", expand=1) def _init_canvas(self, parent): self._cframe = CanvasFrame( parent, background="white", width=525, closeenough=10, border=2, relief="sunken", ) self._cframe.pack(expand=1, fill="both", side="top", pady=2) canvas = self._canvas = self._cframe.canvas() self._stackwidgets = [] self._rtextwidgets = [] self._titlebar = canvas.create_rectangle( 0, 0, 0, 0, fill="#c0f0f0", outline="black" ) self._exprline = canvas.create_line(0, 0, 0, 0, dash=".") self._stacktop = canvas.create_line(0, 0, 0, 0, fill="#408080") size = self._size.get() + 4 self._stacklabel = TextWidget( canvas, "Stack", color="#004040", font=self._boldfont ) self._rtextlabel = TextWidget( canvas, "Remaining Text", color="#004040", font=self._boldfont ) self._cframe.add_widget(self._stacklabel) self._cframe.add_widget(self._rtextlabel) ######################################### ## Main draw procedure ######################################### def _redraw(self): scrollregion = self._canvas["scrollregion"].split() (cx1, cy1, cx2, cy2) = (int(c) for c in scrollregion) # Delete the old stack & rtext widgets. for stackwidget in self._stackwidgets: self._cframe.destroy_widget(stackwidget) self._stackwidgets = [] for rtextwidget in self._rtextwidgets: self._cframe.destroy_widget(rtextwidget) self._rtextwidgets = [] # Position the titlebar & exprline (x1, y1, x2, y2) = self._stacklabel.bbox() y = y2 - y1 + 10 self._canvas.coords(self._titlebar, -5000, 0, 5000, y - 4) self._canvas.coords(self._exprline, 0, y * 2 - 10, 5000, y * 2 - 10) # Position the titlebar labels.. (x1, y1, x2, y2) = self._stacklabel.bbox() self._stacklabel.move(5 - x1, 3 - y1) (x1, y1, x2, y2) = self._rtextlabel.bbox() self._rtextlabel.move(cx2 - x2 - 5, 3 - y1) # Draw the stack. stackx = 5 for tok in self._parser.stack(): if isinstance(tok, Tree): attribs = { "tree_color": "#4080a0", "tree_width": 2, "node_font": self._boldfont, "node_color": "#006060", "leaf_color": "#006060", "leaf_font": self._font, } widget = tree_to_treesegment(self._canvas, tok, **attribs) widget.label()["color"] = "#000000" else: widget = TextWidget(self._canvas, tok, color="#000000", font=self._font) widget.bind_click(self._popup_reduce) self._stackwidgets.append(widget) self._cframe.add_widget(widget, stackx, y) stackx = widget.bbox()[2] + 10 # Draw the remaining text. rtextwidth = 0 for tok in self._parser.remaining_text(): widget = TextWidget(self._canvas, tok, color="#000000", font=self._font) self._rtextwidgets.append(widget) self._cframe.add_widget(widget, rtextwidth, y) rtextwidth = widget.bbox()[2] + 4 # Allow enough room to shift the next token (for animations) if len(self._rtextwidgets) > 0: stackx += self._rtextwidgets[0].width() # Move the remaining text to the correct location (keep it # right-justified, when possible); and move the remaining text # label, if necessary. 
stackx = max(stackx, self._stacklabel.width() + 25) rlabelwidth = self._rtextlabel.width() + 10 if stackx >= cx2 - max(rtextwidth, rlabelwidth): cx2 = stackx + max(rtextwidth, rlabelwidth) for rtextwidget in self._rtextwidgets: rtextwidget.move(4 + cx2 - rtextwidth, 0) self._rtextlabel.move(cx2 - self._rtextlabel.bbox()[2] - 5, 0) midx = (stackx + cx2 - max(rtextwidth, rlabelwidth)) / 2 self._canvas.coords(self._stacktop, midx, 0, midx, 5000) (x1, y1, x2, y2) = self._stacklabel.bbox() # Set up binding to allow them to shift a token by dragging it. if len(self._rtextwidgets) > 0: def drag_shift(widget, midx=midx, self=self): if widget.bbox()[0] < midx: self.shift() else: self._redraw() self._rtextwidgets[0].bind_drag(drag_shift) self._rtextwidgets[0].bind_click(self.shift) # Draw the stack top. self._highlight_productions() def _draw_stack_top(self, widget): # hack.. midx = widget.bbox()[2] + 50 self._canvas.coords(self._stacktop, midx, 0, midx, 5000) def _highlight_productions(self): # Highlight the productions that can be reduced. self._prodlist.selection_clear(0, "end") for prod in self._parser.reducible_productions(): index = self._productions.index(prod) self._prodlist.selection_set(index) ######################################### ## Button Callbacks ######################################### def destroy(self, *e): if self._top is None: return self._top.destroy() self._top = None def reset(self, *e): self._parser.initialize(self._sent) self._lastoper1["text"] = "Reset App" self._lastoper2["text"] = "" self._redraw() def step(self, *e): if self.reduce(): return True elif self.shift(): return True else: if list(self._parser.parses()): self._lastoper1["text"] = "Finished:" self._lastoper2["text"] = "Success" else: self._lastoper1["text"] = "Finished:" self._lastoper2["text"] = "Failure" def shift(self, *e): if self._animating_lock: return if self._parser.shift(): tok = self._parser.stack()[-1] self._lastoper1["text"] = "Shift:" self._lastoper2["text"] = "%r" % tok if self._animate.get(): self._animate_shift() else: self._redraw() return True return False def reduce(self, *e): if self._animating_lock: return production = self._parser.reduce() if production: self._lastoper1["text"] = "Reduce:" self._lastoper2["text"] = "%s" % production if self._animate.get(): self._animate_reduce() else: self._redraw() return production def undo(self, *e): if self._animating_lock: return if self._parser.undo(): self._redraw() def postscript(self, *e): self._cframe.print_to_file() def mainloop(self, *args, **kwargs): """ Enter the Tkinter mainloop. This function must be called if this demo is created from a non-interactive program (e.g. from a secript); otherwise, the demo will close as soon as the script completes. 
""" if in_idle(): return self._top.mainloop(*args, **kwargs) ######################################### ## Menubar callbacks ######################################### def resize(self, size=None): if size is not None: self._size.set(size) size = self._size.get() self._font.configure(size=-(abs(size))) self._boldfont.configure(size=-(abs(size))) self._sysfont.configure(size=-(abs(size))) # self._stacklabel['font'] = ('helvetica', -size-4, 'bold') # self._rtextlabel['font'] = ('helvetica', -size-4, 'bold') # self._lastoper_label['font'] = ('helvetica', -size) # self._lastoper1['font'] = ('helvetica', -size) # self._lastoper2['font'] = ('helvetica', -size) # self._prodlist['font'] = ('helvetica', -size) # self._prodlist_label['font'] = ('helvetica', -size-2, 'bold') self._redraw() def help(self, *e): # The default font's not very legible; try using 'fixed' instead. try: ShowText( self._top, "Help: Shift-Reduce Parser Application", (__doc__ or "").strip(), width=75, font="fixed", ) except: ShowText( self._top, "Help: Shift-Reduce Parser Application", (__doc__ or "").strip(), width=75, ) def about(self, *e): ABOUT = "NLTK Shift-Reduce Parser Application\n" + "Written by Edward Loper" TITLE = "About: Shift-Reduce Parser Application" try: from tkinter.messagebox import Message Message(message=ABOUT, title=TITLE).show() except: ShowText(self._top, TITLE, ABOUT) def edit_grammar(self, *e): CFGEditor(self._top, self._parser.grammar(), self.set_grammar) def set_grammar(self, grammar): self._parser.set_grammar(grammar) self._productions = list(grammar.productions()) self._prodlist.delete(0, "end") for production in self._productions: self._prodlist.insert("end", (" %s" % production)) def edit_sentence(self, *e): sentence = " ".join(self._sent) title = "Edit Text" instr = "Enter a new sentence to parse." EntryDialog(self._top, sentence, instr, self.set_sentence, title) def set_sentence(self, sent): self._sent = sent.split() # [XX] use tagged? self.reset() ######################################### ## Reduce Production Selection ######################################### def _toggle_grammar(self, *e): if self._show_grammar.get(): self._prodframe.pack( fill="both", side="left", padx=2, after=self._feedbackframe ) self._lastoper1["text"] = "Show Grammar" else: self._prodframe.pack_forget() self._lastoper1["text"] = "Hide Grammar" self._lastoper2["text"] = "" def _prodlist_select(self, event): selection = self._prodlist.curselection() if len(selection) != 1: return index = int(selection[0]) production = self._parser.reduce(self._productions[index]) if production: self._lastoper1["text"] = "Reduce:" self._lastoper2["text"] = "%s" % production if self._animate.get(): self._animate_reduce() else: self._redraw() else: # Reset the production selections. self._prodlist.selection_clear(0, "end") for prod in self._parser.reducible_productions(): index = self._productions.index(prod) self._prodlist.selection_set(index) def _popup_reduce(self, widget): # Remove old commands. productions = self._parser.reducible_productions() if len(productions) == 0: return self._reduce_menu.delete(0, "end") for production in productions: self._reduce_menu.add_command(label=str(production), command=self.reduce) self._reduce_menu.post( self._canvas.winfo_pointerx(), self._canvas.winfo_pointery() ) ######################################### ## Animations ######################################### def _animate_shift(self): # What widget are we shifting? widget = self._rtextwidgets[0] # Where are we shifting from & to? 
right = widget.bbox()[0] if len(self._stackwidgets) == 0: left = 5 else: left = self._stackwidgets[-1].bbox()[2] + 10 # Start animating. dt = self._animate.get() dx = (left - right) * 1.0 / dt self._animate_shift_frame(dt, widget, dx) def _animate_shift_frame(self, frame, widget, dx): if frame > 0: self._animating_lock = 1 widget.move(dx, 0) self._top.after(10, self._animate_shift_frame, frame - 1, widget, dx) else: # but: stacktop?? # Shift the widget to the stack. del self._rtextwidgets[0] self._stackwidgets.append(widget) self._animating_lock = 0 # Display the available productions. self._draw_stack_top(widget) self._highlight_productions() def _animate_reduce(self): # What widgets are we shifting? numwidgets = len(self._parser.stack()[-1]) # number of children widgets = self._stackwidgets[-numwidgets:] # How far are we moving? if isinstance(widgets[0], TreeSegmentWidget): ydist = 15 + widgets[0].label().height() else: ydist = 15 + widgets[0].height() # Start animating. dt = self._animate.get() dy = ydist * 2.0 / dt self._animate_reduce_frame(dt / 2, widgets, dy) def _animate_reduce_frame(self, frame, widgets, dy): if frame > 0: self._animating_lock = 1 for widget in widgets: widget.move(0, dy) self._top.after(10, self._animate_reduce_frame, frame - 1, widgets, dy) else: del self._stackwidgets[-len(widgets) :] for widget in widgets: self._cframe.remove_widget(widget) tok = self._parser.stack()[-1] if not isinstance(tok, Tree): raise ValueError() label = TextWidget( self._canvas, str(tok.label()), color="#006060", font=self._boldfont ) widget = TreeSegmentWidget(self._canvas, label, widgets, width=2) (x1, y1, x2, y2) = self._stacklabel.bbox() y = y2 - y1 + 10 if not self._stackwidgets: x = 5 else: x = self._stackwidgets[-1].bbox()[2] + 10 self._cframe.add_widget(widget, x, y) self._stackwidgets.append(widget) # Display the available productions. self._draw_stack_top(widget) self._highlight_productions() # # Delete the old widgets.. # del self._stackwidgets[-len(widgets):] # for widget in widgets: # self._cframe.destroy_widget(widget) # # # Make a new one. # tok = self._parser.stack()[-1] # if isinstance(tok, Tree): # attribs = {'tree_color': '#4080a0', 'tree_width': 2, # 'node_font': bold, 'node_color': '#006060', # 'leaf_color': '#006060', 'leaf_font':self._font} # widget = tree_to_treesegment(self._canvas, tok.type(), # **attribs) # widget.node()['color'] = '#000000' # else: # widget = TextWidget(self._canvas, tok.type(), # color='#000000', font=self._font) # widget.bind_click(self._popup_reduce) # (x1, y1, x2, y2) = self._stacklabel.bbox() # y = y2-y1+10 # if not self._stackwidgets: x = 5 # else: x = self._stackwidgets[-1].bbox()[2] + 10 # self._cframe.add_widget(widget, x, y) # self._stackwidgets.append(widget) # self._redraw() self._animating_lock = 0 ######################################### ## Hovering. ######################################### def _highlight_hover(self, event): # What production are we hovering over? index = self._prodlist.nearest(event.y) if self._hover == index: return # Clear any previous hover highlighting. self._clear_hover() # If the production corresponds to an available reduction, # highlight the stack. selection = [int(s) for s in self._prodlist.curselection()] if index in selection: rhslen = len(self._productions[index].rhs()) for stackwidget in self._stackwidgets[-rhslen:]: if isinstance(stackwidget, TreeSegmentWidget): stackwidget.label()["color"] = "#00a000" else: stackwidget["color"] = "#00a000" # Remember what production we're hovering over. 
self._hover = index def _clear_hover(self, *event): # Clear any previous hover highlighting. if self._hover == -1: return self._hover = -1 for stackwidget in self._stackwidgets: if isinstance(stackwidget, TreeSegmentWidget): stackwidget.label()["color"] = "black" else: stackwidget["color"] = "black" def app(): """ Create a shift reduce parser app, using a simple grammar and text. """ from nltk.grammar import CFG, Nonterminal, Production nonterminals = "S VP NP PP P N Name V Det" (S, VP, NP, PP, P, N, Name, V, Det) = (Nonterminal(s) for s in nonterminals.split()) productions = ( # Syntactic Productions Production(S, [NP, VP]), Production(NP, [Det, N]), Production(NP, [NP, PP]), Production(VP, [VP, PP]), Production(VP, [V, NP, PP]), Production(VP, [V, NP]), Production(PP, [P, NP]), # Lexical Productions Production(NP, ["I"]), Production(Det, ["the"]), Production(Det, ["a"]), Production(N, ["man"]), Production(V, ["saw"]), Production(P, ["in"]), Production(P, ["with"]), Production(N, ["park"]), Production(N, ["dog"]), Production(N, ["statue"]), Production(Det, ["my"]), ) grammar = CFG(S, productions) # tokenize the sentence sent = "my dog saw a man in the park with a statue".split() ShiftReduceApp(grammar, sent).mainloop() if __name__ == "__main__": app() __all__ = ["app"] nltk-3.7/nltk/app/wordfreq_app.py000066400000000000000000000016311420073152400171250ustar00rootroot00000000000000# Natural Language Toolkit: Wordfreq Application # # Copyright (C) 2001-2022 NLTK Project # Author: Sumukh Ghodke # URL: # For license information, see LICENSE.TXT from matplotlib import pylab from nltk.corpus import gutenberg from nltk.text import Text def plot_word_freq_dist(text): fd = text.vocab() samples = [item for item, _ in fd.most_common(50)] values = [fd[sample] for sample in samples] values = [sum(values[: i + 1]) * 100.0 / fd.N() for i in range(len(values))] pylab.title(text.name) pylab.xlabel("Samples") pylab.ylabel("Cumulative Percentage") pylab.plot(values) pylab.xticks(range(len(samples)), [str(s) for s in samples], rotation=90) pylab.show() def app(): t1 = Text(gutenberg.words("melville-moby_dick.txt")) plot_word_freq_dist(t1) if __name__ == "__main__": app() __all__ = ["app"] nltk-3.7/nltk/app/wordnet_app.py000066400000000000000000001026571420073152400167700ustar00rootroot00000000000000# Natural Language Toolkit: WordNet Browser Application # # Copyright (C) 2001-2022 NLTK Project # Author: Jussi Salmela # Paul Bone # URL: # For license information, see LICENSE.TXT """ A WordNet Browser application which launches the default browser (if it is not already running) and opens a new tab with a connection to http://localhost:port/ . It also starts an HTTP server on the specified port and begins serving browser requests. The default port is 8000. (For command-line help, run "python wordnet -h") This application requires that the user's web browser supports Javascript. BrowServer is a server for browsing the NLTK Wordnet database It first launches a browser client to be used for browsing and then starts serving the requests of that and maybe other clients Usage:: browserver.py -h browserver.py [-s] [-p ] Options:: -h or --help Display this help message. -l or --log-file Logs messages to the given file, If this option is not specified messages are silently dropped. -p or --port Run the web server on this TCP port, defaults to 8000. -s or --server-mode Do not start a web browser, and do not allow a user to shutdown the server through the web interface. 
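
Example (an illustrative invocation, using the script name and options
documented above; the port number is arbitrary)::

    browserver.py -p 8080 -s

This serves the browser pages on http://localhost:8080/ without opening
a local web browser.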
""" # TODO: throughout this package variable names and docstrings need # modifying to be compliant with NLTK's coding standards. Tests also # need to be develop to ensure this continues to work in the face of # changes to other NLTK packages. import base64 import copy import datetime import getopt import os import pickle import re import sys import threading import time import webbrowser from collections import defaultdict from http.server import BaseHTTPRequestHandler, HTTPServer # Allow this program to run inside the NLTK source tree. from sys import argv, path from urllib.parse import unquote_plus from nltk.corpus import wordnet as wn from nltk.corpus.reader.wordnet import Lemma, Synset # now included in local file # from util import html_header, html_trailer, \ # get_static_index_page, get_static_page_by_path, \ # page_from_word, page_from_href firstClient = True # True if we're not also running a web browser. The value f server_mode # gets set by demo(). server_mode = None # If set this is a file object for writing log messages. logfile = None class MyServerHandler(BaseHTTPRequestHandler): def do_HEAD(self): self.send_head() def do_GET(self): global firstClient sp = self.path[1:] if unquote_plus(sp) == "SHUTDOWN THE SERVER": if server_mode: page = "Server must be killed with SIGTERM." type = "text/plain" else: print("Server shutting down!") os._exit(0) elif sp == "": # First request. type = "text/html" if not server_mode and firstClient: firstClient = False page = get_static_index_page(True) else: page = get_static_index_page(False) word = "green" elif sp.endswith(".html"): # Trying to fetch a HTML file TODO: type = "text/html" usp = unquote_plus(sp) if usp == "NLTK Wordnet Browser Database Info.html": word = "* Database Info *" if os.path.isfile(usp): with open(usp) as infile: page = infile.read() else: page = ( (html_header % word) + "

The database info file:" "

" + usp + "" + "

was not found. Run this:" + "

python dbinfo_html.py" + "

to produce it." + html_trailer ) else: # Handle files here. word = sp page = get_static_page_by_path(usp) elif sp.startswith("search"): # This doesn't seem to work with MWEs. type = "text/html" parts = (sp.split("?")[1]).split("&") word = [ p.split("=")[1].replace("+", " ") for p in parts if p.startswith("nextWord") ][0] page, word = page_from_word(word) elif sp.startswith("lookup_"): # TODO add a variation of this that takes a non ecoded word or MWE. type = "text/html" sp = sp[len("lookup_") :] page, word = page_from_href(sp) elif sp == "start_page": # if this is the first request we should display help # information, and possibly set a default word. type = "text/html" page, word = page_from_word("wordnet") else: type = "text/plain" page = "Could not parse request: '%s'" % sp # Send result. self.send_head(type) self.wfile.write(page.encode("utf8")) def send_head(self, type=None): self.send_response(200) self.send_header("Content-type", type) self.end_headers() def log_message(self, format, *args): global logfile if logfile: logfile.write( "%s - - [%s] %s\n" % (self.address_string(), self.log_date_time_string(), format % args) ) def get_unique_counter_from_url(sp): """ Extract the unique counter from the URL if it has one. Otherwise return null. """ pos = sp.rfind("%23") if pos != -1: return int(sp[(pos + 3) :]) else: return None def wnb(port=8000, runBrowser=True, logfilename=None): """ Run NLTK Wordnet Browser Server. :param port: The port number for the server to listen on, defaults to 8000 :type port: int :param runBrowser: True to start a web browser and point it at the web server. :type runBrowser: bool """ # The webbrowser module is unpredictable, typically it blocks if it uses # a console web browser, and doesn't block if it uses a GUI webbrowser, # so we need to force it to have a clear correct behaviour. # # Normally the server should run for as long as the user wants. they # should idealy be able to control this from the UI by closing the # window or tab. Second best would be clicking a button to say # 'Shutdown' that first shutsdown the server and closes the window or # tab, or exits the text-mode browser. Both of these are unfreasable. # # The next best alternative is to start the server, have it close when # it receives SIGTERM (default), and run the browser as well. The user # may have to shutdown both programs. # # Since webbrowser may block, and the webserver will block, we must run # them in separate threads. # global server_mode, logfile server_mode = not runBrowser # Setup logging. if logfilename: try: logfile = open(logfilename, "a", 1) # 1 means 'line buffering' except OSError as e: sys.stderr.write("Couldn't open %s for writing: %s", logfilename, e) sys.exit(1) else: logfile = None # Compute URL and start web browser url = "http://localhost:" + str(port) server_ready = None browser_thread = None if runBrowser: server_ready = threading.Event() browser_thread = startBrowser(url, server_ready) # Start the server. server = HTTPServer(("", port), MyServerHandler) if logfile: logfile.write("NLTK Wordnet browser server running serving: %s\n" % url) if runBrowser: server_ready.set() try: server.serve_forever() except KeyboardInterrupt: pass if runBrowser: browser_thread.join() if logfile: logfile.close() def startBrowser(url, server_ready): def run(): server_ready.wait() time.sleep(1) # Wait a little bit more, there's still the chance of # a race condition. 
webbrowser.open(url, new=2, autoraise=1) t = threading.Thread(target=run) t.start() return t ##################################################################### # Utilities ##################################################################### """ WordNet Browser Utilities. This provides a backend to both wxbrowse and browserver.py. """ ################################################################################ # # Main logic for wordnet browser. # # This is wrapped inside a function since wn is only available if the # WordNet corpus is installed. def _pos_tuples(): return [ (wn.NOUN, "N", "noun"), (wn.VERB, "V", "verb"), (wn.ADJ, "J", "adj"), (wn.ADV, "R", "adv"), ] def _pos_match(pos_tuple): """ This function returns the complete pos tuple for the partial pos tuple given to it. It attempts to match it against the first non-null component of the given pos tuple. """ if pos_tuple[0] == "s": pos_tuple = ("a", pos_tuple[1], pos_tuple[2]) for n, x in enumerate(pos_tuple): if x is not None: break for pt in _pos_tuples(): if pt[n] == pos_tuple[n]: return pt return None HYPONYM = 0 HYPERNYM = 1 CLASS_REGIONAL = 2 PART_HOLONYM = 3 PART_MERONYM = 4 ATTRIBUTE = 5 SUBSTANCE_HOLONYM = 6 SUBSTANCE_MERONYM = 7 MEMBER_HOLONYM = 8 MEMBER_MERONYM = 9 VERB_GROUP = 10 INSTANCE_HYPONYM = 12 INSTANCE_HYPERNYM = 13 CAUSE = 14 ALSO_SEE = 15 SIMILAR = 16 ENTAILMENT = 17 ANTONYM = 18 FRAMES = 19 PERTAINYM = 20 CLASS_CATEGORY = 21 CLASS_USAGE = 22 CLASS_REGIONAL = 23 CLASS_USAGE = 24 CLASS_CATEGORY = 11 DERIVATIONALLY_RELATED_FORM = 25 INDIRECT_HYPERNYMS = 26 def lemma_property(word, synset, func): def flattern(l): if l == []: return [] else: return l[0] + flattern(l[1:]) return flattern([func(l) for l in synset.lemmas if l.name == word]) def rebuild_tree(orig_tree): node = orig_tree[0] children = orig_tree[1:] return (node, [rebuild_tree(t) for t in children]) def get_relations_data(word, synset): """ Get synset relations data for a synset. Note that this doesn't yet support things such as full hyponym vs direct hyponym. 
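
    The data is returned as a tuple of (relation id, display name,
    related synsets or lemmas) triples, one triple per relation type
    that applies to the synset's part of speech.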
""" if synset.pos() == wn.NOUN: return ( (HYPONYM, "Hyponyms", synset.hyponyms()), (INSTANCE_HYPONYM, "Instance hyponyms", synset.instance_hyponyms()), (HYPERNYM, "Direct hypernyms", synset.hypernyms()), ( INDIRECT_HYPERNYMS, "Indirect hypernyms", rebuild_tree(synset.tree(lambda x: x.hypernyms()))[1], ), # hypernyms', 'Sister terms', (INSTANCE_HYPERNYM, "Instance hypernyms", synset.instance_hypernyms()), # (CLASS_REGIONAL, ['domain term region'], ), (PART_HOLONYM, "Part holonyms", synset.part_holonyms()), (PART_MERONYM, "Part meronyms", synset.part_meronyms()), (SUBSTANCE_HOLONYM, "Substance holonyms", synset.substance_holonyms()), (SUBSTANCE_MERONYM, "Substance meronyms", synset.substance_meronyms()), (MEMBER_HOLONYM, "Member holonyms", synset.member_holonyms()), (MEMBER_MERONYM, "Member meronyms", synset.member_meronyms()), (ATTRIBUTE, "Attributes", synset.attributes()), (ANTONYM, "Antonyms", lemma_property(word, synset, lambda l: l.antonyms())), ( DERIVATIONALLY_RELATED_FORM, "Derivationally related form", lemma_property( word, synset, lambda l: l.derivationally_related_forms() ), ), ) elif synset.pos() == wn.VERB: return ( (ANTONYM, "Antonym", lemma_property(word, synset, lambda l: l.antonyms())), (HYPONYM, "Hyponym", synset.hyponyms()), (HYPERNYM, "Direct hypernyms", synset.hypernyms()), ( INDIRECT_HYPERNYMS, "Indirect hypernyms", rebuild_tree(synset.tree(lambda x: x.hypernyms()))[1], ), (ENTAILMENT, "Entailments", synset.entailments()), (CAUSE, "Causes", synset.causes()), (ALSO_SEE, "Also see", synset.also_sees()), (VERB_GROUP, "Verb Groups", synset.verb_groups()), ( DERIVATIONALLY_RELATED_FORM, "Derivationally related form", lemma_property( word, synset, lambda l: l.derivationally_related_forms() ), ), ) elif synset.pos() == wn.ADJ or synset.pos == wn.ADJ_SAT: return ( (ANTONYM, "Antonym", lemma_property(word, synset, lambda l: l.antonyms())), (SIMILAR, "Similar to", synset.similar_tos()), # Participle of verb - not supported by corpus ( PERTAINYM, "Pertainyms", lemma_property(word, synset, lambda l: l.pertainyms()), ), (ATTRIBUTE, "Attributes", synset.attributes()), (ALSO_SEE, "Also see", synset.also_sees()), ) elif synset.pos() == wn.ADV: # This is weird. adverbs such as 'quick' and 'fast' don't seem # to have antonyms returned by the corpus.a return ( (ANTONYM, "Antonym", lemma_property(word, synset, lambda l: l.antonyms())), ) # Derived from adjective - not supported by corpus else: raise TypeError("Unhandles synset POS type: " + str(synset.pos())) html_header = """ NLTK Wordnet Browser display of: %s """ html_trailer = """ """ explanation = """

Search Help

  • The display below the line is an example of the output the browser shows you when you enter a search word. The search word was green.
  • The search result shows for different parts of speech the synsets i.e. different meanings for the word.
  • All underlined texts are hypertext links. There are two types of links: word links and others. Clicking a word link carries out a search for the word in the Wordnet database.
  • Clicking a link of the other type opens a display section of data attached to that link. Clicking that link a second time closes the section again.
  • Clicking S: opens a section showing the relations for that synset.
  • Clicking on a relation name opens a section that displays the associated synsets.
  • Type a search word in the Word field and start the search by the Enter/Return key or click the Search button.

""" # HTML oriented functions def _bold(txt): return "%s" % txt def _center(txt): return "
%s
" % txt def _hlev(n, txt): return "%s" % (n, txt, n) def _italic(txt): return "%s" % txt def _li(txt): return "
  • %s
  • " % txt def pg(word, body): """ Return a HTML page of NLTK Browser format constructed from the word and body :param word: The word that the body corresponds to :type word: str :param body: The HTML body corresponding to the word :type body: str :return: a HTML page for the word-body combination :rtype: str """ return (html_header % word) + body + html_trailer def _ul(txt): return "
      " + txt + "
    " def _abbc(txt): """ abbc = asterisks, breaks, bold, center """ return _center(_bold("
    " * 10 + "*" * 10 + " " + txt + " " + "*" * 10)) full_hyponym_cont_text = _ul(_li(_italic("(has full hyponym continuation)"))) + "\n" def _get_synset(synset_key): """ The synset key is the unique name of the synset, this can be retrieved via synset.name() """ return wn.synset(synset_key) def _collect_one_synset(word, synset, synset_relations): """ Returns the HTML string for one synset or word :param word: the current word :type word: str :param synset: a synset :type synset: synset :param synset_relations: information about which synset relations to display. :type synset_relations: dict(synset_key, set(relation_id)) :return: The HTML string built for this synset :rtype: str """ if isinstance(synset, tuple): # It's a word raise NotImplementedError("word not supported by _collect_one_synset") typ = "S" pos_tuple = _pos_match((synset.pos(), None, None)) assert pos_tuple is not None, "pos_tuple is null: synset.pos(): %s" % synset.pos() descr = pos_tuple[2] ref = copy.deepcopy(Reference(word, synset_relations)) ref.toggle_synset(synset) synset_label = typ + ";" if synset.name() in synset_relations: synset_label = _bold(synset_label) s = f"
  • {make_lookup_link(ref, synset_label)} ({descr}) " def format_lemma(w): w = w.replace("_", " ") if w.lower() == word: return _bold(w) else: ref = Reference(w) return make_lookup_link(ref, w) s += ", ".join(format_lemma(l.name()) for l in synset.lemmas()) gl = " ({}) {} ".format( synset.definition(), "; ".join('"%s"' % e for e in synset.examples()), ) return s + gl + _synset_relations(word, synset, synset_relations) + "
  • \n" def _collect_all_synsets(word, pos, synset_relations=dict()): """ Return a HTML unordered list of synsets for the given word and part of speech. """ return "
      %s\n
    \n" % "".join( _collect_one_synset(word, synset, synset_relations) for synset in wn.synsets(word, pos) ) def _synset_relations(word, synset, synset_relations): """ Builds the HTML string for the relations of a synset :param word: The current word :type word: str :param synset: The synset for which we're building the relations. :type synset: Synset :param synset_relations: synset keys and relation types for which to display relations. :type synset_relations: dict(synset_key, set(relation_type)) :return: The HTML for a synset's relations :rtype: str """ if not synset.name() in synset_relations: return "" ref = Reference(word, synset_relations) def relation_html(r): if isinstance(r, Synset): return make_lookup_link(Reference(r.lemma_names()[0]), r.lemma_names()[0]) elif isinstance(r, Lemma): return relation_html(r.synset()) elif isinstance(r, tuple): # It's probably a tuple containing a Synset and a list of # similar tuples. This forms a tree of synsets. return "{}\n
      {}
    \n".format( relation_html(r[0]), "".join("
  • %s
  • \n" % relation_html(sr) for sr in r[1]), ) else: raise TypeError( "r must be a synset, lemma or list, it was: type(r) = %s, r = %s" % (type(r), r) ) def make_synset_html(db_name, disp_name, rels): synset_html = "%s\n" % make_lookup_link( copy.deepcopy(ref).toggle_synset_relation(synset, db_name).encode(), disp_name, ) if db_name in ref.synset_relations[synset.name()]: synset_html += "
      %s
    \n" % "".join( "
  • %s
  • \n" % relation_html(r) for r in rels ) return synset_html html = ( "
      " + "\n".join( "
    • %s
    • " % make_synset_html(*rel_data) for rel_data in get_relations_data(word, synset) if rel_data[2] != [] ) + "
    " ) return html class Reference: """ A reference to a page that may be generated by page_word """ def __init__(self, word, synset_relations=dict()): """ Build a reference to a new page. word is the word or words (separated by commas) for which to search for synsets of synset_relations is a dictionary of synset keys to sets of synset relation identifaiers to unfold a list of synset relations for. """ self.word = word self.synset_relations = synset_relations def encode(self): """ Encode this reference into a string to be used in a URL. """ # This uses a tuple rather than an object since the python # pickle representation is much smaller and there is no need # to represent the complete object. string = pickle.dumps((self.word, self.synset_relations), -1) return base64.urlsafe_b64encode(string).decode() @staticmethod def decode(string): """ Decode a reference encoded with Reference.encode """ string = base64.urlsafe_b64decode(string.encode()) word, synset_relations = pickle.loads(string) return Reference(word, synset_relations) def toggle_synset_relation(self, synset, relation): """ Toggle the display of the relations for the given synset and relation type. This function will throw a KeyError if the synset is currently not being displayed. """ if relation in self.synset_relations[synset.name()]: self.synset_relations[synset.name()].remove(relation) else: self.synset_relations[synset.name()].add(relation) return self def toggle_synset(self, synset): """ Toggle displaying of the relation types for the given synset """ if synset.name() in self.synset_relations: del self.synset_relations[synset.name()] else: self.synset_relations[synset.name()] = set() return self def make_lookup_link(ref, label): return f'{label}' def page_from_word(word): """ Return a HTML page for the given word. :type word: str :param word: The currently active word :return: A tuple (page,word), where page is the new current HTML page to be sent to the browser and word is the new current word :rtype: A tuple (str,str) """ return page_from_reference(Reference(word)) def page_from_href(href): """ Returns a tuple of the HTML page built and the new current word :param href: The hypertext reference to be solved :type href: str :return: A tuple (page,word), where page is the new current HTML page to be sent to the browser and word is the new current word :rtype: A tuple (str,str) """ return page_from_reference(Reference.decode(href)) def page_from_reference(href): """ Returns a tuple of the HTML page built and the new current word :param href: The hypertext reference to be solved :type href: str :return: A tuple (page,word), where page is the new current HTML page to be sent to the browser and word is the new current word :rtype: A tuple (str,str) """ word = href.word pos_forms = defaultdict(list) words = word.split(",") words = [w for w in [w.strip().lower().replace(" ", "_") for w in words] if w != ""] if len(words) == 0: # No words were found. return "", "Please specify a word to search for." # This looks up multiple words at once. This is probably not # necessary and may lead to problems. for w in words: for pos in [wn.NOUN, wn.VERB, wn.ADJ, wn.ADV]: form = wn.morphy(w, pos) if form and form not in pos_forms[pos]: pos_forms[pos].append(form) body = "" for pos, pos_str, name in _pos_tuples(): if pos in pos_forms: body += _hlev(3, name) + "\n" for w in pos_forms[pos]: # Not all words of exc files are in the database, skip # to the next word if a KeyError is raised. 
try: body += _collect_all_synsets(w, pos, href.synset_relations) except KeyError: pass if not body: body = "The word or words '%s' where not found in the dictionary." % word return body, word ##################################################################### # Static pages ##################################################################### def get_static_page_by_path(path): """ Return a static HTML page from the path given. """ if path == "index_2.html": return get_static_index_page(False) elif path == "index.html": return get_static_index_page(True) elif path == "NLTK Wordnet Browser Database Info.html": return "Display of Wordnet Database Statistics is not supported" elif path == "upper_2.html": return get_static_upper_page(False) elif path == "upper.html": return get_static_upper_page(True) elif path == "web_help.html": return get_static_web_help_page() elif path == "wx_help.html": return get_static_wx_help_page() else: return "Internal error: Path for static page '%s' is unknown" % path def get_static_web_help_page(): """ Return the static web help page. """ return """ NLTK Wordnet Browser display of: * Help *

    NLTK Wordnet Browser Help

    The NLTK Wordnet Browser is a tool for browsing the Wordnet database. It behaves much like the Wordnet project's own web browser, except that the NLTK Wordnet Browser uses a local Wordnet database.

    You are using the Javascript client part of the NLTK Wordnet BrowseServer. We assume that your browser has tabbed browsing enabled.

    For background information on Wordnet, see the Wordnet project home page: https://wordnet.princeton.edu/. For more information on the NLTK project, see the project home: https://www.nltk.org/. To get an idea of what the Wordnet version used by this browser includes, choose Show Database Info from the View submenu.

    Word search

    The word to be searched is typed into the New Word field and the search started with Enter or by clicking the Search button. There is no uppercase/lowercase distinction: the search word is transformed to lowercase before the search.

    In addition, the word does not have to be in base form. The browser tries to find the possible base form(s) by making certain morphological substitutions. Typing fLIeS as an obscure example gives one this. Click the previous link to see what this kind of search looks like and then come back to this page by using the Alt+LeftArrow key combination.

    The result of a search is a display of one or more synsets for every part of speech in which a form of the search word was found to occur. A synset is a set of words having the same sense or meaning. Each word in a synset that is underlined is a hyperlink which can be clicked to trigger an automatic search for that word.

    Every synset has a hyperlink S: at the start of its display line. Clicking that symbol shows you the name of every relation that this synset is part of. Every relation name is a hyperlink that opens up a display for that relation. Clicking it another time closes the display again. Clicking another relation name on a line that has an opened relation closes the open relation and opens the clicked relation.

    It is also possible to give two or more words or collocations to be searched at the same time, separating them with a comma, as in cheer up,clear up, for example. Click the previous link to see what this kind of search looks like and then come back to this page by using the Alt+LeftArrow key combination. As you can see, the search result includes the synsets found in the same order as the forms were given in the search field.

    There are also word level (lexical) relations recorded in the Wordnet database. Opening this kind of relation displays lines with a hyperlink W: at their beginning. Clicking this link shows more info on the word in question.

    The Buttons

    The Search and Help buttons need no more explanation.

    The Show Database Info button shows a collection of Wordnet database statistics.

    The Shutdown the Server button is shown only for the first client of the BrowServer program, i.e. for the client that is automatically launched when the BrowServer is started, but not for succeeding clients, in order to protect the server from accidental shutdowns.

    """ def get_static_welcome_message(): """ Get the static welcome page. """ return """

    Search Help

    • The display below the line is an example of the output the browser shows you when you enter a search word. The search word was green.
    • The search result shows for different parts of speech the synsets i.e. different meanings for the word.
    • All underlined texts are hypertext links. There are two types of links: word links and others. Clicking a word link carries out a search for the word in the Wordnet database.
    • Clicking a link of the other type opens a display section of data attached to that link. Clicking that link a second time closes the section again.
    • Clicking S: opens a section showing the relations for that synset.
    • Clicking on a relation name opens a section that displays the associated synsets.
    • Type a search word in the Next Word field and start the search by the Enter/Return key or click the Search button.
    """ def get_static_index_page(with_shutdown): """ Get the static index page. """ template = """ NLTK Wordnet Browser """ if with_shutdown: upper_link = "upper.html" else: upper_link = "upper_2.html" return template % upper_link def get_static_upper_page(with_shutdown): """ Return the upper frame page, If with_shutdown is True then a 'shutdown' button is also provided to shutdown the server. """ template = """ Untitled Document
    Current Word:  Next Word: 
    Help %s """ if with_shutdown: shutdown_link = 'Shutdown' else: shutdown_link = "" return template % shutdown_link def usage(): """ Display the command line help message. """ print(__doc__) def app(): # Parse and interpret options. (opts, _) = getopt.getopt( argv[1:], "l:p:sh", ["logfile=", "port=", "server-mode", "help"] ) port = 8000 server_mode = False help_mode = False logfilename = None for (opt, value) in opts: if (opt == "-l") or (opt == "--logfile"): logfilename = str(value) elif (opt == "-p") or (opt == "--port"): port = int(value) elif (opt == "-s") or (opt == "--server-mode"): server_mode = True elif (opt == "-h") or (opt == "--help"): help_mode = True if help_mode: usage() else: wnb(port, not server_mode, logfilename) if __name__ == "__main__": app() __all__ = ["app"] nltk-3.7/nltk/book.py000066400000000000000000000071631420073152400146140ustar00rootroot00000000000000# Natural Language Toolkit: Some texts for exploration in chapter 1 of the book # # Copyright (C) 2001-2022 NLTK Project # Author: Steven Bird # # URL: # For license information, see LICENSE.TXT from nltk.corpus import ( genesis, gutenberg, inaugural, nps_chat, treebank, webtext, wordnet, ) from nltk.probability import FreqDist from nltk.text import Text from nltk.util import bigrams print("*** Introductory Examples for the NLTK Book ***") print("Loading text1, ..., text9 and sent1, ..., sent9") print("Type the name of the text or sentence to view it.") print("Type: 'texts()' or 'sents()' to list the materials.") text1 = Text(gutenberg.words("melville-moby_dick.txt")) print("text1:", text1.name) text2 = Text(gutenberg.words("austen-sense.txt")) print("text2:", text2.name) text3 = Text(genesis.words("english-kjv.txt"), name="The Book of Genesis") print("text3:", text3.name) text4 = Text(inaugural.words(), name="Inaugural Address Corpus") print("text4:", text4.name) text5 = Text(nps_chat.words(), name="Chat Corpus") print("text5:", text5.name) text6 = Text(webtext.words("grail.txt"), name="Monty Python and the Holy Grail") print("text6:", text6.name) text7 = Text(treebank.words(), name="Wall Street Journal") print("text7:", text7.name) text8 = Text(webtext.words("singles.txt"), name="Personals Corpus") print("text8:", text8.name) text9 = Text(gutenberg.words("chesterton-thursday.txt")) print("text9:", text9.name) def texts(): print("text1:", text1.name) print("text2:", text2.name) print("text3:", text3.name) print("text4:", text4.name) print("text5:", text5.name) print("text6:", text6.name) print("text7:", text7.name) print("text8:", text8.name) print("text9:", text9.name) sent1 = ["Call", "me", "Ishmael", "."] sent2 = [ "The", "family", "of", "Dashwood", "had", "long", "been", "settled", "in", "Sussex", ".", ] sent3 = [ "In", "the", "beginning", "God", "created", "the", "heaven", "and", "the", "earth", ".", ] sent4 = [ "Fellow", "-", "Citizens", "of", "the", "Senate", "and", "of", "the", "House", "of", "Representatives", ":", ] sent5 = [ "I", "have", "a", "problem", "with", "people", "PMing", "me", "to", "lol", "JOIN", ] sent6 = [ "SCENE", "1", ":", "[", "wind", "]", "[", "clop", "clop", "clop", "]", "KING", "ARTHUR", ":", "Whoa", "there", "!", ] sent7 = [ "Pierre", "Vinken", ",", "61", "years", "old", ",", "will", "join", "the", "board", "as", "a", "nonexecutive", "director", "Nov.", "29", ".", ] sent8 = [ "25", "SEXY", "MALE", ",", "seeks", "attrac", "older", "single", "lady", ",", "for", "discreet", "encounters", ".", ] sent9 = [ "THE", "suburb", "of", "Saffron", "Park", "lay", "on", "the", "sunset", "side", 
"of", "London", ",", "as", "red", "and", "ragged", "as", "a", "cloud", "of", "sunset", ".", ] def sents(): print("sent1:", " ".join(sent1)) print("sent2:", " ".join(sent2)) print("sent3:", " ".join(sent3)) print("sent4:", " ".join(sent4)) print("sent5:", " ".join(sent5)) print("sent6:", " ".join(sent6)) print("sent7:", " ".join(sent7)) print("sent8:", " ".join(sent8)) print("sent9:", " ".join(sent9)) nltk-3.7/nltk/ccg/000077500000000000000000000000001420073152400140355ustar00rootroot00000000000000nltk-3.7/nltk/ccg/__init__.py000066400000000000000000000015611420073152400161510ustar00rootroot00000000000000# Natural Language Toolkit: Combinatory Categorial Grammar # # Copyright (C) 2001-2022 NLTK Project # Author: Graeme Gange # URL: # For license information, see LICENSE.TXT """ Combinatory Categorial Grammar. For more information see nltk/doc/contrib/ccg/ccg.pdf """ from nltk.ccg.chart import CCGChart, CCGChartParser, CCGEdge, CCGLeafEdge from nltk.ccg.combinator import ( BackwardApplication, BackwardBx, BackwardCombinator, BackwardComposition, BackwardSx, BackwardT, DirectedBinaryCombinator, ForwardApplication, ForwardCombinator, ForwardComposition, ForwardSubstitution, ForwardT, UndirectedBinaryCombinator, UndirectedComposition, UndirectedFunctionApplication, UndirectedSubstitution, UndirectedTypeRaise, ) from nltk.ccg.lexicon import CCGLexicon nltk-3.7/nltk/ccg/api.py000066400000000000000000000234221420073152400151630ustar00rootroot00000000000000# Natural Language Toolkit: CCG Categories # # Copyright (C) 2001-2022 NLTK Project # Author: Graeme Gange # URL: # For license information, see LICENSE.TXT from abc import ABCMeta, abstractmethod from functools import total_ordering from nltk.internals import raise_unorderable_types @total_ordering class AbstractCCGCategory(metaclass=ABCMeta): """ Interface for categories in combinatory grammars. """ @abstractmethod def is_primitive(self): """ Returns true if the category is primitive. """ @abstractmethod def is_function(self): """ Returns true if the category is a function application. """ @abstractmethod def is_var(self): """ Returns true if the category is a variable. """ @abstractmethod def substitute(self, substitutions): """ Takes a set of (var, category) substitutions, and replaces every occurrence of the variable with the corresponding category. """ @abstractmethod def can_unify(self, other): """ Determines whether two categories can be unified. - Returns None if they cannot be unified - Returns a list of necessary substitutions if they can. """ # Utility functions: comparison, strings and hashing. @abstractmethod def __str__(self): pass def __eq__(self, other): return ( self.__class__ is other.__class__ and self._comparison_key == other._comparison_key ) def __ne__(self, other): return not self == other def __lt__(self, other): if not isinstance(other, AbstractCCGCategory): raise_unorderable_types("<", self, other) if self.__class__ is other.__class__: return self._comparison_key < other._comparison_key else: return self.__class__.__name__ < other.__class__.__name__ def __hash__(self): try: return self._hash except AttributeError: self._hash = hash(self._comparison_key) return self._hash class CCGVar(AbstractCCGCategory): """ Class representing a variable CCG category. Used for conjunctions (and possibly type-raising, if implemented as a unary rule). 
""" _maxID = 0 def __init__(self, prim_only=False): """Initialize a variable (selects a new identifier) :param prim_only: a boolean that determines whether the variable is restricted to primitives :type prim_only: bool """ self._id = self.new_id() self._prim_only = prim_only self._comparison_key = self._id @classmethod def new_id(cls): """ A class method allowing generation of unique variable identifiers. """ cls._maxID = cls._maxID + 1 return cls._maxID - 1 @classmethod def reset_id(cls): cls._maxID = 0 def is_primitive(self): return False def is_function(self): return False def is_var(self): return True def substitute(self, substitutions): """If there is a substitution corresponding to this variable, return the substituted category. """ for (var, cat) in substitutions: if var == self: return cat return self def can_unify(self, other): """If the variable can be replaced with other a substitution is returned. """ if other.is_primitive() or not self._prim_only: return [(self, other)] return None def id(self): return self._id def __str__(self): return "_var" + str(self._id) @total_ordering class Direction: """ Class representing the direction of a function application. Also contains maintains information as to which combinators may be used with the category. """ def __init__(self, dir, restrictions): self._dir = dir self._restrs = restrictions self._comparison_key = (dir, tuple(restrictions)) # Testing the application direction def is_forward(self): return self._dir == "/" def is_backward(self): return self._dir == "\\" def dir(self): return self._dir def restrs(self): """A list of restrictions on the combinators. '.' denotes that permuting operations are disallowed ',' denotes that function composition is disallowed '_' denotes that the direction has variable restrictions. (This is redundant in the current implementation of type-raising) """ return self._restrs def is_variable(self): return self._restrs == "_" # Unification and substitution of variable directions. # Used only if type-raising is implemented as a unary rule, as it # must inherit restrictions from the argument category. def can_unify(self, other): if other.is_variable(): return [("_", self.restrs())] elif self.is_variable(): return [("_", other.restrs())] else: if self.restrs() == other.restrs(): return [] return None def substitute(self, subs): if not self.is_variable(): return self for (var, restrs) in subs: if var == "_": return Direction(self._dir, restrs) return self # Testing permitted combinators def can_compose(self): return "," not in self._restrs def can_cross(self): return "." not in self._restrs def __eq__(self, other): return ( self.__class__ is other.__class__ and self._comparison_key == other._comparison_key ) def __ne__(self, other): return not self == other def __lt__(self, other): if not isinstance(other, Direction): raise_unorderable_types("<", self, other) if self.__class__ is other.__class__: return self._comparison_key < other._comparison_key else: return self.__class__.__name__ < other.__class__.__name__ def __hash__(self): try: return self._hash except AttributeError: self._hash = hash(self._comparison_key) return self._hash def __str__(self): r_str = "" for r in self._restrs: r_str = r_str + "%s" % r return f"{self._dir}{r_str}" # The negation operator reverses the direction of the application def __neg__(self): if self._dir == "/": return Direction("\\", self._restrs) else: return Direction("/", self._restrs) class PrimitiveCategory(AbstractCCGCategory): """ Class representing primitive categories. 
Takes a string representation of the category, and a list of strings specifying the morphological subcategories. """ def __init__(self, categ, restrictions=[]): self._categ = categ self._restrs = restrictions self._comparison_key = (categ, tuple(restrictions)) def is_primitive(self): return True def is_function(self): return False def is_var(self): return False def restrs(self): return self._restrs def categ(self): return self._categ # Substitution does nothing to a primitive category def substitute(self, subs): return self # A primitive can be unified with a class of the same # base category, given that the other category shares all # of its subclasses, or with a variable. def can_unify(self, other): if not other.is_primitive(): return None if other.is_var(): return [(other, self)] if other.categ() == self.categ(): for restr in self._restrs: if restr not in other.restrs(): return None return [] return None def __str__(self): if self._restrs == []: return "%s" % self._categ restrictions = "[%s]" % ",".join(repr(r) for r in self._restrs) return f"{self._categ}{restrictions}" class FunctionalCategory(AbstractCCGCategory): """ Class that represents a function application category. Consists of argument and result categories, together with an application direction. """ def __init__(self, res, arg, dir): self._res = res self._arg = arg self._dir = dir self._comparison_key = (arg, dir, res) def is_primitive(self): return False def is_function(self): return True def is_var(self): return False # Substitution returns the category consisting of the # substitution applied to each of its constituents. def substitute(self, subs): sub_res = self._res.substitute(subs) sub_dir = self._dir.substitute(subs) sub_arg = self._arg.substitute(subs) return FunctionalCategory(sub_res, sub_arg, self._dir) # A function can unify with another function, so long as its # constituents can unify, or with an unrestricted variable. def can_unify(self, other): if other.is_var(): return [(other, self)] if other.is_function(): sa = self._res.can_unify(other.res()) sd = self._dir.can_unify(other.dir()) if sa is not None and sd is not None: sb = self._arg.substitute(sa).can_unify(other.arg().substitute(sa)) if sb is not None: return sa + sb return None # Constituent accessors def arg(self): return self._arg def res(self): return self._res def dir(self): return self._dir def __str__(self): return f"({self._res}{self._dir}{self._arg})" nltk-3.7/nltk/ccg/chart.py000066400000000000000000000325431420073152400155170ustar00rootroot00000000000000# Natural Language Toolkit: Combinatory Categorial Grammar # # Copyright (C) 2001-2022 NLTK Project # Author: Graeme Gange # URL: # For license information, see LICENSE.TXT """ The lexicon is constructed by calling ``lexicon.fromstring()``. In order to construct a parser, you also need a rule set. The standard English rules are provided in chart as ``chart.DefaultRuleSet``. The parser can then be constructed by calling, for example: ``parser = chart.CCGChartParser(, )`` Parsing is then performed by running ``parser.parse(.split())``. While this returns a list of trees, the default representation of the produced trees is not very enlightening, particularly given that it uses the same tree class as the CFG parsers. It is probably better to call: ``chart.printCCGDerivation()`` which should print a nice representation of the derivation. 
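
For example (an illustrative sketch mirroring the demo() function at the
bottom of this module), with a lexicon ``lex`` built by
``lexicon.fromstring`` and the standard English rules::

    parser = chart.CCGChartParser(lex, chart.DefaultRuleSet)
    for parse in parser.parse("I might cook and eat the bacon".split()):
        chart.printCCGDerivation(parse)
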
This entire process is shown far more clearly in the demonstration: python chart.py """ import itertools from nltk.ccg.combinator import * from nltk.ccg.combinator import ( BackwardApplication, BackwardBx, BackwardComposition, BackwardSx, BackwardT, ForwardApplication, ForwardComposition, ForwardSubstitution, ForwardT, ) from nltk.ccg.lexicon import Token, fromstring from nltk.ccg.logic import * from nltk.parse import ParserI from nltk.parse.chart import AbstractChartRule, Chart, EdgeI from nltk.sem.logic import * from nltk.tree import Tree # Based on the EdgeI class from NLTK. # A number of the properties of the EdgeI interface don't # transfer well to CCGs, however. class CCGEdge(EdgeI): def __init__(self, span, categ, rule): self._span = span self._categ = categ self._rule = rule self._comparison_key = (span, categ, rule) # Accessors def lhs(self): return self._categ def span(self): return self._span def start(self): return self._span[0] def end(self): return self._span[1] def length(self): return self._span[1] - self.span[0] def rhs(self): return () def dot(self): return 0 def is_complete(self): return True def is_incomplete(self): return False def nextsym(self): return None def categ(self): return self._categ def rule(self): return self._rule class CCGLeafEdge(EdgeI): """ Class representing leaf edges in a CCG derivation. """ def __init__(self, pos, token, leaf): self._pos = pos self._token = token self._leaf = leaf self._comparison_key = (pos, token.categ(), leaf) # Accessors def lhs(self): return self._token.categ() def span(self): return (self._pos, self._pos + 1) def start(self): return self._pos def end(self): return self._pos + 1 def length(self): return 1 def rhs(self): return self._leaf def dot(self): return 0 def is_complete(self): return True def is_incomplete(self): return False def nextsym(self): return None def token(self): return self._token def categ(self): return self._token.categ() def leaf(self): return self._leaf class BinaryCombinatorRule(AbstractChartRule): """ Class implementing application of a binary combinator to a chart. Takes the directed combinator to apply. """ NUMEDGES = 2 def __init__(self, combinator): self._combinator = combinator # Apply a combinator def apply(self, chart, grammar, left_edge, right_edge): # The left & right edges must be touching. if not (left_edge.end() == right_edge.start()): return # Check if the two edges are permitted to combine. # If so, generate the corresponding edge. if self._combinator.can_combine(left_edge.categ(), right_edge.categ()): for res in self._combinator.combine(left_edge.categ(), right_edge.categ()): new_edge = CCGEdge( span=(left_edge.start(), right_edge.end()), categ=res, rule=self._combinator, ) if chart.insert(new_edge, (left_edge, right_edge)): yield new_edge # The representation of the combinator (for printing derivations) def __str__(self): return "%s" % self._combinator # Type-raising must be handled slightly differently to the other rules, as the # resulting rules only span a single edge, rather than both edges. 
class ForwardTypeRaiseRule(AbstractChartRule): """ Class for applying forward type raising """ NUMEDGES = 2 def __init__(self): self._combinator = ForwardT def apply(self, chart, grammar, left_edge, right_edge): if not (left_edge.end() == right_edge.start()): return for res in self._combinator.combine(left_edge.categ(), right_edge.categ()): new_edge = CCGEdge(span=left_edge.span(), categ=res, rule=self._combinator) if chart.insert(new_edge, (left_edge,)): yield new_edge def __str__(self): return "%s" % self._combinator class BackwardTypeRaiseRule(AbstractChartRule): """ Class for applying backward type raising. """ NUMEDGES = 2 def __init__(self): self._combinator = BackwardT def apply(self, chart, grammar, left_edge, right_edge): if not (left_edge.end() == right_edge.start()): return for res in self._combinator.combine(left_edge.categ(), right_edge.categ()): new_edge = CCGEdge(span=right_edge.span(), categ=res, rule=self._combinator) if chart.insert(new_edge, (right_edge,)): yield new_edge def __str__(self): return "%s" % self._combinator # Common sets of combinators used for English derivations. ApplicationRuleSet = [ BinaryCombinatorRule(ForwardApplication), BinaryCombinatorRule(BackwardApplication), ] CompositionRuleSet = [ BinaryCombinatorRule(ForwardComposition), BinaryCombinatorRule(BackwardComposition), BinaryCombinatorRule(BackwardBx), ] SubstitutionRuleSet = [ BinaryCombinatorRule(ForwardSubstitution), BinaryCombinatorRule(BackwardSx), ] TypeRaiseRuleSet = [ForwardTypeRaiseRule(), BackwardTypeRaiseRule()] # The standard English rule set. DefaultRuleSet = ( ApplicationRuleSet + CompositionRuleSet + SubstitutionRuleSet + TypeRaiseRuleSet ) class CCGChartParser(ParserI): """ Chart parser for CCGs. Based largely on the ChartParser class from NLTK. """ def __init__(self, lexicon, rules, trace=0): self._lexicon = lexicon self._rules = rules self._trace = trace def lexicon(self): return self._lexicon # Implements the CYK algorithm def parse(self, tokens): tokens = list(tokens) chart = CCGChart(list(tokens)) lex = self._lexicon # Initialize leaf edges. for index in range(chart.num_leaves()): for token in lex.categories(chart.leaf(index)): new_edge = CCGLeafEdge(index, token, chart.leaf(index)) chart.insert(new_edge, ()) # Select a span for the new edges for span in range(2, chart.num_leaves() + 1): for start in range(0, chart.num_leaves() - span + 1): # Try all possible pairs of edges that could generate # an edge for that span for part in range(1, span): lstart = start mid = start + part rend = start + span for left in chart.select(span=(lstart, mid)): for right in chart.select(span=(mid, rend)): # Generate all possible combinations of the two edges for rule in self._rules: edges_added_by_rule = 0 for newedge in rule.apply(chart, lex, left, right): edges_added_by_rule += 1 # Output the resulting parses return chart.parses(lex.start()) class CCGChart(Chart): def __init__(self, tokens): Chart.__init__(self, tokens) # Constructs the trees for a given parse. 
Unfortnunately, the parse trees need to be # constructed slightly differently to those in the default Chart class, so it has to # be reimplemented def _trees(self, edge, complete, memo, tree_class): assert complete, "CCGChart cannot build incomplete trees" if edge in memo: return memo[edge] if isinstance(edge, CCGLeafEdge): word = tree_class(edge.token(), [self._tokens[edge.start()]]) leaf = tree_class((edge.token(), "Leaf"), [word]) memo[edge] = [leaf] return [leaf] memo[edge] = [] trees = [] for cpl in self.child_pointer_lists(edge): child_choices = [self._trees(cp, complete, memo, tree_class) for cp in cpl] for children in itertools.product(*child_choices): lhs = ( Token( self._tokens[edge.start() : edge.end()], edge.lhs(), compute_semantics(children, edge), ), str(edge.rule()), ) trees.append(tree_class(lhs, children)) memo[edge] = trees return trees def compute_semantics(children, edge): if children[0].label()[0].semantics() is None: return None if len(children) == 2: if isinstance(edge.rule(), BackwardCombinator): children = [children[1], children[0]] combinator = edge.rule()._combinator function = children[0].label()[0].semantics() argument = children[1].label()[0].semantics() if isinstance(combinator, UndirectedFunctionApplication): return compute_function_semantics(function, argument) elif isinstance(combinator, UndirectedComposition): return compute_composition_semantics(function, argument) elif isinstance(combinator, UndirectedSubstitution): return compute_substitution_semantics(function, argument) else: raise AssertionError("Unsupported combinator '" + combinator + "'") else: return compute_type_raised_semantics(children[0].label()[0].semantics()) # -------- # Displaying derivations # -------- def printCCGDerivation(tree): # Get the leaves and initial categories leafcats = tree.pos() leafstr = "" catstr = "" # Construct a string with both the leaf word and corresponding # category aligned. for (leaf, cat) in leafcats: str_cat = "%s" % cat nextlen = 2 + max(len(leaf), len(str_cat)) lcatlen = (nextlen - len(str_cat)) // 2 rcatlen = lcatlen + (nextlen - len(str_cat)) % 2 catstr += " " * lcatlen + str_cat + " " * rcatlen lleaflen = (nextlen - len(leaf)) // 2 rleaflen = lleaflen + (nextlen - len(leaf)) % 2 leafstr += " " * lleaflen + leaf + " " * rleaflen print(leafstr.rstrip()) print(catstr.rstrip()) # Display the derivation steps printCCGTree(0, tree) # Prints the sequence of derivation steps. def printCCGTree(lwidth, tree): rwidth = lwidth # Is a leaf (word). # Increment the span by the space occupied by the leaf. if not isinstance(tree, Tree): return 2 + lwidth + len(tree) # Find the width of the current derivation step for child in tree: rwidth = max(rwidth, printCCGTree(rwidth, child)) # Is a leaf node. # Don't print anything, but account for the space occupied. if not isinstance(tree.label(), tuple): return max( rwidth, 2 + lwidth + len("%s" % tree.label()), 2 + lwidth + len(tree[0]) ) (token, op) = tree.label() if op == "Leaf": return rwidth # Pad to the left with spaces, followed by a sequence of '-' # and the derivation rule. print(lwidth * " " + (rwidth - lwidth) * "-" + "%s" % op) # Print the resulting category on a new line. 
str_res = "%s" % (token.categ()) if token.semantics() is not None: str_res += " {" + str(token.semantics()) + "}" respadlen = (rwidth - lwidth - len(str_res)) // 2 + lwidth print(respadlen * " " + str_res) return rwidth ### Demonstration code # Construct the lexicon lex = fromstring( """ :- S, NP, N, VP # Primitive categories, S is the target primitive Det :: NP/N # Family of words Pro :: NP TV :: VP/NP Modal :: (S\\NP)/VP # Backslashes need to be escaped I => Pro # Word -> Category mapping you => Pro the => Det # Variables have the special keyword 'var' # '.' prevents permutation # ',' prevents composition and => var\\.,var/.,var which => (N\\N)/(S/NP) will => Modal # Categories can be either explicit, or families. might => Modal cook => TV eat => TV mushrooms => N parsnips => N bacon => N """ ) def demo(): parser = CCGChartParser(lex, DefaultRuleSet) for parse in parser.parse("I might cook and eat the bacon".split()): printCCGDerivation(parse) if __name__ == "__main__": demo() nltk-3.7/nltk/ccg/combinator.py000066400000000000000000000240661420073152400165540ustar00rootroot00000000000000# Natural Language Toolkit: Combinatory Categorial Grammar # # Copyright (C) 2001-2022 NLTK Project # Author: Graeme Gange # URL: # For license information, see LICENSE.TXT """ CCG Combinators """ from abc import ABCMeta, abstractmethod from nltk.ccg.api import FunctionalCategory class UndirectedBinaryCombinator(metaclass=ABCMeta): """ Abstract class for representing a binary combinator. Merely defines functions for checking if the function and argument are able to be combined, and what the resulting category is. Note that as no assumptions are made as to direction, the unrestricted combinators can perform all backward, forward and crossed variations of the combinators; these restrictions must be added in the rule class. """ @abstractmethod def can_combine(self, function, argument): pass @abstractmethod def combine(self, function, argument): pass class DirectedBinaryCombinator(metaclass=ABCMeta): """ Wrapper for the undirected binary combinator. It takes left and right categories, and decides which is to be the function, and which the argument. It then decides whether or not they can be combined. """ @abstractmethod def can_combine(self, left, right): pass @abstractmethod def combine(self, left, right): pass class ForwardCombinator(DirectedBinaryCombinator): """ Class representing combinators where the primary functor is on the left. Takes an undirected combinator, and a predicate which adds constraints restricting the cases in which it may apply. """ def __init__(self, combinator, predicate, suffix=""): self._combinator = combinator self._predicate = predicate self._suffix = suffix def can_combine(self, left, right): return self._combinator.can_combine(left, right) and self._predicate( left, right ) def combine(self, left, right): yield from self._combinator.combine(left, right) def __str__(self): return f">{self._combinator}{self._suffix}" class BackwardCombinator(DirectedBinaryCombinator): """ The backward equivalent of the ForwardCombinator class. 
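    For example, ``BackwardApplication`` (printed as ``<``) combines an
    argument category on the left with a functor category on the right;
    ``can_combine`` and ``combine`` simply delegate to the wrapped undirected
    combinator with the two arguments swapped, while the predicate still sees
    them in surface order.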
""" def __init__(self, combinator, predicate, suffix=""): self._combinator = combinator self._predicate = predicate self._suffix = suffix def can_combine(self, left, right): return self._combinator.can_combine(right, left) and self._predicate( left, right ) def combine(self, left, right): yield from self._combinator.combine(right, left) def __str__(self): return f"<{self._combinator}{self._suffix}" class UndirectedFunctionApplication(UndirectedBinaryCombinator): """ Class representing function application. Implements rules of the form: X/Y Y -> X (>) And the corresponding backwards application rule """ def can_combine(self, function, argument): if not function.is_function(): return False return not function.arg().can_unify(argument) is None def combine(self, function, argument): if not function.is_function(): return subs = function.arg().can_unify(argument) if subs is None: return yield function.res().substitute(subs) def __str__(self): return "" # Predicates for function application. # Ensures the left functor takes an argument on the right def forwardOnly(left, right): return left.dir().is_forward() # Ensures the right functor takes an argument on the left def backwardOnly(left, right): return right.dir().is_backward() # Application combinator instances ForwardApplication = ForwardCombinator(UndirectedFunctionApplication(), forwardOnly) BackwardApplication = BackwardCombinator(UndirectedFunctionApplication(), backwardOnly) class UndirectedComposition(UndirectedBinaryCombinator): """ Functional composition (harmonic) combinator. Implements rules of the form X/Y Y/Z -> X/Z (B>) And the corresponding backwards and crossed variations. """ def can_combine(self, function, argument): # Can only combine two functions, and both functions must # allow composition. if not (function.is_function() and argument.is_function()): return False if function.dir().can_compose() and argument.dir().can_compose(): return not function.arg().can_unify(argument.res()) is None return False def combine(self, function, argument): if not (function.is_function() and argument.is_function()): return if function.dir().can_compose() and argument.dir().can_compose(): subs = function.arg().can_unify(argument.res()) if subs is not None: yield FunctionalCategory( function.res().substitute(subs), argument.arg().substitute(subs), argument.dir(), ) def __str__(self): return "B" # Predicates for restricting application of straight composition. def bothForward(left, right): return left.dir().is_forward() and right.dir().is_forward() def bothBackward(left, right): return left.dir().is_backward() and right.dir().is_backward() # Predicates for crossed composition def crossedDirs(left, right): return left.dir().is_forward() and right.dir().is_backward() def backwardBxConstraint(left, right): # The functors must be crossed inwards if not crossedDirs(left, right): return False # Permuting combinators must be allowed if not left.dir().can_cross() and right.dir().can_cross(): return False # The resulting argument category is restricted to be primitive return left.arg().is_primitive() # Straight composition combinators ForwardComposition = ForwardCombinator(UndirectedComposition(), forwardOnly) BackwardComposition = BackwardCombinator(UndirectedComposition(), backwardOnly) # Backward crossed composition BackwardBx = BackwardCombinator( UndirectedComposition(), backwardBxConstraint, suffix="x" ) class UndirectedSubstitution(UndirectedBinaryCombinator): r""" Substitution (permutation) combinator. 
    Implements rules of the form
    Y/Z (X\Y)/Z -> X/Z (<Sx)
    And other variations.
    """

    def can_combine(self, function, argument):
        if function.is_primitive() or argument.is_primitive():
            return False

        # These could potentially be moved to the predicates, as the
        # constraints may not be general to all languages.
        if function.res().is_function() or not function.arg().is_primitive():
            return False
        if not (function.dir().is_forward() and argument.dir().is_forward()):
            return False
        return (function.res().arg() == argument.res()) and (
            function.arg() == argument.arg()
        )

    def combine(self, function, argument):
        if self.can_combine(function, argument):
            yield FunctionalCategory(
                function.res().res(), argument.arg(), argument.dir()
            )

    def __str__(self):
        return "S"


# Predicate for forward substitution
def forwardSConstraint(left, right):
    if not bothForward(left, right):
        return False
    return left.res().dir().is_forward() and left.arg().is_primitive()


# Predicate for backward crossed substitution
def backwardSxConstraint(left, right):
    if not left.dir().can_cross() and right.dir().can_cross():
        return False
    if not bothForward(left, right):
        return False
    return right.res().dir().is_backward() and right.arg().is_primitive()


# Instances of substitution combinators
ForwardSubstitution = ForwardCombinator(UndirectedSubstitution(), forwardSConstraint)
BackwardSx = BackwardCombinator(UndirectedSubstitution(), backwardSxConstraint, "x")


# Retrieves the left-most functional category.
# ie, (N\N)/(S/NP) => N\N
def innermostFunction(categ):
    while categ.res().is_function():
        categ = categ.res()
    return categ


class UndirectedTypeRaise(UndirectedBinaryCombinator):
    """
    Undirected combinator for type raising.
    """

    def can_combine(self, function, arg):
        # The argument must be a function.
        # The restriction that arg.res() must be a function
        # merely reduces redundant type-raising; if arg.res() is
        # primitive, we have:
        # X Y\X =>(<T) Y/(Y\X) Y\X =>(>) Y
        # which is equivalent to
        # X Y\X =>(<) Y
        if not (arg.is_function() and arg.res().is_function()):
            return False

        arg = innermostFunction(arg)

        # Unify the raised category with the argument of the innermost
        # functor (mirrors combine() below).
        subs = function.can_unify(arg.arg())
        if subs is not None:
            return True
        return False

    def combine(self, function, arg):
        if not (
            function.is_primitive() and arg.is_function() and arg.res().is_function()
        ):
            return

        # Type-raising matches only the innermost application.
        arg = innermostFunction(arg)

        subs = function.can_unify(arg.arg())
        if subs is not None:
            xcat = arg.res().substitute(subs)
            yield FunctionalCategory(
                xcat, FunctionalCategory(xcat, function, arg.dir()), -(arg.dir())
            )

    def __str__(self):
        return "T"


# Predicates for type-raising
# The direction of the innermost category must be towards
# the primary functor.
# The restriction that the variable must be primitive is not
# common to all versions of CCGs; some authors have other restrictions.
def forwardTConstraint(left, right):
    arg = innermostFunction(right)
    return arg.dir().is_backward() and arg.res().is_primitive()


def backwardTConstraint(left, right):
    arg = innermostFunction(left)
    return arg.dir().is_forward() and arg.res().is_primitive()


# Instances of type-raising combinators
ForwardT = ForwardCombinator(UndirectedTypeRaise(), forwardTConstraint)
BackwardT = BackwardCombinator(UndirectedTypeRaise(), backwardTConstraint)
nltk-3.7/nltk/ccg/lexicon.py000066400000000000000000000224651420073152400160570ustar00rootroot00000000000000# Natural Language Toolkit: Combinatory Categorial Grammar
#
# Copyright (C) 2001-2022 NLTK Project
# Author: Graeme Gange
# URL:
# For license information, see LICENSE.TXT

"""
CCG Lexicons
"""

import re
from collections import defaultdict

from nltk.ccg.api import CCGVar, Direction, FunctionalCategory, PrimitiveCategory
from nltk.internals import deprecated
from nltk.sem.logic import Expression

# ------------
# Regular expressions used for parsing components of the lexicon
# ------------

# Parses a primitive category and subscripts
PRIM_RE = re.compile(r"""([A-Za-z]+)(\[[A-Za-z,]+\])?""")

# Separates the next primitive category from the remainder of the
# string
NEXTPRIM_RE = re.compile(r"""([A-Za-z]+(?:\[[A-Za-z,]+\])?)(.*)""")

# Separates the next application operator from the remainder
APP_RE = re.compile(r"""([\\/])([.,]?)([.,]?)(.*)""")

# Parses the definition of the right-hand side (rhs) of either a word or a family
LEX_RE = re.compile(r"""([\S_]+)\s*(::|[-=]+>)\s*(.+)""", re.UNICODE)

# Parses the right hand side that contains category and maybe semantic predicate
RHS_RE = re.compile(r"""([^{}]*[^ {}])\s*(\{[^}]+\})?""", re.UNICODE)

# Parses the semantic predicate
SEMANTICS_RE = re.compile(r"""\{([^}]+)\}""", re.UNICODE)

# Strips comments from a line
COMMENTS_RE = re.compile("""([^#]*)(?:#.*)?""")


class Token:
    """
    Class representing a token.

    token => category {semantics}
    e.g.
eat => S\\var[pl]/var {\\x y.eat(x,y)} * `token` (string) * `categ` (string) * `semantics` (Expression) """ def __init__(self, token, categ, semantics=None): self._token = token self._categ = categ self._semantics = semantics def categ(self): return self._categ def semantics(self): return self._semantics def __str__(self): semantics_str = "" if self._semantics is not None: semantics_str = " {" + str(self._semantics) + "}" return "" + str(self._categ) + semantics_str def __cmp__(self, other): if not isinstance(other, Token): return -1 return cmp((self._categ, self._semantics), other.categ(), other.semantics()) class CCGLexicon: """ Class representing a lexicon for CCG grammars. * `primitives`: The list of primitive categories for the lexicon * `families`: Families of categories * `entries`: A mapping of words to possible categories """ def __init__(self, start, primitives, families, entries): self._start = PrimitiveCategory(start) self._primitives = primitives self._families = families self._entries = entries def categories(self, word): """ Returns all the possible categories for a word """ return self._entries[word] def start(self): """ Return the target category for the parser """ return self._start def __str__(self): """ String representation of the lexicon. Used for debugging. """ string = "" first = True for ident in sorted(self._entries): if not first: string = string + "\n" string = string + ident + " => " first = True for cat in self._entries[ident]: if not first: string = string + " | " else: first = False string = string + "%s" % cat return string # ----------- # Parsing lexicons # ----------- def matchBrackets(string): """ Separate the contents matching the first set of brackets from the rest of the input. """ rest = string[1:] inside = "(" while rest != "" and not rest.startswith(")"): if rest.startswith("("): (part, rest) = matchBrackets(rest) inside = inside + part else: inside = inside + rest[0] rest = rest[1:] if rest.startswith(")"): return (inside + ")", rest[1:]) raise AssertionError("Unmatched bracket in string '" + string + "'") def nextCategory(string): """ Separate the string for the next portion of the category from the rest of the string """ if string.startswith("("): return matchBrackets(string) return NEXTPRIM_RE.match(string).groups() def parseApplication(app): """ Parse an application operator """ return Direction(app[0], app[1:]) def parseSubscripts(subscr): """ Parse the subscripts for a primitive category """ if subscr: return subscr[1:-1].split(",") return [] def parsePrimitiveCategory(chunks, primitives, families, var): """ Parse a primitive category If the primitive is the special category 'var', replace it with the correct `CCGVar`. """ if chunks[0] == "var": if chunks[1] is None: if var is None: var = CCGVar() return (var, var) catstr = chunks[0] if catstr in families: (cat, cvar) = families[catstr] if var is None: var = cvar else: cat = cat.substitute([(cvar, var)]) return (cat, var) if catstr in primitives: subscrs = parseSubscripts(chunks[1]) return (PrimitiveCategory(catstr, subscrs), var) raise AssertionError( "String '" + catstr + "' is neither a family nor primitive category." 
) def augParseCategory(line, primitives, families, var=None): """ Parse a string representing a category, and returns a tuple with (possibly) the CCG variable for the category """ (cat_string, rest) = nextCategory(line) if cat_string.startswith("("): (res, var) = augParseCategory(cat_string[1:-1], primitives, families, var) else: (res, var) = parsePrimitiveCategory( PRIM_RE.match(cat_string).groups(), primitives, families, var ) while rest != "": app = APP_RE.match(rest).groups() direction = parseApplication(app[0:3]) rest = app[3] (cat_string, rest) = nextCategory(rest) if cat_string.startswith("("): (arg, var) = augParseCategory(cat_string[1:-1], primitives, families, var) else: (arg, var) = parsePrimitiveCategory( PRIM_RE.match(cat_string).groups(), primitives, families, var ) res = FunctionalCategory(res, arg, direction) return (res, var) def fromstring(lex_str, include_semantics=False): """ Convert string representation into a lexicon for CCGs. """ CCGVar.reset_id() primitives = [] families = {} entries = defaultdict(list) for line in lex_str.splitlines(): # Strip comments and leading/trailing whitespace. line = COMMENTS_RE.match(line).groups()[0].strip() if line == "": continue if line.startswith(":-"): # A line of primitive categories. # The first one is the target category # ie, :- S, N, NP, VP primitives = primitives + [ prim.strip() for prim in line[2:].strip().split(",") ] else: # Either a family definition, or a word definition (ident, sep, rhs) = LEX_RE.match(line).groups() (catstr, semantics_str) = RHS_RE.match(rhs).groups() (cat, var) = augParseCategory(catstr, primitives, families) if sep == "::": # Family definition # ie, Det :: NP/N families[ident] = (cat, var) else: semantics = None if include_semantics is True: if semantics_str is None: raise AssertionError( line + " must contain semantics because include_semantics is set to True" ) else: semantics = Expression.fromstring( SEMANTICS_RE.match(semantics_str).groups()[0] ) # Word definition # ie, which => (N\N)/(S/NP) entries[ident].append(Token(ident, cat, semantics)) return CCGLexicon(primitives[0], primitives, families, entries) @deprecated("Use fromstring() instead.") def parseLexicon(lex_str): return fromstring(lex_str) openccg_tinytiny = fromstring( """ # Rather minimal lexicon based on the openccg `tinytiny' grammar. # Only incorporates a subset of the morphological subcategories, however. 
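# Notation (parsed by fromstring above): ':-' declares the primitive
# categories, '::' defines a family of categories, '=>' maps a word onto a
# category, and subscripts such as [sg] and [pl] mark morphological features.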
:- S,NP,N # Primitive categories Det :: NP/N # Determiners Pro :: NP IntransVsg :: S\\NP[sg] # Tensed intransitive verbs (singular) IntransVpl :: S\\NP[pl] # Plural TransVsg :: S\\NP[sg]/NP # Tensed transitive verbs (singular) TransVpl :: S\\NP[pl]/NP # Plural the => NP[sg]/N[sg] the => NP[pl]/N[pl] I => Pro me => Pro we => Pro us => Pro book => N[sg] books => N[pl] peach => N[sg] peaches => N[pl] policeman => N[sg] policemen => N[pl] boy => N[sg] boys => N[pl] sleep => IntransVsg sleep => IntransVpl eat => IntransVpl eat => TransVpl eats => IntransVsg eats => TransVsg see => TransVpl sees => TransVsg """ ) nltk-3.7/nltk/ccg/logic.py000066400000000000000000000034231420073152400155060ustar00rootroot00000000000000# Natural Language Toolkit: Combinatory Categorial Grammar # # Copyright (C) 2001-2022 NLTK Project # Author: Tanin Na Nakorn (@tanin) # URL: # For license information, see LICENSE.TXT """ Helper functions for CCG semantics computation """ from nltk.sem.logic import * def compute_type_raised_semantics(semantics): core = semantics parent = None while isinstance(core, LambdaExpression): parent = core core = core.term var = Variable("F") while var in core.free(): var = unique_variable(pattern=var) core = ApplicationExpression(FunctionVariableExpression(var), core) if parent is not None: parent.term = core else: semantics = core return LambdaExpression(var, semantics) def compute_function_semantics(function, argument): return ApplicationExpression(function, argument).simplify() def compute_composition_semantics(function, argument): assert isinstance(argument, LambdaExpression), ( "`" + str(argument) + "` must be a lambda expression" ) return LambdaExpression( argument.variable, ApplicationExpression(function, argument.term).simplify() ) def compute_substitution_semantics(function, argument): assert isinstance(function, LambdaExpression) and isinstance( function.term, LambdaExpression ), ("`" + str(function) + "` must be a lambda expression with 2 arguments") assert isinstance(argument, LambdaExpression), ( "`" + str(argument) + "` must be a lambda expression" ) new_argument = ApplicationExpression( argument, VariableExpression(function.variable) ).simplify() new_term = ApplicationExpression(function.term, new_argument).simplify() return LambdaExpression(function.variable, new_term) nltk-3.7/nltk/chat/000077500000000000000000000000001420073152400142205ustar00rootroot00000000000000nltk-3.7/nltk/chat/__init__.py000066400000000000000000000030341420073152400163310ustar00rootroot00000000000000# Natural Language Toolkit: Chatbots # # Copyright (C) 2001-2022 NLTK Project # Authors: Steven Bird # URL: # For license information, see LICENSE.TXT # Based on an Eliza implementation by Joe Strout , # Jeff Epler and Jez Higgins . """ A class for simple chatbots. These perform simple pattern matching on sentences typed by users, and respond with automatically generated sentences. These chatbots may not work using the windows command line or the windows IDLE GUI. 
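
A minimal non-interactive sketch (illustrative; the bots themselves are
driven interactively via ``converse()``)::

    from nltk.chat.eliza import eliza_chatbot

    print(eliza_chatbot.respond("I feel sad"))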
""" from nltk.chat.eliza import eliza_chat from nltk.chat.iesha import iesha_chat from nltk.chat.rude import rude_chat from nltk.chat.suntsu import suntsu_chat from nltk.chat.util import Chat from nltk.chat.zen import zen_chat bots = [ (eliza_chat, "Eliza (psycho-babble)"), (iesha_chat, "Iesha (teen anime junky)"), (rude_chat, "Rude (abusive bot)"), (suntsu_chat, "Suntsu (Chinese sayings)"), (zen_chat, "Zen (gems of wisdom)"), ] def chatbots(): import sys print("Which chatbot would you like to talk to?") botcount = len(bots) for i in range(botcount): print(" %d: %s" % (i + 1, bots[i][1])) while True: print("\nEnter a number in the range 1-%d: " % botcount, end=" ") choice = sys.stdin.readline().strip() if choice.isdigit() and (int(choice) - 1) in range(botcount): break else: print(" Error: bad chatbot number") chatbot = bots[int(choice) - 1][0] chatbot() nltk-3.7/nltk/chat/eliza.py000066400000000000000000000221111420073152400156730ustar00rootroot00000000000000# Natural Language Toolkit: Eliza # # Copyright (C) 2001-2022 NLTK Project # Authors: Steven Bird # Edward Loper # URL: # For license information, see LICENSE.TXT # Based on an Eliza implementation by Joe Strout , # Jeff Epler and Jez Higgins . # a translation table used to convert things you say into things the # computer says back, e.g. "I am" --> "you are" from nltk.chat.util import Chat, reflections # a table of response pairs, where each pair consists of a # regular expression, and a list of possible responses, # with group-macros labelled as %1, %2. pairs = ( ( r"I need (.*)", ( "Why do you need %1?", "Would it really help you to get %1?", "Are you sure you need %1?", ), ), ( r"Why don\'t you (.*)", ( "Do you really think I don't %1?", "Perhaps eventually I will %1.", "Do you really want me to %1?", ), ), ( r"Why can\'t I (.*)", ( "Do you think you should be able to %1?", "If you could %1, what would you do?", "I don't know -- why can't you %1?", "Have you really tried?", ), ), ( r"I can\'t (.*)", ( "How do you know you can't %1?", "Perhaps you could %1 if you tried.", "What would it take for you to %1?", ), ), ( r"I am (.*)", ( "Did you come to me because you are %1?", "How long have you been %1?", "How do you feel about being %1?", ), ), ( r"I\'m (.*)", ( "How does being %1 make you feel?", "Do you enjoy being %1?", "Why do you tell me you're %1?", "Why do you think you're %1?", ), ), ( r"Are you (.*)", ( "Why does it matter whether I am %1?", "Would you prefer it if I were not %1?", "Perhaps you believe I am %1.", "I may be %1 -- what do you think?", ), ), ( r"What (.*)", ( "Why do you ask?", "How would an answer to that help you?", "What do you think?", ), ), ( r"How (.*)", ( "How do you suppose?", "Perhaps you can answer your own question.", "What is it you're really asking?", ), ), ( r"Because (.*)", ( "Is that the real reason?", "What other reasons come to mind?", "Does that reason apply to anything else?", "If %1, what else must be true?", ), ), ( r"(.*) sorry (.*)", ( "There are many times when no apology is needed.", "What feelings do you have when you apologize?", ), ), ( r"Hello(.*)", ( "Hello... I'm glad you could drop by today.", "Hi there... 
how are you today?", "Hello, how are you feeling today?", ), ), ( r"I think (.*)", ("Do you doubt %1?", "Do you really think so?", "But you're not sure %1?"), ), ( r"(.*) friend (.*)", ( "Tell me more about your friends.", "When you think of a friend, what comes to mind?", "Why don't you tell me about a childhood friend?", ), ), (r"Yes", ("You seem quite sure.", "OK, but can you elaborate a bit?")), ( r"(.*) computer(.*)", ( "Are you really talking about me?", "Does it seem strange to talk to a computer?", "How do computers make you feel?", "Do you feel threatened by computers?", ), ), ( r"Is it (.*)", ( "Do you think it is %1?", "Perhaps it's %1 -- what do you think?", "If it were %1, what would you do?", "It could well be that %1.", ), ), ( r"It is (.*)", ( "You seem very certain.", "If I told you that it probably isn't %1, what would you feel?", ), ), ( r"Can you (.*)", ( "What makes you think I can't %1?", "If I could %1, then what?", "Why do you ask if I can %1?", ), ), ( r"Can I (.*)", ( "Perhaps you don't want to %1.", "Do you want to be able to %1?", "If you could %1, would you?", ), ), ( r"You are (.*)", ( "Why do you think I am %1?", "Does it please you to think that I'm %1?", "Perhaps you would like me to be %1.", "Perhaps you're really talking about yourself?", ), ), ( r"You\'re (.*)", ( "Why do you say I am %1?", "Why do you think I am %1?", "Are we talking about you, or me?", ), ), ( r"I don\'t (.*)", ("Don't you really %1?", "Why don't you %1?", "Do you want to %1?"), ), ( r"I feel (.*)", ( "Good, tell me more about these feelings.", "Do you often feel %1?", "When do you usually feel %1?", "When you feel %1, what do you do?", ), ), ( r"I have (.*)", ( "Why do you tell me that you've %1?", "Have you really %1?", "Now that you have %1, what will you do next?", ), ), ( r"I would (.*)", ( "Could you explain why you would %1?", "Why would you %1?", "Who else knows that you would %1?", ), ), ( r"Is there (.*)", ( "Do you think there is %1?", "It's likely that there is %1.", "Would you like there to be %1?", ), ), ( r"My (.*)", ( "I see, your %1.", "Why do you say that your %1?", "When your %1, how do you feel?", ), ), ( r"You (.*)", ( "We should be discussing you, not me.", "Why do you say that about me?", "Why do you care whether I %1?", ), ), (r"Why (.*)", ("Why don't you tell me the reason why %1?", "Why do you think %1?")), ( r"I want (.*)", ( "What would it mean to you if you got %1?", "Why do you want %1?", "What would you do if you got %1?", "If you got %1, then what would you do?", ), ), ( r"(.*) mother(.*)", ( "Tell me more about your mother.", "What was your relationship with your mother like?", "How do you feel about your mother?", "How does this relate to your feelings today?", "Good family relations are important.", ), ), ( r"(.*) father(.*)", ( "Tell me more about your father.", "How did your father make you feel?", "How do you feel about your father?", "Does your relationship with your father relate to your feelings today?", "Do you have trouble showing affection with your family?", ), ), ( r"(.*) child(.*)", ( "Did you have close friends as a child?", "What is your favorite childhood memory?", "Do you remember any dreams or nightmares from childhood?", "Did the other children sometimes tease you?", "How do you think your childhood experiences relate to your feelings today?", ), ), ( r"(.*)\?", ( "Why do you ask that?", "Please consider whether you can answer your own question.", "Perhaps the answer lies within yourself?", "Why don't you tell me?", ), ), ( r"quit", 
( "Thank you for talking with me.", "Good-bye.", "Thank you, that will be $150. Have a good day!", ), ), ( r"(.*)", ( "Please tell me more.", "Let's change focus a bit... Tell me about your family.", "Can you elaborate on that?", "Why do you say that %1?", "I see.", "Very interesting.", "%1.", "I see. And what does that tell you?", "How does that make you feel?", "How do you feel when you say that?", ), ), ) eliza_chatbot = Chat(pairs, reflections) def eliza_chat(): print("Therapist\n---------") print("Talk to the program by typing in plain English, using normal upper-") print('and lower-case letters and punctuation. Enter "quit" when done.') print("=" * 72) print("Hello. How are you feeling today?") eliza_chatbot.converse() def demo(): eliza_chat() if __name__ == "__main__": demo() nltk-3.7/nltk/chat/iesha.py000066400000000000000000000102271420073152400156650ustar00rootroot00000000000000# Natural Language Toolkit: Teen Chatbot # # Copyright (C) 2001-2022 NLTK Project # Author: Selina Dennis # URL: # For license information, see LICENSE.TXT """ This chatbot is a tongue-in-cheek take on the average teen anime junky that frequents YahooMessenger or MSNM. All spelling mistakes and flawed grammar are intentional. """ from nltk.chat.util import Chat reflections = { "am": "r", "was": "were", "i": "u", "i'd": "u'd", "i've": "u'v", "ive": "u'v", "i'll": "u'll", "my": "ur", "are": "am", "you're": "im", "you've": "ive", "you'll": "i'll", "your": "my", "yours": "mine", "you": "me", "u": "me", "ur": "my", "urs": "mine", "me": "u", } # Note: %1/2/etc are used without spaces prior as the chat bot seems # to add a superfluous space when matching. pairs = ( ( r"I\'m (.*)", ( "ur%1?? that's so cool! kekekekeke ^_^ tell me more!", "ur%1? neat!! kekeke >_<", ), ), ( r"(.*) don\'t you (.*)", ( r"u think I can%2??! really?? kekeke \<_\<", "what do u mean%2??!", "i could if i wanted, don't you think!! kekeke", ), ), (r"ye[as] [iI] (.*)", ("u%1? cool!! how?", "how come u%1??", "u%1? so do i!!")), ( r"do (you|u) (.*)\??", ("do i%2? only on tuesdays! kekeke *_*", "i dunno! do u%2??"), ), ( r"(.*)\?", ( "man u ask lots of questions!", "booooring! how old r u??", "boooooring!! ur not very fun", ), ), ( r"(cos|because) (.*)", ("hee! i don't believe u! >_<", "nuh-uh! >_<", "ooooh i agree!"), ), ( r"why can\'t [iI] (.*)", ( "i dunno! y u askin me for!", "try harder, silly! hee! ^_^", "i dunno! but when i can't%1 i jump up and down!", ), ), ( r"I can\'t (.*)", ( "u can't what??! >_<", "that's ok! i can't%1 either! kekekekeke ^_^", "try harder, silly! hee! ^&^", ), ), ( r"(.*) (like|love|watch) anime", ( "omg i love anime!! do u like sailor moon??! ^&^", "anime yay! anime rocks sooooo much!", "oooh anime! i love anime more than anything!", "anime is the bestest evar! evangelion is the best!", "hee anime is the best! do you have ur fav??", ), ), ( r"I (like|love|watch|play) (.*)", ("yay! %2 rocks!", "yay! %2 is neat!", "cool! do u like other stuff?? ^_^"), ), ( r"anime sucks|(.*) (hate|detest) anime", ( "ur a liar! i'm not gonna talk to u nemore if u h8 anime *;*", "no way! anime is the best ever!", "nuh-uh, anime is the best!", ), ), ( r"(are|r) (you|u) (.*)", ("am i%1??! how come u ask that!", "maybe! y shud i tell u?? kekeke >_>"), ), ( r"what (.*)", ("hee u think im gonna tell u? .v.", "booooooooring! ask me somethin else!"), ), (r"how (.*)", ("not tellin!! kekekekekeke ^_^",)), (r"(hi|hello|hey) (.*)", ("hi!!! how r u!!",)), ( r"quit", ( "mom says i have to go eat dinner now :,( bye!!", "awww u have to go?? 
see u next time!!", "how to see u again soon! ^_^", ), ), ( r"(.*)", ( "ur funny! kekeke", "boooooring! talk about something else! tell me wat u like!", "do u like anime??", "do u watch anime? i like sailor moon! ^_^", "i wish i was a kitty!! kekekeke ^_^", ), ), ) iesha_chatbot = Chat(pairs, reflections) def iesha_chat(): print("Iesha the TeenBoT\n---------") print("Talk to the program by typing in plain English, using normal upper-") print('and lower-case letters and punctuation. Enter "quit" when done.') print("=" * 72) print("hi!! i'm iesha! who r u??!") iesha_chatbot.converse() def demo(): iesha_chat() if __name__ == "__main__": demo() nltk-3.7/nltk/chat/rude.py000066400000000000000000000061341420073152400155350ustar00rootroot00000000000000# Natural Language Toolkit: Rude Chatbot # # Copyright (C) 2001-2022 NLTK Project # Author: Peter Spiller # URL: # For license information, see LICENSE.TXT from nltk.chat.util import Chat, reflections pairs = ( ( r"We (.*)", ( "What do you mean, 'we'?", "Don't include me in that!", "I wouldn't be so sure about that.", ), ), ( r"You should (.*)", ("Don't tell me what to do, buddy.", "Really? I should, should I?"), ), ( r"You\'re(.*)", ( "More like YOU'RE %1!", "Hah! Look who's talking.", "Come over here and tell me I'm %1.", ), ), ( r"You are(.*)", ( "More like YOU'RE %1!", "Hah! Look who's talking.", "Come over here and tell me I'm %1.", ), ), ( r"I can\'t(.*)", ( "You do sound like the type who can't %1.", "Hear that splashing sound? That's my heart bleeding for you.", "Tell somebody who might actually care.", ), ), ( r"I think (.*)", ( "I wouldn't think too hard if I were you.", "You actually think? I'd never have guessed...", ), ), ( r"I (.*)", ( "I'm getting a bit tired of hearing about you.", "How about we talk about me instead?", "Me, me, me... Frankly, I don't care.", ), ), ( r"How (.*)", ( "How do you think?", "Take a wild guess.", "I'm not even going to dignify that with an answer.", ), ), (r"What (.*)", ("Do I look like an encyclopedia?", "Figure it out yourself.")), ( r"Why (.*)", ( "Why not?", "That's so obvious I thought even you'd have already figured it out.", ), ), ( r"(.*)shut up(.*)", ( "Make me.", "Getting angry at a feeble NLP assignment? Somebody's losing it.", "Say that again, I dare you.", ), ), ( r"Shut up(.*)", ( "Make me.", "Getting angry at a feeble NLP assignment? Somebody's losing it.", "Say that again, I dare you.", ), ), ( r"Hello(.*)", ("Oh good, somebody else to talk to. Joy.", "'Hello'? How original..."), ), ( r"(.*)", ( "I'm getting bored here. Become more interesting.", "Either become more thrilling or get lost, buddy.", "Change the subject before I die of fatal boredom.", ), ), ) rude_chatbot = Chat(pairs, reflections) def rude_chat(): print("Talk to the program by typing in plain English, using normal upper-") print('and lower-case letters and punctuation. Enter "quit" when done.') print("=" * 72) print("I suppose I should say hello.") rude_chatbot.converse() def demo(): rude_chat() if __name__ == "__main__": demo() nltk-3.7/nltk/chat/suntsu.py000066400000000000000000000156051420073152400161420ustar00rootroot00000000000000# Natural Language Toolkit: Sun Tsu-Bot # # Copyright (C) 2001-2022 NLTK Project # Author: Sam Huston 2007 # URL: # For license information, see LICENSE.TXT """ Tsu bot responds to all queries with a Sun Tsu sayings Quoted from Sun Tsu's The Art of War Translated by LIONEL GILES, M.A. 
1910 Hosted by the Gutenberg Project https://www.gutenberg.org/ """ from nltk.chat.util import Chat, reflections pairs = ( (r"quit", ("Good-bye.", "Plan well", "May victory be your future")), ( r"[^\?]*\?", ( "Please consider whether you can answer your own question.", "Ask me no questions!", ), ), ( r"[0-9]+(.*)", ( "It is the rule in war, if our forces are ten to the enemy's one, to surround him; if five to one, to attack him; if twice as numerous, to divide our army into two.", "There are five essentials for victory", ), ), ( r"[A-Ca-c](.*)", ( "The art of war is of vital importance to the State.", "All warfare is based on deception.", "If your opponent is secure at all points, be prepared for him. If he is in superior strength, evade him.", "If the campaign is protracted, the resources of the State will not be equal to the strain.", "Attack him where he is unprepared, appear where you are not expected.", "There is no instance of a country having benefited from prolonged warfare.", ), ), ( r"[D-Fd-f](.*)", ( "The skillful soldier does not raise a second levy, neither are his supply-wagons loaded more than twice.", "Bring war material with you from home, but forage on the enemy.", "In war, then, let your great object be victory, not lengthy campaigns.", "To fight and conquer in all your battles is not supreme excellence; supreme excellence consists in breaking the enemy's resistance without fighting.", ), ), ( r"[G-Ig-i](.*)", ( "Heaven signifies night and day, cold and heat, times and seasons.", "It is the rule in war, if our forces are ten to the enemy's one, to surround him; if five to one, to attack him; if twice as numerous, to divide our army into two.", "The good fighters of old first put themselves beyond the possibility of defeat, and then waited for an opportunity of defeating the enemy.", "One may know how to conquer without being able to do it.", ), ), ( r"[J-Lj-l](.*)", ( "There are three ways in which a ruler can bring misfortune upon his army.", "By commanding the army to advance or to retreat, being ignorant of the fact that it cannot obey. This is called hobbling the army.", "By attempting to govern an army in the same way as he administers a kingdom, being ignorant of the conditions which obtain in an army. This causes restlessness in the soldier's minds.", "By employing the officers of his army without discrimination, through ignorance of the military principle of adaptation to circumstances. 
This shakes the confidence of the soldiers.", "There are five essentials for victory", "He will win who knows when to fight and when not to fight.", "He will win who knows how to handle both superior and inferior forces.", "He will win whose army is animated by the same spirit throughout all its ranks.", "He will win who, prepared himself, waits to take the enemy unprepared.", "He will win who has military capacity and is not interfered with by the sovereign.", ), ), ( r"[M-Om-o](.*)", ( "If you know the enemy and know yourself, you need not fear the result of a hundred battles.", "If you know yourself but not the enemy, for every victory gained you will also suffer a defeat.", "If you know neither the enemy nor yourself, you will succumb in every battle.", "The control of a large force is the same principle as the control of a few men: it is merely a question of dividing up their numbers.", ), ), ( r"[P-Rp-r](.*)", ( "Security against defeat implies defensive tactics; ability to defeat the enemy means taking the offensive.", "Standing on the defensive indicates insufficient strength; attacking, a superabundance of strength.", "He wins his battles by making no mistakes. Making no mistakes is what establishes the certainty of victory, for it means conquering an enemy that is already defeated.", "A victorious army opposed to a routed one, is as a pound's weight placed in the scale against a single grain.", "The onrush of a conquering force is like the bursting of pent-up waters into a chasm a thousand fathoms deep.", ), ), ( r"[S-Us-u](.*)", ( "What the ancients called a clever fighter is one who not only wins, but excels in winning with ease.", "Hence his victories bring him neither reputation for wisdom nor credit for courage.", "Hence the skillful fighter puts himself into a position which makes defeat impossible, and does not miss the moment for defeating the enemy.", "In war the victorious strategist only seeks battle after the victory has been won, whereas he who is destined to defeat first fights and afterwards looks for victory.", "There are not more than five musical notes, yet the combinations of these five give rise to more melodies than can ever be heard.", "Appear at points which the enemy must hasten to defend; march swiftly to places where you are not expected.", ), ), ( r"[V-Zv-z](.*)", ( "It is a matter of life and death, a road either to safety or to ruin.", "Hold out baits to entice the enemy. Feign disorder, and crush him.", "All men can see the tactics whereby I conquer, but what none can see is the strategy out of which victory is evolved.", "Do not repeat the tactics which have gained you one victory, but let your methods be regulated by the infinite variety of circumstances.", "So in war, the way is to avoid what is strong and to strike at what is weak.", "Just as water retains no constant shape, so in warfare there are no constant conditions.", ), ), (r"(.*)", ("Your statement insults me.", "")), ) suntsu_chatbot = Chat(pairs, reflections) def suntsu_chat(): print("Talk to the program by typing in plain English, using normal upper-") print('and lower-case letters and punctuation. 
Enter "quit" when done.') print("=" * 72) print("You seek enlightenment?") suntsu_chatbot.converse() def demo(): suntsu_chat() if __name__ == "__main__": demo() nltk-3.7/nltk/chat/util.py000066400000000000000000000074621420073152400155600ustar00rootroot00000000000000# Natural Language Toolkit: Chatbot Utilities # # Copyright (C) 2001-2022 NLTK Project # Authors: Steven Bird # URL: # For license information, see LICENSE.TXT # Based on an Eliza implementation by Joe Strout , # Jeff Epler and Jez Higgins . import random import re reflections = { "i am": "you are", "i was": "you were", "i": "you", "i'm": "you are", "i'd": "you would", "i've": "you have", "i'll": "you will", "my": "your", "you are": "I am", "you were": "I was", "you've": "I have", "you'll": "I will", "your": "my", "yours": "mine", "you": "me", "me": "you", } class Chat: def __init__(self, pairs, reflections={}): """ Initialize the chatbot. Pairs is a list of patterns and responses. Each pattern is a regular expression matching the user's statement or question, e.g. r'I like (.*)'. For each such pattern a list of possible responses is given, e.g. ['Why do you like %1', 'Did you ever dislike %1']. Material which is matched by parenthesized sections of the patterns (e.g. .*) is mapped to the numbered positions in the responses, e.g. %1. :type pairs: list of tuple :param pairs: The patterns and responses :type reflections: dict :param reflections: A mapping between first and second person expressions :rtype: None """ self._pairs = [(re.compile(x, re.IGNORECASE), y) for (x, y) in pairs] self._reflections = reflections self._regex = self._compile_reflections() def _compile_reflections(self): sorted_refl = sorted(self._reflections, key=len, reverse=True) return re.compile( r"\b({})\b".format("|".join(map(re.escape, sorted_refl))), re.IGNORECASE ) def _substitute(self, str): """ Substitute words in the string, according to the specified reflections, e.g. "I'm" -> "you are" :type str: str :param str: The string to be mapped :rtype: str """ return self._regex.sub( lambda mo: self._reflections[mo.string[mo.start() : mo.end()]], str.lower() ) def _wildcards(self, response, match): pos = response.find("%") while pos >= 0: num = int(response[pos + 1 : pos + 2]) response = ( response[:pos] + self._substitute(match.group(num)) + response[pos + 2 :] ) pos = response.find("%") return response def respond(self, str): """ Generate a response to the user input. :type str: str :param str: The string to be mapped :rtype: str """ # check each pattern for (pattern, response) in self._pairs: match = pattern.match(str) # did the pattern match? if match: resp = random.choice(response) # pick a random response resp = self._wildcards(resp, match) # process wildcards # fix munged punctuation at the end if resp[-2:] == "?.": resp = resp[:-2] + "." if resp[-2:] == "??": resp = resp[:-2] + "?" return resp # Hold a conversation with a chatbot def converse(self, quit="quit"): user_input = "" while user_input != quit: user_input = quit try: user_input = input(">") except EOFError: print(user_input) if user_input: while user_input[-1] in "!.": user_input = user_input[:-1] print(self.respond(user_input)) nltk-3.7/nltk/chat/zen.py000066400000000000000000000261261420073152400153750ustar00rootroot00000000000000# Natural Language Toolkit: Zen Chatbot # # Copyright (C) 2001-2022 NLTK Project # Author: Amy Holland # URL: # For license information, see LICENSE.TXT """ Zen Chatbot talks in gems of Zen wisdom. 
This is a sample conversation with Zen Chatbot: ZC: Welcome, my child. me: Good afternoon. ZC: Ask the question you have come to ask. me: How can I achieve enlightenment? ZC: How do you suppose? me: Through meditation. ZC: Form is emptiness, and emptiness form. me: How can I empty my mind of worldly troubles? ZC: Will an answer to that really help in your search for enlightenment? me: Yes. ZC: It is better to be right than to be certain. me: I seek truth and wisdom. ZC: The search for truth is a long journey. me: Are you sure? ZC: Maybe sure, maybe not sure. The chatbot structure is based on that of chat.eliza. Thus, it uses a translation table to convert from question to response i.e. "I am" --> "you are" Of course, since Zen Chatbot does not understand the meaning of any words, responses are very limited. Zen Chatbot will usually answer very vaguely, or respond to a question by asking a different question, in much the same way as Eliza. """ from nltk.chat.util import Chat, reflections # responses are matched top to bottom, so non-specific matches occur later # for each match, a list of possible responses is provided responses = ( # Zen Chatbot opens with the line "Welcome, my child." The usual # response will be a greeting problem: 'good' matches "good morning", # "good day" etc, but also "good grief!" and other sentences starting # with the word 'good' that may not be a greeting ( r"(hello(.*))|(good [a-zA-Z]+)", ( "The path to enlightenment is often difficult to see.", "Greetings. I sense your mind is troubled. Tell me of your troubles.", "Ask the question you have come to ask.", "Hello. Do you seek englightenment?", ), ), # "I need" and "I want" can be followed by a thing (eg 'help') # or an action (eg 'to see you') # # This is a problem with this style of response - # person: "I need you" # chatbot: "me can be achieved by hard work and dedication of the mind" # i.e. 'you' is not really a thing that can be mapped this way, so this # interpretation only makes sense for some inputs # ( r"i need (.*)", ( "%1 can be achieved by hard work and dedication of the mind.", "%1 is not a need, but a desire of the mind. Clear your mind of such concerns.", "Focus your mind on%1, and you will find what you need.", ), ), ( r"i want (.*)", ( "Desires of the heart will distract you from the path to enlightenment.", "Will%1 help you attain enlightenment?", "Is%1 a desire of the mind, or of the heart?", ), ), # why questions are separated into three types: # "why..I" e.g. "why am I here?" "Why do I like cake?" # "why..you" e.g. "why are you here?" "Why won't you tell me?" # "why..." e.g. "Why is the sky blue?" # problems: # person: "Why can't you tell me?" # chatbot: "Are you sure I tell you?" # - this style works for positives (e.g. "why do you like cake?") # but does not work for negatives (e.g. "why don't you like cake?") (r"why (.*) i (.*)\?", ("You%1%2?", "Perhaps you only think you%1%2")), (r"why (.*) you(.*)\?", ("Why%1 you%2?", "%2 I%1", "Are you sure I%2?")), (r"why (.*)\?", ("I cannot tell you why%1.", "Why do you think %1?")), # e.g. "are you listening?", "are you a duck" ( r"are you (.*)\?", ("Maybe%1, maybe not%1.", "Whether I am%1 or not is God's business."), ), # e.g. "am I a duck?", "am I going to die?" ( r"am i (.*)\?", ("Perhaps%1, perhaps not%1.", "Whether you are%1 or not is not for me to say."), ), # what questions, e.g. "what time is it?" # problems: # person: "What do you want?" # chatbot: "Seek truth, not what do me want." 
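    # (The odd reply above is produced by Chat._wildcards(): the text captured
    # by the group is passed through the reflections table before being
    # spliced into '%1', so "do you want" comes back as "do me want".)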
(r"what (.*)\?", ("Seek truth, not what%1.", "What%1 should not concern you.")), # how questions, e.g. "how do you do?" ( r"how (.*)\?", ( "How do you suppose?", "Will an answer to that really help in your search for enlightenment?", "Ask yourself not how, but why.", ), ), # can questions, e.g. "can you run?", "can you come over here please?" ( r"can you (.*)\?", ( "I probably can, but I may not.", "Maybe I can%1, and maybe I cannot.", "I can do all, and I can do nothing.", ), ), # can questions, e.g. "can I have some cake?", "can I know truth?" ( r"can i (.*)\?", ( "You can%1 if you believe you can%1, and have a pure spirit.", "Seek truth and you will know if you can%1.", ), ), # e.g. "It is raining" - implies the speaker is certain of a fact ( r"it is (.*)", ( "How can you be certain that%1, when you do not even know yourself?", "Whether it is%1 or not does not change the way the world is.", ), ), # e.g. "is there a doctor in the house?" ( r"is there (.*)\?", ("There is%1 if you believe there is.", "It is possible that there is%1."), ), # e.g. "is it possible?", "is this true?" (r"is(.*)\?", ("%1 is not relevant.", "Does this matter?")), # non-specific question ( r"(.*)\?", ( "Do you think %1?", "You seek the truth. Does the truth seek you?", "If you intentionally pursue the answers to your questions, the answers become hard to see.", "The answer to your question cannot be told. It must be experienced.", ), ), # expression of hate of form "I hate you" or "Kelly hates cheese" ( r"(.*) (hate[s]?)|(dislike[s]?)|(don\'t like)(.*)", ( "Perhaps it is not about hating %2, but about hate from within.", "Weeds only grow when we dislike them", "Hate is a very strong emotion.", ), ), # statement containing the word 'truth' ( r"(.*) truth(.*)", ( "Seek truth, and truth will seek you.", "Remember, it is not the spoon which bends - only yourself.", "The search for truth is a long journey.", ), ), # desire to do an action # e.g. "I want to go shopping" ( r"i want to (.*)", ("You may %1 if your heart truly desires to.", "You may have to %1."), ), # desire for an object # e.g. "I want a pony" ( r"i want (.*)", ( "Does your heart truly desire %1?", "Is this a desire of the heart, or of the mind?", ), ), # e.g. "I can't wait" or "I can't do this" ( r"i can\'t (.*)", ( "What we can and can't do is a limitation of the mind.", "There are limitations of the body, and limitations of the mind.", "Have you tried to%1 with a clear mind?", ), ), # "I think.." indicates uncertainty. e.g. "I think so." # problem: exceptions... # e.g. "I think, therefore I am" ( r"i think (.*)", ( "Uncertainty in an uncertain world.", "Indeed, how can we be certain of anything in such uncertain times.", "Are you not, in fact, certain that%1?", ), ), # "I feel...emotions/sick/light-headed..." ( r"i feel (.*)", ( "Your body and your emotions are both symptoms of your mind." "What do you believe is the root of such feelings?", "Feeling%1 can be a sign of your state-of-mind.", ), ), # exclaimation mark indicating emotion # e.g. "Wow!" or "No!" ( r"(.*)!", ( "I sense that you are feeling emotional today.", "You need to calm your emotions.", ), ), # because [statement] # e.g. 
"because I said so" ( r"because (.*)", ( "Does knowning the reasons behind things help you to understand" " the things themselves?", "If%1, what else must be true?", ), ), # yes or no - raise an issue of certainty/correctness ( r"(yes)|(no)", ( "Is there certainty in an uncertain world?", "It is better to be right than to be certain.", ), ), # sentence containing word 'love' ( r"(.*)love(.*)", ( "Think of the trees: they let the birds perch and fly with no intention to call them when they come, and no longing for their return when they fly away. Let your heart be like the trees.", "Free love!", ), ), # sentence containing word 'understand' - r ( r"(.*)understand(.*)", ( "If you understand, things are just as they are;" " if you do not understand, things are just as they are.", "Imagination is more important than knowledge.", ), ), # 'I', 'me', 'my' - person is talking about themself. # this breaks down when words contain these - eg 'Thyme', 'Irish' ( r"(.*)(me )|( me)|(my)|(mine)|(i)(.*)", ( "'I', 'me', 'my'... these are selfish expressions.", "Have you ever considered that you might be a selfish person?", "Try to consider others, not just yourself.", "Think not just of yourself, but of others.", ), ), # 'you' starting a sentence # e.g. "you stink!" ( r"you (.*)", ("My path is not of concern to you.", "I am but one, and you but one more."), ), # say goodbye with some extra Zen wisdom. ( r"exit", ( "Farewell. The obstacle is the path.", "Farewell. Life is a journey, not a destination.", "Good bye. We are cups, constantly and quietly being filled." "\nThe trick is knowning how to tip ourselves over and let the beautiful stuff out.", ), ), # fall through case - # when stumped, respond with generic zen wisdom # ( r"(.*)", ( "When you're enlightened, every word is wisdom.", "Random talk is useless.", "The reverse side also has a reverse side.", "Form is emptiness, and emptiness is form.", "I pour out a cup of water. Is the cup empty?", ), ), ) zen_chatbot = Chat(responses, reflections) def zen_chat(): print("*" * 75) print("Zen Chatbot!".center(75)) print("*" * 75) print('"Look beyond mere words and letters - look into your mind"'.center(75)) print("* Talk your way to truth with Zen Chatbot.") print("* Type 'quit' when you have had enough.") print("*" * 75) print("Welcome, my child.") zen_chatbot.converse() def demo(): zen_chat() if __name__ == "__main__": demo() nltk-3.7/nltk/chunk/000077500000000000000000000000001420073152400144115ustar00rootroot00000000000000nltk-3.7/nltk/chunk/__init__.py000066400000000000000000000163501420073152400165270ustar00rootroot00000000000000# Natural Language Toolkit: Chunkers # # Copyright (C) 2001-2022 NLTK Project # Author: Steven Bird # Edward Loper # URL: # For license information, see LICENSE.TXT # """ Classes and interfaces for identifying non-overlapping linguistic groups (such as base noun phrases) in unrestricted text. This task is called "chunk parsing" or "chunking", and the identified groups are called "chunks". The chunked text is represented using a shallow tree called a "chunk structure." A chunk structure is a tree containing tokens and chunks, where each chunk is a subtree containing only tokens. For example, the chunk structure for base noun phrase chunks in the sentence "I saw the big dog on the hill" is:: (SENTENCE: (NP: ) (NP: ) (NP: )) To convert a chunk structure back to a list of tokens, simply use the chunk structure's ``leaves()`` method. 
This module defines ``ChunkParserI``, a standard interface for chunking texts; and ``RegexpChunkParser``, a regular-expression based implementation of that interface. It also defines ``ChunkScore``, a utility class for scoring chunk parsers. RegexpChunkParser ================= ``RegexpChunkParser`` is an implementation of the chunk parser interface that uses regular-expressions over tags to chunk a text. Its ``parse()`` method first constructs a ``ChunkString``, which encodes a particular chunking of the input text. Initially, nothing is chunked. ``parse.RegexpChunkParser`` then applies a sequence of ``RegexpChunkRule`` rules to the ``ChunkString``, each of which modifies the chunking that it encodes. Finally, the ``ChunkString`` is transformed back into a chunk structure, which is returned. ``RegexpChunkParser`` can only be used to chunk a single kind of phrase. For example, you can use an ``RegexpChunkParser`` to chunk the noun phrases in a text, or the verb phrases in a text; but you can not use it to simultaneously chunk both noun phrases and verb phrases in the same text. (This is a limitation of ``RegexpChunkParser``, not of chunk parsers in general.) RegexpChunkRules ---------------- A ``RegexpChunkRule`` is a transformational rule that updates the chunking of a text by modifying its ``ChunkString``. Each ``RegexpChunkRule`` defines the ``apply()`` method, which modifies the chunking encoded by a ``ChunkString``. The ``RegexpChunkRule`` class itself can be used to implement any transformational rule based on regular expressions. There are also a number of subclasses, which can be used to implement simpler types of rules: - ``ChunkRule`` chunks anything that matches a given regular expression. - ``StripRule`` strips anything that matches a given regular expression. - ``UnChunkRule`` will un-chunk any chunk that matches a given regular expression. - ``MergeRule`` can be used to merge two contiguous chunks. - ``SplitRule`` can be used to split a single chunk into two smaller chunks. - ``ExpandLeftRule`` will expand a chunk to incorporate new unchunked material on the left. - ``ExpandRightRule`` will expand a chunk to incorporate new unchunked material on the right. Tag Patterns ~~~~~~~~~~~~ A ``RegexpChunkRule`` uses a modified version of regular expression patterns, called "tag patterns". Tag patterns are used to match sequences of tags. Examples of tag patterns are:: r'(
    ||)+' r'+' r'' The differences between regular expression patterns and tag patterns are: - In tag patterns, ``'<'`` and ``'>'`` act as parentheses; so ``'+'`` matches one or more repetitions of ``''``, not ``''``. - Whitespace in tag patterns is ignored. So ``'
<DT> | <NN>'`` is equivalent to ``'<DT>
    |'`` - In tag patterns, ``'.'`` is equivalent to ``'[^{}<>]'``; so ``''`` matches any single tag starting with ``'NN'``. The function ``tag_pattern2re_pattern`` can be used to transform a tag pattern to an equivalent regular expression pattern. Efficiency ---------- Preliminary tests indicate that ``RegexpChunkParser`` can chunk at a rate of about 300 tokens/second, with a moderately complex rule set. There may be problems if ``RegexpChunkParser`` is used with more than 5,000 tokens at a time. In particular, evaluation of some regular expressions may cause the Python regular expression engine to exceed its maximum recursion depth. We have attempted to minimize these problems, but it is impossible to avoid them completely. We therefore recommend that you apply the chunk parser to a single sentence at a time. Emacs Tip --------- If you evaluate the following elisp expression in emacs, it will colorize a ``ChunkString`` when you use an interactive python shell with emacs or xemacs ("C-c !"):: (let () (defconst comint-mode-font-lock-keywords '(("<[^>]+>" 0 'font-lock-reference-face) ("[{}]" 0 'font-lock-function-name-face))) (add-hook 'comint-mode-hook (lambda () (turn-on-font-lock)))) You can evaluate this code by copying it to a temporary buffer, placing the cursor after the last close parenthesis, and typing "``C-x C-e``". You should evaluate it before running the interactive session. The change will last until you close emacs. Unresolved Issues ----------------- If we use the ``re`` module for regular expressions, Python's regular expression engine generates "maximum recursion depth exceeded" errors when processing very large texts, even for regular expressions that should not require any recursion. We therefore use the ``pre`` module instead. But note that ``pre`` does not include Unicode support, so this module will not work with unicode strings. Note also that ``pre`` regular expressions are not quite as advanced as ``re`` ones (e.g., no leftward zero-length assertions). :type CHUNK_TAG_PATTERN: regexp :var CHUNK_TAG_PATTERN: A regular expression to test whether a tag pattern is valid. """ from nltk.chunk.api import ChunkParserI from nltk.chunk.regexp import RegexpChunkParser, RegexpParser from nltk.chunk.util import ( ChunkScore, accuracy, conllstr2tree, conlltags2tree, ieerstr2tree, tagstr2tree, tree2conllstr, tree2conlltags, ) from nltk.data import load # Standard treebank POS tagger _BINARY_NE_CHUNKER = "chunkers/maxent_ne_chunker/english_ace_binary.pickle" _MULTICLASS_NE_CHUNKER = "chunkers/maxent_ne_chunker/english_ace_multiclass.pickle" def ne_chunk(tagged_tokens, binary=False): """ Use NLTK's currently recommended named entity chunker to chunk the given list of tagged tokens. """ if binary: chunker_pickle = _BINARY_NE_CHUNKER else: chunker_pickle = _MULTICLASS_NE_CHUNKER chunker = load(chunker_pickle) return chunker.parse(tagged_tokens) def ne_chunk_sents(tagged_sentences, binary=False): """ Use NLTK's currently recommended named entity chunker to chunk the given list of tagged sentences, each consisting of a list of tagged tokens. 
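A hedged usage sketch (added for illustration, not from the original
documentation): it assumes the ``punkt``, ``averaged_perceptron_tagger``,
``maxent_ne_chunker`` and ``words`` resources have been downloaded, and the
example sentences are invented::

    >>> from nltk import pos_tag, word_tokenize
    >>> sents = [pos_tag(word_tokenize(s)) for s in
    ...          ["John lives in London.", "Mary teaches at MIT."]]  # doctest: +SKIP
    >>> for tree in ne_chunk_sents(sents, binary=False):  # doctest: +SKIP
    ...     print(tree)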
""" if binary: chunker_pickle = _BINARY_NE_CHUNKER else: chunker_pickle = _MULTICLASS_NE_CHUNKER chunker = load(chunker_pickle) return chunker.parse_sents(tagged_sentences) nltk-3.7/nltk/chunk/api.py000066400000000000000000000035421420073152400155400ustar00rootroot00000000000000# Natural Language Toolkit: Chunk parsing API # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # Steven Bird (minor additions) # URL: # For license information, see LICENSE.TXT ##////////////////////////////////////////////////////// ## Chunk Parser Interface ##////////////////////////////////////////////////////// from nltk.chunk.util import ChunkScore from nltk.internals import deprecated from nltk.parse import ParserI class ChunkParserI(ParserI): """ A processing interface for identifying non-overlapping groups in unrestricted text. Typically, chunk parsers are used to find base syntactic constituents, such as base noun phrases. Unlike ``ParserI``, ``ChunkParserI`` guarantees that the ``parse()`` method will always generate a parse. """ def parse(self, tokens): """ Return the best chunk structure for the given tokens and return a tree. :param tokens: The list of (word, tag) tokens to be chunked. :type tokens: list(tuple) :rtype: Tree """ raise NotImplementedError() @deprecated("Use accuracy(gold) instead.") def evaluate(self, gold): return self.accuracy(gold) def accuracy(self, gold): """ Score the accuracy of the chunker against the gold standard. Remove the chunking the gold standard text, rechunk it using the chunker, and return a ``ChunkScore`` object reflecting the performance of this chunk parser. :type gold: list(Tree) :param gold: The list of chunked sentences to score the chunker on. :rtype: ChunkScore """ chunkscore = ChunkScore() for correct in gold: chunkscore.score(correct, self.parse(correct.leaves())) return chunkscore nltk-3.7/nltk/chunk/named_entity.py000066400000000000000000000250441420073152400174500ustar00rootroot00000000000000# Natural Language Toolkit: Chunk parsing API # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # URL: # For license information, see LICENSE.TXT """ Named entity chunker """ import os import pickle import re from xml.etree import ElementTree as ET from nltk.tag import ClassifierBasedTagger, pos_tag try: from nltk.classify import MaxentClassifier except ImportError: pass from nltk.chunk.api import ChunkParserI from nltk.chunk.util import ChunkScore from nltk.data import find from nltk.tokenize import word_tokenize from nltk.tree import Tree class NEChunkParserTagger(ClassifierBasedTagger): """ The IOB tagger used by the chunk parser. 
""" def __init__(self, train): ClassifierBasedTagger.__init__( self, train=train, classifier_builder=self._classifier_builder ) def _classifier_builder(self, train): return MaxentClassifier.train( train, algorithm="megam", gaussian_prior_sigma=1, trace=2 ) def _english_wordlist(self): try: wl = self._en_wordlist except AttributeError: from nltk.corpus import words self._en_wordlist = set(words.words("en-basic")) wl = self._en_wordlist return wl def _feature_detector(self, tokens, index, history): word = tokens[index][0] pos = simplify_pos(tokens[index][1]) if index == 0: prevword = prevprevword = None prevpos = prevprevpos = None prevshape = prevtag = prevprevtag = None elif index == 1: prevword = tokens[index - 1][0].lower() prevprevword = None prevpos = simplify_pos(tokens[index - 1][1]) prevprevpos = None prevtag = history[index - 1][0] prevshape = prevprevtag = None else: prevword = tokens[index - 1][0].lower() prevprevword = tokens[index - 2][0].lower() prevpos = simplify_pos(tokens[index - 1][1]) prevprevpos = simplify_pos(tokens[index - 2][1]) prevtag = history[index - 1] prevprevtag = history[index - 2] prevshape = shape(prevword) if index == len(tokens) - 1: nextword = nextnextword = None nextpos = nextnextpos = None elif index == len(tokens) - 2: nextword = tokens[index + 1][0].lower() nextpos = tokens[index + 1][1].lower() nextnextword = None nextnextpos = None else: nextword = tokens[index + 1][0].lower() nextpos = tokens[index + 1][1].lower() nextnextword = tokens[index + 2][0].lower() nextnextpos = tokens[index + 2][1].lower() # 89.6 features = { "bias": True, "shape": shape(word), "wordlen": len(word), "prefix3": word[:3].lower(), "suffix3": word[-3:].lower(), "pos": pos, "word": word, "en-wordlist": (word in self._english_wordlist()), "prevtag": prevtag, "prevpos": prevpos, "nextpos": nextpos, "prevword": prevword, "nextword": nextword, "word+nextpos": f"{word.lower()}+{nextpos}", "pos+prevtag": f"{pos}+{prevtag}", "shape+prevtag": f"{prevshape}+{prevtag}", } return features class NEChunkParser(ChunkParserI): """ Expected input: list of pos-tagged words """ def __init__(self, train): self._train(train) def parse(self, tokens): """ Each token should be a pos-tagged word """ tagged = self._tagger.tag(tokens) tree = self._tagged_to_parse(tagged) return tree def _train(self, corpus): # Convert to tagged sequence corpus = [self._parse_to_tagged(s) for s in corpus] self._tagger = NEChunkParserTagger(train=corpus) def _tagged_to_parse(self, tagged_tokens): """ Convert a list of tagged tokens to a chunk-parse tree. """ sent = Tree("S", []) for (tok, tag) in tagged_tokens: if tag == "O": sent.append(tok) elif tag.startswith("B-"): sent.append(Tree(tag[2:], [tok])) elif tag.startswith("I-"): if sent and isinstance(sent[-1], Tree) and sent[-1].label() == tag[2:]: sent[-1].append(tok) else: sent.append(Tree(tag[2:], [tok])) return sent @staticmethod def _parse_to_tagged(sent): """ Convert a chunk-parse tree to a list of tagged tokens. 
""" toks = [] for child in sent: if isinstance(child, Tree): if len(child) == 0: print("Warning -- empty chunk in sentence") continue toks.append((child[0], f"B-{child.label()}")) for tok in child[1:]: toks.append((tok, f"I-{child.label()}")) else: toks.append((child, "O")) return toks def shape(word): if re.match(r"[0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$", word, re.UNICODE): return "number" elif re.match(r"\W+$", word, re.UNICODE): return "punct" elif re.match(r"\w+$", word, re.UNICODE): if word.istitle(): return "upcase" elif word.islower(): return "downcase" else: return "mixedcase" else: return "other" def simplify_pos(s): if s.startswith("V"): return "V" else: return s.split("-")[0] def postag_tree(tree): # Part-of-speech tagging. words = tree.leaves() tag_iter = (pos for (word, pos) in pos_tag(words)) newtree = Tree("S", []) for child in tree: if isinstance(child, Tree): newtree.append(Tree(child.label(), [])) for subchild in child: newtree[-1].append((subchild, next(tag_iter))) else: newtree.append((child, next(tag_iter))) return newtree def load_ace_data(roots, fmt="binary", skip_bnews=True): for root in roots: for root, dirs, files in os.walk(root): if root.endswith("bnews") and skip_bnews: continue for f in files: if f.endswith(".sgm"): yield from load_ace_file(os.path.join(root, f), fmt) def load_ace_file(textfile, fmt): print(f" - {os.path.split(textfile)[1]}") annfile = textfile + ".tmx.rdc.xml" # Read the xml file, and get a list of entities entities = [] with open(annfile) as infile: xml = ET.parse(infile).getroot() for entity in xml.findall("document/entity"): typ = entity.find("entity_type").text for mention in entity.findall("entity_mention"): if mention.get("TYPE") != "NAME": continue # only NEs s = int(mention.find("head/charseq/start").text) e = int(mention.find("head/charseq/end").text) + 1 entities.append((s, e, typ)) # Read the text file, and mark the entities. with open(textfile) as infile: text = infile.read() # Strip XML tags, since they don't count towards the indices text = re.sub("<(?!/?TEXT)[^>]+>", "", text) # Blank out anything before/after def subfunc(m): return " " * (m.end() - m.start() - 6) text = re.sub(r"[\s\S]*", subfunc, text) text = re.sub(r"[\s\S]*", "", text) # Simplify quotes text = re.sub("``", ' "', text) text = re.sub("''", '" ', text) entity_types = {typ for (s, e, typ) in entities} # Binary distinction (NE or not NE) if fmt == "binary": i = 0 toks = Tree("S", []) for (s, e, typ) in sorted(entities): if s < i: s = i # Overlapping! Deal with this better? if e <= s: continue toks.extend(word_tokenize(text[i:s])) toks.append(Tree("NE", text[s:e].split())) i = e toks.extend(word_tokenize(text[i:])) yield toks # Multiclass distinction (NE type) elif fmt == "multiclass": i = 0 toks = Tree("S", []) for (s, e, typ) in sorted(entities): if s < i: s = i # Overlapping! Deal with this better? if e <= s: continue toks.extend(word_tokenize(text[i:s])) toks.append(Tree(typ, text[s:e].split())) i = e toks.extend(word_tokenize(text[i:])) yield toks else: raise ValueError("bad fmt value") # This probably belongs in a more general-purpose location (as does # the parse_to_tagged function). 
def cmp_chunks(correct, guessed): correct = NEChunkParser._parse_to_tagged(correct) guessed = NEChunkParser._parse_to_tagged(guessed) ellipsis = False for (w, ct), (w, gt) in zip(correct, guessed): if ct == gt == "O": if not ellipsis: print(f" {ct:15} {gt:15} {w}") print(" {:15} {:15} {2}".format("...", "...", "...")) ellipsis = True else: ellipsis = False print(f" {ct:15} {gt:15} {w}") def build_model(fmt="binary"): print("Loading training data...") train_paths = [ find("corpora/ace_data/ace.dev"), find("corpora/ace_data/ace.heldout"), find("corpora/ace_data/bbn.dev"), find("corpora/ace_data/muc.dev"), ] train_trees = load_ace_data(train_paths, fmt) train_data = [postag_tree(t) for t in train_trees] print("Training...") cp = NEChunkParser(train_data) del train_data print("Loading eval data...") eval_paths = [find("corpora/ace_data/ace.eval")] eval_trees = load_ace_data(eval_paths, fmt) eval_data = [postag_tree(t) for t in eval_trees] print("Evaluating...") chunkscore = ChunkScore() for i, correct in enumerate(eval_data): guess = cp.parse(correct.leaves()) chunkscore.score(correct, guess) if i < 3: cmp_chunks(correct, guess) print(chunkscore) outfilename = f"/tmp/ne_chunker_{fmt}.pickle" print(f"Saving chunker to {outfilename}...") with open(outfilename, "wb") as outfile: pickle.dump(cp, outfile, -1) return cp if __name__ == "__main__": # Make sure that the pickled object has the right class name: from nltk.chunk.named_entity import build_model build_model("binary") build_model("multiclass") nltk-3.7/nltk/chunk/regexp.py000066400000000000000000001522661420073152400162710ustar00rootroot00000000000000# Natural Language Toolkit: Regular Expression Chunkers # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # Steven Bird (minor additions) # URL: # For license information, see LICENSE.TXT import re from nltk.chunk.api import ChunkParserI from nltk.tree import Tree # ////////////////////////////////////////////////////// # ChunkString # ////////////////////////////////////////////////////// class ChunkString: """ A string-based encoding of a particular chunking of a text. Internally, the ``ChunkString`` class uses a single string to encode the chunking of the input text. This string contains a sequence of angle-bracket delimited tags, with chunking indicated by braces. An example of this encoding is:: {
<DT><JJ><NN>}<VBD><IN>{<DT><NN>}<.>{<DT><NN>
    }<.> ``ChunkString`` are created from tagged texts (i.e., lists of ``tokens`` whose type is ``TaggedType``). Initially, nothing is chunked. The chunking of a ``ChunkString`` can be modified with the ``xform()`` method, which uses a regular expression to transform the string representation. These transformations should only add and remove braces; they should *not* modify the sequence of angle-bracket delimited tags. :type _str: str :ivar _str: The internal string representation of the text's encoding. This string representation contains a sequence of angle-bracket delimited tags, with chunking indicated by braces. An example of this encoding is:: {
<DT><JJ><NN>}<VBD><IN>{<DT><NN>}<.>{<DT><NN>
    }<.> :type _pieces: list(tagged tokens and chunks) :ivar _pieces: The tagged tokens and chunks encoded by this ``ChunkString``. :ivar _debug: The debug level. See the constructor docs. :cvar IN_CHUNK_PATTERN: A zero-width regexp pattern string that will only match positions that are in chunks. :cvar IN_STRIP_PATTERN: A zero-width regexp pattern string that will only match positions that are in strips. """ CHUNK_TAG_CHAR = r"[^\{\}<>]" CHUNK_TAG = r"(<%s+?>)" % CHUNK_TAG_CHAR IN_CHUNK_PATTERN = r"(?=[^\{]*\})" IN_STRIP_PATTERN = r"(?=[^\}]*(\{|$))" # These are used by _verify _CHUNK = r"(\{%s+?\})+?" % CHUNK_TAG _STRIP = r"(%s+?)+?" % CHUNK_TAG _VALID = re.compile(r"^(\{?%s\}?)*?$" % CHUNK_TAG) _BRACKETS = re.compile(r"[^\{\}]+") _BALANCED_BRACKETS = re.compile(r"(\{\})*$") def __init__(self, chunk_struct, debug_level=1): """ Construct a new ``ChunkString`` that encodes the chunking of the text ``tagged_tokens``. :type chunk_struct: Tree :param chunk_struct: The chunk structure to be further chunked. :type debug_level: int :param debug_level: The level of debugging which should be applied to transformations on the ``ChunkString``. The valid levels are: - 0: no checks - 1: full check on to_chunkstruct - 2: full check on to_chunkstruct and cursory check after each transformation. - 3: full check on to_chunkstruct and full check after each transformation. We recommend you use at least level 1. You should probably use level 3 if you use any non-standard subclasses of ``RegexpChunkRule``. """ self._root_label = chunk_struct.label() self._pieces = chunk_struct[:] tags = [self._tag(tok) for tok in self._pieces] self._str = "<" + "><".join(tags) + ">" self._debug = debug_level def _tag(self, tok): if isinstance(tok, tuple): return tok[1] elif isinstance(tok, Tree): return tok.label() else: raise ValueError("chunk structures must contain tagged " "tokens or trees") def _verify(self, s, verify_tags): """ Check to make sure that ``s`` still corresponds to some chunked version of ``_pieces``. :type verify_tags: bool :param verify_tags: Whether the individual tags should be checked. If this is false, ``_verify`` will check to make sure that ``_str`` encodes a chunked version of *some* list of tokens. If this is true, then ``_verify`` will check to make sure that the tags in ``_str`` match those in ``_pieces``. :raise ValueError: if the internal string representation of this ``ChunkString`` is invalid or not consistent with _pieces. """ # Check overall form if not ChunkString._VALID.match(s): raise ValueError( "Transformation generated invalid " "chunkstring:\n %s" % s ) # Check that parens are balanced. If the string is long, we # have to do this in pieces, to avoid a maximum recursion # depth limit for regular expressions. brackets = ChunkString._BRACKETS.sub("", s) for i in range(1 + len(brackets) // 5000): substr = brackets[i * 5000 : i * 5000 + 5000] if not ChunkString._BALANCED_BRACKETS.match(substr): raise ValueError( "Transformation generated invalid " "chunkstring:\n %s" % s ) if verify_tags <= 0: return tags1 = (re.split(r"[\{\}<>]+", s))[1:-1] tags2 = [self._tag(piece) for piece in self._pieces] if tags1 != tags2: raise ValueError( "Transformation generated invalid " "chunkstring: tag changed" ) def to_chunkstruct(self, chunk_label="CHUNK"): """ Return the chunk structure encoded by this ``ChunkString``. :rtype: Tree :raise ValueError: If a transformation has generated an invalid chunkstring. 
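A small round-trip sketch (added for illustration; the tokens and the chunking
regexp are invented) showing how ``xform()`` and ``to_chunkstruct()`` fit
together::

    >>> from nltk.tree import Tree
    >>> cs = ChunkString(Tree('S', [('the', 'DT'), ('cat', 'NN'), ('sat', 'VBD')]))
    >>> cs.xform(r'(?P<np><DT><NN>)', r'{\g<np>}')
    >>> print(cs.to_chunkstruct('NP'))
    (S (NP the/DT cat/NN) sat/VBD)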
""" if self._debug > 0: self._verify(self._str, 1) # Use this alternating list to create the chunkstruct. pieces = [] index = 0 piece_in_chunk = 0 for piece in re.split("[{}]", self._str): # Find the list of tokens contained in this piece. length = piece.count("<") subsequence = self._pieces[index : index + length] # Add this list of tokens to our pieces. if piece_in_chunk: pieces.append(Tree(chunk_label, subsequence)) else: pieces += subsequence # Update index, piece_in_chunk index += length piece_in_chunk = not piece_in_chunk return Tree(self._root_label, pieces) def xform(self, regexp, repl): """ Apply the given transformation to the string encoding of this ``ChunkString``. In particular, find all occurrences that match ``regexp``, and replace them using ``repl`` (as done by ``re.sub``). This transformation should only add and remove braces; it should *not* modify the sequence of angle-bracket delimited tags. Furthermore, this transformation may not result in improper bracketing. Note, in particular, that bracketing may not be nested. :type regexp: str or regexp :param regexp: A regular expression matching the substring that should be replaced. This will typically include a named group, which can be used by ``repl``. :type repl: str :param repl: An expression specifying what should replace the matched substring. Typically, this will include a named replacement group, specified by ``regexp``. :rtype: None :raise ValueError: If this transformation generated an invalid chunkstring. """ # Do the actual substitution s = re.sub(regexp, repl, self._str) # The substitution might have generated "empty chunks" # (substrings of the form "{}"). Remove them, so they don't # interfere with other transformations. s = re.sub(r"\{\}", "", s) # Make sure that the transformation was legal. if self._debug > 1: self._verify(s, self._debug - 2) # Commit the transformation. self._str = s def __repr__(self): """ Return a string representation of this ``ChunkString``. It has the form:: }{
    }'> :rtype: str """ return "" % repr(self._str) def __str__(self): """ Return a formatted representation of this ``ChunkString``. This representation will include extra spaces to ensure that tags will line up with the representation of other ``ChunkStrings`` for the same text, regardless of the chunking. :rtype: str """ # Add spaces to make everything line up. str = re.sub(r">(?!\})", r"> ", self._str) str = re.sub(r"([^\{])<", r"\1 <", str) if str[0] == "<": str = " " + str return str # ////////////////////////////////////////////////////// # Chunking Rules # ////////////////////////////////////////////////////// class RegexpChunkRule: """ A rule specifying how to modify the chunking in a ``ChunkString``, using a transformational regular expression. The ``RegexpChunkRule`` class itself can be used to implement any transformational rule based on regular expressions. There are also a number of subclasses, which can be used to implement simpler types of rules, based on matching regular expressions. Each ``RegexpChunkRule`` has a regular expression and a replacement expression. When a ``RegexpChunkRule`` is "applied" to a ``ChunkString``, it searches the ``ChunkString`` for any substring that matches the regular expression, and replaces it using the replacement expression. This search/replace operation has the same semantics as ``re.sub``. Each ``RegexpChunkRule`` also has a description string, which gives a short (typically less than 75 characters) description of the purpose of the rule. This transformation defined by this ``RegexpChunkRule`` should only add and remove braces; it should *not* modify the sequence of angle-bracket delimited tags. Furthermore, this transformation may not result in nested or mismatched bracketing. """ def __init__(self, regexp, repl, descr): """ Construct a new RegexpChunkRule. :type regexp: regexp or str :param regexp: The regular expression for this ``RegexpChunkRule``. When this rule is applied to a ``ChunkString``, any substring that matches ``regexp`` will be replaced using the replacement string ``repl``. Note that this must be a normal regular expression, not a tag pattern. :type repl: str :param repl: The replacement expression for this ``RegexpChunkRule``. When this rule is applied to a ``ChunkString``, any substring that matches ``regexp`` will be replaced using ``repl``. :type descr: str :param descr: A short description of the purpose and/or effect of this rule. """ if isinstance(regexp, str): regexp = re.compile(regexp) self._repl = repl self._descr = descr self._regexp = regexp def apply(self, chunkstr): # Keep docstring generic so we can inherit it. """ Apply this rule to the given ``ChunkString``. See the class reference documentation for a description of what it means to apply a rule. :type chunkstr: ChunkString :param chunkstr: The chunkstring to which this rule is applied. :rtype: None :raise ValueError: If this transformation generated an invalid chunkstring. """ chunkstr.xform(self._regexp, self._repl) def descr(self): """ Return a short description of the purpose and/or effect of this rule. :rtype: str """ return self._descr def __repr__(self): """ Return a string representation of this rule. It has the form:: }'->''> Note that this representation does not include the description string; that string can be accessed separately with the ``descr()`` method. :rtype: str """ return ( "" + repr(self._repl) + ">" ) @staticmethod def fromstring(s): """ Create a RegexpChunkRule from a string description. 
Currently, the following formats are supported:: {regexp} # chunk rule }regexp{ # strip rule regexp}{regexp # split rule regexp{}regexp # merge rule Where ``regexp`` is a regular expression for the rule. Any text following the comment marker (``#``) will be used as the rule's description: >>> from nltk.chunk.regexp import RegexpChunkRule >>> RegexpChunkRule.fromstring('{
    ?+}') ?+'> """ # Split off the comment (but don't split on '\#') m = re.match(r"(?P(\\.|[^#])*)(?P#.*)?", s) rule = m.group("rule").strip() comment = (m.group("comment") or "")[1:].strip() # Pattern bodies: chunk, strip, split, merge try: if not rule: raise ValueError("Empty chunk pattern") if rule[0] == "{" and rule[-1] == "}": return ChunkRule(rule[1:-1], comment) elif rule[0] == "}" and rule[-1] == "{": return StripRule(rule[1:-1], comment) elif "}{" in rule: left, right = rule.split("}{") return SplitRule(left, right, comment) elif "{}" in rule: left, right = rule.split("{}") return MergeRule(left, right, comment) elif re.match("[^{}]*{[^{}]*}[^{}]*", rule): left, chunk, right = re.split("[{}]", rule) return ChunkRuleWithContext(left, chunk, right, comment) else: raise ValueError("Illegal chunk pattern: %s" % rule) except (ValueError, re.error) as e: raise ValueError("Illegal chunk pattern: %s" % rule) from e class ChunkRule(RegexpChunkRule): """ A rule specifying how to add chunks to a ``ChunkString``, using a matching tag pattern. When applied to a ``ChunkString``, it will find any substring that matches this tag pattern and that is not already part of a chunk, and create a new chunk containing that substring. """ def __init__(self, tag_pattern, descr): """ Construct a new ``ChunkRule``. :type tag_pattern: str :param tag_pattern: This rule's tag pattern. When applied to a ``ChunkString``, this rule will chunk any substring that matches this tag pattern and that is not already part of a chunk. :type descr: str :param descr: A short description of the purpose and/or effect of this rule. """ self._pattern = tag_pattern regexp = re.compile( "(?P%s)%s" % (tag_pattern2re_pattern(tag_pattern), ChunkString.IN_STRIP_PATTERN) ) RegexpChunkRule.__init__(self, regexp, r"{\g}", descr) def __repr__(self): """ Return a string representation of this rule. It has the form:: '> Note that this representation does not include the description string; that string can be accessed separately with the ``descr()`` method. :rtype: str """ return "" class StripRule(RegexpChunkRule): """ A rule specifying how to remove strips to a ``ChunkString``, using a matching tag pattern. When applied to a ``ChunkString``, it will find any substring that matches this tag pattern and that is contained in a chunk, and remove it from that chunk, thus creating two new chunks. """ def __init__(self, tag_pattern, descr): """ Construct a new ``StripRule``. :type tag_pattern: str :param tag_pattern: This rule's tag pattern. When applied to a ``ChunkString``, this rule will find any substring that matches this tag pattern and that is contained in a chunk, and remove it from that chunk, thus creating two new chunks. :type descr: str :param descr: A short description of the purpose and/or effect of this rule. """ self._pattern = tag_pattern regexp = re.compile( "(?P%s)%s" % (tag_pattern2re_pattern(tag_pattern), ChunkString.IN_CHUNK_PATTERN) ) RegexpChunkRule.__init__(self, regexp, r"}\g{", descr) def __repr__(self): """ Return a string representation of this rule. It has the form:: '> Note that this representation does not include the description string; that string can be accessed separately with the ``descr()`` method. :rtype: str """ return "" class UnChunkRule(RegexpChunkRule): """ A rule specifying how to remove chunks to a ``ChunkString``, using a matching tag pattern. When applied to a ``ChunkString``, it will find any complete chunk that matches this tag pattern, and un-chunk it. 
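An illustrative sketch (added here; the rule, sentence and tags are invented)
of building one of the rules described above with ``RegexpChunkRule.fromstring``
and applying it through a ``RegexpChunkParser``::

    >>> from nltk.tree import Tree
    >>> rule = RegexpChunkRule.fromstring('{<DT>?<JJ>*<NN>}  # chunk det, adjectives, noun')
    >>> parser = RegexpChunkParser([rule], chunk_label='NP')
    >>> sent = Tree('S', [('the', 'DT'), ('little', 'JJ'), ('cat', 'NN'), ('sat', 'VBD')])
    >>> print(parser.parse(sent))
    (S (NP the/DT little/JJ cat/NN) sat/VBD)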
""" def __init__(self, tag_pattern, descr): """ Construct a new ``UnChunkRule``. :type tag_pattern: str :param tag_pattern: This rule's tag pattern. When applied to a ``ChunkString``, this rule will find any complete chunk that matches this tag pattern, and un-chunk it. :type descr: str :param descr: A short description of the purpose and/or effect of this rule. """ self._pattern = tag_pattern regexp = re.compile(r"\{(?P%s)\}" % tag_pattern2re_pattern(tag_pattern)) RegexpChunkRule.__init__(self, regexp, r"\g", descr) def __repr__(self): """ Return a string representation of this rule. It has the form:: '> Note that this representation does not include the description string; that string can be accessed separately with the ``descr()`` method. :rtype: str """ return "" class MergeRule(RegexpChunkRule): """ A rule specifying how to merge chunks in a ``ChunkString``, using two matching tag patterns: a left pattern, and a right pattern. When applied to a ``ChunkString``, it will find any chunk whose end matches left pattern, and immediately followed by a chunk whose beginning matches right pattern. It will then merge those two chunks into a single chunk. """ def __init__(self, left_tag_pattern, right_tag_pattern, descr): """ Construct a new ``MergeRule``. :type right_tag_pattern: str :param right_tag_pattern: This rule's right tag pattern. When applied to a ``ChunkString``, this rule will find any chunk whose end matches ``left_tag_pattern``, and immediately followed by a chunk whose beginning matches this pattern. It will then merge those two chunks into a single chunk. :type left_tag_pattern: str :param left_tag_pattern: This rule's left tag pattern. When applied to a ``ChunkString``, this rule will find any chunk whose end matches this pattern, and immediately followed by a chunk whose beginning matches ``right_tag_pattern``. It will then merge those two chunks into a single chunk. :type descr: str :param descr: A short description of the purpose and/or effect of this rule. """ # Ensure that the individual patterns are coherent. E.g., if # left='(' and right=')', then this will raise an exception: re.compile(tag_pattern2re_pattern(left_tag_pattern)) re.compile(tag_pattern2re_pattern(right_tag_pattern)) self._left_tag_pattern = left_tag_pattern self._right_tag_pattern = right_tag_pattern regexp = re.compile( "(?P%s)}{(?=%s)" % ( tag_pattern2re_pattern(left_tag_pattern), tag_pattern2re_pattern(right_tag_pattern), ) ) RegexpChunkRule.__init__(self, regexp, r"\g", descr) def __repr__(self): """ Return a string representation of this rule. It has the form:: ', ''> Note that this representation does not include the description string; that string can be accessed separately with the ``descr()`` method. :rtype: str """ return ( "" ) class SplitRule(RegexpChunkRule): """ A rule specifying how to split chunks in a ``ChunkString``, using two matching tag patterns: a left pattern, and a right pattern. When applied to a ``ChunkString``, it will find any chunk that matches the left pattern followed by the right pattern. It will then split the chunk into two new chunks, at the point between the two pattern matches. """ def __init__(self, left_tag_pattern, right_tag_pattern, descr): """ Construct a new ``SplitRule``. :type right_tag_pattern: str :param right_tag_pattern: This rule's right tag pattern. When applied to a ``ChunkString``, this rule will find any chunk containing a substring that matches ``left_tag_pattern`` followed by this pattern. 
It will then split the chunk into two new chunks at the point between these two matching patterns. :type left_tag_pattern: str :param left_tag_pattern: This rule's left tag pattern. When applied to a ``ChunkString``, this rule will find any chunk containing a substring that matches this pattern followed by ``right_tag_pattern``. It will then split the chunk into two new chunks at the point between these two matching patterns. :type descr: str :param descr: A short description of the purpose and/or effect of this rule. """ # Ensure that the individual patterns are coherent. E.g., if # left='(' and right=')', then this will raise an exception: re.compile(tag_pattern2re_pattern(left_tag_pattern)) re.compile(tag_pattern2re_pattern(right_tag_pattern)) self._left_tag_pattern = left_tag_pattern self._right_tag_pattern = right_tag_pattern regexp = re.compile( "(?P%s)(?=%s)" % ( tag_pattern2re_pattern(left_tag_pattern), tag_pattern2re_pattern(right_tag_pattern), ) ) RegexpChunkRule.__init__(self, regexp, r"\g}{", descr) def __repr__(self): """ Return a string representation of this rule. It has the form:: ', '
    '> Note that this representation does not include the description string; that string can be accessed separately with the ``descr()`` method. :rtype: str """ return ( "" ) class ExpandLeftRule(RegexpChunkRule): """ A rule specifying how to expand chunks in a ``ChunkString`` to the left, using two matching tag patterns: a left pattern, and a right pattern. When applied to a ``ChunkString``, it will find any chunk whose beginning matches right pattern, and immediately preceded by a strip whose end matches left pattern. It will then expand the chunk to incorporate the new material on the left. """ def __init__(self, left_tag_pattern, right_tag_pattern, descr): """ Construct a new ``ExpandRightRule``. :type right_tag_pattern: str :param right_tag_pattern: This rule's right tag pattern. When applied to a ``ChunkString``, this rule will find any chunk whose beginning matches ``right_tag_pattern``, and immediately preceded by a strip whose end matches this pattern. It will then merge those two chunks into a single chunk. :type left_tag_pattern: str :param left_tag_pattern: This rule's left tag pattern. When applied to a ``ChunkString``, this rule will find any chunk whose beginning matches this pattern, and immediately preceded by a strip whose end matches ``left_tag_pattern``. It will then expand the chunk to incorporate the new material on the left. :type descr: str :param descr: A short description of the purpose and/or effect of this rule. """ # Ensure that the individual patterns are coherent. E.g., if # left='(' and right=')', then this will raise an exception: re.compile(tag_pattern2re_pattern(left_tag_pattern)) re.compile(tag_pattern2re_pattern(right_tag_pattern)) self._left_tag_pattern = left_tag_pattern self._right_tag_pattern = right_tag_pattern regexp = re.compile( r"(?P%s)\{(?P%s)" % ( tag_pattern2re_pattern(left_tag_pattern), tag_pattern2re_pattern(right_tag_pattern), ) ) RegexpChunkRule.__init__(self, regexp, r"{\g\g", descr) def __repr__(self): """ Return a string representation of this rule. It has the form:: ', ''> Note that this representation does not include the description string; that string can be accessed separately with the ``descr()`` method. :rtype: str """ return ( "" ) class ExpandRightRule(RegexpChunkRule): """ A rule specifying how to expand chunks in a ``ChunkString`` to the right, using two matching tag patterns: a left pattern, and a right pattern. When applied to a ``ChunkString``, it will find any chunk whose end matches left pattern, and immediately followed by a strip whose beginning matches right pattern. It will then expand the chunk to incorporate the new material on the right. """ def __init__(self, left_tag_pattern, right_tag_pattern, descr): """ Construct a new ``ExpandRightRule``. :type right_tag_pattern: str :param right_tag_pattern: This rule's right tag pattern. When applied to a ``ChunkString``, this rule will find any chunk whose end matches ``left_tag_pattern``, and immediately followed by a strip whose beginning matches this pattern. It will then merge those two chunks into a single chunk. :type left_tag_pattern: str :param left_tag_pattern: This rule's left tag pattern. When applied to a ``ChunkString``, this rule will find any chunk whose end matches this pattern, and immediately followed by a strip whose beginning matches ``right_tag_pattern``. It will then expand the chunk to incorporate the new material on the right. :type descr: str :param descr: A short description of the purpose and/or effect of this rule. 
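As an illustration of combining the rule types described above (a sketch added
here; the patterns and sentence are invented), a ``ChunkRule`` can first chunk
single tags and a ``MergeRule`` can then join adjacent chunks::

    >>> from nltk.tree import Tree
    >>> rules = [ChunkRule(r'<DT>|<JJ>|<NN>', 'chunk individual det/adj/noun tags'),
    ...          MergeRule(r'<DT>|<JJ>', r'<JJ>|<NN>', 'merge det/adj with adj/noun')]
    >>> parser = RegexpChunkParser(rules, chunk_label='NP')
    >>> print(parser.parse(Tree('S', [('the', 'DT'), ('cat', 'NN'), ('sat', 'VBD')])))
    (S (NP the/DT cat/NN) sat/VBD)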
""" # Ensure that the individual patterns are coherent. E.g., if # left='(' and right=')', then this will raise an exception: re.compile(tag_pattern2re_pattern(left_tag_pattern)) re.compile(tag_pattern2re_pattern(right_tag_pattern)) self._left_tag_pattern = left_tag_pattern self._right_tag_pattern = right_tag_pattern regexp = re.compile( r"(?P%s)\}(?P%s)" % ( tag_pattern2re_pattern(left_tag_pattern), tag_pattern2re_pattern(right_tag_pattern), ) ) RegexpChunkRule.__init__(self, regexp, r"\g\g}", descr) def __repr__(self): """ Return a string representation of this rule. It has the form:: ', ''> Note that this representation does not include the description string; that string can be accessed separately with the ``descr()`` method. :rtype: str """ return ( "" ) class ChunkRuleWithContext(RegexpChunkRule): """ A rule specifying how to add chunks to a ``ChunkString``, using three matching tag patterns: one for the left context, one for the chunk, and one for the right context. When applied to a ``ChunkString``, it will find any substring that matches the chunk tag pattern, is surrounded by substrings that match the two context patterns, and is not already part of a chunk; and create a new chunk containing the substring that matched the chunk tag pattern. Caveat: Both the left and right context are consumed when this rule matches; therefore, if you need to find overlapping matches, you will need to apply your rule more than once. """ def __init__( self, left_context_tag_pattern, chunk_tag_pattern, right_context_tag_pattern, descr, ): """ Construct a new ``ChunkRuleWithContext``. :type left_context_tag_pattern: str :param left_context_tag_pattern: A tag pattern that must match the left context of ``chunk_tag_pattern`` for this rule to apply. :type chunk_tag_pattern: str :param chunk_tag_pattern: A tag pattern that must match for this rule to apply. If the rule does apply, then this pattern also identifies the substring that will be made into a chunk. :type right_context_tag_pattern: str :param right_context_tag_pattern: A tag pattern that must match the right context of ``chunk_tag_pattern`` for this rule to apply. :type descr: str :param descr: A short description of the purpose and/or effect of this rule. """ # Ensure that the individual patterns are coherent. E.g., if # left='(' and right=')', then this will raise an exception: re.compile(tag_pattern2re_pattern(left_context_tag_pattern)) re.compile(tag_pattern2re_pattern(chunk_tag_pattern)) re.compile(tag_pattern2re_pattern(right_context_tag_pattern)) self._left_context_tag_pattern = left_context_tag_pattern self._chunk_tag_pattern = chunk_tag_pattern self._right_context_tag_pattern = right_context_tag_pattern regexp = re.compile( "(?P%s)(?P%s)(?P%s)%s" % ( tag_pattern2re_pattern(left_context_tag_pattern), tag_pattern2re_pattern(chunk_tag_pattern), tag_pattern2re_pattern(right_context_tag_pattern), ChunkString.IN_STRIP_PATTERN, ) ) replacement = r"\g{\g}\g" RegexpChunkRule.__init__(self, regexp, replacement, descr) def __repr__(self): """ Return a string representation of this rule. It has the form:: ', '', '
    '> Note that this representation does not include the description string; that string can be accessed separately with the ``descr()`` method. :rtype: str """ return "".format( self._left_context_tag_pattern, self._chunk_tag_pattern, self._right_context_tag_pattern, ) # ////////////////////////////////////////////////////// # Tag Pattern Format Conversion # ////////////////////////////////////////////////////// # this should probably be made more strict than it is -- e.g., it # currently accepts 'foo'. CHUNK_TAG_PATTERN = re.compile( r"^(({}|<{}>)*)$".format(r"([^\{\}<>]|\{\d+,?\}|\{\d*,\d+\})+", r"[^\{\}<>]+") ) def tag_pattern2re_pattern(tag_pattern): """ Convert a tag pattern to a regular expression pattern. A "tag pattern" is a modified version of a regular expression, designed for matching sequences of tags. The differences between regular expression patterns and tag patterns are: - In tag patterns, ``'<'`` and ``'>'`` act as parentheses; so ``'+'`` matches one or more repetitions of ``''``, not ``''``. - Whitespace in tag patterns is ignored. So ``'
<DT> | <NN>'`` is equivalent to ``'<DT>
    |'`` - In tag patterns, ``'.'`` is equivalent to ``'[^{}<>]'``; so ``''`` matches any single tag starting with ``'NN'``. In particular, ``tag_pattern2re_pattern`` performs the following transformations on the given pattern: - Replace '.' with '[^<>{}]' - Remove any whitespace - Add extra parens around '<' and '>', to make '<' and '>' act like parentheses. E.g., so that in '+', the '+' has scope over the entire ''; and so that in '', the '|' has scope over 'NN' and 'IN', but not '<' or '>'. - Check to make sure the resulting pattern is valid. :type tag_pattern: str :param tag_pattern: The tag pattern to convert to a regular expression pattern. :raise ValueError: If ``tag_pattern`` is not a valid tag pattern. In particular, ``tag_pattern`` should not include braces; and it should not contain nested or mismatched angle-brackets. :rtype: str :return: A regular expression pattern corresponding to ``tag_pattern``. """ # Clean up the regular expression tag_pattern = re.sub(r"\s", "", tag_pattern) tag_pattern = re.sub(r"<", "(<(", tag_pattern) tag_pattern = re.sub(r">", ")>)", tag_pattern) # Check the regular expression if not CHUNK_TAG_PATTERN.match(tag_pattern): raise ValueError("Bad tag pattern: %r" % tag_pattern) # Replace "." with CHUNK_TAG_CHAR. # We have to do this after, since it adds {}[]<>s, which would # confuse CHUNK_TAG_PATTERN. # PRE doesn't have lookback assertions, so reverse twice, and do # the pattern backwards (with lookahead assertions). This can be # made much cleaner once we can switch back to SRE. def reverse_str(str): lst = list(str) lst.reverse() return "".join(lst) tc_rev = reverse_str(ChunkString.CHUNK_TAG_CHAR) reversed = reverse_str(tag_pattern) reversed = re.sub(r"\.(?!\\(\\\\)*($|[^\\]))", tc_rev, reversed) tag_pattern = reverse_str(reversed) return tag_pattern # ////////////////////////////////////////////////////// # RegexpChunkParser # ////////////////////////////////////////////////////// class RegexpChunkParser(ChunkParserI): """ A regular expression based chunk parser. ``RegexpChunkParser`` uses a sequence of "rules" to find chunks of a single type within a text. The chunking of the text is encoded using a ``ChunkString``, and each rule acts by modifying the chunking in the ``ChunkString``. The rules are all implemented using regular expression matching and substitution. The ``RegexpChunkRule`` class and its subclasses (``ChunkRule``, ``StripRule``, ``UnChunkRule``, ``MergeRule``, and ``SplitRule``) define the rules that are used by ``RegexpChunkParser``. Each rule defines an ``apply()`` method, which modifies the chunking encoded by a given ``ChunkString``. :type _rules: list(RegexpChunkRule) :ivar _rules: The list of rules that should be applied to a text. :type _trace: int :ivar _trace: The default level of tracing. """ def __init__(self, rules, chunk_label="NP", root_label="S", trace=0): """ Construct a new ``RegexpChunkParser``. :type rules: list(RegexpChunkRule) :param rules: The sequence of rules that should be used to generate the chunking for a tagged text. :type chunk_label: str :param chunk_label: The node value that should be used for chunk subtrees. This is typically a short string describing the type of information contained by the chunk, such as ``"NP"`` for base noun phrases. :type root_label: str :param root_label: The node value that should be used for the top node of the chunk structure. :type trace: int :param trace: The level of tracing that should be used when parsing a text. 
``0`` will generate no tracing output; ``1`` will generate normal tracing output; and ``2`` or higher will generate verbose tracing output. """ self._rules = rules self._trace = trace self._chunk_label = chunk_label self._root_label = root_label def _trace_apply(self, chunkstr, verbose): """ Apply each rule of this ``RegexpChunkParser`` to ``chunkstr``, in turn. Generate trace output between each rule. If ``verbose`` is true, then generate verbose output. :type chunkstr: ChunkString :param chunkstr: The chunk string to which each rule should be applied. :type verbose: bool :param verbose: Whether output should be verbose. :rtype: None """ print("# Input:") print(chunkstr) for rule in self._rules: rule.apply(chunkstr) if verbose: print("#", rule.descr() + " (" + repr(rule) + "):") else: print("#", rule.descr() + ":") print(chunkstr) def _notrace_apply(self, chunkstr): """ Apply each rule of this ``RegexpChunkParser`` to ``chunkstr``, in turn. :param chunkstr: The chunk string to which each rule should be applied. :type chunkstr: ChunkString :rtype: None """ for rule in self._rules: rule.apply(chunkstr) def parse(self, chunk_struct, trace=None): """ :type chunk_struct: Tree :param chunk_struct: the chunk structure to be (further) chunked :type trace: int :param trace: The level of tracing that should be used when parsing a text. ``0`` will generate no tracing output; ``1`` will generate normal tracing output; and ``2`` or higher will generate verbose tracing output. This value overrides the trace level value that was given to the constructor. :rtype: Tree :return: a chunk structure that encodes the chunks in a given tagged sentence. A chunk is a non-overlapping linguistic group, such as a noun phrase. The set of chunks identified in the chunk structure depends on the rules used to define this ``RegexpChunkParser``. """ if len(chunk_struct) == 0: print("Warning: parsing empty text") return Tree(self._root_label, []) try: chunk_struct.label() except AttributeError: chunk_struct = Tree(self._root_label, chunk_struct) # Use the default trace value? if trace is None: trace = self._trace chunkstr = ChunkString(chunk_struct) # Apply the sequence of rules to the chunkstring. if trace: verbose = trace > 1 self._trace_apply(chunkstr, verbose) else: self._notrace_apply(chunkstr) # Use the chunkstring to create a chunk structure. return chunkstr.to_chunkstruct(self._chunk_label) def rules(self): """ :return: the sequence of rules used by ``RegexpChunkParser``. :rtype: list(RegexpChunkRule) """ return self._rules def __repr__(self): """ :return: a concise string representation of this ``RegexpChunkParser``. :rtype: str """ return "" % len(self._rules) def __str__(self): """ :return: a verbose string representation of this ``RegexpChunkParser``. :rtype: str """ s = "RegexpChunkParser with %d rules:\n" % len(self._rules) margin = 0 for rule in self._rules: margin = max(margin, len(rule.descr())) if margin < 35: format = " %" + repr(-(margin + 3)) + "s%s\n" else: format = " %s\n %s\n" for rule in self._rules: s += format % (rule.descr(), repr(rule)) return s[:-1] # ////////////////////////////////////////////////////// # Chunk Grammar # ////////////////////////////////////////////////////// class RegexpParser(ChunkParserI): r""" A grammar based chunk parser. ``chunk.RegexpParser`` uses a set of regular expression patterns to specify the behavior of the parser. The chunking of the text is encoded using a ``ChunkString``, and each rule acts by modifying the chunking in the ``ChunkString``. 
The rules are all implemented using regular expression matching and substitution. A grammar contains one or more clauses in the following form:: NP: {} # chunk determiners and adjectives }<[\.VI].*>+{ # strip any tag beginning with V, I, or . <.*>}{
    # split a chunk at a determiner {} # merge chunk ending with det/adj # with one starting with a noun The patterns of a clause are executed in order. An earlier pattern may introduce a chunk boundary that prevents a later pattern from executing. Sometimes an individual pattern will match on multiple, overlapping extents of the input. As with regular expression substitution more generally, the chunker will identify the first match possible, then continue looking for matches after this one has ended. The clauses of a grammar are also executed in order. A cascaded chunk parser is one having more than one clause. The maximum depth of a parse tree created by this chunk parser is the same as the number of clauses in the grammar. When tracing is turned on, the comment portion of a line is displayed each time the corresponding pattern is applied. :type _start: str :ivar _start: The start symbol of the grammar (the root node of resulting trees) :type _stages: int :ivar _stages: The list of parsing stages corresponding to the grammar """ def __init__(self, grammar, root_label="S", loop=1, trace=0): """ Create a new chunk parser, from the given start state and set of chunk patterns. :param grammar: The grammar, or a list of RegexpChunkParser objects :type grammar: str or list(RegexpChunkParser) :param root_label: The top node of the tree being created :type root_label: str or Nonterminal :param loop: The number of times to run through the patterns :type loop: int :type trace: int :param trace: The level of tracing that should be used when parsing a text. ``0`` will generate no tracing output; ``1`` will generate normal tracing output; and ``2`` or higher will generate verbose tracing output. """ self._trace = trace self._stages = [] self._grammar = grammar self._loop = loop if isinstance(grammar, str): self._read_grammar(grammar, root_label, trace) else: # Make sur the grammar looks like it has the right type: type_err = ( "Expected string or list of RegexpChunkParsers " "for the grammar." ) try: grammar = list(grammar) except BaseException as e: raise TypeError(type_err) from e for elt in grammar: if not isinstance(elt, RegexpChunkParser): raise TypeError(type_err) self._stages = grammar def _read_grammar(self, grammar, root_label, trace): """ Helper function for __init__: read the grammar if it is a string. """ rules = [] lhs = None for line in grammar.split("\n"): line = line.strip() # New stage begins if there's an unescaped ':' m = re.match("(?P(\\.|[^:])*)(:(?P.*))", line) if m: # Record the stage that we just completed. self._add_stage(rules, lhs, root_label, trace) # Start a new stage. lhs = m.group("nonterminal").strip() rules = [] line = m.group("rule").strip() # Skip blank & comment-only lines if line == "" or line.startswith("#"): continue # Add the rule rules.append(RegexpChunkRule.fromstring(line)) # Record the final stage self._add_stage(rules, lhs, root_label, trace) def _add_stage(self, rules, lhs, root_label, trace): """ Helper function for __init__: add a new stage to the parser. """ if rules != []: if not lhs: raise ValueError("Expected stage marker (eg NP:)") parser = RegexpChunkParser( rules, chunk_label=lhs, root_label=root_label, trace=trace ) self._stages.append(parser) def parse(self, chunk_struct, trace=None): """ Apply the chunk parser to this input. 
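For example (an illustrative sketch added here; the grammar and the tagged
sentence are invented)::

    >>> grammar = r'''
    ...   NP: {<DT>?<JJ>*<NN>}   # chunk sequences of DT, JJ, NN
    ...   PP: {<IN><NP>}         # chunk a preposition followed by an NP
    ... '''
    >>> cp = RegexpParser(grammar)
    >>> print(cp.parse([('the', 'DT'), ('cat', 'NN'), ('sat', 'VBD'),
    ...                 ('on', 'IN'), ('the', 'DT'), ('mat', 'NN')]))
    (S (NP the/DT cat/NN) sat/VBD (PP on/IN (NP the/DT mat/NN)))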
:type chunk_struct: Tree :param chunk_struct: the chunk structure to be (further) chunked (this tree is modified, and is also returned) :type trace: int :param trace: The level of tracing that should be used when parsing a text. ``0`` will generate no tracing output; ``1`` will generate normal tracing output; and ``2`` or higher will generate verbose tracing output. This value overrides the trace level value that was given to the constructor. :return: the chunked output. :rtype: Tree """ if trace is None: trace = self._trace for i in range(self._loop): for parser in self._stages: chunk_struct = parser.parse(chunk_struct, trace=trace) return chunk_struct def __repr__(self): """ :return: a concise string representation of this ``chunk.RegexpParser``. :rtype: str """ return "" % len(self._stages) def __str__(self): """ :return: a verbose string representation of this ``RegexpParser``. :rtype: str """ s = "chunk.RegexpParser with %d stages:\n" % len(self._stages) margin = 0 for parser in self._stages: s += "%s\n" % parser return s[:-1] # ////////////////////////////////////////////////////// # Demonstration code # ////////////////////////////////////////////////////// def demo_eval(chunkparser, text): """ Demonstration code for evaluating a chunk parser, using a ``ChunkScore``. This function assumes that ``text`` contains one sentence per line, and that each sentence has the form expected by ``tree.chunk``. It runs the given chunk parser on each sentence in the text, and scores the result. It prints the final score (precision, recall, and f-measure); and reports the set of chunks that were missed and the set of chunks that were incorrect. (At most 10 missing chunks and 10 incorrect chunks are reported). :param chunkparser: The chunkparser to be tested :type chunkparser: ChunkParserI :param text: The chunked tagged text that should be used for evaluation. :type text: str """ from nltk import chunk from nltk.tree import Tree # Evaluate our chunk parser. chunkscore = chunk.ChunkScore() for sentence in text.split("\n"): print(sentence) sentence = sentence.strip() if not sentence: continue gold = chunk.tagstr2tree(sentence) tokens = gold.leaves() test = chunkparser.parse(Tree("S", tokens), trace=1) chunkscore.score(gold, test) print() print("/" + ("=" * 75) + "\\") print("Scoring", chunkparser) print("-" * 77) print("Precision: %5.1f%%" % (chunkscore.precision() * 100), " " * 4, end=" ") print("Recall: %5.1f%%" % (chunkscore.recall() * 100), " " * 6, end=" ") print("F-Measure: %5.1f%%" % (chunkscore.f_measure() * 100)) # Missed chunks. if chunkscore.missed(): print("Missed:") missed = chunkscore.missed() for chunk in missed[:10]: print(" ", " ".join(map(str, chunk))) if len(chunkscore.missed()) > 10: print(" ...") # Incorrect chunks. if chunkscore.incorrect(): print("Incorrect:") incorrect = chunkscore.incorrect() for chunk in incorrect[:10]: print(" ", " ".join(map(str, chunk))) if len(chunkscore.incorrect()) > 10: print(" ...") print("\\" + ("=" * 75) + "/") print() def demo(): """ A demonstration for the ``RegexpChunkParser`` class. A single text is parsed with four different chunk parsers, using a variety of rules and strategies. """ from nltk import Tree, chunk text = """\ [ the/DT little/JJ cat/NN ] sat/VBD on/IN [ the/DT mat/NN ] ./. [ John/NNP ] saw/VBD [the/DT cats/NNS] [the/DT dog/NN] chased/VBD ./. [ John/NNP ] thinks/VBZ [ Mary/NN ] saw/VBD [ the/DT cat/NN ] sit/VB on/IN [ the/DT mat/NN ]./. 
""" print("*" * 75) print("Evaluation text:") print(text) print("*" * 75) print() grammar = r""" NP: # NP stage {
    ?*} # chunk determiners, adjectives and nouns {+} # chunk proper nouns """ cp = chunk.RegexpParser(grammar) demo_eval(cp, text) grammar = r""" NP: {<.*>} # start by chunking each tag }<[\.VI].*>+{ # unchunk any verbs, prepositions or periods {} # merge det/adj with nouns """ cp = chunk.RegexpParser(grammar) demo_eval(cp, text) grammar = r""" NP: {
    ?*} # chunk determiners, adjectives and nouns VP: {?} # VP = verb words """ cp = chunk.RegexpParser(grammar) demo_eval(cp, text) grammar = r""" NP: {<.*>*} # start by chunking everything }<[\.VI].*>+{ # strip any verbs, prepositions or periods <.*>}{
    # separate on determiners PP: {} # PP = preposition + noun phrase VP: {*} # VP = verb words + NPs and PPs """ cp = chunk.RegexpParser(grammar) demo_eval(cp, text) # Evaluation from nltk.corpus import conll2000 print() print("Demonstration of empty grammar:") cp = chunk.RegexpParser("") print(chunk.accuracy(cp, conll2000.chunked_sents("test.txt", chunk_types=("NP",)))) print() print("Demonstration of accuracy evaluation using CoNLL tags:") grammar = r""" NP: {<.*>} # start by chunking each tag }<[\.VI].*>+{ # unchunk any verbs, prepositions or periods {} # merge det/adj with nouns """ cp = chunk.RegexpParser(grammar) print(chunk.accuracy(cp, conll2000.chunked_sents("test.txt")[:5])) print() print("Demonstration of tagged token input") grammar = r""" NP: {<.*>*} # start by chunking everything }<[\.VI].*>+{ # strip any verbs, prepositions or periods <.*>}{
    # separate on determiners PP: {} # PP = preposition + noun phrase VP: {*} # VP = verb words + NPs and PPs """ cp = chunk.RegexpParser(grammar) print( cp.parse( [ ("the", "DT"), ("little", "JJ"), ("cat", "NN"), ("sat", "VBD"), ("on", "IN"), ("the", "DT"), ("mat", "NN"), (".", "."), ] ) ) if __name__ == "__main__": demo() nltk-3.7/nltk/chunk/util.py000066400000000000000000000502741420073152400157500ustar00rootroot00000000000000# Natural Language Toolkit: Chunk format conversions # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # Steven Bird (minor additions) # URL: # For license information, see LICENSE.TXT import re from nltk.metrics import accuracy as _accuracy from nltk.tag.mapping import map_tag from nltk.tag.util import str2tuple from nltk.tree import Tree ##////////////////////////////////////////////////////// ## EVALUATION ##////////////////////////////////////////////////////// def accuracy(chunker, gold): """ Score the accuracy of the chunker against the gold standard. Strip the chunk information from the gold standard and rechunk it using the chunker, then compute the accuracy score. :type chunker: ChunkParserI :param chunker: The chunker being evaluated. :type gold: tree :param gold: The chunk structures to score the chunker on. :rtype: float """ gold_tags = [] test_tags = [] for gold_tree in gold: test_tree = chunker.parse(gold_tree.flatten()) gold_tags += tree2conlltags(gold_tree) test_tags += tree2conlltags(test_tree) # print 'GOLD:', gold_tags[:50] # print 'TEST:', test_tags[:50] return _accuracy(gold_tags, test_tags) # Patched for increased performance by Yoav Goldberg , 2006-01-13 # -- statistics are evaluated only on demand, instead of at every sentence evaluation # # SB: use nltk.metrics for precision/recall scoring? # class ChunkScore: """ A utility class for scoring chunk parsers. ``ChunkScore`` can evaluate a chunk parser's output, based on a number of statistics (precision, recall, f-measure, misssed chunks, incorrect chunks). It can also combine the scores from the parsing of multiple texts; this makes it significantly easier to evaluate a chunk parser that operates one sentence at a time. Texts are evaluated with the ``score`` method. The results of evaluation can be accessed via a number of accessor methods, such as ``precision`` and ``f_measure``. A typical use of the ``ChunkScore`` class is:: >>> chunkscore = ChunkScore() # doctest: +SKIP >>> for correct in correct_sentences: # doctest: +SKIP ... guess = chunkparser.parse(correct.leaves()) # doctest: +SKIP ... chunkscore.score(correct, guess) # doctest: +SKIP >>> print('F Measure:', chunkscore.f_measure()) # doctest: +SKIP F Measure: 0.823 :ivar kwargs: Keyword arguments: - max_tp_examples: The maximum number actual examples of true positives to record. This affects the ``correct`` member function: ``correct`` will not return more than this number of true positive examples. This does *not* affect any of the numerical metrics (precision, recall, or f-measure) - max_fp_examples: The maximum number actual examples of false positives to record. This affects the ``incorrect`` member function and the ``guessed`` member function: ``incorrect`` will not return more than this number of examples, and ``guessed`` will not return more than this number of true positive examples. This does *not* affect any of the numerical metrics (precision, recall, or f-measure) - max_fn_examples: The maximum number actual examples of false negatives to record. 
This affects the ``missed`` member function and the ``correct`` member function: ``missed`` will not return more than this number of examples, and ``correct`` will not return more than this number of true negative examples. This does *not* affect any of the numerical metrics (precision, recall, or f-measure) - chunk_label: A regular expression indicating which chunks should be compared. Defaults to ``'.*'`` (i.e., all chunks). :type _tp: list(Token) :ivar _tp: List of true positives :type _fp: list(Token) :ivar _fp: List of false positives :type _fn: list(Token) :ivar _fn: List of false negatives :type _tp_num: int :ivar _tp_num: Number of true positives :type _fp_num: int :ivar _fp_num: Number of false positives :type _fn_num: int :ivar _fn_num: Number of false negatives. """ def __init__(self, **kwargs): self._correct = set() self._guessed = set() self._tp = set() self._fp = set() self._fn = set() self._max_tp = kwargs.get("max_tp_examples", 100) self._max_fp = kwargs.get("max_fp_examples", 100) self._max_fn = kwargs.get("max_fn_examples", 100) self._chunk_label = kwargs.get("chunk_label", ".*") self._tp_num = 0 self._fp_num = 0 self._fn_num = 0 self._count = 0 self._tags_correct = 0.0 self._tags_total = 0.0 self._measuresNeedUpdate = False def _updateMeasures(self): if self._measuresNeedUpdate: self._tp = self._guessed & self._correct self._fn = self._correct - self._guessed self._fp = self._guessed - self._correct self._tp_num = len(self._tp) self._fp_num = len(self._fp) self._fn_num = len(self._fn) self._measuresNeedUpdate = False def score(self, correct, guessed): """ Given a correctly chunked sentence, score another chunked version of the same sentence. :type correct: chunk structure :param correct: The known-correct ("gold standard") chunked sentence. :type guessed: chunk structure :param guessed: The chunked sentence to be scored. """ self._correct |= _chunksets(correct, self._count, self._chunk_label) self._guessed |= _chunksets(guessed, self._count, self._chunk_label) self._count += 1 self._measuresNeedUpdate = True # Keep track of per-tag accuracy (if possible) try: correct_tags = tree2conlltags(correct) guessed_tags = tree2conlltags(guessed) except ValueError: # This exception case is for nested chunk structures, # where tree2conlltags will fail with a ValueError: "Tree # is too deeply nested to be printed in CoNLL format." correct_tags = guessed_tags = () self._tags_total += len(correct_tags) self._tags_correct += sum( 1 for (t, g) in zip(guessed_tags, correct_tags) if t == g ) def accuracy(self): """ Return the overall tag-based accuracy for all text that have been scored by this ``ChunkScore``, using the IOB (conll2000) tag encoding. :rtype: float """ if self._tags_total == 0: return 1 return self._tags_correct / self._tags_total def precision(self): """ Return the overall precision for all texts that have been scored by this ``ChunkScore``. :rtype: float """ self._updateMeasures() div = self._tp_num + self._fp_num if div == 0: return 0 else: return self._tp_num / div def recall(self): """ Return the overall recall for all texts that have been scored by this ``ChunkScore``. :rtype: float """ self._updateMeasures() div = self._tp_num + self._fn_num if div == 0: return 0 else: return self._tp_num / div def f_measure(self, alpha=0.5): """ Return the overall F measure for all texts that have been scored by this ``ChunkScore``. :param alpha: the relative weighting of precision and recall. 
Larger alpha biases the score towards the precision value, while smaller alpha biases the score towards the recall value. ``alpha`` should have a value in the range [0,1]. :type alpha: float :rtype: float """ self._updateMeasures() p = self.precision() r = self.recall() if p == 0 or r == 0: # what if alpha is 0 or 1? return 0 return 1 / (alpha / p + (1 - alpha) / r) def missed(self): """ Return the chunks which were included in the correct chunk structures, but not in the guessed chunk structures, listed in input order. :rtype: list of chunks """ self._updateMeasures() chunks = list(self._fn) return [c[1] for c in chunks] # discard position information def incorrect(self): """ Return the chunks which were included in the guessed chunk structures, but not in the correct chunk structures, listed in input order. :rtype: list of chunks """ self._updateMeasures() chunks = list(self._fp) return [c[1] for c in chunks] # discard position information def correct(self): """ Return the chunks which were included in the correct chunk structures, listed in input order. :rtype: list of chunks """ chunks = list(self._correct) return [c[1] for c in chunks] # discard position information def guessed(self): """ Return the chunks which were included in the guessed chunk structures, listed in input order. :rtype: list of chunks """ chunks = list(self._guessed) return [c[1] for c in chunks] # discard position information def __len__(self): self._updateMeasures() return self._tp_num + self._fn_num def __repr__(self): """ Return a concise representation of this ``ChunkScoring``. :rtype: str """ return "" def __str__(self): """ Return a verbose representation of this ``ChunkScoring``. This representation includes the precision, recall, and f-measure scores. For other information about the score, use the accessor methods (e.g., ``missed()`` and ``incorrect()``). :rtype: str """ return ( "ChunkParse score:\n" + (f" IOB Accuracy: {self.accuracy() * 100:5.1f}%%\n") + (f" Precision: {self.precision() * 100:5.1f}%%\n") + (f" Recall: {self.recall() * 100:5.1f}%%\n") + (f" F-Measure: {self.f_measure() * 100:5.1f}%%") ) # extract chunks, and assign unique id, the absolute position of # the first word of the chunk def _chunksets(t, count, chunk_label): pos = 0 chunks = [] for child in t: if isinstance(child, Tree): if re.match(chunk_label, child.label()): chunks.append(((count, pos), child.freeze())) pos += len(child.leaves()) else: pos += 1 return set(chunks) def tagstr2tree( s, chunk_label="NP", root_label="S", sep="/", source_tagset=None, target_tagset=None ): """ Divide a string of bracketted tagged text into chunks and unchunked tokens, and produce a Tree. Chunks are marked by square brackets (``[...]``). Words are delimited by whitespace, and each word should have the form ``text/tag``. Words that do not contain a slash are assigned a ``tag`` of None. 
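
    For example (an illustrative conversion; the ``Tree`` repr shown is
    approximate):

        >>> tagstr2tree("[ the/DT cat/NN ] sat/VBD on/IN [ the/DT mat/NN ]")  # doctest: +SKIP
        Tree('S', [Tree('NP', [('the', 'DT'), ('cat', 'NN')]), ('sat', 'VBD'),
        ('on', 'IN'), Tree('NP', [('the', 'DT'), ('mat', 'NN')])])
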
:param s: The string to be converted :type s: str :param chunk_label: The label to use for chunk nodes :type chunk_label: str :param root_label: The label to use for the root of the tree :type root_label: str :rtype: Tree """ WORD_OR_BRACKET = re.compile(r"\[|\]|[^\[\]\s]+") stack = [Tree(root_label, [])] for match in WORD_OR_BRACKET.finditer(s): text = match.group() if text[0] == "[": if len(stack) != 1: raise ValueError(f"Unexpected [ at char {match.start():d}") chunk = Tree(chunk_label, []) stack[-1].append(chunk) stack.append(chunk) elif text[0] == "]": if len(stack) != 2: raise ValueError(f"Unexpected ] at char {match.start():d}") stack.pop() else: if sep is None: stack[-1].append(text) else: word, tag = str2tuple(text, sep) if source_tagset and target_tagset: tag = map_tag(source_tagset, target_tagset, tag) stack[-1].append((word, tag)) if len(stack) != 1: raise ValueError(f"Expected ] at char {len(s):d}") return stack[0] ### CONLL _LINE_RE = re.compile(r"(\S+)\s+(\S+)\s+([IOB])-?(\S+)?") def conllstr2tree(s, chunk_types=("NP", "PP", "VP"), root_label="S"): """ Return a chunk structure for a single sentence encoded in the given CONLL 2000 style string. This function converts a CoNLL IOB string into a tree. It uses the specified chunk types (defaults to NP, PP and VP), and creates a tree rooted at a node labeled S (by default). :param s: The CoNLL string to be converted. :type s: str :param chunk_types: The chunk types to be converted. :type chunk_types: tuple :param root_label: The node label to use for the root. :type root_label: str :rtype: Tree """ stack = [Tree(root_label, [])] for lineno, line in enumerate(s.split("\n")): if not line.strip(): continue # Decode the line. match = _LINE_RE.match(line) if match is None: raise ValueError(f"Error on line {lineno:d}") (word, tag, state, chunk_type) = match.groups() # If it's a chunk type we don't care about, treat it as O. if chunk_types is not None and chunk_type not in chunk_types: state = "O" # For "Begin"/"Outside", finish any completed chunks - # also do so for "Inside" which don't match the previous token. mismatch_I = state == "I" and chunk_type != stack[-1].label() if state in "BO" or mismatch_I: if len(stack) == 2: stack.pop() # For "Begin", start a new chunk. if state == "B" or mismatch_I: chunk = Tree(chunk_type, []) stack[-1].append(chunk) stack.append(chunk) # Add the new word token. stack[-1].append((word, tag)) return stack[0] def tree2conlltags(t): """ Return a list of 3-tuples containing ``(word, tag, IOB-tag)``. Convert a tree to the CoNLL IOB tag format. :param t: The tree to be converted. :type t: Tree :rtype: list(tuple) """ tags = [] for child in t: try: category = child.label() prefix = "B-" for contents in child: if isinstance(contents, Tree): raise ValueError( "Tree is too deeply nested to be printed in CoNLL format" ) tags.append((contents[0], contents[1], prefix + category)) prefix = "I-" except AttributeError: tags.append((child[0], child[1], "O")) return tags def conlltags2tree( sentence, chunk_types=("NP", "PP", "VP"), root_label="S", strict=False ): """ Convert the CoNLL IOB format to a tree. 
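
    For example, a flat IOB-tagged token sequence is converted into a
    shallow tree (illustrative; the output repr is approximate):

        >>> conlltags2tree([('the', 'DT', 'B-NP'), ('cat', 'NN', 'I-NP'),
        ...                 ('sat', 'VBD', 'O')])  # doctest: +SKIP
        Tree('S', [Tree('NP', [('the', 'DT'), ('cat', 'NN')]), ('sat', 'VBD')])
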
""" tree = Tree(root_label, []) for (word, postag, chunktag) in sentence: if chunktag is None: if strict: raise ValueError("Bad conll tag sequence") else: # Treat as O tree.append((word, postag)) elif chunktag.startswith("B-"): tree.append(Tree(chunktag[2:], [(word, postag)])) elif chunktag.startswith("I-"): if ( len(tree) == 0 or not isinstance(tree[-1], Tree) or tree[-1].label() != chunktag[2:] ): if strict: raise ValueError("Bad conll tag sequence") else: # Treat as B-* tree.append(Tree(chunktag[2:], [(word, postag)])) else: tree[-1].append((word, postag)) elif chunktag == "O": tree.append((word, postag)) else: raise ValueError(f"Bad conll tag {chunktag!r}") return tree def tree2conllstr(t): """ Return a multiline string where each line contains a word, tag and IOB tag. Convert a tree to the CoNLL IOB string format :param t: The tree to be converted. :type t: Tree :rtype: str """ lines = [" ".join(token) for token in tree2conlltags(t)] return "\n".join(lines) ### IEER _IEER_DOC_RE = re.compile( r"\s*" r"(\s*(?P.+?)\s*\s*)?" r"(\s*(?P.+?)\s*\s*)?" r"(\s*(?P.+?)\s*\s*)?" r"\s*" r"(\s*(?P.+?)\s*\s*)?" r"(?P.*?)\s*" r"\s*\s*", re.DOTALL, ) _IEER_TYPE_RE = re.compile(r']*?type="(?P\w+)"') def _ieer_read_text(s, root_label): stack = [Tree(root_label, [])] # s will be None if there is no headline in the text # return the empty list in place of a Tree if s is None: return [] for piece_m in re.finditer(r"<[^>]+>|[^\s<]+", s): piece = piece_m.group() try: if piece.startswith(".... m = _IEER_DOC_RE.match(s) if m: return { "text": _ieer_read_text(m.group("text"), root_label), "docno": m.group("docno"), "doctype": m.group("doctype"), "date_time": m.group("date_time"), #'headline': m.group('headline') # we want to capture NEs in the headline too! "headline": _ieer_read_text(m.group("headline"), root_label), } else: return _ieer_read_text(s, root_label) def demo(): s = "[ Pierre/NNP Vinken/NNP ] ,/, [ 61/CD years/NNS ] old/JJ ,/, will/MD join/VB [ the/DT board/NN ] ./." import nltk t = nltk.chunk.tagstr2tree(s, chunk_label="NP") t.pprint() print() s = """ These DT B-NP research NN I-NP protocols NNS I-NP offer VBP B-VP to TO B-PP the DT B-NP patient NN I-NP not RB O only RB O the DT B-NP very RB I-NP best JJS I-NP therapy NN I-NP which WDT B-NP we PRP B-NP have VBP B-VP established VBN I-VP today NN B-NP but CC B-NP also RB I-NP the DT B-NP hope NN I-NP of IN B-PP something NN B-NP still RB B-ADJP better JJR I-ADJP . . O """ conll_tree = conllstr2tree(s, chunk_types=("NP", "PP")) conll_tree.pprint() # Demonstrate CoNLL output print("CoNLL output:") print(nltk.chunk.tree2conllstr(conll_tree)) print() if __name__ == "__main__": demo() nltk-3.7/nltk/classify/000077500000000000000000000000001420073152400151165ustar00rootroot00000000000000nltk-3.7/nltk/classify/__init__.py000066400000000000000000000106171420073152400172340ustar00rootroot00000000000000# Natural Language Toolkit: Classifiers # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # URL: # For license information, see LICENSE.TXT """ Classes and interfaces for labeling tokens with category labels (or "class labels"). Typically, labels are represented with strings (such as ``'health'`` or ``'sports'``). Classifiers can be used to perform a wide range of classification tasks. For example, classifiers can be used... 
- to classify documents by topic - to classify ambiguous words by which word sense is intended - to classify acoustic signals by which phoneme they represent - to classify sentences by their author Features ======== In order to decide which category label is appropriate for a given token, classifiers examine one or more 'features' of the token. These "features" are typically chosen by hand, and indicate which aspects of the token are relevant to the classification decision. For example, a document classifier might use a separate feature for each word, recording how often that word occurred in the document. Featuresets =========== The features describing a token are encoded using a "featureset", which is a dictionary that maps from "feature names" to "feature values". Feature names are unique strings that indicate what aspect of the token is encoded by the feature. Examples include ``'prevword'``, for a feature whose value is the previous word; and ``'contains-word(library)'`` for a feature that is true when a document contains the word ``'library'``. Feature values are typically booleans, numbers, or strings, depending on which feature they describe. Featuresets are typically constructed using a "feature detector" (also known as a "feature extractor"). A feature detector is a function that takes a token (and sometimes information about its context) as its input, and returns a featureset describing that token. For example, the following feature detector converts a document (stored as a list of words) to a featureset describing the set of words included in the document: >>> # Define a feature detector function. >>> def document_features(document): ... return dict([('contains-word(%s)' % w, True) for w in document]) Feature detectors are typically applied to each token before it is fed to the classifier: >>> # Classify each Gutenberg document. >>> from nltk.corpus import gutenberg >>> for fileid in gutenberg.fileids(): # doctest: +SKIP ... doc = gutenberg.words(fileid) # doctest: +SKIP ... print(fileid, classifier.classify(document_features(doc))) # doctest: +SKIP The parameters that a feature detector expects will vary, depending on the task and the needs of the feature detector. For example, a feature detector for word sense disambiguation (WSD) might take as its input a sentence, and the index of a word that should be classified, and return a featureset for that word. The following feature detector for WSD includes features describing the left and right contexts of the target word: >>> def wsd_features(sentence, index): ... featureset = {} ... for i in range(max(0, index-3), index): ... featureset['left-context(%s)' % sentence[i]] = True ... for i in range(index, max(index+3, len(sentence))): ... featureset['right-context(%s)' % sentence[i]] = True ... return featureset Training Classifiers ==================== Most classifiers are built by training them on a list of hand-labeled examples, known as the "training set". Training sets are represented as lists of ``(featuredict, label)`` tuples. 
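
For example, a simple classifier can be trained and applied as follows
(an illustrative sketch; the feature names and labels are arbitrary):

    >>> from nltk.classify import NaiveBayesClassifier
    >>> train_set = [({'contains-word(lol)': True}, 'informal'),
    ...              ({'contains-word(sincerely)': True}, 'formal')]
    >>> classifier = NaiveBayesClassifier.train(train_set)  # doctest: +SKIP
    >>> classifier.classify({'contains-word(lol)': True})  # doctest: +SKIP
    'informal'
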
""" from nltk.classify.api import ClassifierI, MultiClassifierI from nltk.classify.decisiontree import DecisionTreeClassifier from nltk.classify.maxent import ( BinaryMaxentFeatureEncoding, ConditionalExponentialClassifier, MaxentClassifier, TypedMaxentFeatureEncoding, ) from nltk.classify.megam import call_megam, config_megam from nltk.classify.naivebayes import NaiveBayesClassifier from nltk.classify.positivenaivebayes import PositiveNaiveBayesClassifier from nltk.classify.rte_classify import RTEFeatureExtractor, rte_classifier, rte_features from nltk.classify.scikitlearn import SklearnClassifier from nltk.classify.senna import Senna from nltk.classify.textcat import TextCat from nltk.classify.util import accuracy, apply_features, log_likelihood from nltk.classify.weka import WekaClassifier, config_weka nltk-3.7/nltk/classify/api.py000066400000000000000000000144361420073152400162510ustar00rootroot00000000000000# Natural Language Toolkit: Classifier Interface # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # Steven Bird (minor additions) # URL: # For license information, see LICENSE.TXT """ Interfaces for labeling tokens with category labels (or "class labels"). ``ClassifierI`` is a standard interface for "single-category classification", in which the set of categories is known, the number of categories is finite, and each text belongs to exactly one category. ``MultiClassifierI`` is a standard interface for "multi-category classification", which is like single-category classification except that each text belongs to zero or more categories. """ from nltk.internals import overridden ##////////////////////////////////////////////////////// # { Classification Interfaces ##////////////////////////////////////////////////////// class ClassifierI: """ A processing interface for labeling tokens with a single category label (or "class"). Labels are typically strs or ints, but can be any immutable type. The set of labels that the classifier chooses from must be fixed and finite. Subclasses must define: - ``labels()`` - either ``classify()`` or ``classify_many()`` (or both) Subclasses may define: - either ``prob_classify()`` or ``prob_classify_many()`` (or both) """ def labels(self): """ :return: the list of category labels used by this classifier. :rtype: list of (immutable) """ raise NotImplementedError() def classify(self, featureset): """ :return: the most appropriate label for the given featureset. :rtype: label """ if overridden(self.classify_many): return self.classify_many([featureset])[0] else: raise NotImplementedError() def prob_classify(self, featureset): """ :return: a probability distribution over labels for the given featureset. :rtype: ProbDistI """ if overridden(self.prob_classify_many): return self.prob_classify_many([featureset])[0] else: raise NotImplementedError() def classify_many(self, featuresets): """ Apply ``self.classify()`` to each element of ``featuresets``. I.e.: return [self.classify(fs) for fs in featuresets] :rtype: list(label) """ return [self.classify(fs) for fs in featuresets] def prob_classify_many(self, featuresets): """ Apply ``self.prob_classify()`` to each element of ``featuresets``. I.e.: return [self.prob_classify(fs) for fs in featuresets] :rtype: list(ProbDistI) """ return [self.prob_classify(fs) for fs in featuresets] class MultiClassifierI: """ A processing interface for labeling tokens with zero or more category labels (or "labels"). Labels are typically strs or ints, but can be any immutable type. 
The set of labels that the multi-classifier chooses from must be fixed and finite. Subclasses must define: - ``labels()`` - either ``classify()`` or ``classify_many()`` (or both) Subclasses may define: - either ``prob_classify()`` or ``prob_classify_many()`` (or both) """ def labels(self): """ :return: the list of category labels used by this classifier. :rtype: list of (immutable) """ raise NotImplementedError() def classify(self, featureset): """ :return: the most appropriate set of labels for the given featureset. :rtype: set(label) """ if overridden(self.classify_many): return self.classify_many([featureset])[0] else: raise NotImplementedError() def prob_classify(self, featureset): """ :return: a probability distribution over sets of labels for the given featureset. :rtype: ProbDistI """ if overridden(self.prob_classify_many): return self.prob_classify_many([featureset])[0] else: raise NotImplementedError() def classify_many(self, featuresets): """ Apply ``self.classify()`` to each element of ``featuresets``. I.e.: return [self.classify(fs) for fs in featuresets] :rtype: list(set(label)) """ return [self.classify(fs) for fs in featuresets] def prob_classify_many(self, featuresets): """ Apply ``self.prob_classify()`` to each element of ``featuresets``. I.e.: return [self.prob_classify(fs) for fs in featuresets] :rtype: list(ProbDistI) """ return [self.prob_classify(fs) for fs in featuresets] # # [XX] IN PROGRESS: # class SequenceClassifierI: # """ # A processing interface for labeling sequences of tokens with a # single category label (or "class"). Labels are typically # strs or ints, but can be any immutable type. The set # of labels that the classifier chooses from must be fixed and # finite. # """ # def labels(self): # """ # :return: the list of category labels used by this classifier. # :rtype: list of (immutable) # """ # raise NotImplementedError() # def prob_classify(self, featureset): # """ # Return a probability distribution over labels for the given # featureset. # If ``featureset`` is a list of featuresets, then return a # corresponding list containing the probability distribution # over labels for each of the given featuresets, where the # *i*\ th element of this list is the most appropriate label for # the *i*\ th element of ``featuresets``. # """ # raise NotImplementedError() # def classify(self, featureset): # """ # Return the most appropriate label for the given featureset. # If ``featureset`` is a list of featuresets, then return a # corresponding list containing the most appropriate label for # each of the given featuresets, where the *i*\ th element of # this list is the most appropriate label for the *i*\ th element # of ``featuresets``. # """ # raise NotImplementedError() nltk-3.7/nltk/classify/decisiontree.py000066400000000000000000000306761420073152400201610ustar00rootroot00000000000000# Natural Language Toolkit: Decision Tree Classifiers # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # URL: # For license information, see LICENSE.TXT """ A classifier model that decides which label to assign to a token on the basis of a tree structure, where branches correspond to conditions on feature values, and leaves correspond to label assignments. 
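
A minimal usage sketch (hypothetical feature names and labels; the exact
tree that is learned depends on the training data):

    >>> from nltk.classify import DecisionTreeClassifier
    >>> train = [({'size': 'big'}, 'elephant'),
    ...          ({'size': 'big'}, 'elephant'),
    ...          ({'size': 'small'}, 'mouse')]
    >>> classifier = DecisionTreeClassifier.train(train)  # doctest: +SKIP
    >>> classifier.classify({'size': 'big'})  # doctest: +SKIP
    'elephant'
    >>> print(classifier.pseudocode())  # doctest: +SKIP
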
""" from collections import defaultdict from nltk.classify.api import ClassifierI from nltk.probability import FreqDist, MLEProbDist, entropy class DecisionTreeClassifier(ClassifierI): def __init__(self, label, feature_name=None, decisions=None, default=None): """ :param label: The most likely label for tokens that reach this node in the decision tree. If this decision tree has no children, then this label will be assigned to any token that reaches this decision tree. :param feature_name: The name of the feature that this decision tree selects for. :param decisions: A dictionary mapping from feature values for the feature identified by ``feature_name`` to child decision trees. :param default: The child that will be used if the value of feature ``feature_name`` does not match any of the keys in ``decisions``. This is used when constructing binary decision trees. """ self._label = label self._fname = feature_name self._decisions = decisions self._default = default def labels(self): labels = [self._label] if self._decisions is not None: for dt in self._decisions.values(): labels.extend(dt.labels()) if self._default is not None: labels.extend(self._default.labels()) return list(set(labels)) def classify(self, featureset): # Decision leaf: if self._fname is None: return self._label # Decision tree: fval = featureset.get(self._fname) if fval in self._decisions: return self._decisions[fval].classify(featureset) elif self._default is not None: return self._default.classify(featureset) else: return self._label def error(self, labeled_featuresets): errors = 0 for featureset, label in labeled_featuresets: if self.classify(featureset) != label: errors += 1 return errors / len(labeled_featuresets) def pretty_format(self, width=70, prefix="", depth=4): """ Return a string containing a pretty-printed version of this decision tree. Each line in this string corresponds to a single decision tree node or leaf, and indentation is used to display the structure of the decision tree. """ # [xx] display default!! if self._fname is None: n = width - len(prefix) - 15 return "{}{} {}\n".format(prefix, "." * n, self._label) s = "" for i, (fval, result) in enumerate( sorted( self._decisions.items(), key=lambda item: (item[0] in [None, False, True], str(item[0]).lower()), ) ): hdr = f"{prefix}{self._fname}={fval}? " n = width - 15 - len(hdr) s += "{}{} {}\n".format(hdr, "." * (n), result._label) if result._fname is not None and depth > 1: s += result.pretty_format(width, prefix + " ", depth - 1) if self._default is not None: n = width - len(prefix) - 21 s += "{}else: {} {}\n".format(prefix, "." * n, self._default._label) if self._default._fname is not None and depth > 1: s += self._default.pretty_format(width, prefix + " ", depth - 1) return s def pseudocode(self, prefix="", depth=4): """ Return a string representation of this decision tree that expresses the decisions it makes as a nested set of pseudocode if statements. 
""" if self._fname is None: return f"{prefix}return {self._label!r}\n" s = "" for (fval, result) in sorted( self._decisions.items(), key=lambda item: (item[0] in [None, False, True], str(item[0]).lower()), ): s += f"{prefix}if {self._fname} == {fval!r}: " if result._fname is not None and depth > 1: s += "\n" + result.pseudocode(prefix + " ", depth - 1) else: s += f"return {result._label!r}\n" if self._default is not None: if len(self._decisions) == 1: s += "{}if {} != {!r}: ".format( prefix, self._fname, list(self._decisions.keys())[0] ) else: s += f"{prefix}else: " if self._default._fname is not None and depth > 1: s += "\n" + self._default.pseudocode(prefix + " ", depth - 1) else: s += f"return {self._default._label!r}\n" return s def __str__(self): return self.pretty_format() @staticmethod def train( labeled_featuresets, entropy_cutoff=0.05, depth_cutoff=100, support_cutoff=10, binary=False, feature_values=None, verbose=False, ): """ :param binary: If true, then treat all feature/value pairs as individual binary features, rather than using a single n-way branch for each feature. """ # Collect a list of all feature names. feature_names = set() for featureset, label in labeled_featuresets: for fname in featureset: feature_names.add(fname) # Collect a list of the values each feature can take. if feature_values is None and binary: feature_values = defaultdict(set) for featureset, label in labeled_featuresets: for fname, fval in featureset.items(): feature_values[fname].add(fval) # Start with a stump. if not binary: tree = DecisionTreeClassifier.best_stump( feature_names, labeled_featuresets, verbose ) else: tree = DecisionTreeClassifier.best_binary_stump( feature_names, labeled_featuresets, feature_values, verbose ) # Refine the stump. tree.refine( labeled_featuresets, entropy_cutoff, depth_cutoff - 1, support_cutoff, binary, feature_values, verbose, ) # Return it return tree @staticmethod def leaf(labeled_featuresets): label = FreqDist(label for (featureset, label) in labeled_featuresets).max() return DecisionTreeClassifier(label) @staticmethod def stump(feature_name, labeled_featuresets): label = FreqDist(label for (featureset, label) in labeled_featuresets).max() # Find the best label for each value. 
freqs = defaultdict(FreqDist) # freq(label|value) for featureset, label in labeled_featuresets: feature_value = featureset.get(feature_name) freqs[feature_value][label] += 1 decisions = {val: DecisionTreeClassifier(freqs[val].max()) for val in freqs} return DecisionTreeClassifier(label, feature_name, decisions) def refine( self, labeled_featuresets, entropy_cutoff, depth_cutoff, support_cutoff, binary=False, feature_values=None, verbose=False, ): if len(labeled_featuresets) <= support_cutoff: return if self._fname is None: return if depth_cutoff <= 0: return for fval in self._decisions: fval_featuresets = [ (featureset, label) for (featureset, label) in labeled_featuresets if featureset.get(self._fname) == fval ] label_freqs = FreqDist(label for (featureset, label) in fval_featuresets) if entropy(MLEProbDist(label_freqs)) > entropy_cutoff: self._decisions[fval] = DecisionTreeClassifier.train( fval_featuresets, entropy_cutoff, depth_cutoff, support_cutoff, binary, feature_values, verbose, ) if self._default is not None: default_featuresets = [ (featureset, label) for (featureset, label) in labeled_featuresets if featureset.get(self._fname) not in self._decisions ] label_freqs = FreqDist(label for (featureset, label) in default_featuresets) if entropy(MLEProbDist(label_freqs)) > entropy_cutoff: self._default = DecisionTreeClassifier.train( default_featuresets, entropy_cutoff, depth_cutoff, support_cutoff, binary, feature_values, verbose, ) @staticmethod def best_stump(feature_names, labeled_featuresets, verbose=False): best_stump = DecisionTreeClassifier.leaf(labeled_featuresets) best_error = best_stump.error(labeled_featuresets) for fname in feature_names: stump = DecisionTreeClassifier.stump(fname, labeled_featuresets) stump_error = stump.error(labeled_featuresets) if stump_error < best_error: best_error = stump_error best_stump = stump if verbose: print( "best stump for {:6d} toks uses {:20} err={:6.4f}".format( len(labeled_featuresets), best_stump._fname, best_error ) ) return best_stump @staticmethod def binary_stump(feature_name, feature_value, labeled_featuresets): label = FreqDist(label for (featureset, label) in labeled_featuresets).max() # Find the best label for each value. pos_fdist = FreqDist() neg_fdist = FreqDist() for featureset, label in labeled_featuresets: if featureset.get(feature_name) == feature_value: pos_fdist[label] += 1 else: neg_fdist[label] += 1 decisions = {} default = label # But hopefully we have observations! 
if pos_fdist.N() > 0: decisions = {feature_value: DecisionTreeClassifier(pos_fdist.max())} if neg_fdist.N() > 0: default = DecisionTreeClassifier(neg_fdist.max()) return DecisionTreeClassifier(label, feature_name, decisions, default) @staticmethod def best_binary_stump( feature_names, labeled_featuresets, feature_values, verbose=False ): best_stump = DecisionTreeClassifier.leaf(labeled_featuresets) best_error = best_stump.error(labeled_featuresets) for fname in feature_names: for fval in feature_values[fname]: stump = DecisionTreeClassifier.binary_stump( fname, fval, labeled_featuresets ) stump_error = stump.error(labeled_featuresets) if stump_error < best_error: best_error = stump_error best_stump = stump if verbose: if best_stump._decisions: descr = "{}={}".format( best_stump._fname, list(best_stump._decisions.keys())[0] ) else: descr = "(default)" print( "best stump for {:6d} toks uses {:20} err={:6.4f}".format( len(labeled_featuresets), descr, best_error ) ) return best_stump ##////////////////////////////////////////////////////// ## Demo ##////////////////////////////////////////////////////// def f(x): return DecisionTreeClassifier.train(x, binary=True, verbose=True) def demo(): from nltk.classify.util import binary_names_demo_features, names_demo classifier = names_demo( f, binary_names_demo_features # DecisionTreeClassifier.train, ) print(classifier.pretty_format(depth=7)) print(classifier.pseudocode(depth=7)) if __name__ == "__main__": demo() nltk-3.7/nltk/classify/maxent.py000066400000000000000000001637361420073152400170040ustar00rootroot00000000000000# Natural Language Toolkit: Maximum Entropy Classifiers # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # Dmitry Chichkov (TypedMaxentFeatureEncoding) # URL: # For license information, see LICENSE.TXT """ A classifier model based on maximum entropy modeling framework. This framework considers all of the probability distributions that are empirically consistent with the training data; and chooses the distribution with the highest entropy. A probability distribution is "empirically consistent" with a set of training data if its estimated frequency with which a class and a feature vector value co-occur is equal to the actual frequency in the data. Terminology: 'feature' ====================== The term *feature* is usually used to refer to some property of an unlabeled token. For example, when performing word sense disambiguation, we might define a ``'prevword'`` feature whose value is the word preceding the target word. However, in the context of maxent modeling, the term *feature* is typically used to refer to a property of a "labeled" token. In order to prevent confusion, we will introduce two distinct terms to disambiguate these two different concepts: - An "input-feature" is a property of an unlabeled token. - A "joint-feature" is a property of a labeled token. In the rest of the ``nltk.classify`` module, the term "features" is used to refer to what we will call "input-features" in this module. In literature that describes and discusses maximum entropy models, input-features are typically called "contexts", and joint-features are simply referred to as "features". Converting Input-Features to Joint-Features ------------------------------------------- In maximum entropy models, joint-features are required to have numeric values. 
Typically, each input-feature ``input_feat`` is mapped to a set of joint-features of the form: | joint_feat(token, label) = { 1 if input_feat(token) == feat_val | { and label == some_label | { | { 0 otherwise For all values of ``feat_val`` and ``some_label``. This mapping is performed by classes that implement the ``MaxentFeatureEncodingI`` interface. """ try: import numpy except ImportError: pass import os import tempfile from collections import defaultdict from nltk.classify.api import ClassifierI from nltk.classify.megam import call_megam, parse_megam_weights, write_megam_file from nltk.classify.tadm import call_tadm, parse_tadm_weights, write_tadm_file from nltk.classify.util import CutoffChecker, accuracy, log_likelihood from nltk.data import gzip_open_unicode from nltk.probability import DictionaryProbDist from nltk.util import OrderedDict __docformat__ = "epytext en" ###################################################################### # { Classifier Model ###################################################################### class MaxentClassifier(ClassifierI): """ A maximum entropy classifier (also known as a "conditional exponential classifier"). This classifier is parameterized by a set of "weights", which are used to combine the joint-features that are generated from a featureset by an "encoding". In particular, the encoding maps each ``(featureset, label)`` pair to a vector. The probability of each label is then computed using the following equation:: dotprod(weights, encode(fs,label)) prob(fs|label) = --------------------------------------------------- sum(dotprod(weights, encode(fs,l)) for l in labels) Where ``dotprod`` is the dot product:: dotprod(a,b) = sum(x*y for (x,y) in zip(a,b)) """ def __init__(self, encoding, weights, logarithmic=True): """ Construct a new maxent classifier model. Typically, new classifier models are created using the ``train()`` method. :type encoding: MaxentFeatureEncodingI :param encoding: An encoding that is used to convert the featuresets that are given to the ``classify`` method into joint-feature vectors, which are used by the maxent classifier model. :type weights: list of float :param weights: The feature weight vector for this classifier. :type logarithmic: bool :param logarithmic: If false, then use non-logarithmic weights. """ self._encoding = encoding self._weights = weights self._logarithmic = logarithmic # self._logarithmic = False assert encoding.length() == len(weights) def labels(self): return self._encoding.labels() def set_weights(self, new_weights): """ Set the feature weight vector for this classifier. :param new_weights: The new feature weight vector. :type new_weights: list of float """ self._weights = new_weights assert self._encoding.length() == len(new_weights) def weights(self): """ :return: The feature weight vector for this classifier. 
:rtype: list of float """ return self._weights def classify(self, featureset): return self.prob_classify(featureset).max() def prob_classify(self, featureset): prob_dict = {} for label in self._encoding.labels(): feature_vector = self._encoding.encode(featureset, label) if self._logarithmic: total = 0.0 for (f_id, f_val) in feature_vector: total += self._weights[f_id] * f_val prob_dict[label] = total else: prod = 1.0 for (f_id, f_val) in feature_vector: prod *= self._weights[f_id] ** f_val prob_dict[label] = prod # Normalize the dictionary to give a probability distribution return DictionaryProbDist(prob_dict, log=self._logarithmic, normalize=True) def explain(self, featureset, columns=4): """ Print a table showing the effect of each of the features in the given feature set, and how they combine to determine the probabilities of each label for that featureset. """ descr_width = 50 TEMPLATE = " %-" + str(descr_width - 2) + "s%s%8.3f" pdist = self.prob_classify(featureset) labels = sorted(pdist.samples(), key=pdist.prob, reverse=True) labels = labels[:columns] print( " Feature".ljust(descr_width) + "".join("%8s" % (("%s" % l)[:7]) for l in labels) ) print(" " + "-" * (descr_width - 2 + 8 * len(labels))) sums = defaultdict(int) for i, label in enumerate(labels): feature_vector = self._encoding.encode(featureset, label) feature_vector.sort( key=lambda fid__: abs(self._weights[fid__[0]]), reverse=True ) for (f_id, f_val) in feature_vector: if self._logarithmic: score = self._weights[f_id] * f_val else: score = self._weights[f_id] ** f_val descr = self._encoding.describe(f_id) descr = descr.split(" and label is ")[0] # hack descr += " (%s)" % f_val # hack if len(descr) > 47: descr = descr[:44] + "..." print(TEMPLATE % (descr, i * 8 * " ", score)) sums[label] += score print(" " + "-" * (descr_width - 1 + 8 * len(labels))) print( " TOTAL:".ljust(descr_width) + "".join("%8.3f" % sums[l] for l in labels) ) print( " PROBS:".ljust(descr_width) + "".join("%8.3f" % pdist.prob(l) for l in labels) ) def most_informative_features(self, n=10): """ Generates the ranked list of informative features from most to least. """ if hasattr(self, "_most_informative_features"): return self._most_informative_features[:n] else: self._most_informative_features = sorted( list(range(len(self._weights))), key=lambda fid: abs(self._weights[fid]), reverse=True, ) return self._most_informative_features[:n] def show_most_informative_features(self, n=10, show="all"): """ :param show: all, neg, or pos (for negative-only or positive-only) :type show: str :param n: The no. of top features :type n: int """ # Use None the full list of ranked features. fids = self.most_informative_features(None) if show == "pos": fids = [fid for fid in fids if self._weights[fid] > 0] elif show == "neg": fids = [fid for fid in fids if self._weights[fid] < 0] for fid in fids[:n]: print(f"{self._weights[fid]:8.3f} {self._encoding.describe(fid)}") def __repr__(self): return "" % ( len(self._encoding.labels()), self._encoding.length(), ) #: A list of the algorithm names that are accepted for the #: ``train()`` method's ``algorithm`` parameter. ALGORITHMS = ["GIS", "IIS", "MEGAM", "TADM"] @classmethod def train( cls, train_toks, algorithm=None, trace=3, encoding=None, labels=None, gaussian_prior_sigma=0, **cutoffs, ): """ Train a new maxent classifier based on the given corpus of training samples. This classifier will have its weights chosen to maximize entropy while remaining empirically consistent with the training corpus. 
:rtype: MaxentClassifier :return: The new maxent classifier :type train_toks: list :param train_toks: Training data, represented as a list of pairs, the first member of which is a featureset, and the second of which is a classification label. :type algorithm: str :param algorithm: A case-insensitive string, specifying which algorithm should be used to train the classifier. The following algorithms are currently available. - Iterative Scaling Methods: Generalized Iterative Scaling (``'GIS'``), Improved Iterative Scaling (``'IIS'``) - External Libraries (requiring megam): LM-BFGS algorithm, with training performed by Megam (``'megam'``) The default algorithm is ``'IIS'``. :type trace: int :param trace: The level of diagnostic tracing output to produce. Higher values produce more verbose output. :type encoding: MaxentFeatureEncodingI :param encoding: A feature encoding, used to convert featuresets into feature vectors. If none is specified, then a ``BinaryMaxentFeatureEncoding`` will be built based on the features that are attested in the training corpus. :type labels: list(str) :param labels: The set of possible labels. If none is given, then the set of all labels attested in the training data will be used instead. :param gaussian_prior_sigma: The sigma value for a gaussian prior on model weights. Currently, this is supported by ``megam``. For other algorithms, its value is ignored. :param cutoffs: Arguments specifying various conditions under which the training should be halted. (Some of the cutoff conditions are not supported by some algorithms.) - ``max_iter=v``: Terminate after ``v`` iterations. - ``min_ll=v``: Terminate after the negative average log-likelihood drops under ``v``. - ``min_lldelta=v``: Terminate if a single iteration improves log likelihood by less than ``v``. """ if algorithm is None: algorithm = "iis" for key in cutoffs: if key not in ( "max_iter", "min_ll", "min_lldelta", "max_acc", "min_accdelta", "count_cutoff", "norm", "explicit", "bernoulli", ): raise TypeError("Unexpected keyword arg %r" % key) algorithm = algorithm.lower() if algorithm == "iis": return train_maxent_classifier_with_iis( train_toks, trace, encoding, labels, **cutoffs ) elif algorithm == "gis": return train_maxent_classifier_with_gis( train_toks, trace, encoding, labels, **cutoffs ) elif algorithm == "megam": return train_maxent_classifier_with_megam( train_toks, trace, encoding, labels, gaussian_prior_sigma, **cutoffs ) elif algorithm == "tadm": kwargs = cutoffs kwargs["trace"] = trace kwargs["encoding"] = encoding kwargs["labels"] = labels kwargs["gaussian_prior_sigma"] = gaussian_prior_sigma return TadmMaxentClassifier.train(train_toks, **kwargs) else: raise ValueError("Unknown algorithm %s" % algorithm) #: Alias for MaxentClassifier. ConditionalExponentialClassifier = MaxentClassifier ###################################################################### # { Feature Encodings ###################################################################### class MaxentFeatureEncodingI: """ A mapping that converts a set of input-feature values to a vector of joint-feature values, given a label. This conversion is necessary to translate featuresets into a format that can be used by maximum entropy models. The set of joint-features used by a given encoding is fixed, and each index in the generated joint-feature vectors corresponds to a single joint-feature. The length of the generated joint-feature vectors is therefore constant (for a given encoding). 
Because the joint-feature vectors generated by ``MaxentFeatureEncodingI`` are typically very sparse, they are represented as a list of ``(index, value)`` tuples, specifying the value of each non-zero joint-feature. Feature encodings are generally created using the ``train()`` method, which generates an appropriate encoding based on the input-feature values and labels that are present in a given corpus. """ def encode(self, featureset, label): """ Given a (featureset, label) pair, return the corresponding vector of joint-feature values. This vector is represented as a list of ``(index, value)`` tuples, specifying the value of each non-zero joint-feature. :type featureset: dict :rtype: list(tuple(int, int)) """ raise NotImplementedError() def length(self): """ :return: The size of the fixed-length joint-feature vectors that are generated by this encoding. :rtype: int """ raise NotImplementedError() def labels(self): """ :return: A list of the \"known labels\" -- i.e., all labels ``l`` such that ``self.encode(fs,l)`` can be a nonzero joint-feature vector for some value of ``fs``. :rtype: list """ raise NotImplementedError() def describe(self, fid): """ :return: A string describing the value of the joint-feature whose index in the generated feature vectors is ``fid``. :rtype: str """ raise NotImplementedError() def train(cls, train_toks): """ Construct and return new feature encoding, based on a given training corpus ``train_toks``. :type train_toks: list(tuple(dict, str)) :param train_toks: Training data, represented as a list of pairs, the first member of which is a feature dictionary, and the second of which is a classification label. """ raise NotImplementedError() class FunctionBackedMaxentFeatureEncoding(MaxentFeatureEncodingI): """ A feature encoding that calls a user-supplied function to map a given featureset/label pair to a sparse joint-feature vector. """ def __init__(self, func, length, labels): """ Construct a new feature encoding based on the given function. :type func: (callable) :param func: A function that takes two arguments, a featureset and a label, and returns the sparse joint feature vector that encodes them:: func(featureset, label) -> feature_vector This sparse joint feature vector (``feature_vector``) is a list of ``(index,value)`` tuples. :type length: int :param length: The size of the fixed-length joint-feature vectors that are generated by this encoding. :type labels: list :param labels: A list of the \"known labels\" for this encoding -- i.e., all labels ``l`` such that ``self.encode(fs,l)`` can be a nonzero joint-feature vector for some value of ``fs``. """ self._length = length self._func = func self._labels = labels def encode(self, featureset, label): return self._func(featureset, label) def length(self): return self._length def labels(self): return self._labels def describe(self, fid): return "no description available" class BinaryMaxentFeatureEncoding(MaxentFeatureEncodingI): """ A feature encoding that generates vectors containing a binary joint-features of the form: | joint_feat(fs, l) = { 1 if (fs[fname] == fval) and (l == label) | { | { 0 otherwise Where ``fname`` is the name of an input-feature, ``fval`` is a value for that input-feature, and ``label`` is a label. Typically, these features are constructed based on a training corpus, using the ``train()`` method. This method will create one feature for each combination of ``fname``, ``fval``, and ``label`` that occurs at least once in the training corpus. 
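
    A rough sketch of how such an encoding is built and applied (the
    joint-feature indexes shown are illustrative and depend on the order of
    the training data):

        >>> train_toks = [({'outlook': 'sunny'}, 'yes'),
        ...               ({'outlook': 'rainy'}, 'no')]
        >>> encoding = BinaryMaxentFeatureEncoding.train(train_toks)  # doctest: +SKIP
        >>> encoding.length()  # doctest: +SKIP
        2
        >>> encoding.encode({'outlook': 'sunny'}, 'yes')  # doctest: +SKIP
        [(0, 1)]
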
The ``unseen_features`` parameter can be used to add "unseen-value features", which are used whenever an input feature has a value that was not encountered in the training corpus. These features have the form: | joint_feat(fs, l) = { 1 if is_unseen(fname, fs[fname]) | { and l == label | { | { 0 otherwise Where ``is_unseen(fname, fval)`` is true if the encoding does not contain any joint features that are true when ``fs[fname]==fval``. The ``alwayson_features`` parameter can be used to add "always-on features", which have the form:: | joint_feat(fs, l) = { 1 if (l == label) | { | { 0 otherwise These always-on features allow the maxent model to directly model the prior probabilities of each label. """ def __init__(self, labels, mapping, unseen_features=False, alwayson_features=False): """ :param labels: A list of the \"known labels\" for this encoding. :param mapping: A dictionary mapping from ``(fname,fval,label)`` tuples to corresponding joint-feature indexes. These indexes must be the set of integers from 0...len(mapping). If ``mapping[fname,fval,label]=id``, then ``self.encode(..., fname:fval, ..., label)[id]`` is 1; otherwise, it is 0. :param unseen_features: If true, then include unseen value features in the generated joint-feature vectors. :param alwayson_features: If true, then include always-on features in the generated joint-feature vectors. """ if set(mapping.values()) != set(range(len(mapping))): raise ValueError( "Mapping values must be exactly the " "set of integers from 0...len(mapping)" ) self._labels = list(labels) """A list of attested labels.""" self._mapping = mapping """dict mapping from (fname,fval,label) -> fid""" self._length = len(mapping) """The length of generated joint feature vectors.""" self._alwayson = None """dict mapping from label -> fid""" self._unseen = None """dict mapping from fname -> fid""" if alwayson_features: self._alwayson = { label: i + self._length for (i, label) in enumerate(labels) } self._length += len(self._alwayson) if unseen_features: fnames = {fname for (fname, fval, label) in mapping} self._unseen = {fname: i + self._length for (i, fname) in enumerate(fnames)} self._length += len(fnames) def encode(self, featureset, label): # Inherit docs. encoding = [] # Convert input-features to joint-features: for fname, fval in featureset.items(): # Known feature name & value: if (fname, fval, label) in self._mapping: encoding.append((self._mapping[fname, fval, label], 1)) # Otherwise, we might want to fire an "unseen-value feature". elif self._unseen: # Have we seen this fname/fval combination with any label? for label2 in self._labels: if (fname, fval, label2) in self._mapping: break # we've seen this fname/fval combo # We haven't -- fire the unseen-value feature else: if fname in self._unseen: encoding.append((self._unseen[fname], 1)) # Add always-on features: if self._alwayson and label in self._alwayson: encoding.append((self._alwayson[label], 1)) return encoding def describe(self, f_id): # Inherit docs. 
if not isinstance(f_id, int): raise TypeError("describe() expected an int") try: self._inv_mapping except AttributeError: self._inv_mapping = [-1] * len(self._mapping) for (info, i) in self._mapping.items(): self._inv_mapping[i] = info if f_id < len(self._mapping): (fname, fval, label) = self._inv_mapping[f_id] return f"{fname}=={fval!r} and label is {label!r}" elif self._alwayson and f_id in self._alwayson.values(): for (label, f_id2) in self._alwayson.items(): if f_id == f_id2: return "label is %r" % label elif self._unseen and f_id in self._unseen.values(): for (fname, f_id2) in self._unseen.items(): if f_id == f_id2: return "%s is unseen" % fname else: raise ValueError("Bad feature id") def labels(self): # Inherit docs. return self._labels def length(self): # Inherit docs. return self._length @classmethod def train(cls, train_toks, count_cutoff=0, labels=None, **options): """ Construct and return new feature encoding, based on a given training corpus ``train_toks``. See the class description ``BinaryMaxentFeatureEncoding`` for a description of the joint-features that will be included in this encoding. :type train_toks: list(tuple(dict, str)) :param train_toks: Training data, represented as a list of pairs, the first member of which is a feature dictionary, and the second of which is a classification label. :type count_cutoff: int :param count_cutoff: A cutoff value that is used to discard rare joint-features. If a joint-feature's value is 1 fewer than ``count_cutoff`` times in the training corpus, then that joint-feature is not included in the generated encoding. :type labels: list :param labels: A list of labels that should be used by the classifier. If not specified, then the set of labels attested in ``train_toks`` will be used. :param options: Extra parameters for the constructor, such as ``unseen_features`` and ``alwayson_features``. """ mapping = {} # maps (fname, fval, label) -> fid seen_labels = set() # The set of labels we've encountered count = defaultdict(int) # maps (fname, fval) -> count for (tok, label) in train_toks: if labels and label not in labels: raise ValueError("Unexpected label %s" % label) seen_labels.add(label) # Record each of the features. for (fname, fval) in tok.items(): # If a count cutoff is given, then only add a joint # feature once the corresponding (fname, fval, label) # tuple exceeds that cutoff. count[fname, fval] += 1 if count[fname, fval] >= count_cutoff: if (fname, fval, label) not in mapping: mapping[fname, fval, label] = len(mapping) if labels is None: labels = seen_labels return cls(labels, mapping, **options) class GISEncoding(BinaryMaxentFeatureEncoding): """ A binary feature encoding which adds one new joint-feature to the joint-features defined by ``BinaryMaxentFeatureEncoding``: a correction feature, whose value is chosen to ensure that the sparse vector always sums to a constant non-negative number. This new feature is used to ensure two preconditions for the GIS training algorithm: - At least one feature vector index must be nonzero for every token. - The feature vector must sum to a constant non-negative number for every token. """ def __init__( self, labels, mapping, unseen_features=False, alwayson_features=False, C=None ): """ :param C: The correction constant. The value of the correction feature is based on this value. In particular, its value is ``C - sum([v for (f,v) in encoding])``. 
:seealso: ``BinaryMaxentFeatureEncoding.__init__`` """ BinaryMaxentFeatureEncoding.__init__( self, labels, mapping, unseen_features, alwayson_features ) if C is None: C = len({fname for (fname, fval, label) in mapping}) + 1 self._C = C @property def C(self): """The non-negative constant that all encoded feature vectors will sum to.""" return self._C def encode(self, featureset, label): # Get the basic encoding. encoding = BinaryMaxentFeatureEncoding.encode(self, featureset, label) base_length = BinaryMaxentFeatureEncoding.length(self) # Add a correction feature. total = sum(v for (f, v) in encoding) if total >= self._C: raise ValueError("Correction feature is not high enough!") encoding.append((base_length, self._C - total)) # Return the result return encoding def length(self): return BinaryMaxentFeatureEncoding.length(self) + 1 def describe(self, f_id): if f_id == BinaryMaxentFeatureEncoding.length(self): return "Correction feature (%s)" % self._C else: return BinaryMaxentFeatureEncoding.describe(self, f_id) class TadmEventMaxentFeatureEncoding(BinaryMaxentFeatureEncoding): def __init__(self, labels, mapping, unseen_features=False, alwayson_features=False): self._mapping = OrderedDict(mapping) self._label_mapping = OrderedDict() BinaryMaxentFeatureEncoding.__init__( self, labels, self._mapping, unseen_features, alwayson_features ) def encode(self, featureset, label): encoding = [] for feature, value in featureset.items(): if (feature, label) not in self._mapping: self._mapping[(feature, label)] = len(self._mapping) if value not in self._label_mapping: if not isinstance(value, int): self._label_mapping[value] = len(self._label_mapping) else: self._label_mapping[value] = value encoding.append( (self._mapping[(feature, label)], self._label_mapping[value]) ) return encoding def labels(self): return self._labels def describe(self, fid): for (feature, label) in self._mapping: if self._mapping[(feature, label)] == fid: return (feature, label) def length(self): return len(self._mapping) @classmethod def train(cls, train_toks, count_cutoff=0, labels=None, **options): mapping = OrderedDict() if not labels: labels = [] # This gets read twice, so compute the values in case it's lazy. train_toks = list(train_toks) for (featureset, label) in train_toks: if label not in labels: labels.append(label) for (featureset, label) in train_toks: for label in labels: for feature in featureset: if (feature, label) not in mapping: mapping[(feature, label)] = len(mapping) return cls(labels, mapping, **options) class TypedMaxentFeatureEncoding(MaxentFeatureEncodingI): """ A feature encoding that generates vectors containing integer, float and binary joint-features of the form: Binary (for string and boolean features): | joint_feat(fs, l) = { 1 if (fs[fname] == fval) and (l == label) | { | { 0 otherwise Value (for integer and float features): | joint_feat(fs, l) = { fval if (fs[fname] == type(fval)) | { and (l == label) | { | { not encoded otherwise Where ``fname`` is the name of an input-feature, ``fval`` is a value for that input-feature, and ``label`` is a label. Typically, these features are constructed based on a training corpus, using the ``train()`` method. For string and boolean features [type(fval) not in (int, float)] this method will create one feature for each combination of ``fname``, ``fval``, and ``label`` that occurs at least once in the training corpus. 
For integer and float features [type(fval) in (int, float)] this method will create one feature for each combination of ``fname`` and ``label`` that occurs at least once in the training corpus. For binary features the ``unseen_features`` parameter can be used to add "unseen-value features", which are used whenever an input feature has a value that was not encountered in the training corpus. These features have the form: | joint_feat(fs, l) = { 1 if is_unseen(fname, fs[fname]) | { and l == label | { | { 0 otherwise Where ``is_unseen(fname, fval)`` is true if the encoding does not contain any joint features that are true when ``fs[fname]==fval``. The ``alwayson_features`` parameter can be used to add "always-on features", which have the form: | joint_feat(fs, l) = { 1 if (l == label) | { | { 0 otherwise These always-on features allow the maxent model to directly model the prior probabilities of each label. """ def __init__(self, labels, mapping, unseen_features=False, alwayson_features=False): """ :param labels: A list of the \"known labels\" for this encoding. :param mapping: A dictionary mapping from ``(fname,fval,label)`` tuples to corresponding joint-feature indexes. These indexes must be the set of integers from 0...len(mapping). If ``mapping[fname,fval,label]=id``, then ``self.encode({..., fname:fval, ...``, label)[id]} is 1; otherwise, it is 0. :param unseen_features: If true, then include unseen value features in the generated joint-feature vectors. :param alwayson_features: If true, then include always-on features in the generated joint-feature vectors. """ if set(mapping.values()) != set(range(len(mapping))): raise ValueError( "Mapping values must be exactly the " "set of integers from 0...len(mapping)" ) self._labels = list(labels) """A list of attested labels.""" self._mapping = mapping """dict mapping from (fname,fval,label) -> fid""" self._length = len(mapping) """The length of generated joint feature vectors.""" self._alwayson = None """dict mapping from label -> fid""" self._unseen = None """dict mapping from fname -> fid""" if alwayson_features: self._alwayson = { label: i + self._length for (i, label) in enumerate(labels) } self._length += len(self._alwayson) if unseen_features: fnames = {fname for (fname, fval, label) in mapping} self._unseen = {fname: i + self._length for (i, fname) in enumerate(fnames)} self._length += len(fnames) def encode(self, featureset, label): # Inherit docs. encoding = [] # Convert input-features to joint-features: for fname, fval in featureset.items(): if isinstance(fval, (int, float)): # Known feature name & value: if (fname, type(fval), label) in self._mapping: encoding.append((self._mapping[fname, type(fval), label], fval)) else: # Known feature name & value: if (fname, fval, label) in self._mapping: encoding.append((self._mapping[fname, fval, label], 1)) # Otherwise, we might want to fire an "unseen-value feature". elif self._unseen: # Have we seen this fname/fval combination with any label? for label2 in self._labels: if (fname, fval, label2) in self._mapping: break # we've seen this fname/fval combo # We haven't -- fire the unseen-value feature else: if fname in self._unseen: encoding.append((self._unseen[fname], 1)) # Add always-on features: if self._alwayson and label in self._alwayson: encoding.append((self._alwayson[label], 1)) return encoding def describe(self, f_id): # Inherit docs. 
if not isinstance(f_id, int): raise TypeError("describe() expected an int") try: self._inv_mapping except AttributeError: self._inv_mapping = [-1] * len(self._mapping) for (info, i) in self._mapping.items(): self._inv_mapping[i] = info if f_id < len(self._mapping): (fname, fval, label) = self._inv_mapping[f_id] return f"{fname}=={fval!r} and label is {label!r}" elif self._alwayson and f_id in self._alwayson.values(): for (label, f_id2) in self._alwayson.items(): if f_id == f_id2: return "label is %r" % label elif self._unseen and f_id in self._unseen.values(): for (fname, f_id2) in self._unseen.items(): if f_id == f_id2: return "%s is unseen" % fname else: raise ValueError("Bad feature id") def labels(self): # Inherit docs. return self._labels def length(self): # Inherit docs. return self._length @classmethod def train(cls, train_toks, count_cutoff=0, labels=None, **options): """ Construct and return new feature encoding, based on a given training corpus ``train_toks``. See the class description ``TypedMaxentFeatureEncoding`` for a description of the joint-features that will be included in this encoding. Note: recognized feature values types are (int, float), over types are interpreted as regular binary features. :type train_toks: list(tuple(dict, str)) :param train_toks: Training data, represented as a list of pairs, the first member of which is a feature dictionary, and the second of which is a classification label. :type count_cutoff: int :param count_cutoff: A cutoff value that is used to discard rare joint-features. If a joint-feature's value is 1 fewer than ``count_cutoff`` times in the training corpus, then that joint-feature is not included in the generated encoding. :type labels: list :param labels: A list of labels that should be used by the classifier. If not specified, then the set of labels attested in ``train_toks`` will be used. :param options: Extra parameters for the constructor, such as ``unseen_features`` and ``alwayson_features``. """ mapping = {} # maps (fname, fval, label) -> fid seen_labels = set() # The set of labels we've encountered count = defaultdict(int) # maps (fname, fval) -> count for (tok, label) in train_toks: if labels and label not in labels: raise ValueError("Unexpected label %s" % label) seen_labels.add(label) # Record each of the features. for (fname, fval) in tok.items(): if type(fval) in (int, float): fval = type(fval) # If a count cutoff is given, then only add a joint # feature once the corresponding (fname, fval, label) # tuple exceeds that cutoff. count[fname, fval] += 1 if count[fname, fval] >= count_cutoff: if (fname, fval, label) not in mapping: mapping[fname, fval, label] = len(mapping) if labels is None: labels = seen_labels return cls(labels, mapping, **options) ###################################################################### # { Classifier Trainer: Generalized Iterative Scaling ###################################################################### def train_maxent_classifier_with_gis( train_toks, trace=3, encoding=None, labels=None, **cutoffs ): """ Train a new ``ConditionalExponentialClassifier``, using the given training samples, using the Generalized Iterative Scaling algorithm. This ``ConditionalExponentialClassifier`` will encode the model that maximizes entropy from all the models that are empirically consistent with ``train_toks``. :see: ``train_maxent_classifier()`` for parameter descriptions. """ cutoffs.setdefault("max_iter", 100) cutoffchecker = CutoffChecker(cutoffs) # Construct an encoding from the training data. 
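# A small sketch (with made-up feature names) of the property that the GIS
# algorithm relies on: a GISEncoding guarantees that every encoded
# joint-feature vector sums to the same constant C, via the correction
# feature appended by GISEncoding.encode():
#
#     enc = GISEncoding.train([({'a': 'x', 'b': 'y'}, 'pos'),
#                              ({'a': 'z', 'b': 'y'}, 'neg')])
#     vec = enc.encode({'a': 'x', 'b': 'y'}, 'pos')
#     assert sum(val for (fid, val) in vec) == enc.C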
if encoding is None: encoding = GISEncoding.train(train_toks, labels=labels) if not hasattr(encoding, "C"): raise TypeError( "The GIS algorithm requires an encoding that " "defines C (e.g., GISEncoding)." ) # Cinv is the inverse of the sum of each joint feature vector. # This controls the learning rate: higher Cinv (or lower C) gives # faster learning. Cinv = 1.0 / encoding.C # Count how many times each feature occurs in the training data. empirical_fcount = calculate_empirical_fcount(train_toks, encoding) # Check for any features that are not attested in train_toks. unattested = set(numpy.nonzero(empirical_fcount == 0)[0]) # Build the classifier. Start with weight=0 for each attested # feature, and weight=-infinity for each unattested feature. weights = numpy.zeros(len(empirical_fcount), "d") for fid in unattested: weights[fid] = numpy.NINF classifier = ConditionalExponentialClassifier(encoding, weights) # Take the log of the empirical fcount. log_empirical_fcount = numpy.log2(empirical_fcount) del empirical_fcount if trace > 0: print(" ==> Training (%d iterations)" % cutoffs["max_iter"]) if trace > 2: print() print(" Iteration Log Likelihood Accuracy") print(" ---------------------------------------") # Train the classifier. try: while True: if trace > 2: ll = cutoffchecker.ll or log_likelihood(classifier, train_toks) acc = cutoffchecker.acc or accuracy(classifier, train_toks) iternum = cutoffchecker.iter print(" %9d %14.5f %9.3f" % (iternum, ll, acc)) # Use the model to estimate the number of times each # feature should occur in the training data. estimated_fcount = calculate_estimated_fcount( classifier, train_toks, encoding ) # Take the log of estimated fcount (avoid taking log(0).) for fid in unattested: estimated_fcount[fid] += 1 log_estimated_fcount = numpy.log2(estimated_fcount) del estimated_fcount # Update the classifier weights weights = classifier.weights() weights += (log_empirical_fcount - log_estimated_fcount) * Cinv classifier.set_weights(weights) # Check the log-likelihood & accuracy cutoffs. if cutoffchecker.check(classifier, train_toks): break except KeyboardInterrupt: print(" Training stopped: keyboard interrupt") except: raise if trace > 2: ll = log_likelihood(classifier, train_toks) acc = accuracy(classifier, train_toks) print(f" Final {ll:14.5f} {acc:9.3f}") # Return the classifier. return classifier def calculate_empirical_fcount(train_toks, encoding): fcount = numpy.zeros(encoding.length(), "d") for tok, label in train_toks: for (index, val) in encoding.encode(tok, label): fcount[index] += val return fcount def calculate_estimated_fcount(classifier, train_toks, encoding): fcount = numpy.zeros(encoding.length(), "d") for tok, label in train_toks: pdist = classifier.prob_classify(tok) for label in pdist.samples(): prob = pdist.prob(label) for (fid, fval) in encoding.encode(tok, label): fcount[fid] += prob * fval return fcount ###################################################################### # { Classifier Trainer: Improved Iterative Scaling ###################################################################### def train_maxent_classifier_with_iis( train_toks, trace=3, encoding=None, labels=None, **cutoffs ): """ Train a new ``ConditionalExponentialClassifier``, using the given training samples, using the Improved Iterative Scaling algorithm. This ``ConditionalExponentialClassifier`` will encode the model that maximizes entropy from all the models that are empirically consistent with ``train_toks``. :see: ``train_maxent_classifier()`` for parameter descriptions. 
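Typical usage is via the ``MaxentClassifier`` front end rather than by calling this function directly (a minimal sketch; the toy data is made up):

>>> from nltk.classify import MaxentClassifier
>>> train_toks = [({'first_letter': 'k'}, 'female'),
...               ({'first_letter': 'j'}, 'male')]
>>> classifier = MaxentClassifier.train(train_toks, algorithm='IIS',
...                                     max_iter=10, trace=0)  # doctest: +SKIP
>>> classifier.classify({'first_letter': 'k'})  # doctest: +SKIP
'female'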
""" cutoffs.setdefault("max_iter", 100) cutoffchecker = CutoffChecker(cutoffs) # Construct an encoding from the training data. if encoding is None: encoding = BinaryMaxentFeatureEncoding.train(train_toks, labels=labels) # Count how many times each feature occurs in the training data. empirical_ffreq = calculate_empirical_fcount(train_toks, encoding) / len(train_toks) # Find the nf map, and related variables nfarray and nfident. # nf is the sum of the features for a given labeled text. # nfmap compresses this sparse set of values to a dense list. # nfarray performs the reverse operation. nfident is # nfarray multiplied by an identity matrix. nfmap = calculate_nfmap(train_toks, encoding) nfarray = numpy.array(sorted(nfmap, key=nfmap.__getitem__), "d") nftranspose = numpy.reshape(nfarray, (len(nfarray), 1)) # Check for any features that are not attested in train_toks. unattested = set(numpy.nonzero(empirical_ffreq == 0)[0]) # Build the classifier. Start with weight=0 for each attested # feature, and weight=-infinity for each unattested feature. weights = numpy.zeros(len(empirical_ffreq), "d") for fid in unattested: weights[fid] = numpy.NINF classifier = ConditionalExponentialClassifier(encoding, weights) if trace > 0: print(" ==> Training (%d iterations)" % cutoffs["max_iter"]) if trace > 2: print() print(" Iteration Log Likelihood Accuracy") print(" ---------------------------------------") # Train the classifier. try: while True: if trace > 2: ll = cutoffchecker.ll or log_likelihood(classifier, train_toks) acc = cutoffchecker.acc or accuracy(classifier, train_toks) iternum = cutoffchecker.iter print(" %9d %14.5f %9.3f" % (iternum, ll, acc)) # Calculate the deltas for this iteration, using Newton's method. deltas = calculate_deltas( train_toks, classifier, unattested, empirical_ffreq, nfmap, nfarray, nftranspose, encoding, ) # Use the deltas to update our weights. weights = classifier.weights() weights += deltas classifier.set_weights(weights) # Check the log-likelihood & accuracy cutoffs. if cutoffchecker.check(classifier, train_toks): break except KeyboardInterrupt: print(" Training stopped: keyboard interrupt") except: raise if trace > 2: ll = log_likelihood(classifier, train_toks) acc = accuracy(classifier, train_toks) print(f" Final {ll:14.5f} {acc:9.3f}") # Return the classifier. return classifier def calculate_nfmap(train_toks, encoding): """ Construct a map that can be used to compress ``nf`` (which is typically sparse). *nf(feature_vector)* is the sum of the feature values for *feature_vector*. This represents the number of features that are active for a given labeled text. This method finds all values of *nf(t)* that are attested for at least one token in the given list of training tokens; and constructs a dictionary mapping these attested values to a continuous range *0...N*. For example, if the only values of *nf()* that were attested were 3, 5, and 7, then ``_nfmap`` might return the dictionary ``{3:0, 5:1, 7:2}``. :return: A map that can be used to compress ``nf`` to a dense vector. :rtype: dict(int -> int) """ # Map from nf to indices. This allows us to use smaller arrays. nfset = set() for tok, _ in train_toks: for label in encoding.labels(): nfset.add(sum(val for (id, val) in encoding.encode(tok, label))) return {nf: i for (i, nf) in enumerate(nfset)} def calculate_deltas( train_toks, classifier, unattested, ffreq_empirical, nfmap, nfarray, nftranspose, encoding, ): r""" Calculate the update values for the classifier weights for this iteration of IIS. 
These update weights are the value of ``delta`` that solves the equation:: ffreq_empirical[i] = SUM[fs,l] (classifier.prob_classify(fs).prob(l) * feature_vector(fs,l)[i] * exp(delta[i] * nf(feature_vector(fs,l)))) Where: - *(fs,l)* is a (featureset, label) tuple from ``train_toks`` - *feature_vector(fs,l)* = ``encoding.encode(fs,l)`` - *nf(vector)* = ``sum([val for (id,val) in vector])`` This method uses Newton's method to solve this equation for *delta[i]*. In particular, it starts with a guess of ``delta[i]`` = 1; and iteratively updates ``delta`` with: | delta[i] -= (ffreq_empirical[i] - sum1[i])/(-sum2[i]) until convergence, where *sum1* and *sum2* are defined as: | sum1[i](delta) = SUM[fs,l] f[i](fs,l,delta) | sum2[i](delta) = SUM[fs,l] (f[i](fs,l,delta).nf(feature_vector(fs,l))) | f[i](fs,l,delta) = (classifier.prob_classify(fs).prob(l) . | feature_vector(fs,l)[i] . | exp(delta[i] . nf(feature_vector(fs,l)))) Note that *sum1* and *sum2* depend on ``delta``; so they need to be re-computed each iteration. The variables ``nfmap``, ``nfarray``, and ``nftranspose`` are used to generate a dense encoding for *nf(ltext)*. This allows ``_deltas`` to calculate *sum1* and *sum2* using matrices, which yields a significant performance improvement. :param train_toks: The set of training tokens. :type train_toks: list(tuple(dict, str)) :param classifier: The current classifier. :type classifier: ClassifierI :param ffreq_empirical: An array containing the empirical frequency for each feature. The *i*\ th element of this array is the empirical frequency for feature *i*. :type ffreq_empirical: sequence of float :param unattested: An array that is 1 for features that are not attested in the training data; and 0 for features that are attested. In other words, ``unattested[i]==0`` iff ``ffreq_empirical[i]==0``. :type unattested: sequence of int :param nfmap: A map that can be used to compress ``nf`` to a dense vector. :type nfmap: dict(int -> int) :param nfarray: An array that can be used to uncompress ``nf`` from a dense vector. :type nfarray: array(float) :param nftranspose: The transpose of ``nfarray`` :type nftranspose: array(float) """ # These parameters control when we decide that we've # converged. It probably should be possible to set these # manually, via keyword arguments to train. NEWTON_CONVERGE = 1e-12 MAX_NEWTON = 300 deltas = numpy.ones(encoding.length(), "d") # Precompute the A matrix: # A[nf][id] = sum ( p(fs) * p(label|fs) * f(fs,label) ) # over all label,fs s.t. num_features[label,fs]=nf A = numpy.zeros((len(nfmap), encoding.length()), "d") for tok, label in train_toks: dist = classifier.prob_classify(tok) for label in encoding.labels(): # Generate the feature vector feature_vector = encoding.encode(tok, label) # Find the number of active features nf = sum(val for (id, val) in feature_vector) # Update the A matrix for (id, val) in feature_vector: A[nfmap[nf], id] += dist.prob(label) * val A /= len(train_toks) # Iteratively solve for delta. 
Use the following variables: # - nf_delta[x][y] = nfarray[x] * delta[y] # - exp_nf_delta[x][y] = exp(nf[x] * delta[y]) # - nf_exp_nf_delta[x][y] = nf[x] * exp(nf[x] * delta[y]) # - sum1[i][nf] = sum p(fs)p(label|fs)f[i](label,fs) # exp(delta[i]nf) # - sum2[i][nf] = sum p(fs)p(label|fs)f[i](label,fs) # nf exp(delta[i]nf) for rangenum in range(MAX_NEWTON): nf_delta = numpy.outer(nfarray, deltas) exp_nf_delta = 2 ** nf_delta nf_exp_nf_delta = nftranspose * exp_nf_delta sum1 = numpy.sum(exp_nf_delta * A, axis=0) sum2 = numpy.sum(nf_exp_nf_delta * A, axis=0) # Avoid division by zero. for fid in unattested: sum2[fid] += 1 # Update the deltas. deltas -= (ffreq_empirical - sum1) / -sum2 # We can stop once we converge. n_error = numpy.sum(abs(ffreq_empirical - sum1)) / numpy.sum(abs(deltas)) if n_error < NEWTON_CONVERGE: return deltas return deltas ###################################################################### # { Classifier Trainer: megam ###################################################################### # [xx] possible extension: add support for using implicit file format; # this would need to put requirements on what encoding is used. But # we may need this for other maxent classifier trainers that require # implicit formats anyway. def train_maxent_classifier_with_megam( train_toks, trace=3, encoding=None, labels=None, gaussian_prior_sigma=0, **kwargs ): """ Train a new ``ConditionalExponentialClassifier``, using the given training samples, using the external ``megam`` library. This ``ConditionalExponentialClassifier`` will encode the model that maximizes entropy from all the models that are empirically consistent with ``train_toks``. :see: ``train_maxent_classifier()`` for parameter descriptions. :see: ``nltk.classify.megam`` """ explicit = True bernoulli = True if "explicit" in kwargs: explicit = kwargs["explicit"] if "bernoulli" in kwargs: bernoulli = kwargs["bernoulli"] # Construct an encoding from the training data. if encoding is None: # Count cutoff can also be controlled by megam with the -minfc # option. Not sure where the best place for it is. count_cutoff = kwargs.get("count_cutoff", 0) encoding = BinaryMaxentFeatureEncoding.train( train_toks, count_cutoff, labels=labels, alwayson_features=True ) elif labels is not None: raise ValueError("Specify encoding or labels, not both") # Write a training file for megam. try: fd, trainfile_name = tempfile.mkstemp(prefix="nltk-") with open(trainfile_name, "w") as trainfile: write_megam_file( train_toks, encoding, trainfile, explicit=explicit, bernoulli=bernoulli ) os.close(fd) except (OSError, ValueError) as e: raise ValueError("Error while creating megam training file: %s" % e) from e # Run megam on the training file. options = [] options += ["-nobias", "-repeat", "10"] if explicit: options += ["-explicit"] if not bernoulli: options += ["-fvals"] if gaussian_prior_sigma: # Lambda is just the precision of the Gaussian prior, i.e. it's the # inverse variance, so the parameter conversion is 1.0/sigma**2. 
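# For example, with the 1.0/sigma**2 conversion below, gaussian_prior_sigma=1.0
# yields ``-lambda 1.00`` and gaussian_prior_sigma=0.5 yields ``-lambda 4.00``.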
# See https://users.umiacs.umd.edu/~hal/docs/daume04cg-bfgs.pdf inv_variance = 1.0 / gaussian_prior_sigma ** 2 else: inv_variance = 0 options += ["-lambda", "%.2f" % inv_variance, "-tune"] if trace < 3: options += ["-quiet"] if "max_iter" in kwargs: options += ["-maxi", "%s" % kwargs["max_iter"]] if "ll_delta" in kwargs: # [xx] this is actually a perplexity delta, not a log # likelihood delta options += ["-dpp", "%s" % abs(kwargs["ll_delta"])] if hasattr(encoding, "cost"): options += ["-multilabel"] # each possible la options += ["multiclass", trainfile_name] stdout = call_megam(options) # print('./megam_i686.opt ', ' '.join(options)) # Delete the training file try: os.remove(trainfile_name) except OSError as e: print(f"Warning: unable to delete {trainfile_name}: {e}") # Parse the generated weight vector. weights = parse_megam_weights(stdout, encoding.length(), explicit) # Convert from base-e to base-2 weights. weights *= numpy.log2(numpy.e) # Build the classifier return MaxentClassifier(encoding, weights) ###################################################################### # { Classifier Trainer: tadm ###################################################################### class TadmMaxentClassifier(MaxentClassifier): @classmethod def train(cls, train_toks, **kwargs): algorithm = kwargs.get("algorithm", "tao_lmvm") trace = kwargs.get("trace", 3) encoding = kwargs.get("encoding", None) labels = kwargs.get("labels", None) sigma = kwargs.get("gaussian_prior_sigma", 0) count_cutoff = kwargs.get("count_cutoff", 0) max_iter = kwargs.get("max_iter") ll_delta = kwargs.get("min_lldelta") # Construct an encoding from the training data. if not encoding: encoding = TadmEventMaxentFeatureEncoding.train( train_toks, count_cutoff, labels=labels ) trainfile_fd, trainfile_name = tempfile.mkstemp( prefix="nltk-tadm-events-", suffix=".gz" ) weightfile_fd, weightfile_name = tempfile.mkstemp(prefix="nltk-tadm-weights-") trainfile = gzip_open_unicode(trainfile_name, "w") write_tadm_file(train_toks, encoding, trainfile) trainfile.close() options = [] options.extend(["-monitor"]) options.extend(["-method", algorithm]) if sigma: options.extend(["-l2", "%.6f" % sigma ** 2]) if max_iter: options.extend(["-max_it", "%d" % max_iter]) if ll_delta: options.extend(["-fatol", "%.6f" % abs(ll_delta)]) options.extend(["-events_in", trainfile_name]) options.extend(["-params_out", weightfile_name]) if trace < 3: options.extend(["2>&1"]) else: options.extend(["-summary"]) call_tadm(options) with open(weightfile_name) as weightfile: weights = parse_tadm_weights(weightfile) os.remove(trainfile_name) os.remove(weightfile_name) # Convert from base-e to base-2 weights. weights *= numpy.log2(numpy.e) # Build the classifier return cls(encoding, weights) ###################################################################### # { Demo ###################################################################### def demo(): from nltk.classify.util import names_demo classifier = names_demo(MaxentClassifier.train) if __name__ == "__main__": demo() nltk-3.7/nltk/classify/megam.py000066400000000000000000000141041420073152400165560ustar00rootroot00000000000000# Natural Language Toolkit: Interface to Megam Classifier # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # URL: # For license information, see LICENSE.TXT """ A set of functions used to interface with the external megam_ maxent optimization package. Before megam can be used, you should tell NLTK where it can find the megam binary, using the ``config_megam()`` function. 
Typical usage: >>> from nltk.classify import megam >>> megam.config_megam() # pass path to megam if not found in PATH # doctest: +SKIP [Found megam: ...] Use with MaxentClassifier. Example below, see MaxentClassifier documentation for details. nltk.classify.MaxentClassifier.train(corpus, 'megam') .. _megam: https://www.umiacs.umd.edu/~hal/megam/index.html """ import subprocess from nltk.internals import find_binary try: import numpy except ImportError: numpy = None ###################################################################### # { Configuration ###################################################################### _megam_bin = None def config_megam(bin=None): """ Configure NLTK's interface to the ``megam`` maxent optimization package. :param bin: The full path to the ``megam`` binary. If not specified, then nltk will search the system for a ``megam`` binary; and if one is not found, it will raise a ``LookupError`` exception. :type bin: str """ global _megam_bin _megam_bin = find_binary( "megam", bin, env_vars=["MEGAM"], binary_names=["megam.opt", "megam", "megam_686", "megam_i686.opt"], url="https://www.umiacs.umd.edu/~hal/megam/index.html", ) ###################################################################### # { Megam Interface Functions ###################################################################### def write_megam_file(train_toks, encoding, stream, bernoulli=True, explicit=True): """ Generate an input file for ``megam`` based on the given corpus of classified tokens. :type train_toks: list(tuple(dict, str)) :param train_toks: Training data, represented as a list of pairs, the first member of which is a feature dictionary, and the second of which is a classification label. :type encoding: MaxentFeatureEncodingI :param encoding: A feature encoding, used to convert featuresets into feature vectors. May optionally implement a cost() method in order to assign different costs to different class predictions. :type stream: stream :param stream: The stream to which the megam input file should be written. :param bernoulli: If true, then use the 'bernoulli' format. I.e., all joint features have binary values, and are listed iff they are true. Otherwise, list feature values explicitly. If ``bernoulli=False``, then you must call ``megam`` with the ``-fvals`` option. :param explicit: If true, then use the 'explicit' format. I.e., list the features that would fire for any of the possible labels, for each token. If ``explicit=True``, then you must call ``megam`` with the ``-explicit`` option. """ # Look up the set of labels. labels = encoding.labels() labelnum = {label: i for (i, label) in enumerate(labels)} # Write the file, which contains one line per instance. for featureset, label in train_toks: # First, the instance number (or, in the weighted multiclass case, the cost of each label). if hasattr(encoding, "cost"): stream.write( ":".join(str(encoding.cost(featureset, label, l)) for l in labels) ) else: stream.write("%d" % labelnum[label]) # For implicit file formats, just list the features that fire # for this instance's actual label. if not explicit: _write_megam_features(encoding.encode(featureset, label), stream, bernoulli) # For explicit formats, list the features that would fire for # any of the possible labels. else: for l in labels: stream.write(" #") _write_megam_features(encoding.encode(featureset, l), stream, bernoulli) # End of the instance. 
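# For example (the feature ids are illustrative), with two labels and
# explicit=True, bernoulli=True, a single instance is written as a line like
#
#     1 # 0 4 # 1 4
#
# i.e. the label number, then for each possible label a '#' followed by the
# ids of the joint-features that would fire for that label.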
stream.write("\n") def parse_megam_weights(s, features_count, explicit=True): """ Given the stdout output generated by ``megam`` when training a model, return a ``numpy`` array containing the corresponding weight vector. This function does not currently handle bias features. """ if numpy is None: raise ValueError("This function requires that numpy be installed") assert explicit, "non-explicit not supported yet" lines = s.strip().split("\n") weights = numpy.zeros(features_count, "d") for line in lines: if line.strip(): fid, weight = line.split() weights[int(fid)] = float(weight) return weights def _write_megam_features(vector, stream, bernoulli): if not vector: raise ValueError( "MEGAM classifier requires the use of an " "always-on feature." ) for (fid, fval) in vector: if bernoulli: if fval == 1: stream.write(" %s" % fid) elif fval != 0: raise ValueError( "If bernoulli=True, then all" "features must be binary." ) else: stream.write(f" {fid} {fval}") def call_megam(args): """ Call the ``megam`` binary with the given arguments. """ if isinstance(args, str): raise TypeError("args should be a list of strings") if _megam_bin is None: config_megam() # Call megam via a subprocess cmd = [_megam_bin] + args p = subprocess.Popen(cmd, stdout=subprocess.PIPE) (stdout, stderr) = p.communicate() # Check the return code. if p.returncode != 0: print() print(stderr) raise OSError("megam command failed!") if isinstance(stdout, str): return stdout else: return stdout.decode("utf-8") nltk-3.7/nltk/classify/naivebayes.py000066400000000000000000000243251420073152400176240ustar00rootroot00000000000000# Natural Language Toolkit: Naive Bayes Classifiers # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # URL: # For license information, see LICENSE.TXT """ A classifier based on the Naive Bayes algorithm. In order to find the probability for a label, this algorithm first uses the Bayes rule to express P(label|features) in terms of P(label) and P(features|label): | P(label) * P(features|label) | P(label|features) = ------------------------------ | P(features) The algorithm then makes the 'naive' assumption that all features are independent, given the label: | P(label) * P(f1|label) * ... * P(fn|label) | P(label|features) = -------------------------------------------- | P(features) Rather than computing P(features) explicitly, the algorithm just calculates the numerator for each label, and normalizes them so they sum to one: | P(label) * P(f1|label) * ... * P(fn|label) | P(label|features) = -------------------------------------------- | SUM[l]( P(l) * P(f1|l) * ... * P(fn|l) ) """ from collections import defaultdict from nltk.classify.api import ClassifierI from nltk.probability import DictionaryProbDist, ELEProbDist, FreqDist, sum_logs ##////////////////////////////////////////////////////// ## Naive Bayes Classifier ##////////////////////////////////////////////////////// class NaiveBayesClassifier(ClassifierI): """ A Naive Bayes classifier. Naive Bayes classifiers are paramaterized by two probability distributions: - P(label) gives the probability that an input will receive each label, given no information about the input's features. - P(fname=fval|label) gives the probability that a given feature (fname) will receive a given value (fval), given that the label (label). If the classifier encounters an input with a feature that has never been seen with any label, then rather than assigning a probability of 0 to all labels, it will ignore that feature. 
The feature value 'None' is reserved for unseen feature values; you generally should not use 'None' as a feature value for one of your own features. """ def __init__(self, label_probdist, feature_probdist): """ :param label_probdist: P(label), the probability distribution over labels. It is expressed as a ``ProbDistI`` whose samples are labels. I.e., P(label) = ``label_probdist.prob(label)``. :param feature_probdist: P(fname=fval|label), the probability distribution for feature values, given labels. It is expressed as a dictionary whose keys are ``(label, fname)`` pairs and whose values are ``ProbDistI`` objects over feature values. I.e., P(fname=fval|label) = ``feature_probdist[label,fname].prob(fval)``. If a given ``(label,fname)`` is not a key in ``feature_probdist``, then it is assumed that the corresponding P(fname=fval|label) is 0 for all values of ``fval``. """ self._label_probdist = label_probdist self._feature_probdist = feature_probdist self._labels = list(label_probdist.samples()) def labels(self): return self._labels def classify(self, featureset): return self.prob_classify(featureset).max() def prob_classify(self, featureset): # Discard any feature names that we've never seen before. # Otherwise, we'll just assign a probability of 0 to # everything. featureset = featureset.copy() for fname in list(featureset.keys()): for label in self._labels: if (label, fname) in self._feature_probdist: break else: # print('Ignoring unseen feature %s' % fname) del featureset[fname] # Find the log probability of each label, given the features. # Start with the log probability of the label itself. logprob = {} for label in self._labels: logprob[label] = self._label_probdist.logprob(label) # Then add in the log probability of features given labels. for label in self._labels: for (fname, fval) in featureset.items(): if (label, fname) in self._feature_probdist: feature_probs = self._feature_probdist[label, fname] logprob[label] += feature_probs.logprob(fval) else: # nb: This case will never come up if the # classifier was created by # NaiveBayesClassifier.train(). logprob[label] += sum_logs([]) # = -INF. return DictionaryProbDist(logprob, normalize=True, log=True) def show_most_informative_features(self, n=10): # Determine the most relevant features, and display them. cpdist = self._feature_probdist print("Most Informative Features") for (fname, fval) in self.most_informative_features(n): def labelprob(l): return cpdist[l, fname].prob(fval) labels = sorted( (l for l in self._labels if fval in cpdist[l, fname].samples()), key=lambda element: (-labelprob(element), element), reverse=True, ) if len(labels) == 1: continue l0 = labels[0] l1 = labels[-1] if cpdist[l0, fname].prob(fval) == 0: ratio = "INF" else: ratio = "%8.1f" % ( cpdist[l1, fname].prob(fval) / cpdist[l0, fname].prob(fval) ) print( "%24s = %-14r %6s : %-6s = %s : 1.0" % (fname, fval, ("%s" % l1)[:6], ("%s" % l0)[:6], ratio) ) def most_informative_features(self, n=100): """ Return a list of the 'most informative' features used by this classifier. For the purpose of this function, the informativeness of a feature ``(fname,fval)`` is equal to the highest value of P(fname=fval|label), for any label, divided by the lowest value of P(fname=fval|label), for any label: | max[ P(fname=fval|label1) / P(fname=fval|label2) ] """ if hasattr(self, "_most_informative_features"): return self._most_informative_features[:n] else: # The set of (fname, fval) pairs used by this classifier. 
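# Worked example of the informativeness ratio from the docstring above (the
# numbers are made up): if P(last_letter='a'|'female') = 0.4 and
# P(last_letter='a'|'male') = 0.01, then the feature (last_letter, 'a') has
# informativeness 0.4 / 0.01 = 40, i.e. a 40.0 : 1.0 ratio as printed by
# show_most_informative_features().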
features = set() # The max & min probability associated w/ each (fname, fval) # pair. Maps (fname,fval) -> float. maxprob = defaultdict(lambda: 0.0) minprob = defaultdict(lambda: 1.0) for (label, fname), probdist in self._feature_probdist.items(): for fval in probdist.samples(): feature = (fname, fval) features.add(feature) p = probdist.prob(fval) maxprob[feature] = max(p, maxprob[feature]) minprob[feature] = min(p, minprob[feature]) if minprob[feature] == 0: features.discard(feature) # Convert features to a list, & sort it by how informative # features are. self._most_informative_features = sorted( features, key=lambda feature_: ( minprob[feature_] / maxprob[feature_], feature_[0], feature_[1] in [None, False, True], str(feature_[1]).lower(), ), ) return self._most_informative_features[:n] @classmethod def train(cls, labeled_featuresets, estimator=ELEProbDist): """ :param labeled_featuresets: A list of classified featuresets, i.e., a list of tuples ``(featureset, label)``. """ label_freqdist = FreqDist() feature_freqdist = defaultdict(FreqDist) feature_values = defaultdict(set) fnames = set() # Count up how many times each feature value occurred, given # the label and featurename. for featureset, label in labeled_featuresets: label_freqdist[label] += 1 for fname, fval in featureset.items(): # Increment freq(fval|label, fname) feature_freqdist[label, fname][fval] += 1 # Record that fname can take the value fval. feature_values[fname].add(fval) # Keep a list of all feature names. fnames.add(fname) # If a feature didn't have a value given for an instance, then # we assume that it gets the implicit value 'None.' This loop # counts up the number of 'missing' feature values for each # (label,fname) pair, and increments the count of the fval # 'None' by that amount. for label in label_freqdist: num_samples = label_freqdist[label] for fname in fnames: count = feature_freqdist[label, fname].N() # Only add a None key when necessary, i.e. if there are # any samples with feature 'fname' missing. if num_samples - count > 0: feature_freqdist[label, fname][None] += num_samples - count feature_values[fname].add(None) # Create the P(label) distribution label_probdist = estimator(label_freqdist) # Create the P(fval|label, fname) distribution feature_probdist = {} for ((label, fname), freqdist) in feature_freqdist.items(): probdist = estimator(freqdist, bins=len(feature_values[fname])) feature_probdist[label, fname] = probdist return cls(label_probdist, feature_probdist) ##////////////////////////////////////////////////////// ## Demo ##////////////////////////////////////////////////////// def demo(): from nltk.classify.util import names_demo classifier = names_demo(NaiveBayesClassifier.train) classifier.show_most_informative_features() if __name__ == "__main__": demo() nltk-3.7/nltk/classify/positivenaivebayes.py000066400000000000000000000161001420073152400213770ustar00rootroot00000000000000# Natural Language Toolkit: Positive Naive Bayes Classifier # # Copyright (C) 2012 NLTK Project # Author: Alessandro Presta # URL: # For license information, see LICENSE.TXT """ A variant of the Naive Bayes Classifier that performs binary classification with partially-labeled training sets. In other words, assume we want to build a classifier that assigns each example to one of two complementary classes (e.g., male names and female names). If we have a training set with labeled examples for both classes, we can use a standard Naive Bayes Classifier. 
However, consider the case when we only have labeled examples for one of the classes, and other, unlabeled, examples. Then, assuming a prior distribution on the two labels, we can use the unlabeled set to estimate the frequencies of the various features. Let the two possible labels be 1 and 0, and let's say we only have examples labeled 1 and unlabeled examples. We are also given an estimate of P(1). We compute P(feature|1) exactly as in the standard case. To compute P(feature|0), we first estimate P(feature) from the unlabeled set (we are assuming that the unlabeled examples are drawn according to the given prior distribution) and then express the conditional probability as: | P(feature) - P(feature|1) * P(1) | P(feature|0) = ---------------------------------- | P(0) Example: >>> from nltk.classify import PositiveNaiveBayesClassifier Some sentences about sports: >>> sports_sentences = [ 'The team dominated the game', ... 'They lost the ball', ... 'The game was intense', ... 'The goalkeeper catched the ball', ... 'The other team controlled the ball' ] Mixed topics, including sports: >>> various_sentences = [ 'The President did not comment', ... 'I lost the keys', ... 'The team won the game', ... 'Sara has two kids', ... 'The ball went off the court', ... 'They had the ball for the whole game', ... 'The show is over' ] The features of a sentence are simply the words it contains: >>> def features(sentence): ... words = sentence.lower().split() ... return dict(('contains(%s)' % w, True) for w in words) We use the sports sentences as positive examples, the mixed ones ad unlabeled examples: >>> positive_featuresets = map(features, sports_sentences) >>> unlabeled_featuresets = map(features, various_sentences) >>> classifier = PositiveNaiveBayesClassifier.train(positive_featuresets, ... unlabeled_featuresets) Is the following sentence about sports? >>> classifier.classify(features('The cat is on the table')) False What about this one? >>> classifier.classify(features('My team lost the game')) True """ from collections import defaultdict from nltk.classify.naivebayes import NaiveBayesClassifier from nltk.probability import DictionaryProbDist, ELEProbDist, FreqDist ##////////////////////////////////////////////////////// ## Positive Naive Bayes Classifier ##////////////////////////////////////////////////////// class PositiveNaiveBayesClassifier(NaiveBayesClassifier): @staticmethod def train( positive_featuresets, unlabeled_featuresets, positive_prob_prior=0.5, estimator=ELEProbDist, ): """ :param positive_featuresets: An iterable of featuresets that are known as positive examples (i.e., their label is ``True``). :param unlabeled_featuresets: An iterable of featuresets whose label is unknown. :param positive_prob_prior: A prior estimate of the probability of the label ``True`` (default 0.5). """ positive_feature_freqdist = defaultdict(FreqDist) unlabeled_feature_freqdist = defaultdict(FreqDist) feature_values = defaultdict(set) fnames = set() # Count up how many times each feature value occurred in positive examples. num_positive_examples = 0 for featureset in positive_featuresets: for fname, fval in featureset.items(): positive_feature_freqdist[fname][fval] += 1 feature_values[fname].add(fval) fnames.add(fname) num_positive_examples += 1 # Count up how many times each feature value occurred in unlabeled examples. 
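# Worked example of the P(feature|0) formula from the docstring above (the
# numbers are made up): with P(feature) = 0.4 estimated from the unlabeled
# set, P(feature|1) = 0.6 and a prior P(1) = 0.5, we get
# P(feature|0) = (0.4 - 0.6 * 0.5) / 0.5 = 0.2.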
num_unlabeled_examples = 0 for featureset in unlabeled_featuresets: for fname, fval in featureset.items(): unlabeled_feature_freqdist[fname][fval] += 1 feature_values[fname].add(fval) fnames.add(fname) num_unlabeled_examples += 1 # If a feature didn't have a value given for an instance, then we assume that # it gets the implicit value 'None'. for fname in fnames: count = positive_feature_freqdist[fname].N() positive_feature_freqdist[fname][None] += num_positive_examples - count feature_values[fname].add(None) for fname in fnames: count = unlabeled_feature_freqdist[fname].N() unlabeled_feature_freqdist[fname][None] += num_unlabeled_examples - count feature_values[fname].add(None) negative_prob_prior = 1.0 - positive_prob_prior # Create the P(label) distribution. label_probdist = DictionaryProbDist( {True: positive_prob_prior, False: negative_prob_prior} ) # Create the P(fval|label, fname) distribution. feature_probdist = {} for fname, freqdist in positive_feature_freqdist.items(): probdist = estimator(freqdist, bins=len(feature_values[fname])) feature_probdist[True, fname] = probdist for fname, freqdist in unlabeled_feature_freqdist.items(): global_probdist = estimator(freqdist, bins=len(feature_values[fname])) negative_feature_probs = {} for fval in feature_values[fname]: prob = ( global_probdist.prob(fval) - positive_prob_prior * feature_probdist[True, fname].prob(fval) ) / negative_prob_prior # TODO: We need to add some kind of smoothing here, instead of # setting negative probabilities to zero and normalizing. negative_feature_probs[fval] = max(prob, 0.0) feature_probdist[False, fname] = DictionaryProbDist( negative_feature_probs, normalize=True ) return PositiveNaiveBayesClassifier(label_probdist, feature_probdist) ##////////////////////////////////////////////////////// ## Demo ##////////////////////////////////////////////////////// def demo(): from nltk.classify.util import partial_names_demo classifier = partial_names_demo(PositiveNaiveBayesClassifier.train) classifier.show_most_informative_features() nltk-3.7/nltk/classify/rte_classify.py000066400000000000000000000137551420073152400201720ustar00rootroot00000000000000# Natural Language Toolkit: RTE Classifier # # Copyright (C) 2001-2022 NLTK Project # Author: Ewan Klein # URL: # For license information, see LICENSE.TXT """ Simple classifier for RTE corpus. It calculates the overlap in words and named entities between text and hypothesis, and also whether there are words / named entities in the hypothesis which fail to occur in the text, since this is an indicator that the hypothesis is more informative than (i.e not entailed by) the text. TO DO: better Named Entity classification TO DO: add lemmatization """ from nltk.classify.maxent import MaxentClassifier from nltk.classify.util import accuracy, check_megam_config from nltk.tokenize import RegexpTokenizer class RTEFeatureExtractor: """ This builds a bag of words for both the text and the hypothesis after throwing away some stopwords, then calculates overlap and difference. """ def __init__(self, rtepair, stop=True, use_lemmatize=False): """ :param rtepair: a ``RTEPair`` from which features should be extracted :param stop: if ``True``, stopwords are thrown away. 
:type stop: bool """ self.stop = stop self.stopwords = { "a", "the", "it", "they", "of", "in", "to", "is", "have", "are", "were", "and", "very", ".", ",", } self.negwords = {"no", "not", "never", "failed", "rejected", "denied"} # Try to tokenize so that abbreviations, monetary amounts, email # addresses, URLs are single tokens. tokenizer = RegexpTokenizer(r"[\w.@:/]+|\w+|\$[\d.]+") # Get the set of word types for text and hypothesis self.text_tokens = tokenizer.tokenize(rtepair.text) self.hyp_tokens = tokenizer.tokenize(rtepair.hyp) self.text_words = set(self.text_tokens) self.hyp_words = set(self.hyp_tokens) if use_lemmatize: self.text_words = {self._lemmatize(token) for token in self.text_tokens} self.hyp_words = {self._lemmatize(token) for token in self.hyp_tokens} if self.stop: self.text_words = self.text_words - self.stopwords self.hyp_words = self.hyp_words - self.stopwords self._overlap = self.hyp_words & self.text_words self._hyp_extra = self.hyp_words - self.text_words self._txt_extra = self.text_words - self.hyp_words def overlap(self, toktype, debug=False): """ Compute the overlap between text and hypothesis. :param toktype: distinguish Named Entities from ordinary words :type toktype: 'ne' or 'word' """ ne_overlap = {token for token in self._overlap if self._ne(token)} if toktype == "ne": if debug: print("ne overlap", ne_overlap) return ne_overlap elif toktype == "word": if debug: print("word overlap", self._overlap - ne_overlap) return self._overlap - ne_overlap else: raise ValueError("Type not recognized:'%s'" % toktype) def hyp_extra(self, toktype, debug=True): """ Compute the extraneous material in the hypothesis. :param toktype: distinguish Named Entities from ordinary words :type toktype: 'ne' or 'word' """ ne_extra = {token for token in self._hyp_extra if self._ne(token)} if toktype == "ne": return ne_extra elif toktype == "word": return self._hyp_extra - ne_extra else: raise ValueError("Type not recognized: '%s'" % toktype) @staticmethod def _ne(token): """ This just assumes that words in all caps or titles are named entities. :type token: str """ if token.istitle() or token.isupper(): return True return False @staticmethod def _lemmatize(word): """ Use morphy from WordNet to find the base form of verbs. """ lemma = nltk.corpus.wordnet.morphy(word, pos=nltk.corpus.wordnet.VERB) if lemma is not None: return lemma return word def rte_features(rtepair): extractor = RTEFeatureExtractor(rtepair) features = {} features["alwayson"] = True features["word_overlap"] = len(extractor.overlap("word")) features["word_hyp_extra"] = len(extractor.hyp_extra("word")) features["ne_overlap"] = len(extractor.overlap("ne")) features["ne_hyp_extra"] = len(extractor.hyp_extra("ne")) features["neg_txt"] = len(extractor.negwords & extractor.text_words) features["neg_hyp"] = len(extractor.negwords & extractor.hyp_words) return features def rte_featurize(rte_pairs): return [(rte_features(pair), pair.value) for pair in rte_pairs] def rte_classifier(algorithm, sample_N=None): from nltk.corpus import rte as rte_corpus train_set = rte_corpus.pairs(["rte1_dev.xml", "rte2_dev.xml", "rte3_dev.xml"]) test_set = rte_corpus.pairs(["rte1_test.xml", "rte2_test.xml", "rte3_test.xml"]) if sample_N is not None: train_set = train_set[:sample_N] test_set = test_set[:sample_N] featurized_train_set = rte_featurize(train_set) featurized_test_set = rte_featurize(test_set) # Train the classifier print("Training classifier...") if algorithm in ["megam"]: # MEGAM based algorithms. 
clf = MaxentClassifier.train(featurized_train_set, algorithm) elif algorithm in ["GIS", "IIS"]: # Use default GIS/IIS MaxEnt algorithm clf = MaxentClassifier.train(featurized_train_set, algorithm) else: err_msg = str( "RTEClassifier only supports these algorithms:\n " "'megam', 'GIS', 'IIS'.\n" ) raise Exception(err_msg) print("Testing classifier...") acc = accuracy(clf, featurized_test_set) print("Accuracy: %6.4f" % acc) return clf nltk-3.7/nltk/classify/scikitlearn.py000066400000000000000000000124351420073152400200050ustar00rootroot00000000000000# Natural Language Toolkit: Interface to scikit-learn classifiers # # Author: Lars Buitinck # URL: # For license information, see LICENSE.TXT """ scikit-learn (https://scikit-learn.org) is a machine learning library for Python. It supports many classification algorithms, including SVMs, Naive Bayes, logistic regression (MaxEnt) and decision trees. This package implements a wrapper around scikit-learn classifiers. To use this wrapper, construct a scikit-learn estimator object, then use that to construct a SklearnClassifier. E.g., to wrap a linear SVM with default settings: >>> from sklearn.svm import LinearSVC >>> from nltk.classify.scikitlearn import SklearnClassifier >>> classif = SklearnClassifier(LinearSVC()) A scikit-learn classifier may include preprocessing steps when it's wrapped in a Pipeline object. The following constructs and wraps a Naive Bayes text classifier with tf-idf weighting and chi-square feature selection to get the best 1000 features: >>> from sklearn.feature_extraction.text import TfidfTransformer >>> from sklearn.feature_selection import SelectKBest, chi2 >>> from sklearn.naive_bayes import MultinomialNB >>> from sklearn.pipeline import Pipeline >>> pipeline = Pipeline([('tfidf', TfidfTransformer()), ... ('chi2', SelectKBest(chi2, k=1000)), ... ('nb', MultinomialNB())]) >>> classif = SklearnClassifier(pipeline) """ from nltk.classify.api import ClassifierI from nltk.probability import DictionaryProbDist try: from sklearn.feature_extraction import DictVectorizer from sklearn.preprocessing import LabelEncoder except ImportError: pass __all__ = ["SklearnClassifier"] class SklearnClassifier(ClassifierI): """Wrapper for scikit-learn classifiers.""" def __init__(self, estimator, dtype=float, sparse=True): """ :param estimator: scikit-learn classifier object. :param dtype: data type used when building feature array. scikit-learn estimators work exclusively on numeric data. The default value should be fine for almost all situations. :param sparse: Whether to use sparse matrices internally. The estimator must support these; not all scikit-learn classifiers do (see their respective documentation and look for "sparse matrix"). The default value is True, since most NLP problems involve sparse feature sets. Setting this to False may take a great amount of memory. :type sparse: boolean. """ self._clf = estimator self._encoder = LabelEncoder() self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse) def __repr__(self): return "" % self._clf def classify_many(self, featuresets): """Classify a batch of samples. :param featuresets: An iterable over featuresets, each a dict mapping strings to either numbers, booleans or strings. :return: The predicted class label for each input sample. :rtype: list """ X = self._vectorizer.transform(featuresets) classes = self._encoder.classes_ return [classes[i] for i in self._clf.predict(X)] def prob_classify_many(self, featuresets): """Compute per-class probabilities for a batch of samples. 
:param featuresets: An iterable over featuresets, each a dict mapping strings to either numbers, booleans or strings. :rtype: list of ``ProbDistI`` """ X = self._vectorizer.transform(featuresets) y_proba_list = self._clf.predict_proba(X) return [self._make_probdist(y_proba) for y_proba in y_proba_list] def labels(self): """The class labels used by this classifier. :rtype: list """ return list(self._encoder.classes_) def train(self, labeled_featuresets): """ Train (fit) the scikit-learn estimator. :param labeled_featuresets: A list of ``(featureset, label)`` where each ``featureset`` is a dict mapping strings to either numbers, booleans or strings. """ X, y = list(zip(*labeled_featuresets)) X = self._vectorizer.fit_transform(X) y = self._encoder.fit_transform(y) self._clf.fit(X, y) return self def _make_probdist(self, y_proba): classes = self._encoder.classes_ return DictionaryProbDist({classes[i]: p for i, p in enumerate(y_proba)}) if __name__ == "__main__": from sklearn.linear_model import LogisticRegression from sklearn.naive_bayes import BernoulliNB from nltk.classify.util import names_demo, names_demo_features # Bernoulli Naive Bayes is designed for binary classification. We set the # binarize option to False since we know we're passing boolean features. print("scikit-learn Naive Bayes:") names_demo( SklearnClassifier(BernoulliNB(binarize=False)).train, features=names_demo_features, ) # The C parameter on logistic regression (MaxEnt) controls regularization. # The higher it's set, the less regularized the classifier is. print("\n\nscikit-learn logistic regression:") names_demo( SklearnClassifier(LogisticRegression(C=1000)).train, features=names_demo_features, ) nltk-3.7/nltk/classify/senna.py000066400000000000000000000151101420073152400165720ustar00rootroot00000000000000# Natural Language Toolkit: Senna Interface # # Copyright (C) 2001-2022 NLTK Project # Author: Rami Al-Rfou' # URL: # For license information, see LICENSE.TXT """ A general interface to the SENNA pipeline that supports any of the operations specified in SUPPORTED_OPERATIONS. Applying multiple operations at once has the speed advantage. For example, Senna will automatically determine POS tags if you are extracting named entities. Applying both of the operations will cost only the time of extracting the named entities. The SENNA pipeline has a fixed maximum size of the sentences that it can read. By default it is 1024 token/sentence. If you have larger sentences, changing the MAX_SENTENCE_SIZE value in SENNA_main.c should be considered and your system specific binary should be rebuilt. Otherwise this could introduce misalignment errors. The input is: - path to the directory that contains SENNA executables. If the path is incorrect, Senna will automatically search for executable file specified in SENNA environment variable - List of the operations needed to be performed. 
- (optionally) the encoding of the input data (default:utf-8) Note: Unit tests for this module can be found in test/unit/test_senna.py >>> from nltk.classify import Senna >>> pipeline = Senna('/usr/share/senna-v3.0', ['pos', 'chk', 'ner']) >>> sent = 'Dusseldorf is an international business center'.split() >>> [(token['word'], token['chk'], token['ner'], token['pos']) for token in pipeline.tag(sent)] # doctest: +SKIP [('Dusseldorf', 'B-NP', 'B-LOC', 'NNP'), ('is', 'B-VP', 'O', 'VBZ'), ('an', 'B-NP', 'O', 'DT'), ('international', 'I-NP', 'O', 'JJ'), ('business', 'I-NP', 'O', 'NN'), ('center', 'I-NP', 'O', 'NN')] """ from os import environ, path, sep from platform import architecture, system from subprocess import PIPE, Popen from nltk.tag.api import TaggerI class Senna(TaggerI): SUPPORTED_OPERATIONS = ["pos", "chk", "ner"] def __init__(self, senna_path, operations, encoding="utf-8"): self._encoding = encoding self._path = path.normpath(senna_path) + sep # Verifies the existence of the executable on the self._path first # senna_binary_file_1 = self.executable(self._path) exe_file_1 = self.executable(self._path) if not path.isfile(exe_file_1): # Check for the system environment if "SENNA" in environ: # self._path = path.join(environ['SENNA'],'') self._path = path.normpath(environ["SENNA"]) + sep exe_file_2 = self.executable(self._path) if not path.isfile(exe_file_2): raise OSError( "Senna executable expected at %s or %s but not found" % (exe_file_1, exe_file_2) ) self.operations = operations def executable(self, base_path): """ The function that determines the system specific binary that should be used in the pipeline. In case, the system is not known the default senna binary will be used. """ os_name = system() if os_name == "Linux": bits = architecture()[0] if bits == "64bit": return path.join(base_path, "senna-linux64") return path.join(base_path, "senna-linux32") if os_name == "Windows": return path.join(base_path, "senna-win32.exe") if os_name == "Darwin": return path.join(base_path, "senna-osx") return path.join(base_path, "senna") def _map(self): """ A method that calculates the order of the columns that SENNA pipeline will output the tags into. This depends on the operations being ordered. """ _map = {} i = 1 for operation in Senna.SUPPORTED_OPERATIONS: if operation in self.operations: _map[operation] = i i += 1 return _map def tag(self, tokens): """ Applies the specified operation(s) on a list of tokens. """ return self.tag_sents([tokens])[0] def tag_sents(self, sentences): """ Applies the tag method over a list of sentences. This method will return a list of dictionaries. Every dictionary will contain a word with its calculated annotations/tags. """ encoding = self._encoding if not path.isfile(self.executable(self._path)): raise OSError( "Senna executable expected at %s but not found" % self.executable(self._path) ) # Build the senna command to run the tagger _senna_cmd = [ self.executable(self._path), "-path", self._path, "-usrtokens", "-iobtags", ] _senna_cmd.extend(["-" + op for op in self.operations]) # Serialize the actual sentences to a temporary string _input = "\n".join(" ".join(x) for x in sentences) + "\n" if isinstance(_input, str) and encoding: _input = _input.encode(encoding) # Run the tagger and get the output p = Popen(_senna_cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE) (stdout, stderr) = p.communicate(input=_input) senna_output = stdout # Check the return code. if p.returncode != 0: raise RuntimeError("Senna command failed! 
Details: %s" % stderr) if encoding: senna_output = stdout.decode(encoding) # Output the tagged sentences map_ = self._map() tagged_sentences = [[]] sentence_index = 0 token_index = 0 for tagged_word in senna_output.strip().split("\n"): if not tagged_word: tagged_sentences.append([]) sentence_index += 1 token_index = 0 continue tags = tagged_word.split("\t") result = {} for tag in map_: result[tag] = tags[map_[tag]].strip() try: result["word"] = sentences[sentence_index][token_index] except IndexError as e: raise IndexError( "Misalignment error occurred at sentence number %d. Possible reason" " is that the sentence size exceeded the maximum size. Check the " "documentation of Senna class for more information." % sentence_index ) from e tagged_sentences[-1].append(result) token_index += 1 return tagged_sentences nltk-3.7/nltk/classify/svm.py000066400000000000000000000007741420073152400163050ustar00rootroot00000000000000# Natural Language Toolkit: SVM-based classifier # # Copyright (C) 2001-2022 NLTK Project # Author: Leon Derczynski # # URL: # For license information, see LICENSE.TXT """ nltk.classify.svm was deprecated. For classification based on support vector machines SVMs use nltk.classify.scikitlearn (or `scikit-learn `_ directly). """ class SvmClassifier: def __init__(self, *args, **kwargs): raise NotImplementedError(__doc__) nltk-3.7/nltk/classify/tadm.py000066400000000000000000000065511420073152400164240ustar00rootroot00000000000000# Natural Language Toolkit: Interface to TADM Classifier # # Copyright (C) 2001-2022 NLTK Project # Author: Joseph Frazee # URL: # For license information, see LICENSE.TXT import subprocess import sys from nltk.internals import find_binary try: import numpy except ImportError: pass _tadm_bin = None def config_tadm(bin=None): global _tadm_bin _tadm_bin = find_binary( "tadm", bin, env_vars=["TADM"], binary_names=["tadm"], url="http://tadm.sf.net" ) def write_tadm_file(train_toks, encoding, stream): """ Generate an input file for ``tadm`` based on the given corpus of classified tokens. :type train_toks: list(tuple(dict, str)) :param train_toks: Training data, represented as a list of pairs, the first member of which is a feature dictionary, and the second of which is a classification label. :type encoding: TadmEventMaxentFeatureEncoding :param encoding: A feature encoding, used to convert featuresets into feature vectors. :type stream: stream :param stream: The stream to which the ``tadm`` input file should be written. """ # See the following for a file format description: # # https://sf.net/forum/forum.php?thread_id=1391502&forum_id=473054 # https://sf.net/forum/forum.php?thread_id=1675097&forum_id=473054 labels = encoding.labels() for featureset, label in train_toks: length_line = "%d\n" % len(labels) stream.write(length_line) for known_label in labels: v = encoding.encode(featureset, known_label) line = "%d %d %s\n" % ( int(label == known_label), len(v), " ".join("%d %d" % u for u in v), ) stream.write(line) def parse_tadm_weights(paramfile): """ Given the stdout output generated by ``tadm`` when training a model, return a ``numpy`` array containing the corresponding weight vector. """ weights = [] for line in paramfile: weights.append(float(line.strip())) return numpy.array(weights, "d") def call_tadm(args): """ Call the ``tadm`` binary with the given arguments. 
""" if isinstance(args, str): raise TypeError("args should be a list of strings") if _tadm_bin is None: config_tadm() # Call tadm via a subprocess cmd = [_tadm_bin] + args p = subprocess.Popen(cmd, stdout=sys.stdout) (stdout, stderr) = p.communicate() # Check the return code. if p.returncode != 0: print() print(stderr) raise OSError("tadm command failed!") def names_demo(): from nltk.classify.maxent import TadmMaxentClassifier from nltk.classify.util import names_demo classifier = names_demo(TadmMaxentClassifier.train) def encoding_demo(): import sys from nltk.classify.maxent import TadmEventMaxentFeatureEncoding tokens = [ ({"f0": 1, "f1": 1, "f3": 1}, "A"), ({"f0": 1, "f2": 1, "f4": 1}, "B"), ({"f0": 2, "f2": 1, "f3": 1, "f4": 1}, "A"), ] encoding = TadmEventMaxentFeatureEncoding.train(tokens) write_tadm_file(tokens, encoding, sys.stdout) print() for i in range(encoding.length()): print("%s --> %d" % (encoding.describe(i), i)) print() if __name__ == "__main__": encoding_demo() names_demo() nltk-3.7/nltk/classify/textcat.py000066400000000000000000000133161420073152400171500ustar00rootroot00000000000000# Natural Language Toolkit: Language ID module using TextCat algorithm # # Copyright (C) 2001-2022 NLTK Project # Author: Avital Pekker # # URL: # For license information, see LICENSE.TXT """ A module for language identification using the TextCat algorithm. An implementation of the text categorization algorithm presented in Cavnar, W. B. and J. M. Trenkle, "N-Gram-Based Text Categorization". The algorithm takes advantage of Zipf's law and uses n-gram frequencies to profile languages and text-yet to be identified-then compares using a distance measure. Language n-grams are provided by the "An Crubadan" project. A corpus reader was created separately to read those files. For details regarding the algorithm, see: https://www.let.rug.nl/~vannoord/TextCat/textcat.pdf For details about An Crubadan, see: https://borel.slu.edu/crubadan/index.html """ from sys import maxsize from nltk.util import trigrams # Note: this is NOT "re" you're likely used to. The regex module # is an alternative to the standard re module that supports # Unicode codepoint properties with the \p{} syntax. # You may have to "pip install regx" try: import regex as re except ImportError: re = None ###################################################################### ## Language identification using TextCat ###################################################################### class TextCat: _corpus = None fingerprints = {} _START_CHAR = "<" _END_CHAR = ">" last_distances = {} def __init__(self): if not re: raise OSError( "classify.textcat requires the regex module that " "supports unicode. Try '$ pip install regex' and " "see https://pypi.python.org/pypi/regex for " "further details." 
) from nltk.corpus import crubadan self._corpus = crubadan # Load all language ngrams into cache for lang in self._corpus.langs(): self._corpus.lang_freq(lang) def remove_punctuation(self, text): """Get rid of punctuation except apostrophes""" return re.sub(r"[^\P{P}\']+", "", text) def profile(self, text): """Create FreqDist of trigrams within text""" from nltk import FreqDist, word_tokenize clean_text = self.remove_punctuation(text) tokens = word_tokenize(clean_text) fingerprint = FreqDist() for t in tokens: token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR) token_trigrams = ["".join(tri) for tri in token_trigram_tuples] for cur_trigram in token_trigrams: if cur_trigram in fingerprint: fingerprint[cur_trigram] += 1 else: fingerprint[cur_trigram] = 1 return fingerprint def calc_dist(self, lang, trigram, text_profile): """Calculate the "out-of-place" measure between the text and language profile for a single trigram""" lang_fd = self._corpus.lang_freq(lang) dist = 0 if trigram in lang_fd: idx_lang_profile = list(lang_fd.keys()).index(trigram) idx_text = list(text_profile.keys()).index(trigram) # print(idx_lang_profile, ", ", idx_text) dist = abs(idx_lang_profile - idx_text) else: # Arbitrary but should be larger than # any possible trigram file length # in terms of total lines dist = maxsize return dist def lang_dists(self, text): """Calculate the "out-of-place" measure between the text and all languages""" distances = {} profile = self.profile(text) # For all the languages for lang in self._corpus._all_lang_freq.keys(): # Calculate distance metric for every trigram in # input text to be identified lang_dist = 0 for trigram in profile: lang_dist += self.calc_dist(lang, trigram, profile) distances[lang] = lang_dist return distances def guess_language(self, text): """Find the language with the min distance to the text and return its ISO 639-3 code""" self.last_distances = self.lang_dists(text) return min(self.last_distances, key=self.last_distances.get) #################################################') def demo(): from nltk.corpus import udhr langs = [ "Kurdish-UTF8", "Abkhaz-UTF8", "Farsi_Persian-UTF8", "Hindi-UTF8", "Hawaiian-UTF8", "Russian-UTF8", "Vietnamese-UTF8", "Serbian_Srpski-UTF8", "Esperanto-UTF8", ] friendly = { "kmr": "Northern Kurdish", "abk": "Abkhazian", "pes": "Iranian Persian", "hin": "Hindi", "haw": "Hawaiian", "rus": "Russian", "vie": "Vietnamese", "srp": "Serbian", "epo": "Esperanto", } tc = TextCat() for cur_lang in langs: # Get raw data from UDHR corpus raw_sentences = udhr.sents(cur_lang) rows = len(raw_sentences) - 1 cols = list(map(len, raw_sentences)) sample = "" # Generate a sample text of the language for i in range(0, rows): cur_sent = "" for j in range(0, cols[i]): cur_sent += " " + raw_sentences[i][j] sample += cur_sent # Try to detect what it is print("Language snippet: " + sample[0:140] + "...") guess = tc.guess_language(sample) print(f"Language detection: {guess} ({friendly[guess]})") print("#" * 140) if __name__ == "__main__": demo() nltk-3.7/nltk/classify/util.py000066400000000000000000000275231420073152400164560ustar00rootroot00000000000000# Natural Language Toolkit: Classifier Utility Functions # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # Steven Bird (minor additions) # URL: # For license information, see LICENSE.TXT """ Utility functions and classes for classifiers. 
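Example (an illustrative sketch; the feature extractor and the tiny name
list are made up for demonstration)::

    >>> def gender_features(word):
    ...     return {"last_letter": word[-1]}
    >>> labeled_names = [("Anna", "female"), ("Bob", "male")]
    >>> featuresets = apply_features(gender_features, labeled_names, labeled=True)
    >>> featuresets[0]
    ({'last_letter': 'a'}, 'female')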
""" import math # from nltk.util import Deprecated import nltk.classify.util # for accuracy & log_likelihood from nltk.util import LazyMap ###################################################################### # { Helper Functions ###################################################################### # alternative name possibility: 'map_featurefunc()'? # alternative name possibility: 'detect_features()'? # alternative name possibility: 'map_featuredetect()'? # or.. just have users use LazyMap directly? def apply_features(feature_func, toks, labeled=None): """ Use the ``LazyMap`` class to construct a lazy list-like object that is analogous to ``map(feature_func, toks)``. In particular, if ``labeled=False``, then the returned list-like object's values are equal to:: [feature_func(tok) for tok in toks] If ``labeled=True``, then the returned list-like object's values are equal to:: [(feature_func(tok), label) for (tok, label) in toks] The primary purpose of this function is to avoid the memory overhead involved in storing all the featuresets for every token in a corpus. Instead, these featuresets are constructed lazily, as-needed. The reduction in memory overhead can be especially significant when the underlying list of tokens is itself lazy (as is the case with many corpus readers). :param feature_func: The function that will be applied to each token. It should return a featureset -- i.e., a dict mapping feature names to feature values. :param toks: The list of tokens to which ``feature_func`` should be applied. If ``labeled=True``, then the list elements will be passed directly to ``feature_func()``. If ``labeled=False``, then the list elements should be tuples ``(tok,label)``, and ``tok`` will be passed to ``feature_func()``. :param labeled: If true, then ``toks`` contains labeled tokens -- i.e., tuples of the form ``(tok, label)``. (Default: auto-detect based on types.) """ if labeled is None: labeled = toks and isinstance(toks[0], (tuple, list)) if labeled: def lazy_func(labeled_token): return (feature_func(labeled_token[0]), labeled_token[1]) return LazyMap(lazy_func, toks) else: return LazyMap(feature_func, toks) def attested_labels(tokens): """ :return: A list of all labels that are attested in the given list of tokens. :rtype: list of (immutable) :param tokens: The list of classified tokens from which to extract labels. A classified token has the form ``(token, label)``. :type tokens: list """ return tuple({label for (tok, label) in tokens}) def log_likelihood(classifier, gold): results = classifier.prob_classify_many([fs for (fs, l) in gold]) ll = [pdist.prob(l) for ((fs, l), pdist) in zip(gold, results)] return math.log(sum(ll) / len(ll)) def accuracy(classifier, gold): results = classifier.classify_many([fs for (fs, l) in gold]) correct = [l == r for ((fs, l), r) in zip(gold, results)] if correct: return sum(correct) / len(correct) else: return 0 class CutoffChecker: """ A helper class that implements cutoff checks based on number of iterations and log likelihood. Accuracy cutoffs are also implemented, but they're almost never a good idea to use. """ def __init__(self, cutoffs): self.cutoffs = cutoffs.copy() if "min_ll" in cutoffs: cutoffs["min_ll"] = -abs(cutoffs["min_ll"]) if "min_lldelta" in cutoffs: cutoffs["min_lldelta"] = abs(cutoffs["min_lldelta"]) self.ll = None self.acc = None self.iter = 1 def check(self, classifier, train_toks): cutoffs = self.cutoffs self.iter += 1 if "max_iter" in cutoffs and self.iter >= cutoffs["max_iter"]: return True # iteration cutoff. 
new_ll = nltk.classify.util.log_likelihood(classifier, train_toks) if math.isnan(new_ll): return True if "min_ll" in cutoffs or "min_lldelta" in cutoffs: if "min_ll" in cutoffs and new_ll >= cutoffs["min_ll"]: return True # log likelihood cutoff if ( "min_lldelta" in cutoffs and self.ll and ((new_ll - self.ll) <= abs(cutoffs["min_lldelta"])) ): return True # log likelihood delta cutoff self.ll = new_ll if "max_acc" in cutoffs or "min_accdelta" in cutoffs: new_acc = nltk.classify.util.log_likelihood(classifier, train_toks) if "max_acc" in cutoffs and new_acc >= cutoffs["max_acc"]: return True # log likelihood cutoff if ( "min_accdelta" in cutoffs and self.acc and ((new_acc - self.acc) <= abs(cutoffs["min_accdelta"])) ): return True # log likelihood delta cutoff self.acc = new_acc return False # no cutoff reached. ###################################################################### # { Demos ###################################################################### def names_demo_features(name): features = {} features["alwayson"] = True features["startswith"] = name[0].lower() features["endswith"] = name[-1].lower() for letter in "abcdefghijklmnopqrstuvwxyz": features["count(%s)" % letter] = name.lower().count(letter) features["has(%s)" % letter] = letter in name.lower() return features def binary_names_demo_features(name): features = {} features["alwayson"] = True features["startswith(vowel)"] = name[0].lower() in "aeiouy" features["endswith(vowel)"] = name[-1].lower() in "aeiouy" for letter in "abcdefghijklmnopqrstuvwxyz": features["count(%s)" % letter] = name.lower().count(letter) features["has(%s)" % letter] = letter in name.lower() features["startswith(%s)" % letter] = letter == name[0].lower() features["endswith(%s)" % letter] = letter == name[-1].lower() return features def names_demo(trainer, features=names_demo_features): import random from nltk.corpus import names # Construct a list of classified names, using the names corpus. namelist = [(name, "male") for name in names.words("male.txt")] + [ (name, "female") for name in names.words("female.txt") ] # Randomly split the names into a test & train set. random.seed(123456) random.shuffle(namelist) train = namelist[:5000] test = namelist[5000:5500] # Train up a classifier. print("Training classifier...") classifier = trainer([(features(n), g) for (n, g) in train]) # Run the classifier on the test data. print("Testing classifier...") acc = accuracy(classifier, [(features(n), g) for (n, g) in test]) print("Accuracy: %6.4f" % acc) # For classifiers that can find probabilities, show the log # likelihood and some sample probability distributions. try: test_featuresets = [features(n) for (n, g) in test] pdists = classifier.prob_classify_many(test_featuresets) ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)] print("Avg. 
log likelihood: %6.4f" % (sum(ll) / len(test))) print() print("Unseen Names P(Male) P(Female)\n" + "-" * 40) for ((name, gender), pdist) in list(zip(test, pdists))[:5]: if gender == "male": fmt = " %-15s *%6.4f %6.4f" else: fmt = " %-15s %6.4f *%6.4f" print(fmt % (name, pdist.prob("male"), pdist.prob("female"))) except NotImplementedError: pass # Return the classifier return classifier def partial_names_demo(trainer, features=names_demo_features): import random from nltk.corpus import names male_names = names.words("male.txt") female_names = names.words("female.txt") random.seed(654321) random.shuffle(male_names) random.shuffle(female_names) # Create a list of male names to be used as positive-labeled examples for training positive = map(features, male_names[:2000]) # Create a list of male and female names to be used as unlabeled examples unlabeled = map(features, male_names[2000:2500] + female_names[:500]) # Create a test set with correctly-labeled male and female names test = [(name, True) for name in male_names[2500:2750]] + [ (name, False) for name in female_names[500:750] ] random.shuffle(test) # Train up a classifier. print("Training classifier...") classifier = trainer(positive, unlabeled) # Run the classifier on the test data. print("Testing classifier...") acc = accuracy(classifier, [(features(n), m) for (n, m) in test]) print("Accuracy: %6.4f" % acc) # For classifiers that can find probabilities, show the log # likelihood and some sample probability distributions. try: test_featuresets = [features(n) for (n, m) in test] pdists = classifier.prob_classify_many(test_featuresets) ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)] print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test))) print() print("Unseen Names P(Male) P(Female)\n" + "-" * 40) for ((name, is_male), pdist) in zip(test, pdists)[:5]: if is_male == True: fmt = " %-15s *%6.4f %6.4f" else: fmt = " %-15s %6.4f *%6.4f" print(fmt % (name, pdist.prob(True), pdist.prob(False))) except NotImplementedError: pass # Return the classifier return classifier _inst_cache = {} def wsd_demo(trainer, word, features, n=1000): import random from nltk.corpus import senseval # Get the instances. print("Reading data...") global _inst_cache if word not in _inst_cache: _inst_cache[word] = [(i, i.senses[0]) for i in senseval.instances(word)] instances = _inst_cache[word][:] if n > len(instances): n = len(instances) senses = list({l for (i, l) in instances}) print(" Senses: " + " ".join(senses)) # Randomly split the names into a test & train set. print("Splitting into test & train...") random.seed(123456) random.shuffle(instances) train = instances[: int(0.8 * n)] test = instances[int(0.8 * n) : n] # Train up a classifier. print("Training classifier...") classifier = trainer([(features(i), l) for (i, l) in train]) # Run the classifier on the test data. print("Testing classifier...") acc = accuracy(classifier, [(features(i), l) for (i, l) in test]) print("Accuracy: %6.4f" % acc) # For classifiers that can find probabilities, show the log # likelihood and some sample probability distributions. try: test_featuresets = [features(i) for (i, n) in test] pdists = classifier.prob_classify_many(test_featuresets) ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)] print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test))) except NotImplementedError: pass # Return the classifier return classifier def check_megam_config(): """ Checks whether the MEGAM binary is configured. 
""" try: _megam_bin except NameError as e: err_msg = str( "Please configure your megam binary first, e.g.\n" ">>> nltk.config_megam('/usr/bin/local/megam')" ) raise NameError(err_msg) from e nltk-3.7/nltk/classify/weka.py000066400000000000000000000304211420073152400164170ustar00rootroot00000000000000# Natural Language Toolkit: Interface to Weka Classsifiers # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # URL: # For license information, see LICENSE.TXT """ Classifiers that make use of the external 'Weka' package. """ import os import re import subprocess import tempfile import time import zipfile from sys import stdin from nltk.classify.api import ClassifierI from nltk.internals import config_java, java from nltk.probability import DictionaryProbDist _weka_classpath = None _weka_search = [ ".", "/usr/share/weka", "/usr/local/share/weka", "/usr/lib/weka", "/usr/local/lib/weka", ] def config_weka(classpath=None): global _weka_classpath # Make sure java's configured first. config_java() if classpath is not None: _weka_classpath = classpath if _weka_classpath is None: searchpath = _weka_search if "WEKAHOME" in os.environ: searchpath.insert(0, os.environ["WEKAHOME"]) for path in searchpath: if os.path.exists(os.path.join(path, "weka.jar")): _weka_classpath = os.path.join(path, "weka.jar") version = _check_weka_version(_weka_classpath) if version: print(f"[Found Weka: {_weka_classpath} (version {version})]") else: print("[Found Weka: %s]" % _weka_classpath) _check_weka_version(_weka_classpath) if _weka_classpath is None: raise LookupError( "Unable to find weka.jar! Use config_weka() " "or set the WEKAHOME environment variable. " "For more information about Weka, please see " "https://www.cs.waikato.ac.nz/ml/weka/" ) def _check_weka_version(jar): try: zf = zipfile.ZipFile(jar) except (SystemExit, KeyboardInterrupt): raise except: return None try: try: return zf.read("weka/core/version.txt") except KeyError: return None finally: zf.close() class WekaClassifier(ClassifierI): def __init__(self, formatter, model_filename): self._formatter = formatter self._model = model_filename def prob_classify_many(self, featuresets): return self._classify_many(featuresets, ["-p", "0", "-distribution"]) def classify_many(self, featuresets): return self._classify_many(featuresets, ["-p", "0"]) def _classify_many(self, featuresets, options): # Make sure we can find java & weka. config_weka() temp_dir = tempfile.mkdtemp() try: # Write the test data file. test_filename = os.path.join(temp_dir, "test.arff") self._formatter.write(test_filename, featuresets) # Call weka to classify the data. cmd = [ "weka.classifiers.bayes.NaiveBayes", "-l", self._model, "-T", test_filename, ] + options (stdout, stderr) = java( cmd, classpath=_weka_classpath, stdout=subprocess.PIPE, stderr=subprocess.PIPE, ) # Check if something went wrong: if stderr and not stdout: if "Illegal options: -distribution" in stderr: raise ValueError( "The installed version of weka does " "not support probability distribution " "output." ) else: raise ValueError("Weka failed to generate output:\n%s" % stderr) # Parse weka's output. 
return self.parse_weka_output(stdout.decode(stdin.encoding).split("\n")) finally: for f in os.listdir(temp_dir): os.remove(os.path.join(temp_dir, f)) os.rmdir(temp_dir) def parse_weka_distribution(self, s): probs = [float(v) for v in re.split("[*,]+", s) if v.strip()] probs = dict(zip(self._formatter.labels(), probs)) return DictionaryProbDist(probs) def parse_weka_output(self, lines): # Strip unwanted text from stdout for i, line in enumerate(lines): if line.strip().startswith("inst#"): lines = lines[i:] break if lines[0].split() == ["inst#", "actual", "predicted", "error", "prediction"]: return [line.split()[2].split(":")[1] for line in lines[1:] if line.strip()] elif lines[0].split() == [ "inst#", "actual", "predicted", "error", "distribution", ]: return [ self.parse_weka_distribution(line.split()[-1]) for line in lines[1:] if line.strip() ] # is this safe:? elif re.match(r"^0 \w+ [01]\.[0-9]* \?\s*$", lines[0]): return [line.split()[1] for line in lines if line.strip()] else: for line in lines[:10]: print(line) raise ValueError( "Unhandled output format -- your version " "of weka may not be supported.\n" " Header: %s" % lines[0] ) # [xx] full list of classifiers (some may be abstract?): # ADTree, AODE, BayesNet, ComplementNaiveBayes, ConjunctiveRule, # DecisionStump, DecisionTable, HyperPipes, IB1, IBk, Id3, J48, # JRip, KStar, LBR, LeastMedSq, LinearRegression, LMT, Logistic, # LogisticBase, M5Base, MultilayerPerceptron, # MultipleClassifiersCombiner, NaiveBayes, NaiveBayesMultinomial, # NaiveBayesSimple, NBTree, NNge, OneR, PaceRegression, PART, # PreConstructedLinearModel, Prism, RandomForest, # RandomizableClassifier, RandomTree, RBFNetwork, REPTree, Ridor, # RuleNode, SimpleLinearRegression, SimpleLogistic, # SingleClassifierEnhancer, SMO, SMOreg, UserClassifier, VFI, # VotedPerceptron, Winnow, ZeroR _CLASSIFIER_CLASS = { "naivebayes": "weka.classifiers.bayes.NaiveBayes", "C4.5": "weka.classifiers.trees.J48", "log_regression": "weka.classifiers.functions.Logistic", "svm": "weka.classifiers.functions.SMO", "kstar": "weka.classifiers.lazy.KStar", "ripper": "weka.classifiers.rules.JRip", } @classmethod def train( cls, model_filename, featuresets, classifier="naivebayes", options=[], quiet=True, ): # Make sure we can find java & weka. config_weka() # Build an ARFF formatter. formatter = ARFF_Formatter.from_train(featuresets) temp_dir = tempfile.mkdtemp() try: # Write the training data file. train_filename = os.path.join(temp_dir, "train.arff") formatter.write(train_filename, featuresets) if classifier in cls._CLASSIFIER_CLASS: javaclass = cls._CLASSIFIER_CLASS[classifier] elif classifier in cls._CLASSIFIER_CLASS.values(): javaclass = classifier else: raise ValueError("Unknown classifier %s" % classifier) # Train the weka model. cmd = [javaclass, "-d", model_filename, "-t", train_filename] cmd += list(options) if quiet: stdout = subprocess.PIPE else: stdout = None java(cmd, classpath=_weka_classpath, stdout=stdout) # Return the new classifier. return WekaClassifier(formatter, model_filename) finally: for f in os.listdir(temp_dir): os.remove(os.path.join(temp_dir, f)) os.rmdir(temp_dir) class ARFF_Formatter: """ Converts featuresets and labeled featuresets to ARFF-formatted strings, appropriate for input into Weka. Features and classes can be specified manually in the constructor, or may be determined from data using ``from_train``. """ def __init__(self, labels, features): """ :param labels: A list of all class labels that can be generated. 
:param features: A list of feature specifications, where each feature specification is a tuple (fname, ftype); and ftype is an ARFF type string such as NUMERIC or STRING. """ self._labels = labels self._features = features def format(self, tokens): """Returns a string representation of ARFF output for the given data.""" return self.header_section() + self.data_section(tokens) def labels(self): """Returns the list of classes.""" return list(self._labels) def write(self, outfile, tokens): """Writes ARFF data to a file for the given data.""" if not hasattr(outfile, "write"): outfile = open(outfile, "w") outfile.write(self.format(tokens)) outfile.close() @staticmethod def from_train(tokens): """ Constructs an ARFF_Formatter instance with class labels and feature types determined from the given data. Handles boolean, numeric and string (note: not nominal) types. """ # Find the set of all attested labels. labels = {label for (tok, label) in tokens} # Determine the types of all features. features = {} for tok, label in tokens: for (fname, fval) in tok.items(): if issubclass(type(fval), bool): ftype = "{True, False}" elif issubclass(type(fval), (int, float, bool)): ftype = "NUMERIC" elif issubclass(type(fval), str): ftype = "STRING" elif fval is None: continue # can't tell the type. else: raise ValueError("Unsupported value type %r" % ftype) if features.get(fname, ftype) != ftype: raise ValueError("Inconsistent type for %s" % fname) features[fname] = ftype features = sorted(features.items()) return ARFF_Formatter(labels, features) def header_section(self): """Returns an ARFF header as a string.""" # Header comment. s = ( "% Weka ARFF file\n" + "% Generated automatically by NLTK\n" + "%% %s\n\n" % time.ctime() ) # Relation name s += "@RELATION rel\n\n" # Input attribute specifications for fname, ftype in self._features: s += "@ATTRIBUTE %-30r %s\n" % (fname, ftype) # Label attribute specification s += "@ATTRIBUTE %-30r {%s}\n" % ("-label-", ",".join(self._labels)) return s def data_section(self, tokens, labeled=None): """ Returns the ARFF data section for the given data. :param tokens: a list of featuresets (dicts) or labelled featuresets which are tuples (featureset, label). :param labeled: Indicates whether the given tokens are labeled or not. If None, then the tokens will be assumed to be labeled if the first token's value is a tuple or list. """ # Check if the tokens are labeled or unlabeled. If unlabeled, # then use 'None' if labeled is None: labeled = tokens and isinstance(tokens[0], (tuple, list)) if not labeled: tokens = [(tok, None) for tok in tokens] # Data section s = "\n@DATA\n" for (tok, label) in tokens: for fname, ftype in self._features: s += "%s," % self._fmt_arff_val(tok.get(fname)) s += "%s\n" % self._fmt_arff_val(label) return s def _fmt_arff_val(self, fval): if fval is None: return "?" 
elif isinstance(fval, (bool, int)): return "%s" % fval elif isinstance(fval, float): return "%r" % fval else: return "%r" % fval if __name__ == "__main__": from nltk.classify.util import binary_names_demo_features, names_demo def make_classifier(featuresets): return WekaClassifier.train("/tmp/name.model", featuresets, "C4.5") classifier = names_demo(make_classifier, binary_names_demo_features) nltk-3.7/nltk/cli.py000066400000000000000000000034621420073152400144270ustar00rootroot00000000000000# Natural Language Toolkit: NLTK Command-Line Interface # # Copyright (C) 2001-2022 NLTK Project # URL: # For license information, see LICENSE.TXT import click from tqdm import tqdm from nltk import word_tokenize from nltk.util import parallelize_preprocess CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"]) @click.group(context_settings=CONTEXT_SETTINGS) @click.version_option() def cli(): pass @cli.command("tokenize") @click.option( "--language", "-l", default="en", help="The language for the Punkt sentence tokenization.", ) @click.option( "--preserve-line", "-l", default=True, is_flag=True, help="An option to keep the preserve the sentence and not sentence tokenize it.", ) @click.option("--processes", "-j", default=1, help="No. of processes.") @click.option("--encoding", "-e", default="utf8", help="Specify encoding of file.") @click.option( "--delimiter", "-d", default=" ", help="Specify delimiter to join the tokens." ) def tokenize_file(language, preserve_line, processes, encoding, delimiter): """This command tokenizes text stream using nltk.word_tokenize""" with click.get_text_stream("stdin", encoding=encoding) as fin: with click.get_text_stream("stdout", encoding=encoding) as fout: # If it's single process, joblib parallelization is slower, # so just process line by line normally. if processes == 1: for line in tqdm(fin.readlines()): print(delimiter.join(word_tokenize(line)), end="\n", file=fout) else: for outline in parallelize_preprocess( word_tokenize, fin.readlines(), processes, progress_bar=True ): print(delimiter.join(outline), end="\n", file=fout) nltk-3.7/nltk/cluster/000077500000000000000000000000001420073152400147625ustar00rootroot00000000000000nltk-3.7/nltk/cluster/__init__.py000066400000000000000000000102551420073152400170760ustar00rootroot00000000000000# Natural Language Toolkit: Clusterers # # Copyright (C) 2001-2022 NLTK Project # Author: Trevor Cohn # URL: # For license information, see LICENSE.TXT """ This module contains a number of basic clustering algorithms. Clustering describes the task of discovering groups of similar items with a large collection. It is also describe as unsupervised machine learning, as the data from which it learns is unannotated with class information, as is the case for supervised learning. Annotated data is difficult and expensive to obtain in the quantities required for the majority of supervised learning algorithms. This problem, the knowledge acquisition bottleneck, is common to most natural language processing tasks, thus fueling the need for quality unsupervised approaches. This module contains a k-means clusterer, E-M clusterer and a group average agglomerative clusterer (GAAC). All these clusterers involve finding good cluster groupings for a set of vectors in multi-dimensional space. The K-means clusterer starts with k arbitrary chosen means then allocates each vector to the cluster with the closest mean. It then recalculates the means of each cluster as the centroid of the vectors in the cluster. 
This process repeats until the cluster memberships stabilise. This is a hill-climbing algorithm which may converge to a local maximum. Hence the clustering is often repeated with random initial means and the most commonly occurring output means are chosen. The GAAC clusterer starts with each of the *N* vectors as singleton clusters. It then iteratively merges pairs of clusters which have the closest centroids. This continues until there is only one cluster. The order of merges gives rise to a dendrogram - a tree with the earlier merges lower than later merges. The membership of a given number of clusters *c*, *1 <= c <= N*, can be found by cutting the dendrogram at depth *c*. The Gaussian EM clusterer models the vectors as being produced by a mixture of k Gaussian sources. The parameters of these sources (prior probability, mean and covariance matrix) are then found to maximise the likelihood of the given data. This is done with the expectation maximisation algorithm. It starts with k arbitrarily chosen means, priors and covariance matrices. It then calculates the membership probabilities for each vector in each of the clusters - this is the 'E' step. The cluster parameters are then updated in the 'M' step using the maximum likelihood estimate from the cluster membership probabilities. This process continues until the likelihood of the data does not significantly increase. They all extend the ClusterI interface which defines common operations available with each clusterer. These operations include: - cluster: clusters a sequence of vectors - classify: assign a vector to a cluster - classification_probdist: give the probability distribution over cluster memberships The current existing classifiers also extend cluster.VectorSpace, an abstract class which allows for singular value decomposition (SVD) and vector normalisation. SVD is used to reduce the dimensionality of the vector space in such a manner as to preserve as much of the variation as possible, by reparameterising the axes in order of variability and discarding all bar the first d dimensions. Normalisation ensures that vectors fall in the unit hypersphere. Usage example (see also demo()):: from nltk import cluster from nltk.cluster import euclidean_distance from numpy import array vectors = [array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0]]] # initialise the clusterer (will also assign the vectors to clusters) clusterer = cluster.KMeansClusterer(2, euclidean_distance) clusterer.cluster(vectors, True) # classify a new vector print(clusterer.classify(array([3, 3]))) Note that the vectors must use numpy array-like objects. nltk_contrib.unimelb.tacohn.SparseArrays may be used for efficiency when required. """ from nltk.cluster.em import EMClusterer from nltk.cluster.gaac import GAAClusterer from nltk.cluster.kmeans import KMeansClusterer from nltk.cluster.util import ( Dendrogram, VectorSpaceClusterer, cosine_distance, euclidean_distance, ) nltk-3.7/nltk/cluster/api.py000066400000000000000000000040501420073152400161040ustar00rootroot00000000000000# Natural Language Toolkit: Clusterer Interfaces # # Copyright (C) 2001-2022 NLTK Project # Author: Trevor Cohn # Porting: Steven Bird # URL: # For license information, see LICENSE.TXT from abc import ABCMeta, abstractmethod from nltk.probability import DictionaryProbDist class ClusterI(metaclass=ABCMeta): """ Interface covering basic clustering functionality. 
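    A minimal sketch using one of the concrete implementations (here
    ``KMeansClusterer``; the 2-D vectors are arbitrary)::

        >>> from numpy import array
        >>> from nltk.cluster import KMeansClusterer, euclidean_distance
        >>> clusterer = KMeansClusterer(2, euclidean_distance, repeats=10)
        >>> clusters = clusterer.cluster(
        ...     [array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0]]],
        ...     assign_clusters=True)                        # doctest: +SKIP
        >>> clusterer.num_clusters()
        2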
""" @abstractmethod def cluster(self, vectors, assign_clusters=False): """ Assigns the vectors to clusters, learning the clustering parameters from the data. Returns a cluster identifier for each vector. """ @abstractmethod def classify(self, token): """ Classifies the token into a cluster, setting the token's CLUSTER parameter to that cluster identifier. """ def likelihood(self, vector, label): """ Returns the likelihood (a float) of the token having the corresponding cluster. """ if self.classify(vector) == label: return 1.0 else: return 0.0 def classification_probdist(self, vector): """ Classifies the token into a cluster, returning a probability distribution over the cluster identifiers. """ likelihoods = {} sum = 0.0 for cluster in self.cluster_names(): likelihoods[cluster] = self.likelihood(vector, cluster) sum += likelihoods[cluster] for cluster in self.cluster_names(): likelihoods[cluster] /= sum return DictionaryProbDist(likelihoods) @abstractmethod def num_clusters(self): """ Returns the number of clusters. """ def cluster_names(self): """ Returns the names of the clusters. :rtype: list """ return list(range(self.num_clusters())) def cluster_name(self, index): """ Returns the names of the cluster at index. """ return index nltk-3.7/nltk/cluster/em.py000066400000000000000000000200121420073152400157300ustar00rootroot00000000000000# Natural Language Toolkit: Expectation Maximization Clusterer # # Copyright (C) 2001-2022 NLTK Project # Author: Trevor Cohn # URL: # For license information, see LICENSE.TXT try: import numpy except ImportError: pass from nltk.cluster.util import VectorSpaceClusterer class EMClusterer(VectorSpaceClusterer): """ The Gaussian EM clusterer models the vectors as being produced by a mixture of k Gaussian sources. The parameters of these sources (prior probability, mean and covariance matrix) are then found to maximise the likelihood of the given data. This is done with the expectation maximisation algorithm. It starts with k arbitrarily chosen means, priors and covariance matrices. It then calculates the membership probabilities for each vector in each of the clusters; this is the 'E' step. The cluster parameters are then updated in the 'M' step using the maximum likelihood estimate from the cluster membership probabilities. This process continues until the likelihood of the data does not significantly increase. """ def __init__( self, initial_means, priors=None, covariance_matrices=None, conv_threshold=1e-6, bias=0.1, normalise=False, svd_dimensions=None, ): """ Creates an EM clusterer with the given starting parameters, convergence threshold and vector mangling parameters. 
:param initial_means: the means of the gaussian cluster centers :type initial_means: [seq of] numpy array or seq of SparseArray :param priors: the prior probability for each cluster :type priors: numpy array or seq of float :param covariance_matrices: the covariance matrix for each cluster :type covariance_matrices: [seq of] numpy array :param conv_threshold: maximum change in likelihood before deemed convergent :type conv_threshold: int or float :param bias: variance bias used to ensure non-singular covariance matrices :type bias: float :param normalise: should vectors be normalised to length 1 :type normalise: boolean :param svd_dimensions: number of dimensions to use in reducing vector dimensionsionality with SVD :type svd_dimensions: int """ VectorSpaceClusterer.__init__(self, normalise, svd_dimensions) self._means = numpy.array(initial_means, numpy.float64) self._num_clusters = len(initial_means) self._conv_threshold = conv_threshold self._covariance_matrices = covariance_matrices self._priors = priors self._bias = bias def num_clusters(self): return self._num_clusters def cluster_vectorspace(self, vectors, trace=False): assert len(vectors) > 0 # set the parameters to initial values dimensions = len(vectors[0]) means = self._means priors = self._priors if not priors: priors = self._priors = ( numpy.ones(self._num_clusters, numpy.float64) / self._num_clusters ) covariances = self._covariance_matrices if not covariances: covariances = self._covariance_matrices = [ numpy.identity(dimensions, numpy.float64) for i in range(self._num_clusters) ] # do the E and M steps until the likelihood plateaus lastl = self._loglikelihood(vectors, priors, means, covariances) converged = False while not converged: if trace: print("iteration; loglikelihood", lastl) # E-step, calculate hidden variables, h[i,j] h = numpy.zeros((len(vectors), self._num_clusters), numpy.float64) for i in range(len(vectors)): for j in range(self._num_clusters): h[i, j] = priors[j] * self._gaussian( means[j], covariances[j], vectors[i] ) h[i, :] /= sum(h[i, :]) # M-step, update parameters - cvm, p, mean for j in range(self._num_clusters): covariance_before = covariances[j] new_covariance = numpy.zeros((dimensions, dimensions), numpy.float64) new_mean = numpy.zeros(dimensions, numpy.float64) sum_hj = 0.0 for i in range(len(vectors)): delta = vectors[i] - means[j] new_covariance += h[i, j] * numpy.multiply.outer(delta, delta) sum_hj += h[i, j] new_mean += h[i, j] * vectors[i] covariances[j] = new_covariance / sum_hj means[j] = new_mean / sum_hj priors[j] = sum_hj / len(vectors) # bias term to stop covariance matrix being singular covariances[j] += self._bias * numpy.identity(dimensions, numpy.float64) # calculate likelihood - FIXME: may be broken l = self._loglikelihood(vectors, priors, means, covariances) # check for convergence if abs(lastl - l) < self._conv_threshold: converged = True lastl = l def classify_vectorspace(self, vector): best = None for j in range(self._num_clusters): p = self._priors[j] * self._gaussian( self._means[j], self._covariance_matrices[j], vector ) if not best or p > best[0]: best = (p, j) return best[1] def likelihood_vectorspace(self, vector, cluster): cid = self.cluster_names().index(cluster) return self._priors[cluster] * self._gaussian( self._means[cluster], self._covariance_matrices[cluster], vector ) def _gaussian(self, mean, cvm, x): m = len(mean) assert cvm.shape == (m, m), "bad sized covariance matrix, %s" % str(cvm.shape) try: det = numpy.linalg.det(cvm) inv = numpy.linalg.inv(cvm) a = det ** 
-0.5 * (2 * numpy.pi) ** (-m / 2.0) dx = x - mean print(dx, inv) b = -0.5 * numpy.dot(numpy.dot(dx, inv), dx) return a * numpy.exp(b) except OverflowError: # happens when the exponent is negative infinity - i.e. b = 0 # i.e. the inverse of cvm is huge (cvm is almost zero) return 0 def _loglikelihood(self, vectors, priors, means, covariances): llh = 0.0 for vector in vectors: p = 0 for j in range(len(priors)): p += priors[j] * self._gaussian(means[j], covariances[j], vector) llh += numpy.log(p) return llh def __repr__(self): return "" % list(self._means) def demo(): """ Non-interactive demonstration of the clusterers with simple 2-D data. """ from nltk import cluster # example from figure 14.10, page 519, Manning and Schutze vectors = [numpy.array(f) for f in [[0.5, 0.5], [1.5, 0.5], [1, 3]]] means = [[4, 2], [4, 2.01]] clusterer = cluster.EMClusterer(means, bias=0.1) clusters = clusterer.cluster(vectors, True, trace=True) print("Clustered:", vectors) print("As: ", clusters) print() for c in range(2): print("Cluster:", c) print("Prior: ", clusterer._priors[c]) print("Mean: ", clusterer._means[c]) print("Covar: ", clusterer._covariance_matrices[c]) print() # classify a new vector vector = numpy.array([2, 2]) print("classify(%s):" % vector, end=" ") print(clusterer.classify(vector)) # show the classification probabilities vector = numpy.array([2, 2]) print("classification_probdist(%s):" % vector) pdist = clusterer.classification_probdist(vector) for sample in pdist.samples(): print(f"{sample} => {pdist.prob(sample) * 100:.0f}%") if __name__ == "__main__": demo() nltk-3.7/nltk/cluster/gaac.py000066400000000000000000000131671420073152400162370ustar00rootroot00000000000000# Natural Language Toolkit: Group Average Agglomerative Clusterer # # Copyright (C) 2001-2022 NLTK Project # Author: Trevor Cohn # URL: # For license information, see LICENSE.TXT try: import numpy except ImportError: pass from nltk.cluster.util import Dendrogram, VectorSpaceClusterer, cosine_distance class GAAClusterer(VectorSpaceClusterer): """ The Group Average Agglomerative starts with each of the N vectors as singleton clusters. It then iteratively merges pairs of clusters which have the closest centroids. This continues until there is only one cluster. The order of merges gives rise to a dendrogram: a tree with the earlier merges lower than later merges. The membership of a given number of clusters c, 1 <= c <= N, can be found by cutting the dendrogram at depth c. This clusterer uses the cosine similarity metric only, which allows for efficient speed-up in the clustering process. 
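    An illustrative sketch (the exact cluster assignments depend on the
    input vectors, so outputs are omitted here)::

        >>> from numpy import array
        >>> from nltk.cluster import GAAClusterer
        >>> vectors = [array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0]]]
        >>> clusterer = GAAClusterer(2)
        >>> clusters = clusterer.cluster(vectors, True)      # doctest: +SKIP
        >>> clusterer.classify(array([3, 3]))                # doctest: +SKIP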
""" def __init__(self, num_clusters=1, normalise=True, svd_dimensions=None): VectorSpaceClusterer.__init__(self, normalise, svd_dimensions) self._num_clusters = num_clusters self._dendrogram = None self._groups_values = None def cluster(self, vectors, assign_clusters=False, trace=False): # stores the merge order self._dendrogram = Dendrogram( [numpy.array(vector, numpy.float64) for vector in vectors] ) return VectorSpaceClusterer.cluster(self, vectors, assign_clusters, trace) def cluster_vectorspace(self, vectors, trace=False): # variables describing the initial situation N = len(vectors) cluster_len = [1] * N cluster_count = N index_map = numpy.arange(N) # construct the similarity matrix dims = (N, N) dist = numpy.ones(dims, dtype=float) * numpy.inf for i in range(N): for j in range(i + 1, N): dist[i, j] = cosine_distance(vectors[i], vectors[j]) while cluster_count > max(self._num_clusters, 1): i, j = numpy.unravel_index(dist.argmin(), dims) if trace: print("merging %d and %d" % (i, j)) # update similarities for merging i and j self._merge_similarities(dist, cluster_len, i, j) # remove j dist[:, j] = numpy.inf dist[j, :] = numpy.inf # merge the clusters cluster_len[i] = cluster_len[i] + cluster_len[j] self._dendrogram.merge(index_map[i], index_map[j]) cluster_count -= 1 # update the index map to reflect the indexes if we # had removed j index_map[j + 1 :] -= 1 index_map[j] = N self.update_clusters(self._num_clusters) def _merge_similarities(self, dist, cluster_len, i, j): # the new cluster i merged from i and j adopts the average of # i and j's similarity to each other cluster, weighted by the # number of points in the clusters i and j i_weight = cluster_len[i] j_weight = cluster_len[j] weight_sum = i_weight + j_weight # update for x 0 if self._should_normalise: centroid = self._normalise(cluster[0]) else: centroid = numpy.array(cluster[0]) for vector in cluster[1:]: if self._should_normalise: centroid += self._normalise(vector) else: centroid += vector centroid /= len(cluster) self._centroids.append(centroid) self._num_clusters = len(self._centroids) def classify_vectorspace(self, vector): best = None for i in range(self._num_clusters): centroid = self._centroids[i] dist = cosine_distance(vector, centroid) if not best or dist < best[0]: best = (dist, i) return best[1] def dendrogram(self): """ :return: The dendrogram representing the current clustering :rtype: Dendrogram """ return self._dendrogram def num_clusters(self): return self._num_clusters def __repr__(self): return "" % self._num_clusters def demo(): """ Non-interactive demonstration of the clusterers with simple 2-D data. 
""" from nltk.cluster import GAAClusterer # use a set of tokens with 2D indices vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]] # test the GAAC clusterer with 4 clusters clusterer = GAAClusterer(4) clusters = clusterer.cluster(vectors, True) print("Clusterer:", clusterer) print("Clustered:", vectors) print("As:", clusters) print() # show the dendrogram clusterer.dendrogram().show() # classify a new vector vector = numpy.array([3, 3]) print("classify(%s):" % vector, end=" ") print(clusterer.classify(vector)) print() if __name__ == "__main__": demo() nltk-3.7/nltk/cluster/kmeans.py000066400000000000000000000202511420073152400166120ustar00rootroot00000000000000# Natural Language Toolkit: K-Means Clusterer # # Copyright (C) 2001-2022 NLTK Project # Author: Trevor Cohn # URL: # For license information, see LICENSE.TXT import copy import random import sys try: import numpy except ImportError: pass from nltk.cluster.util import VectorSpaceClusterer class KMeansClusterer(VectorSpaceClusterer): """ The K-means clusterer starts with k arbitrary chosen means then allocates each vector to the cluster with the closest mean. It then recalculates the means of each cluster as the centroid of the vectors in the cluster. This process repeats until the cluster memberships stabilise. This is a hill-climbing algorithm which may converge to a local maximum. Hence the clustering is often repeated with random initial means and the most commonly occurring output means are chosen. """ def __init__( self, num_means, distance, repeats=1, conv_test=1e-6, initial_means=None, normalise=False, svd_dimensions=None, rng=None, avoid_empty_clusters=False, ): """ :param num_means: the number of means to use (may use fewer) :type num_means: int :param distance: measure of distance between two vectors :type distance: function taking two vectors and returning a float :param repeats: number of randomised clustering trials to use :type repeats: int :param conv_test: maximum variation in mean differences before deemed convergent :type conv_test: number :param initial_means: set of k initial means :type initial_means: sequence of vectors :param normalise: should vectors be normalised to length 1 :type normalise: boolean :param svd_dimensions: number of dimensions to use in reducing vector dimensionsionality with SVD :type svd_dimensions: int :param rng: random number generator (or None) :type rng: Random :param avoid_empty_clusters: include current centroid in computation of next one; avoids undefined behavior when clusters become empty :type avoid_empty_clusters: boolean """ VectorSpaceClusterer.__init__(self, normalise, svd_dimensions) self._num_means = num_means self._distance = distance self._max_difference = conv_test assert not initial_means or len(initial_means) == num_means self._means = initial_means assert repeats >= 1 assert not (initial_means and repeats > 1) self._repeats = repeats self._rng = rng if rng else random.Random() self._avoid_empty_clusters = avoid_empty_clusters def cluster_vectorspace(self, vectors, trace=False): if self._means and self._repeats > 1: print("Warning: means will be discarded for subsequent trials") meanss = [] for trial in range(self._repeats): if trace: print("k-means trial", trial) if not self._means or trial > 1: self._means = self._rng.sample(list(vectors), self._num_means) self._cluster_vectorspace(vectors, trace) meanss.append(self._means) if len(meanss) > 1: # sort the means first (so that different cluster numbering won't # effect the distance 
comparison) for means in meanss: means.sort(key=sum) # find the set of means that's minimally different from the others min_difference = min_means = None for i in range(len(meanss)): d = 0 for j in range(len(meanss)): if i != j: d += self._sum_distances(meanss[i], meanss[j]) if min_difference is None or d < min_difference: min_difference, min_means = d, meanss[i] # use the best means self._means = min_means def _cluster_vectorspace(self, vectors, trace=False): if self._num_means < len(vectors): # perform k-means clustering converged = False while not converged: # assign the tokens to clusters based on minimum distance to # the cluster means clusters = [[] for m in range(self._num_means)] for vector in vectors: index = self.classify_vectorspace(vector) clusters[index].append(vector) if trace: print("iteration") # for i in range(self._num_means): # print ' mean', i, 'allocated', len(clusters[i]), 'vectors' # recalculate cluster means by computing the centroid of each cluster new_means = list(map(self._centroid, clusters, self._means)) # measure the degree of change from the previous step for convergence difference = self._sum_distances(self._means, new_means) if difference < self._max_difference: converged = True # remember the new means self._means = new_means def classify_vectorspace(self, vector): # finds the closest cluster centroid # returns that cluster's index best_distance = best_index = None for index in range(len(self._means)): mean = self._means[index] dist = self._distance(vector, mean) if best_distance is None or dist < best_distance: best_index, best_distance = index, dist return best_index def num_clusters(self): if self._means: return len(self._means) else: return self._num_means def means(self): """ The means used for clustering. """ return self._means def _sum_distances(self, vectors1, vectors2): difference = 0.0 for u, v in zip(vectors1, vectors2): difference += self._distance(u, v) return difference def _centroid(self, cluster, mean): if self._avoid_empty_clusters: centroid = copy.copy(mean) for vector in cluster: centroid += vector return centroid / (1 + len(cluster)) else: if not len(cluster): sys.stderr.write("Error: no centroid defined for empty cluster.\n") sys.stderr.write( "Try setting argument 'avoid_empty_clusters' to True\n" ) assert False centroid = copy.copy(cluster[0]) for vector in cluster[1:]: centroid += vector return centroid / len(cluster) def __repr__(self): return "" % (self._means, self._repeats) ################################################################################# def demo(): # example from figure 14.9, page 517, Manning and Schutze from nltk.cluster import KMeansClusterer, euclidean_distance vectors = [numpy.array(f) for f in [[2, 1], [1, 3], [4, 7], [6, 7]]] means = [[4, 3], [5, 5]] clusterer = KMeansClusterer(2, euclidean_distance, initial_means=means) clusters = clusterer.cluster(vectors, True, trace=True) print("Clustered:", vectors) print("As:", clusters) print("Means:", clusterer.means()) print() vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]] # test k-means using the euclidean distance metric, 2 means and repeat # clustering 10 times with random seeds clusterer = KMeansClusterer(2, euclidean_distance, repeats=10) clusters = clusterer.cluster(vectors, True) print("Clustered:", vectors) print("As:", clusters) print("Means:", clusterer.means()) print() # classify a new vector vector = numpy.array([3, 3]) print("classify(%s):" % vector, end=" ") print(clusterer.classify(vector)) print() if __name__ 
== "__main__": demo() nltk-3.7/nltk/cluster/util.py000066400000000000000000000230131420073152400163100ustar00rootroot00000000000000# Natural Language Toolkit: Clusterer Utilities # # Copyright (C) 2001-2022 NLTK Project # Author: Trevor Cohn # Contributor: J Richard Snape # URL: # For license information, see LICENSE.TXT import copy from abc import abstractmethod from math import sqrt from sys import stdout try: import numpy except ImportError: pass from nltk.cluster.api import ClusterI class VectorSpaceClusterer(ClusterI): """ Abstract clusterer which takes tokens and maps them into a vector space. Optionally performs singular value decomposition to reduce the dimensionality. """ def __init__(self, normalise=False, svd_dimensions=None): """ :param normalise: should vectors be normalised to length 1 :type normalise: boolean :param svd_dimensions: number of dimensions to use in reducing vector dimensionsionality with SVD :type svd_dimensions: int """ self._Tt = None self._should_normalise = normalise self._svd_dimensions = svd_dimensions def cluster(self, vectors, assign_clusters=False, trace=False): assert len(vectors) > 0 # normalise the vectors if self._should_normalise: vectors = list(map(self._normalise, vectors)) # use SVD to reduce the dimensionality if self._svd_dimensions and self._svd_dimensions < len(vectors[0]): [u, d, vt] = numpy.linalg.svd(numpy.transpose(numpy.array(vectors))) S = d[: self._svd_dimensions] * numpy.identity( self._svd_dimensions, numpy.float64 ) T = u[:, : self._svd_dimensions] Dt = vt[: self._svd_dimensions, :] vectors = numpy.transpose(numpy.dot(S, Dt)) self._Tt = numpy.transpose(T) # call abstract method to cluster the vectors self.cluster_vectorspace(vectors, trace) # assign the vectors to clusters if assign_clusters: return [self.classify(vector) for vector in vectors] @abstractmethod def cluster_vectorspace(self, vectors, trace): """ Finds the clusters using the given set of vectors. """ def classify(self, vector): if self._should_normalise: vector = self._normalise(vector) if self._Tt is not None: vector = numpy.dot(self._Tt, vector) cluster = self.classify_vectorspace(vector) return self.cluster_name(cluster) @abstractmethod def classify_vectorspace(self, vector): """ Returns the index of the appropriate cluster for the vector. """ def likelihood(self, vector, label): if self._should_normalise: vector = self._normalise(vector) if self._Tt is not None: vector = numpy.dot(self._Tt, vector) return self.likelihood_vectorspace(vector, label) def likelihood_vectorspace(self, vector, cluster): """ Returns the likelihood of the vector belonging to the cluster. """ predicted = self.classify_vectorspace(vector) return 1.0 if cluster == predicted else 0.0 def vector(self, vector): """ Returns the vector after normalisation and dimensionality reduction """ if self._should_normalise: vector = self._normalise(vector) if self._Tt is not None: vector = numpy.dot(self._Tt, vector) return vector def _normalise(self, vector): """ Normalises the vector to unit length. """ return vector / sqrt(numpy.dot(vector, vector)) def euclidean_distance(u, v): """ Returns the euclidean distance between vectors u and v. This is equivalent to the length of the vector (u - v). """ diff = u - v return sqrt(numpy.dot(diff, diff)) def cosine_distance(u, v): """ Returns 1 minus the cosine of the angle between vectors v and u. This is equal to ``1 - (u.v / |u||v|)``. 
""" return 1 - (numpy.dot(u, v) / (sqrt(numpy.dot(u, u)) * sqrt(numpy.dot(v, v)))) class _DendrogramNode: """Tree node of a dendrogram.""" def __init__(self, value, *children): self._value = value self._children = children def leaves(self, values=True): if self._children: leaves = [] for child in self._children: leaves.extend(child.leaves(values)) return leaves elif values: return [self._value] else: return [self] def groups(self, n): queue = [(self._value, self)] while len(queue) < n: priority, node = queue.pop() if not node._children: queue.push((priority, node)) break for child in node._children: if child._children: queue.append((child._value, child)) else: queue.append((0, child)) # makes the earliest merges at the start, latest at the end queue.sort() groups = [] for priority, node in queue: groups.append(node.leaves()) return groups def __lt__(self, comparator): return cosine_distance(self._value, comparator._value) < 0 class Dendrogram: """ Represents a dendrogram, a tree with a specified branching order. This must be initialised with the leaf items, then iteratively call merge for each branch. This class constructs a tree representing the order of calls to the merge function. """ def __init__(self, items=[]): """ :param items: the items at the leaves of the dendrogram :type items: sequence of (any) """ self._items = [_DendrogramNode(item) for item in items] self._original_items = copy.copy(self._items) self._merge = 1 def merge(self, *indices): """ Merges nodes at given indices in the dendrogram. The nodes will be combined which then replaces the first node specified. All other nodes involved in the merge will be removed. :param indices: indices of the items to merge (at least two) :type indices: seq of int """ assert len(indices) >= 2 node = _DendrogramNode(self._merge, *(self._items[i] for i in indices)) self._merge += 1 self._items[indices[0]] = node for i in indices[1:]: del self._items[i] def groups(self, n): """ Finds the n-groups of items (leaves) reachable from a cut at depth n. :param n: number of groups :type n: int """ if len(self._items) > 1: root = _DendrogramNode(self._merge, *self._items) else: root = self._items[0] return root.groups(n) def show(self, leaf_labels=[]): """ Print the dendrogram in ASCII art to standard out. 
:param leaf_labels: an optional list of strings to use for labeling the leaves :type leaf_labels: list """ # ASCII rendering characters JOIN, HLINK, VLINK = "+", "-", "|" # find the root (or create one) if len(self._items) > 1: root = _DendrogramNode(self._merge, *self._items) else: root = self._items[0] leaves = self._original_items if leaf_labels: last_row = leaf_labels else: last_row = ["%s" % leaf._value for leaf in leaves] # find the bottom row and the best cell width width = max(map(len, last_row)) + 1 lhalf = width // 2 rhalf = int(width - lhalf - 1) # display functions def format(centre, left=" ", right=" "): return f"{lhalf * left}{centre}{right * rhalf}" def display(str): stdout.write(str) # for each merge, top down queue = [(root._value, root)] verticals = [format(" ") for leaf in leaves] while queue: priority, node = queue.pop() child_left_leaf = list(map(lambda c: c.leaves(False)[0], node._children)) indices = list(map(leaves.index, child_left_leaf)) if child_left_leaf: min_idx = min(indices) max_idx = max(indices) for i in range(len(leaves)): if leaves[i] in child_left_leaf: if i == min_idx: display(format(JOIN, " ", HLINK)) elif i == max_idx: display(format(JOIN, HLINK, " ")) else: display(format(JOIN, HLINK, HLINK)) verticals[i] = format(VLINK) elif min_idx <= i <= max_idx: display(format(HLINK, HLINK, HLINK)) else: display(verticals[i]) display("\n") for child in node._children: if child._children: queue.append((child._value, child)) queue.sort() for vertical in verticals: display(vertical) display("\n") # finally, display the last line display("".join(item.center(width) for item in last_row)) display("\n") def __repr__(self): if len(self._items) > 1: root = _DendrogramNode(self._merge, *self._items) else: root = self._items[0] leaves = root.leaves(False) return "" % len(leaves) nltk-3.7/nltk/collections.py000066400000000000000000000547441420073152400162070ustar00rootroot00000000000000# Natural Language Toolkit: Collections # # Copyright (C) 2001-2022 NLTK Project # Author: Steven Bird # URL: # For license information, see LICENSE.TXT import bisect # this unused import is for python 2.7 from collections import Counter, defaultdict, deque from functools import total_ordering from itertools import chain, islice from nltk.internals import raise_unorderable_types, slice_bounds ########################################################################## # Ordered Dictionary ########################################################################## class OrderedDict(dict): def __init__(self, data=None, **kwargs): self._keys = self.keys(data, kwargs.get("keys")) self._default_factory = kwargs.get("default_factory") if data is None: dict.__init__(self) else: dict.__init__(self, data) def __delitem__(self, key): dict.__delitem__(self, key) self._keys.remove(key) def __getitem__(self, key): try: return dict.__getitem__(self, key) except KeyError: return self.__missing__(key) def __iter__(self): return (key for key in self.keys()) def __missing__(self, key): if not self._default_factory and key not in self._keys: raise KeyError() return self._default_factory() def __setitem__(self, key, item): dict.__setitem__(self, key, item) if key not in self._keys: self._keys.append(key) def clear(self): dict.clear(self) self._keys.clear() def copy(self): d = dict.copy(self) d._keys = self._keys return d def items(self): # returns iterator under python 3 and list under python 2 return zip(self.keys(), self.values()) def keys(self, data=None, keys=None): if data: if keys: assert isinstance(keys, 
list) assert len(data) == len(keys) return keys else: assert ( isinstance(data, dict) or isinstance(data, OrderedDict) or isinstance(data, list) ) if isinstance(data, dict) or isinstance(data, OrderedDict): return data.keys() elif isinstance(data, list): return [key for (key, value) in data] elif "_keys" in self.__dict__: return self._keys else: return [] def popitem(self): if not self._keys: raise KeyError() key = self._keys.pop() value = self[key] del self[key] return (key, value) def setdefault(self, key, failobj=None): dict.setdefault(self, key, failobj) if key not in self._keys: self._keys.append(key) def update(self, data): dict.update(self, data) for key in self.keys(data): if key not in self._keys: self._keys.append(key) def values(self): # returns iterator under python 3 return map(self.get, self._keys) ###################################################################### # Lazy Sequences ###################################################################### @total_ordering class AbstractLazySequence: """ An abstract base class for read-only sequences whose values are computed as needed. Lazy sequences act like tuples -- they can be indexed, sliced, and iterated over; but they may not be modified. The most common application of lazy sequences in NLTK is for corpus view objects, which provide access to the contents of a corpus without loading the entire corpus into memory, by loading pieces of the corpus from disk as needed. The result of modifying a mutable element of a lazy sequence is undefined. In particular, the modifications made to the element may or may not persist, depending on whether and when the lazy sequence caches that element's value or reconstructs it from scratch. Subclasses are required to define two methods: ``__len__()`` and ``iterate_from()``. """ def __len__(self): """ Return the number of tokens in the corpus file underlying this corpus view. """ raise NotImplementedError("should be implemented by subclass") def iterate_from(self, start): """ Return an iterator that generates the tokens in the corpus file underlying this corpus view, starting at the token number ``start``. If ``start>=len(self)``, then this iterator will generate no tokens. """ raise NotImplementedError("should be implemented by subclass") def __getitem__(self, i): """ Return the *i* th token in the corpus file underlying this corpus view. Negative indices and spans are both supported. """ if isinstance(i, slice): start, stop = slice_bounds(self, i) return LazySubsequence(self, start, stop) else: # Handle negative indices if i < 0: i += len(self) if i < 0: raise IndexError("index out of range") # Use iterate_from to extract it. try: return next(self.iterate_from(i)) except StopIteration as e: raise IndexError("index out of range") from e def __iter__(self): """Return an iterator that generates the tokens in the corpus file underlying this corpus view.""" return self.iterate_from(0) def count(self, value): """Return the number of times this list contains ``value``.""" return sum(1 for elt in self if elt == value) def index(self, value, start=None, stop=None): """Return the index of the first occurrence of ``value`` in this list that is greater than or equal to ``start`` and less than ``stop``. 
Negative start and stop values are treated like negative slice bounds -- i.e., they count from the end of the list.""" start, stop = slice_bounds(self, slice(start, stop)) for i, elt in enumerate(islice(self, start, stop)): if elt == value: return i + start raise ValueError("index(x): x not in list") def __contains__(self, value): """Return true if this list contains ``value``.""" return bool(self.count(value)) def __add__(self, other): """Return a list concatenating self with other.""" return LazyConcatenation([self, other]) def __radd__(self, other): """Return a list concatenating other with self.""" return LazyConcatenation([other, self]) def __mul__(self, count): """Return a list concatenating self with itself ``count`` times.""" return LazyConcatenation([self] * count) def __rmul__(self, count): """Return a list concatenating self with itself ``count`` times.""" return LazyConcatenation([self] * count) _MAX_REPR_SIZE = 60 def __repr__(self): """ Return a string representation for this corpus view that is similar to a list's representation; but if it would be more than 60 characters long, it is truncated. """ pieces = [] length = 5 for elt in self: pieces.append(repr(elt)) length += len(pieces[-1]) + 2 if length > self._MAX_REPR_SIZE and len(pieces) > 2: return "[%s, ...]" % ", ".join(pieces[:-1]) return "[%s]" % ", ".join(pieces) def __eq__(self, other): return type(self) == type(other) and list(self) == list(other) def __ne__(self, other): return not self == other def __lt__(self, other): if type(other) != type(self): raise_unorderable_types("<", self, other) return list(self) < list(other) def __hash__(self): """ :raise ValueError: Corpus view objects are unhashable. """ raise ValueError("%s objects are unhashable" % self.__class__.__name__) class LazySubsequence(AbstractLazySequence): """ A subsequence produced by slicing a lazy sequence. This slice keeps a reference to its source sequence, and generates its values by looking them up in the source sequence. """ MIN_SIZE = 100 """ The minimum size for which lazy slices should be created. If ``LazySubsequence()`` is called with a subsequence that is shorter than ``MIN_SIZE``, then a tuple will be returned instead. """ def __new__(cls, source, start, stop): """ Construct a new slice from a given underlying sequence. The ``start`` and ``stop`` indices should be absolute indices -- i.e., they should not be negative (for indexing from the back of a list) or greater than the length of ``source``. """ # If the slice is small enough, just use a tuple. if stop - start < cls.MIN_SIZE: return list(islice(source.iterate_from(start), stop - start)) else: return object.__new__(cls) def __init__(self, source, start, stop): self._source = source self._start = start self._stop = stop def __len__(self): return self._stop - self._start def iterate_from(self, start): return islice( self._source.iterate_from(start + self._start), max(0, len(self) - start) ) class LazyConcatenation(AbstractLazySequence): """ A lazy sequence formed by concatenating a list of lists. This underlying list of lists may itself be lazy. ``LazyConcatenation`` maintains an index that it uses to keep track of the relationship between offsets in the concatenated lists and offsets in the sublists. 
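# --- Illustrative sketch (not part of the NLTK source). It relies only on the
# documented contract above: a concrete lazy sequence supplies __len__() and
# iterate_from(), and LazyConcatenation flattens a list of lists on demand. ---
from nltk.collections import AbstractLazySequence, LazyConcatenation


class LazySquares(AbstractLazySequence):
    """Expose the squares 0, 1, 4, ... without materialising them."""

    def __init__(self, n):
        self._n = n

    def __len__(self):
        return self._n

    def iterate_from(self, start):
        return (i * i for i in range(start, self._n))


squares = LazySquares(10)
print(squares[3], squares[-1])   # 9 81 -- indexing and negative indexing
print(list(squares[2:5]))        # [4, 9, 16] -- small slices are materialised

flat = LazyConcatenation([[1, 2, 3], [4, 5], [6]])
print(len(flat), flat[4])        # 6 5 -- element found via the offset index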
""" def __init__(self, list_of_lists): self._list = list_of_lists self._offsets = [0] def __len__(self): if len(self._offsets) <= len(self._list): for _ in self.iterate_from(self._offsets[-1]): pass return self._offsets[-1] def iterate_from(self, start_index): if start_index < self._offsets[-1]: sublist_index = bisect.bisect_right(self._offsets, start_index) - 1 else: sublist_index = len(self._offsets) - 1 index = self._offsets[sublist_index] # Construct an iterator over the sublists. if isinstance(self._list, AbstractLazySequence): sublist_iter = self._list.iterate_from(sublist_index) else: sublist_iter = islice(self._list, sublist_index, None) for sublist in sublist_iter: if sublist_index == (len(self._offsets) - 1): assert ( index + len(sublist) >= self._offsets[-1] ), "offsets not monotonic increasing!" self._offsets.append(index + len(sublist)) else: assert self._offsets[sublist_index + 1] == index + len( sublist ), "inconsistent list value (num elts)" yield from sublist[max(0, start_index - index) :] index += len(sublist) sublist_index += 1 class LazyMap(AbstractLazySequence): """ A lazy sequence whose elements are formed by applying a given function to each element in one or more underlying lists. The function is applied lazily -- i.e., when you read a value from the list, ``LazyMap`` will calculate that value by applying its function to the underlying lists' value(s). ``LazyMap`` is essentially a lazy version of the Python primitive function ``map``. In particular, the following two expressions are equivalent: >>> from nltk.collections import LazyMap >>> function = str >>> sequence = [1,2,3] >>> map(function, sequence) # doctest: +SKIP ['1', '2', '3'] >>> list(LazyMap(function, sequence)) ['1', '2', '3'] Like the Python ``map`` primitive, if the source lists do not have equal size, then the value None will be supplied for the 'missing' elements. Lazy maps can be useful for conserving memory, in cases where individual values take up a lot of space. This is especially true if the underlying list's values are constructed lazily, as is the case with many corpus readers. A typical example of a use case for this class is performing feature detection on the tokens in a corpus. Since featuresets are encoded as dictionaries, which can take up a lot of memory, using a ``LazyMap`` can significantly reduce memory usage when training and running classifiers. """ def __init__(self, function, *lists, **config): """ :param function: The function that should be applied to elements of ``lists``. It should take as many arguments as there are ``lists``. :param lists: The underlying lists. :param cache_size: Determines the size of the cache used by this lazy map. (default=5) """ if not lists: raise TypeError("LazyMap requires at least two args") self._lists = lists self._func = function self._cache_size = config.get("cache_size", 5) self._cache = {} if self._cache_size > 0 else None # If you just take bool() of sum() here _all_lazy will be true just # in case n >= 1 list is an AbstractLazySequence. Presumably this # isn't what's intended. 
self._all_lazy = sum( isinstance(lst, AbstractLazySequence) for lst in lists ) == len(lists) def iterate_from(self, index): # Special case: one lazy sublist if len(self._lists) == 1 and self._all_lazy: for value in self._lists[0].iterate_from(index): yield self._func(value) return # Special case: one non-lazy sublist elif len(self._lists) == 1: while True: try: yield self._func(self._lists[0][index]) except IndexError: return index += 1 # Special case: n lazy sublists elif self._all_lazy: iterators = [lst.iterate_from(index) for lst in self._lists] while True: elements = [] for iterator in iterators: try: elements.append(next(iterator)) except: # FIXME: What is this except really catching? StopIteration? elements.append(None) if elements == [None] * len(self._lists): return yield self._func(*elements) index += 1 # general case else: while True: try: elements = [lst[index] for lst in self._lists] except IndexError: elements = [None] * len(self._lists) for i, lst in enumerate(self._lists): try: elements[i] = lst[index] except IndexError: pass if elements == [None] * len(self._lists): return yield self._func(*elements) index += 1 def __getitem__(self, index): if isinstance(index, slice): sliced_lists = [lst[index] for lst in self._lists] return LazyMap(self._func, *sliced_lists) else: # Handle negative indices if index < 0: index += len(self) if index < 0: raise IndexError("index out of range") # Check the cache if self._cache is not None and index in self._cache: return self._cache[index] # Calculate the value try: val = next(self.iterate_from(index)) except StopIteration as e: raise IndexError("index out of range") from e # Update the cache if self._cache is not None: if len(self._cache) > self._cache_size: self._cache.popitem() # discard random entry self._cache[index] = val # Return the value return val def __len__(self): return max(len(lst) for lst in self._lists) class LazyZip(LazyMap): """ A lazy sequence whose elements are tuples, each containing the i-th element from each of the argument sequences. The returned list is truncated in length to the length of the shortest argument sequence. The tuples are constructed lazily -- i.e., when you read a value from the list, ``LazyZip`` will calculate that value by forming a tuple from the i-th element of each of the argument sequences. ``LazyZip`` is essentially a lazy version of the Python primitive function ``zip``. In particular, an evaluated LazyZip is equivalent to a zip: >>> from nltk.collections import LazyZip >>> sequence1, sequence2 = [1, 2, 3], ['a', 'b', 'c'] >>> zip(sequence1, sequence2) # doctest: +SKIP [(1, 'a'), (2, 'b'), (3, 'c')] >>> list(LazyZip(sequence1, sequence2)) [(1, 'a'), (2, 'b'), (3, 'c')] >>> sequences = [sequence1, sequence2, [6,7,8,9]] >>> list(zip(*sequences)) == list(LazyZip(*sequences)) True Lazy zips can be useful for conserving memory in cases where the argument sequences are particularly long. A typical example of a use case for this class is combining long sequences of gold standard and predicted values in a classification or tagging task in order to calculate accuracy. By constructing tuples lazily and avoiding the creation of an additional long sequence, memory usage can be significantly reduced. 
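# --- Illustrative sketch (not part of the NLTK source): the tagging-accuracy
# use case described above, pairing gold and predicted tags lazily with
# LazyZip instead of building a third list. ---
from nltk.collections import LazyZip

gold = ["DT", "NN", "VB", "DT", "NN"]
predicted = ["DT", "NN", "NN", "DT", "NN"]
pairs = LazyZip(gold, predicted)
print(len(pairs))                                       # 5
print(sum(1 for g, p in pairs if g == p) / len(pairs))  # 0.8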
""" def __init__(self, *lists): """ :param lists: the underlying lists :type lists: list(list) """ LazyMap.__init__(self, lambda *elts: elts, *lists) def iterate_from(self, index): iterator = LazyMap.iterate_from(self, index) while index < len(self): yield next(iterator) index += 1 return def __len__(self): return min(len(lst) for lst in self._lists) class LazyEnumerate(LazyZip): """ A lazy sequence whose elements are tuples, each containing a count (from zero) and a value yielded by underlying sequence. ``LazyEnumerate`` is useful for obtaining an indexed list. The tuples are constructed lazily -- i.e., when you read a value from the list, ``LazyEnumerate`` will calculate that value by forming a tuple from the count of the i-th element and the i-th element of the underlying sequence. ``LazyEnumerate`` is essentially a lazy version of the Python primitive function ``enumerate``. In particular, the following two expressions are equivalent: >>> from nltk.collections import LazyEnumerate >>> sequence = ['first', 'second', 'third'] >>> list(enumerate(sequence)) [(0, 'first'), (1, 'second'), (2, 'third')] >>> list(LazyEnumerate(sequence)) [(0, 'first'), (1, 'second'), (2, 'third')] Lazy enumerations can be useful for conserving memory in cases where the argument sequences are particularly long. A typical example of a use case for this class is obtaining an indexed list for a long sequence of values. By constructing tuples lazily and avoiding the creation of an additional long sequence, memory usage can be significantly reduced. """ def __init__(self, lst): """ :param lst: the underlying list :type lst: list """ LazyZip.__init__(self, range(len(lst)), lst) class LazyIteratorList(AbstractLazySequence): """ Wraps an iterator, loading its elements on demand and making them subscriptable. __repr__ displays only the first few elements. """ def __init__(self, it, known_len=None): self._it = it self._len = known_len self._cache = [] def __len__(self): if self._len: return self._len for _ in self.iterate_from(len(self._cache)): pass self._len = len(self._cache) return self._len def iterate_from(self, start): """Create a new iterator over this list starting at the given offset.""" while len(self._cache) < start: v = next(self._it) self._cache.append(v) i = start while i < len(self._cache): yield self._cache[i] i += 1 try: while True: v = next(self._it) self._cache.append(v) yield v except StopIteration: pass def __add__(self, other): """Return a list concatenating self with other.""" return type(self)(chain(self, other)) def __radd__(self, other): """Return a list concatenating other with self.""" return type(self)(chain(other, self)) ###################################################################### # Trie Implementation ###################################################################### class Trie(dict): """A Trie implementation for strings""" LEAF = True def __init__(self, strings=None): """Builds a Trie object, which is built around a ``dict`` If ``strings`` is provided, it will add the ``strings``, which consist of a ``list`` of ``strings``, to the Trie. Otherwise, it'll construct an empty Trie. 
:param strings: List of strings to insert into the trie (Default is ``None``) :type strings: list(str) """ super().__init__() if strings: for string in strings: self.insert(string) def insert(self, string): """Inserts ``string`` into the Trie :param string: String to insert into the trie :type string: str :Example: >>> from nltk.collections import Trie >>> trie = Trie(["abc", "def"]) >>> expected = {'a': {'b': {'c': {True: None}}}, \ 'd': {'e': {'f': {True: None}}}} >>> trie == expected True """ if len(string): self[string[0]].insert(string[1:]) else: # mark the string is complete self[Trie.LEAF] = None def __missing__(self, key): self[key] = Trie() return self[key] nltk-3.7/nltk/collocations.py000066400000000000000000000343301420073152400163470ustar00rootroot00000000000000# Natural Language Toolkit: Collocations and Association Measures # # Copyright (C) 2001-2022 NLTK Project # Author: Joel Nothman # URL: # For license information, see LICENSE.TXT # """ Tools to identify collocations --- words that often appear consecutively --- within corpora. They may also be used to find other associations between word occurrences. See Manning and Schutze ch. 5 at https://nlp.stanford.edu/fsnlp/promo/colloc.pdf and the Text::NSP Perl package at http://ngram.sourceforge.net Finding collocations requires first calculating the frequencies of words and their appearance in the context of other words. Often the collection of words will then requiring filtering to only retain useful content terms. Each ngram of words may then be scored according to some association measure, in order to determine the relative likelihood of each ngram being a collocation. The ``BigramCollocationFinder`` and ``TrigramCollocationFinder`` classes provide these functionalities, dependent on being provided a function which scores a ngram given appropriate frequency counts. A number of standard association measures are provided in bigram_measures and trigram_measures. """ # Possible TODOs: # - consider the distinction between f(x,_) and f(x) and whether our # approximation is good enough for fragmented data, and mention it # - add a n-gram collocation finder with measures which only utilise n-gram # and unigram counts (raw_freq, pmi, student_t) import itertools as _itertools # these two unused imports are referenced in collocations.doctest from nltk.metrics import ( BigramAssocMeasures, ContingencyMeasures, QuadgramAssocMeasures, TrigramAssocMeasures, ) from nltk.metrics.spearman import ranks_from_scores, spearman_correlation from nltk.probability import FreqDist from nltk.util import ngrams class AbstractCollocationFinder: """ An abstract base class for collocation finders whose purpose is to collect collocation candidate frequencies, filter and rank them. As a minimum, collocation finders require the frequencies of each word in a corpus, and the joint frequency of word tuples. This data should be provided through nltk.probability.FreqDist objects or an identical interface. 
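# --- Illustrative sketch (not part of the NLTK source): the workflow described
# in the module docstring above -- build a finder over a token stream, filter
# the candidates, then rank them with an association measure. ---
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

tokens = "of the people by the people for the people".split()
finder = BigramCollocationFinder.from_words(tokens)
finder.apply_freq_filter(2)                        # drop bigrams seen only once
print(finder.nbest(BigramAssocMeasures.pmi, 3))    # [('the', 'people')]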
""" def __init__(self, word_fd, ngram_fd): self.word_fd = word_fd self.N = word_fd.N() self.ngram_fd = ngram_fd @classmethod def _build_new_documents( cls, documents, window_size, pad_left=False, pad_right=False, pad_symbol=None ): """ Pad the document with the place holder according to the window_size """ padding = (pad_symbol,) * (window_size - 1) if pad_right: return _itertools.chain.from_iterable( _itertools.chain(doc, padding) for doc in documents ) if pad_left: return _itertools.chain.from_iterable( _itertools.chain(padding, doc) for doc in documents ) @classmethod def from_documents(cls, documents): """Constructs a collocation finder given a collection of documents, each of which is a list (or iterable) of tokens. """ # return cls.from_words(_itertools.chain(*documents)) return cls.from_words( cls._build_new_documents(documents, cls.default_ws, pad_right=True) ) @staticmethod def _ngram_freqdist(words, n): return FreqDist(tuple(words[i : i + n]) for i in range(len(words) - 1)) def _apply_filter(self, fn=lambda ngram, freq: False): """Generic filter removes ngrams from the frequency distribution if the function returns True when passed an ngram tuple. """ tmp_ngram = FreqDist() for ngram, freq in self.ngram_fd.items(): if not fn(ngram, freq): tmp_ngram[ngram] = freq self.ngram_fd = tmp_ngram def apply_freq_filter(self, min_freq): """Removes candidate ngrams which have frequency less than min_freq.""" self._apply_filter(lambda ng, freq: freq < min_freq) def apply_ngram_filter(self, fn): """Removes candidate ngrams (w1, w2, ...) where fn(w1, w2, ...) evaluates to True. """ self._apply_filter(lambda ng, f: fn(*ng)) def apply_word_filter(self, fn): """Removes candidate ngrams (w1, w2, ...) where any of (fn(w1), fn(w2), ...) evaluates to True. """ self._apply_filter(lambda ng, f: any(fn(w) for w in ng)) def _score_ngrams(self, score_fn): """Generates of (ngram, score) pairs as determined by the scoring function provided. """ for tup in self.ngram_fd: score = self.score_ngram(score_fn, *tup) if score is not None: yield tup, score def score_ngrams(self, score_fn): """Returns a sequence of (ngram, score) pairs ordered from highest to lowest score, as determined by the scoring function provided. """ return sorted(self._score_ngrams(score_fn), key=lambda t: (-t[1], t[0])) def nbest(self, score_fn, n): """Returns the top n ngrams when scored by the given function.""" return [p for p, s in self.score_ngrams(score_fn)[:n]] def above_score(self, score_fn, min_score): """Returns a sequence of ngrams, ordered by decreasing score, whose scores each exceed the given minimum score. """ for ngram, score in self.score_ngrams(score_fn): if score > min_score: yield ngram else: break class BigramCollocationFinder(AbstractCollocationFinder): """A tool for the finding and ranking of bigram collocations or other association measures. It is often useful to use from_words() rather than constructing an instance directly. """ default_ws = 2 def __init__(self, word_fd, bigram_fd, window_size=2): """Construct a BigramCollocationFinder, given FreqDists for appearances of words and (possibly non-contiguous) bigrams. """ AbstractCollocationFinder.__init__(self, word_fd, bigram_fd) self.window_size = window_size @classmethod def from_words(cls, words, window_size=2): """Construct a BigramCollocationFinder for all bigrams in the given sequence. When window_size > 2, count non-contiguous bigrams, in the style of Church and Hanks's (1990) association ratio. 
""" wfd = FreqDist() bfd = FreqDist() if window_size < 2: raise ValueError("Specify window_size at least 2") for window in ngrams(words, window_size, pad_right=True): w1 = window[0] if w1 is None: continue wfd[w1] += 1 for w2 in window[1:]: if w2 is not None: bfd[(w1, w2)] += 1 return cls(wfd, bfd, window_size=window_size) def score_ngram(self, score_fn, w1, w2): """Returns the score for a given bigram using the given scoring function. Following Church and Hanks (1990), counts are scaled by a factor of 1/(window_size - 1). """ n_all = self.N n_ii = self.ngram_fd[(w1, w2)] / (self.window_size - 1.0) if not n_ii: return n_ix = self.word_fd[w1] n_xi = self.word_fd[w2] return score_fn(n_ii, (n_ix, n_xi), n_all) class TrigramCollocationFinder(AbstractCollocationFinder): """A tool for the finding and ranking of trigram collocations or other association measures. It is often useful to use from_words() rather than constructing an instance directly. """ default_ws = 3 def __init__(self, word_fd, bigram_fd, wildcard_fd, trigram_fd): """Construct a TrigramCollocationFinder, given FreqDists for appearances of words, bigrams, two words with any word between them, and trigrams. """ AbstractCollocationFinder.__init__(self, word_fd, trigram_fd) self.wildcard_fd = wildcard_fd self.bigram_fd = bigram_fd @classmethod def from_words(cls, words, window_size=3): """Construct a TrigramCollocationFinder for all trigrams in the given sequence. """ if window_size < 3: raise ValueError("Specify window_size at least 3") wfd = FreqDist() wildfd = FreqDist() bfd = FreqDist() tfd = FreqDist() for window in ngrams(words, window_size, pad_right=True): w1 = window[0] if w1 is None: continue for w2, w3 in _itertools.combinations(window[1:], 2): wfd[w1] += 1 if w2 is None: continue bfd[(w1, w2)] += 1 if w3 is None: continue wildfd[(w1, w3)] += 1 tfd[(w1, w2, w3)] += 1 return cls(wfd, bfd, wildfd, tfd) def bigram_finder(self): """Constructs a bigram collocation finder with the bigram and unigram data from this finder. Note that this does not include any filtering applied to this finder. """ return BigramCollocationFinder(self.word_fd, self.bigram_fd) def score_ngram(self, score_fn, w1, w2, w3): """Returns the score for a given trigram using the given scoring function. """ n_all = self.N n_iii = self.ngram_fd[(w1, w2, w3)] if not n_iii: return n_iix = self.bigram_fd[(w1, w2)] n_ixi = self.wildcard_fd[(w1, w3)] n_xii = self.bigram_fd[(w2, w3)] n_ixx = self.word_fd[w1] n_xix = self.word_fd[w2] n_xxi = self.word_fd[w3] return score_fn(n_iii, (n_iix, n_ixi, n_xii), (n_ixx, n_xix, n_xxi), n_all) class QuadgramCollocationFinder(AbstractCollocationFinder): """A tool for the finding and ranking of quadgram collocations or other association measures. It is often useful to use from_words() rather than constructing an instance directly. """ default_ws = 4 def __init__(self, word_fd, quadgram_fd, ii, iii, ixi, ixxi, iixi, ixii): """Construct a QuadgramCollocationFinder, given FreqDists for appearances of words, bigrams, trigrams, two words with one word and two words between them, three words with a word between them in both variations. 
""" AbstractCollocationFinder.__init__(self, word_fd, quadgram_fd) self.iii = iii self.ii = ii self.ixi = ixi self.ixxi = ixxi self.iixi = iixi self.ixii = ixii @classmethod def from_words(cls, words, window_size=4): if window_size < 4: raise ValueError("Specify window_size at least 4") ixxx = FreqDist() iiii = FreqDist() ii = FreqDist() iii = FreqDist() ixi = FreqDist() ixxi = FreqDist() iixi = FreqDist() ixii = FreqDist() for window in ngrams(words, window_size, pad_right=True): w1 = window[0] if w1 is None: continue for w2, w3, w4 in _itertools.combinations(window[1:], 3): ixxx[w1] += 1 if w2 is None: continue ii[(w1, w2)] += 1 if w3 is None: continue iii[(w1, w2, w3)] += 1 ixi[(w1, w3)] += 1 if w4 is None: continue iiii[(w1, w2, w3, w4)] += 1 ixxi[(w1, w4)] += 1 ixii[(w1, w3, w4)] += 1 iixi[(w1, w2, w4)] += 1 return cls(ixxx, iiii, ii, iii, ixi, ixxi, iixi, ixii) def score_ngram(self, score_fn, w1, w2, w3, w4): n_all = self.N n_iiii = self.ngram_fd[(w1, w2, w3, w4)] if not n_iiii: return n_iiix = self.iii[(w1, w2, w3)] n_xiii = self.iii[(w2, w3, w4)] n_iixi = self.iixi[(w1, w2, w4)] n_ixii = self.ixii[(w1, w3, w4)] n_iixx = self.ii[(w1, w2)] n_xxii = self.ii[(w3, w4)] n_xiix = self.ii[(w2, w3)] n_ixix = self.ixi[(w1, w3)] n_ixxi = self.ixxi[(w1, w4)] n_xixi = self.ixi[(w2, w4)] n_ixxx = self.word_fd[w1] n_xixx = self.word_fd[w2] n_xxix = self.word_fd[w3] n_xxxi = self.word_fd[w4] return score_fn( n_iiii, (n_iiix, n_iixi, n_ixii, n_xiii), (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix), (n_ixxx, n_xixx, n_xxix, n_xxxi), n_all, ) def demo(scorer=None, compare_scorer=None): """Finds bigram collocations in the files of the WebText corpus.""" from nltk.metrics import ( BigramAssocMeasures, ranks_from_scores, spearman_correlation, ) if scorer is None: scorer = BigramAssocMeasures.likelihood_ratio if compare_scorer is None: compare_scorer = BigramAssocMeasures.raw_freq from nltk.corpus import stopwords, webtext ignored_words = stopwords.words("english") word_filter = lambda w: len(w) < 3 or w.lower() in ignored_words for file in webtext.fileids(): words = [word.lower() for word in webtext.words(file)] cf = BigramCollocationFinder.from_words(words) cf.apply_freq_filter(3) cf.apply_word_filter(word_filter) corr = spearman_correlation( ranks_from_scores(cf.score_ngrams(scorer)), ranks_from_scores(cf.score_ngrams(compare_scorer)), ) print(file) print("\t", [" ".join(tup) for tup in cf.nbest(scorer, 15)]) print(f"\t Correlation to {compare_scorer.__name__}: {corr:0.4f}") # Slows down loading too much # bigram_measures = BigramAssocMeasures() # trigram_measures = TrigramAssocMeasures() if __name__ == "__main__": import sys from nltk.metrics import BigramAssocMeasures try: scorer = eval("BigramAssocMeasures." + sys.argv[1]) except IndexError: scorer = None try: compare_scorer = eval("BigramAssocMeasures." + sys.argv[2]) except IndexError: compare_scorer = None demo(scorer, compare_scorer) __all__ = [ "BigramCollocationFinder", "TrigramCollocationFinder", "QuadgramCollocationFinder", ] nltk-3.7/nltk/compat.py000077500000000000000000000023601420073152400151420ustar00rootroot00000000000000# Natural Language Toolkit: Compatibility # # Copyright (C) 2001-2022 NLTK Project # # URL: # For license information, see LICENSE.TXT import os from functools import wraps # ======= Compatibility for datasets that care about Python versions ======== # The following datasets have a /PY3 subdirectory containing # a full copy of the data which has been re-encoded or repickled. 
DATA_UPDATES = [ ("chunkers", "maxent_ne_chunker"), ("help", "tagsets"), ("taggers", "maxent_treebank_pos_tagger"), ("tokenizers", "punkt"), ] _PY3_DATA_UPDATES = [os.path.join(*path_list) for path_list in DATA_UPDATES] def add_py3_data(path): for item in _PY3_DATA_UPDATES: if item in str(path) and "/PY3" not in str(path): pos = path.index(item) + len(item) if path[pos : pos + 4] == ".zip": pos += 4 path = path[:pos] + "/PY3" + path[pos:] break return path # for use in adding /PY3 to the second (filename) argument # of the file pointers in data.py def py3_data(init_func): def _decorator(*args, **kwargs): args = (args[0], add_py3_data(args[1])) + args[2:] return init_func(*args, **kwargs) return wraps(init_func)(_decorator) nltk-3.7/nltk/corpus/000077500000000000000000000000001420073152400146145ustar00rootroot00000000000000nltk-3.7/nltk/corpus/__init__.py000066400000000000000000000344471420073152400167410ustar00rootroot00000000000000# Natural Language Toolkit: Corpus Readers # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # URL: # For license information, see LICENSE.TXT # TODO this docstring isn't up-to-date! """ NLTK corpus readers. The modules in this package provide functions that can be used to read corpus files in a variety of formats. These functions can be used to read both the corpus files that are distributed in the NLTK corpus package, and corpus files that are part of external corpora. Available Corpora ================= Please see https://www.nltk.org/nltk_data/ for a complete list. Install corpora using nltk.download(). Corpus Reader Functions ======================= Each corpus module defines one or more "corpus reader functions", which can be used to read documents from that corpus. These functions take an argument, ``item``, which is used to indicate which document should be read from the corpus: - If ``item`` is one of the unique identifiers listed in the corpus module's ``items`` variable, then the corresponding document will be loaded from the NLTK corpus package. - If ``item`` is a filename, then that file will be read. Additionally, corpus reader functions can be given lists of item names; in which case, they will return a concatenation of the corresponding documents. Corpus reader functions are named based on the type of information they return. Some common examples, and their return types, are: - words(): list of str - sents(): list of (list of str) - paras(): list of (list of (list of str)) - tagged_words(): list of (str,str) tuple - tagged_sents(): list of (list of (str,str)) - tagged_paras(): list of (list of (list of (str,str))) - chunked_sents(): list of (Tree w/ (str,str) leaves) - parsed_sents(): list of (Tree with str leaves) - parsed_paras(): list of (list of (Tree with str leaves)) - xml(): A single xml ElementTree - raw(): unprocessed corpus contents For example, to read a list of the words in the Brown Corpus, use ``nltk.corpus.brown.words()``: >>> from nltk.corpus import brown >>> print(", ".join(brown.words())) The, Fulton, County, Grand, Jury, said, ... 
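# --- Illustrative sketch (not part of the NLTK source): the fileids/categories
# selection arguments described above (requires nltk.download("brown")). ---
from nltk.corpus import brown

print(brown.fileids()[:3])                     # ['ca01', 'ca02', 'ca03']
print(brown.categories()[:3])                  # ['adventure', 'belles_lettres', 'editorial']
print(brown.words(categories="news")[:5])      # ['The', 'Fulton', 'County', 'Grand', 'Jury']
print(brown.tagged_words(fileids="ca01")[:2])  # [('The', 'AT'), ('Fulton', 'NP-TL')]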
""" import re from nltk.corpus.reader import * from nltk.corpus.util import LazyCorpusLoader from nltk.tokenize import RegexpTokenizer abc = LazyCorpusLoader( "abc", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding=[("science", "latin_1"), ("rural", "utf8")], ) alpino = LazyCorpusLoader("alpino", AlpinoCorpusReader, tagset="alpino") brown = LazyCorpusLoader( "brown", CategorizedTaggedCorpusReader, r"c[a-z]\d\d", cat_file="cats.txt", tagset="brown", encoding="ascii", ) cess_cat = LazyCorpusLoader( "cess_cat", BracketParseCorpusReader, r"(?!\.).*\.tbf", tagset="unknown", encoding="ISO-8859-15", ) cess_esp = LazyCorpusLoader( "cess_esp", BracketParseCorpusReader, r"(?!\.).*\.tbf", tagset="unknown", encoding="ISO-8859-15", ) cmudict = LazyCorpusLoader("cmudict", CMUDictCorpusReader, ["cmudict"]) comtrans = LazyCorpusLoader("comtrans", AlignedCorpusReader, r"(?!\.).*\.txt") comparative_sentences = LazyCorpusLoader( "comparative_sentences", ComparativeSentencesCorpusReader, r"labeledSentences\.txt", encoding="latin-1", ) conll2000 = LazyCorpusLoader( "conll2000", ConllChunkCorpusReader, ["train.txt", "test.txt"], ("NP", "VP", "PP"), tagset="wsj", encoding="ascii", ) conll2002 = LazyCorpusLoader( "conll2002", ConllChunkCorpusReader, r".*\.(test|train).*", ("LOC", "PER", "ORG", "MISC"), encoding="utf-8", ) conll2007 = LazyCorpusLoader( "conll2007", DependencyCorpusReader, r".*\.(test|train).*", encoding=[("eus", "ISO-8859-2"), ("esp", "utf8")], ) crubadan = LazyCorpusLoader("crubadan", CrubadanCorpusReader, r".*\.txt") dependency_treebank = LazyCorpusLoader( "dependency_treebank", DependencyCorpusReader, r".*\.dp", encoding="ascii" ) floresta = LazyCorpusLoader( "floresta", BracketParseCorpusReader, r"(?!\.).*\.ptb", "#", tagset="unknown", encoding="ISO-8859-15", ) framenet15 = LazyCorpusLoader( "framenet_v15", FramenetCorpusReader, [ "frRelation.xml", "frameIndex.xml", "fulltextIndex.xml", "luIndex.xml", "semTypes.xml", ], ) framenet = LazyCorpusLoader( "framenet_v17", FramenetCorpusReader, [ "frRelation.xml", "frameIndex.xml", "fulltextIndex.xml", "luIndex.xml", "semTypes.xml", ], ) gazetteers = LazyCorpusLoader( "gazetteers", WordListCorpusReader, r"(?!LICENSE|\.).*\.txt", encoding="ISO-8859-2" ) genesis = LazyCorpusLoader( "genesis", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding=[ ("finnish|french|german", "latin_1"), ("swedish", "cp865"), (".*", "utf_8"), ], ) gutenberg = LazyCorpusLoader( "gutenberg", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="latin1" ) ieer = LazyCorpusLoader("ieer", IEERCorpusReader, r"(?!README|\.).*") inaugural = LazyCorpusLoader( "inaugural", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="latin1" ) # [XX] This should probably just use TaggedCorpusReader: indian = LazyCorpusLoader( "indian", IndianCorpusReader, r"(?!\.).*\.pos", tagset="unknown", encoding="utf8" ) jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*\.chasen", encoding="utf-8") knbc = LazyCorpusLoader("knbc/corpus1", KNBCorpusReader, r".*/KN.*", encoding="euc-jp") lin_thesaurus = LazyCorpusLoader("lin_thesaurus", LinThesaurusCorpusReader, r".*\.lsp") mac_morpho = LazyCorpusLoader( "mac_morpho", MacMorphoCorpusReader, r"(?!\.).*\.txt", tagset="unknown", encoding="latin-1", ) machado = LazyCorpusLoader( "machado", PortugueseCategorizedPlaintextCorpusReader, r"(?!\.).*\.txt", cat_pattern=r"([a-z]*)/.*", encoding="latin-1", ) masc_tagged = LazyCorpusLoader( "masc_tagged", CategorizedTaggedCorpusReader, r"(spoken|written)/.*\.txt", cat_file="categories.txt", tagset="wsj", 
encoding="utf-8", sep="_", ) movie_reviews = LazyCorpusLoader( "movie_reviews", CategorizedPlaintextCorpusReader, r"(?!\.).*\.txt", cat_pattern=r"(neg|pos)/.*", encoding="ascii", ) multext_east = LazyCorpusLoader( "mte_teip5", MTECorpusReader, r"(oana).*\.xml", encoding="utf-8" ) names = LazyCorpusLoader( "names", WordListCorpusReader, r"(?!\.).*\.txt", encoding="ascii" ) nps_chat = LazyCorpusLoader( "nps_chat", NPSChatCorpusReader, r"(?!README|\.).*\.xml", tagset="wsj" ) opinion_lexicon = LazyCorpusLoader( "opinion_lexicon", OpinionLexiconCorpusReader, r"(\w+)\-words\.txt", encoding="ISO-8859-2", ) ppattach = LazyCorpusLoader( "ppattach", PPAttachmentCorpusReader, ["training", "test", "devset"] ) product_reviews_1 = LazyCorpusLoader( "product_reviews_1", ReviewsCorpusReader, r"^(?!Readme).*\.txt", encoding="utf8" ) product_reviews_2 = LazyCorpusLoader( "product_reviews_2", ReviewsCorpusReader, r"^(?!Readme).*\.txt", encoding="utf8" ) pros_cons = LazyCorpusLoader( "pros_cons", ProsConsCorpusReader, r"Integrated(Cons|Pros)\.txt", cat_pattern=r"Integrated(Cons|Pros)\.txt", encoding="ISO-8859-2", ) ptb = LazyCorpusLoader( # Penn Treebank v3: WSJ and Brown portions "ptb", CategorizedBracketParseCorpusReader, r"(WSJ/\d\d/WSJ_\d\d|BROWN/C[A-Z]/C[A-Z])\d\d.MRG", cat_file="allcats.txt", tagset="wsj", ) qc = LazyCorpusLoader( "qc", StringCategoryCorpusReader, ["train.txt", "test.txt"], encoding="ISO-8859-2" ) reuters = LazyCorpusLoader( "reuters", CategorizedPlaintextCorpusReader, "(training|test).*", cat_file="cats.txt", encoding="ISO-8859-2", ) rte = LazyCorpusLoader("rte", RTECorpusReader, r"(?!\.).*\.xml") senseval = LazyCorpusLoader("senseval", SensevalCorpusReader, r"(?!\.).*\.pos") sentence_polarity = LazyCorpusLoader( "sentence_polarity", CategorizedSentencesCorpusReader, r"rt-polarity\.(neg|pos)", cat_pattern=r"rt-polarity\.(neg|pos)", encoding="utf-8", ) sentiwordnet = LazyCorpusLoader( "sentiwordnet", SentiWordNetCorpusReader, "SentiWordNet_3.0.0.txt", encoding="utf-8" ) shakespeare = LazyCorpusLoader("shakespeare", XMLCorpusReader, r"(?!\.).*\.xml") sinica_treebank = LazyCorpusLoader( "sinica_treebank", SinicaTreebankCorpusReader, ["parsed"], tagset="unknown", encoding="utf-8", ) state_union = LazyCorpusLoader( "state_union", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="ISO-8859-2" ) stopwords = LazyCorpusLoader( "stopwords", WordListCorpusReader, r"(?!README|\.).*", encoding="utf8" ) subjectivity = LazyCorpusLoader( "subjectivity", CategorizedSentencesCorpusReader, r"(quote.tok.gt9|plot.tok.gt9)\.5000", cat_map={"quote.tok.gt9.5000": ["subj"], "plot.tok.gt9.5000": ["obj"]}, encoding="latin-1", ) swadesh = LazyCorpusLoader( "swadesh", SwadeshCorpusReader, r"(?!README|\.).*", encoding="utf8" ) swadesh110 = LazyCorpusLoader( "panlex_swadesh", PanlexSwadeshCorpusReader, r"swadesh110/.*\.txt", encoding="utf8" ) swadesh207 = LazyCorpusLoader( "panlex_swadesh", PanlexSwadeshCorpusReader, r"swadesh207/.*\.txt", encoding="utf8" ) switchboard = LazyCorpusLoader("switchboard", SwitchboardCorpusReader, tagset="wsj") timit = LazyCorpusLoader("timit", TimitCorpusReader) timit_tagged = LazyCorpusLoader( "timit", TimitTaggedCorpusReader, r".+\.tags", tagset="wsj", encoding="ascii" ) toolbox = LazyCorpusLoader( "toolbox", ToolboxCorpusReader, r"(?!.*(README|\.)).*\.(dic|txt)" ) treebank = LazyCorpusLoader( "treebank/combined", BracketParseCorpusReader, r"wsj_.*\.mrg", tagset="wsj", encoding="ascii", ) treebank_chunk = LazyCorpusLoader( "treebank/tagged", ChunkedCorpusReader, r"wsj_.*\.pos", 
sent_tokenizer=RegexpTokenizer(r"(?<=/\.)\s*(?![^\[]*\])", gaps=True), para_block_reader=tagged_treebank_para_block_reader, tagset="wsj", encoding="ascii", ) treebank_raw = LazyCorpusLoader( "treebank/raw", PlaintextCorpusReader, r"wsj_.*", encoding="ISO-8859-2" ) twitter_samples = LazyCorpusLoader("twitter_samples", TwitterCorpusReader, r".*\.json") udhr = LazyCorpusLoader("udhr", UdhrCorpusReader) udhr2 = LazyCorpusLoader("udhr2", PlaintextCorpusReader, r".*\.txt", encoding="utf8") universal_treebanks = LazyCorpusLoader( "universal_treebanks_v20", ConllCorpusReader, r".*\.conll", columntypes=( "ignore", "words", "ignore", "ignore", "pos", "ignore", "ignore", "ignore", "ignore", "ignore", ), ) verbnet = LazyCorpusLoader("verbnet", VerbnetCorpusReader, r"(?!\.).*\.xml") webtext = LazyCorpusLoader( "webtext", PlaintextCorpusReader, r"(?!README|\.).*\.txt", encoding="ISO-8859-2" ) wordnet = LazyCorpusLoader( "wordnet", WordNetCorpusReader, LazyCorpusLoader("omw-1.4", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"), ) wordnet31 = LazyCorpusLoader( "wordnet31", WordNetCorpusReader, LazyCorpusLoader("omw-1.4", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"), ) wordnet2021 = LazyCorpusLoader( "wordnet2021", WordNetCorpusReader, LazyCorpusLoader("omw-1.4", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"), ) wordnet_ic = LazyCorpusLoader("wordnet_ic", WordNetICCorpusReader, r".*\.dat") words = LazyCorpusLoader( "words", WordListCorpusReader, r"(?!README|\.).*", encoding="ascii" ) # defined after treebank propbank = LazyCorpusLoader( "propbank", PropbankCorpusReader, "prop.txt", r"frames/.*\.xml", "verbs.txt", lambda filename: re.sub(r"^wsj/\d\d/", "", filename), treebank, ) # Must be defined *after* treebank corpus. nombank = LazyCorpusLoader( "nombank.1.0", NombankCorpusReader, "nombank.1.0", r"frames/.*\.xml", "nombank.1.0.words", lambda filename: re.sub(r"^wsj/\d\d/", "", filename), treebank, ) # Must be defined *after* treebank corpus. propbank_ptb = LazyCorpusLoader( "propbank", PropbankCorpusReader, "prop.txt", r"frames/.*\.xml", "verbs.txt", lambda filename: filename.upper(), ptb, ) # Must be defined *after* ptb corpus. nombank_ptb = LazyCorpusLoader( "nombank.1.0", NombankCorpusReader, "nombank.1.0", r"frames/.*\.xml", "nombank.1.0.words", lambda filename: filename.upper(), ptb, ) # Must be defined *after* ptb corpus. semcor = LazyCorpusLoader( "semcor", SemcorCorpusReader, r"brown./tagfiles/br-.*\.xml", wordnet ) # Must be defined *after* wordnet corpus. 
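# --- Illustrative sketch (not part of the NLTK source): each name defined
# above is a LazyCorpusLoader, so the real reader is only constructed on first
# use (requires the matching data package, e.g. nltk.download("treebank")). ---
from nltk.corpus import treebank

print(type(treebank).__name__)             # 'LazyCorpusLoader' until first access
print(treebank.fileids()[:2])              # ['wsj_0001.mrg', 'wsj_0002.mrg']
print(treebank.words("wsj_0001.mrg")[:4])  # ['Pierre', 'Vinken', ',', '61']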
nonbreaking_prefixes = LazyCorpusLoader( "nonbreaking_prefixes", NonbreakingPrefixesCorpusReader, r"(?!README|\.).*", encoding="utf8", ) perluniprops = LazyCorpusLoader( "perluniprops", UnicharsCorpusReader, r"(?!README|\.).*", nltk_data_subdir="misc", encoding="utf8", ) # mwa_ppdb = LazyCorpusLoader( # 'mwa_ppdb', MWAPPDBCorpusReader, r'(?!README|\.).*', nltk_data_subdir='misc', encoding='utf8') # See https://github.com/nltk/nltk/issues/1579 # and https://github.com/nltk/nltk/issues/1716 # # pl196x = LazyCorpusLoader( # 'pl196x', Pl196xCorpusReader, r'[a-z]-.*\.xml', # cat_file='cats.txt', textid_file='textids.txt', encoding='utf8') # # ipipan = LazyCorpusLoader( # 'ipipan', IPIPANCorpusReader, r'(?!\.).*morph\.xml') # # nkjp = LazyCorpusLoader( # 'nkjp', NKJPCorpusReader, r'', encoding='utf8') # # panlex_lite = LazyCorpusLoader( # 'panlex_lite', PanLexLiteCorpusReader) # # ycoe = LazyCorpusLoader( # 'ycoe', YCOECorpusReader) # # corpus not available with NLTK; these lines caused help(nltk.corpus) to break # hebrew_treebank = LazyCorpusLoader( # 'hebrew_treebank', BracketParseCorpusReader, r'.*\.txt') # FIXME: override any imported demo from various corpora, see https://github.com/nltk/nltk/issues/2116 def demo(): # This is out-of-date: abc.demo() brown.demo() # chat80.demo() cmudict.demo() conll2000.demo() conll2002.demo() genesis.demo() gutenberg.demo() ieer.demo() inaugural.demo() indian.demo() names.demo() ppattach.demo() senseval.demo() shakespeare.demo() sinica_treebank.demo() state_union.demo() stopwords.demo() timit.demo() toolbox.demo() treebank.demo() udhr.demo() webtext.demo() words.demo() # ycoe.demo() if __name__ == "__main__": # demo() pass nltk-3.7/nltk/corpus/europarl_raw.py000066400000000000000000000030761420073152400176760ustar00rootroot00000000000000# Natural Language Toolkit: Europarl Corpus Readers # # Copyright (C) 2001-2022 NLTK Project # Author: Nitin Madnani # URL: # For license information, see LICENSE.TXT import re from nltk.corpus.reader import * from nltk.corpus.util import LazyCorpusLoader # Create a new corpus reader instance for each European language danish = LazyCorpusLoader( "europarl_raw/danish", EuroparlCorpusReader, r"ep-.*\.da", encoding="utf-8" ) dutch = LazyCorpusLoader( "europarl_raw/dutch", EuroparlCorpusReader, r"ep-.*\.nl", encoding="utf-8" ) english = LazyCorpusLoader( "europarl_raw/english", EuroparlCorpusReader, r"ep-.*\.en", encoding="utf-8" ) finnish = LazyCorpusLoader( "europarl_raw/finnish", EuroparlCorpusReader, r"ep-.*\.fi", encoding="utf-8" ) french = LazyCorpusLoader( "europarl_raw/french", EuroparlCorpusReader, r"ep-.*\.fr", encoding="utf-8" ) german = LazyCorpusLoader( "europarl_raw/german", EuroparlCorpusReader, r"ep-.*\.de", encoding="utf-8" ) greek = LazyCorpusLoader( "europarl_raw/greek", EuroparlCorpusReader, r"ep-.*\.el", encoding="utf-8" ) italian = LazyCorpusLoader( "europarl_raw/italian", EuroparlCorpusReader, r"ep-.*\.it", encoding="utf-8" ) portuguese = LazyCorpusLoader( "europarl_raw/portuguese", EuroparlCorpusReader, r"ep-.*\.pt", encoding="utf-8" ) spanish = LazyCorpusLoader( "europarl_raw/spanish", EuroparlCorpusReader, r"ep-.*\.es", encoding="utf-8" ) swedish = LazyCorpusLoader( "europarl_raw/swedish", EuroparlCorpusReader, r"ep-.*\.sv", encoding="utf-8" ) nltk-3.7/nltk/corpus/reader/000077500000000000000000000000001420073152400160565ustar00rootroot00000000000000nltk-3.7/nltk/corpus/reader/__init__.py000066400000000000000000000144071420073152400201750ustar00rootroot00000000000000# Natural Language Toolkit: Corpus 
Readers # # Copyright (C) 2001-2022 NLTK Project # Author: Steven Bird # Edward Loper # URL: # For license information, see LICENSE.TXT """ NLTK corpus readers. The modules in this package provide functions that can be used to read corpus fileids in a variety of formats. These functions can be used to read both the corpus fileids that are distributed in the NLTK corpus package, and corpus fileids that are part of external corpora. Corpus Reader Functions ======================= Each corpus module defines one or more "corpus reader functions", which can be used to read documents from that corpus. These functions take an argument, ``item``, which is used to indicate which document should be read from the corpus: - If ``item`` is one of the unique identifiers listed in the corpus module's ``items`` variable, then the corresponding document will be loaded from the NLTK corpus package. - If ``item`` is a fileid, then that file will be read. Additionally, corpus reader functions can be given lists of item names; in which case, they will return a concatenation of the corresponding documents. Corpus reader functions are named based on the type of information they return. Some common examples, and their return types, are: - words(): list of str - sents(): list of (list of str) - paras(): list of (list of (list of str)) - tagged_words(): list of (str,str) tuple - tagged_sents(): list of (list of (str,str)) - tagged_paras(): list of (list of (list of (str,str))) - chunked_sents(): list of (Tree w/ (str,str) leaves) - parsed_sents(): list of (Tree with str leaves) - parsed_paras(): list of (list of (Tree with str leaves)) - xml(): A single xml ElementTree - raw(): unprocessed corpus contents For example, to read a list of the words in the Brown Corpus, use ``nltk.corpus.brown.words()``: >>> from nltk.corpus import brown >>> print(", ".join(brown.words())) The, Fulton, County, Grand, Jury, said, ... 
isort:skip_file """ from nltk.corpus.reader.plaintext import * from nltk.corpus.reader.util import * from nltk.corpus.reader.api import * from nltk.corpus.reader.tagged import * from nltk.corpus.reader.cmudict import * from nltk.corpus.reader.conll import * from nltk.corpus.reader.chunked import * from nltk.corpus.reader.wordlist import * from nltk.corpus.reader.xmldocs import * from nltk.corpus.reader.ppattach import * from nltk.corpus.reader.senseval import * from nltk.corpus.reader.ieer import * from nltk.corpus.reader.sinica_treebank import * from nltk.corpus.reader.bracket_parse import * from nltk.corpus.reader.indian import * from nltk.corpus.reader.toolbox import * from nltk.corpus.reader.timit import * from nltk.corpus.reader.ycoe import * from nltk.corpus.reader.rte import * from nltk.corpus.reader.string_category import * from nltk.corpus.reader.propbank import * from nltk.corpus.reader.verbnet import * from nltk.corpus.reader.bnc import * from nltk.corpus.reader.nps_chat import * from nltk.corpus.reader.wordnet import * from nltk.corpus.reader.switchboard import * from nltk.corpus.reader.dependency import * from nltk.corpus.reader.nombank import * from nltk.corpus.reader.ipipan import * from nltk.corpus.reader.pl196x import * from nltk.corpus.reader.knbc import * from nltk.corpus.reader.chasen import * from nltk.corpus.reader.childes import * from nltk.corpus.reader.aligned import * from nltk.corpus.reader.lin import * from nltk.corpus.reader.semcor import * from nltk.corpus.reader.framenet import * from nltk.corpus.reader.udhr import * from nltk.corpus.reader.bnc import * from nltk.corpus.reader.sentiwordnet import * from nltk.corpus.reader.twitter import * from nltk.corpus.reader.nkjp import * from nltk.corpus.reader.crubadan import * from nltk.corpus.reader.mte import * from nltk.corpus.reader.reviews import * from nltk.corpus.reader.opinion_lexicon import * from nltk.corpus.reader.pros_cons import * from nltk.corpus.reader.categorized_sents import * from nltk.corpus.reader.comparative_sents import * from nltk.corpus.reader.panlex_lite import * from nltk.corpus.reader.panlex_swadesh import * # Make sure that nltk.corpus.reader.bracket_parse gives the module, not # the function bracket_parse() defined in nltk.tree: from nltk.corpus.reader import bracket_parse __all__ = [ "CorpusReader", "CategorizedCorpusReader", "PlaintextCorpusReader", "find_corpus_fileids", "TaggedCorpusReader", "CMUDictCorpusReader", "ConllChunkCorpusReader", "WordListCorpusReader", "PPAttachmentCorpusReader", "SensevalCorpusReader", "IEERCorpusReader", "ChunkedCorpusReader", "SinicaTreebankCorpusReader", "BracketParseCorpusReader", "IndianCorpusReader", "ToolboxCorpusReader", "TimitCorpusReader", "YCOECorpusReader", "MacMorphoCorpusReader", "SyntaxCorpusReader", "AlpinoCorpusReader", "RTECorpusReader", "StringCategoryCorpusReader", "EuroparlCorpusReader", "CategorizedBracketParseCorpusReader", "CategorizedTaggedCorpusReader", "CategorizedPlaintextCorpusReader", "PortugueseCategorizedPlaintextCorpusReader", "tagged_treebank_para_block_reader", "PropbankCorpusReader", "VerbnetCorpusReader", "BNCCorpusReader", "ConllCorpusReader", "XMLCorpusReader", "NPSChatCorpusReader", "SwadeshCorpusReader", "WordNetCorpusReader", "WordNetICCorpusReader", "SwitchboardCorpusReader", "DependencyCorpusReader", "NombankCorpusReader", "IPIPANCorpusReader", "Pl196xCorpusReader", "TEICorpusView", "KNBCorpusReader", "ChasenCorpusReader", "CHILDESCorpusReader", "AlignedCorpusReader", "TimitTaggedCorpusReader", 
"LinThesaurusCorpusReader", "SemcorCorpusReader", "FramenetCorpusReader", "UdhrCorpusReader", "BNCCorpusReader", "SentiWordNetCorpusReader", "SentiSynset", "TwitterCorpusReader", "NKJPCorpusReader", "CrubadanCorpusReader", "MTECorpusReader", "ReviewsCorpusReader", "OpinionLexiconCorpusReader", "ProsConsCorpusReader", "CategorizedSentencesCorpusReader", "ComparativeSentencesCorpusReader", "PanLexLiteCorpusReader", "NonbreakingPrefixesCorpusReader", "UnicharsCorpusReader", "MWAPPDBCorpusReader", "PanlexSwadeshCorpusReader", ] nltk-3.7/nltk/corpus/reader/aligned.py000066400000000000000000000113631420073152400200370ustar00rootroot00000000000000# Natural Language Toolkit: Aligned Corpus Reader # # Copyright (C) 2001-2022 NLTK Project # URL: # Author: Steven Bird # For license information, see LICENSE.TXT from nltk.corpus.reader.api import CorpusReader from nltk.corpus.reader.util import ( StreamBackedCorpusView, concat, read_alignedsent_block, ) from nltk.tokenize import RegexpTokenizer, WhitespaceTokenizer from nltk.translate import AlignedSent, Alignment class AlignedCorpusReader(CorpusReader): """ Reader for corpora of word-aligned sentences. Tokens are assumed to be separated by whitespace. Sentences begin on separate lines. """ def __init__( self, root, fileids, sep="/", word_tokenizer=WhitespaceTokenizer(), sent_tokenizer=RegexpTokenizer("\n", gaps=True), alignedsent_block_reader=read_alignedsent_block, encoding="latin1", ): """ Construct a new Aligned Corpus reader for a set of documents located at the given root directory. Example usage: >>> root = '/...path to corpus.../' >>> reader = AlignedCorpusReader(root, '.*', '.txt') # doctest: +SKIP :param root: The root directory for this corpus. :param fileids: A list or regexp specifying the fileids in this corpus. """ CorpusReader.__init__(self, root, fileids, encoding) self._sep = sep self._word_tokenizer = word_tokenizer self._sent_tokenizer = sent_tokenizer self._alignedsent_block_reader = alignedsent_block_reader def words(self, fileids=None): """ :return: the given file(s) as a list of words and punctuation symbols. :rtype: list(str) """ return concat( [ AlignedSentCorpusView( fileid, enc, False, False, self._word_tokenizer, self._sent_tokenizer, self._alignedsent_block_reader, ) for (fileid, enc) in self.abspaths(fileids, True) ] ) def sents(self, fileids=None): """ :return: the given file(s) as a list of sentences or utterances, each encoded as a list of word strings. :rtype: list(list(str)) """ return concat( [ AlignedSentCorpusView( fileid, enc, False, True, self._word_tokenizer, self._sent_tokenizer, self._alignedsent_block_reader, ) for (fileid, enc) in self.abspaths(fileids, True) ] ) def aligned_sents(self, fileids=None): """ :return: the given file(s) as a list of AlignedSent objects. :rtype: list(AlignedSent) """ return concat( [ AlignedSentCorpusView( fileid, enc, True, True, self._word_tokenizer, self._sent_tokenizer, self._alignedsent_block_reader, ) for (fileid, enc) in self.abspaths(fileids, True) ] ) class AlignedSentCorpusView(StreamBackedCorpusView): """ A specialized corpus view for aligned sentences. ``AlignedSentCorpusView`` objects are typically created by ``AlignedCorpusReader`` (not directly by nltk users). 
""" def __init__( self, corpus_file, encoding, aligned, group_by_sent, word_tokenizer, sent_tokenizer, alignedsent_block_reader, ): self._aligned = aligned self._group_by_sent = group_by_sent self._word_tokenizer = word_tokenizer self._sent_tokenizer = sent_tokenizer self._alignedsent_block_reader = alignedsent_block_reader StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding) def read_block(self, stream): block = [ self._word_tokenizer.tokenize(sent_str) for alignedsent_str in self._alignedsent_block_reader(stream) for sent_str in self._sent_tokenizer.tokenize(alignedsent_str) ] if self._aligned: block[2] = Alignment.fromstring( " ".join(block[2]) ) # kludge; we shouldn't have tokenized the alignment string block = [AlignedSent(*block)] elif self._group_by_sent: block = [block[0]] else: block = block[0] return block nltk-3.7/nltk/corpus/reader/api.py000066400000000000000000000453231420073152400172100ustar00rootroot00000000000000# Natural Language Toolkit: API for Corpus Readers # # Copyright (C) 2001-2022 NLTK Project # Author: Steven Bird # Edward Loper # URL: # For license information, see LICENSE.TXT """ API for corpus readers. """ import os import re from collections import defaultdict from itertools import chain from nltk.corpus.reader.util import * from nltk.data import FileSystemPathPointer, PathPointer, ZipFilePathPointer class CorpusReader: """ A base class for "corpus reader" classes, each of which can be used to read a specific corpus format. Each individual corpus reader instance is used to read a specific corpus, consisting of one or more files under a common root directory. Each file is identified by its ``file identifier``, which is the relative path to the file from the root directory. A separate subclass is defined for each corpus format. These subclasses define one or more methods that provide 'views' on the corpus contents, such as ``words()`` (for a list of words) and ``parsed_sents()`` (for a list of parsed sentences). Called with no arguments, these methods will return the contents of the entire corpus. For most corpora, these methods define one or more selection arguments, such as ``fileids`` or ``categories``, which can be used to select which portion of the corpus should be returned. """ def __init__(self, root, fileids, encoding="utf8", tagset=None): """ :type root: PathPointer or str :param root: A path pointer identifying the root directory for this corpus. If a string is specified, then it will be converted to a ``PathPointer`` automatically. :param fileids: A list of the files that make up this corpus. This list can either be specified explicitly, as a list of strings; or implicitly, as a regular expression over file paths. The absolute path for each file will be constructed by joining the reader's root to each file name. :param encoding: The default unicode encoding for the files that make up the corpus. The value of ``encoding`` can be any of the following: - A string: ``encoding`` is the encoding name for all files. - A dictionary: ``encoding[file_id]`` is the encoding name for the file whose identifier is ``file_id``. If ``file_id`` is not in ``encoding``, then the file contents will be processed using non-unicode byte strings. - A list: ``encoding`` should be a list of ``(regexp, encoding)`` tuples. The encoding for a file whose identifier is ``file_id`` will be the ``encoding`` value for the first tuple whose ``regexp`` matches the ``file_id``. 
If no tuple's ``regexp`` matches the ``file_id``, the file contents will be processed using non-unicode byte strings. - None: the file contents of all files will be processed using non-unicode byte strings. :param tagset: The name of the tagset used by this corpus, to be used for normalizing or converting the POS tags returned by the ``tagged_...()`` methods. """ # Convert the root to a path pointer, if necessary. if isinstance(root, str) and not isinstance(root, PathPointer): m = re.match(r"(.*\.zip)/?(.*)$|", root) zipfile, zipentry = m.groups() if zipfile: root = ZipFilePathPointer(zipfile, zipentry) else: root = FileSystemPathPointer(root) elif not isinstance(root, PathPointer): raise TypeError("CorpusReader: expected a string or a PathPointer") # If `fileids` is a regexp, then expand it. if isinstance(fileids, str): fileids = find_corpus_fileids(root, fileids) self._fileids = fileids """A list of the relative paths for the fileids that make up this corpus.""" self._root = root """The root directory for this corpus.""" self._readme = "README" self._license = "LICENSE" self._citation = "citation.bib" # If encoding was specified as a list of regexps, then convert # it to a dictionary. if isinstance(encoding, list): encoding_dict = {} for fileid in self._fileids: for x in encoding: (regexp, enc) = x if re.match(regexp, fileid): encoding_dict[fileid] = enc break encoding = encoding_dict self._encoding = encoding """The default unicode encoding for the fileids that make up this corpus. If ``encoding`` is None, then the file contents are processed using byte strings.""" self._tagset = tagset def __repr__(self): if isinstance(self._root, ZipFilePathPointer): path = f"{self._root.zipfile.filename}/{self._root.entry}" else: path = "%s" % self._root.path return f"<{self.__class__.__name__} in {path!r}>" def ensure_loaded(self): """ Load this corpus (if it has not already been loaded). This is used by LazyCorpusLoader as a simple method that can be used to make sure a corpus is loaded -- e.g., in case a user wants to do help(some_corpus). """ pass # no need to actually do anything. def readme(self): """ Return the contents of the corpus README file, if it exists. """ with self.open(self._readme) as f: return f.read() def license(self): """ Return the contents of the corpus LICENSE file, if it exists. """ with self.open(self._license) as f: return f.read() def citation(self): """ Return the contents of the corpus citation.bib file, if it exists. """ with self.open(self._citation) as f: return f.read() def fileids(self): """ Return a list of file identifiers for the fileids that make up this corpus. """ return self._fileids def abspath(self, fileid): """ Return the absolute path for the given file. :type fileid: str :param fileid: The file identifier for the file whose path should be returned. :rtype: PathPointer """ return self._root.join(fileid) def abspaths(self, fileids=None, include_encoding=False, include_fileid=False): """ Return a list of the absolute paths for all fileids in this corpus; or for the given list of fileids, if specified. :type fileids: None or str or list :param fileids: Specifies the set of fileids for which paths should be returned. Can be None, for all fileids; a list of file identifiers, for a specified set of fileids; or a single file identifier, for a single file. Note that the return value is always a list of paths, even if ``fileids`` is a single file identifier. :param include_encoding: If true, then return a list of ``(path_pointer, encoding)`` tuples. 
:rtype: list(PathPointer) """ if fileids is None: fileids = self._fileids elif isinstance(fileids, str): fileids = [fileids] paths = [self._root.join(f) for f in fileids] if include_encoding and include_fileid: return list(zip(paths, [self.encoding(f) for f in fileids], fileids)) elif include_fileid: return list(zip(paths, fileids)) elif include_encoding: return list(zip(paths, [self.encoding(f) for f in fileids])) else: return paths def raw(self, fileids=None): """ :param fileids: A list specifying the fileids that should be used. :return: the given file(s) as a single string. :rtype: str """ if fileids is None: fileids = self._fileids elif isinstance(fileids, str): fileids = [fileids] contents = [] for f in fileids: with self.open(f) as fp: contents.append(fp.read()) return concat(contents) def open(self, file): """ Return an open stream that can be used to read the given file. If the file's encoding is not None, then the stream will automatically decode the file's contents into unicode. :param file: The file identifier of the file to read. """ encoding = self.encoding(file) stream = self._root.join(file).open(encoding) return stream def encoding(self, file): """ Return the unicode encoding for the given corpus file, if known. If the encoding is unknown, or if the given file should be processed using byte strings (str), then return None. """ if isinstance(self._encoding, dict): return self._encoding.get(file) else: return self._encoding def _get_root(self): return self._root root = property( _get_root, doc=""" The directory where this corpus is stored. :type: PathPointer""", ) ###################################################################### # { Corpora containing categorized items ###################################################################### class CategorizedCorpusReader: """ A mixin class used to aid in the implementation of corpus readers for categorized corpora. This class defines the method ``categories()``, which returns a list of the categories for the corpus or for a specified set of fileids; and overrides ``fileids()`` to take a ``categories`` argument, restricting the set of fileids to be returned. Subclasses are expected to: - Call ``__init__()`` to set up the mapping. - Override all view methods to accept a ``categories`` parameter, which can be used *instead* of the ``fileids`` parameter, to select which fileids should be included in the returned view. """ def __init__(self, kwargs): """ Initialize this mapping based on keyword arguments, as follows: - cat_pattern: A regular expression pattern used to find the category for each file identifier. The pattern will be applied to each file identifier, and the first matching group will be used as the category label for that file. - cat_map: A dictionary, mapping from file identifiers to category labels. - cat_file: The name of a file that contains the mapping from file identifiers to categories. The argument ``cat_delimiter`` can be used to specify a delimiter. The corresponding argument will be deleted from ``kwargs``. If more than one argument is specified, an exception will be raised. 
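# A minimal usage sketch of the generic ``CorpusReader`` interface defined above,
# using the Gutenberg selection as an example corpus. Assumes
# ``nltk.download('gutenberg')`` has been run; the fileid is one of its standard files.
#
#   >>> from nltk.corpus import gutenberg
#   >>> gutenberg.fileids()[:3]                 # relative paths under the corpus root
#   >>> gutenberg.abspath('austen-emma.txt')    # a PathPointer
#   >>> gutenberg.encoding('austen-emma.txt')
#   >>> gutenberg.raw('austen-emma.txt')[:60]
#   >>> gutenberg.readme()[:60]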
""" self._f2c = None #: file-to-category mapping self._c2f = None #: category-to-file mapping self._pattern = None #: regexp specifying the mapping self._map = None #: dict specifying the mapping self._file = None #: fileid of file containing the mapping self._delimiter = None #: delimiter for ``self._file`` if "cat_pattern" in kwargs: self._pattern = kwargs["cat_pattern"] del kwargs["cat_pattern"] elif "cat_map" in kwargs: self._map = kwargs["cat_map"] del kwargs["cat_map"] elif "cat_file" in kwargs: self._file = kwargs["cat_file"] del kwargs["cat_file"] if "cat_delimiter" in kwargs: self._delimiter = kwargs["cat_delimiter"] del kwargs["cat_delimiter"] else: raise ValueError( "Expected keyword argument cat_pattern or " "cat_map or cat_file." ) if "cat_pattern" in kwargs or "cat_map" in kwargs or "cat_file" in kwargs: raise ValueError( "Specify exactly one of: cat_pattern, " "cat_map, cat_file." ) def _init(self): self._f2c = defaultdict(set) self._c2f = defaultdict(set) if self._pattern is not None: for file_id in self._fileids: category = re.match(self._pattern, file_id).group(1) self._add(file_id, category) elif self._map is not None: for (file_id, categories) in self._map.items(): for category in categories: self._add(file_id, category) elif self._file is not None: with self.open(self._file) as f: for line in f.readlines(): line = line.strip() file_id, categories = line.split(self._delimiter, 1) if file_id not in self.fileids(): raise ValueError( "In category mapping file %s: %s " "not found" % (self._file, file_id) ) for category in categories.split(self._delimiter): self._add(file_id, category) def _add(self, file_id, category): self._f2c[file_id].add(category) self._c2f[category].add(file_id) def categories(self, fileids=None): """ Return a list of the categories that are defined for this corpus, or for the file(s) if it is given. """ if self._f2c is None: self._init() if fileids is None: return sorted(self._c2f) if isinstance(fileids, str): fileids = [fileids] return sorted(set.union(*(self._f2c[d] for d in fileids))) def fileids(self, categories=None): """ Return a list of file identifiers for the files that make up this corpus, or that make up the given category(s) if specified. """ if categories is None: return super().fileids() elif isinstance(categories, str): if self._f2c is None: self._init() if categories in self._c2f: return sorted(self._c2f[categories]) else: raise ValueError("Category %s not found" % categories) else: if self._f2c is None: self._init() return sorted(set.union(*(self._c2f[c] for c in categories))) def _resolve(self, fileids, categories): if fileids is not None and categories is not None: raise ValueError("Specify fileids or categories, not both") if categories is not None: return self.fileids(categories) else: return fileids def raw(self, fileids=None, categories=None): return super().raw(self._resolve(fileids, categories)) def words(self, fileids=None, categories=None): return super().words(self._resolve(fileids, categories)) def sents(self, fileids=None, categories=None): return super().sents(self._resolve(fileids, categories)) def paras(self, fileids=None, categories=None): return super().paras(self._resolve(fileids, categories)) ###################################################################### # { Treebank readers ###################################################################### # [xx] is it worth it to factor this out? 
class SyntaxCorpusReader(CorpusReader): """ An abstract base class for reading corpora consisting of syntactically parsed text. Subclasses should define: - ``__init__``, which specifies the location of the corpus and a method for detecting the sentence blocks in corpus files. - ``_read_block``, which reads a block from the input stream. - ``_word``, which takes a block and returns a list of list of words. - ``_tag``, which takes a block and returns a list of list of tagged words. - ``_parse``, which takes a block and returns a list of parsed sentences. """ def _parse(self, s): raise NotImplementedError() def _word(self, s): raise NotImplementedError() def _tag(self, s): raise NotImplementedError() def _read_block(self, stream): raise NotImplementedError() def parsed_sents(self, fileids=None): reader = self._read_parsed_sent_block return concat( [ StreamBackedCorpusView(fileid, reader, encoding=enc) for fileid, enc in self.abspaths(fileids, True) ] ) def tagged_sents(self, fileids=None, tagset=None): def reader(stream): return self._read_tagged_sent_block(stream, tagset) return concat( [ StreamBackedCorpusView(fileid, reader, encoding=enc) for fileid, enc in self.abspaths(fileids, True) ] ) def sents(self, fileids=None): reader = self._read_sent_block return concat( [ StreamBackedCorpusView(fileid, reader, encoding=enc) for fileid, enc in self.abspaths(fileids, True) ] ) def tagged_words(self, fileids=None, tagset=None): def reader(stream): return self._read_tagged_word_block(stream, tagset) return concat( [ StreamBackedCorpusView(fileid, reader, encoding=enc) for fileid, enc in self.abspaths(fileids, True) ] ) def words(self, fileids=None): return concat( [ StreamBackedCorpusView(fileid, self._read_word_block, encoding=enc) for fileid, enc in self.abspaths(fileids, True) ] ) # ------------------------------------------------------------ # { Block Readers def _read_word_block(self, stream): return list(chain.from_iterable(self._read_sent_block(stream))) def _read_tagged_word_block(self, stream, tagset=None): return list(chain.from_iterable(self._read_tagged_sent_block(stream, tagset))) def _read_sent_block(self, stream): return list(filter(None, [self._word(t) for t in self._read_block(stream)])) def _read_tagged_sent_block(self, stream, tagset=None): return list( filter(None, [self._tag(t, tagset) for t in self._read_block(stream)]) ) def _read_parsed_sent_block(self, stream): return list(filter(None, [self._parse(t) for t in self._read_block(stream)])) # } End of Block Readers # ------------------------------------------------------------ nltk-3.7/nltk/corpus/reader/bnc.py000066400000000000000000000223531420073152400171770ustar00rootroot00000000000000# Natural Language Toolkit: Plaintext Corpus Reader # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # URL: # For license information, see LICENSE.TXT """Corpus reader for the XML version of the British National Corpus.""" from nltk.corpus.reader.util import concat from nltk.corpus.reader.xmldocs import ElementTree, XMLCorpusReader, XMLCorpusView class BNCCorpusReader(XMLCorpusReader): r"""Corpus reader for the XML version of the British National Corpus. For access to the complete XML data structure, use the ``xml()`` method. For access to simple word lists and tagged word lists, use ``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``. 
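# A minimal usage sketch of the ``SyntaxCorpusReader`` view methods, via the Penn
# Treebank sample whose reader (``BracketParseCorpusReader``, defined later in this
# package) subclasses it. Assumes ``nltk.download('treebank')``.
#
#   >>> from nltk.corpus import treebank
#   >>> treebank.words('wsj_0001.mrg')[:7]
#   >>> treebank.tagged_sents('wsj_0001.mrg')[0][:3]
#   >>> treebank.parsed_sents('wsj_0001.mrg')[0]   # an nltk.tree.Tree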
You can obtain the full version of the BNC corpus at https://www.ota.ox.ac.uk/desc/2554 If you extracted the archive to a directory called `BNC`, then you can instantiate the reader as:: BNCCorpusReader(root='BNC/Texts/', fileids=r'[A-K]/\w*/\w*\.xml') """ def __init__(self, root, fileids, lazy=True): XMLCorpusReader.__init__(self, root, fileids) self._lazy = lazy def words(self, fileids=None, strip_space=True, stem=False): """ :return: the given file(s) as a list of words and punctuation symbols. :rtype: list(str) :param strip_space: If true, then strip trailing spaces from word tokens. Otherwise, leave the spaces on the tokens. :param stem: If true, then use word stems instead of word strings. """ return self._views(fileids, False, None, strip_space, stem) def tagged_words(self, fileids=None, c5=False, strip_space=True, stem=False): """ :return: the given file(s) as a list of tagged words and punctuation symbols, encoded as tuples ``(word,tag)``. :rtype: list(tuple(str,str)) :param c5: If true, then the tags used will be the more detailed c5 tags. Otherwise, the simplified tags will be used. :param strip_space: If true, then strip trailing spaces from word tokens. Otherwise, leave the spaces on the tokens. :param stem: If true, then use word stems instead of word strings. """ tag = "c5" if c5 else "pos" return self._views(fileids, False, tag, strip_space, stem) def sents(self, fileids=None, strip_space=True, stem=False): """ :return: the given file(s) as a list of sentences or utterances, each encoded as a list of word strings. :rtype: list(list(str)) :param strip_space: If true, then strip trailing spaces from word tokens. Otherwise, leave the spaces on the tokens. :param stem: If true, then use word stems instead of word strings. """ return self._views(fileids, True, None, strip_space, stem) def tagged_sents(self, fileids=None, c5=False, strip_space=True, stem=False): """ :return: the given file(s) as a list of sentences, each encoded as a list of ``(word,tag)`` tuples. :rtype: list(list(tuple(str,str))) :param c5: If true, then the tags used will be the more detailed c5 tags. Otherwise, the simplified tags will be used. :param strip_space: If true, then strip trailing spaces from word tokens. Otherwise, leave the spaces on the tokens. :param stem: If true, then use word stems instead of word strings. """ tag = "c5" if c5 else "pos" return self._views( fileids, sent=True, tag=tag, strip_space=strip_space, stem=stem ) def _views(self, fileids=None, sent=False, tag=False, strip_space=True, stem=False): """A helper function that instantiates BNCWordViews or the list of words/sentences.""" f = BNCWordView if self._lazy else self._words return concat( [ f(fileid, sent, tag, strip_space, stem) for fileid in self.abspaths(fileids) ] ) def _words(self, fileid, bracket_sent, tag, strip_space, stem): """ Helper used to implement the view methods -- returns a list of words or a list of sentences, optionally tagged. :param fileid: The name of the underlying file. :param bracket_sent: If true, include sentence bracketing. :param tag: The name of the tagset to use, or None for no tags. :param strip_space: If true, strip spaces from word tokens. :param stem: If true, then substitute stems for words. """ result = [] xmldoc = ElementTree.parse(fileid).getroot() for xmlsent in xmldoc.findall(".//s"): sent = [] for xmlword in _all_xmlwords_in(xmlsent): word = xmlword.text if not word: word = "" # fixes issue 337? 
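# A minimal usage sketch for ``BNCCorpusReader``: the BNC data is not distributed
# with NLTK, so the reader is instantiated directly as in the class docstring above.
# The root path is an assumption about where the archive was unpacked.
#
#   >>> bnc = BNCCorpusReader(root='BNC/Texts/', fileids=r'[A-K]/\w*/\w*\.xml')
#   >>> bnc.tagged_words(c5=True)[:5]   # detailed C5 tags
#   >>> bnc.sents(stem=True)[0]         # headword stems instead of surface forms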
if strip_space or stem: word = word.strip() if stem: word = xmlword.get("hw", word) if tag == "c5": word = (word, xmlword.get("c5")) elif tag == "pos": word = (word, xmlword.get("pos", xmlword.get("c5"))) sent.append(word) if bracket_sent: result.append(BNCSentence(xmlsent.attrib["n"], sent)) else: result.extend(sent) assert None not in result return result def _all_xmlwords_in(elt, result=None): if result is None: result = [] for child in elt: if child.tag in ("c", "w"): result.append(child) else: _all_xmlwords_in(child, result) return result class BNCSentence(list): """ A list of words, augmented by an attribute ``num`` used to record the sentence identifier (the ``n`` attribute from the XML). """ def __init__(self, num, items): self.num = num list.__init__(self, items) class BNCWordView(XMLCorpusView): """ A stream backed corpus view specialized for use with the BNC corpus. """ tags_to_ignore = { "pb", "gap", "vocal", "event", "unclear", "shift", "pause", "align", } """These tags are ignored. For their description refer to the technical documentation, for example, http://www.natcorp.ox.ac.uk/docs/URG/ref-vocal.html """ def __init__(self, fileid, sent, tag, strip_space, stem): """ :param fileid: The name of the underlying file. :param sent: If true, include sentence bracketing. :param tag: The name of the tagset to use, or None for no tags. :param strip_space: If true, strip spaces from word tokens. :param stem: If true, then substitute stems for words. """ if sent: tagspec = ".*/s" else: tagspec = ".*/s/(.*/)?(c|w)" self._sent = sent self._tag = tag self._strip_space = strip_space self._stem = stem self.title = None #: Title of the document. self.author = None #: Author of the document. self.editor = None #: Editor self.resps = None #: Statement of responsibility XMLCorpusView.__init__(self, fileid, tagspec) # Read in a tasty header. self._open() self.read_block(self._stream, ".*/teiHeader$", self.handle_header) self.close() # Reset tag context. self._tag_context = {0: ()} def handle_header(self, elt, context): # Set up some metadata! titles = elt.findall("titleStmt/title") if titles: self.title = "\n".join(title.text.strip() for title in titles) authors = elt.findall("titleStmt/author") if authors: self.author = "\n".join(author.text.strip() for author in authors) editors = elt.findall("titleStmt/editor") if editors: self.editor = "\n".join(editor.text.strip() for editor in editors) resps = elt.findall("titleStmt/respStmt") if resps: self.resps = "\n\n".join( "\n".join(resp_elt.text.strip() for resp_elt in resp) for resp in resps ) def handle_elt(self, elt, context): if self._sent: return self.handle_sent(elt) else: return self.handle_word(elt) def handle_word(self, elt): word = elt.text if not word: word = "" # fixes issue 337? 
if self._strip_space or self._stem: word = word.strip() if self._stem: word = elt.get("hw", word) if self._tag == "c5": word = (word, elt.get("c5")) elif self._tag == "pos": word = (word, elt.get("pos", elt.get("c5"))) return word def handle_sent(self, elt): sent = [] for child in elt: if child.tag in ("mw", "hi", "corr", "trunc"): sent += [self.handle_word(w) for w in child] elif child.tag in ("w", "c"): sent.append(self.handle_word(child)) elif child.tag not in self.tags_to_ignore: raise ValueError("Unexpected element %s" % child.tag) return BNCSentence(elt.attrib["n"], sent) nltk-3.7/nltk/corpus/reader/bracket_parse.py000066400000000000000000000222461420073152400212430ustar00rootroot00000000000000# Natural Language Toolkit: Penn Treebank Reader # # Copyright (C) 2001-2022 NLTK Project # Author: Steven Bird # Edward Loper # URL: # For license information, see LICENSE.TXT """ Corpus reader for corpora that consist of parenthesis-delineated parse trees. """ import sys from nltk.corpus.reader.api import * from nltk.corpus.reader.util import * from nltk.tag import map_tag from nltk.tree import Tree # we use [^\s()]+ instead of \S+? to avoid matching () SORTTAGWRD = re.compile(r"\((\d+) ([^\s()]+) ([^\s()]+)\)") TAGWORD = re.compile(r"\(([^\s()]+) ([^\s()]+)\)") WORD = re.compile(r"\([^\s()]+ ([^\s()]+)\)") EMPTY_BRACKETS = re.compile(r"\s*\(\s*\(") class BracketParseCorpusReader(SyntaxCorpusReader): """ Reader for corpora that consist of parenthesis-delineated parse trees, like those found in the "combined" section of the Penn Treebank, e.g. "(S (NP (DT the) (JJ little) (NN dog)) (VP (VBD barked)))". """ def __init__( self, root, fileids, comment_char=None, detect_blocks="unindented_paren", encoding="utf8", tagset=None, ): """ :param root: The root directory for this corpus. :param fileids: A list or regexp specifying the fileids in this corpus. :param comment_char: The character which can appear at the start of a line to indicate that the rest of the line is a comment. :param detect_blocks: The method that is used to find blocks in the corpus; can be 'unindented_paren' (every unindented parenthesis starts a new parse) or 'sexpr' (brackets are matched). :param tagset: The name of the tagset used by this corpus, to be used for normalizing or converting the POS tags returned by the ``tagged_...()`` methods. """ SyntaxCorpusReader.__init__(self, root, fileids, encoding) self._comment_char = comment_char self._detect_blocks = detect_blocks self._tagset = tagset def _read_block(self, stream): if self._detect_blocks == "sexpr": return read_sexpr_block(stream, comment_char=self._comment_char) elif self._detect_blocks == "blankline": return read_blankline_block(stream) elif self._detect_blocks == "unindented_paren": # Tokens start with unindented left parens. toks = read_regexp_block(stream, start_re=r"^\(") # Strip any comments out of the tokens. if self._comment_char: toks = [ re.sub("(?m)^%s.*" % re.escape(self._comment_char), "", tok) for tok in toks ] return toks else: assert 0, "bad block type" def _normalize(self, t): # Replace leaves of the form (!), (,), with (! 
!), (, ,) t = re.sub(r"\((.)\)", r"(\1 \1)", t) # Replace leaves of the form (tag word root) with (tag word) t = re.sub(r"\(([^\s()]+) ([^\s()]+) [^\s()]+\)", r"(\1 \2)", t) return t def _parse(self, t): try: tree = Tree.fromstring(self._normalize(t)) # If there's an empty node at the top, strip it off if tree.label() == "" and len(tree) == 1: return tree[0] else: return tree except ValueError as e: sys.stderr.write("Bad tree detected; trying to recover...\n") # Try to recover, if we can: if e.args == ("mismatched parens",): for n in range(1, 5): try: v = Tree(self._normalize(t + ")" * n)) sys.stderr.write( " Recovered by adding %d close " "paren(s)\n" % n ) return v except ValueError: pass # Try something else: sys.stderr.write(" Recovered by returning a flat parse.\n") # sys.stderr.write(' '.join(t.split())+'\n') return Tree("S", self._tag(t)) def _tag(self, t, tagset=None): tagged_sent = [(w, p) for (p, w) in TAGWORD.findall(self._normalize(t))] if tagset and tagset != self._tagset: tagged_sent = [ (w, map_tag(self._tagset, tagset, p)) for (w, p) in tagged_sent ] return tagged_sent def _word(self, t): return WORD.findall(self._normalize(t)) class CategorizedBracketParseCorpusReader( CategorizedCorpusReader, BracketParseCorpusReader ): """ A reader for parsed corpora whose documents are divided into categories based on their file identifiers. @author: Nathan Schneider """ def __init__(self, *args, **kwargs): """ Initialize the corpus reader. Categorization arguments (C{cat_pattern}, C{cat_map}, and C{cat_file}) are passed to the L{CategorizedCorpusReader constructor }. The remaining arguments are passed to the L{BracketParseCorpusReader constructor }. """ CategorizedCorpusReader.__init__(self, kwargs) BracketParseCorpusReader.__init__(self, *args, **kwargs) def tagged_words(self, fileids=None, categories=None, tagset=None): return super().tagged_words(self._resolve(fileids, categories), tagset) def tagged_sents(self, fileids=None, categories=None, tagset=None): return super().tagged_sents(self._resolve(fileids, categories), tagset) def tagged_paras(self, fileids=None, categories=None, tagset=None): return super().tagged_paras(self._resolve(fileids, categories), tagset) def parsed_words(self, fileids=None, categories=None): return super().parsed_words(self._resolve(fileids, categories)) def parsed_sents(self, fileids=None, categories=None): return super().parsed_sents(self._resolve(fileids, categories)) def parsed_paras(self, fileids=None, categories=None): return super().parsed_paras(self._resolve(fileids, categories)) class AlpinoCorpusReader(BracketParseCorpusReader): """ Reader for the Alpino Dutch Treebank. This corpus has a lexical breakdown structure embedded, as read by `_parse` Unfortunately this puts punctuation and some other words out of the sentence order in the xml element tree. This is no good for `tag_` and `word_` `_tag` and `_word` will be overridden to use a non-default new parameter 'ordered' to the overridden _normalize function. The _parse function can then remain untouched. """ def __init__(self, root, encoding="ISO-8859-1", tagset=None): BracketParseCorpusReader.__init__( self, root, r"alpino\.xml", detect_blocks="blankline", encoding=encoding, tagset=tagset, ) def _normalize(self, t, ordered=False): """Normalize the xml sentence element in t. The sentence elements , although embedded in a few overall xml elements, are separated by blank lines. That's how the reader can deliver them one at a time. Each sentence has a few category subnodes that are of no use to us. 
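# A minimal usage sketch for ``AlpinoCorpusReader``: NLTK exposes an ``alpino``
# loader built on this reader. Assumes ``nltk.download('alpino')``.
#
#   >>> from nltk.corpus import alpino
#   >>> alpino.words()[:7]
#   >>> alpino.tagged_sents()[0][:5]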
The remaining word nodes may or may not appear in the proper order. Each word node has attributes, among which: - begin : the position of the word in the sentence - pos : Part of Speech: the Tag - word : the actual word The return value is a string with all xml elements replaced by clauses: either a cat clause with nested clauses, or a word clause. The order of the bracket clauses closely follows the xml. If ordered == True, the word clauses include an order sequence number. If ordered == False, the word clauses only have pos and word parts. """ if t[:10] != "<alpino_ds": return "" # convert XML to sexpr notation t = re.sub(r'  <node .*? cat="(\w+)".*>', r"(\1", t) if ordered: t = re.sub( r'  <node .*?begin="(\d+)".*? pos="(\w+)".*? word="([^"]+)".*?/>', r"(\1 \2 \3)", t, ) else: t = re.sub(r'  <node .*?pos="(\w+)".*? word="([^"]+)".*?/>', r"(\1 \2)", t) t = re.sub(r"  </node>", r")", t) t = re.sub(r"<sentence>.*</sentence>", r"", t) t = re.sub(r"</?alpino_ds.*>", r"", t) return t def _tag(self, t, tagset=None): tagged_sent = [ (int(o), w, p) for (o, p, w) in SORTTAGWRD.findall(self._normalize(t, ordered=True)) ] tagged_sent.sort() if tagset and tagset != self._tagset: tagged_sent = [ (w, map_tag(self._tagset, tagset, p)) for (o, w, p) in tagged_sent ] else: tagged_sent = [(w, p) for (o, w, p) in tagged_sent] return tagged_sent def _word(self, t): """Return a correctly ordered list of words""" tagged_sent = self._tag(t) return [w for (w, p) in tagged_sent] nltk-3.7/nltk/corpus/reader/categorized_sents.py000066400000000000000000000135431420073152400221520ustar00rootroot00000000000000# Natural Language Toolkit: Categorized Sentences Corpus Reader # # Copyright (C) 2001-2022 NLTK Project # Author: Pierpaolo Pantone <24alsecondo@gmail.com> # URL: <https://www.nltk.org/> # For license information, see LICENSE.TXT """ CorpusReader structured for corpora that contain one instance on each row. This CorpusReader is specifically used for the Subjectivity Dataset and the Sentence Polarity Dataset. - Subjectivity Dataset information - Authors: Bo Pang and Lillian Lee. Url: https://www.cs.cornell.edu/people/pabo/movie-review-data Distributed with permission. Related papers: - Bo Pang and Lillian Lee. "A Sentimental Education: Sentiment Analysis Using Subjectivity Summarization Based on Minimum Cuts". Proceedings of the ACL, 2004. - Sentence Polarity Dataset information - Authors: Bo Pang and Lillian Lee. Url: https://www.cs.cornell.edu/people/pabo/movie-review-data Related papers: - Bo Pang and Lillian Lee. "Seeing stars: Exploiting class relationships for sentiment categorization with respect to rating scales". Proceedings of the ACL, 2005. """ from nltk.corpus.reader.api import * from nltk.tokenize import * class CategorizedSentencesCorpusReader(CategorizedCorpusReader, CorpusReader): """ A reader for corpora in which each row represents a single instance, typically a sentence. Instances are divided into categories based on their file identifiers (see CategorizedCorpusReader). Since many corpora allow rows that contain more than one sentence, it is possible to specify a sentence tokenizer to retrieve all sentences instead of all rows. Examples using the Subjectivity Dataset: >>> from nltk.corpus import subjectivity >>> subjectivity.sents()[23] ['television', 'made', 'him', 'famous', ',', 'but', 'his', 'biggest', 'hits', 'happened', 'off', 'screen', '.'] >>> subjectivity.categories() ['obj', 'subj'] >>> subjectivity.words(categories='subj') ['smart', 'and', 'alert', ',', 'thirteen', ...]
Examples using the Sentence Polarity Dataset: >>> from nltk.corpus import sentence_polarity >>> sentence_polarity.sents() [['simplistic', ',', 'silly', 'and', 'tedious', '.'], ["it's", 'so', 'laddish', 'and', 'juvenile', ',', 'only', 'teenage', 'boys', 'could', 'possibly', 'find', 'it', 'funny', '.'], ...] >>> sentence_polarity.categories() ['neg', 'pos'] """ CorpusView = StreamBackedCorpusView def __init__( self, root, fileids, word_tokenizer=WhitespaceTokenizer(), sent_tokenizer=None, encoding="utf8", **kwargs ): """ :param root: The root directory for the corpus. :param fileids: a list or regexp specifying the fileids in the corpus. :param word_tokenizer: a tokenizer for breaking sentences or paragraphs into words. Default: `WhitespaceTokenizer` :param sent_tokenizer: a tokenizer for breaking paragraphs into sentences. :param encoding: the encoding that should be used to read the corpus. :param kwargs: additional parameters passed to CategorizedCorpusReader. """ CorpusReader.__init__(self, root, fileids, encoding) CategorizedCorpusReader.__init__(self, kwargs) self._word_tokenizer = word_tokenizer self._sent_tokenizer = sent_tokenizer def sents(self, fileids=None, categories=None): """ Return all sentences in the corpus or in the specified file(s). :param fileids: a list or regexp specifying the ids of the files whose sentences have to be returned. :param categories: a list specifying the categories whose sentences have to be returned. :return: the given file(s) as a list of sentences. Each sentence is tokenized using the specified word_tokenizer. :rtype: list(list(str)) """ fileids = self._resolve(fileids, categories) if fileids is None: fileids = self._fileids elif isinstance(fileids, str): fileids = [fileids] return concat( [ self.CorpusView(path, self._read_sent_block, encoding=enc) for (path, enc, fileid) in self.abspaths(fileids, True, True) ] ) def words(self, fileids=None, categories=None): """ Return all words and punctuation symbols in the corpus or in the specified file(s). :param fileids: a list or regexp specifying the ids of the files whose words have to be returned. :param categories: a list specifying the categories whose words have to be returned. :return: the given file(s) as a list of words and punctuation symbols. :rtype: list(str) """ fileids = self._resolve(fileids, categories) if fileids is None: fileids = self._fileids elif isinstance(fileids, str): fileids = [fileids] return concat( [ self.CorpusView(path, self._read_word_block, encoding=enc) for (path, enc, fileid) in self.abspaths(fileids, True, True) ] ) def _read_sent_block(self, stream): sents = [] for i in range(20): # Read 20 lines at a time. 
line = stream.readline() if not line: continue if self._sent_tokenizer: sents.extend( [ self._word_tokenizer.tokenize(sent) for sent in self._sent_tokenizer.tokenize(line) ] ) else: sents.append(self._word_tokenizer.tokenize(line)) return sents def _read_word_block(self, stream): words = [] for sent in self._read_sent_block(stream): words.extend(sent) return words nltk-3.7/nltk/corpus/reader/chasen.py000066400000000000000000000106751420073152400177020ustar00rootroot00000000000000# # Copyright (C) 2001-2022 NLTK Project # Author: Masato Hagiwara # URL: # For license information, see LICENSE.TXT import sys from nltk.corpus.reader import util from nltk.corpus.reader.api import * from nltk.corpus.reader.util import * class ChasenCorpusReader(CorpusReader): def __init__(self, root, fileids, encoding="utf8", sent_splitter=None): self._sent_splitter = sent_splitter CorpusReader.__init__(self, root, fileids, encoding) def words(self, fileids=None): return concat( [ ChasenCorpusView(fileid, enc, False, False, False, self._sent_splitter) for (fileid, enc) in self.abspaths(fileids, True) ] ) def tagged_words(self, fileids=None): return concat( [ ChasenCorpusView(fileid, enc, True, False, False, self._sent_splitter) for (fileid, enc) in self.abspaths(fileids, True) ] ) def sents(self, fileids=None): return concat( [ ChasenCorpusView(fileid, enc, False, True, False, self._sent_splitter) for (fileid, enc) in self.abspaths(fileids, True) ] ) def tagged_sents(self, fileids=None): return concat( [ ChasenCorpusView(fileid, enc, True, True, False, self._sent_splitter) for (fileid, enc) in self.abspaths(fileids, True) ] ) def paras(self, fileids=None): return concat( [ ChasenCorpusView(fileid, enc, False, True, True, self._sent_splitter) for (fileid, enc) in self.abspaths(fileids, True) ] ) def tagged_paras(self, fileids=None): return concat( [ ChasenCorpusView(fileid, enc, True, True, True, self._sent_splitter) for (fileid, enc) in self.abspaths(fileids, True) ] ) class ChasenCorpusView(StreamBackedCorpusView): """ A specialized corpus view for ChasenReader. Similar to ``TaggedCorpusView``, but this'll use fixed sets of word and sentence tokenizer. 
""" def __init__( self, corpus_file, encoding, tagged, group_by_sent, group_by_para, sent_splitter=None, ): self._tagged = tagged self._group_by_sent = group_by_sent self._group_by_para = group_by_para self._sent_splitter = sent_splitter StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding) def read_block(self, stream): """Reads one paragraph at a time.""" block = [] for para_str in read_regexp_block(stream, r".", r"^EOS\n"): para = [] sent = [] for line in para_str.splitlines(): _eos = line.strip() == "EOS" _cells = line.split("\t") w = (_cells[0], "\t".join(_cells[1:])) if not _eos: sent.append(w) if _eos or (self._sent_splitter and self._sent_splitter(w)): if not self._tagged: sent = [w for (w, t) in sent] if self._group_by_sent: para.append(sent) else: para.extend(sent) sent = [] if len(sent) > 0: if not self._tagged: sent = [w for (w, t) in sent] if self._group_by_sent: para.append(sent) else: para.extend(sent) if self._group_by_para: block.append(para) else: block.extend(para) return block def demo(): import nltk from nltk.corpus.util import LazyCorpusLoader jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*chasen", encoding="utf-8") print("/".join(jeita.words()[22100:22140])) print( "\nEOS\n".join( "\n".join("{}/{}".format(w[0], w[1].split("\t")[2]) for w in sent) for sent in jeita.tagged_sents()[2170:2173] ) ) def test(): from nltk.corpus.util import LazyCorpusLoader jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*chasen", encoding="utf-8") assert isinstance(jeita.tagged_words()[0][1], str) if __name__ == "__main__": demo() test() nltk-3.7/nltk/corpus/reader/childes.py000066400000000000000000000616041420073152400200520ustar00rootroot00000000000000# CHILDES XML Corpus Reader # Copyright (C) 2001-2022 NLTK Project # Author: Tomonori Nagano # Alexis Dimitriadis # URL: # For license information, see LICENSE.TXT """ Corpus reader for the XML version of the CHILDES corpus. """ __docformat__ = "epytext en" import re from collections import defaultdict from nltk.corpus.reader.util import concat from nltk.corpus.reader.xmldocs import ElementTree, XMLCorpusReader from nltk.util import LazyConcatenation, LazyMap, flatten # to resolve the namespace issue NS = "https://www.talkbank.org/ns/talkbank" class CHILDESCorpusReader(XMLCorpusReader): """ Corpus reader for the XML version of the CHILDES corpus. The CHILDES corpus is available at ``https://childes.talkbank.org/``. The XML version of CHILDES is located at ``https://childes.talkbank.org/data-xml/``. Copy the needed parts of the CHILDES XML corpus into the NLTK data directory (``nltk_data/corpora/CHILDES/``). For access to the file text use the usual nltk functions, ``words()``, ``sents()``, ``tagged_words()`` and ``tagged_sents()``. """ def __init__(self, root, fileids, lazy=True): XMLCorpusReader.__init__(self, root, fileids) self._lazy = lazy def words( self, fileids=None, speaker="ALL", stem=False, relation=False, strip_space=True, replace=False, ): """ :return: the given file(s) as a list of words :rtype: list(str) :param speaker: If specified, select specific speaker(s) defined in the corpus. Default is 'ALL' (all participants). Common choices are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude researchers) :param stem: If true, then use word stems instead of word strings. :param relation: If true, then return tuples of (stem, index, dependent_index) :param strip_space: If true, then strip trailing spaces from word tokens. Otherwise, leave the spaces on the tokens. 
:param replace: If true, then use the replaced (intended) word instead of the original word (e.g., 'wat' will be replaced with 'watch') """ sent = None pos = False if not self._lazy: return [ self._get_words( fileid, speaker, sent, stem, relation, pos, strip_space, replace ) for fileid in self.abspaths(fileids) ] get_words = lambda fileid: self._get_words( fileid, speaker, sent, stem, relation, pos, strip_space, replace ) return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids))) def tagged_words( self, fileids=None, speaker="ALL", stem=False, relation=False, strip_space=True, replace=False, ): """ :return: the given file(s) as a list of tagged words and punctuation symbols, encoded as tuples ``(word,tag)``. :rtype: list(tuple(str,str)) :param speaker: If specified, select specific speaker(s) defined in the corpus. Default is 'ALL' (all participants). Common choices are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude researchers) :param stem: If true, then use word stems instead of word strings. :param relation: If true, then return tuples of (stem, index, dependent_index) :param strip_space: If true, then strip trailing spaces from word tokens. Otherwise, leave the spaces on the tokens. :param replace: If true, then use the replaced (intended) word instead of the original word (e.g., 'wat' will be replaced with 'watch') """ sent = None pos = True if not self._lazy: return [ self._get_words( fileid, speaker, sent, stem, relation, pos, strip_space, replace ) for fileid in self.abspaths(fileids) ] get_words = lambda fileid: self._get_words( fileid, speaker, sent, stem, relation, pos, strip_space, replace ) return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids))) def sents( self, fileids=None, speaker="ALL", stem=False, relation=None, strip_space=True, replace=False, ): """ :return: the given file(s) as a list of sentences or utterances, each encoded as a list of word strings. :rtype: list(list(str)) :param speaker: If specified, select specific speaker(s) defined in the corpus. Default is 'ALL' (all participants). Common choices are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude researchers) :param stem: If true, then use word stems instead of word strings. :param relation: If true, then return tuples of ``(str,pos,relation_list)``. If there is manually-annotated relation info, it will return tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)`` :param strip_space: If true, then strip trailing spaces from word tokens. Otherwise, leave the spaces on the tokens. :param replace: If true, then use the replaced (intended) word instead of the original word (e.g., 'wat' will be replaced with 'watch') """ sent = True pos = False if not self._lazy: return [ self._get_words( fileid, speaker, sent, stem, relation, pos, strip_space, replace ) for fileid in self.abspaths(fileids) ] get_words = lambda fileid: self._get_words( fileid, speaker, sent, stem, relation, pos, strip_space, replace ) return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids))) def tagged_sents( self, fileids=None, speaker="ALL", stem=False, relation=None, strip_space=True, replace=False, ): """ :return: the given file(s) as a list of sentences, each encoded as a list of ``(word,tag)`` tuples. :rtype: list(list(tuple(str,str))) :param speaker: If specified, select specific speaker(s) defined in the corpus. Default is 'ALL' (all participants). 
Common choices are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude researchers) :param stem: If true, then use word stems instead of word strings. :param relation: If true, then return tuples of ``(str,pos,relation_list)``. If there is manually-annotated relation info, it will return tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)`` :param strip_space: If true, then strip trailing spaces from word tokens. Otherwise, leave the spaces on the tokens. :param replace: If true, then use the replaced (intended) word instead of the original word (e.g., 'wat' will be replaced with 'watch') """ sent = True pos = True if not self._lazy: return [ self._get_words( fileid, speaker, sent, stem, relation, pos, strip_space, replace ) for fileid in self.abspaths(fileids) ] get_words = lambda fileid: self._get_words( fileid, speaker, sent, stem, relation, pos, strip_space, replace ) return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids))) def corpus(self, fileids=None): """ :return: the given file(s) as a dict of ``(corpus_property_key, value)`` :rtype: list(dict) """ if not self._lazy: return [self._get_corpus(fileid) for fileid in self.abspaths(fileids)] return LazyMap(self._get_corpus, self.abspaths(fileids)) def _get_corpus(self, fileid): results = dict() xmldoc = ElementTree.parse(fileid).getroot() for key, value in xmldoc.items(): results[key] = value return results def participants(self, fileids=None): """ :return: the given file(s) as a dict of ``(participant_property_key, value)`` :rtype: list(dict) """ if not self._lazy: return [self._get_participants(fileid) for fileid in self.abspaths(fileids)] return LazyMap(self._get_participants, self.abspaths(fileids)) def _get_participants(self, fileid): # multidimensional dicts def dictOfDicts(): return defaultdict(dictOfDicts) xmldoc = ElementTree.parse(fileid).getroot() # getting participants' data pat = dictOfDicts() for participant in xmldoc.findall( f".//{{{NS}}}Participants/{{{NS}}}participant" ): for (key, value) in participant.items(): pat[participant.get("id")][key] = value return pat def age(self, fileids=None, speaker="CHI", month=False): """ :return: the given file(s) as string or int :rtype: list or int :param month: If true, return months instead of year-month-date """ if not self._lazy: return [ self._get_age(fileid, speaker, month) for fileid in self.abspaths(fileids) ] get_age = lambda fileid: self._get_age(fileid, speaker, month) return LazyMap(get_age, self.abspaths(fileids)) def _get_age(self, fileid, speaker, month): xmldoc = ElementTree.parse(fileid).getroot() for pat in xmldoc.findall(f".//{{{NS}}}Participants/{{{NS}}}participant"): try: if pat.get("id") == speaker: age = pat.get("age") if month: age = self.convert_age(age) return age # some files don't have age data except (TypeError, AttributeError) as e: return None def convert_age(self, age_year): "Caclculate age in months from a string in CHILDES format" m = re.match(r"P(\d+)Y(\d+)M?(\d?\d?)D?", age_year) age_month = int(m.group(1)) * 12 + int(m.group(2)) try: if int(m.group(3)) > 15: age_month += 1 # some corpora don't have age information? 
except ValueError as e: pass return age_month def MLU(self, fileids=None, speaker="CHI"): """ :return: the given file(s) as a floating number :rtype: list(float) """ if not self._lazy: return [ self._getMLU(fileid, speaker=speaker) for fileid in self.abspaths(fileids) ] get_MLU = lambda fileid: self._getMLU(fileid, speaker=speaker) return LazyMap(get_MLU, self.abspaths(fileids)) def _getMLU(self, fileid, speaker): sents = self._get_words( fileid, speaker=speaker, sent=True, stem=True, relation=False, pos=True, strip_space=True, replace=True, ) results = [] lastSent = [] numFillers = 0 sentDiscount = 0 for sent in sents: posList = [pos for (word, pos) in sent] # if any part of the sentence is intelligible if any(pos == "unk" for pos in posList): continue # if the sentence is null elif sent == []: continue # if the sentence is the same as the last sent elif sent == lastSent: continue else: results.append([word for (word, pos) in sent]) # count number of fillers if len({"co", None}.intersection(posList)) > 0: numFillers += posList.count("co") numFillers += posList.count(None) sentDiscount += 1 lastSent = sent try: thisWordList = flatten(results) # count number of morphemes # (e.g., 'read' = 1 morpheme but 'read-PAST' is 2 morphemes) numWords = ( len(flatten([word.split("-") for word in thisWordList])) - numFillers ) numSents = len(results) - sentDiscount mlu = numWords / numSents except ZeroDivisionError: mlu = 0 # return {'mlu':mlu,'wordNum':numWords,'sentNum':numSents} return mlu def _get_words( self, fileid, speaker, sent, stem, relation, pos, strip_space, replace ): if ( isinstance(speaker, str) and speaker != "ALL" ): # ensure we have a list of speakers speaker = [speaker] xmldoc = ElementTree.parse(fileid).getroot() # processing each xml doc results = [] for xmlsent in xmldoc.findall(".//{%s}u" % NS): sents = [] # select speakers if speaker == "ALL" or xmlsent.get("who") in speaker: for xmlword in xmlsent.findall(".//{%s}w" % NS): infl = None suffixStem = None suffixTag = None # getting replaced words if replace and xmlsent.find(f".//{{{NS}}}w/{{{NS}}}replacement"): xmlword = xmlsent.find( f".//{{{NS}}}w/{{{NS}}}replacement/{{{NS}}}w" ) elif replace and xmlsent.find(f".//{{{NS}}}w/{{{NS}}}wk"): xmlword = xmlsent.find(f".//{{{NS}}}w/{{{NS}}}wk") # get text if xmlword.text: word = xmlword.text else: word = "" # strip tailing space if strip_space: word = word.strip() # stem if relation or stem: try: xmlstem = xmlword.find(".//{%s}stem" % NS) word = xmlstem.text except AttributeError as e: pass # if there is an inflection try: xmlinfl = xmlword.find( f".//{{{NS}}}mor/{{{NS}}}mw/{{{NS}}}mk" ) word += "-" + xmlinfl.text except: pass # if there is a suffix try: xmlsuffix = xmlword.find( ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}stem" % (NS, NS, NS, NS) ) suffixStem = xmlsuffix.text except AttributeError: suffixStem = "" if suffixStem: word += "~" + suffixStem # pos if relation or pos: try: xmlpos = xmlword.findall(".//{%s}c" % NS) xmlpos2 = xmlword.findall(".//{%s}s" % NS) if xmlpos2 != []: tag = xmlpos[0].text + ":" + xmlpos2[0].text else: tag = xmlpos[0].text except (AttributeError, IndexError) as e: tag = "" try: xmlsuffixpos = xmlword.findall( ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}c" % (NS, NS, NS, NS, NS) ) xmlsuffixpos2 = xmlword.findall( ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}s" % (NS, NS, NS, NS, NS) ) if xmlsuffixpos2: suffixTag = ( xmlsuffixpos[0].text + ":" + xmlsuffixpos2[0].text ) else: suffixTag = xmlsuffixpos[0].text except: pass if suffixTag: tag += "~" + suffixTag word = 
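# A minimal usage sketch following the demo() at the end of this module. The
# CHILDES XML data must be downloaded manually into nltk_data/corpora/childes/
# as noted in the class docstring; the Eng-USA path is an assumption about which
# portion was installed.
#
#   >>> from nltk.corpus.reader import CHILDESCorpusReader
#   >>> from nltk.data import find
#   >>> root = find('corpora/childes/data-xml/Eng-USA/')
#   >>> childes = CHILDESCorpusReader(root, r'.*\.xml')
#   >>> fileid = childes.fileids()[0]
#   >>> childes.words(fileid, speaker='CHI', stem=True)[:7]
#   >>> childes.age(fileid, month=True), childes.MLU(fileid)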
(word, tag) # relational # the gold standard is stored in # if relation == True: for xmlstem_rel in xmlword.findall( f".//{{{NS}}}mor/{{{NS}}}gra" ): if not xmlstem_rel.get("type") == "grt": word = ( word[0], word[1], xmlstem_rel.get("index") + "|" + xmlstem_rel.get("head") + "|" + xmlstem_rel.get("relation"), ) else: word = ( word[0], word[1], word[2], word[0], word[1], xmlstem_rel.get("index") + "|" + xmlstem_rel.get("head") + "|" + xmlstem_rel.get("relation"), ) try: for xmlpost_rel in xmlword.findall( f".//{{{NS}}}mor/{{{NS}}}mor-post/{{{NS}}}gra" ): if not xmlpost_rel.get("type") == "grt": suffixStem = ( suffixStem[0], suffixStem[1], xmlpost_rel.get("index") + "|" + xmlpost_rel.get("head") + "|" + xmlpost_rel.get("relation"), ) else: suffixStem = ( suffixStem[0], suffixStem[1], suffixStem[2], suffixStem[0], suffixStem[1], xmlpost_rel.get("index") + "|" + xmlpost_rel.get("head") + "|" + xmlpost_rel.get("relation"), ) except: pass sents.append(word) if sent or relation: results.append(sents) else: results.extend(sents) return LazyMap(lambda x: x, results) # Ready-to-use browser opener """ The base URL for viewing files on the childes website. This shouldn't need to be changed, unless CHILDES changes the configuration of their server or unless the user sets up their own corpus webserver. """ childes_url_base = r"https://childes.talkbank.org/browser/index.php?url=" def webview_file(self, fileid, urlbase=None): """Map a corpus file to its web version on the CHILDES website, and open it in a web browser. The complete URL to be used is: childes.childes_url_base + urlbase + fileid.replace('.xml', '.cha') If no urlbase is passed, we try to calculate it. This requires that the childes corpus was set up to mirror the folder hierarchy under childes.psy.cmu.edu/data-xml/, e.g.: nltk_data/corpora/childes/Eng-USA/Cornell/??? or nltk_data/corpora/childes/Romance/Spanish/Aguirre/??? The function first looks (as a special case) if "Eng-USA" is on the path consisting of +fileid; then if "childes", possibly followed by "data-xml", appears. If neither one is found, we use the unmodified fileid and hope for the best. If this is not right, specify urlbase explicitly, e.g., if the corpus root points to the Cornell folder, urlbase='Eng-USA/Cornell'. """ import webbrowser if urlbase: path = urlbase + "/" + fileid else: full = self.root + "/" + fileid full = re.sub(r"\\", "/", full) if "/childes/" in full.lower(): # Discard /data-xml/ if present path = re.findall(r"(?i)/childes(?:/data-xml)?/(.*)\.xml", full)[0] elif "eng-usa" in full.lower(): path = "Eng-USA/" + re.findall(r"/(?i)Eng-USA/(.*)\.xml", full)[0] else: path = fileid # Strip ".xml" and add ".cha", as necessary: if path.endswith(".xml"): path = path[:-4] if not path.endswith(".cha"): path = path + ".cha" url = self.childes_url_base + path webbrowser.open_new_tab(url) print("Opening in browser:", url) # Pausing is a good idea, but it's up to the user... 
# raw_input("Hit Return to continue") def demo(corpus_root=None): """ The CHILDES corpus should be manually downloaded and saved to ``[NLTK_Data_Dir]/corpora/childes/`` """ if not corpus_root: from nltk.data import find corpus_root = find("corpora/childes/data-xml/Eng-USA/") try: childes = CHILDESCorpusReader(corpus_root, ".*.xml") # describe all corpus for file in childes.fileids()[:5]: corpus = "" corpus_id = "" for (key, value) in childes.corpus(file)[0].items(): if key == "Corpus": corpus = value if key == "Id": corpus_id = value print("Reading", corpus, corpus_id, " .....") print("words:", childes.words(file)[:7], "...") print( "words with replaced words:", childes.words(file, replace=True)[:7], " ...", ) print("words with pos tags:", childes.tagged_words(file)[:7], " ...") print("words (only MOT):", childes.words(file, speaker="MOT")[:7], "...") print("words (only CHI):", childes.words(file, speaker="CHI")[:7], "...") print("stemmed words:", childes.words(file, stem=True)[:7], " ...") print( "words with relations and pos-tag:", childes.words(file, relation=True)[:5], " ...", ) print("sentence:", childes.sents(file)[:2], " ...") for (participant, values) in childes.participants(file)[0].items(): for (key, value) in values.items(): print("\tparticipant", participant, key, ":", value) print("num of sent:", len(childes.sents(file))) print("num of morphemes:", len(childes.words(file, stem=True))) print("age:", childes.age(file)) print("age in month:", childes.age(file, month=True)) print("MLU:", childes.MLU(file)) print() except LookupError as e: print( """The CHILDES corpus, or the parts you need, should be manually downloaded from https://childes.talkbank.org/data-xml/ and saved at [NLTK_Data_Dir]/corpora/childes/ Alternately, you can call the demo with the path to a portion of the CHILDES corpus, e.g.: demo('/path/to/childes/data-xml/Eng-USA/") """ ) # corpus_root_http = urllib2.urlopen('https://childes.talkbank.org/data-xml/Eng-USA/Bates.zip') # corpus_root_http_bates = zipfile.ZipFile(cStringIO.StringIO(corpus_root_http.read())) ##this fails # childes = CHILDESCorpusReader(corpus_root_http_bates,corpus_root_http_bates.namelist()) if __name__ == "__main__": demo() nltk-3.7/nltk/corpus/reader/chunked.py000066400000000000000000000216051420073152400200550ustar00rootroot00000000000000# Natural Language Toolkit: Chunked Corpus Reader # # Copyright (C) 2001-2022 NLTK Project # Author: Steven Bird # Edward Loper # URL: # For license information, see LICENSE.TXT """ A reader for corpora that contain chunked (and optionally tagged) documents. """ import codecs import os.path import nltk from nltk.chunk import tagstr2tree from nltk.corpus.reader.api import * from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader from nltk.corpus.reader.util import * from nltk.tokenize import * from nltk.tree import Tree class ChunkedCorpusReader(CorpusReader): """ Reader for chunked (and optionally tagged) corpora. Paragraphs are split using a block reader. They are then tokenized into sentences using a sentence tokenizer. Finally, these sentences are parsed into chunk trees using a string-to-chunktree conversion function. Each of these steps can be performed using a default function or a custom function. By default, paragraphs are split on blank lines; sentences are listed one per line; and sentences are parsed into chunk trees using ``nltk.chunk.tagstr2tree``. 
""" def __init__( self, root, fileids, extension="", str2chunktree=tagstr2tree, sent_tokenizer=RegexpTokenizer("\n", gaps=True), para_block_reader=read_blankline_block, encoding="utf8", tagset=None, ): """ :param root: The root directory for this corpus. :param fileids: A list or regexp specifying the fileids in this corpus. """ CorpusReader.__init__(self, root, fileids, encoding) self._cv_args = (str2chunktree, sent_tokenizer, para_block_reader, tagset) """Arguments for corpus views generated by this corpus: a tuple (str2chunktree, sent_tokenizer, para_block_tokenizer)""" def words(self, fileids=None): """ :return: the given file(s) as a list of words and punctuation symbols. :rtype: list(str) """ return concat( [ ChunkedCorpusView(f, enc, 0, 0, 0, 0, *self._cv_args) for (f, enc) in self.abspaths(fileids, True) ] ) def sents(self, fileids=None): """ :return: the given file(s) as a list of sentences or utterances, each encoded as a list of word strings. :rtype: list(list(str)) """ return concat( [ ChunkedCorpusView(f, enc, 0, 1, 0, 0, *self._cv_args) for (f, enc) in self.abspaths(fileids, True) ] ) def paras(self, fileids=None): """ :return: the given file(s) as a list of paragraphs, each encoded as a list of sentences, which are in turn encoded as lists of word strings. :rtype: list(list(list(str))) """ return concat( [ ChunkedCorpusView(f, enc, 0, 1, 1, 0, *self._cv_args) for (f, enc) in self.abspaths(fileids, True) ] ) def tagged_words(self, fileids=None, tagset=None): """ :return: the given file(s) as a list of tagged words and punctuation symbols, encoded as tuples ``(word,tag)``. :rtype: list(tuple(str,str)) """ return concat( [ ChunkedCorpusView( f, enc, 1, 0, 0, 0, *self._cv_args, target_tagset=tagset ) for (f, enc) in self.abspaths(fileids, True) ] ) def tagged_sents(self, fileids=None, tagset=None): """ :return: the given file(s) as a list of sentences, each encoded as a list of ``(word,tag)`` tuples. :rtype: list(list(tuple(str,str))) """ return concat( [ ChunkedCorpusView( f, enc, 1, 1, 0, 0, *self._cv_args, target_tagset=tagset ) for (f, enc) in self.abspaths(fileids, True) ] ) def tagged_paras(self, fileids=None, tagset=None): """ :return: the given file(s) as a list of paragraphs, each encoded as a list of sentences, which are in turn encoded as lists of ``(word,tag)`` tuples. :rtype: list(list(list(tuple(str,str)))) """ return concat( [ ChunkedCorpusView( f, enc, 1, 1, 1, 0, *self._cv_args, target_tagset=tagset ) for (f, enc) in self.abspaths(fileids, True) ] ) def chunked_words(self, fileids=None, tagset=None): """ :return: the given file(s) as a list of tagged words and chunks. Words are encoded as ``(word, tag)`` tuples (if the corpus has tags) or word strings (if the corpus has no tags). Chunks are encoded as depth-one trees over ``(word,tag)`` tuples or word strings. :rtype: list(tuple(str,str) and Tree) """ return concat( [ ChunkedCorpusView( f, enc, 1, 0, 0, 1, *self._cv_args, target_tagset=tagset ) for (f, enc) in self.abspaths(fileids, True) ] ) def chunked_sents(self, fileids=None, tagset=None): """ :return: the given file(s) as a list of sentences, each encoded as a shallow Tree. The leaves of these trees are encoded as ``(word, tag)`` tuples (if the corpus has tags) or word strings (if the corpus has no tags). 
:rtype: list(Tree) """ return concat( [ ChunkedCorpusView( f, enc, 1, 1, 0, 1, *self._cv_args, target_tagset=tagset ) for (f, enc) in self.abspaths(fileids, True) ] ) def chunked_paras(self, fileids=None, tagset=None): """ :return: the given file(s) as a list of paragraphs, each encoded as a list of sentences, which are in turn encoded as a shallow Tree. The leaves of these trees are encoded as ``(word, tag)`` tuples (if the corpus has tags) or word strings (if the corpus has no tags). :rtype: list(list(Tree)) """ return concat( [ ChunkedCorpusView( f, enc, 1, 1, 1, 1, *self._cv_args, target_tagset=tagset ) for (f, enc) in self.abspaths(fileids, True) ] ) def _read_block(self, stream): return [tagstr2tree(t) for t in read_blankline_block(stream)] class ChunkedCorpusView(StreamBackedCorpusView): def __init__( self, fileid, encoding, tagged, group_by_sent, group_by_para, chunked, str2chunktree, sent_tokenizer, para_block_reader, source_tagset=None, target_tagset=None, ): StreamBackedCorpusView.__init__(self, fileid, encoding=encoding) self._tagged = tagged self._group_by_sent = group_by_sent self._group_by_para = group_by_para self._chunked = chunked self._str2chunktree = str2chunktree self._sent_tokenizer = sent_tokenizer self._para_block_reader = para_block_reader self._source_tagset = source_tagset self._target_tagset = target_tagset def read_block(self, stream): block = [] for para_str in self._para_block_reader(stream): para = [] for sent_str in self._sent_tokenizer.tokenize(para_str): sent = self._str2chunktree( sent_str, source_tagset=self._source_tagset, target_tagset=self._target_tagset, ) # If requested, throw away the tags. if not self._tagged: sent = self._untag(sent) # If requested, throw away the chunks. if not self._chunked: sent = sent.leaves() # Add the sentence to `para`. if self._group_by_sent: para.append(sent) else: para.extend(sent) # Add the paragraph to `block`. if self._group_by_para: block.append(para) else: block.extend(para) # Return the block return block def _untag(self, tree): for i, child in enumerate(tree): if isinstance(child, Tree): self._untag(child) elif isinstance(child, tuple): tree[i] = child[0] else: raise ValueError("expected child to be Tree or tuple") return tree nltk-3.7/nltk/corpus/reader/cmudict.py000066400000000000000000000063161420073152400200660ustar00rootroot00000000000000# Natural Language Toolkit: Carnegie Mellon Pronouncing Dictionary Corpus Reader # # Copyright (C) 2001-2022 NLTK Project # Author: Steven Bird # URL: # For license information, see LICENSE.TXT """ The Carnegie Mellon Pronouncing Dictionary [cmudict.0.6] ftp://ftp.cs.cmu.edu/project/speech/dict/ Copyright 1998 Carnegie Mellon University File Format: Each line consists of an uppercased word, a counter (for alternative pronunciations), and a transcription. Vowels are marked for stress (1=primary, 2=secondary, 0=no stress). E.g.: NATURAL 1 N AE1 CH ER0 AH0 L The dictionary contains 127069 entries. Of these, 119400 words are assigned a unique pronunciation, 6830 words have two pronunciations, and 839 words have three or more pronunciations. Many of these are fast-speech variants. 
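A brief access sketch (requires the ``cmudict`` data package, e.g. installed via
``nltk.download('cmudict')``; the pronunciation shown is the example entry quoted
above):

    >>> from nltk.corpus import cmudict
    >>> cmudict.dict()['natural'][0]    # doctest: +SKIP
    ['N', 'AE1', 'CH', 'ER0', 'AH0', 'L']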
Phonemes: There are 39 phonemes, as shown below: Phoneme Example Translation Phoneme Example Translation ------- ------- ----------- ------- ------- ----------- AA odd AA D AE at AE T AH hut HH AH T AO ought AO T AW cow K AW AY hide HH AY D B be B IY CH cheese CH IY Z D dee D IY DH thee DH IY EH Ed EH D ER hurt HH ER T EY ate EY T F fee F IY G green G R IY N HH he HH IY IH it IH T IY eat IY T JH gee JH IY K key K IY L lee L IY M me M IY N knee N IY NG ping P IH NG OW oat OW T OY toy T OY P pee P IY R read R IY D S sea S IY SH she SH IY T tea T IY TH theta TH EY T AH UH hood HH UH D UW two T UW V vee V IY W we W IY Y yield Y IY L D Z zee Z IY ZH seizure S IY ZH ER """ from nltk.corpus.reader.api import * from nltk.corpus.reader.util import * from nltk.util import Index class CMUDictCorpusReader(CorpusReader): def entries(self): """ :return: the cmudict lexicon as a list of entries containing (word, transcriptions) tuples. """ return concat( [ StreamBackedCorpusView(fileid, read_cmudict_block, encoding=enc) for fileid, enc in self.abspaths(None, True) ] ) def words(self): """ :return: a list of all words defined in the cmudict lexicon. """ return [word.lower() for (word, _) in self.entries()] def dict(self): """ :return: the cmudict lexicon as a dictionary, whose keys are lowercase words and whose values are lists of pronunciations. """ return dict(Index(self.entries())) def read_cmudict_block(stream): entries = [] while len(entries) < 100: # Read 100 at a time. line = stream.readline() if line == "": return entries # end of file. pieces = line.split() entries.append((pieces[0].lower(), pieces[2:])) return entries nltk-3.7/nltk/corpus/reader/comparative_sents.py000066400000000000000000000267171420073152400221730ustar00rootroot00000000000000# Natural Language Toolkit: Comparative Sentence Corpus Reader # # Copyright (C) 2001-2022 NLTK Project # Author: Pierpaolo Pantone <24alsecondo@gmail.com> # URL: # For license information, see LICENSE.TXT """ CorpusReader for the Comparative Sentence Dataset. - Comparative Sentence Dataset information - Annotated by: Nitin Jindal and Bing Liu, 2006. Department of Computer Sicence University of Illinois at Chicago Contact: Nitin Jindal, njindal@cs.uic.edu Bing Liu, liub@cs.uic.edu (https://www.cs.uic.edu/~liub) Distributed with permission. Related papers: - Nitin Jindal and Bing Liu. "Identifying Comparative Sentences in Text Documents". Proceedings of the ACM SIGIR International Conference on Information Retrieval (SIGIR-06), 2006. - Nitin Jindal and Bing Liu. "Mining Comprative Sentences and Relations". Proceedings of Twenty First National Conference on Artificial Intelligence (AAAI-2006), 2006. - Murthy Ganapathibhotla and Bing Liu. "Mining Opinions in Comparative Sentences". Proceedings of the 22nd International Conference on Computational Linguistics (Coling-2008), Manchester, 18-22 August, 2008. """ import re from nltk.corpus.reader.api import * from nltk.tokenize import * # Regular expressions for dataset components STARS = re.compile(r"^\*+$") COMPARISON = re.compile(r"") CLOSE_COMPARISON = re.compile(r"") GRAD_COMPARISON = re.compile(r"") NON_GRAD_COMPARISON = re.compile(r"") ENTITIES_FEATS = re.compile(r"(\d)_((?:[\.\w\s/-](?!\d_))+)") KEYWORD = re.compile(r"\(([^\(]*)\)$") class Comparison: """ A Comparison represents a comparative sentence and its constituents. """ def __init__( self, text=None, comp_type=None, entity_1=None, entity_2=None, feature=None, keyword=None, ): """ :param text: a string (optionally tokenized) containing a comparison. 
:param comp_type: an integer defining the type of comparison expressed. Values can be: 1 (Non-equal gradable), 2 (Equative), 3 (Superlative), 4 (Non-gradable). :param entity_1: the first entity considered in the comparison relation. :param entity_2: the second entity considered in the comparison relation. :param feature: the feature considered in the comparison relation. :param keyword: the word or phrase which is used for that comparative relation. """ self.text = text self.comp_type = comp_type self.entity_1 = entity_1 self.entity_2 = entity_2 self.feature = feature self.keyword = keyword def __repr__(self): return ( 'Comparison(text="{}", comp_type={}, entity_1="{}", entity_2="{}", ' 'feature="{}", keyword="{}")' ).format( self.text, self.comp_type, self.entity_1, self.entity_2, self.feature, self.keyword, ) class ComparativeSentencesCorpusReader(CorpusReader): """ Reader for the Comparative Sentence Dataset by Jindal and Liu (2006). >>> from nltk.corpus import comparative_sentences >>> comparison = comparative_sentences.comparisons()[0] >>> comparison.text ['its', 'fast-forward', 'and', 'rewind', 'work', 'much', 'more', 'smoothly', 'and', 'consistently', 'than', 'those', 'of', 'other', 'models', 'i', "'ve", 'had', '.'] >>> comparison.entity_2 'models' >>> (comparison.feature, comparison.keyword) ('rewind', 'more') >>> len(comparative_sentences.comparisons()) 853 """ CorpusView = StreamBackedCorpusView def __init__( self, root, fileids, word_tokenizer=WhitespaceTokenizer(), sent_tokenizer=None, encoding="utf8", ): """ :param root: The root directory for this corpus. :param fileids: a list or regexp specifying the fileids in this corpus. :param word_tokenizer: tokenizer for breaking sentences or paragraphs into words. Default: `WhitespaceTokenizer` :param sent_tokenizer: tokenizer for breaking paragraphs into sentences. :param encoding: the encoding that should be used to read the corpus. """ CorpusReader.__init__(self, root, fileids, encoding) self._word_tokenizer = word_tokenizer self._sent_tokenizer = sent_tokenizer self._readme = "README.txt" def comparisons(self, fileids=None): """ Return all comparisons in the corpus. :param fileids: a list or regexp specifying the ids of the files whose comparisons have to be returned. :return: the given file(s) as a list of Comparison objects. :rtype: list(Comparison) """ if fileids is None: fileids = self._fileids elif isinstance(fileids, str): fileids = [fileids] return concat( [ self.CorpusView(path, self._read_comparison_block, encoding=enc) for (path, enc, fileid) in self.abspaths(fileids, True, True) ] ) def keywords(self, fileids=None): """ Return a set of all keywords used in the corpus. :param fileids: a list or regexp specifying the ids of the files whose keywords have to be returned. :return: the set of keywords and comparative phrases used in the corpus. :rtype: set(str) """ all_keywords = concat( [ self.CorpusView(path, self._read_keyword_block, encoding=enc) for (path, enc, fileid) in self.abspaths(fileids, True, True) ] ) keywords_set = {keyword.lower() for keyword in all_keywords if keyword} return keywords_set def keywords_readme(self): """ Return the list of words and constituents considered as clues of a comparison (from listOfkeywords.txt). """ keywords = [] with self.open("listOfkeywords.txt") as fp: raw_text = fp.read() for line in raw_text.split("\n"): if not line or line.startswith("//"): continue keywords.append(line.strip()) return keywords def sents(self, fileids=None): """ Return all sentences in the corpus. 
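        A short usage sketch (requires the ``comparative_sentences`` data
        package; output omitted here since it depends on the installed corpus
        files):

            >>> from nltk.corpus import comparative_sentences
            >>> comparative_sentences.sents()[0]  # first tokenized sentence  # doctest: +SKIP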
:param fileids: a list or regexp specifying the ids of the files whose sentences have to be returned. :return: all sentences of the corpus as lists of tokens (or as plain strings, if no word tokenizer is specified). :rtype: list(list(str)) or list(str) """ return concat( [ self.CorpusView(path, self._read_sent_block, encoding=enc) for (path, enc, fileid) in self.abspaths(fileids, True, True) ] ) def words(self, fileids=None): """ Return all words and punctuation symbols in the corpus. :param fileids: a list or regexp specifying the ids of the files whose words have to be returned. :return: the given file(s) as a list of words and punctuation symbols. :rtype: list(str) """ return concat( [ self.CorpusView(path, self._read_word_block, encoding=enc) for (path, enc, fileid) in self.abspaths(fileids, True, True) ] ) def _read_comparison_block(self, stream): while True: line = stream.readline() if not line: return [] # end of file. comparison_tags = re.findall(COMPARISON, line) if comparison_tags: grad_comparisons = re.findall(GRAD_COMPARISON, line) non_grad_comparisons = re.findall(NON_GRAD_COMPARISON, line) # Advance to the next line (it contains the comparative sentence) comparison_text = stream.readline().strip() if self._word_tokenizer: comparison_text = self._word_tokenizer.tokenize(comparison_text) # Skip the next line (it contains closing comparison tags) stream.readline() # If gradable comparisons are found, create Comparison instances # and populate their fields comparison_bundle = [] if grad_comparisons: # Each comparison tag has its own relations on a separate line for comp in grad_comparisons: comp_type = int(re.match(r"", comp).group(1)) comparison = Comparison( text=comparison_text, comp_type=comp_type ) line = stream.readline() entities_feats = ENTITIES_FEATS.findall(line) if entities_feats: for (code, entity_feat) in entities_feats: if code == "1": comparison.entity_1 = entity_feat.strip() elif code == "2": comparison.entity_2 = entity_feat.strip() elif code == "3": comparison.feature = entity_feat.strip() keyword = KEYWORD.findall(line) if keyword: comparison.keyword = keyword[0] comparison_bundle.append(comparison) # If non-gradable comparisons are found, create a simple Comparison # instance for each one if non_grad_comparisons: for comp in non_grad_comparisons: # comp_type in this case should always be 4. 
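                    # Unlike the gradable case above, a non-gradable tag carries no
                    # entity/feature annotations on a following line, so nothing
                    # further is read from the stream here: only the sentence text
                    # and the comparison type (4 = non-gradable) are recorded.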
comp_type = int(re.match(r"", comp).group(1)) comparison = Comparison( text=comparison_text, comp_type=comp_type ) comparison_bundle.append(comparison) # Flatten the list of comparisons before returning them # return concat([comparison_bundle]) return comparison_bundle def _read_keyword_block(self, stream): keywords = [] for comparison in self._read_comparison_block(stream): keywords.append(comparison.keyword) return keywords def _read_sent_block(self, stream): while True: line = stream.readline() if re.match(STARS, line): while True: line = stream.readline() if re.match(STARS, line): break continue if ( not re.findall(COMPARISON, line) and not ENTITIES_FEATS.findall(line) and not re.findall(CLOSE_COMPARISON, line) ): if self._sent_tokenizer: return [ self._word_tokenizer.tokenize(sent) for sent in self._sent_tokenizer.tokenize(line) ] else: return [self._word_tokenizer.tokenize(line)] def _read_word_block(self, stream): words = [] for sent in self._read_sent_block(stream): words.extend(sent) return words nltk-3.7/nltk/corpus/reader/conll.py000066400000000000000000000523321420073152400175440ustar00rootroot00000000000000# Natural Language Toolkit: CONLL Corpus Reader # # Copyright (C) 2001-2022 NLTK Project # Author: Steven Bird # Edward Loper # URL: # For license information, see LICENSE.TXT """ Read CoNLL-style chunk fileids. """ import textwrap from nltk.corpus.reader.api import * from nltk.corpus.reader.util import * from nltk.tag import map_tag from nltk.tree import Tree from nltk.util import LazyConcatenation, LazyMap class ConllCorpusReader(CorpusReader): """ A corpus reader for CoNLL-style files. These files consist of a series of sentences, separated by blank lines. Each sentence is encoded using a table (or "grid") of values, where each line corresponds to a single word, and each column corresponds to an annotation type. The set of columns used by CoNLL-style files can vary from corpus to corpus; the ``ConllCorpusReader`` constructor therefore takes an argument, ``columntypes``, which is used to specify the columns that are used by a given corpus. By default columns are split by consecutive whitespaces, with the ``separator`` argument you can set a string to split by (e.g. ``\'\t\'``). @todo: Add support for reading from corpora where different parallel files contain different columns. @todo: Possibly add caching of the grid corpus view? This would allow the same grid view to be used by different data access methods (eg words() and parsed_sents() could both share the same grid corpus view object). @todo: Better support for -DOCSTART-. Currently, we just ignore it, but it could be used to define methods that retrieve a document at a time (eg parsed_documents()). """ # ///////////////////////////////////////////////////////////////// # Column Types # ///////////////////////////////////////////////////////////////// WORDS = "words" #: column type for words POS = "pos" #: column type for part-of-speech tags TREE = "tree" #: column type for parse trees CHUNK = "chunk" #: column type for chunk structures NE = "ne" #: column type for named entities SRL = "srl" #: column type for semantic role labels IGNORE = "ignore" #: column type for column that should be ignored #: A list of all column types supported by the conll corpus reader. 
COLUMN_TYPES = (WORDS, POS, TREE, CHUNK, NE, SRL, IGNORE) # ///////////////////////////////////////////////////////////////// # Constructor # ///////////////////////////////////////////////////////////////// def __init__( self, root, fileids, columntypes, chunk_types=None, root_label="S", pos_in_tree=False, srl_includes_roleset=True, encoding="utf8", tree_class=Tree, tagset=None, separator=None, ): for columntype in columntypes: if columntype not in self.COLUMN_TYPES: raise ValueError("Bad column type %r" % columntype) if isinstance(chunk_types, str): chunk_types = [chunk_types] self._chunk_types = chunk_types self._colmap = {c: i for (i, c) in enumerate(columntypes)} self._pos_in_tree = pos_in_tree self._root_label = root_label # for chunks self._srl_includes_roleset = srl_includes_roleset self._tree_class = tree_class CorpusReader.__init__(self, root, fileids, encoding) self._tagset = tagset self.sep = separator # ///////////////////////////////////////////////////////////////// # Data Access Methods # ///////////////////////////////////////////////////////////////// def words(self, fileids=None): self._require(self.WORDS) return LazyConcatenation(LazyMap(self._get_words, self._grids(fileids))) def sents(self, fileids=None): self._require(self.WORDS) return LazyMap(self._get_words, self._grids(fileids)) def tagged_words(self, fileids=None, tagset=None): self._require(self.WORDS, self.POS) def get_tagged_words(grid): return self._get_tagged_words(grid, tagset) return LazyConcatenation(LazyMap(get_tagged_words, self._grids(fileids))) def tagged_sents(self, fileids=None, tagset=None): self._require(self.WORDS, self.POS) def get_tagged_words(grid): return self._get_tagged_words(grid, tagset) return LazyMap(get_tagged_words, self._grids(fileids)) def chunked_words(self, fileids=None, chunk_types=None, tagset=None): self._require(self.WORDS, self.POS, self.CHUNK) if chunk_types is None: chunk_types = self._chunk_types def get_chunked_words(grid): # capture chunk_types as local var return self._get_chunked_words(grid, chunk_types, tagset) return LazyConcatenation(LazyMap(get_chunked_words, self._grids(fileids))) def chunked_sents(self, fileids=None, chunk_types=None, tagset=None): self._require(self.WORDS, self.POS, self.CHUNK) if chunk_types is None: chunk_types = self._chunk_types def get_chunked_words(grid): # capture chunk_types as local var return self._get_chunked_words(grid, chunk_types, tagset) return LazyMap(get_chunked_words, self._grids(fileids)) def parsed_sents(self, fileids=None, pos_in_tree=None, tagset=None): self._require(self.WORDS, self.POS, self.TREE) if pos_in_tree is None: pos_in_tree = self._pos_in_tree def get_parsed_sent(grid): # capture pos_in_tree as local var return self._get_parsed_sent(grid, pos_in_tree, tagset) return LazyMap(get_parsed_sent, self._grids(fileids)) def srl_spans(self, fileids=None): self._require(self.SRL) return LazyMap(self._get_srl_spans, self._grids(fileids)) def srl_instances(self, fileids=None, pos_in_tree=None, flatten=True): self._require(self.WORDS, self.POS, self.TREE, self.SRL) if pos_in_tree is None: pos_in_tree = self._pos_in_tree def get_srl_instances(grid): # capture pos_in_tree as local var return self._get_srl_instances(grid, pos_in_tree) result = LazyMap(get_srl_instances, self._grids(fileids)) if flatten: result = LazyConcatenation(result) return result def iob_words(self, fileids=None, tagset=None): """ :return: a list of word/tag/IOB tuples :rtype: list(tuple) :param fileids: the list of fileids that make up this corpus :type 
fileids: None or str or list """ self._require(self.WORDS, self.POS, self.CHUNK) def get_iob_words(grid): return self._get_iob_words(grid, tagset) return LazyConcatenation(LazyMap(get_iob_words, self._grids(fileids))) def iob_sents(self, fileids=None, tagset=None): """ :return: a list of lists of word/tag/IOB tuples :rtype: list(list) :param fileids: the list of fileids that make up this corpus :type fileids: None or str or list """ self._require(self.WORDS, self.POS, self.CHUNK) def get_iob_words(grid): return self._get_iob_words(grid, tagset) return LazyMap(get_iob_words, self._grids(fileids)) # ///////////////////////////////////////////////////////////////// # Grid Reading # ///////////////////////////////////////////////////////////////// def _grids(self, fileids=None): # n.b.: we could cache the object returned here (keyed on # fileids), which would let us reuse the same corpus view for # different things (eg srl and parse trees). return concat( [ StreamBackedCorpusView(fileid, self._read_grid_block, encoding=enc) for (fileid, enc) in self.abspaths(fileids, True) ] ) def _read_grid_block(self, stream): grids = [] for block in read_blankline_block(stream): block = block.strip() if not block: continue grid = [line.split(self.sep) for line in block.split("\n")] # If there's a docstart row, then discard. ([xx] eventually it # would be good to actually use it) if grid[0][self._colmap.get("words", 0)] == "-DOCSTART-": del grid[0] # Check that the grid is consistent. for row in grid: if len(row) != len(grid[0]): raise ValueError("Inconsistent number of columns:\n%s" % block) grids.append(grid) return grids # ///////////////////////////////////////////////////////////////// # Transforms # ///////////////////////////////////////////////////////////////// # given a grid, transform it into some representation (e.g., # a list of words or a parse tree). def _get_words(self, grid): return self._get_column(grid, self._colmap["words"]) def _get_tagged_words(self, grid, tagset=None): pos_tags = self._get_column(grid, self._colmap["pos"]) if tagset and tagset != self._tagset: pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags] return list(zip(self._get_column(grid, self._colmap["words"]), pos_tags)) def _get_iob_words(self, grid, tagset=None): pos_tags = self._get_column(grid, self._colmap["pos"]) if tagset and tagset != self._tagset: pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags] return list( zip( self._get_column(grid, self._colmap["words"]), pos_tags, self._get_column(grid, self._colmap["chunk"]), ) ) def _get_chunked_words(self, grid, chunk_types, tagset=None): # n.b.: this method is very similar to conllstr2tree. words = self._get_column(grid, self._colmap["words"]) pos_tags = self._get_column(grid, self._colmap["pos"]) if tagset and tagset != self._tagset: pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags] chunk_tags = self._get_column(grid, self._colmap["chunk"]) stack = [Tree(self._root_label, [])] for (word, pos_tag, chunk_tag) in zip(words, pos_tags, chunk_tags): if chunk_tag == "O": state, chunk_type = "O", "" else: (state, chunk_type) = chunk_tag.split("-") # If it's a chunk we don't care about, treat it as O. if chunk_types is not None and chunk_type not in chunk_types: state = "O" # Treat a mismatching I like a B. if state == "I" and chunk_type != stack[-1].label(): state = "B" # For B or I: close any open chunks if state in "BO" and len(stack) == 2: stack.pop() # For B: start a new chunk. 
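            # A "B" tag opens a fresh chunk subtree: the subtree is attached
            # to the root tree and pushed onto the stack, so this token and
            # any following "I" tokens are appended inside it until the chunk
            # is closed by the next "B" or "O" tag (handled just above).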
if state == "B": new_chunk = Tree(chunk_type, []) stack[-1].append(new_chunk) stack.append(new_chunk) # Add the word token. stack[-1].append((word, pos_tag)) return stack[0] def _get_parsed_sent(self, grid, pos_in_tree, tagset=None): words = self._get_column(grid, self._colmap["words"]) pos_tags = self._get_column(grid, self._colmap["pos"]) if tagset and tagset != self._tagset: pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags] parse_tags = self._get_column(grid, self._colmap["tree"]) treestr = "" for (word, pos_tag, parse_tag) in zip(words, pos_tags, parse_tags): if word == "(": word = "-LRB-" if word == ")": word = "-RRB-" if pos_tag == "(": pos_tag = "-LRB-" if pos_tag == ")": pos_tag = "-RRB-" (left, right) = parse_tag.split("*") right = right.count(")") * ")" # only keep ')'. treestr += f"{left} ({pos_tag} {word}) {right}" try: tree = self._tree_class.fromstring(treestr) except (ValueError, IndexError): tree = self._tree_class.fromstring(f"({self._root_label} {treestr})") if not pos_in_tree: for subtree in tree.subtrees(): for i, child in enumerate(subtree): if ( isinstance(child, Tree) and len(child) == 1 and isinstance(child[0], str) ): subtree[i] = (child[0], child.label()) return tree def _get_srl_spans(self, grid): """ list of list of (start, end), tag) tuples """ if self._srl_includes_roleset: predicates = self._get_column(grid, self._colmap["srl"] + 1) start_col = self._colmap["srl"] + 2 else: predicates = self._get_column(grid, self._colmap["srl"]) start_col = self._colmap["srl"] + 1 # Count how many predicates there are. This tells us how many # columns to expect for SRL data. num_preds = len([p for p in predicates if p != "-"]) spanlists = [] for i in range(num_preds): col = self._get_column(grid, start_col + i) spanlist = [] stack = [] for wordnum, srl_tag in enumerate(col): (left, right) = srl_tag.split("*") for tag in left.split("("): if tag: stack.append((tag, wordnum)) for i in range(right.count(")")): (tag, start) = stack.pop() spanlist.append(((start, wordnum + 1), tag)) spanlists.append(spanlist) return spanlists def _get_srl_instances(self, grid, pos_in_tree): tree = self._get_parsed_sent(grid, pos_in_tree) spanlists = self._get_srl_spans(grid) if self._srl_includes_roleset: predicates = self._get_column(grid, self._colmap["srl"] + 1) rolesets = self._get_column(grid, self._colmap["srl"]) else: predicates = self._get_column(grid, self._colmap["srl"]) rolesets = [None] * len(predicates) instances = ConllSRLInstanceList(tree) for wordnum, predicate in enumerate(predicates): if predicate == "-": continue # Decide which spanlist to use. Don't assume that they're # sorted in the same order as the predicates (even though # they usually are). for spanlist in spanlists: for (start, end), tag in spanlist: if wordnum in range(start, end) and tag in ("V", "C-V"): break else: continue break else: raise ValueError("No srl column found for %r" % predicate) instances.append( ConllSRLInstance(tree, wordnum, predicate, rolesets[wordnum], spanlist) ) return instances # ///////////////////////////////////////////////////////////////// # Helper Methods # ///////////////////////////////////////////////////////////////// def _require(self, *columntypes): for columntype in columntypes: if columntype not in self._colmap: raise ValueError( "This corpus does not contain a %s " "column." 
% columntype ) @staticmethod def _get_column(grid, column_index): return [grid[i][column_index] for i in range(len(grid))] class ConllSRLInstance: """ An SRL instance from a CoNLL corpus, which identifies and providing labels for the arguments of a single verb. """ # [xx] add inst.core_arguments, inst.argm_arguments? def __init__(self, tree, verb_head, verb_stem, roleset, tagged_spans): self.verb = [] """A list of the word indices of the words that compose the verb whose arguments are identified by this instance. This will contain multiple word indices when multi-word verbs are used (e.g. 'turn on').""" self.verb_head = verb_head """The word index of the head word of the verb whose arguments are identified by this instance. E.g., for a sentence that uses the verb 'turn on,' ``verb_head`` will be the word index of the word 'turn'.""" self.verb_stem = verb_stem self.roleset = roleset self.arguments = [] """A list of ``(argspan, argid)`` tuples, specifying the location and type for each of the arguments identified by this instance. ``argspan`` is a tuple ``start, end``, indicating that the argument consists of the ``words[start:end]``.""" self.tagged_spans = tagged_spans """A list of ``(span, id)`` tuples, specifying the location and type for each of the arguments, as well as the verb pieces, that make up this instance.""" self.tree = tree """The parse tree for the sentence containing this instance.""" self.words = tree.leaves() """A list of the words in the sentence containing this instance.""" # Fill in the self.verb and self.arguments values. for (start, end), tag in tagged_spans: if tag in ("V", "C-V"): self.verb += list(range(start, end)) else: self.arguments.append(((start, end), tag)) def __repr__(self): # Originally, its: ##plural = 's' if len(self.arguments) != 1 else '' plural = "s" if len(self.arguments) != 1 else "" return "" % ( (self.verb_stem, len(self.arguments), plural) ) def pprint(self): verbstr = " ".join(self.words[i][0] for i in self.verb) hdr = f"SRL for {verbstr!r} (stem={self.verb_stem!r}):\n" s = "" for i, word in enumerate(self.words): if isinstance(word, tuple): word = word[0] for (start, end), argid in self.arguments: if i == start: s += "[%s " % argid if i == end: s += "] " if i in self.verb: word = "<<%s>>" % word s += word + " " return hdr + textwrap.fill( s.replace(" ]", "]"), initial_indent=" ", subsequent_indent=" " ) class ConllSRLInstanceList(list): """ Set of instances for a single sentence """ def __init__(self, tree, instances=()): self.tree = tree list.__init__(self, instances) def __str__(self): return self.pprint() def pprint(self, include_tree=False): # Sanity check: trees should be the same for inst in self: if inst.tree != self.tree: raise ValueError("Tree mismatch!") # If desired, add trees: if include_tree: words = self.tree.leaves() pos = [None] * len(words) synt = ["*"] * len(words) self._tree2conll(self.tree, 0, words, pos, synt) s = "" for i in range(len(words)): # optional tree columns if include_tree: s += "%-20s " % words[i] s += "%-8s " % pos[i] s += "%15s*%-8s " % tuple(synt[i].split("*")) # verb head column for inst in self: if i == inst.verb_head: s += "%-20s " % inst.verb_stem break else: s += "%-20s " % "-" # Remaining columns: self for inst in self: argstr = "*" for (start, end), argid in inst.tagged_spans: if i == start: argstr = f"({argid}{argstr}" if i == (end - 1): argstr += ")" s += "%-12s " % argstr s += "\n" return s def _tree2conll(self, tree, wordnum, words, pos, synt): assert isinstance(tree, Tree) if len(tree) == 1 and 
isinstance(tree[0], str): pos[wordnum] = tree.label() assert words[wordnum] == tree[0] return wordnum + 1 elif len(tree) == 1 and isinstance(tree[0], tuple): assert len(tree[0]) == 2 pos[wordnum], pos[wordnum] = tree[0] return wordnum + 1 else: synt[wordnum] = f"({tree.label()}{synt[wordnum]}" for child in tree: wordnum = self._tree2conll(child, wordnum, words, pos, synt) synt[wordnum - 1] += ")" return wordnum class ConllChunkCorpusReader(ConllCorpusReader): """ A ConllCorpusReader whose data file contains three columns: words, pos, and chunk. """ def __init__( self, root, fileids, chunk_types, encoding="utf8", tagset=None, separator=None ): ConllCorpusReader.__init__( self, root, fileids, ("words", "pos", "chunk"), chunk_types=chunk_types, encoding=encoding, tagset=tagset, separator=separator, ) nltk-3.7/nltk/corpus/reader/crubadan.py000066400000000000000000000067011420073152400202130ustar00rootroot00000000000000# Natural Language Toolkit: An Crubadan N-grams Reader # # Copyright (C) 2001-2022 NLTK Project # Author: Avital Pekker # # URL: # For license information, see LICENSE.TXT """ An NLTK interface for the n-gram statistics gathered from the corpora for each language using An Crubadan. There are multiple potential applications for the data but this reader was created with the goal of using it in the context of language identification. For details about An Crubadan, this data, and its potential uses, see: http://borel.slu.edu/crubadan/index.html """ import re from os import path from nltk.corpus.reader import CorpusReader from nltk.data import ZipFilePathPointer from nltk.probability import FreqDist class CrubadanCorpusReader(CorpusReader): """ A corpus reader used to access language An Crubadan n-gram files. """ _LANG_MAPPER_FILE = "table.txt" _all_lang_freq = {} def __init__(self, root, fileids, encoding="utf8", tagset=None): super().__init__(root, fileids, encoding="utf8") self._lang_mapping_data = [] self._load_lang_mapping_data() def lang_freq(self, lang): """Return n-gram FreqDist for a specific language given ISO 639-3 language code""" if lang not in self._all_lang_freq: self._all_lang_freq[lang] = self._load_lang_ngrams(lang) return self._all_lang_freq[lang] def langs(self): """Return a list of supported languages as ISO 639-3 codes""" return [row[1] for row in self._lang_mapping_data] def iso_to_crubadan(self, lang): """Return internal Crubadan code based on ISO 639-3 code""" for i in self._lang_mapping_data: if i[1].lower() == lang.lower(): return i[0] def crubadan_to_iso(self, lang): """Return ISO 639-3 code given internal Crubadan code""" for i in self._lang_mapping_data: if i[0].lower() == lang.lower(): return i[1] def _load_lang_mapping_data(self): """Load language mappings between codes and description from table.txt""" if isinstance(self.root, ZipFilePathPointer): raise RuntimeError( "Please install the 'crubadan' corpus first, use nltk.download()" ) mapper_file = path.join(self.root, self._LANG_MAPPER_FILE) if self._LANG_MAPPER_FILE not in self.fileids(): raise RuntimeError("Could not find language mapper file: " + mapper_file) with open(mapper_file, encoding="utf-8") as raw: strip_raw = raw.read().strip() self._lang_mapping_data = [row.split("\t") for row in strip_raw.split("\n")] def _load_lang_ngrams(self, lang): """Load single n-gram language file given the ISO 639-3 language code and return its FreqDist""" if lang not in self.langs(): raise RuntimeError("Unsupported language.") crubadan_code = self.iso_to_crubadan(lang) ngram_file = path.join(self.root, 
crubadan_code + "-3grams.txt") if not path.isfile(ngram_file): raise RuntimeError("No N-gram file found for requested language.") counts = FreqDist() with open(ngram_file, encoding="utf-8") as f: for line in f: data = line.split(" ") ngram = data[1].strip("\n") freq = int(data[0]) counts[ngram] = freq return counts nltk-3.7/nltk/corpus/reader/dependency.py000066400000000000000000000072771420073152400205630ustar00rootroot00000000000000# Natural Language Toolkit: Dependency Corpus Reader # # Copyright (C) 2001-2022 NLTK Project # Author: Kepa Sarasola # Iker Manterola # # URL: # For license information, see LICENSE.TXT from nltk.corpus.reader.api import * from nltk.corpus.reader.util import * from nltk.parse import DependencyGraph from nltk.tokenize import * class DependencyCorpusReader(SyntaxCorpusReader): def __init__( self, root, fileids, encoding="utf8", word_tokenizer=TabTokenizer(), sent_tokenizer=RegexpTokenizer("\n", gaps=True), para_block_reader=read_blankline_block, ): SyntaxCorpusReader.__init__(self, root, fileids, encoding) ######################################################### def words(self, fileids=None): return concat( [ DependencyCorpusView(fileid, False, False, False, encoding=enc) for fileid, enc in self.abspaths(fileids, include_encoding=True) ] ) def tagged_words(self, fileids=None): return concat( [ DependencyCorpusView(fileid, True, False, False, encoding=enc) for fileid, enc in self.abspaths(fileids, include_encoding=True) ] ) def sents(self, fileids=None): return concat( [ DependencyCorpusView(fileid, False, True, False, encoding=enc) for fileid, enc in self.abspaths(fileids, include_encoding=True) ] ) def tagged_sents(self, fileids=None): return concat( [ DependencyCorpusView(fileid, True, True, False, encoding=enc) for fileid, enc in self.abspaths(fileids, include_encoding=True) ] ) def parsed_sents(self, fileids=None): sents = concat( [ DependencyCorpusView(fileid, False, True, True, encoding=enc) for fileid, enc in self.abspaths(fileids, include_encoding=True) ] ) return [DependencyGraph(sent) for sent in sents] class DependencyCorpusView(StreamBackedCorpusView): _DOCSTART = "-DOCSTART- -DOCSTART- O\n" # dokumentu hasiera definitzen da def __init__( self, corpus_file, tagged, group_by_sent, dependencies, chunk_types=None, encoding="utf8", ): self._tagged = tagged self._dependencies = dependencies self._group_by_sent = group_by_sent self._chunk_types = chunk_types StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding) def read_block(self, stream): # Read the next sentence. sent = read_blankline_block(stream)[0].strip() # Strip off the docstart marker, if present. if sent.startswith(self._DOCSTART): sent = sent[len(self._DOCSTART) :].lstrip() # extract word and tag from any of the formats if not self._dependencies: lines = [line.split("\t") for line in sent.split("\n")] if len(lines[0]) == 3 or len(lines[0]) == 4: sent = [(line[0], line[1]) for line in lines] elif len(lines[0]) == 10: sent = [(line[1], line[4]) for line in lines] else: raise ValueError("Unexpected number of fields in dependency tree file") # discard tags if they weren't requested if not self._tagged: sent = [word for (word, tag) in sent] # Return the result. 
if self._group_by_sent: return [sent] else: return list(sent) nltk-3.7/nltk/corpus/reader/framenet.py000066400000000000000000003771771420073152400202570ustar00rootroot00000000000000# Natural Language Toolkit: Framenet Corpus Reader # # Copyright (C) 2001-2022 NLTK Project # Authors: Chuck Wooters , # Nathan Schneider # URL: # For license information, see LICENSE.TXT """ Corpus reader for the FrameNet 1.7 lexicon and corpus. """ import itertools import os import re import sys import textwrap import types from collections import OrderedDict, defaultdict from itertools import zip_longest from operator import itemgetter from pprint import pprint from nltk.corpus.reader import XMLCorpusReader, XMLCorpusView from nltk.util import LazyConcatenation, LazyIteratorList, LazyMap __docformat__ = "epytext en" def mimic_wrap(lines, wrap_at=65, **kwargs): """ Wrap the first of 'lines' with textwrap and the remaining lines at exactly the same positions as the first. """ l0 = textwrap.fill(lines[0], wrap_at, drop_whitespace=False).split("\n") yield l0 def _(line): il0 = 0 while line and il0 < len(l0) - 1: yield line[: len(l0[il0])] line = line[len(l0[il0]) :] il0 += 1 if line: # Remaining stuff on this line past the end of the mimicked line. # So just textwrap this line. yield from textwrap.fill(line, wrap_at, drop_whitespace=False).split("\n") for l in lines[1:]: yield list(_(l)) def _pretty_longstring(defstr, prefix="", wrap_at=65): """ Helper function for pretty-printing a long string. :param defstr: The string to be printed. :type defstr: str :return: A nicely formatted string representation of the long string. :rtype: str """ outstr = "" for line in textwrap.fill(defstr, wrap_at).split("\n"): outstr += prefix + line + "\n" return outstr def _pretty_any(obj): """ Helper function for pretty-printing any AttrDict object. :param obj: The obj to be printed. :type obj: AttrDict :return: A nicely formatted string representation of the AttrDict object. :rtype: str """ outstr = "" for k in obj: if isinstance(obj[k], str) and len(obj[k]) > 65: outstr += f"[{k}]\n" outstr += "{}".format(_pretty_longstring(obj[k], prefix=" ")) outstr += "\n" else: outstr += f"[{k}] {obj[k]}\n" return outstr def _pretty_semtype(st): """ Helper function for pretty-printing a semantic type. :param st: The semantic type to be printed. :type st: AttrDict :return: A nicely formatted string representation of the semantic type. :rtype: str """ semkeys = st.keys() if len(semkeys) == 1: return "" outstr = "" outstr += "semantic type ({0.ID}): {0.name}\n".format(st) if "abbrev" in semkeys: outstr += f"[abbrev] {st.abbrev}\n" if "definition" in semkeys: outstr += "[definition]\n" outstr += _pretty_longstring(st.definition, " ") outstr += f"[rootType] {st.rootType.name}({st.rootType.ID})\n" if st.superType is None: outstr += "[superType] \n" else: outstr += f"[superType] {st.superType.name}({st.superType.ID})\n" outstr += f"[subTypes] {len(st.subTypes)} subtypes\n" outstr += ( " " + ", ".join(f"{x.name}({x.ID})" for x in st.subTypes) + "\n" * (len(st.subTypes) > 0) ) return outstr def _pretty_frame_relation_type(freltyp): """ Helper function for pretty-printing a frame relation type. :param freltyp: The frame relation type to be printed. :type freltyp: AttrDict :return: A nicely formatted string representation of the frame relation type. :rtype: str """ outstr = " {0.subFrameName}>".format( freltyp ) return outstr def _pretty_frame_relation(frel): """ Helper function for pretty-printing a frame relation. 
:param frel: The frame relation to be printed. :type frel: AttrDict :return: A nicely formatted string representation of the frame relation. :rtype: str """ outstr = "<{0.type.superFrameName}={0.superFrameName} -- {0.type.name} -> {0.type.subFrameName}={0.subFrameName}>".format( frel ) return outstr def _pretty_fe_relation(ferel): """ Helper function for pretty-printing an FE relation. :param ferel: The FE relation to be printed. :type ferel: AttrDict :return: A nicely formatted string representation of the FE relation. :rtype: str """ outstr = "<{0.type.superFrameName}={0.frameRelation.superFrameName}.{0.superFEName} -- {0.type.name} -> {0.type.subFrameName}={0.frameRelation.subFrameName}.{0.subFEName}>".format( ferel ) return outstr def _pretty_lu(lu): """ Helper function for pretty-printing a lexical unit. :param lu: The lu to be printed. :type lu: AttrDict :return: A nicely formatted string representation of the lexical unit. :rtype: str """ lukeys = lu.keys() outstr = "" outstr += "lexical unit ({0.ID}): {0.name}\n\n".format(lu) if "definition" in lukeys: outstr += "[definition]\n" outstr += _pretty_longstring(lu.definition, " ") if "frame" in lukeys: outstr += f"\n[frame] {lu.frame.name}({lu.frame.ID})\n" if "incorporatedFE" in lukeys: outstr += f"\n[incorporatedFE] {lu.incorporatedFE}\n" if "POS" in lukeys: outstr += f"\n[POS] {lu.POS}\n" if "status" in lukeys: outstr += f"\n[status] {lu.status}\n" if "totalAnnotated" in lukeys: outstr += f"\n[totalAnnotated] {lu.totalAnnotated} annotated examples\n" if "lexemes" in lukeys: outstr += "\n[lexemes] {}\n".format( " ".join(f"{lex.name}/{lex.POS}" for lex in lu.lexemes) ) if "semTypes" in lukeys: outstr += f"\n[semTypes] {len(lu.semTypes)} semantic types\n" outstr += ( " " * (len(lu.semTypes) > 0) + ", ".join(f"{x.name}({x.ID})" for x in lu.semTypes) + "\n" * (len(lu.semTypes) > 0) ) if "URL" in lukeys: outstr += f"\n[URL] {lu.URL}\n" if "subCorpus" in lukeys: subc = [x.name for x in lu.subCorpus] outstr += f"\n[subCorpus] {len(lu.subCorpus)} subcorpora\n" for line in textwrap.fill(", ".join(sorted(subc)), 60).split("\n"): outstr += f" {line}\n" if "exemplars" in lukeys: outstr += "\n[exemplars] {} sentences across all subcorpora\n".format( len(lu.exemplars) ) return outstr def _pretty_exemplars(exemplars, lu): """ Helper function for pretty-printing a list of exemplar sentences for a lexical unit. :param sent: The list of exemplar sentences to be printed. :type sent: list(AttrDict) :return: An index of the text of the exemplar sentences. :rtype: str """ outstr = "" outstr += "exemplar sentences for {0.name} in {0.frame.name}:\n\n".format(lu) for i, sent in enumerate(exemplars): outstr += f"[{i}] {sent.text}\n" outstr += "\n" return outstr def _pretty_fulltext_sentences(sents): """ Helper function for pretty-printing a list of annotated sentences for a full-text document. :param sent: The list of sentences to be printed. :type sent: list(AttrDict) :return: An index of the text of the sentences. :rtype: str """ outstr = "" outstr += "full-text document ({0.ID}) {0.name}:\n\n".format(sents) outstr += "[corpid] {0.corpid}\n[corpname] {0.corpname}\n[description] {0.description}\n[URL] {0.URL}\n\n".format( sents ) outstr += f"[sentence]\n" for i, sent in enumerate(sents.sentence): outstr += f"[{i}] {sent.text}\n" outstr += "\n" return outstr def _pretty_fulltext_sentence(sent): """ Helper function for pretty-printing an annotated sentence from a full-text document. :param sent: The sentence to be printed. 
:type sent: list(AttrDict) :return: The text of the sentence with annotation set indices on frame targets. :rtype: str """ outstr = "" outstr += "full-text sentence ({0.ID}) in {1}:\n\n".format( sent, sent.doc.get("name", sent.doc.description) ) outstr += f"\n[POS] {len(sent.POS)} tags\n" outstr += f"\n[POS_tagset] {sent.POS_tagset}\n\n" outstr += "[text] + [annotationSet]\n\n" outstr += sent._ascii() # -> _annotation_ascii() outstr += "\n" return outstr def _pretty_pos(aset): """ Helper function for pretty-printing a sentence with its POS tags. :param aset: The POS annotation set of the sentence to be printed. :type sent: list(AttrDict) :return: The text of the sentence and its POS tags. :rtype: str """ outstr = "" outstr += "POS annotation set ({0.ID}) {0.POS_tagset} in sentence {0.sent.ID}:\n\n".format( aset ) # list the target spans and their associated aset index overt = sorted(aset.POS) sent = aset.sent s0 = sent.text s1 = "" s2 = "" i = 0 adjust = 0 for j, k, lbl in overt: assert j >= i, ("Overlapping targets?", (j, k, lbl)) s1 += " " * (j - i) + "-" * (k - j) if len(lbl) > (k - j): # add space in the sentence to make room for the annotation index amt = len(lbl) - (k - j) s0 = ( s0[: k + adjust] + "~" * amt + s0[k + adjust :] ) # '~' to prevent line wrapping s1 = s1[: k + adjust] + " " * amt + s1[k + adjust :] adjust += amt s2 += " " * (j - i) + lbl.ljust(k - j) i = k long_lines = [s0, s1, s2] outstr += "\n\n".join( map("\n".join, zip_longest(*mimic_wrap(long_lines), fillvalue=" ")) ).replace("~", " ") outstr += "\n" return outstr def _pretty_annotation(sent, aset_level=False): """ Helper function for pretty-printing an exemplar sentence for a lexical unit. :param sent: An annotation set or exemplar sentence to be printed. :param aset_level: If True, 'sent' is actually an annotation set within a sentence. :type sent: AttrDict :return: A nicely formatted string representation of the exemplar sentence with its target, frame, and FE annotations. :rtype: str """ sentkeys = sent.keys() outstr = "annotation set" if aset_level else "exemplar sentence" outstr += f" ({sent.ID}):\n" if aset_level: # TODO: any UNANN exemplars? outstr += f"\n[status] {sent.status}\n" for k in ("corpID", "docID", "paragNo", "sentNo", "aPos"): if k in sentkeys: outstr += f"[{k}] {sent[k]}\n" outstr += ( "\n[LU] ({0.ID}) {0.name} in {0.frame.name}\n".format(sent.LU) if sent.LU else "\n[LU] Not found!" ) outstr += "\n[frame] ({0.ID}) {0.name}\n".format( sent.frame ) # redundant with above, but .frame is convenient if not aset_level: outstr += "\n[annotationSet] {} annotation sets\n".format( len(sent.annotationSet) ) outstr += f"\n[POS] {len(sent.POS)} tags\n" outstr += f"\n[POS_tagset] {sent.POS_tagset}\n" outstr += "\n[GF] {} relation{}\n".format( len(sent.GF), "s" if len(sent.GF) != 1 else "" ) outstr += "\n[PT] {} phrase{}\n".format( len(sent.PT), "s" if len(sent.PT) != 1 else "" ) """ Special Layers -------------- The 'NER' layer contains, for some of the data, named entity labels. The 'WSL' (word status layer) contains, for some of the data, spans which should not in principle be considered targets (NT). The 'Other' layer records relative clause constructions (Rel=relativizer, Ant=antecedent), pleonastic 'it' (Null), and existential 'there' (Exist). On occasion they are duplicated by accident (e.g., annotationSet 1467275 in lu6700.xml). 
The 'Sent' layer appears to contain labels that the annotator has flagged the sentence with for their convenience: values include 'sense1', 'sense2', 'sense3', etc.; 'Blend', 'Canonical', 'Idiom', 'Metaphor', 'Special-Sent', 'keepS', 'deleteS', 'reexamine' (sometimes they are duplicated for no apparent reason). The POS-specific layers may contain the following kinds of spans: Asp (aspectual particle), Non-Asp (non-aspectual particle), Cop (copula), Supp (support), Ctrlr (controller), Gov (governor), X. Gov and X always cooccur. >>> from nltk.corpus import framenet as fn >>> def f(luRE, lyr, ignore=set()): ... for i,ex in enumerate(fn.exemplars(luRE)): ... if lyr in ex and ex[lyr] and set(zip(*ex[lyr])[2]) - ignore: ... print(i,ex[lyr]) - Verb: Asp, Non-Asp - Noun: Cop, Supp, Ctrlr, Gov, X - Adj: Cop, Supp, Ctrlr, Gov, X - Prep: Cop, Supp, Ctrlr - Adv: Ctrlr - Scon: (none) - Art: (none) """ for lyr in ("NER", "WSL", "Other", "Sent"): if lyr in sent and sent[lyr]: outstr += "\n[{}] {} entr{}\n".format( lyr, len(sent[lyr]), "ies" if len(sent[lyr]) != 1 else "y" ) outstr += "\n[text] + [Target] + [FE]" # POS-specific layers: syntactically important words that are neither the target # nor the FEs. Include these along with the first FE layer but with '^' underlining. for lyr in ("Verb", "Noun", "Adj", "Adv", "Prep", "Scon", "Art"): if lyr in sent and sent[lyr]: outstr += f" + [{lyr}]" if "FE2" in sentkeys: outstr += " + [FE2]" if "FE3" in sentkeys: outstr += " + [FE3]" outstr += "\n\n" outstr += sent._ascii() # -> _annotation_ascii() outstr += "\n" return outstr def _annotation_ascii(sent): """ Given a sentence or FE annotation set, construct the width-limited string showing an ASCII visualization of the sentence's annotations, calling either _annotation_ascii_frames() or _annotation_ascii_FEs() as appropriate. This will be attached as a method to appropriate AttrDict instances and called in the full pretty-printing of the instance. """ if sent._type == "fulltext_sentence" or ( "annotationSet" in sent and len(sent.annotationSet) > 2 ): # a full-text sentence OR sentence with multiple targets. # (multiple targets = >2 annotation sets, because the first annotation set is POS.) return _annotation_ascii_frames(sent) else: # an FE annotation set, or an LU sentence with 1 target return _annotation_ascii_FEs(sent) def _annotation_ascii_frames(sent): """ ASCII string rendering of the sentence along with its targets and frame names. Called for all full-text sentences, as well as the few LU sentences with multiple targets (e.g., fn.lu(6412).exemplars[82] has two want.v targets). Line-wrapped to limit the display width. """ # list the target spans and their associated aset index overt = [] for a, aset in enumerate(sent.annotationSet[1:]): for j, k in aset.Target: indexS = f"[{a + 1}]" if aset.status == "UNANN" or aset.LU.status == "Problem": indexS += " " if aset.status == "UNANN": indexS += "!" # warning indicator that there is a frame annotation but no FE annotation if aset.LU.status == "Problem": indexS += "?" # warning indicator that there is a missing LU definition (because the LU has Problem status) overt.append((j, k, aset.LU.frame.name, indexS)) overt = sorted(overt) duplicates = set() for o, (j, k, fname, asetIndex) in enumerate(overt): if o > 0 and j <= overt[o - 1][1]: # multiple annotation sets on the same target # (e.g. 
due to a coordination construction or multiple annotators) if ( overt[o - 1][:2] == (j, k) and overt[o - 1][2] == fname ): # same target, same frame # splice indices together combinedIndex = ( overt[o - 1][3] + asetIndex ) # e.g., '[1][2]', '[1]! [2]' combinedIndex = combinedIndex.replace(" !", "! ").replace(" ?", "? ") overt[o - 1] = overt[o - 1][:3] + (combinedIndex,) duplicates.add(o) else: # different frames, same or overlapping targets s = sent.text for j, k, fname, asetIndex in overt: s += "\n" + asetIndex + " " + sent.text[j:k] + " :: " + fname s += "\n(Unable to display sentence with targets marked inline due to overlap)" return s for o in reversed(sorted(duplicates)): del overt[o] s0 = sent.text s1 = "" s11 = "" s2 = "" i = 0 adjust = 0 fAbbrevs = OrderedDict() for j, k, fname, asetIndex in overt: if not j >= i: assert j >= i, ( "Overlapping targets?" + ( " UNANN" if any(aset.status == "UNANN" for aset in sent.annotationSet[1:]) else "" ), (j, k, asetIndex), ) s1 += " " * (j - i) + "*" * (k - j) short = fname[: k - j] if (k - j) < len(fname): r = 0 while short in fAbbrevs: if fAbbrevs[short] == fname: break r += 1 short = fname[: k - j - 1] + str(r) else: # short not in fAbbrevs fAbbrevs[short] = fname s11 += " " * (j - i) + short.ljust(k - j) if len(asetIndex) > (k - j): # add space in the sentence to make room for the annotation index amt = len(asetIndex) - (k - j) s0 = ( s0[: k + adjust] + "~" * amt + s0[k + adjust :] ) # '~' to prevent line wrapping s1 = s1[: k + adjust] + " " * amt + s1[k + adjust :] s11 = s11[: k + adjust] + " " * amt + s11[k + adjust :] adjust += amt s2 += " " * (j - i) + asetIndex.ljust(k - j) i = k long_lines = [s0, s1, s11, s2] outstr = "\n\n".join( map("\n".join, zip_longest(*mimic_wrap(long_lines), fillvalue=" ")) ).replace("~", " ") outstr += "\n" if fAbbrevs: outstr += " (" + ", ".join("=".join(pair) for pair in fAbbrevs.items()) + ")" assert len(fAbbrevs) == len(dict(fAbbrevs)), "Abbreviation clash" return outstr def _annotation_ascii_FE_layer(overt, ni, feAbbrevs): """Helper for _annotation_ascii_FEs().""" s1 = "" s2 = "" i = 0 for j, k, fename in overt: s1 += " " * (j - i) + ("^" if fename.islower() else "-") * (k - j) short = fename[: k - j] if len(fename) > len(short): r = 0 while short in feAbbrevs: if feAbbrevs[short] == fename: break r += 1 short = fename[: k - j - 1] + str(r) else: # short not in feAbbrevs feAbbrevs[short] = fename s2 += " " * (j - i) + short.ljust(k - j) i = k sNI = "" if ni: sNI += " [" + ", ".join(":".join(x) for x in sorted(ni.items())) + "]" return [s1, s2, sNI] def _annotation_ascii_FEs(sent): """ ASCII string rendering of the sentence along with a single target and its FEs. Secondary and tertiary FE layers are included if present. 'sent' can be an FE annotation set or an LU sentence with a single target. Line-wrapped to limit the display width. 
""" feAbbrevs = OrderedDict() posspec = [] # POS-specific layer spans (e.g., Supp[ort], Cop[ula]) posspec_separate = False for lyr in ("Verb", "Noun", "Adj", "Adv", "Prep", "Scon", "Art"): if lyr in sent and sent[lyr]: for a, b, lbl in sent[lyr]: if ( lbl == "X" ): # skip this, which covers an entire phrase typically containing the target and all its FEs # (but do display the Gov) continue if any(1 for x, y, felbl in sent.FE[0] if x <= a < y or a <= x < b): # overlap between one of the POS-specific layers and first FE layer posspec_separate = ( True # show POS-specific layers on a separate line ) posspec.append( (a, b, lbl.lower().replace("-", "")) ) # lowercase Cop=>cop, Non-Asp=>nonasp, etc. to distinguish from FE names if posspec_separate: POSSPEC = _annotation_ascii_FE_layer(posspec, {}, feAbbrevs) FE1 = _annotation_ascii_FE_layer( sorted(sent.FE[0] + (posspec if not posspec_separate else [])), sent.FE[1], feAbbrevs, ) FE2 = FE3 = None if "FE2" in sent: FE2 = _annotation_ascii_FE_layer(sent.FE2[0], sent.FE2[1], feAbbrevs) if "FE3" in sent: FE3 = _annotation_ascii_FE_layer(sent.FE3[0], sent.FE3[1], feAbbrevs) for i, j in sent.Target: FE1span, FE1name, FE1exp = FE1 if len(FE1span) < j: FE1span += " " * (j - len(FE1span)) if len(FE1name) < j: FE1name += " " * (j - len(FE1name)) FE1[1] = FE1name FE1[0] = ( FE1span[:i] + FE1span[i:j].replace(" ", "*").replace("-", "=") + FE1span[j:] ) long_lines = [sent.text] if posspec_separate: long_lines.extend(POSSPEC[:2]) long_lines.extend([FE1[0], FE1[1] + FE1[2]]) # lines with no length limit if FE2: long_lines.extend([FE2[0], FE2[1] + FE2[2]]) if FE3: long_lines.extend([FE3[0], FE3[1] + FE3[2]]) long_lines.append("") outstr = "\n".join( map("\n".join, zip_longest(*mimic_wrap(long_lines), fillvalue=" ")) ) if feAbbrevs: outstr += "(" + ", ".join("=".join(pair) for pair in feAbbrevs.items()) + ")" assert len(feAbbrevs) == len(dict(feAbbrevs)), "Abbreviation clash" outstr += "\n" return outstr def _pretty_fe(fe): """ Helper function for pretty-printing a frame element. :param fe: The frame element to be printed. :type fe: AttrDict :return: A nicely formatted string representation of the frame element. :rtype: str """ fekeys = fe.keys() outstr = "" outstr += "frame element ({0.ID}): {0.name}\n of {1.name}({1.ID})\n".format( fe, fe.frame ) if "definition" in fekeys: outstr += "[definition]\n" outstr += _pretty_longstring(fe.definition, " ") if "abbrev" in fekeys: outstr += f"[abbrev] {fe.abbrev}\n" if "coreType" in fekeys: outstr += f"[coreType] {fe.coreType}\n" if "requiresFE" in fekeys: outstr += "[requiresFE] " if fe.requiresFE is None: outstr += "\n" else: outstr += f"{fe.requiresFE.name}({fe.requiresFE.ID})\n" if "excludesFE" in fekeys: outstr += "[excludesFE] " if fe.excludesFE is None: outstr += "\n" else: outstr += f"{fe.excludesFE.name}({fe.excludesFE.ID})\n" if "semType" in fekeys: outstr += "[semType] " if fe.semType is None: outstr += "\n" else: outstr += "\n " + f"{fe.semType.name}({fe.semType.ID})" + "\n" return outstr def _pretty_frame(frame): """ Helper function for pretty-printing a frame. :param frame: The frame to be printed. :type frame: AttrDict :return: A nicely formatted string representation of the frame. 
:rtype: str """ outstr = "" outstr += "frame ({0.ID}): {0.name}\n\n".format(frame) outstr += f"[URL] {frame.URL}\n\n" outstr += "[definition]\n" outstr += _pretty_longstring(frame.definition, " ") + "\n" outstr += f"[semTypes] {len(frame.semTypes)} semantic types\n" outstr += ( " " * (len(frame.semTypes) > 0) + ", ".join(f"{x.name}({x.ID})" for x in frame.semTypes) + "\n" * (len(frame.semTypes) > 0) ) outstr += "\n[frameRelations] {} frame relations\n".format( len(frame.frameRelations) ) outstr += " " + "\n ".join(repr(frel) for frel in frame.frameRelations) + "\n" outstr += f"\n[lexUnit] {len(frame.lexUnit)} lexical units\n" lustrs = [] for luName, lu in sorted(frame.lexUnit.items()): tmpstr = f"{luName} ({lu.ID})" lustrs.append(tmpstr) outstr += "{}\n".format(_pretty_longstring(", ".join(lustrs), prefix=" ")) outstr += f"\n[FE] {len(frame.FE)} frame elements\n" fes = {} for feName, fe in sorted(frame.FE.items()): try: fes[fe.coreType].append(f"{feName} ({fe.ID})") except KeyError: fes[fe.coreType] = [] fes[fe.coreType].append(f"{feName} ({fe.ID})") for ct in sorted( fes.keys(), key=lambda ct2: [ "Core", "Core-Unexpressed", "Peripheral", "Extra-Thematic", ].index(ct2), ): outstr += "{:>16}: {}\n".format(ct, ", ".join(sorted(fes[ct]))) outstr += "\n[FEcoreSets] {} frame element core sets\n".format( len(frame.FEcoreSets) ) outstr += ( " " + "\n ".join( ", ".join([x.name for x in coreSet]) for coreSet in frame.FEcoreSets ) + "\n" ) return outstr class FramenetError(Exception): """An exception class for framenet-related errors.""" class AttrDict(dict): """A class that wraps a dict and allows accessing the keys of the dict as if they were attributes. Taken from here: https://stackoverflow.com/a/14620633/8879 >>> foo = {'a':1, 'b':2, 'c':3} >>> bar = AttrDict(foo) >>> pprint(dict(bar)) {'a': 1, 'b': 2, 'c': 3} >>> bar.b 2 >>> bar.d = 4 >>> pprint(dict(bar)) {'a': 1, 'b': 2, 'c': 3, 'd': 4} """ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) # self.__dict__ = self def __setattr__(self, name, value): self[name] = value def __getattr__(self, name): if name == "_short_repr": return self._short_repr return self[name] def __getitem__(self, name): v = super().__getitem__(name) if isinstance(v, Future): return v._data() return v def _short_repr(self): if "_type" in self: if self["_type"].endswith("relation"): return self.__repr__() try: return "<{} ID={} name={}>".format( self["_type"], self["ID"], self["name"] ) except KeyError: try: # no ID--e.g., for _type=lusubcorpus return "<{} name={}>".format(self["_type"], self["name"]) except KeyError: # no name--e.g., for _type=lusentence return "<{} ID={}>".format(self["_type"], self["ID"]) else: return self.__repr__() def _str(self): outstr = "" if "_type" not in self: outstr = _pretty_any(self) elif self["_type"] == "frame": outstr = _pretty_frame(self) elif self["_type"] == "fe": outstr = _pretty_fe(self) elif self["_type"] == "lu": outstr = _pretty_lu(self) elif self["_type"] == "luexemplars": # list of ALL exemplars for LU outstr = _pretty_exemplars(self, self[0].LU) elif ( self["_type"] == "fulltext_annotation" ): # list of all sentences for full-text doc outstr = _pretty_fulltext_sentences(self) elif self["_type"] == "lusentence": outstr = _pretty_annotation(self) elif self["_type"] == "fulltext_sentence": outstr = _pretty_fulltext_sentence(self) elif self["_type"] in ("luannotationset", "fulltext_annotationset"): outstr = _pretty_annotation(self, aset_level=True) elif self["_type"] == "posannotationset": outstr = 
_pretty_pos(self) elif self["_type"] == "semtype": outstr = _pretty_semtype(self) elif self["_type"] == "framerelationtype": outstr = _pretty_frame_relation_type(self) elif self["_type"] == "framerelation": outstr = _pretty_frame_relation(self) elif self["_type"] == "ferelation": outstr = _pretty_fe_relation(self) else: outstr = _pretty_any(self) # ensure result is unicode string prior to applying the # decorator (because non-ASCII characters # could in principle occur in the data and would trigger an encoding error when # passed as arguments to str.format()). # assert isinstance(outstr, unicode) # not in Python 3.2 return outstr def __str__(self): return self._str() def __repr__(self): return self.__str__() class SpecialList(list): """ A list subclass which adds a '_type' attribute for special printing (similar to an AttrDict, though this is NOT an AttrDict subclass). """ def __init__(self, typ, *args, **kwargs): super().__init__(*args, **kwargs) self._type = typ def _str(self): outstr = "" assert self._type if len(self) == 0: outstr = "[]" elif self._type == "luexemplars": # list of ALL exemplars for LU outstr = _pretty_exemplars(self, self[0].LU) else: assert False, self._type return outstr def __str__(self): return self._str() def __repr__(self): return self.__str__() class Future: """ Wraps and acts as a proxy for a value to be loaded lazily (on demand). Adapted from https://gist.github.com/sergey-miryanov/2935416 """ def __init__(self, loader, *args, **kwargs): """ :param loader: when called with no arguments, returns the value to be stored :type loader: callable """ super().__init__(*args, **kwargs) self._loader = loader self._d = None def _data(self): if callable(self._loader): self._d = self._loader() self._loader = None # the data is now cached return self._d def __nonzero__(self): return bool(self._data()) def __len__(self): return len(self._data()) def __setitem__(self, key, value): return self._data().__setitem__(key, value) def __getitem__(self, key): return self._data().__getitem__(key) def __getattr__(self, key): return self._data().__getattr__(key) def __str__(self): return self._data().__str__() def __repr__(self): return self._data().__repr__() class PrettyDict(AttrDict): """ Displays an abbreviated repr of values where possible. Inherits from AttrDict, so a callable value will be lazily converted to an actual value. """ def __init__(self, *args, **kwargs): _BREAK_LINES = kwargs.pop("breakLines", False) super().__init__(*args, **kwargs) dict.__setattr__(self, "_BREAK_LINES", _BREAK_LINES) def __repr__(self): parts = [] for k, v in sorted(self.items()): kv = repr(k) + ": " try: kv += v._short_repr() except AttributeError: kv += repr(v) parts.append(kv) return "{" + (",\n " if self._BREAK_LINES else ", ").join(parts) + "}" class PrettyList(list): """ Displays an abbreviated repr of only the first several elements, not the whole list. """ # from nltk.util def __init__(self, *args, **kwargs): self._MAX_REPR_SIZE = kwargs.pop("maxReprSize", 60) self._BREAK_LINES = kwargs.pop("breakLines", False) super().__init__(*args, **kwargs) def __repr__(self): """ Return a string representation for this corpus view that is similar to a list's representation; but if it would be more than 60 characters long, it is truncated. 
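The size cutoff and line breaking can be tuned per instance. A minimal illustrative
sketch (added here as an aside, not from the original docstring; it assumes the
FrameNet data is installed, e.g. via ``nltk.download('framenet_v17')``)::

    from nltk.corpus import framenet as fn
    from nltk.corpus.reader.framenet import PrettyList

    frames = fn.frames(r'(?i)medical')   # returns a PrettyList
    print(frames)                        # abbreviated, single-line repr
    # maxReprSize=0 disables truncation; breakLines=True prints one
    # element per line (the style used by doctests in this module).
    print(PrettyList(frames, maxReprSize=0, breakLines=True))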
""" pieces = [] length = 5 for elt in self: pieces.append( elt._short_repr() ) # key difference from inherited version: call to _short_repr() length += len(pieces[-1]) + 2 if self._MAX_REPR_SIZE and length > self._MAX_REPR_SIZE and len(pieces) > 2: return "[%s, ...]" % str(",\n " if self._BREAK_LINES else ", ").join( pieces[:-1] ) return "[%s]" % str(",\n " if self._BREAK_LINES else ", ").join(pieces) class PrettyLazyMap(LazyMap): """ Displays an abbreviated repr of only the first several elements, not the whole list. """ # from nltk.util _MAX_REPR_SIZE = 60 def __repr__(self): """ Return a string representation for this corpus view that is similar to a list's representation; but if it would be more than 60 characters long, it is truncated. """ pieces = [] length = 5 for elt in self: pieces.append( elt._short_repr() ) # key difference from inherited version: call to _short_repr() length += len(pieces[-1]) + 2 if length > self._MAX_REPR_SIZE and len(pieces) > 2: return "[%s, ...]" % ", ".join(pieces[:-1]) return "[%s]" % ", ".join(pieces) class PrettyLazyIteratorList(LazyIteratorList): """ Displays an abbreviated repr of only the first several elements, not the whole list. """ # from nltk.util _MAX_REPR_SIZE = 60 def __repr__(self): """ Return a string representation for this corpus view that is similar to a list's representation; but if it would be more than 60 characters long, it is truncated. """ pieces = [] length = 5 for elt in self: pieces.append( elt._short_repr() ) # key difference from inherited version: call to _short_repr() length += len(pieces[-1]) + 2 if length > self._MAX_REPR_SIZE and len(pieces) > 2: return "[%s, ...]" % ", ".join(pieces[:-1]) return "[%s]" % ", ".join(pieces) class PrettyLazyConcatenation(LazyConcatenation): """ Displays an abbreviated repr of only the first several elements, not the whole list. """ # from nltk.util _MAX_REPR_SIZE = 60 def __repr__(self): """ Return a string representation for this corpus view that is similar to a list's representation; but if it would be more than 60 characters long, it is truncated. """ pieces = [] length = 5 for elt in self: pieces.append( elt._short_repr() ) # key difference from inherited version: call to _short_repr() length += len(pieces[-1]) + 2 if length > self._MAX_REPR_SIZE and len(pieces) > 2: return "[%s, ...]" % ", ".join(pieces[:-1]) return "[%s]" % ", ".join(pieces) def __add__(self, other): """Return a list concatenating self with other.""" return PrettyLazyIteratorList(itertools.chain(self, other)) def __radd__(self, other): """Return a list concatenating other with self.""" return PrettyLazyIteratorList(itertools.chain(other, self)) class FramenetCorpusReader(XMLCorpusReader): """A corpus reader for the Framenet Corpus. >>> from nltk.corpus import framenet as fn >>> fn.lu(3238).frame.lexUnit['glint.v'] is fn.lu(3238) True >>> fn.frame_by_name('Replacing') is fn.lus('replace.v')[0].frame True >>> fn.lus('prejudice.n')[0].frame.frameRelations == fn.frame_relations('Partiality') True """ _bad_statuses = ["Problem"] """ When loading LUs for a frame, those whose status is in this list will be ignored. Due to caching, if user code modifies this, it should do so before loading any data. 'Problem' should always be listed for FrameNet 1.5, as these LUs are not included in the XML index. """ _warnings = False def warnings(self, v): """Enable or disable warnings of data integrity issues as they are encountered. If v is truthy, warnings will be enabled. 
(This is a function rather than just an attribute/property to ensure that if enabling warnings is the first action taken, the corpus reader is instantiated first.) """ self._warnings = v def __init__(self, root, fileids): XMLCorpusReader.__init__(self, root, fileids) # framenet corpus sub dirs # sub dir containing the xml files for frames self._frame_dir = "frame" # sub dir containing the xml files for lexical units self._lu_dir = "lu" # sub dir containing the xml files for fulltext annotation files self._fulltext_dir = "fulltext" # location of latest development version of FrameNet self._fnweb_url = "https://framenet2.icsi.berkeley.edu/fnReports/data" # Indexes used for faster look-ups self._frame_idx = None self._cached_frames = {} # name -> ID self._lu_idx = None self._fulltext_idx = None self._semtypes = None self._freltyp_idx = None # frame relation types (Inheritance, Using, etc.) self._frel_idx = None # frame-to-frame relation instances self._ferel_idx = None # FE-to-FE relation instances self._frel_f_idx = None # frame-to-frame relations associated with each frame self._readme = "README.txt" def help(self, attrname=None): """Display help information summarizing the main methods.""" if attrname is not None: return help(self.__getattribute__(attrname)) # No need to mention frame_by_name() or frame_by_id(), # as it's easier to just call frame(). # Also not mentioning lu_basic(). msg = """ Citation: Nathan Schneider and Chuck Wooters (2017), "The NLTK FrameNet API: Designing for Discoverability with a Rich Linguistic Resource". Proceedings of EMNLP: System Demonstrations. https://arxiv.org/abs/1703.07438 Use the following methods to access data in FrameNet. Provide a method name to `help()` for more information. FRAMES ====== frame() to look up a frame by its exact name or ID frames() to get frames matching a name pattern frames_by_lemma() to get frames containing an LU matching a name pattern frame_ids_and_names() to get a mapping from frame IDs to names FRAME ELEMENTS ============== fes() to get frame elements (a.k.a. roles) matching a name pattern, optionally constrained by a frame name pattern LEXICAL UNITS ============= lu() to look up an LU by its ID lus() to get lexical units matching a name pattern, optionally constrained by frame lu_ids_and_names() to get a mapping from LU IDs to names RELATIONS ========= frame_relation_types() to get the different kinds of frame-to-frame relations (Inheritance, Subframe, Using, etc.). frame_relations() to get the relation instances, optionally constrained by frame(s) or relation type fe_relations() to get the frame element pairs belonging to a frame-to-frame relation SEMANTIC TYPES ============== semtypes() to get the different kinds of semantic types that can be applied to FEs, LUs, and entire frames semtype() to look up a particular semtype by name, ID, or abbreviation semtype_inherits() to check whether two semantic types have a subtype-supertype relationship in the semtype hierarchy propagate_semtypes() to apply inference rules that distribute semtypes over relations between FEs ANNOTATIONS =========== annotations() to get annotation sets, in which a token in a sentence is annotated with a lexical unit in a frame, along with its frame elements and their syntactic properties; can be constrained by LU name pattern and limited to lexicographic exemplars or full-text. Sentences of full-text annotation can have multiple annotation sets. 
sents() to get annotated sentences illustrating one or more lexical units exemplars() to get sentences of lexicographic annotation, most of which have just 1 annotation set; can be constrained by LU name pattern, frame, and overt FE(s) doc() to look up a document of full-text annotation by its ID docs() to get documents of full-text annotation that match a name pattern docs_metadata() to get metadata about all full-text documents without loading them ft_sents() to iterate over sentences of full-text annotation UTILITIES ========= buildindexes() loads metadata about all frames, LUs, etc. into memory to avoid delay when one is accessed for the first time. It does not load annotations. readme() gives the text of the FrameNet README file warnings(True) to display corpus consistency warnings when loading data """ print(msg) def _buildframeindex(self): # The total number of Frames in Framenet is fairly small (~1200) so # this index should not be very large if not self._frel_idx: self._buildrelationindex() # always load frame relations before frames, # otherwise weird ordering effects might result in incomplete information self._frame_idx = {} with XMLCorpusView( self.abspath("frameIndex.xml"), "frameIndex/frame", self._handle_elt ) as view: for f in view: self._frame_idx[f["ID"]] = f def _buildcorpusindex(self): # The total number of fulltext annotated documents in Framenet # is fairly small (~90) so this index should not be very large self._fulltext_idx = {} with XMLCorpusView( self.abspath("fulltextIndex.xml"), "fulltextIndex/corpus", self._handle_fulltextindex_elt, ) as view: for doclist in view: for doc in doclist: self._fulltext_idx[doc.ID] = doc def _buildluindex(self): # The number of LUs in Framenet is about 13,000 so this index # should not be very large self._lu_idx = {} with XMLCorpusView( self.abspath("luIndex.xml"), "luIndex/lu", self._handle_elt ) as view: for lu in view: self._lu_idx[ lu["ID"] ] = lu # populate with LU index entries. if any of these # are looked up they will be replaced by full LU objects. def _buildrelationindex(self): # print('building relation index...', file=sys.stderr) self._freltyp_idx = {} self._frel_idx = {} self._frel_f_idx = defaultdict(set) self._ferel_idx = {} with XMLCorpusView( self.abspath("frRelation.xml"), "frameRelations/frameRelationType", self._handle_framerelationtype_elt, ) as view: for freltyp in view: self._freltyp_idx[freltyp.ID] = freltyp for frel in freltyp.frameRelations: supF = frel.superFrame = frel[freltyp.superFrameName] = Future( (lambda fID: lambda: self.frame_by_id(fID))(frel.supID) ) subF = frel.subFrame = frel[freltyp.subFrameName] = Future( (lambda fID: lambda: self.frame_by_id(fID))(frel.subID) ) self._frel_idx[frel.ID] = frel self._frel_f_idx[frel.supID].add(frel.ID) self._frel_f_idx[frel.subID].add(frel.ID) for ferel in frel.feRelations: ferel.superFrame = supF ferel.subFrame = subF ferel.superFE = Future( (lambda fer: lambda: fer.superFrame.FE[fer.superFEName])( ferel ) ) ferel.subFE = Future( (lambda fer: lambda: fer.subFrame.FE[fer.subFEName])(ferel) ) self._ferel_idx[ferel.ID] = ferel # print('...done building relation index', file=sys.stderr) def _warn(self, *message, **kwargs): if self._warnings: kwargs.setdefault("file", sys.stderr) print(*message, **kwargs) def buildindexes(self): """ Build the internal indexes to make look-ups faster. 
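An illustrative sketch (added here, not part of the original docstring; it assumes
the FrameNet data is installed): after the one-time index build,
``frame_ids_and_names()`` and ``lu_ids_and_names()`` answer name/ID queries
without parsing any frame or LU files::

    from nltk.corpus import framenet as fn

    fn.buildindexes()                               # metadata only, no annotations
    fid2name = fn.frame_ids_and_names()             # {frame ID: frame name}
    luid2name = fn.lu_ids_and_names('(?i)foresee')  # LUs matching a regex
    # Full objects are still loaded lazily, e.g. fn.frame(next(iter(fid2name))).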
""" # Frames self._buildframeindex() # LUs self._buildluindex() # Fulltext annotation corpora index self._buildcorpusindex() # frame and FE relations self._buildrelationindex() def doc(self, fn_docid): """ Returns the annotated document whose id number is ``fn_docid``. This id number can be obtained by calling the Documents() function. The dict that is returned from this function will contain the following keys: - '_type' : 'fulltextannotation' - 'sentence' : a list of sentences in the document - Each item in the list is a dict containing the following keys: - 'ID' : the ID number of the sentence - '_type' : 'sentence' - 'text' : the text of the sentence - 'paragNo' : the paragraph number - 'sentNo' : the sentence number - 'docID' : the document ID number - 'corpID' : the corpus ID number - 'aPos' : the annotation position - 'annotationSet' : a list of annotation layers for the sentence - Each item in the list is a dict containing the following keys: - 'ID' : the ID number of the annotation set - '_type' : 'annotationset' - 'status' : either 'MANUAL' or 'UNANN' - 'luName' : (only if status is 'MANUAL') - 'luID' : (only if status is 'MANUAL') - 'frameID' : (only if status is 'MANUAL') - 'frameName': (only if status is 'MANUAL') - 'layer' : a list of labels for the layer - Each item in the layer is a dict containing the following keys: - '_type': 'layer' - 'rank' - 'name' - 'label' : a list of labels in the layer - Each item is a dict containing the following keys: - 'start' - 'end' - 'name' - 'feID' (optional) :param fn_docid: The Framenet id number of the document :type fn_docid: int :return: Information about the annotated document :rtype: dict """ try: xmlfname = self._fulltext_idx[fn_docid].filename except TypeError: # happens when self._fulltext_idx == None # build the index self._buildcorpusindex() xmlfname = self._fulltext_idx[fn_docid].filename except KeyError as e: # probably means that fn_docid was not in the index raise FramenetError(f"Unknown document id: {fn_docid}") from e # construct the path name for the xml file containing the document info locpath = os.path.join(f"{self._root}", self._fulltext_dir, xmlfname) # Grab the top-level xml element containing the fulltext annotation with XMLCorpusView(locpath, "fullTextAnnotation") as view: elt = view[0] info = self._handle_fulltextannotation_elt(elt) # add metadata for k, v in self._fulltext_idx[fn_docid].items(): info[k] = v return info def frame_by_id(self, fn_fid, ignorekeys=[]): """ Get the details for the specified Frame using the frame's id number. Usage examples: >>> from nltk.corpus import framenet as fn >>> f = fn.frame_by_id(256) >>> f.ID 256 >>> f.name 'Medical_specialties' >>> f.definition "This frame includes words that name ..." :param fn_fid: The Framenet id number of the frame :type fn_fid: int :param ignorekeys: The keys to ignore. These keys will not be included in the output. (optional) :type ignorekeys: list(str) :return: Information about a frame :rtype: dict Also see the ``frame()`` function for details about what is contained in the dict that is returned. 
""" # get the name of the frame with this id number try: fentry = self._frame_idx[fn_fid] if "_type" in fentry: return fentry # full frame object is cached name = fentry["name"] except TypeError: self._buildframeindex() name = self._frame_idx[fn_fid]["name"] except KeyError as e: raise FramenetError(f"Unknown frame id: {fn_fid}") from e return self.frame_by_name(name, ignorekeys, check_cache=False) def frame_by_name(self, fn_fname, ignorekeys=[], check_cache=True): """ Get the details for the specified Frame using the frame's name. Usage examples: >>> from nltk.corpus import framenet as fn >>> f = fn.frame_by_name('Medical_specialties') >>> f.ID 256 >>> f.name 'Medical_specialties' >>> f.definition "This frame includes words that name ..." :param fn_fname: The name of the frame :type fn_fname: str :param ignorekeys: The keys to ignore. These keys will not be included in the output. (optional) :type ignorekeys: list(str) :return: Information about a frame :rtype: dict Also see the ``frame()`` function for details about what is contained in the dict that is returned. """ if check_cache and fn_fname in self._cached_frames: return self._frame_idx[self._cached_frames[fn_fname]] elif not self._frame_idx: self._buildframeindex() # construct the path name for the xml file containing the Frame info locpath = os.path.join(f"{self._root}", self._frame_dir, fn_fname + ".xml") # print(locpath, file=sys.stderr) # Grab the xml for the frame try: with XMLCorpusView(locpath, "frame") as view: elt = view[0] except OSError as e: raise FramenetError(f"Unknown frame: {fn_fname}") from e fentry = self._handle_frame_elt(elt, ignorekeys) assert fentry fentry.URL = self._fnweb_url + "/" + self._frame_dir + "/" + fn_fname + ".xml" # INFERENCE RULE: propagate lexical semtypes from the frame to all its LUs for st in fentry.semTypes: if st.rootType.name == "Lexical_type": for lu in fentry.lexUnit.values(): if not any( x is st for x in lu.semTypes ): # identity containment check lu.semTypes.append(st) self._frame_idx[fentry.ID] = fentry self._cached_frames[fentry.name] = fentry.ID """ # now set up callables to resolve the LU pointers lazily. # (could also do this here--caching avoids infinite recursion.) for luName,luinfo in fentry.lexUnit.items(): fentry.lexUnit[luName] = (lambda luID: Future(lambda: self.lu(luID)))(luinfo.ID) """ return fentry def frame(self, fn_fid_or_fname, ignorekeys=[]): """ Get the details for the specified Frame using the frame's name or id number. Usage examples: >>> from nltk.corpus import framenet as fn >>> f = fn.frame(256) >>> f.name 'Medical_specialties' >>> f = fn.frame('Medical_specialties') >>> f.ID 256 >>> # ensure non-ASCII character in definition doesn't trigger an encoding error: >>> fn.frame('Imposing_obligation') frame (1494): Imposing_obligation... The dict that is returned from this function will contain the following information about the Frame: - 'name' : the name of the Frame (e.g. 'Birth', 'Apply_heat', etc.) - 'definition' : textual definition of the Frame - 'ID' : the internal ID number of the Frame - 'semTypes' : a list of semantic types for this frame - Each item in the list is a dict containing the following keys: - 'name' : can be used with the semtype() function - 'ID' : can be used with the semtype() function - 'lexUnit' : a dict containing all of the LUs for this frame. The keys in this dict are the names of the LUs and the value for each key is itself a dict containing info about the LU (see the lu() function for more info.) 
- 'FE' : a dict containing the Frame Elements that are part of this frame The keys in this dict are the names of the FEs (e.g. 'Body_system') and the values are dicts containing the following keys - 'definition' : The definition of the FE - 'name' : The name of the FE e.g. 'Body_system' - 'ID' : The id number - '_type' : 'fe' - 'abbrev' : Abbreviation e.g. 'bod' - 'coreType' : one of "Core", "Peripheral", or "Extra-Thematic" - 'semType' : if not None, a dict with the following two keys: - 'name' : name of the semantic type. can be used with the semtype() function - 'ID' : id number of the semantic type. can be used with the semtype() function - 'requiresFE' : if not None, a dict with the following two keys: - 'name' : the name of another FE in this frame - 'ID' : the id of the other FE in this frame - 'excludesFE' : if not None, a dict with the following two keys: - 'name' : the name of another FE in this frame - 'ID' : the id of the other FE in this frame - 'frameRelation' : a list of objects describing frame relations - 'FEcoreSets' : a list of Frame Element core sets for this frame - Each item in the list is a list of FE objects :param fn_fid_or_fname: The Framenet name or id number of the frame :type fn_fid_or_fname: int or str :param ignorekeys: The keys to ignore. These keys will not be included in the output. (optional) :type ignorekeys: list(str) :return: Information about a frame :rtype: dict """ # get the frame info by name or id number if isinstance(fn_fid_or_fname, str): f = self.frame_by_name(fn_fid_or_fname, ignorekeys) else: f = self.frame_by_id(fn_fid_or_fname, ignorekeys) return f def frames_by_lemma(self, pat): """ Returns a list of all frames that contain LUs in which the ``name`` attribute of the LU matches the given regular expression ``pat``. Note that LU names are composed of "lemma.POS", where the "lemma" part can be made up of either a single lexeme (e.g. 'run') or multiple lexemes (e.g. 'a little'). Note: if you are going to be doing a lot of this type of searching, you'd want to build an index that maps from lemmas to frames because each time frames_by_lemma() is called, it has to search through ALL of the frame XML files in the db. >>> from nltk.corpus import framenet as fn >>> from nltk.corpus.reader.framenet import PrettyList >>> PrettyList(sorted(fn.frames_by_lemma(r'(?i)a little'), key=itemgetter('ID'))) # doctest: +ELLIPSIS [, ] :return: A list of frame objects. :rtype: list(AttrDict) """ return PrettyList( f for f in self.frames() if any(re.search(pat, luName) for luName in f.lexUnit) ) def lu_basic(self, fn_luid): """ Returns basic information about the LU whose id is ``fn_luid``. This is basically just a wrapper around the ``lu()`` function with "subCorpus" info excluded. 
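Because the corpus examples are excluded, no per-LU annotation file needs to be
read. A brief illustrative sketch (added, not from the original docstring; assumes
the FrameNet data is installed)::

    from nltk.corpus import framenet as fn

    lu = fn.lu_basic(256)
    print(lu.name, lu.POS, lu.frame.name)  # foresee.v V Expectation
    assert 'exemplars' not in lu           # corpus examples were excluded

The doctest below shows the complete set of keys that are returned.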
>>> from nltk.corpus import framenet as fn >>> lu = PrettyDict(fn.lu_basic(256), breakLines=True) >>> # ellipses account for differences between FN 1.5 and 1.7 >>> lu # doctest: +ELLIPSIS {'ID': 256, 'POS': 'V', 'URL': 'https://framenet2.icsi.berkeley.edu/fnReports/data/lu/lu256.xml', '_type': 'lu', 'cBy': ..., 'cDate': '02/08/2001 01:27:50 PST Thu', 'definition': 'COD: be aware of beforehand; predict.', 'definitionMarkup': 'COD: be aware of beforehand; predict.', 'frame': , 'lemmaID': 15082, 'lexemes': [{'POS': 'V', 'breakBefore': 'false', 'headword': 'false', 'name': 'foresee', 'order': 1}], 'name': 'foresee.v', 'semTypes': [], 'sentenceCount': {'annotated': ..., 'total': ...}, 'status': 'FN1_Sent'} :param fn_luid: The id number of the desired LU :type fn_luid: int :return: Basic information about the lexical unit :rtype: dict """ return self.lu(fn_luid, ignorekeys=["subCorpus", "exemplars"]) def lu(self, fn_luid, ignorekeys=[], luName=None, frameID=None, frameName=None): """ Access a lexical unit by its ID. luName, frameID, and frameName are used only in the event that the LU does not have a file in the database (which is the case for LUs with "Problem" status); in this case, a placeholder LU is created which just contains its name, ID, and frame. Usage examples: >>> from nltk.corpus import framenet as fn >>> fn.lu(256).name 'foresee.v' >>> fn.lu(256).definition 'COD: be aware of beforehand; predict.' >>> fn.lu(256).frame.name 'Expectation' >>> pprint(list(map(PrettyDict, fn.lu(256).lexemes))) [{'POS': 'V', 'breakBefore': 'false', 'headword': 'false', 'name': 'foresee', 'order': 1}] >>> fn.lu(227).exemplars[23] exemplar sentence (352962): [sentNo] 0 [aPos] 59699508 [LU] (227) guess.v in Coming_to_believe [frame] (23) Coming_to_believe [annotationSet] 2 annotation sets [POS] 18 tags [POS_tagset] BNC [GF] 3 relations [PT] 3 phrases [Other] 1 entry [text] + [Target] + [FE] When he was inside the house , Culley noticed the characteristic ------------------ Content he would n't have guessed at . -- ******* -- Co C1 [Evidence:INI] (Co=Cognizer, C1=Content) The dict that is returned from this function will contain most of the following information about the LU. Note that some LUs do not contain all of these pieces of information - particularly 'totalAnnotated' and 'incorporatedFE' may be missing in some LUs: - 'name' : the name of the LU (e.g. 'merger.n') - 'definition' : textual definition of the LU - 'ID' : the internal ID number of the LU - '_type' : 'lu' - 'status' : e.g. 'Created' - 'frame' : Frame that this LU belongs to - 'POS' : the part of speech of this LU (e.g. 'N') - 'totalAnnotated' : total number of examples annotated with this LU - 'incorporatedFE' : FE that incorporates this LU (e.g. 'Ailment') - 'sentenceCount' : a dict with the following two keys: - 'annotated': number of sentences annotated with this LU - 'total' : total number of sentences with this LU - 'lexemes' : a list of dicts describing the lemma of this LU. Each dict in the list contains these keys: - 'POS' : part of speech e.g. 'N' - 'name' : either single-lexeme e.g. 'merger' or multi-lexeme e.g. 'a little' - 'order': the order of the lexeme in the lemma (starting from 1) - 'headword': a boolean ('true' or 'false') - 'breakBefore': Can this lexeme be separated from the previous lexeme? Consider: "take over.v" as in:: Germany took over the Netherlands in 2 days. Germany took the Netherlands over in 2 days. In this case, 'breakBefore' would be "true" for the lexeme "over". 
Contrast this with "take after.v" as in:: Mary takes after her grandmother. *Mary takes her grandmother after. In this case, 'breakBefore' would be "false" for the lexeme "after" - 'lemmaID' : Can be used to connect lemmas in different LUs - 'semTypes' : a list of semantic type objects for this LU - 'subCorpus' : a list of subcorpora - Each item in the list is a dict containing the following keys: - 'name' : - 'sentence' : a list of sentences in the subcorpus - each item in the list is a dict with the following keys: - 'ID': - 'sentNo': - 'text': the text of the sentence - 'aPos': - 'annotationSet': a list of annotation sets - each item in the list is a dict with the following keys: - 'ID': - 'status': - 'layer': a list of layers - each layer is a dict containing the following keys: - 'name': layer name (e.g. 'BNC') - 'rank': - 'label': a list of labels for the layer - each label is a dict containing the following keys: - 'start': start pos of label in sentence 'text' (0-based) - 'end': end pos of label in sentence 'text' (0-based) - 'name': name of label (e.g. 'NN1') Under the hood, this implementation looks up the lexical unit information in the *frame* definition file. That file does not contain corpus annotations, so the LU files will be accessed on demand if those are needed. In principle, valence patterns could be loaded here too, though these are not currently supported. :param fn_luid: The id number of the lexical unit :type fn_luid: int :param ignorekeys: The keys to ignore. These keys will not be included in the output. (optional) :type ignorekeys: list(str) :return: All information about the lexical unit :rtype: dict """ # look for this LU in cache if not self._lu_idx: self._buildluindex() OOV = object() luinfo = self._lu_idx.get(fn_luid, OOV) if luinfo is OOV: # LU not in the index. We create a placeholder by falling back to # luName, frameID, and frameName. However, this will not be listed # among the LUs for its frame. self._warn( "LU ID not found: {} ({}) in {} ({})".format( luName, fn_luid, frameName, frameID ) ) luinfo = AttrDict( { "_type": "lu", "ID": fn_luid, "name": luName, "frameID": frameID, "status": "Problem", } ) f = self.frame_by_id(luinfo.frameID) assert f.name == frameName, (f.name, frameName) luinfo["frame"] = f self._lu_idx[fn_luid] = luinfo elif "_type" not in luinfo: # we only have an index entry for the LU. loading the frame will replace this. f = self.frame_by_id(luinfo.frameID) luinfo = self._lu_idx[fn_luid] if ignorekeys: return AttrDict({k: v for k, v in luinfo.items() if k not in ignorekeys}) return luinfo def _lu_file(self, lu, ignorekeys=[]): """ Augment the LU information that was loaded from the frame file with additional information from the LU file. 
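In user code this augmentation happens implicitly: the ``subCorpus`` and
``exemplars`` entries created while parsing the frame file are ``Future`` objects
whose loaders call this method. An illustrative sketch (added, not from the
original docstring; assumes the FrameNet data is installed)::

    from nltk.corpus import framenet as fn

    lu = fn.lu(256)               # parsed from the frame file; no LU file read yet
    exs = lu.exemplars            # first access loads lu256.xml on demand
    print(len(exs), exs[0].text)  # annotated example sentences for 'foresee.v'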
""" fn_luid = lu.ID fname = f"lu{fn_luid}.xml" locpath = os.path.join(f"{self._root}", self._lu_dir, fname) # print(locpath, file=sys.stderr) if not self._lu_idx: self._buildluindex() try: with XMLCorpusView(locpath, "lexUnit") as view: elt = view[0] except OSError as e: raise FramenetError(f"Unknown LU id: {fn_luid}") from e lu2 = self._handle_lexunit_elt(elt, ignorekeys) lu.URL = self._fnweb_url + "/" + self._lu_dir + "/" + fname lu.subCorpus = lu2.subCorpus lu.exemplars = SpecialList( "luexemplars", [sent for subc in lu.subCorpus for sent in subc.sentence] ) for sent in lu.exemplars: sent["LU"] = lu sent["frame"] = lu.frame for aset in sent.annotationSet: aset["LU"] = lu aset["frame"] = lu.frame return lu def _loadsemtypes(self): """Create the semantic types index.""" self._semtypes = AttrDict() with XMLCorpusView( self.abspath("semTypes.xml"), "semTypes/semType", self._handle_semtype_elt, ) as view: for st in view: n = st["name"] a = st["abbrev"] i = st["ID"] # Both name and abbrev should be able to retrieve the # ID. The ID will retrieve the semantic type dict itself. self._semtypes[n] = i self._semtypes[a] = i self._semtypes[i] = st # now that all individual semtype XML is loaded, we can link them together roots = [] for st in self.semtypes(): if st.superType: st.superType = self.semtype(st.superType.supID) st.superType.subTypes.append(st) else: if st not in roots: roots.append(st) st.rootType = st queue = list(roots) assert queue while queue: st = queue.pop(0) for child in st.subTypes: child.rootType = st.rootType queue.append(child) # self.propagate_semtypes() # apply inferencing over FE relations def propagate_semtypes(self): """ Apply inference rules to distribute semtypes over relations between FEs. For FrameNet 1.5, this results in 1011 semtypes being propagated. (Not done by default because it requires loading all frame files, which takes several seconds. If this needed to be fast, it could be rewritten to traverse the neighboring relations on demand for each FE semtype.) >>> from nltk.corpus import framenet as fn >>> x = sum(1 for f in fn.frames() for fe in f.FE.values() if fe.semType) >>> fn.propagate_semtypes() >>> y = sum(1 for f in fn.frames() for fe in f.FE.values() if fe.semType) >>> y-x > 1000 True """ if not self._semtypes: self._loadsemtypes() if not self._ferel_idx: self._buildrelationindex() changed = True i = 0 nPropagations = 0 while changed: # make a pass and see if anything needs to be propagated i += 1 changed = False for ferel in self.fe_relations(): superST = ferel.superFE.semType subST = ferel.subFE.semType try: if superST and superST is not subST: # propagate downward assert subST is None or self.semtype_inherits(subST, superST), ( superST.name, ferel, subST.name, ) if subST is None: ferel.subFE.semType = subST = superST changed = True nPropagations += 1 if ( ferel.type.name in ["Perspective_on", "Subframe", "Precedes"] and subST and subST is not superST ): # propagate upward assert superST is None, (superST.name, ferel, subST.name) ferel.superFE.semType = superST = subST changed = True nPropagations += 1 except AssertionError as ex: # bug in the data! 
ignore # print(ex, file=sys.stderr) continue # print(i, nPropagations, file=sys.stderr) def semtype(self, key): """ >>> from nltk.corpus import framenet as fn >>> fn.semtype(233).name 'Temperature' >>> fn.semtype(233).abbrev 'Temp' >>> fn.semtype('Temperature').ID 233 :param key: The name, abbreviation, or id number of the semantic type :type key: string or int :return: Information about a semantic type :rtype: dict """ if isinstance(key, int): stid = key else: try: stid = self._semtypes[key] except TypeError: self._loadsemtypes() stid = self._semtypes[key] try: st = self._semtypes[stid] except TypeError: self._loadsemtypes() st = self._semtypes[stid] return st def semtype_inherits(self, st, superST): if not isinstance(st, dict): st = self.semtype(st) if not isinstance(superST, dict): superST = self.semtype(superST) par = st.superType while par: if par is superST: return True par = par.superType return False def frames(self, name=None): """ Obtain details for a specific frame. >>> from nltk.corpus import framenet as fn >>> len(fn.frames()) in (1019, 1221) # FN 1.5 and 1.7, resp. True >>> x = PrettyList(fn.frames(r'(?i)crim'), maxReprSize=0, breakLines=True) >>> x.sort(key=itemgetter('ID')) >>> x [, , , ] A brief intro to Frames (excerpted from "FrameNet II: Extended Theory and Practice" by Ruppenhofer et. al., 2010): A Frame is a script-like conceptual structure that describes a particular type of situation, object, or event along with the participants and props that are needed for that Frame. For example, the "Apply_heat" frame describes a common situation involving a Cook, some Food, and a Heating_Instrument, and is evoked by words such as bake, blanch, boil, broil, brown, simmer, steam, etc. We call the roles of a Frame "frame elements" (FEs) and the frame-evoking words are called "lexical units" (LUs). FrameNet includes relations between Frames. Several types of relations are defined, of which the most important are: - Inheritance: An IS-A relation. The child frame is a subtype of the parent frame, and each FE in the parent is bound to a corresponding FE in the child. An example is the "Revenge" frame which inherits from the "Rewards_and_punishments" frame. - Using: The child frame presupposes the parent frame as background, e.g the "Speed" frame "uses" (or presupposes) the "Motion" frame; however, not all parent FEs need to be bound to child FEs. - Subframe: The child frame is a subevent of a complex event represented by the parent, e.g. the "Criminal_process" frame has subframes of "Arrest", "Arraignment", "Trial", and "Sentencing". - Perspective_on: The child frame provides a particular perspective on an un-perspectivized parent frame. A pair of examples consists of the "Hiring" and "Get_a_job" frames, which perspectivize the "Employment_start" frame from the Employer's and the Employee's point of view, respectively. :param name: A regular expression pattern used to match against Frame names. If 'name' is None, then a list of all Framenet Frames will be returned. :type name: str :return: A list of matching Frames (or all Frames). 
:rtype: list(AttrDict) """ try: fIDs = list(self._frame_idx.keys()) except AttributeError: self._buildframeindex() fIDs = list(self._frame_idx.keys()) if name is not None: return PrettyList( self.frame(fID) for fID, finfo in self.frame_ids_and_names(name).items() ) else: return PrettyLazyMap(self.frame, fIDs) def frame_ids_and_names(self, name=None): """ Uses the frame index, which is much faster than looking up each frame definition if only the names and IDs are needed. """ if not self._frame_idx: self._buildframeindex() return { fID: finfo.name for fID, finfo in self._frame_idx.items() if name is None or re.search(name, finfo.name) is not None } def fes(self, name=None, frame=None): """ Lists frame element objects. If 'name' is provided, this is treated as a case-insensitive regular expression to filter by frame name. (Case-insensitivity is because casing of frame element names is not always consistent across frames.) Specify 'frame' to filter by a frame name pattern, ID, or object. >>> from nltk.corpus import framenet as fn >>> fn.fes('Noise_maker') [] >>> sorted([(fe.frame.name,fe.name) for fe in fn.fes('sound')]) [('Cause_to_make_noise', 'Sound_maker'), ('Make_noise', 'Sound'), ('Make_noise', 'Sound_source'), ('Sound_movement', 'Location_of_sound_source'), ('Sound_movement', 'Sound'), ('Sound_movement', 'Sound_source'), ('Sounds', 'Component_sound'), ('Sounds', 'Location_of_sound_source'), ('Sounds', 'Sound_source'), ('Vocalizations', 'Location_of_sound_source'), ('Vocalizations', 'Sound_source')] >>> sorted([(fe.frame.name,fe.name) for fe in fn.fes('sound',r'(?i)make_noise')]) [('Cause_to_make_noise', 'Sound_maker'), ('Make_noise', 'Sound'), ('Make_noise', 'Sound_source')] >>> sorted(set(fe.name for fe in fn.fes('^sound'))) ['Sound', 'Sound_maker', 'Sound_source'] >>> len(fn.fes('^sound$')) 2 :param name: A regular expression pattern used to match against frame element names. If 'name' is None, then a list of all frame elements will be returned. :type name: str :return: A list of matching frame elements :rtype: list(AttrDict) """ # what frames are we searching in? if frame is not None: if isinstance(frame, int): frames = [self.frame(frame)] elif isinstance(frame, str): frames = self.frames(frame) else: frames = [frame] else: frames = self.frames() return PrettyList( fe for f in frames for fename, fe in f.FE.items() if name is None or re.search(name, fename, re.I) ) def lus(self, name=None, frame=None): """ Obtain details for lexical units. Optionally restrict by lexical unit name pattern, and/or to a certain frame or frames whose name matches a pattern. >>> from nltk.corpus import framenet as fn >>> len(fn.lus()) in (11829, 13572) # FN 1.5 and 1.7, resp. True >>> PrettyList(sorted(fn.lus(r'(?i)a little'), key=itemgetter('ID')), maxReprSize=0, breakLines=True) [, , ] >>> PrettyList(sorted(fn.lus(r'interest', r'(?i)stimulus'), key=itemgetter('ID'))) [, ] A brief intro to Lexical Units (excerpted from "FrameNet II: Extended Theory and Practice" by Ruppenhofer et. al., 2010): A lexical unit (LU) is a pairing of a word with a meaning. For example, the "Apply_heat" Frame describes a common situation involving a Cook, some Food, and a Heating Instrument, and is _evoked_ by words such as bake, blanch, boil, broil, brown, simmer, steam, etc. These frame-evoking words are the LUs in the Apply_heat frame. Each sense of a polysemous word is a different LU. We have used the word "word" in talking about LUs. The reality is actually rather complex. 
When we say that the word "bake" is polysemous, we mean that the lemma "bake.v" (which has the word-forms "bake", "bakes", "baked", and "baking") is linked to three different frames: - Apply_heat: "Michelle baked the potatoes for 45 minutes." - Cooking_creation: "Michelle baked her mother a cake for her birthday." - Absorb_heat: "The potatoes have to bake for more than 30 minutes." These constitute three different LUs, with different definitions. Multiword expressions such as "given name" and hyphenated words like "shut-eye" can also be LUs. Idiomatic phrases such as "middle of nowhere" and "give the slip (to)" are also defined as LUs in the appropriate frames ("Isolated_places" and "Evading", respectively), and their internal structure is not analyzed. Framenet provides multiple annotated examples of each sense of a word (i.e. each LU). Moreover, the set of examples (approximately 20 per LU) illustrates all of the combinatorial possibilities of the lexical unit. Each LU is linked to a Frame, and hence to the other words which evoke that Frame. This makes the FrameNet database similar to a thesaurus, grouping together semantically similar words. In the simplest case, frame-evoking words are verbs such as "fried" in: "Matilde fried the catfish in a heavy iron skillet." Sometimes event nouns may evoke a Frame. For example, "reduction" evokes "Cause_change_of_scalar_position" in: "...the reduction of debt levels to $665 million from $2.6 billion." Adjectives may also evoke a Frame. For example, "asleep" may evoke the "Sleep" frame as in: "They were asleep for hours." Many common nouns, such as artifacts like "hat" or "tower", typically serve as dependents rather than clearly evoking their own frames. :param name: A regular expression pattern used to search the LU names. Note that LU names take the form of a dotted string (e.g. "run.v" or "a little.adv") in which a lemma precedes the "." and a POS follows the dot. The lemma may be composed of a single lexeme (e.g. "run") or of multiple lexemes (e.g. "a little"). If 'name' is not given, then all LUs will be returned. The valid POSes are: v - verb n - noun a - adjective adv - adverb prep - preposition num - numbers intj - interjection art - article c - conjunction scon - subordinating conjunction :type name: str :type frame: str or int or frame :return: A list of selected (or all) lexical units :rtype: list of LU objects (dicts). See the lu() function for info about the specifics of LU objects. """ if not self._lu_idx: self._buildluindex() if name is not None: # match LUs, then restrict by frame result = PrettyList( self.lu(luID) for luID, luName in self.lu_ids_and_names(name).items() ) if frame is not None: if isinstance(frame, int): frameIDs = {frame} elif isinstance(frame, str): frameIDs = {f.ID for f in self.frames(frame)} else: frameIDs = {frame.ID} result = PrettyList(lu for lu in result if lu.frame.ID in frameIDs) elif frame is not None: # all LUs in matching frames if isinstance(frame, int): frames = [self.frame(frame)] elif isinstance(frame, str): frames = self.frames(frame) else: frames = [frame] result = PrettyLazyIteratorList( iter(LazyConcatenation(list(f.lexUnit.values()) for f in frames)) ) else: # all LUs luIDs = [ luID for luID, lu in self._lu_idx.items() if lu.status not in self._bad_statuses ] result = PrettyLazyMap(self.lu, luIDs) return result def lu_ids_and_names(self, name=None): """ Uses the LU index, which is much faster than looking up each LU definition if only the names and IDs are needed. 
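An illustrative contrast with ``lus()`` (added, not from the original docstring;
assumes the FrameNet data is installed)::

    from nltk.corpus import framenet as fn

    names = fn.lu_ids_and_names(r'(?i)a little')  # {LU ID: LU name}, index only
    full = fn.lus(r'(?i)a little')                # full LU objects, same matches
    assert set(names.values()) == {lu.name for lu in full}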
""" if not self._lu_idx: self._buildluindex() return { luID: luinfo.name for luID, luinfo in self._lu_idx.items() if luinfo.status not in self._bad_statuses and (name is None or re.search(name, luinfo.name) is not None) } def docs_metadata(self, name=None): """ Return an index of the annotated documents in Framenet. Details for a specific annotated document can be obtained using this class's doc() function and pass it the value of the 'ID' field. >>> from nltk.corpus import framenet as fn >>> len(fn.docs()) in (78, 107) # FN 1.5 and 1.7, resp. True >>> set([x.corpname for x in fn.docs_metadata()])>=set(['ANC', 'KBEval', \ 'LUCorpus-v0.3', 'Miscellaneous', 'NTI', 'PropBank']) True :param name: A regular expression pattern used to search the file name of each annotated document. The document's file name contains the name of the corpus that the document is from, followed by two underscores "__" followed by the document name. So, for example, the file name "LUCorpus-v0.3__20000410_nyt-NEW.xml" is from the corpus named "LUCorpus-v0.3" and the document name is "20000410_nyt-NEW.xml". :type name: str :return: A list of selected (or all) annotated documents :rtype: list of dicts, where each dict object contains the following keys: - 'name' - 'ID' - 'corpid' - 'corpname' - 'description' - 'filename' """ try: ftlist = PrettyList(self._fulltext_idx.values()) except AttributeError: self._buildcorpusindex() ftlist = PrettyList(self._fulltext_idx.values()) if name is None: return ftlist else: return PrettyList( x for x in ftlist if re.search(name, x["filename"]) is not None ) def docs(self, name=None): """ Return a list of the annotated full-text documents in FrameNet, optionally filtered by a regex to be matched against the document name. """ return PrettyLazyMap((lambda x: self.doc(x.ID)), self.docs_metadata(name)) def sents(self, exemplars=True, full_text=True): """ Annotated sentences matching the specified criteria. """ if exemplars: if full_text: return self.exemplars() + self.ft_sents() else: return self.exemplars() elif full_text: return self.ft_sents() def annotations(self, luNamePattern=None, exemplars=True, full_text=True): """ Frame annotation sets matching the specified criteria. """ if exemplars: epart = PrettyLazyIteratorList( sent.frameAnnotation for sent in self.exemplars(luNamePattern) ) else: epart = [] if full_text: if luNamePattern is not None: matchedLUIDs = set(self.lu_ids_and_names(luNamePattern).keys()) ftpart = PrettyLazyIteratorList( aset for sent in self.ft_sents() for aset in sent.annotationSet[1:] if luNamePattern is None or aset.get("luID", "CXN_ASET") in matchedLUIDs ) else: ftpart = [] if exemplars: if full_text: return epart + ftpart else: return epart elif full_text: return ftpart def exemplars(self, luNamePattern=None, frame=None, fe=None, fe2=None): """ Lexicographic exemplar sentences, optionally filtered by LU name and/or 1-2 FEs that are realized overtly. 'frame' may be a name pattern, frame ID, or frame instance. 'fe' may be a name pattern or FE instance; if specified, 'fe2' may also be specified to retrieve sentences with both overt FEs (in either order). """ if fe is None and fe2 is not None: raise FramenetError("exemplars(..., fe=None, fe2=) is not allowed") elif fe is not None and fe2 is not None: if not isinstance(fe2, str): if isinstance(fe, str): # fe2 is specific to a particular frame. swap fe and fe2 so fe is always used to determine the frame. 
fe, fe2 = fe2, fe elif fe.frame is not fe2.frame: # ensure frames match raise FramenetError( "exemplars() call with inconsistent `fe` and `fe2` specification (frames must match)" ) if frame is None and fe is not None and not isinstance(fe, str): frame = fe.frame # narrow down to frames matching criteria lusByFrame = defaultdict( list ) # frame name -> matching LUs, if luNamePattern is specified if frame is not None or luNamePattern is not None: if frame is None or isinstance(frame, str): if luNamePattern is not None: frames = set() for lu in self.lus(luNamePattern, frame=frame): frames.add(lu.frame.ID) lusByFrame[lu.frame.name].append(lu) frames = LazyMap(self.frame, list(frames)) else: frames = self.frames(frame) else: if isinstance(frame, int): frames = [self.frame(frame)] else: # frame object frames = [frame] if luNamePattern is not None: lusByFrame = {frame.name: self.lus(luNamePattern, frame=frame)} if fe is not None: # narrow to frames that define this FE if isinstance(fe, str): frames = PrettyLazyIteratorList( f for f in frames if fe in f.FE or any(re.search(fe, ffe, re.I) for ffe in f.FE.keys()) ) else: if fe.frame not in frames: raise FramenetError( "exemplars() call with inconsistent `frame` and `fe` specification" ) frames = [fe.frame] if fe2 is not None: # narrow to frames that ALSO define this FE if isinstance(fe2, str): frames = PrettyLazyIteratorList( f for f in frames if fe2 in f.FE or any(re.search(fe2, ffe, re.I) for ffe in f.FE.keys()) ) # else we already narrowed it to a single frame else: # frame, luNamePattern are None. fe, fe2 are None or strings if fe is not None: frames = {ffe.frame.ID for ffe in self.fes(fe)} if fe2 is not None: frames2 = {ffe.frame.ID for ffe in self.fes(fe2)} frames = frames & frames2 frames = LazyMap(self.frame, list(frames)) else: frames = self.frames() # we've narrowed down 'frames' # now get exemplars for relevant LUs in those frames def _matching_exs(): for f in frames: fes = fes2 = None # FEs of interest if fe is not None: fes = ( {ffe for ffe in f.FE.keys() if re.search(fe, ffe, re.I)} if isinstance(fe, str) else {fe.name} ) if fe2 is not None: fes2 = ( {ffe for ffe in f.FE.keys() if re.search(fe2, ffe, re.I)} if isinstance(fe2, str) else {fe2.name} ) for lu in ( lusByFrame[f.name] if luNamePattern is not None else f.lexUnit.values() ): for ex in lu.exemplars: if (fes is None or self._exemplar_of_fes(ex, fes)) and ( fes2 is None or self._exemplar_of_fes(ex, fes2) ): yield ex return PrettyLazyIteratorList(_matching_exs()) def _exemplar_of_fes(self, ex, fes=None): """ Given an exemplar sentence and a set of FE names, return the subset of FE names that are realized overtly in the sentence on the FE, FE2, or FE3 layer. If 'fes' is None, returns all overt FE names. """ overtNames = set(list(zip(*ex.FE[0]))[2]) if ex.FE[0] else set() if "FE2" in ex: overtNames |= set(list(zip(*ex.FE2[0]))[2]) if ex.FE2[0] else set() if "FE3" in ex: overtNames |= set(list(zip(*ex.FE3[0]))[2]) if ex.FE3[0] else set() return overtNames & fes if fes is not None else overtNames def ft_sents(self, docNamePattern=None): """ Full-text annotation sentences, optionally filtered by document name. """ return PrettyLazyIteratorList( sent for d in self.docs(docNamePattern) for sent in d.sentence ) def frame_relation_types(self): """ Obtain a list of frame relation types. >>> from nltk.corpus import framenet as fn >>> frts = sorted(fn.frame_relation_types(), key=itemgetter('ID')) >>> isinstance(frts, list) True >>> len(frts) in (9, 10) # FN 1.5 and 1.7, resp. 
True >>> PrettyDict(frts[0], breakLines=True) {'ID': 1, '_type': 'framerelationtype', 'frameRelations': [ Child=Change_of_consistency>, Child=Rotting>, ...], 'name': 'Inheritance', 'subFrameName': 'Child', 'superFrameName': 'Parent'} :return: A list of all of the frame relation types in framenet :rtype: list(dict) """ if not self._freltyp_idx: self._buildrelationindex() return self._freltyp_idx.values() def frame_relations(self, frame=None, frame2=None, type=None): """ :param frame: (optional) frame object, name, or ID; only relations involving this frame will be returned :param frame2: (optional; 'frame' must be a different frame) only show relations between the two specified frames, in either direction :param type: (optional) frame relation type (name or object); show only relations of this type :type frame: int or str or AttrDict :return: A list of all of the frame relations in framenet :rtype: list(dict) >>> from nltk.corpus import framenet as fn >>> frels = fn.frame_relations() >>> isinstance(frels, list) True >>> len(frels) in (1676, 2070) # FN 1.5 and 1.7, resp. True >>> PrettyList(fn.frame_relations('Cooking_creation'), maxReprSize=0, breakLines=True) [ Child=Cooking_creation>, Child=Cooking_creation>, ReferringEntry=Cooking_creation>] >>> PrettyList(fn.frame_relations(274), breakLines=True) [ Child=Dodging>, Child=Evading>, ...] >>> PrettyList(fn.frame_relations(fn.frame('Cooking_creation')), breakLines=True) [ Child=Cooking_creation>, Child=Cooking_creation>, ...] >>> PrettyList(fn.frame_relations('Cooking_creation', type='Inheritance')) [ Child=Cooking_creation>] >>> PrettyList(fn.frame_relations('Cooking_creation', 'Apply_heat'), breakLines=True) [ Child=Cooking_creation>, ReferringEntry=Cooking_creation>] """ relation_type = type if not self._frel_idx: self._buildrelationindex() rels = None if relation_type is not None: if not isinstance(relation_type, dict): type = [rt for rt in self.frame_relation_types() if rt.name == type][0] assert isinstance(type, dict) # lookup by 'frame' if frame is not None: if isinstance(frame, dict) and "frameRelations" in frame: rels = PrettyList(frame.frameRelations) else: if not isinstance(frame, int): if isinstance(frame, dict): frame = frame.ID else: frame = self.frame_by_name(frame).ID rels = [self._frel_idx[frelID] for frelID in self._frel_f_idx[frame]] # filter by 'type' if type is not None: rels = [rel for rel in rels if rel.type is type] elif type is not None: # lookup by 'type' rels = type.frameRelations else: rels = self._frel_idx.values() # filter by 'frame2' if frame2 is not None: if frame is None: raise FramenetError( "frame_relations(frame=None, frame2=) is not allowed" ) if not isinstance(frame2, int): if isinstance(frame2, dict): frame2 = frame2.ID else: frame2 = self.frame_by_name(frame2).ID if frame == frame2: raise FramenetError( "The two frame arguments to frame_relations() must be different frames" ) rels = [ rel for rel in rels if rel.superFrame.ID == frame2 or rel.subFrame.ID == frame2 ] return PrettyList( sorted( rels, key=lambda frel: (frel.type.ID, frel.superFrameName, frel.subFrameName), ) ) def fe_relations(self): """ Obtain a list of frame element relations. >>> from nltk.corpus import framenet as fn >>> ferels = fn.fe_relations() >>> isinstance(ferels, list) True >>> len(ferels) in (10020, 12393) # FN 1.5 and 1.7, resp. 
True >>> PrettyDict(ferels[0], breakLines=True) {'ID': 14642, '_type': 'ferelation', 'frameRelation': Child=Lively_place>, 'subFE': , 'subFEName': 'Degree', 'subFrame': , 'subID': 11370, 'supID': 2271, 'superFE': , 'superFEName': 'Degree', 'superFrame': , 'type': } :return: A list of all of the frame element relations in framenet :rtype: list(dict) """ if not self._ferel_idx: self._buildrelationindex() return PrettyList( sorted( self._ferel_idx.values(), key=lambda ferel: ( ferel.type.ID, ferel.frameRelation.superFrameName, ferel.superFEName, ferel.frameRelation.subFrameName, ferel.subFEName, ), ) ) def semtypes(self): """ Obtain a list of semantic types. >>> from nltk.corpus import framenet as fn >>> stypes = fn.semtypes() >>> len(stypes) in (73, 109) # FN 1.5 and 1.7, resp. True >>> sorted(stypes[0].keys()) ['ID', '_type', 'abbrev', 'definition', 'definitionMarkup', 'name', 'rootType', 'subTypes', 'superType'] :return: A list of all of the semantic types in framenet :rtype: list(dict) """ if not self._semtypes: self._loadsemtypes() return PrettyList( self._semtypes[i] for i in self._semtypes if isinstance(i, int) ) def _load_xml_attributes(self, d, elt): """ Extracts a subset of the attributes from the given element and returns them in a dictionary. :param d: A dictionary in which to store the attributes. :type d: dict :param elt: An ElementTree Element :type elt: Element :return: Returns the input dict ``d`` possibly including attributes from ``elt`` :rtype: dict """ d = type(d)(d) try: attr_dict = elt.attrib except AttributeError: return d if attr_dict is None: return d # Ignore these attributes when loading attributes from an xml node ignore_attrs = [ #'cBy', 'cDate', 'mDate', # <-- annotation metadata that could be of interest "xsi", "schemaLocation", "xmlns", "bgColor", "fgColor", ] for attr in attr_dict: if any(attr.endswith(x) for x in ignore_attrs): continue val = attr_dict[attr] if val.isdigit(): d[attr] = int(val) else: d[attr] = val return d def _strip_tags(self, data): """ Gets rid of all tags and newline characters from the given input :return: A cleaned-up version of the input string :rtype: str """ try: r""" # Look for boundary issues in markup. (Sometimes FEs are pluralized in definitions.) m = re.search(r'\w[<][^/]|[<][/][^>]+[>](s\w|[a-rt-z0-9])', data) if m: print('Markup boundary:', data[max(0,m.start(0)-10):m.end(0)+10].replace('\n',' '), file=sys.stderr) """ data = data.replace("", "") data = data.replace("", "") data = re.sub('', "", data) data = data.replace("", "") data = data.replace("", "") data = data.replace("", "") data = data.replace("", "") data = data.replace("", "") data = data.replace("", "") data = data.replace("", "") data = data.replace("", "'") data = data.replace("", "'") data = data.replace("", "") data = data.replace("", "") data = data.replace("", "") data = data.replace("", "") # Get rid of and tags data = data.replace("", "") data = data.replace("", "") data = data.replace("\n", " ") except AttributeError: pass return data def _handle_elt(self, elt, tagspec=None): """Extracts and returns the attributes of the given element""" return self._load_xml_attributes(AttrDict(), elt) def _handle_fulltextindex_elt(self, elt, tagspec=None): """ Extracts corpus/document info from the fulltextIndex.xml file. Note that this function "flattens" the information contained in each of the "corpus" elements, so that each "document" element will contain attributes for the corpus and corpusid. 
Also, each of the "document" items will contain a new attribute called "filename" that is the base file name of the xml file for the document in the "fulltext" subdir of the Framenet corpus. """ ftinfo = self._load_xml_attributes(AttrDict(), elt) corpname = ftinfo.name corpid = ftinfo.ID retlist = [] for sub in elt: if sub.tag.endswith("document"): doc = self._load_xml_attributes(AttrDict(), sub) if "name" in doc: docname = doc.name else: docname = doc.description doc.filename = f"{corpname}__{docname}.xml" doc.URL = ( self._fnweb_url + "/" + self._fulltext_dir + "/" + doc.filename ) doc.corpname = corpname doc.corpid = corpid retlist.append(doc) return retlist def _handle_frame_elt(self, elt, ignorekeys=[]): """Load the info for a Frame from a frame xml file""" frinfo = self._load_xml_attributes(AttrDict(), elt) frinfo["_type"] = "frame" frinfo["definition"] = "" frinfo["definitionMarkup"] = "" frinfo["FE"] = PrettyDict() frinfo["FEcoreSets"] = [] frinfo["lexUnit"] = PrettyDict() frinfo["semTypes"] = [] for k in ignorekeys: if k in frinfo: del frinfo[k] for sub in elt: if sub.tag.endswith("definition") and "definition" not in ignorekeys: frinfo["definitionMarkup"] = sub.text frinfo["definition"] = self._strip_tags(sub.text) elif sub.tag.endswith("FE") and "FE" not in ignorekeys: feinfo = self._handle_fe_elt(sub) frinfo["FE"][feinfo.name] = feinfo feinfo["frame"] = frinfo # backpointer elif sub.tag.endswith("FEcoreSet") and "FEcoreSet" not in ignorekeys: coreset = self._handle_fecoreset_elt(sub) # assumes all FEs have been loaded before coresets frinfo["FEcoreSets"].append( PrettyList(frinfo["FE"][fe.name] for fe in coreset) ) elif sub.tag.endswith("lexUnit") and "lexUnit" not in ignorekeys: luentry = self._handle_framelexunit_elt(sub) if luentry["status"] in self._bad_statuses: # problematic LU entry; ignore it continue luentry["frame"] = frinfo luentry["URL"] = ( self._fnweb_url + "/" + self._lu_dir + "/" + "lu{}.xml".format(luentry["ID"]) ) luentry["subCorpus"] = Future( (lambda lu: lambda: self._lu_file(lu).subCorpus)(luentry) ) luentry["exemplars"] = Future( (lambda lu: lambda: self._lu_file(lu).exemplars)(luentry) ) frinfo["lexUnit"][luentry.name] = luentry if not self._lu_idx: self._buildluindex() self._lu_idx[luentry.ID] = luentry elif sub.tag.endswith("semType") and "semTypes" not in ignorekeys: semtypeinfo = self._load_xml_attributes(AttrDict(), sub) frinfo["semTypes"].append(self.semtype(semtypeinfo.ID)) frinfo["frameRelations"] = self.frame_relations(frame=frinfo) # resolve 'requires' and 'excludes' links between FEs of this frame for fe in frinfo.FE.values(): if fe.requiresFE: name, ID = fe.requiresFE.name, fe.requiresFE.ID fe.requiresFE = frinfo.FE[name] assert fe.requiresFE.ID == ID if fe.excludesFE: name, ID = fe.excludesFE.name, fe.excludesFE.ID fe.excludesFE = frinfo.FE[name] assert fe.excludesFE.ID == ID return frinfo def _handle_fecoreset_elt(self, elt): """Load fe coreset info from xml.""" info = self._load_xml_attributes(AttrDict(), elt) tmp = [] for sub in elt: tmp.append(self._load_xml_attributes(AttrDict(), sub)) return tmp def _handle_framerelationtype_elt(self, elt, *args): """Load frame-relation element and its child fe-relation elements from frRelation.xml.""" info = self._load_xml_attributes(AttrDict(), elt) info["_type"] = "framerelationtype" info["frameRelations"] = PrettyList() for sub in elt: if sub.tag.endswith("frameRelation"): frel = self._handle_framerelation_elt(sub) frel["type"] = info # backpointer for ferel in frel.feRelations: ferel["type"] = info 
info["frameRelations"].append(frel) return info def _handle_framerelation_elt(self, elt): """Load frame-relation element and its child fe-relation elements from frRelation.xml.""" info = self._load_xml_attributes(AttrDict(), elt) assert info["superFrameName"] != info["subFrameName"], (elt, info) info["_type"] = "framerelation" info["feRelations"] = PrettyList() for sub in elt: if sub.tag.endswith("FERelation"): ferel = self._handle_elt(sub) ferel["_type"] = "ferelation" ferel["frameRelation"] = info # backpointer info["feRelations"].append(ferel) return info def _handle_fulltextannotation_elt(self, elt): """Load full annotation info for a document from its xml file. The main element (fullTextAnnotation) contains a 'header' element (which we ignore here) and a bunch of 'sentence' elements.""" info = AttrDict() info["_type"] = "fulltext_annotation" info["sentence"] = [] for sub in elt: if sub.tag.endswith("header"): continue # not used elif sub.tag.endswith("sentence"): s = self._handle_fulltext_sentence_elt(sub) s.doc = info info["sentence"].append(s) return info def _handle_fulltext_sentence_elt(self, elt): """Load information from the given 'sentence' element. Each 'sentence' element contains a "text" and "annotationSet" sub elements.""" info = self._load_xml_attributes(AttrDict(), elt) info["_type"] = "fulltext_sentence" info["annotationSet"] = [] info["targets"] = [] target_spans = set() info["_ascii"] = types.MethodType( _annotation_ascii, info ) # attach a method for this instance info["text"] = "" for sub in elt: if sub.tag.endswith("text"): info["text"] = self._strip_tags(sub.text) elif sub.tag.endswith("annotationSet"): a = self._handle_fulltextannotationset_elt( sub, is_pos=(len(info["annotationSet"]) == 0) ) if "cxnID" in a: # ignoring construction annotations for now continue a.sent = info a.text = info.text info["annotationSet"].append(a) if "Target" in a: for tspan in a.Target: if tspan in target_spans: self._warn( 'Duplicate target span "{}"'.format( info.text[slice(*tspan)] ), tspan, "in sentence", info["ID"], info.text, ) # this can happen in cases like "chemical and biological weapons" # being annotated as "chemical weapons" and "biological weapons" else: target_spans.add(tspan) info["targets"].append((a.Target, a.luName, a.frameName)) assert info["annotationSet"][0].status == "UNANN" info["POS"] = info["annotationSet"][0].POS info["POS_tagset"] = info["annotationSet"][0].POS_tagset return info def _handle_fulltextannotationset_elt(self, elt, is_pos=False): """Load information from the given 'annotationSet' element. Each 'annotationSet' contains several "layer" elements.""" info = self._handle_luannotationset_elt(elt, is_pos=is_pos) if not is_pos: info["_type"] = "fulltext_annotationset" if "cxnID" not in info: # ignoring construction annotations for now info["LU"] = self.lu( info.luID, luName=info.luName, frameID=info.frameID, frameName=info.frameName, ) info["frame"] = info.LU.frame return info def _handle_fulltextlayer_elt(self, elt): """Load information from the given 'layer' element. 
Each 'layer' contains several "label" elements.""" info = self._load_xml_attributes(AttrDict(), elt) info["_type"] = "layer" info["label"] = [] for sub in elt: if sub.tag.endswith("label"): l = self._load_xml_attributes(AttrDict(), sub) info["label"].append(l) return info def _handle_framelexunit_elt(self, elt): """Load the lexical unit info from an xml element in a frame's xml file.""" luinfo = AttrDict() luinfo["_type"] = "lu" luinfo = self._load_xml_attributes(luinfo, elt) luinfo["definition"] = "" luinfo["definitionMarkup"] = "" luinfo["sentenceCount"] = PrettyDict() luinfo["lexemes"] = PrettyList() # multiword LUs have multiple lexemes luinfo["semTypes"] = PrettyList() # an LU can have multiple semtypes for sub in elt: if sub.tag.endswith("definition"): luinfo["definitionMarkup"] = sub.text luinfo["definition"] = self._strip_tags(sub.text) elif sub.tag.endswith("sentenceCount"): luinfo["sentenceCount"] = self._load_xml_attributes(PrettyDict(), sub) elif sub.tag.endswith("lexeme"): lexemeinfo = self._load_xml_attributes(PrettyDict(), sub) if not isinstance(lexemeinfo.name, str): # some lexeme names are ints by default: e.g., # thousand.num has lexeme with name="1000" lexemeinfo.name = str(lexemeinfo.name) luinfo["lexemes"].append(lexemeinfo) elif sub.tag.endswith("semType"): semtypeinfo = self._load_xml_attributes(PrettyDict(), sub) luinfo["semTypes"].append(self.semtype(semtypeinfo.ID)) # sort lexemes by 'order' attribute # otherwise, e.g., 'write down.v' may have lexemes in wrong order luinfo["lexemes"].sort(key=lambda x: x.order) return luinfo def _handle_lexunit_elt(self, elt, ignorekeys): """ Load full info for a lexical unit from its xml file. This should only be called when accessing corpus annotations (which are not included in frame files). 
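        A rough sketch of the public entry point that ends up calling this
        loader (hedged: the LU id 256 is only an illustrative value and it
        assumes the ``framenet_v17`` data package is installed):

            >>> from nltk.corpus import framenet as fn
            >>> lu = fn.lu(256)         # doctest: +SKIP
            >>> lu.name, lu.definition  # doctest: +SKIP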
""" luinfo = self._load_xml_attributes(AttrDict(), elt) luinfo["_type"] = "lu" luinfo["definition"] = "" luinfo["definitionMarkup"] = "" luinfo["subCorpus"] = PrettyList() luinfo["lexemes"] = PrettyList() # multiword LUs have multiple lexemes luinfo["semTypes"] = PrettyList() # an LU can have multiple semtypes for k in ignorekeys: if k in luinfo: del luinfo[k] for sub in elt: if sub.tag.endswith("header"): continue # not used elif sub.tag.endswith("valences"): continue # not used elif sub.tag.endswith("definition") and "definition" not in ignorekeys: luinfo["definitionMarkup"] = sub.text luinfo["definition"] = self._strip_tags(sub.text) elif sub.tag.endswith("subCorpus") and "subCorpus" not in ignorekeys: sc = self._handle_lusubcorpus_elt(sub) if sc is not None: luinfo["subCorpus"].append(sc) elif sub.tag.endswith("lexeme") and "lexeme" not in ignorekeys: luinfo["lexemes"].append(self._load_xml_attributes(PrettyDict(), sub)) elif sub.tag.endswith("semType") and "semType" not in ignorekeys: semtypeinfo = self._load_xml_attributes(AttrDict(), sub) luinfo["semTypes"].append(self.semtype(semtypeinfo.ID)) return luinfo def _handle_lusubcorpus_elt(self, elt): """Load a subcorpus of a lexical unit from the given xml.""" sc = AttrDict() try: sc["name"] = elt.get("name") except AttributeError: return None sc["_type"] = "lusubcorpus" sc["sentence"] = [] for sub in elt: if sub.tag.endswith("sentence"): s = self._handle_lusentence_elt(sub) if s is not None: sc["sentence"].append(s) return sc def _handle_lusentence_elt(self, elt): """Load a sentence from a subcorpus of an LU from xml.""" info = self._load_xml_attributes(AttrDict(), elt) info["_type"] = "lusentence" info["annotationSet"] = [] info["_ascii"] = types.MethodType( _annotation_ascii, info ) # attach a method for this instance for sub in elt: if sub.tag.endswith("text"): info["text"] = self._strip_tags(sub.text) elif sub.tag.endswith("annotationSet"): annset = self._handle_luannotationset_elt( sub, is_pos=(len(info["annotationSet"]) == 0) ) if annset is not None: assert annset.status == "UNANN" or "FE" in annset, annset if annset.status != "UNANN": info["frameAnnotation"] = annset # copy layer info up to current level for k in ( "Target", "FE", "FE2", "FE3", "GF", "PT", "POS", "POS_tagset", "Other", "Sent", "Verb", "Noun", "Adj", "Adv", "Prep", "Scon", "Art", ): if k in annset: info[k] = annset[k] info["annotationSet"].append(annset) annset["sent"] = info annset["text"] = info.text return info def _handle_luannotationset_elt(self, elt, is_pos=False): """Load an annotation set from a sentence in an subcorpus of an LU""" info = self._load_xml_attributes(AttrDict(), elt) info["_type"] = "posannotationset" if is_pos else "luannotationset" info["layer"] = [] info["_ascii"] = types.MethodType( _annotation_ascii, info ) # attach a method for this instance if "cxnID" in info: # ignoring construction annotations for now. 
return info for sub in elt: if sub.tag.endswith("layer"): l = self._handle_lulayer_elt(sub) if l is not None: overt = [] ni = {} # null instantiations info["layer"].append(l) for lbl in l.label: if "start" in lbl: thespan = (lbl.start, lbl.end + 1, lbl.name) if l.name not in ( "Sent", "Other", ): # 'Sent' and 'Other' layers sometimes contain accidental duplicate spans assert thespan not in overt, (info.ID, l.name, thespan) overt.append(thespan) else: # null instantiation if lbl.name in ni: self._warn( "FE with multiple NI entries:", lbl.name, ni[lbl.name], lbl.itype, ) else: ni[lbl.name] = lbl.itype overt = sorted(overt) if l.name == "Target": if not overt: self._warn( "Skipping empty Target layer in annotation set ID={}".format( info.ID ) ) continue assert all(lblname == "Target" for i, j, lblname in overt) if "Target" in info: self._warn( "Annotation set {} has multiple Target layers".format( info.ID ) ) else: info["Target"] = [(i, j) for (i, j, _) in overt] elif l.name == "FE": if l.rank == 1: assert "FE" not in info info["FE"] = (overt, ni) # assert False,info else: # sometimes there are 3 FE layers! e.g. Change_position_on_a_scale.fall.v assert 2 <= l.rank <= 3, l.rank k = "FE" + str(l.rank) assert k not in info info[k] = (overt, ni) elif l.name in ("GF", "PT"): assert l.rank == 1 info[l.name] = overt elif l.name in ("BNC", "PENN"): assert l.rank == 1 info["POS"] = overt info["POS_tagset"] = l.name else: if is_pos: if l.name not in ("NER", "WSL"): self._warn( "Unexpected layer in sentence annotationset:", l.name, ) else: if l.name not in ( "Sent", "Verb", "Noun", "Adj", "Adv", "Prep", "Scon", "Art", "Other", ): self._warn( "Unexpected layer in frame annotationset:", l.name ) info[l.name] = overt if not is_pos and "cxnID" not in info: if "Target" not in info: self._warn(f"Missing target in annotation set ID={info.ID}") assert "FE" in info if "FE3" in info: assert "FE2" in info return info def _handle_lulayer_elt(self, elt): """Load a layer from an annotation set""" layer = self._load_xml_attributes(AttrDict(), elt) layer["_type"] = "lulayer" layer["label"] = [] for sub in elt: if sub.tag.endswith("label"): l = self._load_xml_attributes(AttrDict(), sub) if l is not None: layer["label"].append(l) return layer def _handle_fe_elt(self, elt): feinfo = self._load_xml_attributes(AttrDict(), elt) feinfo["_type"] = "fe" feinfo["definition"] = "" feinfo["definitionMarkup"] = "" feinfo["semType"] = None feinfo["requiresFE"] = None feinfo["excludesFE"] = None for sub in elt: if sub.tag.endswith("definition"): feinfo["definitionMarkup"] = sub.text feinfo["definition"] = self._strip_tags(sub.text) elif sub.tag.endswith("semType"): stinfo = self._load_xml_attributes(AttrDict(), sub) feinfo["semType"] = self.semtype(stinfo.ID) elif sub.tag.endswith("requiresFE"): feinfo["requiresFE"] = self._load_xml_attributes(AttrDict(), sub) elif sub.tag.endswith("excludesFE"): feinfo["excludesFE"] = self._load_xml_attributes(AttrDict(), sub) return feinfo def _handle_semtype_elt(self, elt, tagspec=None): semt = self._load_xml_attributes(AttrDict(), elt) semt["_type"] = "semtype" semt["superType"] = None semt["subTypes"] = PrettyList() for sub in elt: if sub.text is not None: semt["definitionMarkup"] = sub.text semt["definition"] = self._strip_tags(sub.text) else: supertypeinfo = self._load_xml_attributes(AttrDict(), sub) semt["superType"] = supertypeinfo # the supertype may not have been loaded yet return semt # # Demo # def demo(): from nltk.corpus import framenet as fn # # It is not necessary to explicitly build 
the indexes by calling # buildindexes(). We do this here just for demo purposes. If the # indexes are not built explicitly, they will be built as needed. # print("Building the indexes...") fn.buildindexes() # # Get some statistics about the corpus # print("Number of Frames:", len(fn.frames())) print("Number of Lexical Units:", len(fn.lus())) print("Number of annotated documents:", len(fn.docs())) print() # # Frames # print( 'getting frames whose name matches the (case insensitive) regex: "(?i)medical"' ) medframes = fn.frames(r"(?i)medical") print(f'Found {len(medframes)} Frames whose name matches "(?i)medical":') print([(f.name, f.ID) for f in medframes]) # # store the first frame in the list of frames # tmp_id = medframes[0].ID m_frame = fn.frame(tmp_id) # reads all info for the frame # # get the frame relations # print( '\nNumber of frame relations for the "{}" ({}) frame:'.format( m_frame.name, m_frame.ID ), len(m_frame.frameRelations), ) for fr in m_frame.frameRelations: print(" ", fr) # # get the names of the Frame Elements # print( f'\nNumber of Frame Elements in the "{m_frame.name}" frame:', len(m_frame.FE), ) print(" ", [x for x in m_frame.FE]) # # get the names of the "Core" Frame Elements # print(f'\nThe "core" Frame Elements in the "{m_frame.name}" frame:') print(" ", [x.name for x in m_frame.FE.values() if x.coreType == "Core"]) # # get all of the Lexical Units that are incorporated in the # 'Ailment' FE of the 'Medical_conditions' frame (id=239) # print('\nAll Lexical Units that are incorporated in the "Ailment" FE:') m_frame = fn.frame(239) ailment_lus = [ x for x in m_frame.lexUnit.values() if "incorporatedFE" in x and x.incorporatedFE == "Ailment" ] print(" ", [x.name for x in ailment_lus]) # # get all of the Lexical Units for the frame # print( f'\nNumber of Lexical Units in the "{m_frame.name}" frame:', len(m_frame.lexUnit), ) print(" ", [x.name for x in m_frame.lexUnit.values()][:5], "...") # # get basic info on the second LU in the frame # tmp_id = m_frame.lexUnit["ailment.n"].ID # grab the id of the specified LU luinfo = fn.lu_basic(tmp_id) # get basic info on the LU print(f"\nInformation on the LU: {luinfo.name}") pprint(luinfo) # # Get a list of all of the corpora used for fulltext annotation # print("\nNames of all of the corpora used for fulltext annotation:") allcorpora = {x.corpname for x in fn.docs_metadata()} pprint(list(allcorpora)) # # Get the names of the annotated documents in the first corpus # firstcorp = list(allcorpora)[0] firstcorp_docs = fn.docs(firstcorp) print(f'\nNames of the annotated documents in the "{firstcorp}" corpus:') pprint([x.filename for x in firstcorp_docs]) # # Search for frames containing LUs whose name attribute matches a # regexp pattern. # # Note: if you were going to be doing a lot of this type of # searching, you'd want to build an index that maps from # lemmas to frames because each time frames_by_lemma() is # called, it has to search through ALL of the frame XML files # in the db. print( '\nSearching for all Frames that have a lemma that matches the regexp: "^run.v$":' ) pprint(fn.frames_by_lemma(r"^run.v$")) if __name__ == "__main__": demo() nltk-3.7/nltk/corpus/reader/ieer.py000066400000000000000000000071461420073152400173640ustar00rootroot00000000000000# Natural Language Toolkit: IEER Corpus Reader # # Copyright (C) 2001-2022 NLTK Project # Author: Steven Bird # Edward Loper # URL: # For license information, see LICENSE.TXT """ Corpus reader for the Information Extraction and Entity Recognition Corpus. 
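A minimal usage sketch (hedged: it assumes the ``ieer`` data package has
already been installed, e.g. via ``nltk.download('ieer')``):

    >>> from nltk.corpus import ieer
    >>> docs = ieer.parsed_docs('NYT_19980315')  # doctest: +SKIP
    >>> docs[0].headline                         # doctest: +SKIP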
NIST 1999 Information Extraction: Entity Recognition Evaluation https://www.itl.nist.gov/iad/894.01/tests/ie-er/er_99/er_99.htm This corpus contains the NEWSWIRE development test data for the NIST 1999 IE-ER Evaluation. The files were taken from the subdirectory: ``/ie_er_99/english/devtest/newswire/*.ref.nwt`` and filenames were shortened. The corpus contains the following files: APW_19980314, APW_19980424, APW_19980429, NYT_19980315, NYT_19980403, and NYT_19980407. """ import nltk from nltk.corpus.reader.api import * #: A dictionary whose keys are the names of documents in this corpus; #: and whose values are descriptions of those documents' contents. titles = { "APW_19980314": "Associated Press Weekly, 14 March 1998", "APW_19980424": "Associated Press Weekly, 24 April 1998", "APW_19980429": "Associated Press Weekly, 29 April 1998", "NYT_19980315": "New York Times, 15 March 1998", "NYT_19980403": "New York Times, 3 April 1998", "NYT_19980407": "New York Times, 7 April 1998", } #: A list of all documents in this corpus. documents = sorted(titles) class IEERDocument: def __init__(self, text, docno=None, doctype=None, date_time=None, headline=""): self.text = text self.docno = docno self.doctype = doctype self.date_time = date_time self.headline = headline def __repr__(self): if self.headline: headline = " ".join(self.headline.leaves()) else: headline = ( " ".join([w for w in self.text.leaves() if w[:1] != "<"][:12]) + "..." ) if self.docno is not None: return f"" else: return "" % headline class IEERCorpusReader(CorpusReader): """ """ def docs(self, fileids=None): return concat( [ StreamBackedCorpusView(fileid, self._read_block, encoding=enc) for (fileid, enc) in self.abspaths(fileids, True) ] ) def parsed_docs(self, fileids=None): return concat( [ StreamBackedCorpusView(fileid, self._read_parsed_block, encoding=enc) for (fileid, enc) in self.abspaths(fileids, True) ] ) def _read_parsed_block(self, stream): # TODO: figure out while empty documents are being returned return [ self._parse(doc) for doc in self._read_block(stream) if self._parse(doc).docno is not None ] def _parse(self, doc): val = nltk.chunk.ieerstr2tree(doc, root_label="DOCUMENT") if isinstance(val, dict): return IEERDocument(**val) else: return IEERDocument(val) def _read_block(self, stream): out = [] # Skip any preamble. while True: line = stream.readline() if not line: break if line.strip() == "": break out.append(line) # Read the document while True: line = stream.readline() if not line: break out.append(line) if line.strip() == "": break # Return the document return ["\n".join(out)] nltk-3.7/nltk/corpus/reader/indian.py000066400000000000000000000055511420073152400177000ustar00rootroot00000000000000# Natural Language Toolkit: Indian Language POS-Tagged Corpus Reader # # Copyright (C) 2001-2022 NLTK Project # Author: Steven Bird # Edward Loper # URL: # For license information, see LICENSE.TXT """ Indian Language POS-Tagged Corpus Collected by A Kumaran, Microsoft Research, India Distributed with permission Contents: - Bangla: IIT Kharagpur - Hindi: Microsoft Research India - Marathi: IIT Bombay - Telugu: IIIT Hyderabad """ from nltk.corpus.reader.api import * from nltk.corpus.reader.util import * from nltk.tag import map_tag, str2tuple class IndianCorpusReader(CorpusReader): """ List of words, one per line. Blank lines are ignored. 
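    A short usage sketch (hedged: it assumes the ``indian`` data package is
    installed; ``hindi.pos`` is one of the fileids shipped with it):

        >>> from nltk.corpus import indian
        >>> indian.words('hindi.pos')[:3]         # doctest: +SKIP
        >>> indian.tagged_words('hindi.pos')[:2]  # doctest: +SKIP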
""" def words(self, fileids=None): return concat( [ IndianCorpusView(fileid, enc, False, False) for (fileid, enc) in self.abspaths(fileids, True) ] ) def tagged_words(self, fileids=None, tagset=None): if tagset and tagset != self._tagset: tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t) else: tag_mapping_function = None return concat( [ IndianCorpusView(fileid, enc, True, False, tag_mapping_function) for (fileid, enc) in self.abspaths(fileids, True) ] ) def sents(self, fileids=None): return concat( [ IndianCorpusView(fileid, enc, False, True) for (fileid, enc) in self.abspaths(fileids, True) ] ) def tagged_sents(self, fileids=None, tagset=None): if tagset and tagset != self._tagset: tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t) else: tag_mapping_function = None return concat( [ IndianCorpusView(fileid, enc, True, True, tag_mapping_function) for (fileid, enc) in self.abspaths(fileids, True) ] ) class IndianCorpusView(StreamBackedCorpusView): def __init__( self, corpus_file, encoding, tagged, group_by_sent, tag_mapping_function=None ): self._tagged = tagged self._group_by_sent = group_by_sent self._tag_mapping_function = tag_mapping_function StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding) def read_block(self, stream): line = stream.readline() if line.startswith("<"): return [] sent = [str2tuple(word, sep="_") for word in line.split()] if self._tag_mapping_function: sent = [(w, self._tag_mapping_function(t)) for (w, t) in sent] if not self._tagged: sent = [w for (w, t) in sent] if self._group_by_sent: return [sent] else: return sent nltk-3.7/nltk/corpus/reader/ipipan.py000066400000000000000000000307001420073152400177100ustar00rootroot00000000000000# Natural Language Toolkit: IPI PAN Corpus Reader # # Copyright (C) 2001-2022 NLTK Project # Author: Konrad Goluchowski # URL: # For license information, see LICENSE.TXT import functools from nltk.corpus.reader.api import CorpusReader from nltk.corpus.reader.util import StreamBackedCorpusView, concat def _parse_args(fun): @functools.wraps(fun) def decorator(self, fileids=None, **kwargs): kwargs.pop("tags", None) if not fileids: fileids = self.fileids() return fun(self, fileids, **kwargs) return decorator class IPIPANCorpusReader(CorpusReader): """ Corpus reader designed to work with corpus created by IPI PAN. See http://korpus.pl/en/ for more details about IPI PAN corpus. The corpus includes information about text domain, channel and categories. You can access possible values using ``domains()``, ``channels()`` and ``categories()``. You can use also this metadata to filter files, e.g.: ``fileids(channel='prasa')``, ``fileids(categories='publicystyczny')``. The reader supports methods: words, sents, paras and their tagged versions. You can get part of speech instead of full tag by giving "simplify_tags=True" parameter, e.g.: ``tagged_sents(simplify_tags=True)``. Also you can get all tags disambiguated tags specifying parameter "one_tag=False", e.g.: ``tagged_paras(one_tag=False)``. You can get all tags that were assigned by a morphological analyzer specifying parameter "disamb_only=False", e.g. ``tagged_words(disamb_only=False)``. The IPIPAN Corpus contains tags indicating if there is a space between two tokens. To add special "no space" markers, you should specify parameter "append_no_space=True", e.g. ``tagged_words(append_no_space=True)``. 
As a result in place where there should be no space between two tokens new pair ('', 'no-space') will be inserted (for tagged data) and just '' for methods without tags. The corpus reader can also try to append spaces between words. To enable this option, specify parameter "append_space=True", e.g. ``words(append_space=True)``. As a result either ' ' or (' ', 'space') will be inserted between tokens. By default, xml entities like " and & are replaced by corresponding characters. You can turn off this feature, specifying parameter "replace_xmlentities=False", e.g. ``words(replace_xmlentities=False)``. """ def __init__(self, root, fileids): CorpusReader.__init__(self, root, fileids, None, None) def channels(self, fileids=None): if not fileids: fileids = self.fileids() return self._parse_header(fileids, "channel") def domains(self, fileids=None): if not fileids: fileids = self.fileids() return self._parse_header(fileids, "domain") def categories(self, fileids=None): if not fileids: fileids = self.fileids() return [ self._map_category(cat) for cat in self._parse_header(fileids, "keyTerm") ] def fileids(self, channels=None, domains=None, categories=None): if channels is not None and domains is not None and categories is not None: raise ValueError( "You can specify only one of channels, domains " "and categories parameter at once" ) if channels is None and domains is None and categories is None: return CorpusReader.fileids(self) if isinstance(channels, str): channels = [channels] if isinstance(domains, str): domains = [domains] if isinstance(categories, str): categories = [categories] if channels: return self._list_morph_files_by("channel", channels) elif domains: return self._list_morph_files_by("domain", domains) else: return self._list_morph_files_by( "keyTerm", categories, map=self._map_category ) @_parse_args def sents(self, fileids=None, **kwargs): return concat( [ self._view( fileid, mode=IPIPANCorpusView.SENTS_MODE, tags=False, **kwargs ) for fileid in self._list_morph_files(fileids) ] ) @_parse_args def paras(self, fileids=None, **kwargs): return concat( [ self._view( fileid, mode=IPIPANCorpusView.PARAS_MODE, tags=False, **kwargs ) for fileid in self._list_morph_files(fileids) ] ) @_parse_args def words(self, fileids=None, **kwargs): return concat( [ self._view(fileid, tags=False, **kwargs) for fileid in self._list_morph_files(fileids) ] ) @_parse_args def tagged_sents(self, fileids=None, **kwargs): return concat( [ self._view(fileid, mode=IPIPANCorpusView.SENTS_MODE, **kwargs) for fileid in self._list_morph_files(fileids) ] ) @_parse_args def tagged_paras(self, fileids=None, **kwargs): return concat( [ self._view(fileid, mode=IPIPANCorpusView.PARAS_MODE, **kwargs) for fileid in self._list_morph_files(fileids) ] ) @_parse_args def tagged_words(self, fileids=None, **kwargs): return concat( [self._view(fileid, **kwargs) for fileid in self._list_morph_files(fileids)] ) def _list_morph_files(self, fileids): return [f for f in self.abspaths(fileids)] def _list_header_files(self, fileids): return [ f.replace("morph.xml", "header.xml") for f in self._list_morph_files(fileids) ] def _parse_header(self, fileids, tag): values = set() for f in self._list_header_files(fileids): values_list = self._get_tag(f, tag) for v in values_list: values.add(v) return list(values) def _list_morph_files_by(self, tag, values, map=None): fileids = self.fileids() ret_fileids = set() for f in fileids: fp = self.abspath(f).replace("morph.xml", "header.xml") values_list = self._get_tag(fp, tag) for value in 
values_list: if map is not None: value = map(value) if value in values: ret_fileids.add(f) return list(ret_fileids) def _get_tag(self, f, tag): tags = [] with open(f) as infile: header = infile.read() tag_end = 0 while True: tag_pos = header.find("<" + tag, tag_end) if tag_pos < 0: return tags tag_end = header.find("", tag_pos) tags.append(header[tag_pos + len(tag) + 2 : tag_end]) def _map_category(self, cat): pos = cat.find(">") if pos == -1: return cat else: return cat[pos + 1 :] def _view(self, filename, **kwargs): tags = kwargs.pop("tags", True) mode = kwargs.pop("mode", 0) simplify_tags = kwargs.pop("simplify_tags", False) one_tag = kwargs.pop("one_tag", True) disamb_only = kwargs.pop("disamb_only", True) append_no_space = kwargs.pop("append_no_space", False) append_space = kwargs.pop("append_space", False) replace_xmlentities = kwargs.pop("replace_xmlentities", True) if len(kwargs) > 0: raise ValueError("Unexpected arguments: %s" % kwargs.keys()) if not one_tag and not disamb_only: raise ValueError( "You cannot specify both one_tag=False and " "disamb_only=False" ) if not tags and (simplify_tags or not one_tag or not disamb_only): raise ValueError( "You cannot specify simplify_tags, one_tag or " "disamb_only with functions other than tagged_*" ) return IPIPANCorpusView( filename, tags=tags, mode=mode, simplify_tags=simplify_tags, one_tag=one_tag, disamb_only=disamb_only, append_no_space=append_no_space, append_space=append_space, replace_xmlentities=replace_xmlentities, ) class IPIPANCorpusView(StreamBackedCorpusView): WORDS_MODE = 0 SENTS_MODE = 1 PARAS_MODE = 2 def __init__(self, filename, startpos=0, **kwargs): StreamBackedCorpusView.__init__(self, filename, None, startpos, None) self.in_sentence = False self.position = 0 self.show_tags = kwargs.pop("tags", True) self.disamb_only = kwargs.pop("disamb_only", True) self.mode = kwargs.pop("mode", IPIPANCorpusView.WORDS_MODE) self.simplify_tags = kwargs.pop("simplify_tags", False) self.one_tag = kwargs.pop("one_tag", True) self.append_no_space = kwargs.pop("append_no_space", False) self.append_space = kwargs.pop("append_space", False) self.replace_xmlentities = kwargs.pop("replace_xmlentities", True) def read_block(self, stream): sentence = [] sentences = [] space = False no_space = False tags = set() lines = self._read_data(stream) while True: # we may have only part of last line if len(lines) <= 1: self._seek(stream) lines = self._read_data(stream) if lines == [""]: assert not sentences return [] line = lines.pop() self.position += len(line) + 1 if line.startswith('"): if self.append_space: no_space = True if self.append_no_space: if self.show_tags: sentence.append(("", "no-space")) else: sentence.append("") elif line.startswith(" # URL: # For license information, see LICENSE.TXT # For more information, see http://lilyx.net/pages/nltkjapanesecorpus.html import re from nltk.corpus.reader.api import CorpusReader, SyntaxCorpusReader from nltk.corpus.reader.util import ( FileSystemPathPointer, find_corpus_fileids, read_blankline_block, ) from nltk.parse import DependencyGraph # default function to convert morphlist to str for tree representation _morphs2str_default = lambda morphs: "/".join(m[0] for m in morphs if m[0] != "EOS") class KNBCorpusReader(SyntaxCorpusReader): """ This class implements: - ``__init__``, which specifies the location of the corpus and a method for detecting the sentence blocks in corpus files. - ``_read_block``, which reads a block from the input stream. 
- ``_word``, which takes a block and returns a list of list of words. - ``_tag``, which takes a block and returns a list of list of tagged words. - ``_parse``, which takes a block and returns a list of parsed sentences. The structure of tagged words: tagged_word = (word(str), tags(tuple)) tags = (surface, reading, lemma, pos1, posid1, pos2, posid2, pos3, posid3, others ...) Usage example >>> from nltk.corpus.util import LazyCorpusLoader >>> knbc = LazyCorpusLoader( ... 'knbc/corpus1', ... KNBCorpusReader, ... r'.*/KN.*', ... encoding='euc-jp', ... ) >>> len(knbc.sents()[0]) 9 """ def __init__(self, root, fileids, encoding="utf8", morphs2str=_morphs2str_default): """ Initialize KNBCorpusReader morphs2str is a function to convert morphlist to str for tree representation for _parse() """ SyntaxCorpusReader.__init__(self, root, fileids, encoding) self.morphs2str = morphs2str def _read_block(self, stream): # blocks are split by blankline (or EOF) - default return read_blankline_block(stream) def _word(self, t): res = [] for line in t.splitlines(): # ignore the Bunsets headers if not re.match(r"EOS|\*|\#|\+", line): cells = line.strip().split(" ") res.append(cells[0]) return res # ignores tagset argument def _tag(self, t, tagset=None): res = [] for line in t.splitlines(): # ignore the Bunsets headers if not re.match(r"EOS|\*|\#|\+", line): cells = line.strip().split(" ") # convert cells to morph tuples res.append((cells[0], " ".join(cells[1:]))) return res def _parse(self, t): dg = DependencyGraph() i = 0 for line in t.splitlines(): if line[0] in "*+": # start of bunsetsu or tag cells = line.strip().split(" ", 3) m = re.match(r"([\-0-9]*)([ADIP])", cells[1]) assert m is not None node = dg.nodes[i] node.update({"address": i, "rel": m.group(2), "word": []}) dep_parent = int(m.group(1)) if dep_parent == -1: dg.root = node else: dg.nodes[dep_parent]["deps"].append(i) i += 1 elif line[0] != "#": # normal morph cells = line.strip().split(" ") # convert cells to morph tuples morph = cells[0], " ".join(cells[1:]) dg.nodes[i - 1]["word"].append(morph) if self.morphs2str: for node in dg.nodes.values(): node["word"] = self.morphs2str(node["word"]) return dg.tree() ###################################################################### # Demo ###################################################################### def demo(): import nltk from nltk.corpus.util import LazyCorpusLoader root = nltk.data.find("corpora/knbc/corpus1") fileids = [ f for f in find_corpus_fileids(FileSystemPathPointer(root), ".*") if re.search(r"\d\-\d\-[\d]+\-[\d]+", f) ] def _knbc_fileids_sort(x): cells = x.split("-") return (cells[0], int(cells[1]), int(cells[2]), int(cells[3])) knbc = LazyCorpusLoader( "knbc/corpus1", KNBCorpusReader, sorted(fileids, key=_knbc_fileids_sort), encoding="euc-jp", ) print(knbc.fileids()[:10]) print("".join(knbc.words()[:100])) print("\n\n".join(str(tree) for tree in knbc.parsed_sents()[:2])) knbc.morphs2str = lambda morphs: "/".join( "{}({})".format(m[0], m[1].split(" ")[2]) for m in morphs if m[0] != "EOS" ).encode("utf-8") print("\n\n".join("%s" % tree for tree in knbc.parsed_sents()[:2])) print( "\n".join( " ".join("{}/{}".format(w[0], w[1].split(" ")[2]) for w in sent) for sent in knbc.tagged_sents()[0:2] ) ) def test(): from nltk.corpus.util import LazyCorpusLoader knbc = LazyCorpusLoader( "knbc/corpus1", KNBCorpusReader, r".*/KN.*", encoding="euc-jp" ) assert isinstance(knbc.words()[0], str) assert isinstance(knbc.sents()[0][0], str) assert isinstance(knbc.tagged_words()[0], tuple) assert 
isinstance(knbc.tagged_sents()[0][0], tuple) if __name__ == "__main__": demo() nltk-3.7/nltk/corpus/reader/lin.py000066400000000000000000000145071420073152400172210ustar00rootroot00000000000000# Natural Language Toolkit: Lin's Thesaurus # # Copyright (C) 2001-2022 NLTK Project # Author: Dan Blanchard # URL: # For license information, see LICENSE.txt import re from collections import defaultdict from functools import reduce from nltk.corpus.reader import CorpusReader class LinThesaurusCorpusReader(CorpusReader): """Wrapper for the LISP-formatted thesauruses distributed by Dekang Lin.""" # Compiled regular expression for extracting the key from the first line of each # thesaurus entry _key_re = re.compile(r'\("?([^"]+)"? \(desc [0-9.]+\).+') @staticmethod def __defaultdict_factory(): """Factory for creating defaultdict of defaultdict(dict)s""" return defaultdict(dict) def __init__(self, root, badscore=0.0): """ Initialize the thesaurus. :param root: root directory containing thesaurus LISP files :type root: C{string} :param badscore: the score to give to words which do not appear in each other's sets of synonyms :type badscore: C{float} """ super().__init__(root, r"sim[A-Z]\.lsp") self._thesaurus = defaultdict(LinThesaurusCorpusReader.__defaultdict_factory) self._badscore = badscore for path, encoding, fileid in self.abspaths( include_encoding=True, include_fileid=True ): with open(path) as lin_file: first = True for line in lin_file: line = line.strip() # Start of entry if first: key = LinThesaurusCorpusReader._key_re.sub(r"\1", line) first = False # End of entry elif line == "))": first = True # Lines with pairs of ngrams and scores else: split_line = line.split("\t") if len(split_line) == 2: ngram, score = split_line self._thesaurus[fileid][key][ngram.strip('"')] = float( score ) def similarity(self, ngram1, ngram2, fileid=None): """ Returns the similarity score for two ngrams. :param ngram1: first ngram to compare :type ngram1: C{string} :param ngram2: second ngram to compare :type ngram2: C{string} :param fileid: thesaurus fileid to search in. If None, search all fileids. :type fileid: C{string} :return: If fileid is specified, just the score for the two ngrams; otherwise, list of tuples of fileids and scores. """ # Entries don't contain themselves, so make sure similarity between item and itself is 1.0 if ngram1 == ngram2: if fileid: return 1.0 else: return [(fid, 1.0) for fid in self._fileids] else: if fileid: return ( self._thesaurus[fileid][ngram1][ngram2] if ngram2 in self._thesaurus[fileid][ngram1] else self._badscore ) else: return [ ( fid, ( self._thesaurus[fid][ngram1][ngram2] if ngram2 in self._thesaurus[fid][ngram1] else self._badscore ), ) for fid in self._fileids ] def scored_synonyms(self, ngram, fileid=None): """ Returns a list of scored synonyms (tuples of synonyms and scores) for the current ngram :param ngram: ngram to lookup :type ngram: C{string} :param fileid: thesaurus fileid to search in. If None, search all fileids. :type fileid: C{string} :return: If fileid is specified, list of tuples of scores and synonyms; otherwise, list of tuples of fileids and lists, where inner lists consist of tuples of scores and synonyms. """ if fileid: return self._thesaurus[fileid][ngram].items() else: return [ (fileid, self._thesaurus[fileid][ngram].items()) for fileid in self._fileids ] def synonyms(self, ngram, fileid=None): """ Returns a list of synonyms for the current ngram. :param ngram: ngram to lookup :type ngram: C{string} :param fileid: thesaurus fileid to search in. 
If None, search all fileids. :type fileid: C{string} :return: If fileid is specified, list of synonyms; otherwise, list of tuples of fileids and lists, where inner lists contain synonyms. """ if fileid: return self._thesaurus[fileid][ngram].keys() else: return [ (fileid, self._thesaurus[fileid][ngram].keys()) for fileid in self._fileids ] def __contains__(self, ngram): """ Determines whether or not the given ngram is in the thesaurus. :param ngram: ngram to lookup :type ngram: C{string} :return: whether the given ngram is in the thesaurus. """ return reduce( lambda accum, fileid: accum or (ngram in self._thesaurus[fileid]), self._fileids, False, ) ###################################################################### # Demo ###################################################################### def demo(): from nltk.corpus import lin_thesaurus as thes word1 = "business" word2 = "enterprise" print("Getting synonyms for " + word1) print(thes.synonyms(word1)) print("Getting scored synonyms for " + word1) print(thes.scored_synonyms(word1)) print("Getting synonyms from simN.lsp (noun subsection) for " + word1) print(thes.synonyms(word1, fileid="simN.lsp")) print("Getting synonyms from simN.lsp (noun subsection) for " + word1) print(thes.synonyms(word1, fileid="simN.lsp")) print(f"Similarity score for {word1} and {word2}:") print(thes.similarity(word1, word2)) if __name__ == "__main__": demo() nltk-3.7/nltk/corpus/reader/mte.py000066400000000000000000000332441420073152400172230ustar00rootroot00000000000000""" A reader for corpora whose documents are in MTE format. """ import os import re from functools import reduce from nltk.corpus.reader import TaggedCorpusReader, concat from nltk.corpus.reader.xmldocs import XMLCorpusView def xpath(root, path, ns): return root.findall(path, ns) class MTECorpusView(XMLCorpusView): """ Class for lazy viewing the MTE Corpus. """ def __init__(self, fileid, tagspec, elt_handler=None): XMLCorpusView.__init__(self, fileid, tagspec, elt_handler) def read_block(self, stream, tagspec=None, elt_handler=None): return list( filter( lambda x: x is not None, XMLCorpusView.read_block(self, stream, tagspec, elt_handler), ) ) class MTEFileReader: """ Class for loading the content of the multext-east corpus. It parses the xml files and does some tag-filtering depending on the given method parameters. 
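    A rough sketch of how this helper is normally driven by ``MTECorpusReader``
    (hedged: the path and the fileid ``oana-en.xml`` are only illustrative):

        >>> reader = MTEFileReader('/path/to/mte_teip5/oana-en.xml')  # doctest: +SKIP
        >>> first_words = list(reader.words())[:10]                   # doctest: +SKIP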
""" ns = { "tei": "https://www.tei-c.org/ns/1.0", "xml": "https://www.w3.org/XML/1998/namespace", } tag_ns = "{https://www.tei-c.org/ns/1.0}" xml_ns = "{https://www.w3.org/XML/1998/namespace}" word_path = "TEI/text/body/div/div/p/s/(w|c)" sent_path = "TEI/text/body/div/div/p/s" para_path = "TEI/text/body/div/div/p" def __init__(self, file_path): self.__file_path = file_path @classmethod def _word_elt(cls, elt, context): return elt.text @classmethod def _sent_elt(cls, elt, context): return [cls._word_elt(w, None) for w in xpath(elt, "*", cls.ns)] @classmethod def _para_elt(cls, elt, context): return [cls._sent_elt(s, None) for s in xpath(elt, "*", cls.ns)] @classmethod def _tagged_word_elt(cls, elt, context): if "ana" not in elt.attrib: return (elt.text, "") if cls.__tags == "" and cls.__tagset == "msd": return (elt.text, elt.attrib["ana"]) elif cls.__tags == "" and cls.__tagset == "universal": return (elt.text, MTETagConverter.msd_to_universal(elt.attrib["ana"])) else: tags = re.compile("^" + re.sub("-", ".", cls.__tags) + ".*$") if tags.match(elt.attrib["ana"]): if cls.__tagset == "msd": return (elt.text, elt.attrib["ana"]) else: return ( elt.text, MTETagConverter.msd_to_universal(elt.attrib["ana"]), ) else: return None @classmethod def _tagged_sent_elt(cls, elt, context): return list( filter( lambda x: x is not None, [cls._tagged_word_elt(w, None) for w in xpath(elt, "*", cls.ns)], ) ) @classmethod def _tagged_para_elt(cls, elt, context): return list( filter( lambda x: x is not None, [cls._tagged_sent_elt(s, None) for s in xpath(elt, "*", cls.ns)], ) ) @classmethod def _lemma_word_elt(cls, elt, context): if "lemma" not in elt.attrib: return (elt.text, "") else: return (elt.text, elt.attrib["lemma"]) @classmethod def _lemma_sent_elt(cls, elt, context): return [cls._lemma_word_elt(w, None) for w in xpath(elt, "*", cls.ns)] @classmethod def _lemma_para_elt(cls, elt, context): return [cls._lemma_sent_elt(s, None) for s in xpath(elt, "*", cls.ns)] def words(self): return MTECorpusView( self.__file_path, MTEFileReader.word_path, MTEFileReader._word_elt ) def sents(self): return MTECorpusView( self.__file_path, MTEFileReader.sent_path, MTEFileReader._sent_elt ) def paras(self): return MTECorpusView( self.__file_path, MTEFileReader.para_path, MTEFileReader._para_elt ) def lemma_words(self): return MTECorpusView( self.__file_path, MTEFileReader.word_path, MTEFileReader._lemma_word_elt ) def tagged_words(self, tagset, tags): MTEFileReader.__tagset = tagset MTEFileReader.__tags = tags return MTECorpusView( self.__file_path, MTEFileReader.word_path, MTEFileReader._tagged_word_elt ) def lemma_sents(self): return MTECorpusView( self.__file_path, MTEFileReader.sent_path, MTEFileReader._lemma_sent_elt ) def tagged_sents(self, tagset, tags): MTEFileReader.__tagset = tagset MTEFileReader.__tags = tags return MTECorpusView( self.__file_path, MTEFileReader.sent_path, MTEFileReader._tagged_sent_elt ) def lemma_paras(self): return MTECorpusView( self.__file_path, MTEFileReader.para_path, MTEFileReader._lemma_para_elt ) def tagged_paras(self, tagset, tags): MTEFileReader.__tagset = tagset MTEFileReader.__tags = tags return MTECorpusView( self.__file_path, MTEFileReader.para_path, MTEFileReader._tagged_para_elt ) class MTETagConverter: """ Class for converting msd tags to universal tags, more conversion options are currently not implemented. 
""" mapping_msd_universal = { "A": "ADJ", "S": "ADP", "R": "ADV", "C": "CONJ", "D": "DET", "N": "NOUN", "M": "NUM", "Q": "PRT", "P": "PRON", "V": "VERB", ".": ".", "-": "X", } @staticmethod def msd_to_universal(tag): """ This function converts the annotation from the Multex-East to the universal tagset as described in Chapter 5 of the NLTK-Book Unknown Tags will be mapped to X. Punctuation marks are not supported in MSD tags, so """ indicator = tag[0] if not tag[0] == "#" else tag[1] if not indicator in MTETagConverter.mapping_msd_universal: indicator = "-" return MTETagConverter.mapping_msd_universal[indicator] class MTECorpusReader(TaggedCorpusReader): """ Reader for corpora following the TEI-p5 xml scheme, such as MULTEXT-East. MULTEXT-East contains part-of-speech-tagged words with a quite precise tagging scheme. These tags can be converted to the Universal tagset """ def __init__(self, root=None, fileids=None, encoding="utf8"): """ Construct a new MTECorpusreader for a set of documents located at the given root directory. Example usage: >>> root = '/...path to corpus.../' >>> reader = MTECorpusReader(root, 'oana-*.xml', 'utf8') # doctest: +SKIP :param root: The root directory for this corpus. (default points to location in multext config file) :param fileids: A list or regexp specifying the fileids in this corpus. (default is oana-en.xml) :param encoding: The encoding of the given files (default is utf8) """ TaggedCorpusReader.__init__(self, root, fileids, encoding) self._readme = "00README.txt" def __fileids(self, fileids): if fileids is None: fileids = self._fileids elif isinstance(fileids, str): fileids = [fileids] # filter wrong userinput fileids = filter(lambda x: x in self._fileids, fileids) # filter multext-east sourcefiles that are not compatible to the teip5 specification fileids = filter(lambda x: x not in ["oana-bg.xml", "oana-mk.xml"], fileids) if not fileids: print("No valid multext-east file specified") return fileids def words(self, fileids=None): """ :param fileids: A list specifying the fileids that should be used. :return: the given file(s) as a list of words and punctuation symbols. :rtype: list(str) """ return concat( [ MTEFileReader(os.path.join(self._root, f)).words() for f in self.__fileids(fileids) ] ) def sents(self, fileids=None): """ :param fileids: A list specifying the fileids that should be used. :return: the given file(s) as a list of sentences or utterances, each encoded as a list of word strings :rtype: list(list(str)) """ return concat( [ MTEFileReader(os.path.join(self._root, f)).sents() for f in self.__fileids(fileids) ] ) def paras(self, fileids=None): """ :param fileids: A list specifying the fileids that should be used. :return: the given file(s) as a list of paragraphs, each encoded as a list of sentences, which are in turn encoded as lists of word string :rtype: list(list(list(str))) """ return concat( [ MTEFileReader(os.path.join(self._root, f)).paras() for f in self.__fileids(fileids) ] ) def lemma_words(self, fileids=None): """ :param fileids: A list specifying the fileids that should be used. :return: the given file(s) as a list of words, the corresponding lemmas and punctuation symbols, encoded as tuples (word, lemma) :rtype: list(tuple(str,str)) """ return concat( [ MTEFileReader(os.path.join(self._root, f)).lemma_words() for f in self.__fileids(fileids) ] ) def tagged_words(self, fileids=None, tagset="msd", tags=""): """ :param fileids: A list specifying the fileids that should be used. 
:param tagset: The tagset that should be used in the returned object, either "universal" or "msd", "msd" is the default :param tags: An MSD Tag that is used to filter all parts of the used corpus that are not more precise or at least equal to the given tag :return: the given file(s) as a list of tagged words and punctuation symbols encoded as tuples (word, tag) :rtype: list(tuple(str, str)) """ if tagset == "universal" or tagset == "msd": return concat( [ MTEFileReader(os.path.join(self._root, f)).tagged_words( tagset, tags ) for f in self.__fileids(fileids) ] ) else: print("Unknown tagset specified.") def lemma_sents(self, fileids=None): """ :param fileids: A list specifying the fileids that should be used. :return: the given file(s) as a list of sentences or utterances, each encoded as a list of tuples of the word and the corresponding lemma (word, lemma) :rtype: list(list(tuple(str, str))) """ return concat( [ MTEFileReader(os.path.join(self._root, f)).lemma_sents() for f in self.__fileids(fileids) ] ) def tagged_sents(self, fileids=None, tagset="msd", tags=""): """ :param fileids: A list specifying the fileids that should be used. :param tagset: The tagset that should be used in the returned object, either "universal" or "msd", "msd" is the default :param tags: An MSD Tag that is used to filter all parts of the used corpus that are not more precise or at least equal to the given tag :return: the given file(s) as a list of sentences or utterances, each each encoded as a list of (word,tag) tuples :rtype: list(list(tuple(str, str))) """ if tagset == "universal" or tagset == "msd": return concat( [ MTEFileReader(os.path.join(self._root, f)).tagged_sents( tagset, tags ) for f in self.__fileids(fileids) ] ) else: print("Unknown tagset specified.") def lemma_paras(self, fileids=None): """ :param fileids: A list specifying the fileids that should be used. :return: the given file(s) as a list of paragraphs, each encoded as a list of sentences, which are in turn encoded as a list of tuples of the word and the corresponding lemma (word, lemma) :rtype: list(List(List(tuple(str, str)))) """ return concat( [ MTEFileReader(os.path.join(self._root, f)).lemma_paras() for f in self.__fileids(fileids) ] ) def tagged_paras(self, fileids=None, tagset="msd", tags=""): """ :param fileids: A list specifying the fileids that should be used. 
:param tagset: The tagset that should be used in the returned object, either "universal" or "msd", "msd" is the default :param tags: An MSD Tag that is used to filter all parts of the used corpus that are not more precise or at least equal to the given tag :return: the given file(s) as a list of paragraphs, each encoded as a list of sentences, which are in turn encoded as a list of (word,tag) tuples :rtype: list(list(list(tuple(str, str)))) """ if tagset == "universal" or tagset == "msd": return concat( [ MTEFileReader(os.path.join(self._root, f)).tagged_paras( tagset, tags ) for f in self.__fileids(fileids) ] ) else: print("Unknown tagset specified.") nltk-3.7/nltk/corpus/reader/nkjp.py000066400000000000000000000367451420073152400174110ustar00rootroot00000000000000# Natural Language Toolkit: NKJP Corpus Reader # # Copyright (C) 2001-2022 NLTK Project # Author: Gabriela Kaczka # URL: # For license information, see LICENSE.TXT import functools import os import re import tempfile from nltk.corpus.reader.util import concat from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView def _parse_args(fun): """ Wraps function arguments: if fileids not specified then function set NKJPCorpusReader paths. """ @functools.wraps(fun) def decorator(self, fileids=None, **kwargs): if not fileids: fileids = self._paths return fun(self, fileids, **kwargs) return decorator class NKJPCorpusReader(XMLCorpusReader): WORDS_MODE = 0 SENTS_MODE = 1 HEADER_MODE = 2 RAW_MODE = 3 def __init__(self, root, fileids=".*"): """ Corpus reader designed to work with National Corpus of Polish. See http://nkjp.pl/ for more details about NKJP. use example: import nltk import nkjp from nkjp import NKJPCorpusReader x = NKJPCorpusReader(root='/home/USER/nltk_data/corpora/nkjp/', fileids='') # obtain the whole corpus x.header() x.raw() x.words() x.tagged_words(tags=['subst', 'comp']) #Link to find more tags: nkjp.pl/poliqarp/help/ense2.html x.sents() x = NKJPCorpusReader(root='/home/USER/nltk_data/corpora/nkjp/', fileids='Wilk*') # obtain particular file(s) x.header(fileids=['WilkDom', '/home/USER/nltk_data/corpora/nkjp/WilkWilczy']) x.tagged_words(fileids=['WilkDom', '/home/USER/nltk_data/corpora/nkjp/WilkWilczy'], tags=['subst', 'comp']) """ if isinstance(fileids, str): XMLCorpusReader.__init__(self, root, fileids + ".*/header.xml") else: XMLCorpusReader.__init__( self, root, [fileid + "/header.xml" for fileid in fileids] ) self._paths = self.get_paths() def get_paths(self): return [ os.path.join(str(self._root), f.split("header.xml")[0]) for f in self._fileids ] def fileids(self): """ Returns a list of file identifiers for the fileids that make up this corpus. """ return [f.split("header.xml")[0] for f in self._fileids] def _view(self, filename, tags=None, **kwargs): """ Returns a view specialised for use with particular corpus file. """ mode = kwargs.pop("mode", NKJPCorpusReader.WORDS_MODE) if mode is NKJPCorpusReader.WORDS_MODE: return NKJPCorpus_Morph_View(filename, tags=tags) elif mode is NKJPCorpusReader.SENTS_MODE: return NKJPCorpus_Segmentation_View(filename, tags=tags) elif mode is NKJPCorpusReader.HEADER_MODE: return NKJPCorpus_Header_View(filename, tags=tags) elif mode is NKJPCorpusReader.RAW_MODE: return NKJPCorpus_Text_View( filename, tags=tags, mode=NKJPCorpus_Text_View.RAW_MODE ) else: raise NameError("No such mode!") def add_root(self, fileid): """ Add root if necessary to specified fileid. 
""" if self.root in fileid: return fileid return self.root + fileid @_parse_args def header(self, fileids=None, **kwargs): """ Returns header(s) of specified fileids. """ return concat( [ self._view( self.add_root(fileid), mode=NKJPCorpusReader.HEADER_MODE, **kwargs ).handle_query() for fileid in fileids ] ) @_parse_args def sents(self, fileids=None, **kwargs): """ Returns sentences in specified fileids. """ return concat( [ self._view( self.add_root(fileid), mode=NKJPCorpusReader.SENTS_MODE, **kwargs ).handle_query() for fileid in fileids ] ) @_parse_args def words(self, fileids=None, **kwargs): """ Returns words in specified fileids. """ return concat( [ self._view( self.add_root(fileid), mode=NKJPCorpusReader.WORDS_MODE, **kwargs ).handle_query() for fileid in fileids ] ) @_parse_args def tagged_words(self, fileids=None, **kwargs): """ Call with specified tags as a list, e.g. tags=['subst', 'comp']. Returns tagged words in specified fileids. """ tags = kwargs.pop("tags", []) return concat( [ self._view( self.add_root(fileid), mode=NKJPCorpusReader.WORDS_MODE, tags=tags, **kwargs ).handle_query() for fileid in fileids ] ) @_parse_args def raw(self, fileids=None, **kwargs): """ Returns words in specified fileids. """ return concat( [ self._view( self.add_root(fileid), mode=NKJPCorpusReader.RAW_MODE, **kwargs ).handle_query() for fileid in fileids ] ) class NKJPCorpus_Header_View(XMLCorpusView): def __init__(self, filename, **kwargs): """ HEADER_MODE A stream backed corpus view specialized for use with header.xml files in NKJP corpus. """ self.tagspec = ".*/sourceDesc$" XMLCorpusView.__init__(self, filename + "header.xml", self.tagspec) def handle_query(self): self._open() header = [] while True: segm = XMLCorpusView.read_block(self, self._stream) if len(segm) == 0: break header.extend(segm) self.close() return header def handle_elt(self, elt, context): titles = elt.findall("bibl/title") title = [] if titles: title = "\n".join(title.text.strip() for title in titles) authors = elt.findall("bibl/author") author = [] if authors: author = "\n".join(author.text.strip() for author in authors) dates = elt.findall("bibl/date") date = [] if dates: date = "\n".join(date.text.strip() for date in dates) publishers = elt.findall("bibl/publisher") publisher = [] if publishers: publisher = "\n".join(publisher.text.strip() for publisher in publishers) idnos = elt.findall("bibl/idno") idno = [] if idnos: idno = "\n".join(idno.text.strip() for idno in idnos) notes = elt.findall("bibl/note") note = [] if notes: note = "\n".join(note.text.strip() for note in notes) return { "title": title, "author": author, "date": date, "publisher": publisher, "idno": idno, "note": note, } class XML_Tool: """ Helper class creating xml file to one without references to nkjp: namespace. 
That's needed because the XMLCorpusView assumes that one can find short substrings of XML that are valid XML, which is not true if a namespace is declared at top level """ def __init__(self, root, filename): self.read_file = os.path.join(root, filename) self.write_file = tempfile.NamedTemporaryFile(delete=False) def build_preprocessed_file(self): try: fr = open(self.read_file) fw = self.write_file line = " " while len(line): line = fr.readline() x = re.split(r"nkjp:[^ ]* ", line) # in all files ret = " ".join(x) x = re.split("", ret) # in ann_segmentation.xml ret = " ".join(x) x = re.split("", ret) # in ann_segmentation.xml ret = " ".join(x) x = re.split("", ret) # in ann_segmentation.xml ret = " ".join(x) x = re.split("", ret) # in ann_segmentation.xml ret = " ".join(x) fw.write(ret) fr.close() fw.close() return self.write_file.name except Exception as e: self.remove_preprocessed_file() raise Exception from e def remove_preprocessed_file(self): os.remove(self.write_file.name) class NKJPCorpus_Segmentation_View(XMLCorpusView): """ A stream backed corpus view specialized for use with ann_segmentation.xml files in NKJP corpus. """ def __init__(self, filename, **kwargs): self.tagspec = ".*p/.*s" # intersperse NKJPCorpus_Text_View self.text_view = NKJPCorpus_Text_View( filename, mode=NKJPCorpus_Text_View.SENTS_MODE ) self.text_view.handle_query() # xml preprocessing self.xml_tool = XML_Tool(filename, "ann_segmentation.xml") # base class init XMLCorpusView.__init__( self, self.xml_tool.build_preprocessed_file(), self.tagspec ) def get_segm_id(self, example_word): return example_word.split("(")[1].split(",")[0] def get_sent_beg(self, beg_word): # returns index of beginning letter in sentence return int(beg_word.split(",")[1]) def get_sent_end(self, end_word): # returns index of end letter in sentence splitted = end_word.split(")")[0].split(",") return int(splitted[1]) + int(splitted[2]) def get_sentences(self, sent_segm): # returns one sentence id = self.get_segm_id(sent_segm[0]) segm = self.text_view.segm_dict[id] # text segment beg = self.get_sent_beg(sent_segm[0]) end = self.get_sent_end(sent_segm[len(sent_segm) - 1]) return segm[beg:end] def remove_choice(self, segm): ret = [] prev_txt_end = -1 prev_txt_nr = -1 for word in segm: txt_nr = self.get_segm_id(word) # get increasing sequence of ids: in case of choice get first possibility if self.get_sent_beg(word) > prev_txt_end - 1 or prev_txt_nr != txt_nr: ret.append(word) prev_txt_end = self.get_sent_end(word) prev_txt_nr = txt_nr return ret def handle_query(self): try: self._open() sentences = [] while True: sent_segm = XMLCorpusView.read_block(self, self._stream) if len(sent_segm) == 0: break for segm in sent_segm: segm = self.remove_choice(segm) sentences.append(self.get_sentences(segm)) self.close() self.xml_tool.remove_preprocessed_file() return sentences except Exception as e: self.xml_tool.remove_preprocessed_file() raise Exception from e def handle_elt(self, elt, context): ret = [] for seg in elt: ret.append(seg.get("corresp")) return ret class NKJPCorpus_Text_View(XMLCorpusView): """ A stream backed corpus view specialized for use with text.xml files in NKJP corpus. 
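    A minimal construction sketch (hedged: the directory path is illustrative
    and must contain a ``text.xml`` file):

        >>> view = NKJPCorpus_Text_View('/path/to/nkjp/WilkDom/',
        ...                             mode=NKJPCorpus_Text_View.RAW_MODE)  # doctest: +SKIP
        >>> text = view.handle_query()                                       # doctest: +SKIP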
""" SENTS_MODE = 0 RAW_MODE = 1 def __init__(self, filename, **kwargs): self.mode = kwargs.pop("mode", 0) self.tagspec = ".*/div/ab" self.segm_dict = dict() # xml preprocessing self.xml_tool = XML_Tool(filename, "text.xml") # base class init XMLCorpusView.__init__( self, self.xml_tool.build_preprocessed_file(), self.tagspec ) def handle_query(self): try: self._open() x = self.read_block(self._stream) self.close() self.xml_tool.remove_preprocessed_file() return x except Exception as e: self.xml_tool.remove_preprocessed_file() raise Exception from e def read_block(self, stream, tagspec=None, elt_handler=None): """ Returns text as a list of sentences. """ txt = [] while True: segm = XMLCorpusView.read_block(self, stream) if len(segm) == 0: break for part in segm: txt.append(part) return [" ".join([segm for segm in txt])] def get_segm_id(self, elt): for attr in elt.attrib: if attr.endswith("id"): return elt.get(attr) def handle_elt(self, elt, context): # fill dictionary to use later in sents mode if self.mode is NKJPCorpus_Text_View.SENTS_MODE: self.segm_dict[self.get_segm_id(elt)] = elt.text return elt.text class NKJPCorpus_Morph_View(XMLCorpusView): """ A stream backed corpus view specialized for use with ann_morphosyntax.xml files in NKJP corpus. """ def __init__(self, filename, **kwargs): self.tags = kwargs.pop("tags", None) self.tagspec = ".*/seg/fs" self.xml_tool = XML_Tool(filename, "ann_morphosyntax.xml") XMLCorpusView.__init__( self, self.xml_tool.build_preprocessed_file(), self.tagspec ) def handle_query(self): try: self._open() words = [] while True: segm = XMLCorpusView.read_block(self, self._stream) if len(segm) == 0: break for part in segm: if part is not None: words.append(part) self.close() self.xml_tool.remove_preprocessed_file() return words except Exception as e: self.xml_tool.remove_preprocessed_file() raise Exception from e def handle_elt(self, elt, context): word = "" flag = False is_not_interp = True # if tags not specified, then always return word if self.tags is None: flag = True for child in elt: # get word if "name" in child.keys() and child.attrib["name"] == "orth": for symbol in child: if symbol.tag == "string": word = symbol.text elif "name" in child.keys() and child.attrib["name"] == "interps": for symbol in child: if "type" in symbol.keys() and symbol.attrib["type"] == "lex": for symbol2 in symbol: if ( "name" in symbol2.keys() and symbol2.attrib["name"] == "ctag" ): for symbol3 in symbol2: if ( "value" in symbol3.keys() and self.tags is not None and symbol3.attrib["value"] in self.tags ): flag = True elif ( "value" in symbol3.keys() and symbol3.attrib["value"] == "interp" ): is_not_interp = False if flag and is_not_interp: return word nltk-3.7/nltk/corpus/reader/nombank.py000066400000000000000000000366451420073152400200730ustar00rootroot00000000000000# Natural Language Toolkit: NomBank Corpus Reader # # Copyright (C) 2001-2022 NLTK Project # Authors: Paul Bedaride # Edward Loper # URL: # For license information, see LICENSE.TXT from functools import total_ordering from xml.etree import ElementTree from nltk.corpus.reader.api import * from nltk.corpus.reader.util import * from nltk.internals import raise_unorderable_types from nltk.tree import Tree class NombankCorpusReader(CorpusReader): """ Corpus reader for the nombank corpus, which augments the Penn Treebank with information about the predicate argument structure of every noun instance. 
The corpus consists of two parts: the predicate-argument annotations themselves, and a set of "frameset files" which define the argument labels used by the annotations, on a per-noun basis. Each "frameset file" contains one or more predicates, such as ``'turn'`` or ``'turn_on'``, each of which is divided into coarse-grained word senses called "rolesets". For each "roleset", the frameset file provides descriptions of the argument roles, along with examples. """ def __init__( self, root, nomfile, framefiles="", nounsfile=None, parse_fileid_xform=None, parse_corpus=None, encoding="utf8", ): """ :param root: The root directory for this corpus. :param nomfile: The name of the file containing the predicate- argument annotations (relative to ``root``). :param framefiles: A list or regexp specifying the frameset fileids for this corpus. :param parse_fileid_xform: A transform that should be applied to the fileids in this corpus. This should be a function of one argument (a fileid) that returns a string (the new fileid). :param parse_corpus: The corpus containing the parse trees corresponding to this corpus. These parse trees are necessary to resolve the tree pointers used by nombank. """ # If framefiles is specified as a regexp, expand it. if isinstance(framefiles, str): self._fileids = find_corpus_fileids(root, framefiles) self._fileids = list(framefiles) # Initialize the corpus reader. CorpusReader.__init__(self, root, framefiles, encoding) # Record our nom file & nouns file. self._nomfile = nomfile self._nounsfile = nounsfile self._parse_fileid_xform = parse_fileid_xform self._parse_corpus = parse_corpus def instances(self, baseform=None): """ :return: a corpus view that acts as a list of ``NombankInstance`` objects, one for each noun in the corpus. """ kwargs = {} if baseform is not None: kwargs["instance_filter"] = lambda inst: inst.baseform == baseform return StreamBackedCorpusView( self.abspath(self._nomfile), lambda stream: self._read_instance_block(stream, **kwargs), encoding=self.encoding(self._nomfile), ) def lines(self): """ :return: a corpus view that acts as a list of strings, one for each line in the predicate-argument annotation file. """ return StreamBackedCorpusView( self.abspath(self._nomfile), read_line_block, encoding=self.encoding(self._nomfile), ) def roleset(self, roleset_id): """ :return: the xml description for the given roleset. """ baseform = roleset_id.split(".")[0] baseform = baseform.replace("perc-sign", "%") baseform = baseform.replace("oneslashonezero", "1/10").replace( "1/10", "1-slash-10" ) framefile = "frames/%s.xml" % baseform if framefile not in self.fileids(): raise ValueError("Frameset file for %s not found" % roleset_id) # n.b.: The encoding for XML fileids is specified by the file # itself; so we ignore self._encoding here. with self.abspath(framefile).open() as fp: etree = ElementTree.parse(fp).getroot() for roleset in etree.findall("predicate/roleset"): if roleset.attrib["id"] == roleset_id: return roleset raise ValueError(f"Roleset {roleset_id} not found in {framefile}") def rolesets(self, baseform=None): """ :return: list of xml descriptions for rolesets. """ if baseform is not None: framefile = "frames/%s.xml" % baseform if framefile not in self.fileids(): raise ValueError("Frameset file for %s not found" % baseform) framefiles = [framefile] else: framefiles = self.fileids() rsets = [] for framefile in framefiles: # n.b.: The encoding for XML fileids is specified by the file # itself; so we ignore self._encoding here. 
with self.abspath(framefile).open() as fp: etree = ElementTree.parse(fp).getroot() rsets.append(etree.findall("predicate/roleset")) return LazyConcatenation(rsets) def nouns(self): """ :return: a corpus view that acts as a list of all noun lemmas in this corpus (from the nombank.1.0.words file). """ return StreamBackedCorpusView( self.abspath(self._nounsfile), read_line_block, encoding=self.encoding(self._nounsfile), ) def _read_instance_block(self, stream, instance_filter=lambda inst: True): block = [] # Read 100 at a time. for i in range(100): line = stream.readline().strip() if line: inst = NombankInstance.parse( line, self._parse_fileid_xform, self._parse_corpus ) if instance_filter(inst): block.append(inst) return block ###################################################################### # { Nombank Instance & related datatypes ###################################################################### class NombankInstance: def __init__( self, fileid, sentnum, wordnum, baseform, sensenumber, predicate, predid, arguments, parse_corpus=None, ): self.fileid = fileid """The name of the file containing the parse tree for this instance's sentence.""" self.sentnum = sentnum """The sentence number of this sentence within ``fileid``. Indexing starts from zero.""" self.wordnum = wordnum """The word number of this instance's predicate within its containing sentence. Word numbers are indexed starting from zero, and include traces and other empty parse elements.""" self.baseform = baseform """The baseform of the predicate.""" self.sensenumber = sensenumber """The sense number of the predicate.""" self.predicate = predicate """A ``NombankTreePointer`` indicating the position of this instance's predicate within its containing sentence.""" self.predid = predid """Identifier of the predicate.""" self.arguments = tuple(arguments) """A list of tuples (argloc, argid), specifying the location and identifier for each of the predicate's argument in the containing sentence. Argument identifiers are strings such as ``'ARG0'`` or ``'ARGM-TMP'``. This list does *not* contain the predicate.""" self.parse_corpus = parse_corpus """A corpus reader for the parse trees corresponding to the instances in this nombank corpus.""" @property def roleset(self): """The name of the roleset used by this instance's predicate. Use ``nombank.roleset() `` to look up information about the roleset.""" r = self.baseform.replace("%", "perc-sign") r = r.replace("1/10", "1-slash-10").replace("1-slash-10", "oneslashonezero") return f"{r}.{self.sensenumber}" def __repr__(self): return "".format( self.fileid, self.sentnum, self.wordnum, ) def __str__(self): s = "{} {} {} {} {}".format( self.fileid, self.sentnum, self.wordnum, self.baseform, self.sensenumber, ) items = self.arguments + ((self.predicate, "rel"),) for (argloc, argid) in sorted(items): s += f" {argloc}-{argid}" return s def _get_tree(self): if self.parse_corpus is None: return None if self.fileid not in self.parse_corpus.fileids(): return None return self.parse_corpus.parsed_sents(self.fileid)[self.sentnum] tree = property( _get_tree, doc=""" The parse tree corresponding to this instance, or None if the corresponding tree is not available.""", ) @staticmethod def parse(s, parse_fileid_xform=None, parse_corpus=None): pieces = s.split() if len(pieces) < 6: raise ValueError("Badly formatted nombank line: %r" % s) # Divide the line into its basic pieces. 
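# For illustration only -- a nombank annotation line is whitespace
# separated and (with invented values) looks like:
#   wsj/00/wsj_0001.mrg 0 2 chairman 01 2:0-rel 0:2-ARG0 3:2-ARG1
# i.e. parse-file id, sentence number, word number, baseform, sense
# number, then one pointer-label pair per argument, exactly one of
# which carries the label "rel" (the predicate itself).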
(fileid, sentnum, wordnum, baseform, sensenumber) = pieces[:5] args = pieces[5:] rel = [args.pop(i) for i, p in enumerate(args) if "-rel" in p] if len(rel) != 1: raise ValueError("Badly formatted nombank line: %r" % s) # Apply the fileid selector, if any. if parse_fileid_xform is not None: fileid = parse_fileid_xform(fileid) # Convert sentence & word numbers to ints. sentnum = int(sentnum) wordnum = int(wordnum) # Parse the predicate location. predloc, predid = rel[0].split("-", 1) predicate = NombankTreePointer.parse(predloc) # Parse the arguments. arguments = [] for arg in args: argloc, argid = arg.split("-", 1) arguments.append((NombankTreePointer.parse(argloc), argid)) # Put it all together. return NombankInstance( fileid, sentnum, wordnum, baseform, sensenumber, predicate, predid, arguments, parse_corpus, ) class NombankPointer: """ A pointer used by nombank to identify one or more constituents in a parse tree. ``NombankPointer`` is an abstract base class with three concrete subclasses: - ``NombankTreePointer`` is used to point to single constituents. - ``NombankSplitTreePointer`` is used to point to 'split' constituents, which consist of a sequence of two or more ``NombankTreePointer`` pointers. - ``NombankChainTreePointer`` is used to point to entire trace chains in a tree. It consists of a sequence of pieces, which can be ``NombankTreePointer`` or ``NombankSplitTreePointer`` pointers. """ def __init__(self): if self.__class__ == NombankPointer: raise NotImplementedError() class NombankChainTreePointer(NombankPointer): def __init__(self, pieces): self.pieces = pieces """A list of the pieces that make up this chain. Elements may be either ``NombankSplitTreePointer`` or ``NombankTreePointer`` pointers.""" def __str__(self): return "*".join("%s" % p for p in self.pieces) def __repr__(self): return "" % self def select(self, tree): if tree is None: raise ValueError("Parse tree not available") return Tree("*CHAIN*", [p.select(tree) for p in self.pieces]) class NombankSplitTreePointer(NombankPointer): def __init__(self, pieces): self.pieces = pieces """A list of the pieces that make up this chain. Elements are all ``NombankTreePointer`` pointers.""" def __str__(self): return ",".join("%s" % p for p in self.pieces) def __repr__(self): return "" % self def select(self, tree): if tree is None: raise ValueError("Parse tree not available") return Tree("*SPLIT*", [p.select(tree) for p in self.pieces]) @total_ordering class NombankTreePointer(NombankPointer): """ wordnum:height*wordnum:height*... wordnum:height, """ def __init__(self, wordnum, height): self.wordnum = wordnum self.height = height @staticmethod def parse(s): # Deal with chains (xx*yy*zz) pieces = s.split("*") if len(pieces) > 1: return NombankChainTreePointer( [NombankTreePointer.parse(elt) for elt in pieces] ) # Deal with split args (xx,yy,zz) pieces = s.split(",") if len(pieces) > 1: return NombankSplitTreePointer( [NombankTreePointer.parse(elt) for elt in pieces] ) # Deal with normal pointers. 
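# A plain pointer has the form "wordnum:height": the token index within
# the sentence plus the number of constituent levels to climb above that
# token (resolved by treepos() below); "*" joins chain pieces and ","
# joins split pieces, both handled above.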
pieces = s.split(":") if len(pieces) != 2: raise ValueError("bad nombank pointer %r" % s) return NombankTreePointer(int(pieces[0]), int(pieces[1])) def __str__(self): return f"{self.wordnum}:{self.height}" def __repr__(self): return "NombankTreePointer(%d, %d)" % (self.wordnum, self.height) def __eq__(self, other): while isinstance(other, (NombankChainTreePointer, NombankSplitTreePointer)): other = other.pieces[0] if not isinstance(other, NombankTreePointer): return self is other return self.wordnum == other.wordnum and self.height == other.height def __ne__(self, other): return not self == other def __lt__(self, other): while isinstance(other, (NombankChainTreePointer, NombankSplitTreePointer)): other = other.pieces[0] if not isinstance(other, NombankTreePointer): return id(self) < id(other) return (self.wordnum, -self.height) < (other.wordnum, -other.height) def select(self, tree): if tree is None: raise ValueError("Parse tree not available") return tree[self.treepos(tree)] def treepos(self, tree): """ Convert this pointer to a standard 'tree position' pointer, given that it points to the given tree. """ if tree is None: raise ValueError("Parse tree not available") stack = [tree] treepos = [] wordnum = 0 while True: # tree node: if isinstance(stack[-1], Tree): # Select the next child. if len(treepos) < len(stack): treepos.append(0) else: treepos[-1] += 1 # Update the stack. if treepos[-1] < len(stack[-1]): stack.append(stack[-1][treepos[-1]]) else: # End of node's child list: pop up a level. stack.pop() treepos.pop() # word node: else: if wordnum == self.wordnum: return tuple(treepos[: len(treepos) - self.height - 1]) else: wordnum += 1 stack.pop() nltk-3.7/nltk/corpus/reader/nps_chat.py000066400000000000000000000054421420073152400202340ustar00rootroot00000000000000# Natural Language Toolkit: NPS Chat Corpus Reader # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # URL: # For license information, see LICENSE.TXT import re import textwrap from nltk.corpus.reader.api import * from nltk.corpus.reader.util import * from nltk.corpus.reader.xmldocs import * from nltk.internals import ElementWrapper from nltk.tag import map_tag from nltk.util import LazyConcatenation class NPSChatCorpusReader(XMLCorpusReader): def __init__(self, root, fileids, wrap_etree=False, tagset=None): XMLCorpusReader.__init__(self, root, fileids, wrap_etree) self._tagset = tagset def xml_posts(self, fileids=None): if self._wrap_etree: return concat( [ XMLCorpusView(fileid, "Session/Posts/Post", self._wrap_elt) for fileid in self.abspaths(fileids) ] ) else: return concat( [ XMLCorpusView(fileid, "Session/Posts/Post") for fileid in self.abspaths(fileids) ] ) def posts(self, fileids=None): return concat( [ XMLCorpusView( fileid, "Session/Posts/Post/terminals", self._elt_to_words ) for fileid in self.abspaths(fileids) ] ) def tagged_posts(self, fileids=None, tagset=None): def reader(elt, handler): return self._elt_to_tagged_words(elt, handler, tagset) return concat( [ XMLCorpusView(fileid, "Session/Posts/Post/terminals", reader) for fileid in self.abspaths(fileids) ] ) def words(self, fileids=None): return LazyConcatenation(self.posts(fileids)) def tagged_words(self, fileids=None, tagset=None): return LazyConcatenation(self.tagged_posts(fileids, tagset)) def _wrap_elt(self, elt, handler): return ElementWrapper(elt) def _elt_to_words(self, elt, handler): return [self._simplify_username(t.attrib["word"]) for t in elt.findall("t")] def _elt_to_tagged_words(self, elt, handler, tagset=None): tagged_post = [ 
(self._simplify_username(t.attrib["word"]), t.attrib["pos"]) for t in elt.findall("t") ] if tagset and tagset != self._tagset: tagged_post = [ (w, map_tag(self._tagset, tagset, t)) for (w, t) in tagged_post ] return tagged_post @staticmethod def _simplify_username(word): if "User" in word: word = "U" + word.split("User", 1)[1] elif isinstance(word, bytes): word = word.decode("ascii") return word nltk-3.7/nltk/corpus/reader/opinion_lexicon.py000066400000000000000000000077071420073152400216370ustar00rootroot00000000000000# Natural Language Toolkit: Opinion Lexicon Corpus Reader # # Copyright (C) 2001-2022 NLTK Project # Author: Pierpaolo Pantone <24alsecondo@gmail.com> # URL: # For license information, see LICENSE.TXT """ CorpusReader for the Opinion Lexicon. Opinion Lexicon information =========================== Authors: Minqing Hu and Bing Liu, 2004. Department of Computer Science University of Illinois at Chicago Contact: Bing Liu, liub@cs.uic.edu https://www.cs.uic.edu/~liub Distributed with permission. Related papers: - Minqing Hu and Bing Liu. "Mining and summarizing customer reviews". Proceedings of the ACM SIGKDD International Conference on Knowledge Discovery & Data Mining (KDD-04), Aug 22-25, 2004, Seattle, Washington, USA. - Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing and Comparing Opinions on the Web". Proceedings of the 14th International World Wide Web conference (WWW-2005), May 10-14, 2005, Chiba, Japan. """ from nltk.corpus.reader import WordListCorpusReader from nltk.corpus.reader.api import * class IgnoreReadmeCorpusView(StreamBackedCorpusView): """ This CorpusView is used to skip the initial readme block of the corpus. """ def __init__(self, *args, **kwargs): StreamBackedCorpusView.__init__(self, *args, **kwargs) # open self._stream self._open() # skip the readme block read_blankline_block(self._stream) # Set the initial position to the current stream position self._filepos = [self._stream.tell()] class OpinionLexiconCorpusReader(WordListCorpusReader): """ Reader for Liu and Hu opinion lexicon. Blank lines and readme are ignored. >>> from nltk.corpus import opinion_lexicon >>> opinion_lexicon.words() ['2-faced', '2-faces', 'abnormal', 'abolish', ...] The OpinionLexiconCorpusReader provides shortcuts to retrieve positive/negative words: >>> opinion_lexicon.negative() ['2-faced', '2-faces', 'abnormal', 'abolish', ...] Note that words from `words()` method are sorted by file id, not alphabetically: >>> opinion_lexicon.words()[0:10] ['2-faced', '2-faces', 'abnormal', 'abolish', 'abominable', 'abominably', 'abominate', 'abomination', 'abort', 'aborted'] >>> sorted(opinion_lexicon.words())[0:10] ['2-faced', '2-faces', 'a+', 'abnormal', 'abolish', 'abominable', 'abominably', 'abominate', 'abomination', 'abort'] """ CorpusView = IgnoreReadmeCorpusView def words(self, fileids=None): """ Return all words in the opinion lexicon. Note that these words are not sorted in alphabetical order. :param fileids: a list or regexp specifying the ids of the files whose words have to be returned. :return: the given file(s) as a list of words and punctuation symbols. :rtype: list(str) """ if fileids is None: fileids = self._fileids elif isinstance(fileids, str): fileids = [fileids] return concat( [ self.CorpusView(path, self._read_word_block, encoding=enc) for (path, enc, fileid) in self.abspaths(fileids, True, True) ] ) def positive(self): """ Return all positive words in alphabetical order. :return: a list of positive words. 
:rtype: list(str) """ return self.words("positive-words.txt") def negative(self): """ Return all negative words in alphabetical order. :return: a list of negative words. :rtype: list(str) """ return self.words("negative-words.txt") def _read_word_block(self, stream): words = [] for i in range(20): # Read 20 lines at a time. line = stream.readline() if not line: continue words.append(line.strip()) return words nltk-3.7/nltk/corpus/reader/panlex_lite.py000066400000000000000000000122221420073152400207330ustar00rootroot00000000000000# Natural Language Toolkit: PanLex Corpus Reader # # Copyright (C) 2001-2022 NLTK Project # Author: David Kamholz # URL: # For license information, see LICENSE.TXT """ CorpusReader for PanLex Lite, a stripped down version of PanLex distributed as an SQLite database. See the README.txt in the panlex_lite corpus directory for more information on PanLex Lite. """ import os import sqlite3 from nltk.corpus.reader.api import CorpusReader class PanLexLiteCorpusReader(CorpusReader): MEANING_Q = """ SELECT dnx2.mn, dnx2.uq, dnx2.ap, dnx2.ui, ex2.tt, ex2.lv FROM dnx JOIN ex ON (ex.ex = dnx.ex) JOIN dnx dnx2 ON (dnx2.mn = dnx.mn) JOIN ex ex2 ON (ex2.ex = dnx2.ex) WHERE dnx.ex != dnx2.ex AND ex.tt = ? AND ex.lv = ? ORDER BY dnx2.uq DESC """ TRANSLATION_Q = """ SELECT s.tt, sum(s.uq) AS trq FROM ( SELECT ex2.tt, max(dnx.uq) AS uq FROM dnx JOIN ex ON (ex.ex = dnx.ex) JOIN dnx dnx2 ON (dnx2.mn = dnx.mn) JOIN ex ex2 ON (ex2.ex = dnx2.ex) WHERE dnx.ex != dnx2.ex AND ex.lv = ? AND ex.tt = ? AND ex2.lv = ? GROUP BY ex2.tt, dnx.ui ) s GROUP BY s.tt ORDER BY trq DESC, s.tt """ def __init__(self, root): self._c = sqlite3.connect(os.path.join(root, "db.sqlite")).cursor() self._uid_lv = {} self._lv_uid = {} for row in self._c.execute("SELECT uid, lv FROM lv"): self._uid_lv[row[0]] = row[1] self._lv_uid[row[1]] = row[0] def language_varieties(self, lc=None): """ Return a list of PanLex language varieties. :param lc: ISO 639 alpha-3 code. If specified, filters returned varieties by this code. If unspecified, all varieties are returned. :return: the specified language varieties as a list of tuples. The first element is the language variety's seven-character uniform identifier, and the second element is its default name. :rtype: list(tuple) """ if lc is None: return self._c.execute("SELECT uid, tt FROM lv ORDER BY uid").fetchall() else: return self._c.execute( "SELECT uid, tt FROM lv WHERE lc = ? ORDER BY uid", (lc,) ).fetchall() def meanings(self, expr_uid, expr_tt): """ Return a list of meanings for an expression. :param expr_uid: the expression's language variety, as a seven-character uniform identifier. :param expr_tt: the expression's text. :return: a list of Meaning objects. :rtype: list(Meaning) """ expr_lv = self._uid_lv[expr_uid] mn_info = {} for i in self._c.execute(self.MEANING_Q, (expr_tt, expr_lv)): mn = i[0] uid = self._lv_uid[i[5]] if not mn in mn_info: mn_info[mn] = { "uq": i[1], "ap": i[2], "ui": i[3], "ex": {expr_uid: [expr_tt]}, } if not uid in mn_info[mn]["ex"]: mn_info[mn]["ex"][uid] = [] mn_info[mn]["ex"][uid].append(i[4]) return [Meaning(mn, mn_info[mn]) for mn in mn_info] def translations(self, from_uid, from_tt, to_uid): """ Return a list of translations for an expression into a single language variety. :param from_uid: the source expression's language variety, as a seven-character uniform identifier. :param from_tt: the source expression's text. :param to_uid: the target language variety, as a seven-character uniform identifier. :return: a list of translation tuples. 
The first element is the expression text and the second element is the translation quality. :rtype: list(tuple) """ from_lv = self._uid_lv[from_uid] to_lv = self._uid_lv[to_uid] return self._c.execute(self.TRANSLATION_Q, (from_lv, from_tt, to_lv)).fetchall() class Meaning(dict): """ Represents a single PanLex meaning. A meaning is a translation set derived from a single source. """ def __init__(self, mn, attr): super().__init__(**attr) self["mn"] = mn def id(self): """ :return: the meaning's id. :rtype: int """ return self["mn"] def quality(self): """ :return: the meaning's source's quality (0=worst, 9=best). :rtype: int """ return self["uq"] def source(self): """ :return: the meaning's source id. :rtype: int """ return self["ap"] def source_group(self): """ :return: the meaning's source group id. :rtype: int """ return self["ui"] def expressions(self): """ :return: the meaning's expressions as a dictionary whose keys are language variety uniform identifiers and whose values are lists of expression texts. :rtype: dict """ return self["ex"] nltk-3.7/nltk/corpus/reader/panlex_swadesh.py000066400000000000000000000061701420073152400214410ustar00rootroot00000000000000# Natural Language Toolkit: Word List Corpus Reader # # Copyright (C) 2001-2022 NLTK Project # Author: Steven Bird # Edward Loper # URL: # For license information, see LICENSE.TXT import re from collections import defaultdict, namedtuple from nltk.corpus.reader.api import * from nltk.corpus.reader.util import * from nltk.corpus.reader.wordlist import WordListCorpusReader from nltk.tokenize import line_tokenize PanlexLanguage = namedtuple( "PanlexLanguage", [ "panlex_uid", # (1) PanLex UID "iso639", # (2) ISO 639 language code "iso639_type", # (3) ISO 639 language type, see README "script", # (4) normal scripts of expressions "name", # (5) PanLex default name "langvar_uid", # (6) UID of the language variety in which the default name is an expression ], ) class PanlexSwadeshCorpusReader(WordListCorpusReader): """ This is a class to read the PanLex Swadesh list from David Kamholz, Jonathan Pool, and Susan M. Colowick (2014). PanLex: Building a Resource for Panlingual Lexical Translation. In LREC. http://www.lrec-conf.org/proceedings/lrec2014/pdf/1029_Paper.pdf License: CC0 1.0 Universal https://creativecommons.org/publicdomain/zero/1.0/legalcode """ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) # Find the swadesh size using the fileids' path. self.swadesh_size = re.match(r"swadesh([0-9].*)\/", self.fileids()[0]).group(1) self._languages = {lang.panlex_uid: lang for lang in self.get_languages()} self._macro_langauges = self.get_macrolanguages() def license(self): return "CC0 1.0 Universal" def language_codes(self): return self._languages.keys() def get_languages(self): for line in self.raw(f"langs{self.swadesh_size}.txt").split("\n"): if not line.strip(): # Skip empty lines. 
continue yield PanlexLanguage(*line.strip().split("\t")) def get_macrolanguages(self): macro_langauges = defaultdict(list) for lang in self._languages.values(): macro_langauges[lang.iso639].append(lang.panlex_uid) return macro_langauges def words_by_lang(self, lang_code): """ :return: a list of list(str) """ fileid = f"swadesh{self.swadesh_size}/{lang_code}.txt" return [concept.split("\t") for concept in self.words(fileid)] def words_by_iso639(self, iso63_code): """ :return: a list of list(str) """ fileids = [ f"swadesh{self.swadesh_size}/{lang_code}.txt" for lang_code in self._macro_langauges[iso63_code] ] return [ concept.split("\t") for fileid in fileids for concept in self.words(fileid) ] def entries(self, fileids=None): """ :return: a tuple of words for the specified fileids. """ if not fileids: fileids = self.fileids() wordlists = [self.words(f) for f in fileids] return list(zip(*wordlists)) nltk-3.7/nltk/corpus/reader/pl196x.py000066400000000000000000000272511420073152400175020ustar00rootroot00000000000000# Natural Language Toolkit: # # Copyright (C) 2001-2022 NLTK Project # Author: Piotr Kasprzyk # URL: # For license information, see LICENSE.TXT from nltk.corpus.reader.api import * from nltk.corpus.reader.xmldocs import XMLCorpusReader PARA = re.compile(r"]*){0,1}>(.*?)

    ") SENT = re.compile(r"]*){0,1}>(.*?)
    ") TAGGEDWORD = re.compile(r"<([wc](?: [^>]*){0,1}>)(.*?)") WORD = re.compile(r"<[wc](?: [^>]*){0,1}>(.*?)") TYPE = re.compile(r'type="(.*?)"') ANA = re.compile(r'ana="(.*?)"') TEXTID = re.compile(r'text id="(.*?)"') class TEICorpusView(StreamBackedCorpusView): def __init__( self, corpus_file, tagged, group_by_sent, group_by_para, tagset=None, head_len=0, textids=None, ): self._tagged = tagged self._textids = textids self._group_by_sent = group_by_sent self._group_by_para = group_by_para # WARNING -- skip header StreamBackedCorpusView.__init__(self, corpus_file, startpos=head_len) _pagesize = 4096 def read_block(self, stream): block = stream.readlines(self._pagesize) block = concat(block) while (block.count(" block.count("")) or block.count( "") + len("") block = block[:beg] + block[beg + end :] output = [] for para_str in PARA.findall(block): para = [] for sent_str in SENT.findall(para_str): if not self._tagged: sent = WORD.findall(sent_str) else: sent = list(map(self._parse_tag, TAGGEDWORD.findall(sent_str))) if self._group_by_sent: para.append(sent) else: para.extend(sent) if self._group_by_para: output.append(para) else: output.extend(para) return output def _parse_tag(self, tag_word_tuple): (tag, word) = tag_word_tuple if tag.startswith("w"): tag = ANA.search(tag).group(1) else: # tag.startswith('c') tag = TYPE.search(tag).group(1) return word, tag class Pl196xCorpusReader(CategorizedCorpusReader, XMLCorpusReader): head_len = 2770 def __init__(self, *args, **kwargs): if "textid_file" in kwargs: self._textids = kwargs["textid_file"] else: self._textids = None XMLCorpusReader.__init__(self, *args) CategorizedCorpusReader.__init__(self, kwargs) self._init_textids() def _init_textids(self): self._f2t = defaultdict(list) self._t2f = defaultdict(list) if self._textids is not None: with open(self._textids) as fp: for line in fp: line = line.strip() file_id, text_ids = line.split(" ", 1) if file_id not in self.fileids(): raise ValueError( "In text_id mapping file %s: %s not found" % (self._textids, file_id) ) for text_id in text_ids.split(self._delimiter): self._add_textids(file_id, text_id) def _add_textids(self, file_id, text_id): self._f2t[file_id].append(text_id) self._t2f[text_id].append(file_id) def _resolve(self, fileids, categories, textids=None): tmp = None if ( len( list( filter( lambda accessor: accessor is None, (fileids, categories, textids), ) ) ) != 1 ): raise ValueError( "Specify exactly one of: fileids, " "categories or textids" ) if fileids is not None: return fileids, None if categories is not None: return self.fileids(categories), None if textids is not None: if isinstance(textids, str): textids = [textids] files = sum((self._t2f[t] for t in textids), []) tdict = dict() for f in files: tdict[f] = set(self._f2t[f]) & set(textids) return files, tdict def decode_tag(self, tag): # to be implemented return tag def textids(self, fileids=None, categories=None): """ In the pl196x corpus each category is stored in single file and thus both methods provide identical functionality. In order to accommodate finer granularity, a non-standard textids() method was implemented. All the main functions can be supplied with a list of required chunks---giving much more control to the user. 
""" fileids, _ = self._resolve(fileids, categories) if fileids is None: return sorted(self._t2f) if isinstance(fileids, str): fileids = [fileids] return sorted(sum((self._f2t[d] for d in fileids), [])) def words(self, fileids=None, categories=None, textids=None): fileids, textids = self._resolve(fileids, categories, textids) if fileids is None: fileids = self._fileids elif isinstance(fileids, str): fileids = [fileids] if textids: return concat( [ TEICorpusView( self.abspath(fileid), False, False, False, head_len=self.head_len, textids=textids[fileid], ) for fileid in fileids ] ) else: return concat( [ TEICorpusView( self.abspath(fileid), False, False, False, head_len=self.head_len, ) for fileid in fileids ] ) def sents(self, fileids=None, categories=None, textids=None): fileids, textids = self._resolve(fileids, categories, textids) if fileids is None: fileids = self._fileids elif isinstance(fileids, str): fileids = [fileids] if textids: return concat( [ TEICorpusView( self.abspath(fileid), False, True, False, head_len=self.head_len, textids=textids[fileid], ) for fileid in fileids ] ) else: return concat( [ TEICorpusView( self.abspath(fileid), False, True, False, head_len=self.head_len ) for fileid in fileids ] ) def paras(self, fileids=None, categories=None, textids=None): fileids, textids = self._resolve(fileids, categories, textids) if fileids is None: fileids = self._fileids elif isinstance(fileids, str): fileids = [fileids] if textids: return concat( [ TEICorpusView( self.abspath(fileid), False, True, True, head_len=self.head_len, textids=textids[fileid], ) for fileid in fileids ] ) else: return concat( [ TEICorpusView( self.abspath(fileid), False, True, True, head_len=self.head_len ) for fileid in fileids ] ) def tagged_words(self, fileids=None, categories=None, textids=None): fileids, textids = self._resolve(fileids, categories, textids) if fileids is None: fileids = self._fileids elif isinstance(fileids, str): fileids = [fileids] if textids: return concat( [ TEICorpusView( self.abspath(fileid), True, False, False, head_len=self.head_len, textids=textids[fileid], ) for fileid in fileids ] ) else: return concat( [ TEICorpusView( self.abspath(fileid), True, False, False, head_len=self.head_len ) for fileid in fileids ] ) def tagged_sents(self, fileids=None, categories=None, textids=None): fileids, textids = self._resolve(fileids, categories, textids) if fileids is None: fileids = self._fileids elif isinstance(fileids, str): fileids = [fileids] if textids: return concat( [ TEICorpusView( self.abspath(fileid), True, True, False, head_len=self.head_len, textids=textids[fileid], ) for fileid in fileids ] ) else: return concat( [ TEICorpusView( self.abspath(fileid), True, True, False, head_len=self.head_len ) for fileid in fileids ] ) def tagged_paras(self, fileids=None, categories=None, textids=None): fileids, textids = self._resolve(fileids, categories, textids) if fileids is None: fileids = self._fileids elif isinstance(fileids, str): fileids = [fileids] if textids: return concat( [ TEICorpusView( self.abspath(fileid), True, True, True, head_len=self.head_len, textids=textids[fileid], ) for fileid in fileids ] ) else: return concat( [ TEICorpusView( self.abspath(fileid), True, True, True, head_len=self.head_len ) for fileid in fileids ] ) def xml(self, fileids=None, categories=None): fileids, _ = self._resolve(fileids, categories) if len(fileids) == 1: return XMLCorpusReader.xml(self, fileids[0]) else: raise TypeError("Expected a single file") 
nltk-3.7/nltk/corpus/reader/plaintext.py000066400000000000000000000200451420073152400204410ustar00rootroot00000000000000# Natural Language Toolkit: Plaintext Corpus Reader # # Copyright (C) 2001-2022 NLTK Project # Author: Steven Bird # Edward Loper # Nitin Madnani # URL: # For license information, see LICENSE.TXT """ A reader for corpora that consist of plaintext documents. """ import nltk.data from nltk.corpus.reader.api import * from nltk.corpus.reader.util import * from nltk.tokenize import * class PlaintextCorpusReader(CorpusReader): """ Reader for corpora that consist of plaintext documents. Paragraphs are assumed to be split using blank lines. Sentences and words can be tokenized using the default tokenizers, or by custom tokenizers specified as parameters to the constructor. This corpus reader can be customized (e.g., to skip preface sections of specific document formats) by creating a subclass and overriding the ``CorpusView`` class variable. """ CorpusView = StreamBackedCorpusView """The corpus view class used by this reader. Subclasses of ``PlaintextCorpusReader`` may specify alternative corpus view classes (e.g., to skip the preface sections of documents.)""" def __init__( self, root, fileids, word_tokenizer=WordPunctTokenizer(), sent_tokenizer=nltk.data.LazyLoader("tokenizers/punkt/english.pickle"), para_block_reader=read_blankline_block, encoding="utf8", ): r""" Construct a new plaintext corpus reader for a set of documents located at the given root directory. Example usage: >>> root = '/usr/local/share/nltk_data/corpora/webtext/' >>> reader = PlaintextCorpusReader(root, '.*\.txt') # doctest: +SKIP :param root: The root directory for this corpus. :param fileids: A list or regexp specifying the fileids in this corpus. :param word_tokenizer: Tokenizer for breaking sentences or paragraphs into words. :param sent_tokenizer: Tokenizer for breaking paragraphs into words. :param para_block_reader: The block reader used to divide the corpus into paragraph blocks. """ CorpusReader.__init__(self, root, fileids, encoding) self._word_tokenizer = word_tokenizer self._sent_tokenizer = sent_tokenizer self._para_block_reader = para_block_reader def words(self, fileids=None): """ :return: the given file(s) as a list of words and punctuation symbols. :rtype: list(str) """ return concat( [ self.CorpusView(path, self._read_word_block, encoding=enc) for (path, enc, fileid) in self.abspaths(fileids, True, True) ] ) def sents(self, fileids=None): """ :return: the given file(s) as a list of sentences or utterances, each encoded as a list of word strings. :rtype: list(list(str)) """ if self._sent_tokenizer is None: raise ValueError("No sentence tokenizer for this corpus") return concat( [ self.CorpusView(path, self._read_sent_block, encoding=enc) for (path, enc, fileid) in self.abspaths(fileids, True, True) ] ) def paras(self, fileids=None): """ :return: the given file(s) as a list of paragraphs, each encoded as a list of sentences, which are in turn encoded as lists of word strings. :rtype: list(list(list(str))) """ if self._sent_tokenizer is None: raise ValueError("No sentence tokenizer for this corpus") return concat( [ self.CorpusView(path, self._read_para_block, encoding=enc) for (path, enc, fileid) in self.abspaths(fileids, True, True) ] ) def _read_word_block(self, stream): words = [] for i in range(20): # Read 20 lines at a time. 
words.extend(self._word_tokenizer.tokenize(stream.readline())) return words def _read_sent_block(self, stream): sents = [] for para in self._para_block_reader(stream): sents.extend( [ self._word_tokenizer.tokenize(sent) for sent in self._sent_tokenizer.tokenize(para) ] ) return sents def _read_para_block(self, stream): paras = [] for para in self._para_block_reader(stream): paras.append( [ self._word_tokenizer.tokenize(sent) for sent in self._sent_tokenizer.tokenize(para) ] ) return paras class CategorizedPlaintextCorpusReader(CategorizedCorpusReader, PlaintextCorpusReader): """ A reader for plaintext corpora whose documents are divided into categories based on their file identifiers. """ def __init__(self, *args, **kwargs): """ Initialize the corpus reader. Categorization arguments (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to the ``CategorizedCorpusReader`` constructor. The remaining arguments are passed to the ``PlaintextCorpusReader`` constructor. """ CategorizedCorpusReader.__init__(self, kwargs) PlaintextCorpusReader.__init__(self, *args, **kwargs) # FIXME: Is there a better way? How to not hardcode this? # Possibly, add a language kwargs to CategorizedPlaintextCorpusReader to # override the `sent_tokenizer`. class PortugueseCategorizedPlaintextCorpusReader(CategorizedPlaintextCorpusReader): def __init__(self, *args, **kwargs): CategorizedCorpusReader.__init__(self, kwargs) kwargs["sent_tokenizer"] = nltk.data.LazyLoader( "tokenizers/punkt/portuguese.pickle" ) PlaintextCorpusReader.__init__(self, *args, **kwargs) class EuroparlCorpusReader(PlaintextCorpusReader): """ Reader for Europarl corpora that consist of plaintext documents. Documents are divided into chapters instead of paragraphs as for regular plaintext documents. Chapters are separated using blank lines. Everything is inherited from ``PlaintextCorpusReader`` except that: - Since the corpus is pre-processed and pre-tokenized, the word tokenizer should just split the line at whitespaces. - For the same reason, the sentence tokenizer should just split the paragraph at line breaks. - There is a new 'chapters()' method that returns chapters instead instead of paragraphs. - The 'paras()' method inherited from PlaintextCorpusReader is made non-functional to remove any confusion between chapters and paragraphs for Europarl. """ def _read_word_block(self, stream): words = [] for i in range(20): # Read 20 lines at a time. words.extend(stream.readline().split()) return words def _read_sent_block(self, stream): sents = [] for para in self._para_block_reader(stream): sents.extend([sent.split() for sent in para.splitlines()]) return sents def _read_para_block(self, stream): paras = [] for para in self._para_block_reader(stream): paras.append([sent.split() for sent in para.splitlines()]) return paras def chapters(self, fileids=None): """ :return: the given file(s) as a list of chapters, each encoded as a list of sentences, which are in turn encoded as lists of word strings. :rtype: list(list(list(str))) """ return concat( [ self.CorpusView(fileid, self._read_para_block, encoding=enc) for (fileid, enc) in self.abspaths(fileids, True) ] ) def paras(self, fileids=None): raise NotImplementedError( "The Europarl corpus reader does not support paragraphs. Please use chapters() instead." 
) nltk-3.7/nltk/corpus/reader/ppattach.py000066400000000000000000000053701420073152400202410ustar00rootroot00000000000000# Natural Language Toolkit: PP Attachment Corpus Reader # # Copyright (C) 2001-2022 NLTK Project # Author: Steven Bird # Edward Loper # URL: # For license information, see LICENSE.TXT """ Read lines from the Prepositional Phrase Attachment Corpus. The PP Attachment Corpus contains several files having the format: sentence_id verb noun1 preposition noun2 attachment For example: 42960 gives authority to administration V 46742 gives inventors of microchip N The PP attachment is to the verb phrase (V) or noun phrase (N), i.e.: (VP gives (NP authority) (PP to administration)) (VP gives (NP inventors (PP of microchip))) The corpus contains the following files: training: training set devset: development test set, used for algorithm development. test: test set, used to report results bitstrings: word classes derived from Mutual Information Clustering for the Wall Street Journal. Ratnaparkhi, Adwait (1994). A Maximum Entropy Model for Prepositional Phrase Attachment. Proceedings of the ARPA Human Language Technology Conference. [http://www.cis.upenn.edu/~adwait/papers/hlt94.ps] The PP Attachment Corpus is distributed with NLTK with the permission of the author. """ from nltk.corpus.reader.api import * from nltk.corpus.reader.util import * class PPAttachment: def __init__(self, sent, verb, noun1, prep, noun2, attachment): self.sent = sent self.verb = verb self.noun1 = noun1 self.prep = prep self.noun2 = noun2 self.attachment = attachment def __repr__(self): return ( "PPAttachment(sent=%r, verb=%r, noun1=%r, prep=%r, " "noun2=%r, attachment=%r)" % (self.sent, self.verb, self.noun1, self.prep, self.noun2, self.attachment) ) class PPAttachmentCorpusReader(CorpusReader): """ sentence_id verb noun1 preposition noun2 attachment """ def attachments(self, fileids): return concat( [ StreamBackedCorpusView(fileid, self._read_obj_block, encoding=enc) for (fileid, enc) in self.abspaths(fileids, True) ] ) def tuples(self, fileids): return concat( [ StreamBackedCorpusView(fileid, self._read_tuple_block, encoding=enc) for (fileid, enc) in self.abspaths(fileids, True) ] ) def _read_tuple_block(self, stream): line = stream.readline() if line: return [tuple(line.split())] else: return [] def _read_obj_block(self, stream): line = stream.readline() if line: return [PPAttachment(*line.split())] else: return [] nltk-3.7/nltk/corpus/reader/propbank.py000066400000000000000000000415501420073152400202510ustar00rootroot00000000000000# Natural Language Toolkit: PropBank Corpus Reader # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # URL: # For license information, see LICENSE.TXT import re from functools import total_ordering from xml.etree import ElementTree from nltk.corpus.reader.api import * from nltk.corpus.reader.util import * from nltk.internals import raise_unorderable_types from nltk.tree import Tree class PropbankCorpusReader(CorpusReader): """ Corpus reader for the propbank corpus, which augments the Penn Treebank with information about the predicate argument structure of every verb instance. The corpus consists of two parts: the predicate-argument annotations themselves, and a set of "frameset files" which define the argument labels used by the annotations, on a per-verb basis. Each "frameset file" contains one or more predicates, such as ``'turn'`` or ``'turn_on'``, each of which is divided into coarse-grained word senses called "rolesets". 
For each "roleset", the frameset file provides descriptions of the argument roles, along with examples. """ def __init__( self, root, propfile, framefiles="", verbsfile=None, parse_fileid_xform=None, parse_corpus=None, encoding="utf8", ): """ :param root: The root directory for this corpus. :param propfile: The name of the file containing the predicate- argument annotations (relative to ``root``). :param framefiles: A list or regexp specifying the frameset fileids for this corpus. :param parse_fileid_xform: A transform that should be applied to the fileids in this corpus. This should be a function of one argument (a fileid) that returns a string (the new fileid). :param parse_corpus: The corpus containing the parse trees corresponding to this corpus. These parse trees are necessary to resolve the tree pointers used by propbank. """ # If framefiles is specified as a regexp, expand it. if isinstance(framefiles, str): framefiles = find_corpus_fileids(root, framefiles) framefiles = list(framefiles) # Initialize the corpus reader. CorpusReader.__init__(self, root, [propfile, verbsfile] + framefiles, encoding) # Record our frame fileids & prop file. self._propfile = propfile self._framefiles = framefiles self._verbsfile = verbsfile self._parse_fileid_xform = parse_fileid_xform self._parse_corpus = parse_corpus def instances(self, baseform=None): """ :return: a corpus view that acts as a list of ``PropBankInstance`` objects, one for each noun in the corpus. """ kwargs = {} if baseform is not None: kwargs["instance_filter"] = lambda inst: inst.baseform == baseform return StreamBackedCorpusView( self.abspath(self._propfile), lambda stream: self._read_instance_block(stream, **kwargs), encoding=self.encoding(self._propfile), ) def lines(self): """ :return: a corpus view that acts as a list of strings, one for each line in the predicate-argument annotation file. """ return StreamBackedCorpusView( self.abspath(self._propfile), read_line_block, encoding=self.encoding(self._propfile), ) def roleset(self, roleset_id): """ :return: the xml description for the given roleset. """ baseform = roleset_id.split(".")[0] framefile = "frames/%s.xml" % baseform if framefile not in self._framefiles: raise ValueError("Frameset file for %s not found" % roleset_id) # n.b.: The encoding for XML fileids is specified by the file # itself; so we ignore self._encoding here. with self.abspath(framefile).open() as fp: etree = ElementTree.parse(fp).getroot() for roleset in etree.findall("predicate/roleset"): if roleset.attrib["id"] == roleset_id: return roleset raise ValueError(f"Roleset {roleset_id} not found in {framefile}") def rolesets(self, baseform=None): """ :return: list of xml descriptions for rolesets. """ if baseform is not None: framefile = "frames/%s.xml" % baseform if framefile not in self._framefiles: raise ValueError("Frameset file for %s not found" % baseform) framefiles = [framefile] else: framefiles = self._framefiles rsets = [] for framefile in framefiles: # n.b.: The encoding for XML fileids is specified by the file # itself; so we ignore self._encoding here. with self.abspath(framefile).open() as fp: etree = ElementTree.parse(fp).getroot() rsets.append(etree.findall("predicate/roleset")) return LazyConcatenation(rsets) def verbs(self): """ :return: a corpus view that acts as a list of all verb lemmas in this corpus (from the verbs.txt file). 
""" return StreamBackedCorpusView( self.abspath(self._verbsfile), read_line_block, encoding=self.encoding(self._verbsfile), ) def _read_instance_block(self, stream, instance_filter=lambda inst: True): block = [] # Read 100 at a time. for i in range(100): line = stream.readline().strip() if line: inst = PropbankInstance.parse( line, self._parse_fileid_xform, self._parse_corpus ) if instance_filter(inst): block.append(inst) return block ###################################################################### # { Propbank Instance & related datatypes ###################################################################### class PropbankInstance: def __init__( self, fileid, sentnum, wordnum, tagger, roleset, inflection, predicate, arguments, parse_corpus=None, ): self.fileid = fileid """The name of the file containing the parse tree for this instance's sentence.""" self.sentnum = sentnum """The sentence number of this sentence within ``fileid``. Indexing starts from zero.""" self.wordnum = wordnum """The word number of this instance's predicate within its containing sentence. Word numbers are indexed starting from zero, and include traces and other empty parse elements.""" self.tagger = tagger """An identifier for the tagger who tagged this instance; or ``'gold'`` if this is an adjuticated instance.""" self.roleset = roleset """The name of the roleset used by this instance's predicate. Use ``propbank.roleset() `` to look up information about the roleset.""" self.inflection = inflection """A ``PropbankInflection`` object describing the inflection of this instance's predicate.""" self.predicate = predicate """A ``PropbankTreePointer`` indicating the position of this instance's predicate within its containing sentence.""" self.arguments = tuple(arguments) """A list of tuples (argloc, argid), specifying the location and identifier for each of the predicate's argument in the containing sentence. Argument identifiers are strings such as ``'ARG0'`` or ``'ARGM-TMP'``. This list does *not* contain the predicate.""" self.parse_corpus = parse_corpus """A corpus reader for the parse trees corresponding to the instances in this propbank corpus.""" @property def baseform(self): """The baseform of the predicate.""" return self.roleset.split(".")[0] @property def sensenumber(self): """The sense number of the predicate.""" return self.roleset.split(".")[1] @property def predid(self): """Identifier of the predicate.""" return "rel" def __repr__(self): return "".format( self.fileid, self.sentnum, self.wordnum, ) def __str__(self): s = "{} {} {} {} {} {}".format( self.fileid, self.sentnum, self.wordnum, self.tagger, self.roleset, self.inflection, ) items = self.arguments + ((self.predicate, "rel"),) for (argloc, argid) in sorted(items): s += f" {argloc}-{argid}" return s def _get_tree(self): if self.parse_corpus is None: return None if self.fileid not in self.parse_corpus.fileids(): return None return self.parse_corpus.parsed_sents(self.fileid)[self.sentnum] tree = property( _get_tree, doc=""" The parse tree corresponding to this instance, or None if the corresponding tree is not available.""", ) @staticmethod def parse(s, parse_fileid_xform=None, parse_corpus=None): pieces = s.split() if len(pieces) < 7: raise ValueError("Badly formatted propbank line: %r" % s) # Divide the line into its basic pieces. 
(fileid, sentnum, wordnum, tagger, roleset, inflection) = pieces[:6] rel = [p for p in pieces[6:] if p.endswith("-rel")] args = [p for p in pieces[6:] if not p.endswith("-rel")] if len(rel) != 1: raise ValueError("Badly formatted propbank line: %r" % s) # Apply the fileid selector, if any. if parse_fileid_xform is not None: fileid = parse_fileid_xform(fileid) # Convert sentence & word numbers to ints. sentnum = int(sentnum) wordnum = int(wordnum) # Parse the inflection inflection = PropbankInflection.parse(inflection) # Parse the predicate location. predicate = PropbankTreePointer.parse(rel[0][:-4]) # Parse the arguments. arguments = [] for arg in args: argloc, argid = arg.split("-", 1) arguments.append((PropbankTreePointer.parse(argloc), argid)) # Put it all together. return PropbankInstance( fileid, sentnum, wordnum, tagger, roleset, inflection, predicate, arguments, parse_corpus, ) class PropbankPointer: """ A pointer used by propbank to identify one or more constituents in a parse tree. ``PropbankPointer`` is an abstract base class with three concrete subclasses: - ``PropbankTreePointer`` is used to point to single constituents. - ``PropbankSplitTreePointer`` is used to point to 'split' constituents, which consist of a sequence of two or more ``PropbankTreePointer`` pointers. - ``PropbankChainTreePointer`` is used to point to entire trace chains in a tree. It consists of a sequence of pieces, which can be ``PropbankTreePointer`` or ``PropbankSplitTreePointer`` pointers. """ def __init__(self): if self.__class__ == PropbankPointer: raise NotImplementedError() class PropbankChainTreePointer(PropbankPointer): def __init__(self, pieces): self.pieces = pieces """A list of the pieces that make up this chain. Elements may be either ``PropbankSplitTreePointer`` or ``PropbankTreePointer`` pointers.""" def __str__(self): return "*".join("%s" % p for p in self.pieces) def __repr__(self): return "" % self def select(self, tree): if tree is None: raise ValueError("Parse tree not available") return Tree("*CHAIN*", [p.select(tree) for p in self.pieces]) class PropbankSplitTreePointer(PropbankPointer): def __init__(self, pieces): self.pieces = pieces """A list of the pieces that make up this chain. Elements are all ``PropbankTreePointer`` pointers.""" def __str__(self): return ",".join("%s" % p for p in self.pieces) def __repr__(self): return "" % self def select(self, tree): if tree is None: raise ValueError("Parse tree not available") return Tree("*SPLIT*", [p.select(tree) for p in self.pieces]) @total_ordering class PropbankTreePointer(PropbankPointer): """ wordnum:height*wordnum:height*... wordnum:height, """ def __init__(self, wordnum, height): self.wordnum = wordnum self.height = height @staticmethod def parse(s): # Deal with chains (xx*yy*zz) pieces = s.split("*") if len(pieces) > 1: return PropbankChainTreePointer( [PropbankTreePointer.parse(elt) for elt in pieces] ) # Deal with split args (xx,yy,zz) pieces = s.split(",") if len(pieces) > 1: return PropbankSplitTreePointer( [PropbankTreePointer.parse(elt) for elt in pieces] ) # Deal with normal pointers. 
pieces = s.split(":") if len(pieces) != 2: raise ValueError("bad propbank pointer %r" % s) return PropbankTreePointer(int(pieces[0]), int(pieces[1])) def __str__(self): return f"{self.wordnum}:{self.height}" def __repr__(self): return "PropbankTreePointer(%d, %d)" % (self.wordnum, self.height) def __eq__(self, other): while isinstance(other, (PropbankChainTreePointer, PropbankSplitTreePointer)): other = other.pieces[0] if not isinstance(other, PropbankTreePointer): return self is other return self.wordnum == other.wordnum and self.height == other.height def __ne__(self, other): return not self == other def __lt__(self, other): while isinstance(other, (PropbankChainTreePointer, PropbankSplitTreePointer)): other = other.pieces[0] if not isinstance(other, PropbankTreePointer): return id(self) < id(other) return (self.wordnum, -self.height) < (other.wordnum, -other.height) def select(self, tree): if tree is None: raise ValueError("Parse tree not available") return tree[self.treepos(tree)] def treepos(self, tree): """ Convert this pointer to a standard 'tree position' pointer, given that it points to the given tree. """ if tree is None: raise ValueError("Parse tree not available") stack = [tree] treepos = [] wordnum = 0 while True: # tree node: if isinstance(stack[-1], Tree): # Select the next child. if len(treepos) < len(stack): treepos.append(0) else: treepos[-1] += 1 # Update the stack. if treepos[-1] < len(stack[-1]): stack.append(stack[-1][treepos[-1]]) else: # End of node's child list: pop up a level. stack.pop() treepos.pop() # word node: else: if wordnum == self.wordnum: return tuple(treepos[: len(treepos) - self.height - 1]) else: wordnum += 1 stack.pop() class PropbankInflection: # { Inflection Form INFINITIVE = "i" GERUND = "g" PARTICIPLE = "p" FINITE = "v" # { Inflection Tense FUTURE = "f" PAST = "p" PRESENT = "n" # { Inflection Aspect PERFECT = "p" PROGRESSIVE = "o" PERFECT_AND_PROGRESSIVE = "b" # { Inflection Person THIRD_PERSON = "3" # { Inflection Voice ACTIVE = "a" PASSIVE = "p" # { Inflection NONE = "-" # } def __init__(self, form="-", tense="-", aspect="-", person="-", voice="-"): self.form = form self.tense = tense self.aspect = aspect self.person = person self.voice = voice def __str__(self): return self.form + self.tense + self.aspect + self.person + self.voice def __repr__(self): return "" % self _VALIDATE = re.compile(r"[igpv\-][fpn\-][pob\-][3\-][ap\-]$") @staticmethod def parse(s): if not isinstance(s, str): raise TypeError("expected a string") if len(s) != 5 or not PropbankInflection._VALIDATE.match(s): raise ValueError("Bad propbank inflection string %r" % s) return PropbankInflection(*s) nltk-3.7/nltk/corpus/reader/pros_cons.py000066400000000000000000000111721420073152400204370ustar00rootroot00000000000000# Natural Language Toolkit: Pros and Cons Corpus Reader # # Copyright (C) 2001-2022 NLTK Project # Author: Pierpaolo Pantone <24alsecondo@gmail.com> # URL: # For license information, see LICENSE.TXT """ CorpusReader for the Pros and Cons dataset. - Pros and Cons dataset information - Contact: Bing Liu, liub@cs.uic.edu https://www.cs.uic.edu/~liub Distributed with permission. Related papers: - Murthy Ganapathibhotla and Bing Liu. "Mining Opinions in Comparative Sentences". Proceedings of the 22nd International Conference on Computational Linguistics (Coling-2008), Manchester, 18-22 August, 2008. - Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing and Comparing Opinions on the Web". 
Proceedings of the 14th international World Wide Web conference (WWW-2005), May 10-14, 2005, in Chiba, Japan. """ import re from nltk.corpus.reader.api import * from nltk.tokenize import * class ProsConsCorpusReader(CategorizedCorpusReader, CorpusReader): """ Reader for the Pros and Cons sentence dataset. >>> from nltk.corpus import pros_cons >>> pros_cons.sents(categories='Cons') [['East', 'batteries', '!', 'On', '-', 'off', 'switch', 'too', 'easy', 'to', 'maneuver', '.'], ['Eats', '...', 'no', ',', 'GULPS', 'batteries'], ...] >>> pros_cons.words('IntegratedPros.txt') ['Easy', 'to', 'use', ',', 'economical', '!', ...] """ CorpusView = StreamBackedCorpusView def __init__( self, root, fileids, word_tokenizer=WordPunctTokenizer(), encoding="utf8", **kwargs ): """ :param root: The root directory for the corpus. :param fileids: a list or regexp specifying the fileids in the corpus. :param word_tokenizer: a tokenizer for breaking sentences or paragraphs into words. Default: `WhitespaceTokenizer` :param encoding: the encoding that should be used to read the corpus. :param kwargs: additional parameters passed to CategorizedCorpusReader. """ CorpusReader.__init__(self, root, fileids, encoding) CategorizedCorpusReader.__init__(self, kwargs) self._word_tokenizer = word_tokenizer def sents(self, fileids=None, categories=None): """ Return all sentences in the corpus or in the specified files/categories. :param fileids: a list or regexp specifying the ids of the files whose sentences have to be returned. :param categories: a list specifying the categories whose sentences have to be returned. :return: the given file(s) as a list of sentences. Each sentence is tokenized using the specified word_tokenizer. :rtype: list(list(str)) """ fileids = self._resolve(fileids, categories) if fileids is None: fileids = self._fileids elif isinstance(fileids, str): fileids = [fileids] return concat( [ self.CorpusView(path, self._read_sent_block, encoding=enc) for (path, enc, fileid) in self.abspaths(fileids, True, True) ] ) def words(self, fileids=None, categories=None): """ Return all words and punctuation symbols in the corpus or in the specified files/categories. :param fileids: a list or regexp specifying the ids of the files whose words have to be returned. :param categories: a list specifying the categories whose words have to be returned. :return: the given file(s) as a list of words and punctuation symbols. :rtype: list(str) """ fileids = self._resolve(fileids, categories) if fileids is None: fileids = self._fileids elif isinstance(fileids, str): fileids = [fileids] return concat( [ self.CorpusView(path, self._read_word_block, encoding=enc) for (path, enc, fileid) in self.abspaths(fileids, True, True) ] ) def _read_sent_block(self, stream): sents = [] for i in range(20): # Read 20 lines at a time. line = stream.readline() if not line: continue sent = re.match(r"^(?!\n)\s*<(Pros|Cons)>(.*)", line) if sent: sents.append(self._word_tokenizer.tokenize(sent.group(2).strip())) return sents def _read_word_block(self, stream): words = [] for sent in self._read_sent_block(stream): words.extend(sent) return words nltk-3.7/nltk/corpus/reader/reviews.py000066400000000000000000000272241420073152400201230ustar00rootroot00000000000000# Natural Language Toolkit: Product Reviews Corpus Reader # # Copyright (C) 2001-2022 NLTK Project # Author: Pierpaolo Pantone <24alsecondo@gmail.com> # URL: # For license information, see LICENSE.TXT """ CorpusReader for reviews corpora (syntax based on Customer Review Corpus). 
Customer Review Corpus information ================================== Annotated by: Minqing Hu and Bing Liu, 2004. Department of Computer Science University of Illinois at Chicago Contact: Bing Liu, liub@cs.uic.edu https://www.cs.uic.edu/~liub Distributed with permission. The "product_reviews_1" and "product_reviews_2" datasets respectively contain annotated customer reviews of 5 and 9 products from amazon.com. Related papers: - Minqing Hu and Bing Liu. "Mining and summarizing customer reviews". Proceedings of the ACM SIGKDD International Conference on Knowledge Discovery & Data Mining (KDD-04), 2004. - Minqing Hu and Bing Liu. "Mining Opinion Features in Customer Reviews". Proceedings of Nineteeth National Conference on Artificial Intelligence (AAAI-2004), 2004. - Xiaowen Ding, Bing Liu and Philip S. Yu. "A Holistic Lexicon-Based Appraoch to Opinion Mining." Proceedings of First ACM International Conference on Web Search and Data Mining (WSDM-2008), Feb 11-12, 2008, Stanford University, Stanford, California, USA. Symbols used in the annotated reviews: :[t]: the title of the review: Each [t] tag starts a review. :xxxx[+|-n]: xxxx is a product feature. :[+n]: Positive opinion, n is the opinion strength: 3 strongest, and 1 weakest. Note that the strength is quite subjective. You may want ignore it, but only considering + and - :[-n]: Negative opinion :##: start of each sentence. Each line is a sentence. :[u]: feature not appeared in the sentence. :[p]: feature not appeared in the sentence. Pronoun resolution is needed. :[s]: suggestion or recommendation. :[cc]: comparison with a competing product from a different brand. :[cs]: comparison with a competing product from the same brand. Note: Some of the files (e.g. "ipod.txt", "Canon PowerShot SD500.txt") do not provide separation between different reviews. This is due to the fact that the dataset was specifically designed for aspect/feature-based sentiment analysis, for which sentence-level annotation is sufficient. For document- level classification and analysis, this peculiarity should be taken into consideration. """ import re from nltk.corpus.reader.api import * from nltk.tokenize import * TITLE = re.compile(r"^\[t\](.*)$") # [t] Title FEATURES = re.compile( r"((?:(?:\w+\s)+)?\w+)\[((?:\+|\-)\d)\]" ) # find 'feature' in feature[+3] NOTES = re.compile(r"\[(?!t)(p|u|s|cc|cs)\]") # find 'p' in camera[+2][p] SENT = re.compile(r"##(.*)$") # find tokenized sentence class Review: """ A Review is the main block of a ReviewsCorpusReader. """ def __init__(self, title=None, review_lines=None): """ :param title: the title of the review. :param review_lines: the list of the ReviewLines that belong to the Review. """ self.title = title if review_lines is None: self.review_lines = [] else: self.review_lines = review_lines def add_line(self, review_line): """ Add a line (ReviewLine) to the review. :param review_line: a ReviewLine instance that belongs to the Review. """ assert isinstance(review_line, ReviewLine) self.review_lines.append(review_line) def features(self): """ Return a list of features in the review. Each feature is a tuple made of the specific item feature and the opinion strength about that feature. :return: all features of the review as a list of tuples (feat, score). :rtype: list(tuple) """ features = [] for review_line in self.review_lines: features.extend(review_line.features) return features def sents(self): """ Return all tokenized sentences in the review. :return: all sentences of the review as lists of tokens. 
:rtype: list(list(str)) """ return [review_line.sent for review_line in self.review_lines] def __repr__(self): return 'Review(title="{}", review_lines={})'.format( self.title, self.review_lines ) class ReviewLine: """ A ReviewLine represents a sentence of the review, together with (optional) annotations of its features and notes about the reviewed item. """ def __init__(self, sent, features=None, notes=None): self.sent = sent if features is None: self.features = [] else: self.features = features if notes is None: self.notes = [] else: self.notes = notes def __repr__(self): return "ReviewLine(features={}, notes={}, sent={})".format( self.features, self.notes, self.sent ) class ReviewsCorpusReader(CorpusReader): """ Reader for the Customer Review Data dataset by Hu, Liu (2004). Note: we are not applying any sentence tokenization at the moment, just word tokenization. >>> from nltk.corpus import product_reviews_1 >>> camera_reviews = product_reviews_1.reviews('Canon_G3.txt') >>> review = camera_reviews[0] >>> review.sents()[0] ['i', 'recently', 'purchased', 'the', 'canon', 'powershot', 'g3', 'and', 'am', 'extremely', 'satisfied', 'with', 'the', 'purchase', '.'] >>> review.features() [('canon powershot g3', '+3'), ('use', '+2'), ('picture', '+2'), ('picture quality', '+1'), ('picture quality', '+1'), ('camera', '+2'), ('use', '+2'), ('feature', '+1'), ('picture quality', '+3'), ('use', '+1'), ('option', '+1')] We can also reach the same information directly from the stream: >>> product_reviews_1.features('Canon_G3.txt') [('canon powershot g3', '+3'), ('use', '+2'), ...] We can compute stats for specific product features: >>> n_reviews = len([(feat,score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture']) >>> tot = sum([int(score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture']) >>> mean = tot / n_reviews >>> print(n_reviews, tot, mean) 15 24 1.6 """ CorpusView = StreamBackedCorpusView def __init__( self, root, fileids, word_tokenizer=WordPunctTokenizer(), encoding="utf8" ): """ :param root: The root directory for the corpus. :param fileids: a list or regexp specifying the fileids in the corpus. :param word_tokenizer: a tokenizer for breaking sentences or paragraphs into words. Default: `WordPunctTokenizer` :param encoding: the encoding that should be used to read the corpus. """ CorpusReader.__init__(self, root, fileids, encoding) self._word_tokenizer = word_tokenizer self._readme = "README.txt" def features(self, fileids=None): """ Return a list of features. Each feature is a tuple made of the specific item feature and the opinion strength about that feature. :param fileids: a list or regexp specifying the ids of the files whose features have to be returned. :return: all features for the item(s) in the given file(s). :rtype: list(tuple) """ if fileids is None: fileids = self._fileids elif isinstance(fileids, str): fileids = [fileids] return concat( [ self.CorpusView(fileid, self._read_features, encoding=enc) for (fileid, enc) in self.abspaths(fileids, True) ] ) def reviews(self, fileids=None): """ Return all the reviews as a list of Review objects. If `fileids` is specified, return all the reviews from each of the specified files. :param fileids: a list or regexp specifying the ids of the files whose reviews have to be returned. :return: the given file(s) as a list of reviews. 
""" if fileids is None: fileids = self._fileids return concat( [ self.CorpusView(fileid, self._read_review_block, encoding=enc) for (fileid, enc) in self.abspaths(fileids, True) ] ) def sents(self, fileids=None): """ Return all sentences in the corpus or in the specified files. :param fileids: a list or regexp specifying the ids of the files whose sentences have to be returned. :return: the given file(s) as a list of sentences, each encoded as a list of word strings. :rtype: list(list(str)) """ return concat( [ self.CorpusView(path, self._read_sent_block, encoding=enc) for (path, enc, fileid) in self.abspaths(fileids, True, True) ] ) def words(self, fileids=None): """ Return all words and punctuation symbols in the corpus or in the specified files. :param fileids: a list or regexp specifying the ids of the files whose words have to be returned. :return: the given file(s) as a list of words and punctuation symbols. :rtype: list(str) """ return concat( [ self.CorpusView(path, self._read_word_block, encoding=enc) for (path, enc, fileid) in self.abspaths(fileids, True, True) ] ) def _read_features(self, stream): features = [] for i in range(20): line = stream.readline() if not line: return features features.extend(re.findall(FEATURES, line)) return features def _read_review_block(self, stream): while True: line = stream.readline() if not line: return [] # end of file. title_match = re.match(TITLE, line) if title_match: review = Review( title=title_match.group(1).strip() ) # We create a new review break # Scan until we find another line matching the regexp, or EOF. while True: oldpos = stream.tell() line = stream.readline() # End of file: if not line: return [review] # Start of a new review: backup to just before it starts, and # return the review we've already collected. if re.match(TITLE, line): stream.seek(oldpos) return [review] # Anything else is part of the review line. feats = re.findall(FEATURES, line) notes = re.findall(NOTES, line) sent = re.findall(SENT, line) if sent: sent = self._word_tokenizer.tokenize(sent[0]) review_line = ReviewLine(sent=sent, features=feats, notes=notes) review.add_line(review_line) def _read_sent_block(self, stream): sents = [] for review in self._read_review_block(stream): sents.extend([sent for sent in review.sents()]) return sents def _read_word_block(self, stream): words = [] for i in range(20): # Read 20 lines at a time. line = stream.readline() sent = re.findall(SENT, line) if sent: words.extend(self._word_tokenizer.tokenize(sent[0])) return words nltk-3.7/nltk/corpus/reader/rte.py000066400000000000000000000110371420073152400172240ustar00rootroot00000000000000# Natural Language Toolkit: RTE Corpus Reader # # Copyright (C) 2001-2022 NLTK Project # Author: Ewan Klein # URL: # For license information, see LICENSE.TXT """ Corpus reader for the Recognizing Textual Entailment (RTE) Challenge Corpora. The files were taken from the RTE1, RTE2 and RTE3 datasets and the files were regularized. Filenames are of the form rte*_dev.xml and rte*_test.xml. The latter are the gold standard annotated files. Each entailment corpus is a list of 'text'/'hypothesis' pairs. The following example is taken from RTE3:: The sale was made to pay Yukos' US$ 27.5 billion tax bill, Yuganskneftegaz was originally sold for US$ 9.4 billion to a little known company Baikalfinansgroup which was later bought by the Russian state-owned oil company Rosneft . Baikalfinansgroup was sold to Rosneft. 
In order to provide globally unique IDs for each pair, a new attribute ``challenge`` has been added to the root element ``entailment-corpus`` of each file, taking values 1, 2 or 3. The GID is formatted 'm-n', where 'm' is the challenge number and 'n' is the pair ID. """ from nltk.corpus.reader.api import * from nltk.corpus.reader.util import * from nltk.corpus.reader.xmldocs import * def norm(value_string): """ Normalize the string value in an RTE pair's ``value`` or ``entailment`` attribute as an integer (1, 0). :param value_string: the label used to classify a text/hypothesis pair :type value_string: str :rtype: int """ valdict = {"TRUE": 1, "FALSE": 0, "YES": 1, "NO": 0} return valdict[value_string.upper()] class RTEPair: """ Container for RTE text-hypothesis pairs. The entailment relation is signalled by the ``value`` attribute in RTE1, and by ``entailment`` in RTE2 and RTE3. These both get mapped on to the ``entailment`` attribute of this class. """ def __init__( self, pair, challenge=None, id=None, text=None, hyp=None, value=None, task=None, length=None, ): """ :param challenge: version of the RTE challenge (i.e., RTE1, RTE2 or RTE3) :param id: identifier for the pair :param text: the text component of the pair :param hyp: the hypothesis component of the pair :param value: classification label for the pair :param task: attribute for the particular NLP task that the data was drawn from :param length: attribute for the length of the text of the pair """ self.challenge = challenge self.id = pair.attrib["id"] self.gid = f"{self.challenge}-{self.id}" self.text = pair[0].text self.hyp = pair[1].text if "value" in pair.attrib: self.value = norm(pair.attrib["value"]) elif "entailment" in pair.attrib: self.value = norm(pair.attrib["entailment"]) else: self.value = value if "task" in pair.attrib: self.task = pair.attrib["task"] else: self.task = task if "length" in pair.attrib: self.length = pair.attrib["length"] else: self.length = length def __repr__(self): if self.challenge: return f"" else: return "" % self.id class RTECorpusReader(XMLCorpusReader): """ Corpus reader for corpora in RTE challenges. This is just a wrapper around the XMLCorpusReader. See module docstring above for the expected structure of input documents. """ def _read_etree(self, doc): """ Map the XML input into an RTEPair. This uses the ``getiterator()`` method from the ElementTree package to find all the ```` elements. :param doc: a parsed XML document :rtype: list(RTEPair) """ try: challenge = doc.attrib["challenge"] except KeyError: challenge = None pairiter = doc.iter("pair") return [RTEPair(pair, challenge=challenge) for pair in pairiter] def pairs(self, fileids): """ Build a list of RTEPairs from a RTE corpus. :param fileids: a list of RTE corpus fileids :type: list :rtype: list(RTEPair) """ if isinstance(fileids, str): fileids = [fileids] return concat([self._read_etree(self.xml(fileid)) for fileid in fileids]) nltk-3.7/nltk/corpus/reader/semcor.py000066400000000000000000000262061420073152400177260ustar00rootroot00000000000000# Natural Language Toolkit: SemCor Corpus Reader # # Copyright (C) 2001-2022 NLTK Project # Author: Nathan Schneider # URL: # For license information, see LICENSE.TXT """ Corpus reader for the SemCor Corpus. """ __docformat__ = "epytext en" from nltk.corpus.reader.api import * from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView from nltk.tree import Tree class SemcorCorpusReader(XMLCorpusReader): """ Corpus reader for the SemCor Corpus. 
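A short usage sketch (assuming the ``semcor`` and ``wordnet`` data packages are installed and the reader is exposed as ``nltk.corpus.semcor``)::

    from nltk.corpus import semcor

    semcor.sents()[0]                    # plain tokens of the first sentence
    semcor.tagged_sents(tag='sem')[0]    # the same sentence as sense-tagged chunks
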
For access to the complete XML data structure, use the ``xml()`` method. For access to simple word lists and tagged word lists, use ``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``. """ def __init__(self, root, fileids, wordnet, lazy=True): XMLCorpusReader.__init__(self, root, fileids) self._lazy = lazy self._wordnet = wordnet def words(self, fileids=None): """ :return: the given file(s) as a list of words and punctuation symbols. :rtype: list(str) """ return self._items(fileids, "word", False, False, False) def chunks(self, fileids=None): """ :return: the given file(s) as a list of chunks, each of which is a list of words and punctuation symbols that form a unit. :rtype: list(list(str)) """ return self._items(fileids, "chunk", False, False, False) def tagged_chunks(self, fileids=None, tag=("pos" or "sem" or "both")): """ :return: the given file(s) as a list of tagged chunks, represented in tree form. :rtype: list(Tree) :param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'` to indicate the kind of tags to include. Semantic tags consist of WordNet lemma IDs, plus an `'NE'` node if the chunk is a named entity without a specific entry in WordNet. (Named entities of type 'other' have no lemma. Other chunks not in WordNet have no semantic tag. Punctuation tokens have `None` for their part of speech tag.) """ return self._items(fileids, "chunk", False, tag != "sem", tag != "pos") def sents(self, fileids=None): """ :return: the given file(s) as a list of sentences, each encoded as a list of word strings. :rtype: list(list(str)) """ return self._items(fileids, "word", True, False, False) def chunk_sents(self, fileids=None): """ :return: the given file(s) as a list of sentences, each encoded as a list of chunks. :rtype: list(list(list(str))) """ return self._items(fileids, "chunk", True, False, False) def tagged_sents(self, fileids=None, tag=("pos" or "sem" or "both")): """ :return: the given file(s) as a list of sentences. Each sentence is represented as a list of tagged chunks (in tree form). :rtype: list(list(Tree)) :param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'` to indicate the kind of tags to include. Semantic tags consist of WordNet lemma IDs, plus an `'NE'` node if the chunk is a named entity without a specific entry in WordNet. (Named entities of type 'other' have no lemma. Other chunks not in WordNet have no semantic tag. Punctuation tokens have `None` for their part of speech tag.) """ return self._items(fileids, "chunk", True, tag != "sem", tag != "pos") def _items(self, fileids, unit, bracket_sent, pos_tag, sem_tag): if unit == "word" and not bracket_sent: # the result of the SemcorWordView may be a multiword unit, so the # LazyConcatenation will make sure the sentence is flattened _ = lambda *args: LazyConcatenation( (SemcorWordView if self._lazy else self._words)(*args) ) else: _ = SemcorWordView if self._lazy else self._words return concat( [ _(fileid, unit, bracket_sent, pos_tag, sem_tag, self._wordnet) for fileid in self.abspaths(fileids) ] ) def _words(self, fileid, unit, bracket_sent, pos_tag, sem_tag): """ Helper used to implement the view methods -- returns a list of tokens, (segmented) words, chunks, or sentences. The tokens and chunks may optionally be tagged (with POS and sense information). :param fileid: The name of the underlying file. :param unit: One of `'token'`, `'word'`, or `'chunk'`. :param bracket_sent: If true, include sentence bracketing. :param pos_tag: Whether to include part-of-speech tags. 
:param sem_tag: Whether to include semantic tags, namely WordNet lemma and OOV named entity status. """ assert unit in ("token", "word", "chunk") result = [] xmldoc = ElementTree.parse(fileid).getroot() for xmlsent in xmldoc.findall(".//s"): sent = [] for xmlword in _all_xmlwords_in(xmlsent): itm = SemcorCorpusReader._word( xmlword, unit, pos_tag, sem_tag, self._wordnet ) if unit == "word": sent.extend(itm) else: sent.append(itm) if bracket_sent: result.append(SemcorSentence(xmlsent.attrib["snum"], sent)) else: result.extend(sent) assert None not in result return result @staticmethod def _word(xmlword, unit, pos_tag, sem_tag, wordnet): tkn = xmlword.text if not tkn: tkn = "" # fixes issue 337? lemma = xmlword.get("lemma", tkn) # lemma or NE class lexsn = xmlword.get("lexsn") # lex_sense (locator for the lemma's sense) if lexsn is not None: sense_key = lemma + "%" + lexsn wnpos = ("n", "v", "a", "r", "s")[ int(lexsn.split(":")[0]) - 1 ] # see http://wordnet.princeton.edu/man/senseidx.5WN.html else: sense_key = wnpos = None redef = xmlword.get( "rdf", tkn ) # redefinition--this indicates the lookup string # does not exactly match the enclosed string, e.g. due to typographical adjustments # or discontinuity of a multiword expression. If a redefinition has occurred, # the "rdf" attribute holds its inflected form and "lemma" holds its lemma. # For NEs, "rdf", "lemma", and "pn" all hold the same value (the NE class). sensenum = xmlword.get("wnsn") # WordNet sense number isOOVEntity = "pn" in xmlword.keys() # a "personal name" (NE) not in WordNet pos = xmlword.get( "pos" ) # part of speech for the whole chunk (None for punctuation) if unit == "token": if not pos_tag and not sem_tag: itm = tkn else: itm = ( (tkn,) + ((pos,) if pos_tag else ()) + ((lemma, wnpos, sensenum, isOOVEntity) if sem_tag else ()) ) return itm else: ww = tkn.split("_") # TODO: case where punctuation intervenes in MWE if unit == "word": return ww else: if sensenum is not None: try: sense = wordnet.lemma_from_key(sense_key) # Lemma object except Exception: # cannot retrieve the wordnet.Lemma object. possible reasons: # (a) the wordnet corpus is not downloaded; # (b) a nonexistent sense is annotated: e.g., such.s.00 triggers: # nltk.corpus.reader.wordnet.WordNetError: No synset found for key u'such%5:00:01:specified:00' # solution: just use the lemma name as a string try: sense = "%s.%s.%02d" % ( lemma, wnpos, int(sensenum), ) # e.g.: reach.v.02 except ValueError: sense = ( lemma + "." + wnpos + "." + sensenum ) # e.g. the sense number may be "2;1" bottom = [Tree(pos, ww)] if pos_tag else ww if sem_tag and isOOVEntity: if sensenum is not None: return Tree(sense, [Tree("NE", bottom)]) else: # 'other' NE return Tree("NE", bottom) elif sem_tag and sensenum is not None: return Tree(sense, bottom) elif pos_tag: return bottom[0] else: return bottom # chunk as a list def _all_xmlwords_in(elt, result=None): if result is None: result = [] for child in elt: if child.tag in ("wf", "punc"): result.append(child) else: _all_xmlwords_in(child, result) return result class SemcorSentence(list): """ A list of words, augmented by an attribute ``num`` used to record the sentence identifier (the ``n`` attribute from the XML). """ def __init__(self, num, items): self.num = num list.__init__(self, items) class SemcorWordView(XMLCorpusView): """ A stream backed corpus view specialized for use with the BNC corpus. """ def __init__(self, fileid, unit, bracket_sent, pos_tag, sem_tag, wordnet): """ :param fileid: The name of the underlying file. 
:param unit: One of `'token'`, `'word'`, or `'chunk'`. :param bracket_sent: If true, include sentence bracketing. :param pos_tag: Whether to include part-of-speech tags. :param sem_tag: Whether to include semantic tags, namely WordNet lemma and OOV named entity status. """ if bracket_sent: tagspec = ".*/s" else: tagspec = ".*/s/(punc|wf)" self._unit = unit self._sent = bracket_sent self._pos_tag = pos_tag self._sem_tag = sem_tag self._wordnet = wordnet XMLCorpusView.__init__(self, fileid, tagspec) def handle_elt(self, elt, context): if self._sent: return self.handle_sent(elt) else: return self.handle_word(elt) def handle_word(self, elt): return SemcorCorpusReader._word( elt, self._unit, self._pos_tag, self._sem_tag, self._wordnet ) def handle_sent(self, elt): sent = [] for child in elt: if child.tag in ("wf", "punc"): itm = self.handle_word(child) if self._unit == "word": sent.extend(itm) else: sent.append(itm) else: raise ValueError("Unexpected element %s" % child.tag) return SemcorSentence(elt.attrib["snum"], sent) nltk-3.7/nltk/corpus/reader/senseval.py000066400000000000000000000162571420073152400202630ustar00rootroot00000000000000# Natural Language Toolkit: Senseval 2 Corpus Reader # # Copyright (C) 2001-2022 NLTK Project # Author: Trevor Cohn # Steven Bird (modifications) # URL: # For license information, see LICENSE.TXT """ Read from the Senseval 2 Corpus. SENSEVAL [http://www.senseval.org/] Evaluation exercises for Word Sense Disambiguation. Organized by ACL-SIGLEX [https://www.siglex.org/] Prepared by Ted Pedersen , University of Minnesota, https://www.d.umn.edu/~tpederse/data.html Distributed with permission. The NLTK version of the Senseval 2 files uses well-formed XML. Each instance of the ambiguous words "hard", "interest", "line", and "serve" is tagged with a sense identifier, and supplied with context. """ import re from xml.etree import ElementTree from nltk.corpus.reader.api import * from nltk.corpus.reader.util import * from nltk.tokenize import * class SensevalInstance: def __init__(self, word, position, context, senses): self.word = word self.senses = tuple(senses) self.position = position self.context = context def __repr__(self): return "SensevalInstance(word=%r, position=%r, " "context=%r, senses=%r)" % ( self.word, self.position, self.context, self.senses, ) class SensevalCorpusReader(CorpusReader): def instances(self, fileids=None): return concat( [ SensevalCorpusView(fileid, enc) for (fileid, enc) in self.abspaths(fileids, True) ] ) def _entry(self, tree): elts = [] for lexelt in tree.findall("lexelt"): for inst in lexelt.findall("instance"): sense = inst[0].attrib["senseid"] context = [(w.text, w.attrib["pos"]) for w in inst[1]] elts.append((sense, context)) return elts class SensevalCorpusView(StreamBackedCorpusView): def __init__(self, fileid, encoding): StreamBackedCorpusView.__init__(self, fileid, encoding=encoding) self._word_tokenizer = WhitespaceTokenizer() self._lexelt_starts = [0] # list of streampos self._lexelts = [None] # list of lexelt names def read_block(self, stream): # Decide which lexical element we're in. lexelt_num = bisect.bisect_right(self._lexelt_starts, stream.tell()) - 1 lexelt = self._lexelts[lexelt_num] instance_lines = [] in_instance = False while True: line = stream.readline() if line == "": assert instance_lines == [] return [] # Start of a lexical element? if line.lstrip().startswith(" has no 'item=...' 
lexelt = m.group(1)[1:-1] if lexelt_num < len(self._lexelts): assert lexelt == self._lexelts[lexelt_num] else: self._lexelts.append(lexelt) self._lexelt_starts.append(stream.tell()) # Start of an instance? if line.lstrip().startswith("" elif cword.tag == "wf": context.append((cword.text, cword.attrib["pos"])) elif cword.tag == "s": pass # Sentence boundary marker. else: print("ACK", cword.tag) assert False, "expected CDATA or or " if cword.tail: context += self._word_tokenizer.tokenize(cword.tail) else: assert False, "unexpected tag %s" % child.tag return SensevalInstance(lexelt, position, context, senses) def _fixXML(text): """ Fix the various issues with Senseval pseudo-XML. """ # <~> or <^> => ~ or ^ text = re.sub(r"<([~\^])>", r"\1", text) # fix lone & text = re.sub(r"(\s+)\&(\s+)", r"\1&\2", text) # fix """ text = re.sub(r'"""', "'\"'", text) # fix => text = re.sub(r'(<[^<]*snum=)([^">]+)>', r'\1"\2"/>', text) # fix foreign word tag text = re.sub(r"<\&frasl>\s*]*>", "FRASL", text) # remove <&I .> text = re.sub(r"<\&I[^>]*>", "", text) # fix <{word}> text = re.sub(r"<{([^}]+)}>", r"\1", text) # remove <@>,

<p>, </p>

    text = re.sub(r"<(@|/?p)>", r"", text) # remove <&M .> and <&T .> and <&Ms .> text = re.sub(r"<&\w+ \.>", r"", text) # remove lines text = re.sub(r"]*>", r"", text) # remove <[hi]> and <[/p]> etc text = re.sub(r"<\[\/?[^>]+\]*>", r"", text) # take the thing out of the brackets: <…> text = re.sub(r"<(\&\w+;)>", r"\1", text) # and remove the & for those patterns that aren't regular XML text = re.sub(r"&(?!amp|gt|lt|apos|quot)", r"", text) # fix 'abc ' style tags - now abc text = re.sub( r'[ \t]*([^<>\s]+?)[ \t]*', r' \1', text ) text = re.sub(r'\s*"\s*', " \"", text) return text nltk-3.7/nltk/corpus/reader/sentiwordnet.py000066400000000000000000000106241420073152400211600ustar00rootroot00000000000000# Natural Language Toolkit: SentiWordNet # # Copyright (C) 2001-2022 NLTK Project # Author: Christopher Potts # URL: # For license information, see LICENSE.TXT """ An NLTK interface for SentiWordNet SentiWordNet is a lexical resource for opinion mining. SentiWordNet assigns to each synset of WordNet three sentiment scores: positivity, negativity, and objectivity. For details about SentiWordNet see: http://sentiwordnet.isti.cnr.it/ >>> from nltk.corpus import sentiwordnet as swn >>> print(swn.senti_synset('breakdown.n.03')) >>> list(swn.senti_synsets('slow')) [SentiSynset('decelerate.v.01'), SentiSynset('slow.v.02'), SentiSynset('slow.v.03'), SentiSynset('slow.a.01'), SentiSynset('slow.a.02'), SentiSynset('dense.s.04'), SentiSynset('slow.a.04'), SentiSynset('boring.s.01'), SentiSynset('dull.s.08'), SentiSynset('slowly.r.01'), SentiSynset('behind.r.03')] >>> happy = swn.senti_synsets('happy', 'a') >>> happy0 = list(happy)[0] >>> happy0.pos_score() 0.875 >>> happy0.neg_score() 0.0 >>> happy0.obj_score() 0.125 """ import re from nltk.corpus.reader import CorpusReader class SentiWordNetCorpusReader(CorpusReader): def __init__(self, root, fileids, encoding="utf-8"): """ Construct a new SentiWordNet Corpus Reader, using data from the specified file. 
""" super().__init__(root, fileids, encoding=encoding) if len(self._fileids) != 1: raise ValueError("Exactly one file must be specified") self._db = {} self._parse_src_file() def _parse_src_file(self): lines = self.open(self._fileids[0]).read().splitlines() lines = filter((lambda x: not re.search(r"^\s*#", x)), lines) for i, line in enumerate(lines): fields = [field.strip() for field in re.split(r"\t+", line)] try: pos, offset, pos_score, neg_score, synset_terms, gloss = fields except BaseException as e: raise ValueError(f"Line {i} formatted incorrectly: {line}\n") from e if pos and offset: offset = int(offset) self._db[(pos, offset)] = (float(pos_score), float(neg_score)) def senti_synset(self, *vals): from nltk.corpus import wordnet as wn if tuple(vals) in self._db: pos_score, neg_score = self._db[tuple(vals)] pos, offset = vals if pos == "s": pos = "a" synset = wn.synset_from_pos_and_offset(pos, offset) return SentiSynset(pos_score, neg_score, synset) else: synset = wn.synset(vals[0]) pos = synset.pos() if pos == "s": pos = "a" offset = synset.offset() if (pos, offset) in self._db: pos_score, neg_score = self._db[(pos, offset)] return SentiSynset(pos_score, neg_score, synset) else: return None def senti_synsets(self, string, pos=None): from nltk.corpus import wordnet as wn sentis = [] synset_list = wn.synsets(string, pos) for synset in synset_list: sentis.append(self.senti_synset(synset.name())) sentis = filter(lambda x: x, sentis) return sentis def all_senti_synsets(self): from nltk.corpus import wordnet as wn for key, fields in self._db.items(): pos, offset = key pos_score, neg_score = fields synset = wn.synset_from_pos_and_offset(pos, offset) yield SentiSynset(pos_score, neg_score, synset) class SentiSynset: def __init__(self, pos_score, neg_score, synset): self._pos_score = pos_score self._neg_score = neg_score self._obj_score = 1.0 - (self._pos_score + self._neg_score) self.synset = synset def pos_score(self): return self._pos_score def neg_score(self): return self._neg_score def obj_score(self): return self._obj_score def __str__(self): """Prints just the Pos/Neg scores for now.""" s = "<" s += self.synset.name() + ": " s += "PosScore=%s " % self._pos_score s += "NegScore=%s" % self._neg_score s += ">" return s def __repr__(self): return "Senti" + repr(self.synset) nltk-3.7/nltk/corpus/reader/sinica_treebank.py000066400000000000000000000046421420073152400215570ustar00rootroot00000000000000# Natural Language Toolkit: Sinica Treebank Reader # # Copyright (C) 2001-2022 NLTK Project # Author: Steven Bird # URL: # For license information, see LICENSE.TXT """ Sinica Treebank Corpus Sample http://rocling.iis.sinica.edu.tw/CKIP/engversion/treebank.htm 10,000 parsed sentences, drawn from the Academia Sinica Balanced Corpus of Modern Chinese. Parse tree notation is based on Information-based Case Grammar. Tagset documentation is available at https://www.sinica.edu.tw/SinicaCorpus/modern_e_wordtype.html Language and Knowledge Processing Group, Institute of Information Science, Academia Sinica The data is distributed with the Natural Language Toolkit under the terms of the Creative Commons Attribution-NonCommercial-ShareAlike License [https://creativecommons.org/licenses/by-nc-sa/2.5/]. References: Feng-Yi Chen, Pi-Fang Tsai, Keh-Jiann Chen, and Chu-Ren Huang (1999) The Construction of Sinica Treebank. Computational Linguistics and Chinese Language Processing, 4, pp 87-104. Huang Chu-Ren, Keh-Jiann Chen, Feng-Yi Chen, Keh-Jiann Chen, Zhao-Ming Gao, and Kuang-Yu Chen. 2000. 
Sinica Treebank: Design Criteria, Annotation Guidelines, and On-line Interface. Proceedings of 2nd Chinese Language Processing Workshop, Association for Computational Linguistics. Chen Keh-Jiann and Yu-Ming Hsieh (2004) Chinese Treebanks and Grammar Extraction, Proceedings of IJCNLP-04, pp560-565. """ from nltk.corpus.reader.api import * from nltk.corpus.reader.util import * from nltk.tag import map_tag from nltk.tree import sinica_parse IDENTIFIER = re.compile(r"^#\S+\s") APPENDIX = re.compile(r"(?<=\))#.*$") TAGWORD = re.compile(r":([^:()|]+):([^:()|]+)") WORD = re.compile(r":[^:()|]+:([^:()|]+)") class SinicaTreebankCorpusReader(SyntaxCorpusReader): """ Reader for the sinica treebank. """ def _read_block(self, stream): sent = stream.readline() sent = IDENTIFIER.sub("", sent) sent = APPENDIX.sub("", sent) return [sent] def _parse(self, sent): return sinica_parse(sent) def _tag(self, sent, tagset=None): tagged_sent = [(w, t) for (t, w) in TAGWORD.findall(sent)] if tagset and tagset != self._tagset: tagged_sent = [ (w, map_tag(self._tagset, tagset, t)) for (w, t) in tagged_sent ] return tagged_sent def _word(self, sent): return WORD.findall(sent) nltk-3.7/nltk/corpus/reader/string_category.py000066400000000000000000000035071420073152400216400ustar00rootroot00000000000000# Natural Language Toolkit: String Category Corpus Reader # # Copyright (C) 2001-2022 NLTK Project # Author: Steven Bird # Edward Loper # URL: # For license information, see LICENSE.TXT """ Read tuples from a corpus consisting of categorized strings. For example, from the question classification corpus: NUM:dist How far is it from Denver to Aspen ? LOC:city What county is Modesto , California in ? HUM:desc Who was Galileo ? DESC:def What is an atom ? NUM:date When did Hawaii become a state ? """ from nltk.corpus.reader.api import * # based on PPAttachmentCorpusReader from nltk.corpus.reader.util import * # [xx] Should the order of the tuple be reversed -- in most other places # in nltk, we use the form (data, tag) -- e.g., tagged words and # labeled texts for classifiers. class StringCategoryCorpusReader(CorpusReader): def __init__(self, root, fileids, delimiter=" ", encoding="utf8"): """ :param root: The root directory for this corpus. :param fileids: A list or regexp specifying the fileids in this corpus. :param delimiter: Field delimiter """ CorpusReader.__init__(self, root, fileids, encoding) self._delimiter = delimiter def tuples(self, fileids=None): if fileids is None: fileids = self._fileids elif isinstance(fileids, str): fileids = [fileids] return concat( [ StreamBackedCorpusView(fileid, self._read_tuple_block, encoding=enc) for (fileid, enc) in self.abspaths(fileids, True) ] ) def _read_tuple_block(self, stream): line = stream.readline().strip() if line: return [tuple(line.split(self._delimiter, 1))] else: return [] nltk-3.7/nltk/corpus/reader/switchboard.py000066400000000000000000000105061420073152400207430ustar00rootroot00000000000000# Natural Language Toolkit: Switchboard Corpus Reader # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # URL: # For license information, see LICENSE.TXT import re from nltk.corpus.reader.api import * from nltk.corpus.reader.util import * from nltk.tag import map_tag, str2tuple class SwitchboardTurn(list): """ A specialized list object used to encode switchboard utterances. The elements of the list are the words in the utterance; and two attributes, ``speaker`` and ``id``, are provided to retrieve the spearker identifier and utterance id. 
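A sketch of reaching these attributes through the corpus reader (assuming the ``switchboard`` data package is installed; the speaker label shown is illustrative)::

    from nltk.corpus import switchboard

    turn = switchboard.turns()[0]
    turn.speaker    # speaker identifier, e.g. 'A'
    turn.id         # utterance id within its discourse
    list(turn)      # the words of the utterance
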
Note that utterance ids are only unique within a given discourse. """ def __init__(self, words, speaker, id): list.__init__(self, words) self.speaker = speaker self.id = int(id) def __repr__(self): if len(self) == 0: text = "" elif isinstance(self[0], tuple): text = " ".join("%s/%s" % w for w in self) else: text = " ".join(self) return f"<{self.speaker}.{self.id}: {text!r}>" class SwitchboardCorpusReader(CorpusReader): _FILES = ["tagged"] # Use the "tagged" file even for non-tagged data methods, since # it's tokenized. def __init__(self, root, tagset=None): CorpusReader.__init__(self, root, self._FILES) self._tagset = tagset def words(self): return StreamBackedCorpusView(self.abspath("tagged"), self._words_block_reader) def tagged_words(self, tagset=None): def tagged_words_block_reader(stream): return self._tagged_words_block_reader(stream, tagset) return StreamBackedCorpusView(self.abspath("tagged"), tagged_words_block_reader) def turns(self): return StreamBackedCorpusView(self.abspath("tagged"), self._turns_block_reader) def tagged_turns(self, tagset=None): def tagged_turns_block_reader(stream): return self._tagged_turns_block_reader(stream, tagset) return StreamBackedCorpusView(self.abspath("tagged"), tagged_turns_block_reader) def discourses(self): return StreamBackedCorpusView( self.abspath("tagged"), self._discourses_block_reader ) def tagged_discourses(self, tagset=False): def tagged_discourses_block_reader(stream): return self._tagged_discourses_block_reader(stream, tagset) return StreamBackedCorpusView( self.abspath("tagged"), tagged_discourses_block_reader ) def _discourses_block_reader(self, stream): # returns at most 1 discourse. (The other methods depend on this.) return [ [ self._parse_utterance(u, include_tag=False) for b in read_blankline_block(stream) for u in b.split("\n") if u.strip() ] ] def _tagged_discourses_block_reader(self, stream, tagset=None): # returns at most 1 discourse. (The other methods depend on this.) return [ [ self._parse_utterance(u, include_tag=True, tagset=tagset) for b in read_blankline_block(stream) for u in b.split("\n") if u.strip() ] ] def _turns_block_reader(self, stream): return self._discourses_block_reader(stream)[0] def _tagged_turns_block_reader(self, stream, tagset=None): return self._tagged_discourses_block_reader(stream, tagset)[0] def _words_block_reader(self, stream): return sum(self._discourses_block_reader(stream)[0], []) def _tagged_words_block_reader(self, stream, tagset=None): return sum(self._tagged_discourses_block_reader(stream, tagset)[0], []) _UTTERANCE_RE = re.compile(r"(\w+)\.(\d+)\:\s*(.*)") _SEP = "/" def _parse_utterance(self, utterance, include_tag, tagset=None): m = self._UTTERANCE_RE.match(utterance) if m is None: raise ValueError("Bad utterance %r" % utterance) speaker, id, text = m.groups() words = [str2tuple(s, self._SEP) for s in text.split()] if not include_tag: words = [w for (w, t) in words] elif tagset and tagset != self._tagset: words = [(w, map_tag(self._tagset, tagset, t)) for (w, t) in words] return SwitchboardTurn(words, speaker, id) nltk-3.7/nltk/corpus/reader/tagged.py000066400000000000000000000270121420073152400176650ustar00rootroot00000000000000# Natural Language Toolkit: Tagged Corpus Reader # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # Steven Bird # Jacob Perkins # URL: # For license information, see LICENSE.TXT """ A reader for corpora whose documents contain part-of-speech-tagged words. 
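A minimal sketch of pointing the reader at such a corpus (the directory and the fileid pattern below are placeholders, not real paths)::

    from nltk.corpus.reader import TaggedCorpusReader

    reader = TaggedCorpusReader('/path/to/corpus', r'.*\.txt')
    reader.tagged_words()[:5]    # [('word1', 'TAG1'), ...]
    reader.tagged_sents()[0]     # first sentence as (word, tag) tuples
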
""" import os from nltk.corpus.reader.api import * from nltk.corpus.reader.timit import read_timit_block from nltk.corpus.reader.util import * from nltk.tag import map_tag, str2tuple from nltk.tokenize import * class TaggedCorpusReader(CorpusReader): """ Reader for simple part-of-speech tagged corpora. Paragraphs are assumed to be split using blank lines. Sentences and words can be tokenized using the default tokenizers, or by custom tokenizers specified as parameters to the constructor. Words are parsed using ``nltk.tag.str2tuple``. By default, ``'/'`` is used as the separator. I.e., words should have the form:: word1/tag1 word2/tag2 word3/tag3 ... But custom separators may be specified as parameters to the constructor. Part of speech tags are case-normalized to upper case. """ def __init__( self, root, fileids, sep="/", word_tokenizer=WhitespaceTokenizer(), sent_tokenizer=RegexpTokenizer("\n", gaps=True), para_block_reader=read_blankline_block, encoding="utf8", tagset=None, ): """ Construct a new Tagged Corpus reader for a set of documents located at the given root directory. Example usage: >>> root = '/...path to corpus.../' >>> reader = TaggedCorpusReader(root, '.*', '.txt') # doctest: +SKIP :param root: The root directory for this corpus. :param fileids: A list or regexp specifying the fileids in this corpus. """ CorpusReader.__init__(self, root, fileids, encoding) self._sep = sep self._word_tokenizer = word_tokenizer self._sent_tokenizer = sent_tokenizer self._para_block_reader = para_block_reader self._tagset = tagset def words(self, fileids=None): """ :return: the given file(s) as a list of words and punctuation symbols. :rtype: list(str) """ return concat( [ TaggedCorpusView( fileid, enc, False, False, False, self._sep, self._word_tokenizer, self._sent_tokenizer, self._para_block_reader, None, ) for (fileid, enc) in self.abspaths(fileids, True) ] ) def sents(self, fileids=None): """ :return: the given file(s) as a list of sentences or utterances, each encoded as a list of word strings. :rtype: list(list(str)) """ return concat( [ TaggedCorpusView( fileid, enc, False, True, False, self._sep, self._word_tokenizer, self._sent_tokenizer, self._para_block_reader, None, ) for (fileid, enc) in self.abspaths(fileids, True) ] ) def paras(self, fileids=None): """ :return: the given file(s) as a list of paragraphs, each encoded as a list of sentences, which are in turn encoded as lists of word strings. :rtype: list(list(list(str))) """ return concat( [ TaggedCorpusView( fileid, enc, False, True, True, self._sep, self._word_tokenizer, self._sent_tokenizer, self._para_block_reader, None, ) for (fileid, enc) in self.abspaths(fileids, True) ] ) def tagged_words(self, fileids=None, tagset=None): """ :return: the given file(s) as a list of tagged words and punctuation symbols, encoded as tuples ``(word,tag)``. :rtype: list(tuple(str,str)) """ if tagset and tagset != self._tagset: tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t) else: tag_mapping_function = None return concat( [ TaggedCorpusView( fileid, enc, True, False, False, self._sep, self._word_tokenizer, self._sent_tokenizer, self._para_block_reader, tag_mapping_function, ) for (fileid, enc) in self.abspaths(fileids, True) ] ) def tagged_sents(self, fileids=None, tagset=None): """ :return: the given file(s) as a list of sentences, each encoded as a list of ``(word,tag)`` tuples. 
:rtype: list(list(tuple(str,str))) """ if tagset and tagset != self._tagset: tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t) else: tag_mapping_function = None return concat( [ TaggedCorpusView( fileid, enc, True, True, False, self._sep, self._word_tokenizer, self._sent_tokenizer, self._para_block_reader, tag_mapping_function, ) for (fileid, enc) in self.abspaths(fileids, True) ] ) def tagged_paras(self, fileids=None, tagset=None): """ :return: the given file(s) as a list of paragraphs, each encoded as a list of sentences, which are in turn encoded as lists of ``(word,tag)`` tuples. :rtype: list(list(list(tuple(str,str)))) """ if tagset and tagset != self._tagset: tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t) else: tag_mapping_function = None return concat( [ TaggedCorpusView( fileid, enc, True, True, True, self._sep, self._word_tokenizer, self._sent_tokenizer, self._para_block_reader, tag_mapping_function, ) for (fileid, enc) in self.abspaths(fileids, True) ] ) class CategorizedTaggedCorpusReader(CategorizedCorpusReader, TaggedCorpusReader): """ A reader for part-of-speech tagged corpora whose documents are divided into categories based on their file identifiers. """ def __init__(self, *args, **kwargs): """ Initialize the corpus reader. Categorization arguments (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to the ``CategorizedCorpusReader`` constructor. The remaining arguments are passed to the ``TaggedCorpusReader``. """ CategorizedCorpusReader.__init__(self, kwargs) TaggedCorpusReader.__init__(self, *args, **kwargs) def tagged_words(self, fileids=None, categories=None, tagset=None): return super().tagged_words(self._resolve(fileids, categories), tagset) def tagged_sents(self, fileids=None, categories=None, tagset=None): return super().tagged_sents(self._resolve(fileids, categories), tagset) def tagged_paras(self, fileids=None, categories=None, tagset=None): return super().tagged_paras(self._resolve(fileids, categories), tagset) class TaggedCorpusView(StreamBackedCorpusView): """ A specialized corpus view for tagged documents. It can be customized via flags to divide the tagged corpus documents up by sentence or paragraph, and to include or omit part of speech tags. ``TaggedCorpusView`` objects are typically created by ``TaggedCorpusReader`` (not directly by nltk users). 
""" def __init__( self, corpus_file, encoding, tagged, group_by_sent, group_by_para, sep, word_tokenizer, sent_tokenizer, para_block_reader, tag_mapping_function=None, ): self._tagged = tagged self._group_by_sent = group_by_sent self._group_by_para = group_by_para self._sep = sep self._word_tokenizer = word_tokenizer self._sent_tokenizer = sent_tokenizer self._para_block_reader = para_block_reader self._tag_mapping_function = tag_mapping_function StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding) def read_block(self, stream): """Reads one paragraph at a time.""" block = [] for para_str in self._para_block_reader(stream): para = [] for sent_str in self._sent_tokenizer.tokenize(para_str): sent = [ str2tuple(s, self._sep) for s in self._word_tokenizer.tokenize(sent_str) ] if self._tag_mapping_function: sent = [(w, self._tag_mapping_function(t)) for (w, t) in sent] if not self._tagged: sent = [w for (w, t) in sent] if self._group_by_sent: para.append(sent) else: para.extend(sent) if self._group_by_para: block.append(para) else: block.extend(para) return block # needs to implement simplified tags class MacMorphoCorpusReader(TaggedCorpusReader): """ A corpus reader for the MAC_MORPHO corpus. Each line contains a single tagged word, using '_' as a separator. Sentence boundaries are based on the end-sentence tag ('_.'). Paragraph information is not included in the corpus, so each paragraph returned by ``self.paras()`` and ``self.tagged_paras()`` contains a single sentence. """ def __init__(self, root, fileids, encoding="utf8", tagset=None): TaggedCorpusReader.__init__( self, root, fileids, sep="_", word_tokenizer=LineTokenizer(), sent_tokenizer=RegexpTokenizer(".*\n"), para_block_reader=self._read_block, encoding=encoding, tagset=tagset, ) def _read_block(self, stream): return read_regexp_block(stream, r".*", r".*_\.") class TimitTaggedCorpusReader(TaggedCorpusReader): """ A corpus reader for tagged sentences that are included in the TIMIT corpus. """ def __init__(self, *args, **kwargs): TaggedCorpusReader.__init__( self, para_block_reader=read_timit_block, *args, **kwargs ) def paras(self): raise NotImplementedError("use sents() instead") def tagged_paras(self): raise NotImplementedError("use tagged_sents() instead") nltk-3.7/nltk/corpus/reader/timit.py000066400000000000000000000430531420073152400175630ustar00rootroot00000000000000# Natural Language Toolkit: TIMIT Corpus Reader # # Copyright (C) 2001-2007 NLTK Project # Author: Haejoong Lee # Steven Bird # Jacob Perkins # URL: # For license information, see LICENSE.TXT # [xx] this docstring is out-of-date: """ Read tokens, phonemes and audio data from the NLTK TIMIT Corpus. This corpus contains selected portion of the TIMIT corpus. - 16 speakers from 8 dialect regions - 1 male and 1 female from each dialect region - total 130 sentences (10 sentences per speaker. Note that some sentences are shared among other speakers, especially sa1 and sa2 are spoken by all speakers.) - total 160 recording of sentences (10 recordings per speaker) - audio format: NIST Sphere, single channel, 16kHz sampling, 16 bit sample, PCM encoding Module contents =============== The timit corpus reader provides 4 functions and 4 data items. - utterances List of utterances in the corpus. There are total 160 utterances, each of which corresponds to a unique utterance of a speaker. 
Here's an example of an utterance identifier in the list:: dr1-fvmh0/sx206 - _---- _--- | | | | | | | | | | | | | | `--- sentence number | | | `----- sentence type (a:all, i:shared, x:exclusive) | | `--------- speaker ID | `------------ sex (m:male, f:female) `-------------- dialect region (1..8) - speakers List of speaker IDs. An example of speaker ID:: dr1-fvmh0 Note that if you split an item ID with colon and take the first element of the result, you will get a speaker ID. >>> itemid = 'dr1-fvmh0/sx206' >>> spkrid , sentid = itemid.split('/') >>> spkrid 'dr1-fvmh0' The second element of the result is a sentence ID. - dictionary() Phonetic dictionary of words contained in this corpus. This is a Python dictionary from words to phoneme lists. - spkrinfo() Speaker information table. It's a Python dictionary from speaker IDs to records of 10 fields. Speaker IDs the same as the ones in timie.speakers. Each record is a dictionary from field names to values, and the fields are as follows:: id speaker ID as defined in the original TIMIT speaker info table sex speaker gender (M:male, F:female) dr speaker dialect region (1:new england, 2:northern, 3:north midland, 4:south midland, 5:southern, 6:new york city, 7:western, 8:army brat (moved around)) use corpus type (TRN:training, TST:test) in this sample corpus only TRN is available recdate recording date birthdate speaker birth date ht speaker height race speaker race (WHT:white, BLK:black, AMR:american indian, SPN:spanish-american, ORN:oriental,???:unknown) edu speaker education level (HS:high school, AS:associate degree, BS:bachelor's degree (BS or BA), MS:master's degree (MS or MA), PHD:doctorate degree (PhD,JD,MD), ??:unknown) comments comments by the recorder The 4 functions are as follows. - tokenized(sentences=items, offset=False) Given a list of items, returns an iterator of a list of word lists, each of which corresponds to an item (sentence). If offset is set to True, each element of the word list is a tuple of word(string), start offset and end offset, where offset is represented as a number of 16kHz samples. - phonetic(sentences=items, offset=False) Given a list of items, returns an iterator of a list of phoneme lists, each of which corresponds to an item (sentence). If offset is set to True, each element of the phoneme list is a tuple of word(string), start offset and end offset, where offset is represented as a number of 16kHz samples. - audiodata(item, start=0, end=None) Given an item, returns a chunk of audio samples formatted into a string. When the function is called, if start and end are omitted, the entire samples of the recording will be returned. If only end is omitted, samples from the start offset to the end of the recording will be returned. - play(data) Play the given audio samples. The audio samples can be obtained from the timit.audiodata function. """ import sys import time from nltk.corpus.reader.api import * from nltk.internals import import_from_stdlib from nltk.tree import Tree class TimitCorpusReader(CorpusReader): """ Reader for the TIMIT corpus (or any other corpus with the same file layout and use of file formats). 
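A brief sketch of typical access patterns (assuming the ``timit`` sample data package is installed and exposed as ``nltk.corpus.timit``)::

    from nltk.corpus import timit

    timit.utteranceids(dialect='1', sex='f')[:3]    # female speakers, dialect region 1
    timit.words('dr1-fvmh0/sx206')                  # tokenized text of one utterance
    timit.phones('dr1-fvmh0/sx206')                 # its phonetic transcription
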
The corpus root directory should contain the following files: - timitdic.txt: dictionary of standard transcriptions - spkrinfo.txt: table of speaker information In addition, the root directory should contain one subdirectory for each speaker, containing three files for each utterance: - .txt: text content of utterances - .wrd: tokenized text content of utterances - .phn: phonetic transcription of utterances - .wav: utterance sound file """ _FILE_RE = r"(\w+-\w+/\w+\.(phn|txt|wav|wrd))|" + r"timitdic\.txt|spkrinfo\.txt" """A regexp matching fileids that are used by this corpus reader.""" _UTTERANCE_RE = r"\w+-\w+/\w+\.txt" def __init__(self, root, encoding="utf8"): """ Construct a new TIMIT corpus reader in the given directory. :param root: The root directory for this corpus. """ # Ensure that wave files don't get treated as unicode data: if isinstance(encoding, str): encoding = [(r".*\.wav", None), (".*", encoding)] CorpusReader.__init__( self, root, find_corpus_fileids(root, self._FILE_RE), encoding=encoding ) self._utterances = [ name[:-4] for name in find_corpus_fileids(root, self._UTTERANCE_RE) ] """A list of the utterance identifiers for all utterances in this corpus.""" self._speakerinfo = None self._root = root self.speakers = sorted({u.split("/")[0] for u in self._utterances}) def fileids(self, filetype=None): """ Return a list of file identifiers for the files that make up this corpus. :param filetype: If specified, then ``filetype`` indicates that only the files that have the given type should be returned. Accepted values are: ``txt``, ``wrd``, ``phn``, ``wav``, or ``metadata``, """ if filetype is None: return CorpusReader.fileids(self) elif filetype in ("txt", "wrd", "phn", "wav"): return [f"{u}.{filetype}" for u in self._utterances] elif filetype == "metadata": return ["timitdic.txt", "spkrinfo.txt"] else: raise ValueError("Bad value for filetype: %r" % filetype) def utteranceids( self, dialect=None, sex=None, spkrid=None, sent_type=None, sentid=None ): """ :return: A list of the utterance identifiers for all utterances in this corpus, or for the given speaker, dialect region, gender, sentence type, or sentence number, if specified. """ if isinstance(dialect, str): dialect = [dialect] if isinstance(sex, str): sex = [sex] if isinstance(spkrid, str): spkrid = [spkrid] if isinstance(sent_type, str): sent_type = [sent_type] if isinstance(sentid, str): sentid = [sentid] utterances = self._utterances[:] if dialect is not None: utterances = [u for u in utterances if u[2] in dialect] if sex is not None: utterances = [u for u in utterances if u[4] in sex] if spkrid is not None: utterances = [u for u in utterances if u[:9] in spkrid] if sent_type is not None: utterances = [u for u in utterances if u[11] in sent_type] if sentid is not None: utterances = [u for u in utterances if u[10:] in spkrid] return utterances def transcription_dict(self): """ :return: A dictionary giving the 'standard' transcription for each word. 
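For example (a sketch; the key ``'she'`` is assumed to occur in ``timitdic.txt``)::

    from nltk.corpus import timit

    tdict = timit.transcription_dict()
    tdict['she']    # list of phone strings for the word 'she'
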
""" _transcriptions = {} with self.open("timitdic.txt") as fp: for line in fp: if not line.strip() or line[0] == ";": continue m = re.match(r"\s*(\S+)\s+/(.*)/\s*$", line) if not m: raise ValueError("Bad line: %r" % line) _transcriptions[m.group(1)] = m.group(2).split() return _transcriptions def spkrid(self, utterance): return utterance.split("/")[0] def sentid(self, utterance): return utterance.split("/")[1] def utterance(self, spkrid, sentid): return f"{spkrid}/{sentid}" def spkrutteranceids(self, speaker): """ :return: A list of all utterances associated with a given speaker. """ return [ utterance for utterance in self._utterances if utterance.startswith(speaker + "/") ] def spkrinfo(self, speaker): """ :return: A dictionary mapping .. something. """ if speaker in self._utterances: speaker = self.spkrid(speaker) if self._speakerinfo is None: self._speakerinfo = {} with self.open("spkrinfo.txt") as fp: for line in fp: if not line.strip() or line[0] == ";": continue rec = line.strip().split(None, 9) key = f"dr{rec[2]}-{rec[1].lower()}{rec[0].lower()}" self._speakerinfo[key] = SpeakerInfo(*rec) return self._speakerinfo[speaker] def phones(self, utterances=None): results = [] for fileid in self._utterance_fileids(utterances, ".phn"): with self.open(fileid) as fp: for line in fp: if line.strip(): results.append(line.split()[-1]) return results def phone_times(self, utterances=None): """ offset is represented as a number of 16kHz samples! """ results = [] for fileid in self._utterance_fileids(utterances, ".phn"): with self.open(fileid) as fp: for line in fp: if line.strip(): results.append( ( line.split()[2], int(line.split()[0]), int(line.split()[1]), ) ) return results def words(self, utterances=None): results = [] for fileid in self._utterance_fileids(utterances, ".wrd"): with self.open(fileid) as fp: for line in fp: if line.strip(): results.append(line.split()[-1]) return results def word_times(self, utterances=None): results = [] for fileid in self._utterance_fileids(utterances, ".wrd"): with self.open(fileid) as fp: for line in fp: if line.strip(): results.append( ( line.split()[2], int(line.split()[0]), int(line.split()[1]), ) ) return results def sents(self, utterances=None): results = [] for fileid in self._utterance_fileids(utterances, ".wrd"): with self.open(fileid) as fp: results.append([line.split()[-1] for line in fp if line.strip()]) return results def sent_times(self, utterances=None): # TODO: Check this return [ ( line.split(None, 2)[-1].strip(), int(line.split()[0]), int(line.split()[1]), ) for fileid in self._utterance_fileids(utterances, ".txt") for line in self.open(fileid) if line.strip() ] def phone_trees(self, utterances=None): if utterances is None: utterances = self._utterances if isinstance(utterances, str): utterances = [utterances] trees = [] for utterance in utterances: word_times = self.word_times(utterance) phone_times = self.phone_times(utterance) sent_times = self.sent_times(utterance) while sent_times: (sent, sent_start, sent_end) = sent_times.pop(0) trees.append(Tree("S", [])) while ( word_times and phone_times and phone_times[0][2] <= word_times[0][1] ): trees[-1].append(phone_times.pop(0)[0]) while word_times and word_times[0][2] <= sent_end: (word, word_start, word_end) = word_times.pop(0) trees[-1].append(Tree(word, [])) while phone_times and phone_times[0][2] <= word_end: trees[-1][-1].append(phone_times.pop(0)[0]) while phone_times and phone_times[0][2] <= sent_end: trees[-1].append(phone_times.pop(0)[0]) return trees # [xx] NOTE: This is currently 
broken -- we're assuming that the # fileids are WAV fileids (aka RIFF), but they're actually NIST SPHERE # fileids. def wav(self, utterance, start=0, end=None): # nltk.chunk conflicts with the stdlib module 'chunk' wave = import_from_stdlib("wave") w = wave.open(self.open(utterance + ".wav"), "rb") if end is None: end = w.getnframes() # Skip past frames before start, then read the frames we want w.readframes(start) frames = w.readframes(end - start) # Open a new temporary file -- the wave module requires # an actual file, and won't work w/ stringio. :( tf = tempfile.TemporaryFile() out = wave.open(tf, "w") # Write the parameters & data to the new file. out.setparams(w.getparams()) out.writeframes(frames) out.close() # Read the data back from the file, and return it. The # file will automatically be deleted when we return. tf.seek(0) return tf.read() def audiodata(self, utterance, start=0, end=None): assert end is None or end > start headersize = 44 with self.open(utterance + ".wav") as fp: if end is None: data = fp.read() else: data = fp.read(headersize + end * 2) return data[headersize + start * 2 :] def _utterance_fileids(self, utterances, extension): if utterances is None: utterances = self._utterances if isinstance(utterances, str): utterances = [utterances] return [f"{u}{extension}" for u in utterances] def play(self, utterance, start=0, end=None): """ Play the given audio sample. :param utterance: The utterance id of the sample to play """ # Method 1: os audio dev. try: import ossaudiodev try: dsp = ossaudiodev.open("w") dsp.setfmt(ossaudiodev.AFMT_S16_LE) dsp.channels(1) dsp.speed(16000) dsp.write(self.audiodata(utterance, start, end)) dsp.close() except OSError as e: print( ( "can't acquire the audio device; please " "activate your audio device." ), file=sys.stderr, ) print("system error message:", str(e), file=sys.stderr) return except ImportError: pass # Method 2: pygame try: # FIXME: this won't work under python 3 import pygame.mixer import StringIO pygame.mixer.init(16000) f = StringIO.StringIO(self.wav(utterance, start, end)) pygame.mixer.Sound(f).play() while pygame.mixer.get_busy(): time.sleep(0.01) return except ImportError: pass # Method 3: complain. :) print( ("you must install pygame or ossaudiodev " "for audio playback."), file=sys.stderr, ) class SpeakerInfo: def __init__( self, id, sex, dr, use, recdate, birthdate, ht, race, edu, comments=None ): self.id = id self.sex = sex self.dr = dr self.use = use self.recdate = recdate self.birthdate = birthdate self.ht = ht self.race = race self.edu = edu self.comments = comments def __repr__(self): attribs = "id sex dr use recdate birthdate ht race edu comments" args = [f"{attr}={getattr(self, attr)!r}" for attr in attribs.split()] return "SpeakerInfo(%s)" % (", ".join(args)) def read_timit_block(stream): """ Block reader for timit tagged sentences, which are preceded by a sentence number that will be ignored. """ line = stream.readline() if not line: return [] n, sent = line.split(" ", 1) return [sent] nltk-3.7/nltk/corpus/reader/toolbox.py000066400000000000000000000037751420073152400201320ustar00rootroot00000000000000# Natural Language Toolkit: Toolbox Reader # # Copyright (C) 2001-2022 NLTK Project # Author: Greg Aumann # Stuart Robinson # Steven Bird # URL: # For license information, see LICENSE.TXT """ Module for reading, writing and manipulating Toolbox databases and settings fileids. 
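A small usage sketch (assuming the ``toolbox`` data package, which includes the ``rotokas.dic`` sample lexicon, is installed)::

    from nltk.corpus import toolbox

    toolbox.entries('rotokas.dic')[0]     # first entry: (lexeme, [(marker, value), ...])
    toolbox.words('rotokas.dic')[:10]     # just the lexemes (default key 'lx')
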
""" from nltk.corpus.reader.api import * from nltk.corpus.reader.util import * from nltk.toolbox import ToolboxData class ToolboxCorpusReader(CorpusReader): def xml(self, fileids, key=None): return concat( [ ToolboxData(path, enc).parse(key=key) for (path, enc) in self.abspaths(fileids, True) ] ) def fields( self, fileids, strip=True, unwrap=True, encoding="utf8", errors="strict", unicode_fields=None, ): return concat( [ list( ToolboxData(fileid, enc).fields( strip, unwrap, encoding, errors, unicode_fields ) ) for (fileid, enc) in self.abspaths(fileids, include_encoding=True) ] ) # should probably be done lazily: def entries(self, fileids, **kwargs): if "key" in kwargs: key = kwargs["key"] del kwargs["key"] else: key = "lx" # the default key in MDF entries = [] for marker, contents in self.fields(fileids, **kwargs): if marker == key: entries.append((contents, [])) else: try: entries[-1][-1].append((marker, contents)) except IndexError: pass return entries def words(self, fileids, key="lx"): return [contents for marker, contents in self.fields(fileids) if marker == key] def demo(): pass if __name__ == "__main__": demo() nltk-3.7/nltk/corpus/reader/twitter.py000066400000000000000000000105701420073152400201350ustar00rootroot00000000000000# Natural Language Toolkit: Twitter Corpus Reader # # Copyright (C) 2001-2022 NLTK Project # Author: Ewan Klein # URL: # For license information, see LICENSE.TXT """ A reader for corpora that consist of Tweets. It is assumed that the Tweets have been serialised into line-delimited JSON. """ import json import os from nltk.corpus.reader.api import CorpusReader from nltk.corpus.reader.util import StreamBackedCorpusView, ZipFilePathPointer, concat from nltk.tokenize import TweetTokenizer class TwitterCorpusReader(CorpusReader): r""" Reader for corpora that consist of Tweets represented as a list of line-delimited JSON. Individual Tweets can be tokenized using the default tokenizer, or by a custom tokenizer specified as a parameter to the constructor. Construct a new Tweet corpus reader for a set of documents located at the given root directory. If you made your own tweet collection in a directory called `twitter-files`, then you can initialise the reader as:: from nltk.corpus import TwitterCorpusReader reader = TwitterCorpusReader(root='/path/to/twitter-files', '.*\.json') However, the recommended approach is to set the relevant directory as the value of the environmental variable `TWITTER`, and then invoke the reader as follows:: root = os.environ['TWITTER'] reader = TwitterCorpusReader(root, '.*\.json') If you want to work directly with the raw Tweets, the `json` library can be used:: import json for tweet in reader.docs(): print(json.dumps(tweet, indent=1, sort_keys=True)) """ CorpusView = StreamBackedCorpusView """ The corpus view class used by this reader. """ def __init__( self, root, fileids=None, word_tokenizer=TweetTokenizer(), encoding="utf8" ): """ :param root: The root directory for this corpus. :param fileids: A list or regexp specifying the fileids in this corpus. :param word_tokenizer: Tokenizer for breaking the text of Tweets into smaller units, including but not limited to words. 
""" CorpusReader.__init__(self, root, fileids, encoding) for path in self.abspaths(self._fileids): if isinstance(path, ZipFilePathPointer): pass elif os.path.getsize(path) == 0: raise ValueError(f"File {path} is empty") """Check that all user-created corpus files are non-empty.""" self._word_tokenizer = word_tokenizer def docs(self, fileids=None): """ Returns the full Tweet objects, as specified by `Twitter documentation on Tweets `_ :return: the given file(s) as a list of dictionaries deserialised from JSON. :rtype: list(dict) """ return concat( [ self.CorpusView(path, self._read_tweets, encoding=enc) for (path, enc, fileid) in self.abspaths(fileids, True, True) ] ) def strings(self, fileids=None): """ Returns only the text content of Tweets in the file(s) :return: the given file(s) as a list of Tweets. :rtype: list(str) """ fulltweets = self.docs(fileids) tweets = [] for jsono in fulltweets: try: text = jsono["text"] if isinstance(text, bytes): text = text.decode(self.encoding) tweets.append(text) except KeyError: pass return tweets def tokenized(self, fileids=None): """ :return: the given file(s) as a list of the text content of Tweets as as a list of words, screenanames, hashtags, URLs and punctuation symbols. :rtype: list(list(str)) """ tweets = self.strings(fileids) tokenizer = self._word_tokenizer return [tokenizer.tokenize(t) for t in tweets] def _read_tweets(self, stream): """ Assumes that each line in ``stream`` is a JSON-serialised object. """ tweets = [] for i in range(10): line = stream.readline() if not line: return tweets tweet = json.loads(line) tweets.append(tweet) return tweets nltk-3.7/nltk/corpus/reader/udhr.py000066400000000000000000000046041420073152400173760ustar00rootroot00000000000000""" UDHR corpus reader. It mostly deals with encodings. """ from nltk.corpus.reader.plaintext import PlaintextCorpusReader from nltk.corpus.reader.util import find_corpus_fileids class UdhrCorpusReader(PlaintextCorpusReader): ENCODINGS = [ (".*-Latin1$", "latin-1"), (".*-Hebrew$", "hebrew"), (".*-Arabic$", "cp1256"), ("Czech_Cesky-UTF8", "cp1250"), # yeah (".*-Cyrillic$", "cyrillic"), (".*-SJIS$", "SJIS"), (".*-GB2312$", "GB2312"), (".*-Latin2$", "ISO-8859-2"), (".*-Greek$", "greek"), (".*-UTF8$", "utf-8"), ("Hungarian_Magyar-Unicode", "utf-16-le"), ("Amahuaca", "latin1"), ("Turkish_Turkce-Turkish", "latin5"), ("Lithuanian_Lietuviskai-Baltic", "latin4"), ("Japanese_Nihongo-EUC", "EUC-JP"), ("Japanese_Nihongo-JIS", "iso2022_jp"), ("Chinese_Mandarin-HZ", "hz"), (r"Abkhaz\-Cyrillic\+Abkh", "cp1251"), ] SKIP = { # The following files are not fully decodable because they # were truncated at wrong bytes: "Burmese_Myanmar-UTF8", "Japanese_Nihongo-JIS", "Chinese_Mandarin-HZ", "Chinese_Mandarin-UTF8", "Gujarati-UTF8", "Hungarian_Magyar-Unicode", "Lao-UTF8", "Magahi-UTF8", "Marathi-UTF8", "Tamil-UTF8", # Unfortunately, encodings required for reading # the following files are not supported by Python: "Vietnamese-VPS", "Vietnamese-VIQR", "Vietnamese-TCVN", "Magahi-Agra", "Bhojpuri-Agra", "Esperanto-T61", # latin3 raises an exception # The following files are encoded for specific fonts: "Burmese_Myanmar-WinResearcher", "Armenian-DallakHelv", "Tigrinya_Tigrigna-VG2Main", "Amharic-Afenegus6..60375", # ? "Navaho_Dine-Navajo-Navaho-font", # What are these? 
"Azeri_Azerbaijani_Cyrillic-Az.Times.Cyr.Normal0117", "Azeri_Azerbaijani_Latin-Az.Times.Lat0117", # The following files are unintended: "Czech-Latin2-err", "Russian_Russky-UTF8~", } def __init__(self, root="udhr"): fileids = find_corpus_fileids(root, r"(?!README|\.).*") super().__init__( root, [fileid for fileid in fileids if fileid not in self.SKIP], encoding=self.ENCODINGS, ) nltk-3.7/nltk/corpus/reader/util.py000066400000000000000000000751761420073152400174250ustar00rootroot00000000000000# Natural Language Toolkit: Corpus Reader Utilities # # Copyright (C) 2001-2022 NLTK Project # Author: Steven Bird # Edward Loper # URL: # For license information, see LICENSE.TXT import bisect import os import pickle import re import tempfile from functools import reduce from xml.etree import ElementTree from nltk.data import ( FileSystemPathPointer, PathPointer, SeekableUnicodeStreamReader, ZipFilePathPointer, ) from nltk.internals import slice_bounds from nltk.tokenize import wordpunct_tokenize from nltk.util import AbstractLazySequence, LazyConcatenation, LazySubsequence ###################################################################### # { Corpus View ###################################################################### class StreamBackedCorpusView(AbstractLazySequence): """ A 'view' of a corpus file, which acts like a sequence of tokens: it can be accessed by index, iterated over, etc. However, the tokens are only constructed as-needed -- the entire corpus is never stored in memory at once. The constructor to ``StreamBackedCorpusView`` takes two arguments: a corpus fileid (specified as a string or as a ``PathPointer``); and a block reader. A "block reader" is a function that reads zero or more tokens from a stream, and returns them as a list. A very simple example of a block reader is: >>> def simple_block_reader(stream): ... return stream.readline().split() This simple block reader reads a single line at a time, and returns a single token (consisting of a string) for each whitespace-separated substring on the line. When deciding how to define the block reader for a given corpus, careful consideration should be given to the size of blocks handled by the block reader. Smaller block sizes will increase the memory requirements of the corpus view's internal data structures (by 2 integers per block). On the other hand, larger block sizes may decrease performance for random access to the corpus. (But note that larger block sizes will *not* decrease performance for iteration.) Internally, ``CorpusView`` maintains a partial mapping from token index to file position, with one entry per block. When a token with a given index *i* is requested, the ``CorpusView`` constructs it as follows: 1. First, it searches the toknum/filepos mapping for the token index closest to (but less than or equal to) *i*. 2. Then, starting at the file position corresponding to that index, it reads one block at a time using the block reader until it reaches the requested token. The toknum/filepos mapping is created lazily: it is initially empty, but every time a new block is read, the block's initial token is added to the mapping. (Thus, the toknum/filepos map has one entry per block.) In order to increase efficiency for random access patterns that have high degrees of locality, the corpus view may cache one or more blocks. :note: Each ``CorpusView`` object internally maintains an open file object for its underlying corpus file. 
This file should be automatically closed when the ``CorpusView`` is garbage collected, but if you wish to close it manually, use the ``close()`` method. If you access a ``CorpusView``'s items after it has been closed, the file object will be automatically re-opened. :warning: If the contents of the file are modified during the lifetime of the ``CorpusView``, then the ``CorpusView``'s behavior is undefined. :warning: If a unicode encoding is specified when constructing a ``CorpusView``, then the block reader may only call ``stream.seek()`` with offsets that have been returned by ``stream.tell()``; in particular, calling ``stream.seek()`` with relative offsets, or with offsets based on string lengths, may lead to incorrect behavior. :ivar _block_reader: The function used to read a single block from the underlying file stream. :ivar _toknum: A list containing the token index of each block that has been processed. In particular, ``_toknum[i]`` is the token index of the first token in block ``i``. Together with ``_filepos``, this forms a partial mapping between token indices and file positions. :ivar _filepos: A list containing the file position of each block that has been processed. In particular, ``_toknum[i]`` is the file position of the first character in block ``i``. Together with ``_toknum``, this forms a partial mapping between token indices and file positions. :ivar _stream: The stream used to access the underlying corpus file. :ivar _len: The total number of tokens in the corpus, if known; or None, if the number of tokens is not yet known. :ivar _eofpos: The character position of the last character in the file. This is calculated when the corpus view is initialized, and is used to decide when the end of file has been reached. :ivar _cache: A cache of the most recently read block. It is encoded as a tuple (start_toknum, end_toknum, tokens), where start_toknum is the token index of the first token in the block; end_toknum is the token index of the first token not in the block; and tokens is a list of the tokens in the block. """ def __init__(self, fileid, block_reader=None, startpos=0, encoding="utf8"): """ Create a new corpus view, based on the file ``fileid``, and read with ``block_reader``. See the class documentation for more information. :param fileid: The path to the file that is read by this corpus view. ``fileid`` can either be a string or a ``PathPointer``. :param startpos: The file position at which the view will start reading. This can be used to skip over preface sections. :param encoding: The unicode encoding that should be used to read the file's contents. If no encoding is specified, then the file's contents will be read as a non-unicode string (i.e., a str). """ if block_reader: self.read_block = block_reader # Initialize our toknum/filepos mapping. self._toknum = [0] self._filepos = [startpos] self._encoding = encoding # We don't know our length (number of tokens) yet. self._len = None self._fileid = fileid self._stream = None self._current_toknum = None """This variable is set to the index of the next token that will be read, immediately before ``self.read_block()`` is called. This is provided for the benefit of the block reader, which under rare circumstances may need to know the current token number.""" self._current_blocknum = None """This variable is set to the index of the next block that will be read, immediately before ``self.read_block()`` is called. 
This is provided for the benefit of the block reader, which under rare circumstances may need to know the current block number.""" # Find the length of the file. try: if isinstance(self._fileid, PathPointer): self._eofpos = self._fileid.file_size() else: self._eofpos = os.stat(self._fileid).st_size except Exception as exc: raise ValueError(f"Unable to open or access {fileid!r} -- {exc}") from exc # Maintain a cache of the most recently read block, to # increase efficiency of random access. self._cache = (-1, -1, None) fileid = property( lambda self: self._fileid, doc=""" The fileid of the file that is accessed by this view. :type: str or PathPointer""", ) def read_block(self, stream): """ Read a block from the input stream. :return: a block of tokens from the input stream :rtype: list(any) :param stream: an input stream :type stream: stream """ raise NotImplementedError("Abstract Method") def _open(self): """ Open the file stream associated with this corpus view. This will be called performed if any value is read from the view while its file stream is closed. """ if isinstance(self._fileid, PathPointer): self._stream = self._fileid.open(self._encoding) elif self._encoding: self._stream = SeekableUnicodeStreamReader( open(self._fileid, "rb"), self._encoding ) else: self._stream = open(self._fileid, "rb") def close(self): """ Close the file stream associated with this corpus view. This can be useful if you are worried about running out of file handles (although the stream should automatically be closed upon garbage collection of the corpus view). If the corpus view is accessed after it is closed, it will be automatically re-opened. """ if self._stream is not None: self._stream.close() self._stream = None def __enter__(self): return self def __exit__(self, type, value, traceback): self.close() def __len__(self): if self._len is None: # iterate_from() sets self._len when it reaches the end # of the file: for tok in self.iterate_from(self._toknum[-1]): pass return self._len def __getitem__(self, i): if isinstance(i, slice): start, stop = slice_bounds(self, i) # Check if it's in the cache. offset = self._cache[0] if offset <= start and stop <= self._cache[1]: return self._cache[2][start - offset : stop - offset] # Construct & return the result. return LazySubsequence(self, start, stop) else: # Handle negative indices if i < 0: i += len(self) if i < 0: raise IndexError("index out of range") # Check if it's in the cache. offset = self._cache[0] if offset <= i < self._cache[1]: return self._cache[2][i - offset] # Use iterate_from to extract it. try: return next(self.iterate_from(i)) except StopIteration as e: raise IndexError("index out of range") from e # If we wanted to be thread-safe, then this method would need to # do some locking. def iterate_from(self, start_tok): # Start by feeding from the cache, if possible. if self._cache[0] <= start_tok < self._cache[1]: for tok in self._cache[2][start_tok - self._cache[0] :]: yield tok start_tok += 1 # Decide where in the file we should start. If `start` is in # our mapping, then we can jump straight to the correct block; # otherwise, start at the last block we've processed. if start_tok < self._toknum[-1]: block_index = bisect.bisect_right(self._toknum, start_tok) - 1 toknum = self._toknum[block_index] filepos = self._filepos[block_index] else: block_index = len(self._toknum) - 1 toknum = self._toknum[-1] filepos = self._filepos[-1] # Open the stream, if it's not open already. 
if self._stream is None: self._open() # If the file is empty, the while loop will never run. # This *seems* to be all the state we need to set: if self._eofpos == 0: self._len = 0 # Each iteration through this loop, we read a single block # from the stream. while filepos < self._eofpos: # Read the next block. self._stream.seek(filepos) self._current_toknum = toknum self._current_blocknum = block_index tokens = self.read_block(self._stream) assert isinstance(tokens, (tuple, list, AbstractLazySequence)), ( "block reader %s() should return list or tuple." % self.read_block.__name__ ) num_toks = len(tokens) new_filepos = self._stream.tell() assert ( new_filepos > filepos ), "block reader %s() should consume at least 1 byte (filepos=%d)" % ( self.read_block.__name__, filepos, ) # Update our cache. self._cache = (toknum, toknum + num_toks, list(tokens)) # Update our mapping. assert toknum <= self._toknum[-1] if num_toks > 0: block_index += 1 if toknum == self._toknum[-1]: assert new_filepos > self._filepos[-1] # monotonic! self._filepos.append(new_filepos) self._toknum.append(toknum + num_toks) else: # Check for consistency: assert ( new_filepos == self._filepos[block_index] ), "inconsistent block reader (num chars read)" assert ( toknum + num_toks == self._toknum[block_index] ), "inconsistent block reader (num tokens returned)" # If we reached the end of the file, then update self._len if new_filepos == self._eofpos: self._len = toknum + num_toks # Generate the tokens in this block (but skip any tokens # before start_tok). Note that between yields, our state # may be modified. for tok in tokens[max(0, start_tok - toknum) :]: yield tok # If we're at the end of the file, then we're done. assert new_filepos <= self._eofpos if new_filepos == self._eofpos: break # Update our indices toknum += num_toks filepos = new_filepos # If we reach this point, then we should know our length. assert self._len is not None # Enforce closing of stream once we reached end of file # We should have reached EOF once we're out of the while loop. self.close() # Use concat for these, so we can use a ConcatenatedCorpusView # when possible. def __add__(self, other): return concat([self, other]) def __radd__(self, other): return concat([other, self]) def __mul__(self, count): return concat([self] * count) def __rmul__(self, count): return concat([self] * count) class ConcatenatedCorpusView(AbstractLazySequence): """ A 'view' of a corpus file that joins together one or more ``StreamBackedCorpusViews``. At most one file handle is left open at any time. """ def __init__(self, corpus_views): self._pieces = corpus_views """A list of the corpus subviews that make up this concatenation.""" self._offsets = [0] """A list of offsets, indicating the index at which each subview begins. In particular:: offsets[i] = sum([len(p) for p in pieces[:i]])""" self._open_piece = None """The most recently accessed corpus subview (or None). Before a new subview is accessed, this subview will be closed.""" def __len__(self): if len(self._offsets) <= len(self._pieces): # Iterate to the end of the corpus. for tok in self.iterate_from(self._offsets[-1]): pass return self._offsets[-1] def close(self): for piece in self._pieces: piece.close() def iterate_from(self, start_tok): piecenum = bisect.bisect_right(self._offsets, start_tok) - 1 while piecenum < len(self._pieces): offset = self._offsets[piecenum] piece = self._pieces[piecenum] # If we've got another piece open, close it first. 
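# (At most one subview's file handle is kept open at a time -- see the class docstring.)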
if self._open_piece is not piece: if self._open_piece is not None: self._open_piece.close() self._open_piece = piece # Get everything we can from this piece. yield from piece.iterate_from(max(0, start_tok - offset)) # Update the offset table. if piecenum + 1 == len(self._offsets): self._offsets.append(self._offsets[-1] + len(piece)) # Move on to the next piece. piecenum += 1 def concat(docs): """ Concatenate together the contents of multiple documents from a single corpus, using an appropriate concatenation function. This utility function is used by corpus readers when the user requests more than one document at a time. """ if len(docs) == 1: return docs[0] if len(docs) == 0: raise ValueError("concat() expects at least one object!") types = {d.__class__ for d in docs} # If they're all strings, use string concatenation. if all(isinstance(doc, str) for doc in docs): return "".join(docs) # If they're all corpus views, then use ConcatenatedCorpusView. for typ in types: if not issubclass(typ, (StreamBackedCorpusView, ConcatenatedCorpusView)): break else: return ConcatenatedCorpusView(docs) # If they're all lazy sequences, use a lazy concatenation for typ in types: if not issubclass(typ, AbstractLazySequence): break else: return LazyConcatenation(docs) # Otherwise, see what we can do: if len(types) == 1: typ = list(types)[0] if issubclass(typ, list): return reduce((lambda a, b: a + b), docs, []) if issubclass(typ, tuple): return reduce((lambda a, b: a + b), docs, ()) if ElementTree.iselement(typ): xmltree = ElementTree.Element("documents") for doc in docs: xmltree.append(doc) return xmltree # No method found! raise ValueError("Don't know how to concatenate types: %r" % types) ###################################################################### # { Corpus View for Pickled Sequences ###################################################################### class PickleCorpusView(StreamBackedCorpusView): """ A stream backed corpus view for corpus files that consist of sequences of serialized Python objects (serialized using ``pickle.dump``). One use case for this class is to store the result of running feature detection on a corpus to disk. This can be useful when performing feature detection is expensive (so we don't want to repeat it); but the corpus is too large to store in memory. The following example illustrates this technique: >>> from nltk.corpus.reader.util import PickleCorpusView >>> from nltk.util import LazyMap >>> feature_corpus = LazyMap(detect_features, corpus) # doctest: +SKIP >>> PickleCorpusView.write(feature_corpus, some_fileid) # doctest: +SKIP >>> pcv = PickleCorpusView(some_fileid) # doctest: +SKIP """ BLOCK_SIZE = 100 PROTOCOL = -1 def __init__(self, fileid, delete_on_gc=False): """ Create a new corpus view that reads the pickle corpus ``fileid``. :param delete_on_gc: If true, then ``fileid`` will be deleted whenever this object gets garbage-collected. """ self._delete_on_gc = delete_on_gc StreamBackedCorpusView.__init__(self, fileid) def read_block(self, stream): result = [] for i in range(self.BLOCK_SIZE): try: result.append(pickle.load(stream)) except EOFError: break return result def __del__(self): """ If ``delete_on_gc`` was set to true when this ``PickleCorpusView`` was created, then delete the corpus view's fileid. (This method is called whenever a ``PickledCorpusView`` is garbage-collected. 
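The file is only removed if it still exists, and any ``OSError`` raised
while removing it is ignored.)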
""" if getattr(self, "_delete_on_gc"): if os.path.exists(self._fileid): try: os.remove(self._fileid) except OSError: pass self.__dict__.clear() # make the garbage collector's job easier @classmethod def write(cls, sequence, output_file): if isinstance(output_file, str): output_file = open(output_file, "wb") for item in sequence: pickle.dump(item, output_file, cls.PROTOCOL) @classmethod def cache_to_tempfile(cls, sequence, delete_on_gc=True): """ Write the given sequence to a temporary file as a pickle corpus; and then return a ``PickleCorpusView`` view for that temporary corpus file. :param delete_on_gc: If true, then the temporary file will be deleted whenever this object gets garbage-collected. """ try: fd, output_file_name = tempfile.mkstemp(".pcv", "nltk-") output_file = os.fdopen(fd, "wb") cls.write(sequence, output_file) output_file.close() return PickleCorpusView(output_file_name, delete_on_gc) except OSError as e: raise ValueError("Error while creating temp file: %s" % e) from e ###################################################################### # { Block Readers ###################################################################### def read_whitespace_block(stream): toks = [] for i in range(20): # Read 20 lines at a time. toks.extend(stream.readline().split()) return toks def read_wordpunct_block(stream): toks = [] for i in range(20): # Read 20 lines at a time. toks.extend(wordpunct_tokenize(stream.readline())) return toks def read_line_block(stream): toks = [] for i in range(20): line = stream.readline() if not line: return toks toks.append(line.rstrip("\n")) return toks def read_blankline_block(stream): s = "" while True: line = stream.readline() # End of file: if not line: if s: return [s] else: return [] # Blank line: elif line and not line.strip(): if s: return [s] # Other line: else: s += line def read_alignedsent_block(stream): s = "" while True: line = stream.readline() if line[0] == "=" or line[0] == "\n" or line[:2] == "\r\n": continue # End of file: if not line: if s: return [s] else: return [] # Other line: else: s += line if re.match(r"^\d+-\d+", line) is not None: return [s] def read_regexp_block(stream, start_re, end_re=None): """ Read a sequence of tokens from a stream, where tokens begin with lines that match ``start_re``. If ``end_re`` is specified, then tokens end with lines that match ``end_re``; otherwise, tokens end whenever the next line matching ``start_re`` or EOF is found. """ # Scan until we find a line matching the start regexp. while True: line = stream.readline() if not line: return [] # end of file. if re.match(start_re, line): break # Scan until we find another line matching the regexp, or EOF. lines = [line] while True: oldpos = stream.tell() line = stream.readline() # End of file: if not line: return ["".join(lines)] # End of token: if end_re is not None and re.match(end_re, line): return ["".join(lines)] # Start of new token: backup to just before it starts, and # return the token we've already collected. if end_re is None and re.match(start_re, line): stream.seek(oldpos) return ["".join(lines)] # Anything else is part of the token. lines.append(line) def read_sexpr_block(stream, block_size=16384, comment_char=None): """ Read a sequence of s-expressions from the stream, and leave the stream's file position at the end the last complete s-expression read. This function will always return at least one s-expression, unless there are no more s-expressions in the file. 
If the file ends in in the middle of an s-expression, then that incomplete s-expression is returned when the end of the file is reached. :param block_size: The default block size for reading. If an s-expression is longer than one block, then more than one block will be read. :param comment_char: A character that marks comments. Any lines that begin with this character will be stripped out. (If spaces or tabs precede the comment character, then the line will not be stripped.) """ start = stream.tell() block = stream.read(block_size) encoding = getattr(stream, "encoding", None) assert encoding is not None or isinstance(block, str) if encoding not in (None, "utf-8"): import warnings warnings.warn( "Parsing may fail, depending on the properties " "of the %s encoding!" % encoding ) # (e.g., the utf-16 encoding does not work because it insists # on adding BOMs to the beginning of encoded strings.) if comment_char: COMMENT = re.compile("(?m)^%s.*$" % re.escape(comment_char)) while True: try: # If we're stripping comments, then make sure our block ends # on a line boundary; and then replace any comments with # space characters. (We can't just strip them out -- that # would make our offset wrong.) if comment_char: block += stream.readline() block = re.sub(COMMENT, _sub_space, block) # Read the block. tokens, offset = _parse_sexpr_block(block) # Skip whitespace offset = re.compile(r"\s*").search(block, offset).end() # Move to the end position. if encoding is None: stream.seek(start + offset) else: stream.seek(start + len(block[:offset].encode(encoding))) # Return the list of tokens we processed return tokens except ValueError as e: if e.args[0] == "Block too small": next_block = stream.read(block_size) if next_block: block += next_block continue else: # The file ended mid-sexpr -- return what we got. return [block.strip()] else: raise def _sub_space(m): """Helper function: given a regexp match, return a string of spaces that's the same length as the matched string.""" return " " * (m.end() - m.start()) def _parse_sexpr_block(block): tokens = [] start = end = 0 while end < len(block): m = re.compile(r"\S").search(block, end) if not m: return tokens, end start = m.start() # Case 1: sexpr is not parenthesized. if m.group() != "(": m2 = re.compile(r"[\s(]").search(block, start) if m2: end = m2.start() else: if tokens: return tokens, end raise ValueError("Block too small") # Case 2: parenthesized sexpr. else: nesting = 0 for m in re.compile(r"[()]").finditer(block, start): if m.group() == "(": nesting += 1 else: nesting -= 1 if nesting == 0: end = m.end() break else: if tokens: return tokens, end raise ValueError("Block too small") tokens.append(block[start:end]) return tokens, end ###################################################################### # { Finding Corpus Items ###################################################################### def find_corpus_fileids(root, regexp): if not isinstance(root, PathPointer): raise TypeError("find_corpus_fileids: expected a PathPointer") regexp += "$" # Find fileids in a zipfile: scan the zipfile's namelist. Filter # out entries that end in '/' -- they're directories. if isinstance(root, ZipFilePathPointer): fileids = [ name[len(root.entry) :] for name in root.zipfile.namelist() if not name.endswith("/") ] items = [name for name in fileids if re.match(regexp, name)] return sorted(items) # Find fileids in a directory: use os.walk to search all (proper # or symlinked) subdirectories, and match paths against the regexp. 
elif isinstance(root, FileSystemPathPointer): items = [] for dirname, subdirs, fileids in os.walk(root.path): prefix = "".join("%s/" % p for p in _path_from(root.path, dirname)) items += [ prefix + fileid for fileid in fileids if re.match(regexp, prefix + fileid) ] # Don't visit svn directories: if ".svn" in subdirs: subdirs.remove(".svn") return sorted(items) else: raise AssertionError("Don't know how to handle %r" % root) def _path_from(parent, child): if os.path.split(parent)[1] == "": parent = os.path.split(parent)[0] path = [] while parent != child: child, dirname = os.path.split(child) path.insert(0, dirname) assert os.path.split(child)[0] != child return path ###################################################################### # { Paragraph structure in Treebank files ###################################################################### def tagged_treebank_para_block_reader(stream): # Read the next paragraph. para = "" while True: line = stream.readline() # End of paragraph: if re.match(r"======+\s*$", line): if para.strip(): return [para] # End of file: elif line == "": if para.strip(): return [para] else: return [] # Content line: else: para += line nltk-3.7/nltk/corpus/reader/verbnet.py000066400000000000000000000600701420073152400201000ustar00rootroot00000000000000# Natural Language Toolkit: Verbnet Corpus Reader # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # URL: # For license information, see LICENSE.TXT """ An NLTK interface to the VerbNet verb lexicon For details about VerbNet see: https://verbs.colorado.edu/~mpalmer/projects/verbnet.html """ import re import textwrap from collections import defaultdict from nltk.corpus.reader.xmldocs import XMLCorpusReader class VerbnetCorpusReader(XMLCorpusReader): """ An NLTK interface to the VerbNet verb lexicon. From the VerbNet site: "VerbNet (VN) (Kipper-Schuler 2006) is the largest on-line verb lexicon currently available for English. It is a hierarchical domain-independent, broad-coverage verb lexicon with mappings to other lexical resources such as WordNet (Miller, 1990; Fellbaum, 1998), XTAG (XTAG Research Group, 2001), and FrameNet (Baker et al., 1998)." For details about VerbNet see: https://verbs.colorado.edu/~mpalmer/projects/verbnet.html """ # No unicode encoding param, since the data files are all XML. def __init__(self, root, fileids, wrap_etree=False): XMLCorpusReader.__init__(self, root, fileids, wrap_etree) self._lemma_to_class = defaultdict(list) """A dictionary mapping from verb lemma strings to lists of VerbNet class identifiers.""" self._wordnet_to_class = defaultdict(list) """A dictionary mapping from wordnet identifier strings to lists of VerbNet class identifiers.""" self._class_to_fileid = {} """A dictionary mapping from class identifiers to corresponding file identifiers. The keys of this dictionary provide a complete list of all classes and subclasses.""" self._shortid_to_longid = {} # Initialize the dictionaries. Use the quick (regexp-based) # method instead of the slow (xml-based) method, because it # runs 2-30 times faster. 
self._quick_index() _LONGID_RE = re.compile(r"([^\-\.]*)-([\d+.\-]+)$") """Regular expression that matches (and decomposes) longids""" _SHORTID_RE = re.compile(r"[\d+.\-]+$") """Regular expression that matches shortids""" _INDEX_RE = re.compile( r']+>|' r'' ) """Regular expression used by ``_index()`` to quickly scan the corpus for basic information.""" def lemmas(self, vnclass=None): """ Return a list of all verb lemmas that appear in any class, or in the ``classid`` if specified. """ if vnclass is None: return sorted(self._lemma_to_class.keys()) else: # [xx] should this include subclass members? if isinstance(vnclass, str): vnclass = self.vnclass(vnclass) return [member.get("name") for member in vnclass.findall("MEMBERS/MEMBER")] def wordnetids(self, vnclass=None): """ Return a list of all wordnet identifiers that appear in any class, or in ``classid`` if specified. """ if vnclass is None: return sorted(self._wordnet_to_class.keys()) else: # [xx] should this include subclass members? if isinstance(vnclass, str): vnclass = self.vnclass(vnclass) return sum( ( member.get("wn", "").split() for member in vnclass.findall("MEMBERS/MEMBER") ), [], ) def classids(self, lemma=None, wordnetid=None, fileid=None, classid=None): """ Return a list of the VerbNet class identifiers. If a file identifier is specified, then return only the VerbNet class identifiers for classes (and subclasses) defined by that file. If a lemma is specified, then return only VerbNet class identifiers for classes that contain that lemma as a member. If a wordnetid is specified, then return only identifiers for classes that contain that wordnetid as a member. If a classid is specified, then return only identifiers for subclasses of the specified VerbNet class. If nothing is specified, return all classids within VerbNet """ if fileid is not None: return [c for (c, f) in self._class_to_fileid.items() if f == fileid] elif lemma is not None: return self._lemma_to_class[lemma] elif wordnetid is not None: return self._wordnet_to_class[wordnetid] elif classid is not None: xmltree = self.vnclass(classid) return [ subclass.get("ID") for subclass in xmltree.findall("SUBCLASSES/VNSUBCLASS") ] else: return sorted(self._class_to_fileid.keys()) def vnclass(self, fileid_or_classid): """Returns VerbNet class ElementTree Return an ElementTree containing the xml for the specified VerbNet class. :param fileid_or_classid: An identifier specifying which class should be returned. Can be a file identifier (such as ``'put-9.1.xml'``), or a VerbNet class identifier (such as ``'put-9.1'``) or a short VerbNet class identifier (such as ``'9.1'``). """ # File identifier: just return the xml. if fileid_or_classid in self._fileids: return self.xml(fileid_or_classid) # Class identifier: get the xml, and find the right elt. classid = self.longid(fileid_or_classid) if classid in self._class_to_fileid: fileid = self._class_to_fileid[self.longid(classid)] tree = self.xml(fileid) if classid == tree.get("ID"): return tree else: for subclass in tree.findall(".//VNSUBCLASS"): if classid == subclass.get("ID"): return subclass else: assert False # we saw it during _index()! else: raise ValueError(f"Unknown identifier {fileid_or_classid}") def fileids(self, vnclass_ids=None): """ Return a list of fileids that make up this corpus. If ``vnclass_ids`` is specified, then return the fileids that make up the specified VerbNet class(es). 
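A short illustrative doctest (skipped because it requires the ``verbnet``
corpus to be installed; the class id ``'put-9.1'`` is the one used as an
example in the ``vnclass()`` documentation above):

    >>> from nltk.corpus import verbnet      # doctest: +SKIP
    >>> verbnet.fileids('put-9.1')           # doctest: +SKIP
    ['put-9.1.xml']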
""" if vnclass_ids is None: return self._fileids elif isinstance(vnclass_ids, str): return [self._class_to_fileid[self.longid(vnclass_ids)]] else: return [ self._class_to_fileid[self.longid(vnclass_id)] for vnclass_id in vnclass_ids ] def frames(self, vnclass): """Given a VerbNet class, this method returns VerbNet frames The members returned are: 1) Example 2) Description 3) Syntax 4) Semantics :param vnclass: A VerbNet class identifier; or an ElementTree containing the xml contents of a VerbNet class. :return: frames - a list of frame dictionaries """ if isinstance(vnclass, str): vnclass = self.vnclass(vnclass) frames = [] vnframes = vnclass.findall("FRAMES/FRAME") for vnframe in vnframes: frames.append( { "example": self._get_example_within_frame(vnframe), "description": self._get_description_within_frame(vnframe), "syntax": self._get_syntactic_list_within_frame(vnframe), "semantics": self._get_semantics_within_frame(vnframe), } ) return frames def subclasses(self, vnclass): """Returns subclass ids, if any exist Given a VerbNet class, this method returns subclass ids (if they exist) in a list of strings. :param vnclass: A VerbNet class identifier; or an ElementTree containing the xml contents of a VerbNet class. :return: list of subclasses """ if isinstance(vnclass, str): vnclass = self.vnclass(vnclass) subclasses = [ subclass.get("ID") for subclass in vnclass.findall("SUBCLASSES/VNSUBCLASS") ] return subclasses def themroles(self, vnclass): """Returns thematic roles participating in a VerbNet class Members returned as part of roles are- 1) Type 2) Modifiers :param vnclass: A VerbNet class identifier; or an ElementTree containing the xml contents of a VerbNet class. :return: themroles: A list of thematic roles in the VerbNet class """ if isinstance(vnclass, str): vnclass = self.vnclass(vnclass) themroles = [] for trole in vnclass.findall("THEMROLES/THEMROLE"): themroles.append( { "type": trole.get("type"), "modifiers": [ {"value": restr.get("Value"), "type": restr.get("type")} for restr in trole.findall("SELRESTRS/SELRESTR") ], } ) return themroles ###################################################################### # { Index Initialization ###################################################################### def _index(self): """ Initialize the indexes ``_lemma_to_class``, ``_wordnet_to_class``, and ``_class_to_fileid`` by scanning through the corpus fileids. This is fast if ElementTree uses the C implementation (<0.1 secs), but quite slow (>10 secs) if only the python implementation is available. """ for fileid in self._fileids: self._index_helper(self.xml(fileid), fileid) def _index_helper(self, xmltree, fileid): """Helper for ``_index()``""" vnclass = xmltree.get("ID") self._class_to_fileid[vnclass] = fileid self._shortid_to_longid[self.shortid(vnclass)] = vnclass for member in xmltree.findall("MEMBERS/MEMBER"): self._lemma_to_class[member.get("name")].append(vnclass) for wn in member.get("wn", "").split(): self._wordnet_to_class[wn].append(vnclass) for subclass in xmltree.findall("SUBCLASSES/VNSUBCLASS"): self._index_helper(subclass, fileid) def _quick_index(self): """ Initialize the indexes ``_lemma_to_class``, ``_wordnet_to_class``, and ``_class_to_fileid`` by scanning through the corpus fileids. This doesn't do proper xml parsing, but is good enough to find everything in the standard VerbNet corpus -- and it runs about 30 times faster than xml parsing (with the python ElementTree; only 2-3 times faster if ElementTree uses the C implementation). 
""" # nb: if we got rid of wordnet_to_class, this would run 2-3 # times faster. for fileid in self._fileids: vnclass = fileid[:-4] # strip the '.xml' self._class_to_fileid[vnclass] = fileid self._shortid_to_longid[self.shortid(vnclass)] = vnclass with self.open(fileid) as fp: for m in self._INDEX_RE.finditer(fp.read()): groups = m.groups() if groups[0] is not None: self._lemma_to_class[groups[0]].append(vnclass) for wn in groups[1].split(): self._wordnet_to_class[wn].append(vnclass) elif groups[2] is not None: self._class_to_fileid[groups[2]] = fileid vnclass = groups[2] # for elts. self._shortid_to_longid[self.shortid(vnclass)] = vnclass else: assert False, "unexpected match condition" ###################################################################### # { Identifier conversion ###################################################################### def longid(self, shortid): """Returns longid of a VerbNet class Given a short VerbNet class identifier (eg '37.10'), map it to a long id (eg 'confess-37.10'). If ``shortid`` is already a long id, then return it as-is""" if self._LONGID_RE.match(shortid): return shortid # it's already a longid. elif not self._SHORTID_RE.match(shortid): raise ValueError("vnclass identifier %r not found" % shortid) try: return self._shortid_to_longid[shortid] except KeyError as e: raise ValueError("vnclass identifier %r not found" % shortid) from e def shortid(self, longid): """Returns shortid of a VerbNet class Given a long VerbNet class identifier (eg 'confess-37.10'), map it to a short id (eg '37.10'). If ``longid`` is already a short id, then return it as-is.""" if self._SHORTID_RE.match(longid): return longid # it's already a shortid. m = self._LONGID_RE.match(longid) if m: return m.group(2) else: raise ValueError("vnclass identifier %r not found" % longid) ###################################################################### # { Frame access utility functions ###################################################################### def _get_semantics_within_frame(self, vnframe): """Returns semantics within a single frame A utility function to retrieve semantics within a frame in VerbNet Members of the semantics dictionary: 1) Predicate value 2) Arguments :param vnframe: An ElementTree containing the xml contents of a VerbNet frame. :return: semantics: semantics dictionary """ semantics_within_single_frame = [] for pred in vnframe.findall("SEMANTICS/PRED"): arguments = [ {"type": arg.get("type"), "value": arg.get("value")} for arg in pred.findall("ARGS/ARG") ] semantics_within_single_frame.append( {"predicate_value": pred.get("value"), "arguments": arguments} ) return semantics_within_single_frame def _get_example_within_frame(self, vnframe): """Returns example within a frame A utility function to retrieve an example within a frame in VerbNet. :param vnframe: An ElementTree containing the xml contents of a VerbNet frame. :return: example_text: The example sentence for this particular frame """ example_element = vnframe.find("EXAMPLES/EXAMPLE") if example_element is not None: example_text = example_element.text else: example_text = "" return example_text def _get_description_within_frame(self, vnframe): """Returns member description within frame A utility function to retrieve a description of participating members within a frame in VerbNet. :param vnframe: An ElementTree containing the xml contents of a VerbNet frame. 
:return: description: a description dictionary with members - primary and secondary """ description_element = vnframe.find("DESCRIPTION") return { "primary": description_element.attrib["primary"], "secondary": description_element.get("secondary", ""), } def _get_syntactic_list_within_frame(self, vnframe): """Returns semantics within a frame A utility function to retrieve semantics within a frame in VerbNet. Members of the syntactic dictionary: 1) POS Tag 2) Modifiers :param vnframe: An ElementTree containing the xml contents of a VerbNet frame. :return: syntax_within_single_frame """ syntax_within_single_frame = [] for elt in vnframe.find("SYNTAX"): pos_tag = elt.tag modifiers = dict() modifiers["value"] = elt.get("value") if "value" in elt.attrib else "" modifiers["selrestrs"] = [ {"value": restr.get("Value"), "type": restr.get("type")} for restr in elt.findall("SELRESTRS/SELRESTR") ] modifiers["synrestrs"] = [ {"value": restr.get("Value"), "type": restr.get("type")} for restr in elt.findall("SYNRESTRS/SYNRESTR") ] syntax_within_single_frame.append( {"pos_tag": pos_tag, "modifiers": modifiers} ) return syntax_within_single_frame ###################################################################### # { Pretty Printing ###################################################################### def pprint(self, vnclass): """Returns pretty printed version of a VerbNet class Return a string containing a pretty-printed representation of the given VerbNet class. :param vnclass: A VerbNet class identifier; or an ElementTree containing the xml contents of a VerbNet class. """ if isinstance(vnclass, str): vnclass = self.vnclass(vnclass) s = vnclass.get("ID") + "\n" s += self.pprint_subclasses(vnclass, indent=" ") + "\n" s += self.pprint_members(vnclass, indent=" ") + "\n" s += " Thematic roles:\n" s += self.pprint_themroles(vnclass, indent=" ") + "\n" s += " Frames:\n" s += self.pprint_frames(vnclass, indent=" ") return s def pprint_subclasses(self, vnclass, indent=""): """Returns pretty printed version of subclasses of VerbNet class Return a string containing a pretty-printed representation of the given VerbNet class's subclasses. :param vnclass: A VerbNet class identifier; or an ElementTree containing the xml contents of a VerbNet class. """ if isinstance(vnclass, str): vnclass = self.vnclass(vnclass) subclasses = self.subclasses(vnclass) if not subclasses: subclasses = ["(none)"] s = "Subclasses: " + " ".join(subclasses) return textwrap.fill( s, 70, initial_indent=indent, subsequent_indent=indent + " " ) def pprint_members(self, vnclass, indent=""): """Returns pretty printed version of members in a VerbNet class Return a string containing a pretty-printed representation of the given VerbNet class's member verbs. :param vnclass: A VerbNet class identifier; or an ElementTree containing the xml contents of a VerbNet class. """ if isinstance(vnclass, str): vnclass = self.vnclass(vnclass) members = self.lemmas(vnclass) if not members: members = ["(none)"] s = "Members: " + " ".join(members) return textwrap.fill( s, 70, initial_indent=indent, subsequent_indent=indent + " " ) def pprint_themroles(self, vnclass, indent=""): """Returns pretty printed version of thematic roles in a VerbNet class Return a string containing a pretty-printed representation of the given VerbNet class's thematic roles. :param vnclass: A VerbNet class identifier; or an ElementTree containing the xml contents of a VerbNet class. 
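For example (skipped because it requires the ``verbnet`` corpus to be
installed; output omitted here):

    >>> from nltk.corpus import verbnet                    # doctest: +SKIP
    >>> print(verbnet.pprint_themroles('put-9.1'))         # doctest: +SKIP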
""" if isinstance(vnclass, str): vnclass = self.vnclass(vnclass) pieces = [] for themrole in self.themroles(vnclass): piece = indent + "* " + themrole.get("type") modifiers = [ modifier["value"] + modifier["type"] for modifier in themrole["modifiers"] ] if modifiers: piece += "[{}]".format(" ".join(modifiers)) pieces.append(piece) return "\n".join(pieces) def pprint_frames(self, vnclass, indent=""): """Returns pretty version of all frames in a VerbNet class Return a string containing a pretty-printed representation of the list of frames within the VerbNet class. :param vnclass: A VerbNet class identifier; or an ElementTree containing the xml contents of a VerbNet class. """ if isinstance(vnclass, str): vnclass = self.vnclass(vnclass) pieces = [] for vnframe in self.frames(vnclass): pieces.append(self._pprint_single_frame(vnframe, indent)) return "\n".join(pieces) def _pprint_single_frame(self, vnframe, indent=""): """Returns pretty printed version of a single frame in a VerbNet class Returns a string containing a pretty-printed representation of the given frame. :param vnframe: An ElementTree containing the xml contents of a VerbNet frame. """ frame_string = self._pprint_description_within_frame(vnframe, indent) + "\n" frame_string += self._pprint_example_within_frame(vnframe, indent + " ") + "\n" frame_string += ( self._pprint_syntax_within_frame(vnframe, indent + " Syntax: ") + "\n" ) frame_string += indent + " Semantics:\n" frame_string += self._pprint_semantics_within_frame(vnframe, indent + " ") return frame_string def _pprint_example_within_frame(self, vnframe, indent=""): """Returns pretty printed version of example within frame in a VerbNet class Return a string containing a pretty-printed representation of the given VerbNet frame example. :param vnframe: An ElementTree containing the xml contents of a Verbnet frame. """ if vnframe["example"]: return indent + " Example: " + vnframe["example"] def _pprint_description_within_frame(self, vnframe, indent=""): """Returns pretty printed version of a VerbNet frame description Return a string containing a pretty-printed representation of the given VerbNet frame description. :param vnframe: An ElementTree containing the xml contents of a VerbNet frame. """ description = indent + vnframe["description"]["primary"] if vnframe["description"]["secondary"]: description += " ({})".format(vnframe["description"]["secondary"]) return description def _pprint_syntax_within_frame(self, vnframe, indent=""): """Returns pretty printed version of syntax within a frame in a VerbNet class Return a string containing a pretty-printed representation of the given VerbNet frame syntax. :param vnframe: An ElementTree containing the xml contents of a VerbNet frame. """ pieces = [] for element in vnframe["syntax"]: piece = element["pos_tag"] modifier_list = [] if "value" in element["modifiers"] and element["modifiers"]["value"]: modifier_list.append(element["modifiers"]["value"]) modifier_list += [ "{}{}".format(restr["value"], restr["type"]) for restr in ( element["modifiers"]["selrestrs"] + element["modifiers"]["synrestrs"] ) ] if modifier_list: piece += "[{}]".format(" ".join(modifier_list)) pieces.append(piece) return indent + " ".join(pieces) def _pprint_semantics_within_frame(self, vnframe, indent=""): """Returns a pretty printed version of semantics within frame in a VerbNet class Return a string containing a pretty-printed representation of the given VerbNet frame semantics. :param vnframe: An ElementTree containing the xml contents of a VerbNet frame. 
""" pieces = [] for predicate in vnframe["semantics"]: arguments = [argument["value"] for argument in predicate["arguments"]] pieces.append( "{}({})".format(predicate["predicate_value"], ", ".join(arguments)) ) return "\n".join(f"{indent}* {piece}" for piece in pieces) nltk-3.7/nltk/corpus/reader/wordlist.py000066400000000000000000000130161420073152400203000ustar00rootroot00000000000000# Natural Language Toolkit: Word List Corpus Reader # # Copyright (C) 2001-2022 NLTK Project # Author: Steven Bird # Edward Loper # URL: # For license information, see LICENSE.TXT from nltk.corpus.reader.api import * from nltk.corpus.reader.util import * from nltk.tokenize import line_tokenize class WordListCorpusReader(CorpusReader): """ List of words, one per line. Blank lines are ignored. """ def words(self, fileids=None, ignore_lines_startswith="\n"): return [ line for line in line_tokenize(self.raw(fileids)) if not line.startswith(ignore_lines_startswith) ] class SwadeshCorpusReader(WordListCorpusReader): def entries(self, fileids=None): """ :return: a tuple of words for the specified fileids. """ if not fileids: fileids = self.fileids() wordlists = [self.words(f) for f in fileids] return list(zip(*wordlists)) class NonbreakingPrefixesCorpusReader(WordListCorpusReader): """ This is a class to read the nonbreaking prefixes textfiles from the Moses Machine Translation toolkit. These lists are used in the Python port of the Moses' word tokenizer. """ available_langs = { "catalan": "ca", "czech": "cs", "german": "de", "greek": "el", "english": "en", "spanish": "es", "finnish": "fi", "french": "fr", "hungarian": "hu", "icelandic": "is", "italian": "it", "latvian": "lv", "dutch": "nl", "polish": "pl", "portuguese": "pt", "romanian": "ro", "russian": "ru", "slovak": "sk", "slovenian": "sl", "swedish": "sv", "tamil": "ta", } # Also, add the lang IDs as the keys. available_langs.update({v: v for v in available_langs.values()}) def words(self, lang=None, fileids=None, ignore_lines_startswith="#"): """ This module returns a list of nonbreaking prefixes for the specified language(s). >>> from nltk.corpus import nonbreaking_prefixes as nbp >>> nbp.words('en')[:10] == [u'A', u'B', u'C', u'D', u'E', u'F', u'G', u'H', u'I', u'J'] True >>> nbp.words('ta')[:5] == [u'\u0b85', u'\u0b86', u'\u0b87', u'\u0b88', u'\u0b89'] True :return: a list words for the specified language(s). """ # If *lang* in list of languages available, allocate apt fileid. # Otherwise, the function returns non-breaking prefixes for # all languages when fileids==None. if lang in self.available_langs: lang = self.available_langs[lang] fileids = ["nonbreaking_prefix." + lang] return [ line for line in line_tokenize(self.raw(fileids)) if not line.startswith(ignore_lines_startswith) ] class UnicharsCorpusReader(WordListCorpusReader): """ This class is used to read lists of characters from the Perl Unicode Properties (see https://perldoc.perl.org/perluniprops.html). The files in the perluniprop.zip are extracted using the Unicode::Tussle module from https://search.cpan.org/~bdfoy/Unicode-Tussle-1.11/lib/Unicode/Tussle.pm """ # These are categories similar to the Perl Unicode Properties available_categories = [ "Close_Punctuation", "Currency_Symbol", "IsAlnum", "IsAlpha", "IsLower", "IsN", "IsSc", "IsSo", "IsUpper", "Line_Separator", "Number", "Open_Punctuation", "Punctuation", "Separator", "Symbol", ] def chars(self, category=None, fileids=None): """ This module returns a list of characters from the Perl Unicode Properties. 
They are very useful when porting Perl tokenizers to Python. >>> from nltk.corpus import perluniprops as pup >>> pup.chars('Open_Punctuation')[:5] == [u'(', u'[', u'{', u'\u0f3a', u'\u0f3c'] True >>> pup.chars('Currency_Symbol')[:5] == [u'$', u'\xa2', u'\xa3', u'\xa4', u'\xa5'] True >>> pup.available_categories ['Close_Punctuation', 'Currency_Symbol', 'IsAlnum', 'IsAlpha', 'IsLower', 'IsN', 'IsSc', 'IsSo', 'IsUpper', 'Line_Separator', 'Number', 'Open_Punctuation', 'Punctuation', 'Separator', 'Symbol'] :return: a list of characters given the specific unicode character category """ if category in self.available_categories: fileids = [category + ".txt"] return list(self.raw(fileids).strip()) class MWAPPDBCorpusReader(WordListCorpusReader): """ This class is used to read the list of word pairs from the subset of lexical pairs of The Paraphrase Database (PPDB) XXXL used in the Monolingual Word Alignment (MWA) algorithm described in Sultan et al. (2014a, 2014b, 2015): - http://acl2014.org/acl2014/Q14/pdf/Q14-1017 - https://www.aclweb.org/anthology/S14-2039 - https://www.aclweb.org/anthology/S15-2027 The original source of the full PPDB corpus can be found on https://www.cis.upenn.edu/~ccb/ppdb/ :return: a list of tuples of similar lexical terms. """ mwa_ppdb_xxxl_file = "ppdb-1.0-xxxl-lexical.extended.synonyms.uniquepairs" def entries(self, fileids=mwa_ppdb_xxxl_file): """ :return: a tuple of synonym word pairs. """ return [tuple(line.split("\t")) for line in line_tokenize(self.raw(fileids))] nltk-3.7/nltk/corpus/reader/wordnet.py000066400000000000000000002536421420073152400201260ustar00rootroot00000000000000# Natural Language Toolkit: WordNet # # Copyright (C) 2001-2022 NLTK Project # Author: Steven Bethard # Steven Bird # Edward Loper # Nitin Madnani # Nasruddin A’aidil Shari # Sim Wei Ying Geraldine # Soe Lynn # Francis Bond # Eric Kafe # URL: # For license information, see LICENSE.TXT """ An NLTK interface for WordNet WordNet is a lexical database of English. Using synsets, helps find conceptual relationships between words such as hypernyms, hyponyms, synonyms, antonyms etc. For details about WordNet see: https://wordnet.princeton.edu/ This module also allows you to find lemmas in languages other than English from the Open Multilingual Wordnet http://compling.hss.ntu.edu.sg/omw/ """ import math import os import re import warnings from collections import defaultdict, deque from functools import total_ordering from itertools import chain, islice from operator import itemgetter from nltk.corpus.reader import CorpusReader from nltk.internals import deprecated from nltk.probability import FreqDist from nltk.util import binary_search_file as _binary_search_file ###################################################################### # Table of Contents ###################################################################### # - Constants # - Data Classes # - WordNetError # - Lemma # - Synset # - WordNet Corpus Reader # - WordNet Information Content Corpus Reader # - Similarity Metrics # - Demo ###################################################################### # Constants ###################################################################### #: Positive infinity (for similarity functions) _INF = 1e300 # { Part-of-speech constants ADJ, ADJ_SAT, ADV, NOUN, VERB = "a", "s", "r", "n", "v" # } POS_LIST = [NOUN, VERB, ADJ, ADV] # A table of strings that are used to express verb frames. 
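# Each template contains a ``%s`` placeholder that is filled in with a form
# of the verb; see ``Lemma.frame_strings()``.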
VERB_FRAME_STRINGS = ( None, "Something %s", "Somebody %s", "It is %sing", "Something is %sing PP", "Something %s something Adjective/Noun", "Something %s Adjective/Noun", "Somebody %s Adjective", "Somebody %s something", "Somebody %s somebody", "Something %s somebody", "Something %s something", "Something %s to somebody", "Somebody %s on something", "Somebody %s somebody something", "Somebody %s something to somebody", "Somebody %s something from somebody", "Somebody %s somebody with something", "Somebody %s somebody of something", "Somebody %s something on somebody", "Somebody %s somebody PP", "Somebody %s something PP", "Somebody %s PP", "Somebody's (body part) %s", "Somebody %s somebody to INFINITIVE", "Somebody %s somebody INFINITIVE", "Somebody %s that CLAUSE", "Somebody %s to somebody", "Somebody %s to INFINITIVE", "Somebody %s whether INFINITIVE", "Somebody %s somebody into V-ing something", "Somebody %s something with something", "Somebody %s INFINITIVE", "Somebody %s VERB-ing", "It %s that CLAUSE", "Something %s INFINITIVE", ) SENSENUM_RE = re.compile(r"\.[\d]+\.") ###################################################################### # Data Classes ###################################################################### class WordNetError(Exception): """An exception class for wordnet-related errors.""" @total_ordering class _WordNetObject: """A common base class for lemmas and synsets.""" def hypernyms(self): return self._related("@") def _hypernyms(self): return self._related("@") def instance_hypernyms(self): return self._related("@i") def _instance_hypernyms(self): return self._related("@i") def hyponyms(self): return self._related("~") def instance_hyponyms(self): return self._related("~i") def member_holonyms(self): return self._related("#m") def substance_holonyms(self): return self._related("#s") def part_holonyms(self): return self._related("#p") def member_meronyms(self): return self._related("%m") def substance_meronyms(self): return self._related("%s") def part_meronyms(self): return self._related("%p") def topic_domains(self): return self._related(";c") def in_topic_domains(self): return self._related("-c") def region_domains(self): return self._related(";r") def in_region_domains(self): return self._related("-r") def usage_domains(self): return self._related(";u") def in_usage_domains(self): return self._related("-u") def attributes(self): return self._related("=") def entailments(self): return self._related("*") def causes(self): return self._related(">") def also_sees(self): return self._related("^") def verb_groups(self): return self._related("$") def similar_tos(self): return self._related("&") def __hash__(self): return hash(self._name) def __eq__(self, other): return self._name == other._name def __ne__(self, other): return self._name != other._name def __lt__(self, other): return self._name < other._name class Lemma(_WordNetObject): """ The lexical entry for a single morphological form of a sense-disambiguated word. Create a Lemma from a "..." string where: is the morphological stem identifying the synset is one of the module attributes ADJ, ADJ_SAT, ADV, NOUN or VERB is the sense number, counting from 0. is the morphological form of interest Note that and can be different, e.g. the Synset 'salt.n.03' has the Lemmas 'salt.n.03.salt', 'salt.n.03.saltiness' and 'salt.n.03.salinity'. Lemma attributes, accessible via methods with the same name: - name: The canonical name of this lemma. - synset: The synset that this lemma belongs to. 
- syntactic_marker: For adjectives, the WordNet string identifying the syntactic position relative modified noun. See: https://wordnet.princeton.edu/documentation/wninput5wn For all other parts of speech, this attribute is None. - count: The frequency of this lemma in wordnet. Lemma methods: Lemmas have the following methods for retrieving related Lemmas. They correspond to the names for the pointer symbols defined here: https://wordnet.princeton.edu/documentation/wninput5wn These methods all return lists of Lemmas: - antonyms - hypernyms, instance_hypernyms - hyponyms, instance_hyponyms - member_holonyms, substance_holonyms, part_holonyms - member_meronyms, substance_meronyms, part_meronyms - topic_domains, region_domains, usage_domains - attributes - derivationally_related_forms - entailments - causes - also_sees - verb_groups - similar_tos - pertainyms """ __slots__ = [ "_wordnet_corpus_reader", "_name", "_syntactic_marker", "_synset", "_frame_strings", "_frame_ids", "_lexname_index", "_lex_id", "_lang", "_key", ] def __init__( self, wordnet_corpus_reader, synset, name, lexname_index, lex_id, syntactic_marker, ): self._wordnet_corpus_reader = wordnet_corpus_reader self._name = name self._syntactic_marker = syntactic_marker self._synset = synset self._frame_strings = [] self._frame_ids = [] self._lexname_index = lexname_index self._lex_id = lex_id self._lang = "eng" self._key = None # gets set later. def name(self): return self._name def syntactic_marker(self): return self._syntactic_marker def synset(self): return self._synset def frame_strings(self): return self._frame_strings def frame_ids(self): return self._frame_ids def lang(self): return self._lang def key(self): return self._key def __repr__(self): tup = type(self).__name__, self._synset._name, self._name return "%s('%s.%s')" % tup def _related(self, relation_symbol): get_synset = self._wordnet_corpus_reader.synset_from_pos_and_offset if (self._name, relation_symbol) not in self._synset._lemma_pointers: return [] return [ get_synset(pos, offset)._lemmas[lemma_index] for pos, offset, lemma_index in self._synset._lemma_pointers[ self._name, relation_symbol ] ] def count(self): """Return the frequency count for this Lemma""" return self._wordnet_corpus_reader.lemma_count(self) def antonyms(self): return self._related("!") def derivationally_related_forms(self): return self._related("+") def pertainyms(self): return self._related("\\") class Synset(_WordNetObject): """Create a Synset from a ".." string where: is the word's morphological stem is one of the module attributes ADJ, ADJ_SAT, ADV, NOUN or VERB is the sense number, counting from 0. Synset attributes, accessible via methods with the same name: - name: The canonical name of this synset, formed using the first lemma of this synset. Note that this may be different from the name passed to the constructor if that string used a different lemma to identify the synset. - pos: The synset's part of speech, matching one of the module level attributes ADJ, ADJ_SAT, ADV, NOUN or VERB. - lemmas: A list of the Lemma objects for this synset. - definition: The definition for this synset. - examples: A list of example strings for this synset. - offset: The offset in the WordNet dict file of this synset. - lexname: The name of the lexicographer file containing this synset. Synset methods: Synsets have the following methods for retrieving related Synsets. 
They correspond to the names for the pointer symbols defined here: https://wordnet.princeton.edu/documentation/wninput5wn These methods all return lists of Synsets. - hypernyms, instance_hypernyms - hyponyms, instance_hyponyms - member_holonyms, substance_holonyms, part_holonyms - member_meronyms, substance_meronyms, part_meronyms - attributes - entailments - causes - also_sees - verb_groups - similar_tos Additionally, Synsets support the following methods specific to the hypernym relation: - root_hypernyms - common_hypernyms - lowest_common_hypernyms Note that Synsets do not support the following relations because these are defined by WordNet as lexical relations: - antonyms - derivationally_related_forms - pertainyms """ __slots__ = [ "_pos", "_offset", "_name", "_frame_ids", "_lemmas", "_lemma_names", "_definition", "_examples", "_lexname", "_pointers", "_lemma_pointers", "_max_depth", "_min_depth", ] def __init__(self, wordnet_corpus_reader): self._wordnet_corpus_reader = wordnet_corpus_reader # All of these attributes get initialized by # WordNetCorpusReader._synset_from_pos_and_line() self._pos = None self._offset = None self._name = None self._frame_ids = [] self._lemmas = [] self._lemma_names = [] self._definition = None self._examples = [] self._lexname = None # lexicographer name self._all_hypernyms = None self._pointers = defaultdict(set) self._lemma_pointers = defaultdict(list) def pos(self): return self._pos def offset(self): return self._offset def name(self): return self._name def frame_ids(self): return self._frame_ids def _doc(self, doc_type, default, lang="eng"): """Helper method for Synset.definition and Synset.examples""" corpus = self._wordnet_corpus_reader if lang not in corpus.langs(): return None elif lang == "eng": return default else: corpus._load_lang_data(lang) of = corpus.ss2of(self) i = corpus.lg_attrs.index(doc_type) if of in corpus._lang_data[lang][i].keys(): return corpus._lang_data[lang][i][of] else: return None def definition(self, lang="eng"): """Return definition in specified language""" return self._doc("def", self._definition, lang=lang) def examples(self, lang="eng"): """Return examples in specified language""" return self._doc("exe", self._examples, lang=lang) def lexname(self): return self._lexname def _needs_root(self): if self._pos == NOUN and self._wordnet_corpus_reader.get_version() != "1.6": return False else: return True def lemma_names(self, lang="eng"): """Return all the lemma_names associated with the synset""" if lang == "eng": return self._lemma_names else: self._wordnet_corpus_reader._load_lang_data(lang) i = self._wordnet_corpus_reader.ss2of(self, lang) if i in self._wordnet_corpus_reader._lang_data[lang][0]: return self._wordnet_corpus_reader._lang_data[lang][0][i] else: return [] def lemmas(self, lang="eng"): """Return all the lemma objects associated with the synset""" if lang == "eng": return self._lemmas else: self._wordnet_corpus_reader._load_lang_data(lang) lemmark = [] lemmy = self.lemma_names(lang) for lem in lemmy: temp = Lemma( self._wordnet_corpus_reader, self, lem, self._wordnet_corpus_reader._lexnames.index(self.lexname()), 0, None, ) temp._lang = lang lemmark.append(temp) return lemmark def root_hypernyms(self): """Get the topmost hypernyms of this synset in WordNet.""" result = [] seen = set() todo = [self] while todo: next_synset = todo.pop() if next_synset not in seen: seen.add(next_synset) next_hypernyms = ( next_synset.hypernyms() + next_synset.instance_hypernyms() ) if not next_hypernyms: result.append(next_synset) 
else: todo.extend(next_hypernyms) return result # Simpler implementation which makes incorrect assumption that # hypernym hierarchy is acyclic: # # if not self.hypernyms(): # return [self] # else: # return list(set(root for h in self.hypernyms() # for root in h.root_hypernyms())) def max_depth(self): """ :return: The length of the longest hypernym path from this synset to the root. """ if "_max_depth" not in self.__dict__: hypernyms = self.hypernyms() + self.instance_hypernyms() if not hypernyms: self._max_depth = 0 else: self._max_depth = 1 + max(h.max_depth() for h in hypernyms) return self._max_depth def min_depth(self): """ :return: The length of the shortest hypernym path from this synset to the root. """ if "_min_depth" not in self.__dict__: hypernyms = self.hypernyms() + self.instance_hypernyms() if not hypernyms: self._min_depth = 0 else: self._min_depth = 1 + min(h.min_depth() for h in hypernyms) return self._min_depth def closure(self, rel, depth=-1): """ Return the transitive closure of source under the rel relationship, breadth-first, discarding cycles: >>> from nltk.corpus import wordnet as wn >>> computer = wn.synset('computer.n.01') >>> topic = lambda s:s.topic_domains() >>> print(list(computer.closure(topic))) [Synset('computer_science.n.01')] UserWarning: Discarded redundant search for Synset('computer.n.01') at depth 2 Include redundant paths (but only once), avoiding duplicate searches (from 'animal.n.01' to 'entity.n.01'): >>> dog = wn.synset('dog.n.01') >>> hyp = lambda s:s.hypernyms() >>> print(list(dog.closure(hyp))) [Synset('canine.n.02'), Synset('domestic_animal.n.01'), Synset('carnivore.n.01'), Synset('animal.n.01'), Synset('placental.n.01'), Synset('organism.n.01'), Synset('mammal.n.01'), Synset('living_thing.n.01'), Synset('vertebrate.n.01'), Synset('whole.n.02'), Synset('chordate.n.01'), Synset('object.n.01'), Synset('physical_entity.n.01'), Synset('entity.n.01')] UserWarning: Discarded redundant search for Synset('animal.n.01') at depth 7 """ from nltk.util import acyclic_breadth_first for synset in acyclic_breadth_first(self, rel, depth): if synset != self: yield synset from nltk.util import acyclic_depth_first as acyclic_tree from nltk.util import unweighted_minimum_spanning_tree as mst # Also add this shortcut? 
# from nltk.util import unweighted_minimum_spanning_digraph as umsd def tree(self, rel, depth=-1, cut_mark=None): """ Return the full relation tree, including self, discarding cycles: >>> from nltk.corpus import wordnet as wn >>> from pprint import pprint >>> computer = wn.synset('computer.n.01') >>> topic = lambda s:s.topic_domains() >>> pprint(computer.tree(topic)) [Synset('computer.n.01'), [Synset('computer_science.n.01')]] UserWarning: Discarded redundant search for Synset('computer.n.01') at depth -3 But keep duplicate branches (from 'animal.n.01' to 'entity.n.01'): >>> dog = wn.synset('dog.n.01') >>> hyp = lambda s:s.hypernyms() >>> pprint(dog.tree(hyp)) [Synset('dog.n.01'), [Synset('canine.n.02'), [Synset('carnivore.n.01'), [Synset('placental.n.01'), [Synset('mammal.n.01'), [Synset('vertebrate.n.01'), [Synset('chordate.n.01'), [Synset('animal.n.01'), [Synset('organism.n.01'), [Synset('living_thing.n.01'), [Synset('whole.n.02'), [Synset('object.n.01'), [Synset('physical_entity.n.01'), [Synset('entity.n.01')]]]]]]]]]]]]], [Synset('domestic_animal.n.01'), [Synset('animal.n.01'), [Synset('organism.n.01'), [Synset('living_thing.n.01'), [Synset('whole.n.02'), [Synset('object.n.01'), [Synset('physical_entity.n.01'), [Synset('entity.n.01')]]]]]]]]] """ from nltk.util import acyclic_branches_depth_first return acyclic_branches_depth_first(self, rel, depth, cut_mark) def hypernym_paths(self): """ Get the path(s) from this synset to the root, where each path is a list of the synset nodes traversed on the way to the root. :return: A list of lists, where each list gives the node sequence connecting the initial ``Synset`` node and a root node. """ paths = [] hypernyms = self.hypernyms() + self.instance_hypernyms() if len(hypernyms) == 0: paths = [[self]] for hypernym in hypernyms: for ancestor_list in hypernym.hypernym_paths(): ancestor_list.append(self) paths.append(ancestor_list) return paths def common_hypernyms(self, other): """ Find all synsets that are hypernyms of this synset and the other synset. :type other: Synset :param other: other input synset. :return: The synsets that are hypernyms of both synsets. """ if not self._all_hypernyms: self._all_hypernyms = { self_synset for self_synsets in self._iter_hypernym_lists() for self_synset in self_synsets } if not other._all_hypernyms: other._all_hypernyms = { other_synset for other_synsets in other._iter_hypernym_lists() for other_synset in other_synsets } return list(self._all_hypernyms.intersection(other._all_hypernyms)) def lowest_common_hypernyms(self, other, simulate_root=False, use_min_depth=False): """ Get a list of lowest synset(s) that both synsets have as a hypernym. When `use_min_depth == False` this means that the synset which appears as a hypernym of both `self` and `other` with the lowest maximum depth is returned or if there are multiple such synsets at the same depth they are all returned However, if `use_min_depth == True` then the synset(s) which has/have the lowest minimum depth and appear(s) in both paths is/are returned. By setting the use_min_depth flag to True, the behavior of NLTK2 can be preserved. This was changed in NLTK3 to give more accurate results in a small set of cases, generally with synsets concerning people. (eg: 'chef.n.01', 'fireman.n.01', etc.) This method is an implementation of Ted Pedersen's "Lowest Common Subsumer" method from the Perl Wordnet module. It can return either "self" or "other" if they are a hypernym of the other. 
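For example (illustrative, with the standard WordNet 3.0 noun taxonomy,
where ``wn`` is the WordNet corpus reader)::

    wn.synset('dog.n.01').lowest_common_hypernyms(wn.synset('cat.n.01'))
    # -> [Synset('carnivore.n.01')], their deepest shared hypernym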
:type other: Synset :param other: other input synset :type simulate_root: bool :param simulate_root: The various verb taxonomies do not share a single root which disallows this metric from working for synsets that are not connected. This flag (False by default) creates a fake root that connects all the taxonomies. Set it to True to enable this behavior. For the noun taxonomy, there is usually a default root except for WordNet version 1.6. If you are using wordnet 1.6, a fake root will need to be added for nouns as well. :type use_min_depth: bool :param use_min_depth: This setting mimics older (v2) behavior of NLTK wordnet If True, will use the min_depth function to calculate the lowest common hypernyms. This is known to give strange results for some synset pairs (eg: 'chef.n.01', 'fireman.n.01') but is retained for backwards compatibility :return: The synsets that are the lowest common hypernyms of both synsets """ synsets = self.common_hypernyms(other) if simulate_root: fake_synset = Synset(None) fake_synset._name = "*ROOT*" fake_synset.hypernyms = lambda: [] fake_synset.instance_hypernyms = lambda: [] synsets.append(fake_synset) try: if use_min_depth: max_depth = max(s.min_depth() for s in synsets) unsorted_lch = [s for s in synsets if s.min_depth() == max_depth] else: max_depth = max(s.max_depth() for s in synsets) unsorted_lch = [s for s in synsets if s.max_depth() == max_depth] return sorted(unsorted_lch) except ValueError: return [] def hypernym_distances(self, distance=0, simulate_root=False): """ Get the path(s) from this synset to the root, counting the distance of each node from the initial node on the way. A set of (synset, distance) tuples is returned. :type distance: int :param distance: the distance (number of edges) from this hypernym to the original hypernym ``Synset`` on which this method was called. :return: A set of ``(Synset, int)`` tuples where each ``Synset`` is a hypernym of the first ``Synset``. """ distances = {(self, distance)} for hypernym in self._hypernyms() + self._instance_hypernyms(): distances |= hypernym.hypernym_distances(distance + 1, simulate_root=False) if simulate_root: fake_synset = Synset(None) fake_synset._name = "*ROOT*" fake_synset_distance = max(distances, key=itemgetter(1))[1] distances.add((fake_synset, fake_synset_distance + 1)) return distances def _shortest_hypernym_paths(self, simulate_root): if self._name == "*ROOT*": return {self: 0} queue = deque([(self, 0)]) path = {} while queue: s, depth = queue.popleft() if s in path: continue path[s] = depth depth += 1 queue.extend((hyp, depth) for hyp in s._hypernyms()) queue.extend((hyp, depth) for hyp in s._instance_hypernyms()) if simulate_root: fake_synset = Synset(None) fake_synset._name = "*ROOT*" path[fake_synset] = max(path.values()) + 1 return path def shortest_path_distance(self, other, simulate_root=False): """ Returns the distance of the shortest path linking the two synsets (if one exists). For each synset, all the ancestor nodes and their distances are recorded and compared. The ancestor node common to both synsets that can be reached with the minimum number of traversals is used. If no ancestor nodes are common, None is returned. If a node is compared with itself 0 is returned. :type other: Synset :param other: The Synset to which the shortest path will be found. :return: The number of edges in the shortest path connecting the two nodes, or None if no path exists. 
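A small usage sketch (illustrative; assumes the WordNet corpus data is
installed)::

    from nltk.corpus import wordnet as wn
    dog = wn.synset('dog.n.01')
    cat = wn.synset('cat.n.01')
    d = dog.shortest_path_distance(cat)
    # d is the number of edges on the shortest hypernym path between the
    # two synsets, or None if they are not connected.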
""" if self == other: return 0 dist_dict1 = self._shortest_hypernym_paths(simulate_root) dist_dict2 = other._shortest_hypernym_paths(simulate_root) # For each ancestor synset common to both subject synsets, find the # connecting path length. Return the shortest of these. inf = float("inf") path_distance = inf for synset, d1 in dist_dict1.items(): d2 = dist_dict2.get(synset, inf) path_distance = min(path_distance, d1 + d2) return None if math.isinf(path_distance) else path_distance # interface to similarity methods def path_similarity(self, other, verbose=False, simulate_root=True): """ Path Distance Similarity: Return a score denoting how similar two word senses are, based on the shortest path that connects the senses in the is-a (hypernym/hypnoym) taxonomy. The score is in the range 0 to 1, except in those cases where a path cannot be found (will only be true for verbs as there are many distinct verb taxonomies), in which case None is returned. A score of 1 represents identity i.e. comparing a sense with itself will return 1. :type other: Synset :param other: The ``Synset`` that this ``Synset`` is being compared to. :type simulate_root: bool :param simulate_root: The various verb taxonomies do not share a single root which disallows this metric from working for synsets that are not connected. This flag (True by default) creates a fake root that connects all the taxonomies. Set it to false to disable this behavior. For the noun taxonomy, there is usually a default root except for WordNet version 1.6. If you are using wordnet 1.6, a fake root will be added for nouns as well. :return: A score denoting the similarity of the two ``Synset`` objects, normally between 0 and 1. None is returned if no connecting path could be found. 1 is returned if a ``Synset`` is compared with itself. """ distance = self.shortest_path_distance( other, simulate_root=simulate_root and (self._needs_root() or other._needs_root()), ) if distance is None or distance < 0: return None return 1.0 / (distance + 1) def lch_similarity(self, other, verbose=False, simulate_root=True): """ Leacock Chodorow Similarity: Return a score denoting how similar two word senses are, based on the shortest path that connects the senses (as above) and the maximum depth of the taxonomy in which the senses occur. The relationship is given as -log(p/2d) where p is the shortest path length and d is the taxonomy depth. :type other: Synset :param other: The ``Synset`` that this ``Synset`` is being compared to. :type simulate_root: bool :param simulate_root: The various verb taxonomies do not share a single root which disallows this metric from working for synsets that are not connected. This flag (True by default) creates a fake root that connects all the taxonomies. Set it to false to disable this behavior. For the noun taxonomy, there is usually a default root except for WordNet version 1.6. If you are using wordnet 1.6, a fake root will be added for nouns as well. :return: A score denoting the similarity of the two ``Synset`` objects, normally greater than 0. None is returned if no connecting path could be found. If a ``Synset`` is compared with itself, the maximum score is returned, which varies depending on the taxonomy depth. """ if self._pos != other._pos: raise WordNetError( "Computing the lch similarity requires " "%s and %s to have the same part of speech." 
% (self, other) ) need_root = self._needs_root() if self._pos not in self._wordnet_corpus_reader._max_depth: self._wordnet_corpus_reader._compute_max_depth(self._pos, need_root) depth = self._wordnet_corpus_reader._max_depth[self._pos] distance = self.shortest_path_distance( other, simulate_root=simulate_root and need_root ) if distance is None or distance < 0 or depth == 0: return None return -math.log((distance + 1) / (2.0 * depth)) def wup_similarity(self, other, verbose=False, simulate_root=True): """ Wu-Palmer Similarity: Return a score denoting how similar two word senses are, based on the depth of the two senses in the taxonomy and that of their Least Common Subsumer (most specific ancestor node). Previously, the scores computed by this implementation did _not_ always agree with those given by Pedersen's Perl implementation of WordNet Similarity. However, with the addition of the simulate_root flag (see below), the score for verbs now almost always agree but not always for nouns. The LCS does not necessarily feature in the shortest path connecting the two senses, as it is by definition the common ancestor deepest in the taxonomy, not closest to the two senses. Typically, however, it will so feature. Where multiple candidates for the LCS exist, that whose shortest path to the root node is the longest will be selected. Where the LCS has multiple paths to the root, the longer path is used for the purposes of the calculation. :type other: Synset :param other: The ``Synset`` that this ``Synset`` is being compared to. :type simulate_root: bool :param simulate_root: The various verb taxonomies do not share a single root which disallows this metric from working for synsets that are not connected. This flag (True by default) creates a fake root that connects all the taxonomies. Set it to false to disable this behavior. For the noun taxonomy, there is usually a default root except for WordNet version 1.6. If you are using wordnet 1.6, a fake root will be added for nouns as well. :return: A float score denoting the similarity of the two ``Synset`` objects, normally greater than zero. If no connecting path between the two senses can be found, None is returned. """ need_root = self._needs_root() or other._needs_root() # Note that to preserve behavior from NLTK2 we set use_min_depth=True # It is possible that more accurate results could be obtained by # removing this setting and it should be tested later on subsumers = self.lowest_common_hypernyms( other, simulate_root=simulate_root and need_root, use_min_depth=True ) # If no LCS was found return None if len(subsumers) == 0: return None subsumer = self if self in subsumers else subsumers[0] # Get the longest path from the LCS to the root, # including a correction: # - add one because the calculations include both the start and end # nodes depth = subsumer.max_depth() + 1 # Note: No need for an additional add-one correction for non-nouns # to account for an imaginary root node because that is now # automatically handled by simulate_root # if subsumer._pos != NOUN: # depth += 1 # Get the shortest path from the LCS to each of the synsets it is # subsuming. Add this to the LCS path length to get the path # length from each synset to the root. 
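# Schematically, with depth = max_depth(LCS) + 1 and len1/len2 the
# root-to-synset path lengths computed below, the returned score is
#
#     2 * depth / (len1 + len2)
#
# which equals 1.0 when the two synsets are identical.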
len1 = self.shortest_path_distance( subsumer, simulate_root=simulate_root and need_root ) len2 = other.shortest_path_distance( subsumer, simulate_root=simulate_root and need_root ) if len1 is None or len2 is None: return None len1 += depth len2 += depth return (2.0 * depth) / (len1 + len2) def res_similarity(self, other, ic, verbose=False): """ Resnik Similarity: Return a score denoting how similar two word senses are, based on the Information Content (IC) of the Least Common Subsumer (most specific ancestor node). :type other: Synset :param other: The ``Synset`` that this ``Synset`` is being compared to. :type ic: dict :param ic: an information content object (as returned by ``nltk.corpus.wordnet_ic.ic()``). :return: A float score denoting the similarity of the two ``Synset`` objects. Synsets whose LCS is the root node of the taxonomy will have a score of 0 (e.g. N['dog'][0] and N['table'][0]). """ ic1, ic2, lcs_ic = _lcs_ic(self, other, ic) return lcs_ic def jcn_similarity(self, other, ic, verbose=False): """ Jiang-Conrath Similarity: Return a score denoting how similar two word senses are, based on the Information Content (IC) of the Least Common Subsumer (most specific ancestor node) and that of the two input Synsets. The relationship is given by the equation 1 / (IC(s1) + IC(s2) - 2 * IC(lcs)). :type other: Synset :param other: The ``Synset`` that this ``Synset`` is being compared to. :type ic: dict :param ic: an information content object (as returned by ``nltk.corpus.wordnet_ic.ic()``). :return: A float score denoting the similarity of the two ``Synset`` objects. """ if self == other: return _INF ic1, ic2, lcs_ic = _lcs_ic(self, other, ic) # If either of the input synsets are the root synset, or have a # frequency of 0 (sparse data problem), return 0. if ic1 == 0 or ic2 == 0: return 0 ic_difference = ic1 + ic2 - 2 * lcs_ic if ic_difference == 0: return _INF return 1 / ic_difference def lin_similarity(self, other, ic, verbose=False): """ Lin Similarity: Return a score denoting how similar two word senses are, based on the Information Content (IC) of the Least Common Subsumer (most specific ancestor node) and that of the two input Synsets. The relationship is given by the equation 2 * IC(lcs) / (IC(s1) + IC(s2)). :type other: Synset :param other: The ``Synset`` that this ``Synset`` is being compared to. :type ic: dict :param ic: an information content object (as returned by ``nltk.corpus.wordnet_ic.ic()``). :return: A float score denoting the similarity of the two ``Synset`` objects, in the range 0 to 1. """ ic1, ic2, lcs_ic = _lcs_ic(self, other, ic) return (2.0 * lcs_ic) / (ic1 + ic2) def _iter_hypernym_lists(self): """ :return: An iterator over ``Synset`` objects that are either proper hypernyms or instance of hypernyms of the synset. 
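The first list yielded is ``[self]``; each later list holds the not yet
visited hypernyms of the previous level, so iteration proceeds
breadth-first towards the root(s). Illustrative sketch (``wn`` being the
WordNet corpus reader)::

    levels = list(wn.synset('dog.n.01')._iter_hypernym_lists())
    # levels[0] == [wn.synset('dog.n.01')]; later levels contain
    # increasingly general hypernyms.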
""" todo = [self] seen = set() while todo: for synset in todo: seen.add(synset) yield todo todo = [ hypernym for synset in todo for hypernym in (synset.hypernyms() + synset.instance_hypernyms()) if hypernym not in seen ] def __repr__(self): return f"{type(self).__name__}('{self._name}')" def _related(self, relation_symbol, sort=True): get_synset = self._wordnet_corpus_reader.synset_from_pos_and_offset if relation_symbol not in self._pointers: return [] pointer_tuples = self._pointers[relation_symbol] r = [get_synset(pos, offset) for pos, offset in pointer_tuples] if sort: r.sort() return r ###################################################################### # WordNet Corpus Reader ###################################################################### class WordNetCorpusReader(CorpusReader): """ A corpus reader used to access wordnet or its variants. """ _ENCODING = "utf8" # { Part-of-speech constants ADJ, ADJ_SAT, ADV, NOUN, VERB = "a", "s", "r", "n", "v" # } # { Filename constants _FILEMAP = {ADJ: "adj", ADV: "adv", NOUN: "noun", VERB: "verb"} # } # { Part of speech constants _pos_numbers = {NOUN: 1, VERB: 2, ADJ: 3, ADV: 4, ADJ_SAT: 5} _pos_names = dict(tup[::-1] for tup in _pos_numbers.items()) # } #: A list of file identifiers for all the fileids used by this #: corpus reader. _FILES = ( "cntlist.rev", "lexnames", "index.sense", "index.adj", "index.adv", "index.noun", "index.verb", "data.adj", "data.adv", "data.noun", "data.verb", "adj.exc", "adv.exc", "noun.exc", "verb.exc", ) def __init__(self, root, omw_reader): """ Construct a new wordnet corpus reader, with the given root directory. """ super().__init__(root, self._FILES, encoding=self._ENCODING) # A index that provides the file offset # Map from lemma -> pos -> synset_index -> offset self._lemma_pos_offset_map = defaultdict(dict) # A cache so we don't have to reconstruct synsets # Map from pos -> offset -> synset self._synset_offset_cache = defaultdict(dict) # A lookup for the maximum depth of each part of speech. Useful for # the lch similarity metric. self._max_depth = defaultdict(dict) # Corpus reader containing omw data. 
self._omw_reader = omw_reader if self._omw_reader is None: warnings.warn( "The multilingual functions are not available with this Wordnet version" ) else: self.provenances = self.omw_prov() # A cache to store the wordnet data of multiple languages self._lang_data = defaultdict(list) self._data_file_map = {} self._exception_map = {} self._lexnames = [] self._key_count_file = None self._key_synset_file = None # Load the lexnames with self.open("lexnames") as fp: for i, line in enumerate(fp): index, lexname, _ = line.split() assert int(index) == i self._lexnames.append(lexname) # Load the indices for lemmas and synset offsets self._load_lemma_pos_offset_map() # load the exception file data into memory self._load_exception_map() # map from WordNet 3.0 for OMW data self.map30 = self.map_wn30() # Language data attributes self.lg_attrs = ["lemma", "none", "def", "exe"] def corpus2sk(self, corpus=None): """Read sense key to synset id mapping, from index.sense file in corpus directory""" fn = "index.sense" if corpus: fn = os.path.join(os.pardir, corpus, fn) with self.open(fn) as fp: sk_map = {} for line in fp: items = line.strip().split(" ") sk = items[0] pos = self._pos_names[int(sk.split("%")[1].split(":")[0])] sk_map[sk] = f"{items[1]}-{pos}" return sk_map def map_wn30(self): """Mapping from Wordnet 3.0 to currently loaded Wordnet version""" if self.get_version() == "3.0": return None # warnings.warn(f"Mapping WN v. 3.0 to Wordnet v. {self.version}") sk1 = self.corpus2sk("wordnet") sk2 = self.corpus2sk() skmap = {} for sk in set(sk1.keys()).intersection(set(sk2.keys())): of1 = sk1[sk] of2 = sk2[sk] if of1 not in skmap.keys(): skmap[of1] = [of2] else: skmap[of1].append(of2) map30 = {} for of in skmap.keys(): candidates = skmap[of] # map to candidate that covers most lemmas: of2 = max((candidates.count(x), x) for x in set(candidates))[1] # warnings.warn(f"Map {of} {of2}") map30[of] = of2 if of[-1] == "s": # Add a mapping from "a" for applications like omw, # which don't use the "s" ss_type: map30[of[:-1] + "a"] = of2 return map30 # Open Multilingual WordNet functions, contributed by # Nasruddin A’aidil Shari, Sim Wei Ying Geraldine, and Soe Lynn def of2ss(self, of): """take an id and return the synsets""" return self.synset_from_pos_and_offset(of[-1], int(of[:8])) def ss2of(self, ss, lang=None): """return the ID of the synset""" pos = ss.pos() # Only these 3 WordNets retain the satellite pos tag if lang not in ["nld", "lit", "slk"] and pos == "s": pos = "a" return f"{ss.offset():08d}-{pos}" def _load_lang_data(self, lang): """load the wordnet data of the requested language from the file to the cache, _lang_data""" if lang in self._lang_data.keys(): return if lang not in self.langs(): raise WordNetError("Language is not supported.") with self._omw_reader.open( f"{self.provenances[lang]}/wn-data-{lang.split('_')[0]}.tab" ) as fp: self.custom_lemmas(fp, lang) def omw_prov(self): """Return a provenance dictionary of the languages in Multilingual Wordnet""" provdict = {} provdict["eng"] = "" fileids = self._omw_reader.fileids() for fileid in fileids: prov, langfile = os.path.split(fileid) file_name, file_extension = os.path.splitext(langfile) if file_extension == ".tab": lang = file_name.split("-")[-1] if lang in provdict.keys(): # We already have another resource for this lang, # so we need to further specify the lang id: lang = f"{lang}_{prov}" provdict[lang] = prov return provdict def langs(self): """return a list of languages supported by Multilingual Wordnet""" return self.provenances.keys() def 
_load_lemma_pos_offset_map(self): for suffix in self._FILEMAP.values(): # parse each line of the file (ignoring comment lines) with self.open("index.%s" % suffix) as fp: for i, line in enumerate(fp): if line.startswith(" "): continue _iter = iter(line.split()) def _next_token(): return next(_iter) try: # get the lemma and part-of-speech lemma = _next_token() pos = _next_token() # get the number of synsets for this lemma n_synsets = int(_next_token()) assert n_synsets > 0 # get and ignore the pointer symbols for all synsets of # this lemma n_pointers = int(_next_token()) [_next_token() for _ in range(n_pointers)] # same as number of synsets n_senses = int(_next_token()) assert n_synsets == n_senses # get and ignore number of senses ranked according to # frequency _next_token() # get synset offsets synset_offsets = [int(_next_token()) for _ in range(n_synsets)] # raise more informative error with file name and line number except (AssertionError, ValueError) as e: tup = ("index.%s" % suffix), (i + 1), e raise WordNetError("file %s, line %i: %s" % tup) from e # map lemmas and parts of speech to synsets self._lemma_pos_offset_map[lemma][pos] = synset_offsets if pos == ADJ: self._lemma_pos_offset_map[lemma][ADJ_SAT] = synset_offsets def _load_exception_map(self): # load the exception file data into memory for pos, suffix in self._FILEMAP.items(): self._exception_map[pos] = {} with self.open("%s.exc" % suffix) as fp: for line in fp: terms = line.split() self._exception_map[pos][terms[0]] = terms[1:] self._exception_map[ADJ_SAT] = self._exception_map[ADJ] def _compute_max_depth(self, pos, simulate_root): """ Compute the max depth for the given part of speech. This is used by the lch similarity metric. """ depth = 0 for ii in self.all_synsets(pos): try: depth = max(depth, ii.max_depth()) except RuntimeError: print(ii) if simulate_root: depth += 1 self._max_depth[pos] = depth def get_version(self): fh = self._data_file(ADJ) fh.seek(0) for line in fh: match = re.search(r"Word[nN]et (\d+|\d+\.\d+) Copyright", line) if match is not None: version = match.group(1) fh.seek(0) return version ############################################################# # Loading Lemmas ############################################################# def lemma(self, name, lang="eng"): """Return lemma object that matches the name""" # cannot simply split on first '.', # e.g.: '.45_caliber.a.01..45_caliber' separator = SENSENUM_RE.search(name).end() synset_name, lemma_name = name[: separator - 1], name[separator:] synset = self.synset(synset_name) for lemma in synset.lemmas(lang): if lemma._name == lemma_name: return lemma raise WordNetError(f"no lemma {lemma_name!r} in {synset_name!r}") def lemma_from_key(self, key): # Keys are case sensitive and always lower-case key = key.lower() lemma_name, lex_sense = key.split("%") pos_number, lexname_index, lex_id, _, _ = lex_sense.split(":") pos = self._pos_names[int(pos_number)] # open the key -> synset file if necessary if self._key_synset_file is None: self._key_synset_file = self.open("index.sense") # Find the synset for the lemma. 
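# index.sense is sorted by sense key, so a binary search on the key locates
# its line; the second whitespace-separated field of that line is the byte
# offset of the synset in the data file for this part of speech
# (schematic layout: "<sense_key> <synset_offset> <sense_number> <tag_cnt>").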
synset_line = _binary_search_file(self._key_synset_file, key) if not synset_line: raise WordNetError("No synset found for key %r" % key) offset = int(synset_line.split()[1]) synset = self.synset_from_pos_and_offset(pos, offset) # return the corresponding lemma for lemma in synset._lemmas: if lemma._key == key: return lemma raise WordNetError("No lemma found for for key %r" % key) ############################################################# # Loading Synsets ############################################################# def synset(self, name): # split name into lemma, part of speech and synset number lemma, pos, synset_index_str = name.lower().rsplit(".", 2) synset_index = int(synset_index_str) - 1 # get the offset for this synset try: offset = self._lemma_pos_offset_map[lemma][pos][synset_index] except KeyError as e: message = "no lemma %r with part of speech %r" raise WordNetError(message % (lemma, pos)) from e except IndexError as e: n_senses = len(self._lemma_pos_offset_map[lemma][pos]) message = "lemma %r with part of speech %r has only %i %s" if n_senses == 1: tup = lemma, pos, n_senses, "sense" else: tup = lemma, pos, n_senses, "senses" raise WordNetError(message % tup) from e # load synset information from the appropriate file synset = self.synset_from_pos_and_offset(pos, offset) # some basic sanity checks on loaded attributes if pos == "s" and synset._pos == "a": message = ( "adjective satellite requested but only plain " "adjective found for lemma %r" ) raise WordNetError(message % lemma) assert synset._pos == pos or (pos == "a" and synset._pos == "s") # Return the synset object. return synset def _data_file(self, pos): """ Return an open file pointer for the data file for the given part of speech. """ if pos == ADJ_SAT: pos = ADJ if self._data_file_map.get(pos) is None: fileid = "data.%s" % self._FILEMAP[pos] self._data_file_map[pos] = self.open(fileid) return self._data_file_map[pos] def synset_from_pos_and_offset(self, pos, offset): """ - pos: The synset's part of speech, matching one of the module level attributes ADJ, ADJ_SAT, ADV, NOUN or VERB ('a', 's', 'r', 'n', or 'v'). - offset: The byte offset of this synset in the WordNet dict file for this pos. >>> from nltk.corpus import wordnet as wn >>> print(wn.synset_from_pos_and_offset('n', 1740)) Synset('entity.n.01') """ # Check to see if the synset is in the cache if offset in self._synset_offset_cache[pos]: return self._synset_offset_cache[pos][offset] data_file = self._data_file(pos) data_file.seek(offset) data_file_line = data_file.readline() # If valid, the offset equals the 8-digit 0-padded integer found at the start of the line: line_offset = data_file_line[:8] if line_offset.isalnum() and offset == int(line_offset): synset = self._synset_from_pos_and_line(pos, data_file_line) assert synset._offset == offset self._synset_offset_cache[pos][offset] = synset else: synset = None raise WordNetError( f"No WordNet synset found for pos={pos} at offset={offset}." ) data_file.seek(0) return synset @deprecated("Use public method synset_from_pos_and_offset() instead") def _synset_from_pos_and_offset(self, *args, **kwargs): """ Hack to help people like the readers of https://stackoverflow.com/a/27145655/1709587 who were using this function before it was officially a public method """ return self.synset_from_pos_and_offset(*args, **kwargs) def _synset_from_pos_and_line(self, pos, data_file_line): # Construct a new (empty) synset. 
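# Schematic layout of a data file line, as parsed below:
#
#   <offset> <lexname_index> <pos> <n_lemmas> <lemma> <lex_id> ...
#       <n_pointers> <pointer fields> ... [<n_frames> + <frame> <lemma_nr> ...] | <gloss>
#
# The gloss (definition plus quoted examples) is split off at the "|" first;
# the remaining whitespace-separated fields are consumed with a token iterator.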
synset = Synset(self) # parse the entry for this synset try: # parse out the definitions and examples from the gloss columns_str, gloss = data_file_line.strip().split("|") definition = re.sub(r"[\"].*?[\"]", "", gloss).strip() examples = re.findall(r'"([^"]*)"', gloss) for example in examples: synset._examples.append(example) synset._definition = definition.strip("; ") # split the other info into fields _iter = iter(columns_str.split()) def _next_token(): return next(_iter) # get the offset synset._offset = int(_next_token()) # determine the lexicographer file name lexname_index = int(_next_token()) synset._lexname = self._lexnames[lexname_index] # get the part of speech synset._pos = _next_token() # create Lemma objects for each lemma n_lemmas = int(_next_token(), 16) for _ in range(n_lemmas): # get the lemma name lemma_name = _next_token() # get the lex_id (used for sense_keys) lex_id = int(_next_token(), 16) # If the lemma has a syntactic marker, extract it. m = re.match(r"(.*?)(\(.*\))?$", lemma_name) lemma_name, syn_mark = m.groups() # create the lemma object lemma = Lemma(self, synset, lemma_name, lexname_index, lex_id, syn_mark) synset._lemmas.append(lemma) synset._lemma_names.append(lemma._name) # collect the pointer tuples n_pointers = int(_next_token()) for _ in range(n_pointers): symbol = _next_token() offset = int(_next_token()) pos = _next_token() lemma_ids_str = _next_token() if lemma_ids_str == "0000": synset._pointers[symbol].add((pos, offset)) else: source_index = int(lemma_ids_str[:2], 16) - 1 target_index = int(lemma_ids_str[2:], 16) - 1 source_lemma_name = synset._lemmas[source_index]._name lemma_pointers = synset._lemma_pointers tups = lemma_pointers[source_lemma_name, symbol] tups.append((pos, offset, target_index)) # read the verb frames try: frame_count = int(_next_token()) except StopIteration: pass else: for _ in range(frame_count): # read the plus sign plus = _next_token() assert plus == "+" # read the frame and lemma number frame_number = int(_next_token()) frame_string_fmt = VERB_FRAME_STRINGS[frame_number] lemma_number = int(_next_token(), 16) # lemma number of 00 means all words in the synset if lemma_number == 0: synset._frame_ids.append(frame_number) for lemma in synset._lemmas: lemma._frame_ids.append(frame_number) lemma._frame_strings.append(frame_string_fmt % lemma._name) # only a specific word in the synset else: lemma = synset._lemmas[lemma_number - 1] lemma._frame_ids.append(frame_number) lemma._frame_strings.append(frame_string_fmt % lemma._name) # raise a more informative error with line text except ValueError as e: raise WordNetError(f"line {data_file_line!r}: {e}") from e # set sense keys for Lemma objects - note that this has to be # done afterwards so that the relations are available for lemma in synset._lemmas: if synset._pos == ADJ_SAT: head_lemma = synset.similar_tos()[0]._lemmas[0] head_name = head_lemma._name head_id = "%02d" % head_lemma._lex_id else: head_name = head_id = "" tup = ( lemma._name, WordNetCorpusReader._pos_numbers[synset._pos], lemma._lexname_index, lemma._lex_id, head_name, head_id, ) lemma._key = ("%s%%%d:%02d:%02d:%s:%s" % tup).lower() # the canonical name is based on the first lemma lemma_name = synset._lemmas[0]._name.lower() offsets = self._lemma_pos_offset_map[lemma_name][synset._pos] sense_index = offsets.index(synset._offset) tup = lemma_name, synset._pos, sense_index + 1 synset._name = "%s.%s.%02i" % tup return synset def synset_from_sense_key(self, sense_key): """ Retrieves synset based on a given sense_key. 
Sense keys can be obtained from lemma.key() From https://wordnet.princeton.edu/documentation/senseidx5wn: A sense_key is represented as:: lemma % lex_sense (e.g. 'dog%1:18:01::') where lex_sense is encoded as:: ss_type:lex_filenum:lex_id:head_word:head_id :lemma: ASCII text of word/collocation, in lower case :ss_type: synset type for the sense (1 digit int) The synset type is encoded as follows:: 1 NOUN 2 VERB 3 ADJECTIVE 4 ADVERB 5 ADJECTIVE SATELLITE :lex_filenum: name of lexicographer file containing the synset for the sense (2 digit int) :lex_id: when paired with lemma, uniquely identifies a sense in the lexicographer file (2 digit int) :head_word: lemma of the first word in satellite's head synset Only used if sense is in an adjective satellite synset :head_id: uniquely identifies sense in a lexicographer file when paired with head_word Only used if head_word is present (2 digit int) >>> import nltk >>> from nltk.corpus import wordnet as wn >>> print(wn.synset_from_sense_key("drive%1:04:03::")) Synset('drive.n.06') >>> print(wn.synset_from_sense_key("driving%1:04:03::")) Synset('drive.n.06') """ return lemma_from_key(self, key).synset() ############################################################# # Retrieve synsets and lemmas. ############################################################# def synsets(self, lemma, pos=None, lang="eng", check_exceptions=True): """Load all synsets with a given lemma and part of speech tag. If no pos is specified, all synsets for all parts of speech will be loaded. If lang is specified, all the synsets associated with the lemma name of that language will be returned. """ lemma = lemma.lower() if lang == "eng": get_synset = self.synset_from_pos_and_offset index = self._lemma_pos_offset_map if pos is None: pos = POS_LIST return [ get_synset(p, offset) for p in pos for form in self._morphy(lemma, p, check_exceptions) for offset in index[form].get(p, []) ] else: self._load_lang_data(lang) synset_list = [] if lemma in self._lang_data[lang][1]: for l in self._lang_data[lang][1][lemma]: if pos is not None and l[-1] != pos: continue synset_list.append(self.of2ss(l)) return synset_list def lemmas(self, lemma, pos=None, lang="eng"): """Return all Lemma objects with a name matching the specified lemma name and part of speech tag. Matches any part of speech tag if none is specified.""" lemma = lemma.lower() if lang == "eng": return [ lemma_obj for synset in self.synsets(lemma, pos) for lemma_obj in synset.lemmas() if lemma_obj.name().lower() == lemma ] else: self._load_lang_data(lang) lemmas = [] syn = self.synsets(lemma, lang=lang) for s in syn: if pos is not None and s.pos() != pos: continue for lemma_obj in s.lemmas(lang=lang): if lemma_obj.name().lower() == lemma: lemmas.append(lemma_obj) return lemmas def all_lemma_names(self, pos=None, lang="eng"): """Return all lemma names for all synsets for the given part of speech tag and language or languages. 
If pos is not specified, all synsets for all parts of speech will be used.""" if lang == "eng": if pos is None: return iter(self._lemma_pos_offset_map) else: return ( lemma for lemma in self._lemma_pos_offset_map if pos in self._lemma_pos_offset_map[lemma] ) else: self._load_lang_data(lang) lemma = [] for i in self._lang_data[lang][0]: if pos is not None and i[-1] != pos: continue lemma.extend(self._lang_data[lang][0][i]) lemma = iter(set(lemma)) return lemma def all_omw_synsets(self, pos=None, lang=None): if lang not in self.langs(): return None self._load_lang_data(lang) for of in self._lang_data[lang][0].keys(): try: ss = self.of2ss(of) yield ss except: # A few OMW offsets don't exist in Wordnet 3.0. # Additionally, when mapped to later Wordnets, # increasing numbers of synsets are lost in the mapping. # warnings.warn(f"Language {lang}: no synset found for {of}") pass def all_synsets(self, pos=None, lang="eng"): """Iterate over all synsets with a given part of speech tag. If no pos is specified, all synsets for all parts of speech will be loaded. """ if lang == "eng": return self.all_eng_synsets(pos=pos) else: return self.all_omw_synsets(pos=pos, lang=lang) def all_eng_synsets(self, pos=None): if pos is None: pos_tags = self._FILEMAP.keys() else: pos_tags = [pos] cache = self._synset_offset_cache from_pos_and_line = self._synset_from_pos_and_line # generate all synsets for each part of speech for pos_tag in pos_tags: # Open the file for reading. Note that we can not re-use # the file poitners from self._data_file_map here, because # we're defining an iterator, and those file pointers might # be moved while we're not looking. if pos_tag == ADJ_SAT: pos_tag = ADJ fileid = "data.%s" % self._FILEMAP[pos_tag] data_file = self.open(fileid) try: # generate synsets for each line in the POS file offset = data_file.tell() line = data_file.readline() while line: if not line[0].isspace(): if offset in cache[pos_tag]: # See if the synset is cached synset = cache[pos_tag][offset] else: # Otherwise, parse the line synset = from_pos_and_line(pos_tag, line) cache[pos_tag][offset] = synset # adjective satellites are in the same file as # adjectives so only yield the synset if it's actually # a satellite if synset._pos == ADJ_SAT: yield synset # for all other POS tags, yield all synsets (this means # that adjectives also include adjective satellites) else: yield synset offset = data_file.tell() line = data_file.readline() # close the extra file handle we opened except: data_file.close() raise else: data_file.close() def words(self, lang="eng"): """return lemmas of the given language as list of words""" return self.all_lemma_names(lang=lang) def doc(self, file="README", lang="eng"): """Return the contents of readme, license or citation file use lang=lang to get the file for an individual language""" if lang == "eng": reader = self else: reader = self._omw_reader if lang in self.langs(): file = f"{os.path.join(self.provenances[lang],file)}" try: with reader.open(file) as fp: return fp.read() except: if lang in self._lang_data: return f"Cannot determine {file} for {lang}" else: return f"Language {lang} is not supported." 
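# Illustrative usage of the documentation helpers in this class (assumes the
# WordNet and OMW data have been downloaded, that the corresponding README /
# LICENSE / citation.bib files ship with them, and that "jpn" appears in
# wn.langs()):
#
#     from nltk.corpus import wordnet as wn
#     print(wn.readme())               # README of the English WordNet
#     print(wn.license(lang="jpn"))    # license of the Japanese OMW wordnet
#     print(wn.citation(lang="jpn"))   # its citation.bib entry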
def license(self, lang="eng"): """Return the contents of LICENSE (for omw) use lang=lang to get the license for an individual language""" return self.doc(file="LICENSE", lang=lang) def readme(self, lang="eng"): """Return the contents of README (for omw) use lang=lang to get the readme for an individual language""" return self.doc(file="README", lang=lang) def citation(self, lang="eng"): """Return the contents of citation.bib file (for omw) use lang=lang to get the citation for an individual language""" return self.doc(file="citation.bib", lang=lang) ############################################################# # Misc ############################################################# def lemma_count(self, lemma): """Return the frequency count for this Lemma""" # Currently, count is only work for English if lemma._lang != "eng": return 0 # open the count file if we haven't already if self._key_count_file is None: self._key_count_file = self.open("cntlist.rev") # find the key in the counts file and return the count line = _binary_search_file(self._key_count_file, lemma._key) if line: return int(line.rsplit(" ", 1)[-1]) else: return 0 def path_similarity(self, synset1, synset2, verbose=False, simulate_root=True): return synset1.path_similarity(synset2, verbose, simulate_root) path_similarity.__doc__ = Synset.path_similarity.__doc__ def lch_similarity(self, synset1, synset2, verbose=False, simulate_root=True): return synset1.lch_similarity(synset2, verbose, simulate_root) lch_similarity.__doc__ = Synset.lch_similarity.__doc__ def wup_similarity(self, synset1, synset2, verbose=False, simulate_root=True): return synset1.wup_similarity(synset2, verbose, simulate_root) wup_similarity.__doc__ = Synset.wup_similarity.__doc__ def res_similarity(self, synset1, synset2, ic, verbose=False): return synset1.res_similarity(synset2, ic, verbose) res_similarity.__doc__ = Synset.res_similarity.__doc__ def jcn_similarity(self, synset1, synset2, ic, verbose=False): return synset1.jcn_similarity(synset2, ic, verbose) jcn_similarity.__doc__ = Synset.jcn_similarity.__doc__ def lin_similarity(self, synset1, synset2, ic, verbose=False): return synset1.lin_similarity(synset2, ic, verbose) lin_similarity.__doc__ = Synset.lin_similarity.__doc__ ############################################################# # Morphy ############################################################# # Morphy, adapted from Oliver Steele's pywordnet def morphy(self, form, pos=None, check_exceptions=True): """ Find a possible base form for the given form, with the given part of speech, by checking WordNet's list of exceptional forms, and by recursively stripping affixes for this part of speech until a form in WordNet is found. 
>>> from nltk.corpus import wordnet as wn >>> print(wn.morphy('dogs')) dog >>> print(wn.morphy('churches')) church >>> print(wn.morphy('aardwolves')) aardwolf >>> print(wn.morphy('abaci')) abacus >>> wn.morphy('hardrock', wn.ADV) >>> print(wn.morphy('book', wn.NOUN)) book >>> wn.morphy('book', wn.ADJ) """ if pos is None: morphy = self._morphy analyses = chain(a for p in POS_LIST for a in morphy(form, p)) else: analyses = self._morphy(form, pos, check_exceptions) # get the first one we find first = list(islice(analyses, 1)) if len(first) == 1: return first[0] else: return None MORPHOLOGICAL_SUBSTITUTIONS = { NOUN: [ ("s", ""), ("ses", "s"), ("ves", "f"), ("xes", "x"), ("zes", "z"), ("ches", "ch"), ("shes", "sh"), ("men", "man"), ("ies", "y"), ], VERB: [ ("s", ""), ("ies", "y"), ("es", "e"), ("es", ""), ("ed", "e"), ("ed", ""), ("ing", "e"), ("ing", ""), ], ADJ: [("er", ""), ("est", ""), ("er", "e"), ("est", "e")], ADV: [], } MORPHOLOGICAL_SUBSTITUTIONS[ADJ_SAT] = MORPHOLOGICAL_SUBSTITUTIONS[ADJ] def _morphy(self, form, pos, check_exceptions=True): # from jordanbg: # Given an original string x # 1. Apply rules once to the input to get y1, y2, y3, etc. # 2. Return all that are in the database # 3. If there are no matches, keep applying rules until you either # find a match or you can't go any further exceptions = self._exception_map[pos] substitutions = self.MORPHOLOGICAL_SUBSTITUTIONS[pos] def apply_rules(forms): return [ form[: -len(old)] + new for form in forms for old, new in substitutions if form.endswith(old) ] def filter_forms(forms): result = [] seen = set() for form in forms: if form in self._lemma_pos_offset_map: if pos in self._lemma_pos_offset_map[form]: if form not in seen: result.append(form) seen.add(form) return result # 0. Check the exception lists if check_exceptions: if form in exceptions: return filter_forms([form] + exceptions[form]) # 1. Apply rules once to the input to get y1, y2, y3, etc. forms = apply_rules([form]) # 2. Return all that are in the database (and check the original too) results = filter_forms([form] + forms) if results: return results # 3. If there are no matches, keep applying rules until we find a match while forms: forms = apply_rules(forms) results = filter_forms(forms) if results: return results # Return an empty list if we can't find anything return [] ############################################################# # Create information content from corpus ############################################################# def ic(self, corpus, weight_senses_equally=False, smoothing=1.0): """ Creates an information content lookup dictionary from a corpus. :type corpus: CorpusReader :param corpus: The corpus from which we create an information content dictionary. :type weight_senses_equally: bool :param weight_senses_equally: If this is True, gives all possible senses equal weight rather than dividing by the number of possible senses. (If a word has 3 synses, each sense gets 0.3333 per appearance when this is False, 1.0 when it is true.) 
:param smoothing: How much do we smooth synset counts (default is 1.0) :type smoothing: float :return: An information content dictionary """ counts = FreqDist() for ww in corpus.words(): counts[ww] += 1 ic = {} for pp in POS_LIST: ic[pp] = defaultdict(float) # Initialize the counts with the smoothing value if smoothing > 0.0: for pp in POS_LIST: ic[pp][0] = smoothing for ss in self.all_synsets(): pos = ss._pos if pos == ADJ_SAT: pos = ADJ ic[pos][ss._offset] = smoothing for ww in counts: possible_synsets = self.synsets(ww) if len(possible_synsets) == 0: continue # Distribute weight among possible synsets weight = float(counts[ww]) if not weight_senses_equally: weight /= float(len(possible_synsets)) for ss in possible_synsets: pos = ss._pos if pos == ADJ_SAT: pos = ADJ for level in ss._iter_hypernym_lists(): for hh in level: ic[pos][hh._offset] += weight # Add the weight to the root ic[pos][0] += weight return ic def custom_lemmas(self, tab_file, lang): """ Reads a custom tab file containing mappings of lemmas in the given language to Princeton WordNet 3.0 synset offsets, allowing NLTK's WordNet functions to then be used with that language. See the "Tab files" section at http://compling.hss.ntu.edu.sg/omw/ for documentation on the Multilingual WordNet tab file format. :param tab_file: Tab file as a file or file-like object :type: lang str :param: lang ISO 639-3 code of the language of the tab file """ lg = lang.split("_")[0] if len(lg) != 3: raise ValueError("lang should be a (3 character) ISO 639-3 code") self._lang_data[lang] = [ defaultdict(list), defaultdict(list), defaultdict(list), defaultdict(list), ] for line in tab_file.readlines(): if isinstance(line, bytes): # Support byte-stream files (e.g. as returned by Python 2's # open() function) as well as text-stream ones line = line.decode("utf-8") if not line.startswith("#"): triple = line.strip().split("\t") if len(triple) < 3: continue offset_pos, label = triple[:2] val = triple[-1] if self.map30: if offset_pos in self.map30.keys(): # Map offset_pos to current Wordnet version: offset_pos = self.map30[offset_pos] else: # Synsets with no mapping keep their Wordnet 3.0 offset # warnings.warn(f"No map for {offset_pos}, {lang}: {lemma}") pass pair = label.split(":") attr = pair[-1] if len(pair) == 1 or pair[0] == lg: if attr == "lemma": val = val.strip().replace(" ", "_") self._lang_data[lang][1][val.lower()].append(offset_pos) if attr in self.lg_attrs: self._lang_data[lang][self.lg_attrs.index(attr)][ offset_pos ].append(val) # Make sure no more entries are accidentally added subsequently for n in range(len(self.lg_attrs)): self._lang_data[lang][n].default_factory = None ###################################################################### # Visualize WordNet relation graphs using Graphviz ###################################################################### def digraph( self, inputs, rel=lambda s: s.hypernyms(), pos=None, maxdepth=-1, shapes=None, attr=None, verbose=False, ): """ Produce a graphical representation from 'inputs' (a list of start nodes, which can be a mix of Synsets, Lemmas and/or words), and a synset relation, for drawing with the 'dot' graph visualisation program from the Graphviz package. Return a string in the DOT graph file language, which can then be converted to an image by nltk.parse.dependencygraph.dot2img(dot_string). 
Optional Parameters: :rel: Wordnet synset relation :pos: for words, restricts Part of Speech to 'n', 'v', 'a' or 'r' :maxdepth: limit the longest path :shapes: dictionary of strings that trigger a specified shape :attr: dictionary with global graph attributes :verbose: warn about cycles >>> from nltk.corpus import wordnet as wn >>> print(wn.digraph([wn.synset('dog.n.01')])) digraph G { "Synset('dog.n.01')" -> "Synset('domestic_animal.n.01')"; "Synset('organism.n.01')" -> "Synset('living_thing.n.01')"; "Synset('mammal.n.01')" -> "Synset('vertebrate.n.01')"; "Synset('placental.n.01')" -> "Synset('mammal.n.01')"; "Synset('animal.n.01')" -> "Synset('organism.n.01')"; "Synset('vertebrate.n.01')" -> "Synset('chordate.n.01')"; "Synset('chordate.n.01')" -> "Synset('animal.n.01')"; "Synset('canine.n.02')" -> "Synset('carnivore.n.01')"; "Synset('living_thing.n.01')" -> "Synset('whole.n.02')"; "Synset('physical_entity.n.01')" -> "Synset('entity.n.01')"; "Synset('carnivore.n.01')" -> "Synset('placental.n.01')"; "Synset('object.n.01')" -> "Synset('physical_entity.n.01')"; "Synset('whole.n.02')" -> "Synset('object.n.01')"; "Synset('dog.n.01')" -> "Synset('canine.n.02')"; "Synset('domestic_animal.n.01')" -> "Synset('animal.n.01')"; } """ from nltk.util import edge_closure, edges2dot synsets = set() edges = set() if not shapes: shapes = dict() if not attr: attr = dict() def add_lemma(lem): ss = lem.synset() synsets.add(ss) edges.add((lem, ss)) for node in inputs: typ = type(node) if typ == Synset: synsets.add(node) elif typ == Lemma: add_lemma(node) elif typ == str: for lemma in self.lemmas(node, pos): add_lemma(lemma) for ss in synsets: edges = edges.union(edge_closure(ss, rel, maxdepth, verbose)) dot_string = edges2dot(edges, shapes=shapes, attr=attr) return dot_string ###################################################################### # WordNet Information Content Corpus Reader ###################################################################### class WordNetICCorpusReader(CorpusReader): """ A corpus reader for the WordNet information content corpus. """ def __init__(self, root, fileids): CorpusReader.__init__(self, root, fileids, encoding="utf8") # this load function would be more efficient if the data was pickled # Note that we can't use NLTK's frequency distributions because # synsets are overlapping (each instance of a synset also counts # as an instance of its hypernyms) def ic(self, icfile): """ Load an information content file from the wordnet_ic corpus and return a dictionary. This dictionary has just two keys, NOUN and VERB, whose values are dictionaries that map from synsets to information content values. :type icfile: str :param icfile: The name of the wordnet_ic file (e.g. "ic-brown.dat") :return: An information content dictionary """ ic = {} ic[NOUN] = defaultdict(float) ic[VERB] = defaultdict(float) with self.open(icfile) as fp: for num, line in enumerate(fp): if num == 0: # skip the header continue fields = line.split() offset = int(fields[0][:-1]) value = float(fields[1]) pos = _get_pos(fields[0]) if len(fields) == 3 and fields[2] == "ROOT": # Store root count. ic[pos][0] += value if value != 0: ic[pos][offset] = value return ic ###################################################################### # Similarity metrics ###################################################################### # TODO: Add in the option to manually add a new root node; this will be # useful for verb similarity as there exist multiple verb taxonomies. 
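# Usage sketch for the similarity functions defined below, assuming the
# 'wordnet', 'wordnet_ic' and 'genesis' corpora are installed:
#
#     >>> from nltk.corpus import wordnet as wn
#     >>> from nltk.corpus import wordnet_ic, genesis
#     >>> brown_ic = wordnet_ic.ic('ic-brown.dat')   # WordNetICCorpusReader.ic (above)
#     >>> genesis_ic = wn.ic(genesis, False, 0.0)    # WordNetCorpusReader.ic
#     >>> dog, cat = wn.synset('dog.n.01'), wn.synset('cat.n.01')
#     >>> wn.path_similarity(dog, cat)
#     0.2
#     >>> res = wn.res_similarity(dog, cat, brown_ic)
#     >>> lin = wn.lin_similarity(dog, cat, genesis_ic)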
# More information about the metrics is available at # http://marimba.d.umn.edu/similarity/measures.html def path_similarity(synset1, synset2, verbose=False, simulate_root=True): return synset1.path_similarity(synset2, verbose, simulate_root) def lch_similarity(synset1, synset2, verbose=False, simulate_root=True): return synset1.lch_similarity(synset2, verbose, simulate_root) def wup_similarity(synset1, synset2, verbose=False, simulate_root=True): return synset1.wup_similarity(synset2, verbose, simulate_root) def res_similarity(synset1, synset2, ic, verbose=False): return synset1.res_similarity(synset2, ic, verbose) def jcn_similarity(synset1, synset2, ic, verbose=False): return synset1.jcn_similarity(synset2, ic, verbose) def lin_similarity(synset1, synset2, ic, verbose=False): return synset1.lin_similarity(synset2, ic, verbose) path_similarity.__doc__ = Synset.path_similarity.__doc__ lch_similarity.__doc__ = Synset.lch_similarity.__doc__ wup_similarity.__doc__ = Synset.wup_similarity.__doc__ res_similarity.__doc__ = Synset.res_similarity.__doc__ jcn_similarity.__doc__ = Synset.jcn_similarity.__doc__ lin_similarity.__doc__ = Synset.lin_similarity.__doc__ def _lcs_ic(synset1, synset2, ic, verbose=False): """ Get the information content of the least common subsumer that has the highest information content value. If two nodes have no explicit common subsumer, assume that they share an artificial root node that is the hypernym of all explicit roots. :type synset1: Synset :param synset1: First input synset. :type synset2: Synset :param synset2: Second input synset. Must be the same part of speech as the first synset. :type ic: dict :param ic: an information content object (as returned by ``load_ic()``). :return: The information content of the two synsets and their most informative subsumer """ if synset1._pos != synset2._pos: raise WordNetError( "Computing the least common subsumer requires " "%s and %s to have the same part of speech." % (synset1, synset2) ) ic1 = information_content(synset1, ic) ic2 = information_content(synset2, ic) subsumers = synset1.common_hypernyms(synset2) if len(subsumers) == 0: subsumer_ic = 0 else: subsumer_ic = max(information_content(s, ic) for s in subsumers) if verbose: print("> LCS Subsumer by content:", subsumer_ic) return ic1, ic2, subsumer_ic # Utility functions def information_content(synset, ic): try: icpos = ic[synset._pos] except KeyError as e: msg = "Information content file has no entries for part-of-speech: %s" raise WordNetError(msg % synset._pos) from e counts = icpos[synset._offset] if counts == 0: return _INF else: return -math.log(counts / icpos[0]) # get the part of speech (NOUN or VERB) from the information content record # (each identifier has a 'n' or 'v' suffix) def _get_pos(field): if field[-1] == "n": return NOUN elif field[-1] == "v": return VERB else: msg = ( "Unidentified part of speech in WordNet Information Content file " "for field %s" % field ) raise ValueError(msg) nltk-3.7/nltk/corpus/reader/xmldocs.py000066400000000000000000000370201420073152400201030ustar00rootroot00000000000000# Natural Language Toolkit: XML Corpus Reader # # Copyright (C) 2001-2022 NLTK Project # Author: Steven Bird # URL: # For license information, see LICENSE.TXT """ Corpus reader for corpora whose documents are xml files. 
(note -- not named 'xml' to avoid conflicting w/ standard xml package) """ import codecs from xml.etree import ElementTree from nltk.corpus.reader.api import CorpusReader from nltk.corpus.reader.util import * from nltk.data import SeekableUnicodeStreamReader from nltk.internals import ElementWrapper from nltk.tokenize import WordPunctTokenizer class XMLCorpusReader(CorpusReader): """ Corpus reader for corpora whose documents are xml files. Note that the ``XMLCorpusReader`` constructor does not take an ``encoding`` argument, because the unicode encoding is specified by the XML files themselves. See the XML specs for more info. """ def __init__(self, root, fileids, wrap_etree=False): self._wrap_etree = wrap_etree CorpusReader.__init__(self, root, fileids) def xml(self, fileid=None): # Make sure we have exactly one file -- no concatenating XML. if fileid is None and len(self._fileids) == 1: fileid = self._fileids[0] if not isinstance(fileid, str): raise TypeError("Expected a single file identifier string") # Read the XML in using ElementTree. with self.abspath(fileid).open() as fp: elt = ElementTree.parse(fp).getroot() # If requested, wrap it. if self._wrap_etree: elt = ElementWrapper(elt) # Return the ElementTree element. return elt def words(self, fileid=None): """ Returns all of the words and punctuation symbols in the specified file that were in text nodes -- ie, tags are ignored. Like the xml() method, fileid can only specify one file. :return: the given file's text nodes as a list of words and punctuation symbols :rtype: list(str) """ elt = self.xml(fileid) encoding = self.encoding(fileid) word_tokenizer = WordPunctTokenizer() try: iterator = elt.getiterator() except: iterator = elt.iter() out = [] for node in iterator: text = node.text if text is not None: if isinstance(text, bytes): text = text.decode(encoding) toks = word_tokenizer.tokenize(text) out.extend(toks) return out class XMLCorpusView(StreamBackedCorpusView): """ A corpus view that selects out specified elements from an XML file, and provides a flat list-like interface for accessing them. (Note: ``XMLCorpusView`` is not used by ``XMLCorpusReader`` itself, but may be used by subclasses of ``XMLCorpusReader``.) Every XML corpus view has a "tag specification", indicating what XML elements should be included in the view; and each (non-nested) element that matches this specification corresponds to one item in the view. Tag specifications are regular expressions over tag paths, where a tag path is a list of element tag names, separated by '/', indicating the ancestry of the element. Some examples: - ``'foo'``: A top-level element whose tag is ``foo``. - ``'foo/bar'``: An element whose tag is ``bar`` and whose parent is a top-level element whose tag is ``foo``. - ``'.*/foo'``: An element whose tag is ``foo``, appearing anywhere in the xml tree. - ``'.*/(foo|bar)'``: An wlement whose tag is ``foo`` or ``bar``, appearing anywhere in the xml tree. The view items are generated from the selected XML elements via the method ``handle_elt()``. By default, this method returns the element as-is (i.e., as an ElementTree object); but it can be overridden, either via subclassing or via the ``elt_handler`` constructor parameter. """ #: If true, then display debugging output to stdout when reading #: blocks. _DEBUG = False #: The number of characters read at a time by this corpus reader. _BLOCK_SIZE = 1024 def __init__(self, fileid, tagspec, elt_handler=None): """ Create a new corpus view based on a specified XML file. 
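A usage sketch for the reader above, assuming the 'shakespeare' XML corpus has been downloaded; the fileid is an example, and the root tag of these files is expected to be PLAY:

    >>> from nltk.corpus import shakespeare
    >>> play = shakespeare.xml('dream.xml')     # one ElementTree element per file
    >>> words = shakespeare.words('dream.xml')  # tokenized text nodes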
Note that the ``XMLCorpusView`` constructor does not take an ``encoding`` argument, because the unicode encoding is specified by the XML files themselves. :type tagspec: str :param tagspec: A tag specification, indicating what XML elements should be included in the view. Each non-nested element that matches this specification corresponds to one item in the view. :param elt_handler: A function used to transform each element to a value for the view. If no handler is specified, then ``self.handle_elt()`` is called, which returns the element as an ElementTree object. The signature of elt_handler is:: elt_handler(elt, tagspec) -> value """ if elt_handler: self.handle_elt = elt_handler self._tagspec = re.compile(tagspec + r"\Z") """The tag specification for this corpus view.""" self._tag_context = {0: ()} """A dictionary mapping from file positions (as returned by ``stream.seek()`` to XML contexts. An XML context is a tuple of XML tag names, indicating which tags have not yet been closed.""" encoding = self._detect_encoding(fileid) StreamBackedCorpusView.__init__(self, fileid, encoding=encoding) def _detect_encoding(self, fileid): if isinstance(fileid, PathPointer): try: infile = fileid.open() s = infile.readline() finally: infile.close() else: with open(fileid, "rb") as infile: s = infile.readline() if s.startswith(codecs.BOM_UTF16_BE): return "utf-16-be" if s.startswith(codecs.BOM_UTF16_LE): return "utf-16-le" if s.startswith(codecs.BOM_UTF32_BE): return "utf-32-be" if s.startswith(codecs.BOM_UTF32_LE): return "utf-32-le" if s.startswith(codecs.BOM_UTF8): return "utf-8" m = re.match(br'\s*<\?xml\b.*\bencoding="([^"]+)"', s) if m: return m.group(1).decode() m = re.match(br"\s*<\?xml\b.*\bencoding='([^']+)'", s) if m: return m.group(1).decode() # No encoding found -- what should the default be? return "utf-8" def handle_elt(self, elt, context): """ Convert an element into an appropriate value for inclusion in the view. Unless overridden by a subclass or by the ``elt_handler`` constructor argument, this method simply returns ``elt``. :return: The view value corresponding to ``elt``. :type elt: ElementTree :param elt: The element that should be converted. :type context: str :param context: A string composed of element tags separated by forward slashes, indicating the XML context of the given element. For example, the string ``'foo/bar/baz'`` indicates that the element is a ``baz`` element whose parent is a ``bar`` element and whose grandparent is a top-level ``foo`` element. """ return elt #: A regular expression that matches XML fragments that do not #: contain any un-closed tags. _VALID_XML_RE = re.compile( r""" [^<]* ( (() | # comment () | # doctype decl (<[^!>][^>]*>)) # tag or PI [^<]*)* \Z""", re.DOTALL | re.VERBOSE, ) #: A regular expression used to extract the tag name from a start tag, #: end tag, or empty-elt tag string. _XML_TAG_NAME = re.compile(r"<\s*(?:/\s*)?([^\s>]+)") #: A regular expression used to find all start-tags, end-tags, and #: empty-elt tags in an XML file. This regexp is more lenient than #: the XML spec -- e.g., it allows spaces in some places where the #: spec does not. _XML_PIECE = re.compile( r""" # Include these so we can skip them: (?P )| (?P )| (?P <\?.*?\?> )| (?P ]*(\[[^\]]*])?\s*>)| # These are the ones we actually care about: (?P <\s*[^>/\?!\s][^>]*/\s*> )| (?P <\s*[^>/\?!\s][^>]*> )| (?P <\s*/[^>/\?!\s][^>]*> )""", re.DOTALL | re.VERBOSE, ) def _read_xml_fragment(self, stream): """ Read a string from the given stream that does not contain any un-closed tags. 
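A usage sketch for the view itself, assuming the 'shakespeare' corpus is installed; the tagspec and the handler below are illustrative:

    >>> import nltk.data
    >>> from nltk.corpus.reader.xmldocs import XMLCorpusView
    >>> fileid = nltk.data.find('corpora/shakespeare/dream.xml')
    >>> view = XMLCorpusView(fileid, '.*/TITLE',
    ...                      elt_handler=lambda elt, context: elt.text)
    >>> title = view[0]     # the text of the first TITLE element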
In particular, this function first reads a block from the stream of size ``self._BLOCK_SIZE``. It then checks if that block contains an un-closed tag. If it does, then this function either backtracks to the last '<', or reads another block. """ fragment = "" if isinstance(stream, SeekableUnicodeStreamReader): startpos = stream.tell() while True: # Read a block and add it to the fragment. xml_block = stream.read(self._BLOCK_SIZE) fragment += xml_block # Do we have a well-formed xml fragment? if self._VALID_XML_RE.match(fragment): return fragment # Do we have a fragment that will never be well-formed? if re.search("[<>]", fragment).group(0) == ">": pos = stream.tell() - ( len(fragment) - re.search("[<>]", fragment).end() ) raise ValueError('Unexpected ">" near char %s' % pos) # End of file? if not xml_block: raise ValueError("Unexpected end of file: tag not closed") # If not, then we must be in the middle of a <..tag..>. # If appropriate, backtrack to the most recent '<' # character. last_open_bracket = fragment.rfind("<") if last_open_bracket > 0: if self._VALID_XML_RE.match(fragment[:last_open_bracket]): if isinstance(stream, SeekableUnicodeStreamReader): stream.seek(startpos) stream.char_seek_forward(last_open_bracket) else: stream.seek(-(len(fragment) - last_open_bracket), 1) return fragment[:last_open_bracket] # Otherwise, read another block. (i.e., return to the # top of the loop.) def read_block(self, stream, tagspec=None, elt_handler=None): """ Read from ``stream`` until we find at least one element that matches ``tagspec``, and return the result of applying ``elt_handler`` to each element found. """ if tagspec is None: tagspec = self._tagspec if elt_handler is None: elt_handler = self.handle_elt # Use a stack of strings to keep track of our context: context = list(self._tag_context.get(stream.tell())) assert context is not None # check this -- could it ever happen? elts = [] elt_start = None # where does the elt start elt_depth = None # what context depth elt_text = "" while elts == [] or elt_start is not None: if isinstance(stream, SeekableUnicodeStreamReader): startpos = stream.tell() xml_fragment = self._read_xml_fragment(stream) # End of file. if not xml_fragment: if elt_start is None: break else: raise ValueError("Unexpected end of file") # Process each in the xml fragment. for piece in self._XML_PIECE.finditer(xml_fragment): if self._DEBUG: print("{:>25} {}".format("/".join(context)[-20:], piece.group())) if piece.group("START_TAG"): name = self._XML_TAG_NAME.match(piece.group()).group(1) # Keep context up-to-date. context.append(name) # Is this one of the elts we're looking for? if elt_start is None: if re.match(tagspec, "/".join(context)): elt_start = piece.start() elt_depth = len(context) elif piece.group("END_TAG"): name = self._XML_TAG_NAME.match(piece.group()).group(1) # sanity checks: if not context: raise ValueError("Unmatched tag " % name) if name != context[-1]: raise ValueError(f"Unmatched tag <{context[-1]}>...") # Is this the end of an element? 
if elt_start is not None and elt_depth == len(context): elt_text += xml_fragment[elt_start : piece.end()] elts.append((elt_text, "/".join(context))) elt_start = elt_depth = None elt_text = "" # Keep context up-to-date context.pop() elif piece.group("EMPTY_ELT_TAG"): name = self._XML_TAG_NAME.match(piece.group()).group(1) if elt_start is None: if re.match(tagspec, "/".join(context) + "/" + name): elts.append((piece.group(), "/".join(context) + "/" + name)) if elt_start is not None: # If we haven't found any elements yet, then keep # looping until we do. if elts == []: elt_text += xml_fragment[elt_start:] elt_start = 0 # If we've found at least one element, then try # backtracking to the start of the element that we're # inside of. else: # take back the last start-tag, and return what # we've gotten so far (elts is non-empty). if self._DEBUG: print(" " * 36 + "(backtrack)") if isinstance(stream, SeekableUnicodeStreamReader): stream.seek(startpos) stream.char_seek_forward(elt_start) else: stream.seek(-(len(xml_fragment) - elt_start), 1) context = context[: elt_depth - 1] elt_start = elt_depth = None elt_text = "" # Update the _tag_context dict. pos = stream.tell() if pos in self._tag_context: assert tuple(context) == self._tag_context[pos] else: self._tag_context[pos] = tuple(context) return [ elt_handler( ElementTree.fromstring(elt.encode("ascii", "xmlcharrefreplace")), context, ) for (elt, context) in elts ] nltk-3.7/nltk/corpus/reader/ycoe.py000066400000000000000000000240101420073152400173640ustar00rootroot00000000000000# Natural Language Toolkit: York-Toronto-Helsinki Parsed Corpus of Old English Prose (YCOE) # # Copyright (C) 2001-2015 NLTK Project # Author: Selina Dennis # URL: # For license information, see LICENSE.TXT """ Corpus reader for the York-Toronto-Helsinki Parsed Corpus of Old English Prose (YCOE), a 1.5 million word syntactically-annotated corpus of Old English prose texts. The corpus is distributed by the Oxford Text Archive: http://www.ota.ahds.ac.uk/ It is not included with NLTK. The YCOE corpus is divided into 100 files, each representing an Old English prose text. Tags used within each text complies to the YCOE standard: https://www-users.york.ac.uk/~lang22/YCOE/YcoeHome.htm """ import os import re from nltk.corpus.reader.api import * from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader from nltk.corpus.reader.tagged import TaggedCorpusReader from nltk.corpus.reader.util import * from nltk.tokenize import RegexpTokenizer class YCOECorpusReader(CorpusReader): """ Corpus reader for the York-Toronto-Helsinki Parsed Corpus of Old English Prose (YCOE), a 1.5 million word syntactically-annotated corpus of Old English prose texts. 
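A usage sketch, assuming the YCOE data has been obtained from the Oxford Text Archive and installed under ``nltk_data/corpora/ycoe/`` (the document identifier below is an example):

    >>> from nltk.corpus import ycoe
    >>> docs = ycoe.documents()                    # e.g. 'coadrian.o34', ...
    >>> tagged = ycoe.tagged_words('coadrian.o34') # (word, tag) pairs from the 'pos' files
    >>> trees = ycoe.parsed_sents('coadrian.o34')  # parse trees from the 'psd' files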
""" def __init__(self, root, encoding="utf8"): CorpusReader.__init__(self, root, [], encoding) self._psd_reader = YCOEParseCorpusReader( self.root.join("psd"), ".*", ".psd", encoding=encoding ) self._pos_reader = YCOETaggedCorpusReader(self.root.join("pos"), ".*", ".pos") # Make sure we have a consistent set of items: documents = {f[:-4] for f in self._psd_reader.fileids()} if {f[:-4] for f in self._pos_reader.fileids()} != documents: raise ValueError('Items in "psd" and "pos" ' "subdirectories do not match.") fileids = sorted( ["%s.psd" % doc for doc in documents] + ["%s.pos" % doc for doc in documents] ) CorpusReader.__init__(self, root, fileids, encoding) self._documents = sorted(documents) def documents(self, fileids=None): """ Return a list of document identifiers for all documents in this corpus, or for the documents with the given file(s) if specified. """ if fileids is None: return self._documents if isinstance(fileids, str): fileids = [fileids] for f in fileids: if f not in self._fileids: raise KeyError("File id %s not found" % fileids) # Strip off the '.pos' and '.psd' extensions. return sorted({f[:-4] for f in fileids}) def fileids(self, documents=None): """ Return a list of file identifiers for the files that make up this corpus, or that store the given document(s) if specified. """ if documents is None: return self._fileids elif isinstance(documents, str): documents = [documents] return sorted( set( ["%s.pos" % doc for doc in documents] + ["%s.psd" % doc for doc in documents] ) ) def _getfileids(self, documents, subcorpus): """ Helper that selects the appropriate fileids for a given set of documents from a given subcorpus (pos or psd). """ if documents is None: documents = self._documents else: if isinstance(documents, str): documents = [documents] for document in documents: if document not in self._documents: if document[-4:] in (".pos", ".psd"): raise ValueError( "Expected a document identifier, not a file " "identifier. (Use corpus.documents() to get " "a list of document identifiers." ) else: raise ValueError("Document identifier %s not found" % document) return [f"{d}.{subcorpus}" for d in documents] # Delegate to one of our two sub-readers: def words(self, documents=None): return self._pos_reader.words(self._getfileids(documents, "pos")) def sents(self, documents=None): return self._pos_reader.sents(self._getfileids(documents, "pos")) def paras(self, documents=None): return self._pos_reader.paras(self._getfileids(documents, "pos")) def tagged_words(self, documents=None): return self._pos_reader.tagged_words(self._getfileids(documents, "pos")) def tagged_sents(self, documents=None): return self._pos_reader.tagged_sents(self._getfileids(documents, "pos")) def tagged_paras(self, documents=None): return self._pos_reader.tagged_paras(self._getfileids(documents, "pos")) def parsed_sents(self, documents=None): return self._psd_reader.parsed_sents(self._getfileids(documents, "psd")) class YCOEParseCorpusReader(BracketParseCorpusReader): """Specialized version of the standard bracket parse corpus reader that strips out (CODE ...) and (ID ...) 
nodes.""" def _parse(self, t): t = re.sub(r"(?u)\((CODE|ID)[^\)]*\)", "", t) if re.match(r"\s*\(\s*\)\s*$", t): return None return BracketParseCorpusReader._parse(self, t) class YCOETaggedCorpusReader(TaggedCorpusReader): def __init__(self, root, items, encoding="utf8"): gaps_re = r"(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*" sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True) TaggedCorpusReader.__init__( self, root, items, sep="_", sent_tokenizer=sent_tokenizer ) #: A list of all documents and their titles in ycoe. documents = { "coadrian.o34": "Adrian and Ritheus", "coaelhom.o3": "Ælfric, Supplemental Homilies", "coaelive.o3": "Ælfric's Lives of Saints", "coalcuin": "Alcuin De virtutibus et vitiis", "coalex.o23": "Alexander's Letter to Aristotle", "coapollo.o3": "Apollonius of Tyre", "coaugust": "Augustine", "cobede.o2": "Bede's History of the English Church", "cobenrul.o3": "Benedictine Rule", "coblick.o23": "Blickling Homilies", "coboeth.o2": "Boethius' Consolation of Philosophy", "cobyrhtf.o3": "Byrhtferth's Manual", "cocanedgD": "Canons of Edgar (D)", "cocanedgX": "Canons of Edgar (X)", "cocathom1.o3": "Ælfric's Catholic Homilies I", "cocathom2.o3": "Ælfric's Catholic Homilies II", "cochad.o24": "Saint Chad", "cochdrul": "Chrodegang of Metz, Rule", "cochristoph": "Saint Christopher", "cochronA.o23": "Anglo-Saxon Chronicle A", "cochronC": "Anglo-Saxon Chronicle C", "cochronD": "Anglo-Saxon Chronicle D", "cochronE.o34": "Anglo-Saxon Chronicle E", "cocura.o2": "Cura Pastoralis", "cocuraC": "Cura Pastoralis (Cotton)", "codicts.o34": "Dicts of Cato", "codocu1.o1": "Documents 1 (O1)", "codocu2.o12": "Documents 2 (O1/O2)", "codocu2.o2": "Documents 2 (O2)", "codocu3.o23": "Documents 3 (O2/O3)", "codocu3.o3": "Documents 3 (O3)", "codocu4.o24": "Documents 4 (O2/O4)", "coeluc1": "Honorius of Autun, Elucidarium 1", "coeluc2": "Honorius of Autun, Elucidarium 1", "coepigen.o3": "Ælfric's Epilogue to Genesis", "coeuphr": "Saint Euphrosyne", "coeust": "Saint Eustace and his companions", "coexodusP": "Exodus (P)", "cogenesiC": "Genesis (C)", "cogregdC.o24": "Gregory's Dialogues (C)", "cogregdH.o23": "Gregory's Dialogues (H)", "coherbar": "Pseudo-Apuleius, Herbarium", "coinspolD.o34": "Wulfstan's Institute of Polity (D)", "coinspolX": "Wulfstan's Institute of Polity (X)", "cojames": "Saint James", "colacnu.o23": "Lacnunga", "colaece.o2": "Leechdoms", "colaw1cn.o3": "Laws, Cnut I", "colaw2cn.o3": "Laws, Cnut II", "colaw5atr.o3": "Laws, Æthelred V", "colaw6atr.o3": "Laws, Æthelred VI", "colawaf.o2": "Laws, Alfred", "colawafint.o2": "Alfred's Introduction to Laws", "colawger.o34": "Laws, Gerefa", "colawine.ox2": "Laws, Ine", "colawnorthu.o3": "Northumbra Preosta Lagu", "colawwllad.o4": "Laws, William I, Lad", "coleofri.o4": "Leofric", "colsigef.o3": "Ælfric's Letter to Sigefyrth", "colsigewB": "Ælfric's Letter to Sigeweard (B)", "colsigewZ.o34": "Ælfric's Letter to Sigeweard (Z)", "colwgeat": "Ælfric's Letter to Wulfgeat", "colwsigeT": "Ælfric's Letter to Wulfsige (T)", "colwsigeXa.o34": "Ælfric's Letter to Wulfsige (Xa)", "colwstan1.o3": "Ælfric's Letter to Wulfstan I", "colwstan2.o3": "Ælfric's Letter to Wulfstan II", "comargaC.o34": "Saint Margaret (C)", "comargaT": "Saint Margaret (T)", "comart1": "Martyrology, I", "comart2": "Martyrology, II", "comart3.o23": "Martyrology, III", "comarvel.o23": "Marvels of the East", "comary": "Mary of Egypt", "coneot": "Saint Neot", "conicodA": "Gospel of Nicodemus (A)", "conicodC": "Gospel of Nicodemus (C)", "conicodD": "Gospel of Nicodemus (D)", "conicodE": 
"Gospel of Nicodemus (E)", "coorosiu.o2": "Orosius", "cootest.o3": "Heptateuch", "coprefcath1.o3": "Ælfric's Preface to Catholic Homilies I", "coprefcath2.o3": "Ælfric's Preface to Catholic Homilies II", "coprefcura.o2": "Preface to the Cura Pastoralis", "coprefgen.o3": "Ælfric's Preface to Genesis", "copreflives.o3": "Ælfric's Preface to Lives of Saints", "coprefsolilo": "Preface to Augustine's Soliloquies", "coquadru.o23": "Pseudo-Apuleius, Medicina de quadrupedibus", "corood": "History of the Holy Rood-Tree", "cosevensl": "Seven Sleepers", "cosolilo": "St. Augustine's Soliloquies", "cosolsat1.o4": "Solomon and Saturn I", "cosolsat2": "Solomon and Saturn II", "cotempo.o3": "Ælfric's De Temporibus Anni", "coverhom": "Vercelli Homilies", "coverhomE": "Vercelli Homilies (E)", "coverhomL": "Vercelli Homilies (L)", "covinceB": "Saint Vincent (Bodley 343)", "covinsal": "Vindicta Salvatoris", "cowsgosp.o3": "West-Saxon Gospels", "cowulf.o34": "Wulfstan's Homilies", } nltk-3.7/nltk/corpus/util.py000066400000000000000000000131211420073152400161410ustar00rootroot00000000000000# Natural Language Toolkit: Corpus Reader Utility Functions # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # URL: # For license information, see LICENSE.TXT ###################################################################### # { Lazy Corpus Loader ###################################################################### import gc import re import nltk TRY_ZIPFILE_FIRST = False class LazyCorpusLoader: """ To see the API documentation for this lazily loaded corpus, first run corpus.ensure_loaded(), and then run help(this_corpus). LazyCorpusLoader is a proxy object which is used to stand in for a corpus object before the corpus is loaded. This allows NLTK to create an object for each corpus, but defer the costs associated with loading those corpora until the first time that they're actually accessed. The first time this object is accessed in any way, it will load the corresponding corpus, and transform itself into that corpus (by modifying its own ``__class__`` and ``__dict__`` attributes). If the corpus can not be found, then accessing this object will raise an exception, displaying installation instructions for the NLTK data package. Once they've properly installed the data package (or modified ``nltk.data.path`` to point to its location), they can then use the corpus object without restarting python. :param name: The name of the corpus :type name: str :param reader_cls: The specific CorpusReader class, e.g. PlaintextCorpusReader, WordListCorpusReader :type reader: nltk.corpus.reader.api.CorpusReader :param nltk_data_subdir: The subdirectory where the corpus is stored. :type nltk_data_subdir: str :param `*args`: Any other non-keywords arguments that `reader_cls` might need. :param `**kwargs`: Any other keywords arguments that `reader_cls` might need. """ def __init__(self, name, reader_cls, *args, **kwargs): from nltk.corpus.reader.api import CorpusReader assert issubclass(reader_cls, CorpusReader) self.__name = self.__name__ = name self.__reader_cls = reader_cls # If nltk_data_subdir is set explicitly if "nltk_data_subdir" in kwargs: # Use the specified subdirectory path self.subdir = kwargs["nltk_data_subdir"] # Pops the `nltk_data_subdir` argument, we don't need it anymore. kwargs.pop("nltk_data_subdir", None) else: # Otherwise use 'nltk_data/corpora' self.subdir = "corpora" self.__args = args self.__kwargs = kwargs def __load(self): # Find the corpus root directory. 
zip_name = re.sub(r"(([^/]+)(/.*)?)", r"\2.zip/\1/", self.__name) if TRY_ZIPFILE_FIRST: try: root = nltk.data.find(f"{self.subdir}/{zip_name}") except LookupError as e: try: root = nltk.data.find(f"{self.subdir}/{self.__name}") except LookupError: raise e else: try: root = nltk.data.find(f"{self.subdir}/{self.__name}") except LookupError as e: try: root = nltk.data.find(f"{self.subdir}/{zip_name}") except LookupError: raise e # Load the corpus. corpus = self.__reader_cls(root, *self.__args, **self.__kwargs) # This is where the magic happens! Transform ourselves into # the corpus by modifying our own __dict__ and __class__ to # match that of the corpus. args, kwargs = self.__args, self.__kwargs name, reader_cls = self.__name, self.__reader_cls self.__dict__ = corpus.__dict__ self.__class__ = corpus.__class__ # _unload support: assign __dict__ and __class__ back, then do GC. # after reassigning __dict__ there shouldn't be any references to # corpus data so the memory should be deallocated after gc.collect() def _unload(self): lazy_reader = LazyCorpusLoader(name, reader_cls, *args, **kwargs) self.__dict__ = lazy_reader.__dict__ self.__class__ = lazy_reader.__class__ gc.collect() self._unload = _make_bound_method(_unload, self) def __getattr__(self, attr): # Fix for inspect.isclass under Python 2.6 # (see https://bugs.python.org/issue1225107). # Without this fix tests may take extra 1.5GB RAM # because all corpora gets loaded during test collection. if attr == "__bases__": raise AttributeError("LazyCorpusLoader object has no attribute '__bases__'") self.__load() # This looks circular, but its not, since __load() changes our # __class__ to something new: return getattr(self, attr) def __repr__(self): return "<{} in {!r} (not loaded yet)>".format( self.__reader_cls.__name__, ".../corpora/" + self.__name, ) def _unload(self): # If an exception occurs during corpus loading then # '_unload' method may be unattached, so __getattr__ can be called; # we shouldn't trigger corpus loading again in this case. pass def _make_bound_method(func, self): """ Magic for creating bound methods (used for _unload). """ class Foo: def meth(self): pass f = Foo() bound_method = type(f.meth) try: return bound_method(func, self, self.__class__) except TypeError: # python3 return bound_method(func, self) nltk-3.7/nltk/data.py000066400000000000000000001442071420073152400145740ustar00rootroot00000000000000# Natural Language Toolkit: Utility functions # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # URL: # For license information, see LICENSE.TXT """ Functions to find and load NLTK resource files, such as corpora, grammars, and saved processing objects. Resource files are identified using URLs, such as ``nltk:corpora/abc/rural.txt`` or ``http://nltk.org/sample/toy.cfg``. The following URL protocols are supported: - ``file:path``: Specifies the file whose path is *path*. Both relative and absolute paths may be used. - ``https://host/path``: Specifies the file stored on the web server *host* at path *path*. - ``nltk:path``: Specifies the file stored in the NLTK data package at *path*. NLTK will search for these files in the directories specified by ``nltk.data.path``. If no protocol is specified, then the default protocol ``nltk:`` will be used. This module provides to functions that can be used to access a resource file, given its URL: ``load()`` loads a given resource, and adds it to a resource cache; and ``retrieve()`` copies a given resource to a local file. 
""" import codecs import functools import os import pickle import re import sys import textwrap import zipfile from abc import ABCMeta, abstractmethod from gzip import WRITE as GZ_WRITE from gzip import GzipFile from io import BytesIO, TextIOWrapper from urllib.request import url2pathname, urlopen try: from zlib import Z_SYNC_FLUSH as FLUSH except ImportError: from zlib import Z_FINISH as FLUSH from nltk import grammar, sem from nltk.compat import add_py3_data, py3_data from nltk.internals import deprecated textwrap_indent = functools.partial(textwrap.indent, prefix=" ") ###################################################################### # Search Path ###################################################################### path = [] """A list of directories where the NLTK data package might reside. These directories will be checked in order when looking for a resource in the data package. Note that this allows users to substitute in their own versions of resources, if they have them (e.g., in their home directory under ~/nltk_data).""" # User-specified locations: _paths_from_env = os.environ.get("NLTK_DATA", "").split(os.pathsep) path += [d for d in _paths_from_env if d] if "APPENGINE_RUNTIME" not in os.environ and os.path.expanduser("~/") != "~/": path.append(os.path.expanduser("~/nltk_data")) if sys.platform.startswith("win"): # Common locations on Windows: path += [ os.path.join(sys.prefix, "nltk_data"), os.path.join(sys.prefix, "share", "nltk_data"), os.path.join(sys.prefix, "lib", "nltk_data"), os.path.join(os.environ.get("APPDATA", "C:\\"), "nltk_data"), r"C:\nltk_data", r"D:\nltk_data", r"E:\nltk_data", ] else: # Common locations on UNIX & OS X: path += [ os.path.join(sys.prefix, "nltk_data"), os.path.join(sys.prefix, "share", "nltk_data"), os.path.join(sys.prefix, "lib", "nltk_data"), "/usr/share/nltk_data", "/usr/local/share/nltk_data", "/usr/lib/nltk_data", "/usr/local/lib/nltk_data", ] ###################################################################### # Util Functions ###################################################################### def gzip_open_unicode( filename, mode="rb", compresslevel=9, encoding="utf-8", fileobj=None, errors=None, newline=None, ): if fileobj is None: fileobj = GzipFile(filename, mode, compresslevel, fileobj) return TextIOWrapper(fileobj, encoding, errors, newline) def split_resource_url(resource_url): """ Splits a resource url into ":". >>> windows = sys.platform.startswith('win') >>> split_resource_url('nltk:home/nltk') ('nltk', 'home/nltk') >>> split_resource_url('nltk:/home/nltk') ('nltk', '/home/nltk') >>> split_resource_url('file:/home/nltk') ('file', '/home/nltk') >>> split_resource_url('file:///home/nltk') ('file', '/home/nltk') >>> split_resource_url('file:///C:/home/nltk') ('file', '/C:/home/nltk') """ protocol, path_ = resource_url.split(":", 1) if protocol == "nltk": pass elif protocol == "file": if path_.startswith("/"): path_ = "/" + path_.lstrip("/") else: path_ = re.sub(r"^/{0,2}", "", path_) return protocol, path_ def normalize_resource_url(resource_url): r""" Normalizes a resource url >>> windows = sys.platform.startswith('win') >>> os.path.normpath(split_resource_url(normalize_resource_url('file:grammar.fcfg'))[1]) == \ ... 
('\\' if windows else '') + os.path.abspath(os.path.join(os.curdir, 'grammar.fcfg')) True >>> not windows or normalize_resource_url('file:C:/dir/file') == 'file:///C:/dir/file' True >>> not windows or normalize_resource_url('file:C:\\dir\\file') == 'file:///C:/dir/file' True >>> not windows or normalize_resource_url('file:C:\\dir/file') == 'file:///C:/dir/file' True >>> not windows or normalize_resource_url('file://C:/dir/file') == 'file:///C:/dir/file' True >>> not windows or normalize_resource_url('file:////C:/dir/file') == 'file:///C:/dir/file' True >>> not windows or normalize_resource_url('nltk:C:/dir/file') == 'file:///C:/dir/file' True >>> not windows or normalize_resource_url('nltk:C:\\dir\\file') == 'file:///C:/dir/file' True >>> windows or normalize_resource_url('file:/dir/file/toy.cfg') == 'file:///dir/file/toy.cfg' True >>> normalize_resource_url('nltk:home/nltk') 'nltk:home/nltk' >>> windows or normalize_resource_url('nltk:/home/nltk') == 'file:///home/nltk' True >>> normalize_resource_url('https://example.com/dir/file') 'https://example.com/dir/file' >>> normalize_resource_url('dir/file') 'nltk:dir/file' """ try: protocol, name = split_resource_url(resource_url) except ValueError: # the resource url has no protocol, use the nltk protocol by default protocol = "nltk" name = resource_url # use file protocol if the path is an absolute path if protocol == "nltk" and os.path.isabs(name): protocol = "file://" name = normalize_resource_name(name, False, None) elif protocol == "file": protocol = "file://" # name is absolute name = normalize_resource_name(name, False, None) elif protocol == "nltk": protocol = "nltk:" name = normalize_resource_name(name, True) else: # handled by urllib protocol += "://" return "".join([protocol, name]) def normalize_resource_name(resource_name, allow_relative=True, relative_path=None): """ :type resource_name: str or unicode :param resource_name: The name of the resource to search for. Resource names are posix-style relative path names, such as ``corpora/brown``. Directory names will automatically be converted to a platform-appropriate path separator. 
Directory trailing slashes are preserved >>> windows = sys.platform.startswith('win') >>> normalize_resource_name('.', True) './' >>> normalize_resource_name('./', True) './' >>> windows or normalize_resource_name('dir/file', False, '/') == '/dir/file' True >>> not windows or normalize_resource_name('C:/file', False, '/') == '/C:/file' True >>> windows or normalize_resource_name('/dir/file', False, '/') == '/dir/file' True >>> windows or normalize_resource_name('../dir/file', False, '/') == '/dir/file' True >>> not windows or normalize_resource_name('/dir/file', True, '/') == 'dir/file' True >>> windows or normalize_resource_name('/dir/file', True, '/') == '/dir/file' True """ is_dir = bool(re.search(r"[\\/.]$", resource_name)) or resource_name.endswith( os.path.sep ) if sys.platform.startswith("win"): resource_name = resource_name.lstrip("/") else: resource_name = re.sub(r"^/+", "/", resource_name) if allow_relative: resource_name = os.path.normpath(resource_name) else: if relative_path is None: relative_path = os.curdir resource_name = os.path.abspath(os.path.join(relative_path, resource_name)) resource_name = resource_name.replace("\\", "/").replace(os.path.sep, "/") if sys.platform.startswith("win") and os.path.isabs(resource_name): resource_name = "/" + resource_name if is_dir and not resource_name.endswith("/"): resource_name += "/" return resource_name ###################################################################### # Path Pointers ###################################################################### class PathPointer(metaclass=ABCMeta): """ An abstract base class for 'path pointers,' used by NLTK's data package to identify specific paths. Two subclasses exist: ``FileSystemPathPointer`` identifies a file that can be accessed directly via a given absolute path. ``ZipFilePathPointer`` identifies a file contained within a zipfile, that can be accessed by reading that zipfile. """ @abstractmethod def open(self, encoding=None): """ Return a seekable read-only stream that can be used to read the contents of the file identified by this path pointer. :raise IOError: If the path specified by this pointer does not contain a readable file. """ @abstractmethod def file_size(self): """ Return the size of the file pointed to by this path pointer, in bytes. :raise IOError: If the path specified by this pointer does not contain a readable file. """ @abstractmethod def join(self, fileid): """ Return a new path pointer formed by starting at the path identified by this pointer, and then following the relative path given by ``fileid``. The path components of ``fileid`` should be separated by forward slashes, regardless of the underlying file system's path separator character. """ class FileSystemPathPointer(PathPointer, str): """ A path pointer that identifies a file which can be accessed directly via a given absolute path. """ @py3_data def __init__(self, _path): """ Create a new path pointer for the given absolute path. :raise IOError: If the given path does not exist. """ _path = os.path.abspath(_path) if not os.path.exists(_path): raise OSError("No such file or directory: %r" % _path) self._path = _path # There's no need to call str.__init__(), since it's a no-op; # str does all of its setup work in __new__. 
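# A small usage sketch of the PathPointer API (assuming the 'abc' corpus is
# installed; ``find`` may return a FileSystemPathPointer or a ZipFilePathPointer):
#
#     >>> import nltk.data
#     >>> ptr = nltk.data.find('corpora/abc/rural.txt')
#     >>> nbytes = ptr.file_size()
#     >>> stream = ptr.open(encoding='utf8')
#     >>> first_line = stream.readline()
#     >>> stream.close()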
@property def path(self): """The absolute path identified by this path pointer.""" return self._path def open(self, encoding=None): stream = open(self._path, "rb") if encoding is not None: stream = SeekableUnicodeStreamReader(stream, encoding) return stream def file_size(self): return os.stat(self._path).st_size def join(self, fileid): _path = os.path.join(self._path, fileid) return FileSystemPathPointer(_path) def __repr__(self): return "FileSystemPathPointer(%r)" % self._path def __str__(self): return self._path @deprecated("Use gzip.GzipFile instead as it also uses a buffer.") class BufferedGzipFile(GzipFile): """A ``GzipFile`` subclass for compatibility with older nltk releases. Use ``GzipFile`` directly as it also buffers in all supported Python versions. """ @py3_data def __init__( self, filename=None, mode=None, compresslevel=9, fileobj=None, **kwargs ): """Return a buffered gzip file object.""" GzipFile.__init__(self, filename, mode, compresslevel, fileobj) def write(self, data): # This is identical to GzipFile.write but does not return # the bytes written to retain compatibility. super().write(data) class GzipFileSystemPathPointer(FileSystemPathPointer): """ A subclass of ``FileSystemPathPointer`` that identifies a gzip-compressed file located at a given absolute path. ``GzipFileSystemPathPointer`` is appropriate for loading large gzip-compressed pickle objects efficiently. """ def open(self, encoding=None): stream = GzipFile(self._path, "rb") if encoding: stream = SeekableUnicodeStreamReader(stream, encoding) return stream class ZipFilePathPointer(PathPointer): """ A path pointer that identifies a file contained within a zipfile, which can be accessed by reading that zipfile. """ @py3_data def __init__(self, zipfile, entry=""): """ Create a new path pointer pointing at the specified entry in the given zipfile. :raise IOError: If the given zipfile does not exist, or if it does not contain the specified entry. """ if isinstance(zipfile, str): zipfile = OpenOnDemandZipFile(os.path.abspath(zipfile)) # Check that the entry exists: if entry: # Normalize the entry string, it should be relative: entry = normalize_resource_name(entry, True, "/").lstrip("/") try: zipfile.getinfo(entry) except Exception as e: # Sometimes directories aren't explicitly listed in # the zip file. So if `entry` is a directory name, # then check if the zipfile contains any files that # are under the given directory. if entry.endswith("/") and [ n for n in zipfile.namelist() if n.startswith(entry) ]: pass # zipfile contains a file in that directory. else: # Otherwise, complain. raise OSError( f"Zipfile {zipfile.filename!r} does not contain {entry!r}" ) from e self._zipfile = zipfile self._entry = entry @property def zipfile(self): """ The zipfile.ZipFile object used to access the zip file containing the entry identified by this path pointer. """ return self._zipfile @property def entry(self): """ The name of the file within zipfile that this path pointer points to. 
""" return self._entry def open(self, encoding=None): data = self._zipfile.read(self._entry) stream = BytesIO(data) if self._entry.endswith(".gz"): stream = GzipFile(self._entry, fileobj=stream) elif encoding is not None: stream = SeekableUnicodeStreamReader(stream, encoding) return stream def file_size(self): return self._zipfile.getinfo(self._entry).file_size def join(self, fileid): entry = f"{self._entry}/{fileid}" return ZipFilePathPointer(self._zipfile, entry) def __repr__(self): return f"ZipFilePathPointer({self._zipfile.filename!r}, {self._entry!r})" def __str__(self): return os.path.normpath(os.path.join(self._zipfile.filename, self._entry)) ###################################################################### # Access Functions ###################################################################### # Don't use a weak dictionary, because in the common case this # causes a lot more reloading that necessary. _resource_cache = {} """A dictionary used to cache resources so that they won't need to be loaded more than once.""" def find(resource_name, paths=None): """ Find the given resource by searching through the directories and zip files in paths, where a None or empty string specifies an absolute path. Returns a corresponding path name. If the given resource is not found, raise a ``LookupError``, whose message gives a pointer to the installation instructions for the NLTK downloader. Zip File Handling: - If ``resource_name`` contains a component with a ``.zip`` extension, then it is assumed to be a zipfile; and the remaining path components are used to look inside the zipfile. - If any element of ``nltk.data.path`` has a ``.zip`` extension, then it is assumed to be a zipfile. - If a given resource name that does not contain any zipfile component is not found initially, then ``find()`` will make a second attempt to find that resource, by replacing each component *p* in the path with *p.zip/p*. For example, this allows ``find()`` to map the resource name ``corpora/chat80/cities.pl`` to a zip file path pointer to ``corpora/chat80.zip/chat80/cities.pl``. - When using ``find()`` to locate a directory contained in a zipfile, the resource name must end with the forward slash character. Otherwise, ``find()`` will not locate the directory. :type resource_name: str or unicode :param resource_name: The name of the resource to search for. Resource names are posix-style relative path names, such as ``corpora/brown``. Directory names will be automatically converted to a platform-appropriate path separator. :rtype: str """ resource_name = normalize_resource_name(resource_name, True) # Resolve default paths at runtime in-case the user overrides # nltk.data.path if paths is None: paths = path # Check if the resource name includes a zipfile name m = re.match(r"(.*\.zip)/?(.*)$|", resource_name) zipfile, zipentry = m.groups() # Check each item in our path for path_ in paths: # Is the path item a zipfile? if path_ and (os.path.isfile(path_) and path_.endswith(".zip")): try: return ZipFilePathPointer(path_, resource_name) except OSError: # resource not in zipfile continue # Is the path item a directory or is resource_name an absolute path? 
elif not path_ or os.path.isdir(path_): if zipfile is None: p = os.path.join(path_, url2pathname(resource_name)) if os.path.exists(p): if p.endswith(".gz"): return GzipFileSystemPathPointer(p) else: return FileSystemPathPointer(p) else: p = os.path.join(path_, url2pathname(zipfile)) if os.path.exists(p): try: return ZipFilePathPointer(p, zipentry) except OSError: # resource not in zipfile continue # Fallback: if the path doesn't include a zip file, then try # again, assuming that one of the path components is inside a # zipfile of the same name. if zipfile is None: pieces = resource_name.split("/") for i in range(len(pieces)): modified_name = "/".join(pieces[:i] + [pieces[i] + ".zip"] + pieces[i:]) try: return find(modified_name, paths) except LookupError: pass # Identify the package (i.e. the .zip file) to download. resource_zipname = resource_name.split("/")[1] if resource_zipname.endswith(".zip"): resource_zipname = resource_zipname.rpartition(".")[0] # Display a friendly error message if the resource wasn't found: msg = str( "Resource \33[93m{resource}\033[0m not found.\n" "Please use the NLTK Downloader to obtain the resource:\n\n" "\33[31m" # To display red text in terminal. ">>> import nltk\n" ">>> nltk.download('{resource}')\n" "\033[0m" ).format(resource=resource_zipname) msg = textwrap_indent(msg) msg += "\n For more information see: https://www.nltk.org/data.html\n" msg += "\n Attempted to load \33[93m{resource_name}\033[0m\n".format( resource_name=resource_name ) msg += "\n Searched in:" + "".join("\n - %r" % d for d in paths) sep = "*" * 70 resource_not_found = f"\n{sep}\n{msg}\n{sep}\n" raise LookupError(resource_not_found) def retrieve(resource_url, filename=None, verbose=True): """ Copy the given resource to a local file. If no filename is specified, then use the URL's filename. If there is already a file named ``filename``, then raise a ``ValueError``. :type resource_url: str :param resource_url: A URL specifying where the resource should be loaded from. The default protocol is "nltk:", which searches for the file in the the NLTK data package. """ resource_url = normalize_resource_url(resource_url) if filename is None: if resource_url.startswith("file:"): filename = os.path.split(resource_url)[-1] else: filename = re.sub(r"(^\w+:)?.*/", "", resource_url) if os.path.exists(filename): filename = os.path.abspath(filename) raise ValueError("File %r already exists!" % filename) if verbose: print(f"Retrieving {resource_url!r}, saving to {filename!r}") # Open the input & output streams. infile = _open(resource_url) # Copy infile -> outfile, using 64k blocks. with open(filename, "wb") as outfile: while True: s = infile.read(1024 * 64) # 64k blocks. outfile.write(s) if not s: break infile.close() #: A dictionary describing the formats that are supported by NLTK's #: load() method. Keys are format names, and values are format #: descriptions. FORMATS = { "pickle": "A serialized python object, stored using the pickle module.", "json": "A serialized python object, stored using the json module.", "yaml": "A serialized python object, stored using the yaml module.", "cfg": "A context free grammar.", "pcfg": "A probabilistic CFG.", "fcfg": "A feature CFG.", "fol": "A list of first order logic expressions, parsed with " "nltk.sem.logic.Expression.fromstring.", "logic": "A list of first order logic expressions, parsed with " "nltk.sem.logic.LogicParser. 
Requires an additional logic_parser " "parameter", "val": "A semantic valuation, parsed by nltk.sem.Valuation.fromstring.", "raw": "The raw (byte string) contents of a file.", "text": "The raw (unicode string) contents of a file. ", } #: A dictionary mapping from file extensions to format names, used #: by load() when format="auto" to decide the format for a #: given resource url. AUTO_FORMATS = { "pickle": "pickle", "json": "json", "yaml": "yaml", "cfg": "cfg", "pcfg": "pcfg", "fcfg": "fcfg", "fol": "fol", "logic": "logic", "val": "val", "txt": "text", "text": "text", } def load( resource_url, format="auto", cache=True, verbose=False, logic_parser=None, fstruct_reader=None, encoding=None, ): """ Load a given resource from the NLTK data package. The following resource formats are currently supported: - ``pickle`` - ``json`` - ``yaml`` - ``cfg`` (context free grammars) - ``pcfg`` (probabilistic CFGs) - ``fcfg`` (feature-based CFGs) - ``fol`` (formulas of First Order Logic) - ``logic`` (Logical formulas to be parsed by the given logic_parser) - ``val`` (valuation of First Order Logic model) - ``text`` (the file contents as a unicode string) - ``raw`` (the raw file contents as a byte string) If no format is specified, ``load()`` will attempt to determine a format based on the resource name's file extension. If that fails, ``load()`` will raise a ``ValueError`` exception. For all text formats (everything except ``pickle``, ``json``, ``yaml`` and ``raw``), it tries to decode the raw contents using UTF-8, and if that doesn't work, it tries with ISO-8859-1 (Latin-1), unless the ``encoding`` is specified. :type resource_url: str :param resource_url: A URL specifying where the resource should be loaded from. The default protocol is "nltk:", which searches for the file in the the NLTK data package. :type cache: bool :param cache: If true, add this resource to a cache. If load() finds a resource in its cache, then it will return it from the cache rather than loading it. :type verbose: bool :param verbose: If true, print a message when loading a resource. Messages are not displayed when a resource is retrieved from the cache. :type logic_parser: LogicParser :param logic_parser: The parser that will be used to parse logical expressions. :type fstruct_reader: FeatStructReader :param fstruct_reader: The parser that will be used to parse the feature structure of an fcfg. :type encoding: str :param encoding: the encoding of the input; only used for text formats. """ resource_url = normalize_resource_url(resource_url) resource_url = add_py3_data(resource_url) # Determine the format of the resource. if format == "auto": resource_url_parts = resource_url.split(".") ext = resource_url_parts[-1] if ext == "gz": ext = resource_url_parts[-2] format = AUTO_FORMATS.get(ext) if format is None: raise ValueError( "Could not determine format for %s based " 'on its file\nextension; use the "format" ' "argument to specify the format explicitly." % resource_url ) if format not in FORMATS: raise ValueError(f"Unknown format type: {format}!") # If we've cached the resource, then just return it. if cache: resource_val = _resource_cache.get((resource_url, format)) if resource_val is not None: if verbose: print(f"<>") return resource_val # Let the user know what's going on. if verbose: print(f"<>") # Load the resource. 
opened_resource = _open(resource_url) if format == "raw": resource_val = opened_resource.read() elif format == "pickle": resource_val = pickle.load(opened_resource) elif format == "json": import json from nltk.jsontags import json_tags resource_val = json.load(opened_resource) tag = None if len(resource_val) != 1: tag = next(resource_val.keys()) if tag not in json_tags: raise ValueError("Unknown json tag.") elif format == "yaml": import yaml resource_val = yaml.safe_load(opened_resource) else: # The resource is a text format. binary_data = opened_resource.read() if encoding is not None: string_data = binary_data.decode(encoding) else: try: string_data = binary_data.decode("utf-8") except UnicodeDecodeError: string_data = binary_data.decode("latin-1") if format == "text": resource_val = string_data elif format == "cfg": resource_val = grammar.CFG.fromstring(string_data, encoding=encoding) elif format == "pcfg": resource_val = grammar.PCFG.fromstring(string_data, encoding=encoding) elif format == "fcfg": resource_val = grammar.FeatureGrammar.fromstring( string_data, logic_parser=logic_parser, fstruct_reader=fstruct_reader, encoding=encoding, ) elif format == "fol": resource_val = sem.read_logic( string_data, logic_parser=sem.logic.LogicParser(), encoding=encoding, ) elif format == "logic": resource_val = sem.read_logic( string_data, logic_parser=logic_parser, encoding=encoding ) elif format == "val": resource_val = sem.read_valuation(string_data, encoding=encoding) else: raise AssertionError( "Internal NLTK error: Format %s isn't " "handled by nltk.data.load()" % (format,) ) opened_resource.close() # If requested, add it to the cache. if cache: try: _resource_cache[(resource_url, format)] = resource_val # TODO: add this line # print('<>' % (resource_url,)) except TypeError: # We can't create weak references to some object types, like # strings and tuples. For now, just don't cache them. pass return resource_val def show_cfg(resource_url, escape="##"): """ Write out a grammar file, ignoring escaped and empty lines. :type resource_url: str :param resource_url: A URL specifying where the resource should be loaded from. The default protocol is "nltk:", which searches for the file in the the NLTK data package. :type escape: str :param escape: Prepended string that signals lines to be ignored """ resource_url = normalize_resource_url(resource_url) resource_val = load(resource_url, format="text", cache=False) lines = resource_val.splitlines() for l in lines: if l.startswith(escape): continue if re.match("^$", l): continue print(l) def clear_cache(): """ Remove all objects from the resource cache. :see: load() """ _resource_cache.clear() def _open(resource_url): """ Helper function that returns an open file object for a resource, given its resource URL. If the given resource URL uses the "nltk:" protocol, or uses no protocol, then use ``nltk.data.find`` to find its path, and open it with the given mode; if the resource URL uses the 'file' protocol, then open the file with the given mode; otherwise, delegate to ``urllib2.urlopen``. :type resource_url: str :param resource_url: A URL specifying where the resource should be loaded from. The default protocol is "nltk:", which searches for the file in the the NLTK data package. 
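For example (a sketch assuming the book grammars are installed; the second ``load`` call is served from the cache):

    >>> import nltk.data
    >>> g = nltk.data.load('grammars/book_grammars/feat0.fcfg')   # parsed and cached
    >>> g = nltk.data.load('grammars/book_grammars/feat0.fcfg')   # returned from the cache
    >>> nltk.data.clear_cache()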
""" resource_url = normalize_resource_url(resource_url) protocol, path_ = split_resource_url(resource_url) if protocol is None or protocol.lower() == "nltk": return find(path_, path + [""]).open() elif protocol.lower() == "file": # urllib might not use mode='rb', so handle this one ourselves: return find(path_, [""]).open() else: return urlopen(resource_url) ###################################################################### # Lazy Resource Loader ###################################################################### class LazyLoader: @py3_data def __init__(self, _path): self._path = _path def __load(self): resource = load(self._path) # This is where the magic happens! Transform ourselves into # the object by modifying our own __dict__ and __class__ to # match that of `resource`. self.__dict__ = resource.__dict__ self.__class__ = resource.__class__ def __getattr__(self, attr): self.__load() # This looks circular, but its not, since __load() changes our # __class__ to something new: return getattr(self, attr) def __repr__(self): self.__load() # This looks circular, but its not, since __load() changes our # __class__ to something new: return repr(self) ###################################################################### # Open-On-Demand ZipFile ###################################################################### class OpenOnDemandZipFile(zipfile.ZipFile): """ A subclass of ``zipfile.ZipFile`` that closes its file pointer whenever it is not using it; and re-opens it when it needs to read data from the zipfile. This is useful for reducing the number of open file handles when many zip files are being accessed at once. ``OpenOnDemandZipFile`` must be constructed from a filename, not a file-like object (to allow re-opening). ``OpenOnDemandZipFile`` is read-only (i.e. ``write()`` and ``writestr()`` are disabled. """ @py3_data def __init__(self, filename): if not isinstance(filename, str): raise TypeError("ReopenableZipFile filename must be a string") zipfile.ZipFile.__init__(self, filename) assert self.filename == filename self.close() # After closing a ZipFile object, the _fileRefCnt needs to be cleared # for Python2and3 compatible code. self._fileRefCnt = 0 def read(self, name): assert self.fp is None self.fp = open(self.filename, "rb") value = zipfile.ZipFile.read(self, name) # Ensure that _fileRefCnt needs to be set for Python2and3 compatible code. # Since we only opened one file here, we add 1. self._fileRefCnt += 1 self.close() return value def write(self, *args, **kwargs): """:raise NotImplementedError: OpenOnDemandZipfile is read-only""" raise NotImplementedError("OpenOnDemandZipfile is read-only") def writestr(self, *args, **kwargs): """:raise NotImplementedError: OpenOnDemandZipfile is read-only""" raise NotImplementedError("OpenOnDemandZipfile is read-only") def __repr__(self): return repr("OpenOnDemandZipFile(%r)" % self.filename) ###################################################################### # Seekable Unicode Stream Reader ###################################################################### class SeekableUnicodeStreamReader: """ A stream reader that automatically encodes the source byte stream into unicode (like ``codecs.StreamReader``); but still supports the ``seek()`` and ``tell()`` operations correctly. This is in contrast to ``codecs.StreamReader``, which provide *broken* ``seek()`` and ``tell()`` methods. 
This class was motivated by ``StreamBackedCorpusView``, which makes extensive use of ``seek()`` and ``tell()``, and needs to be able to handle unicode-encoded files. Note: this class requires stateless decoders. To my knowledge, this shouldn't cause a problem with any of python's builtin unicode encodings. """ DEBUG = True # : If true, then perform extra sanity checks. @py3_data def __init__(self, stream, encoding, errors="strict"): # Rewind the stream to its beginning. stream.seek(0) self.stream = stream """The underlying stream.""" self.encoding = encoding """The name of the encoding that should be used to encode the underlying stream.""" self.errors = errors """The error mode that should be used when decoding data from the underlying stream. Can be 'strict', 'ignore', or 'replace'.""" self.decode = codecs.getdecoder(encoding) """The function that is used to decode byte strings into unicode strings.""" self.bytebuffer = b"" """A buffer to use bytes that have been read but have not yet been decoded. This is only used when the final bytes from a read do not form a complete encoding for a character.""" self.linebuffer = None """A buffer used by ``readline()`` to hold characters that have been read, but have not yet been returned by ``read()`` or ``readline()``. This buffer consists of a list of unicode strings, where each string corresponds to a single line. The final element of the list may or may not be a complete line. Note that the existence of a linebuffer makes the ``tell()`` operation more complex, because it must backtrack to the beginning of the buffer to determine the correct file position in the underlying byte stream.""" self._rewind_checkpoint = 0 """The file position at which the most recent read on the underlying stream began. This is used, together with ``_rewind_numchars``, to backtrack to the beginning of ``linebuffer`` (which is required by ``tell()``).""" self._rewind_numchars = None """The number of characters that have been returned since the read that started at ``_rewind_checkpoint``. This is used, together with ``_rewind_checkpoint``, to backtrack to the beginning of ``linebuffer`` (which is required by ``tell()``).""" self._bom = self._check_bom() """The length of the byte order marker at the beginning of the stream (or None for no byte order marker).""" # ///////////////////////////////////////////////////////////////// # Read methods # ///////////////////////////////////////////////////////////////// def read(self, size=None): """ Read up to ``size`` bytes, decode them using this reader's encoding, and return the resulting unicode string. :param size: The maximum number of bytes to read. If not specified, then read as many bytes as possible. :type size: int :rtype: unicode """ chars = self._read(size) # If linebuffer is not empty, then include it in the result if self.linebuffer: chars = "".join(self.linebuffer) + chars self.linebuffer = None self._rewind_numchars = None return chars def discard_line(self): if self.linebuffer and len(self.linebuffer) > 1: line = self.linebuffer.pop(0) self._rewind_numchars += len(line) else: self.stream.readline() def readline(self, size=None): """ Read a line of text, decode it using this reader's encoding, and return the resulting unicode string. :param size: The maximum number of bytes to read. If no newline is encountered before ``size`` bytes have been read, then the returned value may not be a complete line of text. :type size: int """ # If we have a non-empty linebuffer, then return the first # line from it. 
(Note that the last element of linebuffer may # not be a complete line; so let _read() deal with it.) if self.linebuffer and len(self.linebuffer) > 1: line = self.linebuffer.pop(0) self._rewind_numchars += len(line) return line readsize = size or 72 chars = "" # If there's a remaining incomplete line in the buffer, add it. if self.linebuffer: chars += self.linebuffer.pop() self.linebuffer = None while True: startpos = self.stream.tell() - len(self.bytebuffer) new_chars = self._read(readsize) # If we're at a '\r', then read one extra character, since # it might be a '\n', to get the proper line ending. if new_chars and new_chars.endswith("\r"): new_chars += self._read(1) chars += new_chars lines = chars.splitlines(True) if len(lines) > 1: line = lines[0] self.linebuffer = lines[1:] self._rewind_numchars = len(new_chars) - (len(chars) - len(line)) self._rewind_checkpoint = startpos break elif len(lines) == 1: line0withend = lines[0] line0withoutend = lines[0].splitlines(False)[0] if line0withend != line0withoutend: # complete line line = line0withend break if not new_chars or size is not None: line = chars break # Read successively larger blocks of text. if readsize < 8000: readsize *= 2 return line def readlines(self, sizehint=None, keepends=True): """ Read this file's contents, decode them using this reader's encoding, and return it as a list of unicode lines. :rtype: list(unicode) :param sizehint: Ignored. :param keepends: If false, then strip newlines. """ return self.read().splitlines(keepends) def next(self): """Return the next decoded line from the underlying stream.""" line = self.readline() if line: return line else: raise StopIteration def __next__(self): return self.next() def __iter__(self): """Return self""" return self def __del__(self): # let garbage collector deal with still opened streams if not self.closed: self.close() def __enter__(self): return self def __exit__(self, type, value, traceback): self.close() def xreadlines(self): """Return self""" return self # ///////////////////////////////////////////////////////////////// # Pass-through methods & properties # ///////////////////////////////////////////////////////////////// @property def closed(self): """True if the underlying stream is closed.""" return self.stream.closed @property def name(self): """The name of the underlying stream.""" return self.stream.name @property def mode(self): """The mode of the underlying stream.""" return self.stream.mode def close(self): """ Close the underlying stream. """ self.stream.close() # ///////////////////////////////////////////////////////////////// # Seek and tell # ///////////////////////////////////////////////////////////////// def seek(self, offset, whence=0): """ Move the stream to a new file position. If the reader is maintaining any buffers, then they will be cleared. :param offset: A byte count offset. :param whence: If 0, then the offset is from the start of the file (offset should be positive), if 1, then the offset is from the current position (offset may be positive or negative); and if 2, then the offset is from the end of the file (offset should typically be negative). """ if whence == 1: raise ValueError( "Relative seek is not supported for " "SeekableUnicodeStreamReader -- consider " "using char_seek_forward() instead." 
) self.stream.seek(offset, whence) self.linebuffer = None self.bytebuffer = b"" self._rewind_numchars = None self._rewind_checkpoint = self.stream.tell() def char_seek_forward(self, offset): """ Move the read pointer forward by ``offset`` characters. """ if offset < 0: raise ValueError("Negative offsets are not supported") # Clear all buffers. self.seek(self.tell()) # Perform the seek operation. self._char_seek_forward(offset) def _char_seek_forward(self, offset, est_bytes=None): """ Move the file position forward by ``offset`` characters, ignoring all buffers. :param est_bytes: A hint, giving an estimate of the number of bytes that will be needed to move forward by ``offset`` chars. Defaults to ``offset``. """ if est_bytes is None: est_bytes = offset bytes = b"" while True: # Read in a block of bytes. newbytes = self.stream.read(est_bytes - len(bytes)) bytes += newbytes # Decode the bytes to characters. chars, bytes_decoded = self._incr_decode(bytes) # If we got the right number of characters, then seek # backwards over any truncated characters, and return. if len(chars) == offset: self.stream.seek(-len(bytes) + bytes_decoded, 1) return # If we went too far, then we can back-up until we get it # right, using the bytes we've already read. if len(chars) > offset: while len(chars) > offset: # Assume at least one byte/char. est_bytes += offset - len(chars) chars, bytes_decoded = self._incr_decode(bytes[:est_bytes]) self.stream.seek(-len(bytes) + bytes_decoded, 1) return # Otherwise, we haven't read enough bytes yet; loop again. est_bytes += offset - len(chars) def tell(self): """ Return the current file position on the underlying byte stream. If this reader is maintaining any buffers, then the returned file position will be the position of the beginning of those buffers. """ # If nothing's buffered, then just return our current filepos: if self.linebuffer is None: return self.stream.tell() - len(self.bytebuffer) # Otherwise, we'll need to backtrack the filepos until we # reach the beginning of the buffer. # Store our original file position, so we can return here. orig_filepos = self.stream.tell() # Calculate an estimate of where we think the newline is. bytes_read = (orig_filepos - len(self.bytebuffer)) - self._rewind_checkpoint buf_size = sum(len(line) for line in self.linebuffer) est_bytes = int( bytes_read * self._rewind_numchars / (self._rewind_numchars + buf_size) ) self.stream.seek(self._rewind_checkpoint) self._char_seek_forward(self._rewind_numchars, est_bytes) filepos = self.stream.tell() # Sanity check if self.DEBUG: self.stream.seek(filepos) check1 = self._incr_decode(self.stream.read(50))[0] check2 = "".join(self.linebuffer) assert check1.startswith(check2) or check2.startswith(check1) # Return to our original filepos (so we don't have to throw # out our buffer.) self.stream.seek(orig_filepos) # Return the calculated filepos return filepos # ///////////////////////////////////////////////////////////////// # Helper methods # ///////////////////////////////////////////////////////////////// def _read(self, size=None): """ Read up to ``size`` bytes from the underlying stream, decode them using this reader's encoding, and return the resulting unicode string. ``linebuffer`` is not included in the result. """ if size == 0: return "" # Skip past the byte order marker, if present. if self._bom and self.stream.tell() == 0: self.stream.read(self._bom) # Read the requested number of bytes. 
if size is None: new_bytes = self.stream.read() else: new_bytes = self.stream.read(size) bytes = self.bytebuffer + new_bytes # Decode the bytes into unicode characters chars, bytes_decoded = self._incr_decode(bytes) # If we got bytes but couldn't decode any, then read further. if (size is not None) and (not chars) and (len(new_bytes) > 0): while not chars: new_bytes = self.stream.read(1) if not new_bytes: break # end of file. bytes += new_bytes chars, bytes_decoded = self._incr_decode(bytes) # Record any bytes we didn't consume. self.bytebuffer = bytes[bytes_decoded:] # Return the result return chars def _incr_decode(self, bytes): """ Decode the given byte string into a unicode string, using this reader's encoding. If an exception is encountered that appears to be caused by a truncation error, then just decode the byte string without the bytes that cause the trunctaion error. Return a tuple ``(chars, num_consumed)``, where ``chars`` is the decoded unicode string, and ``num_consumed`` is the number of bytes that were consumed. """ while True: try: return self.decode(bytes, "strict") except UnicodeDecodeError as exc: # If the exception occurs at the end of the string, # then assume that it's a truncation error. if exc.end == len(bytes): return self.decode(bytes[: exc.start], self.errors) # Otherwise, if we're being strict, then raise it. elif self.errors == "strict": raise # If we're not strict, then re-process it with our # errors setting. This *may* raise an exception. else: return self.decode(bytes, self.errors) _BOM_TABLE = { "utf8": [(codecs.BOM_UTF8, None)], "utf16": [(codecs.BOM_UTF16_LE, "utf16-le"), (codecs.BOM_UTF16_BE, "utf16-be")], "utf16le": [(codecs.BOM_UTF16_LE, None)], "utf16be": [(codecs.BOM_UTF16_BE, None)], "utf32": [(codecs.BOM_UTF32_LE, "utf32-le"), (codecs.BOM_UTF32_BE, "utf32-be")], "utf32le": [(codecs.BOM_UTF32_LE, None)], "utf32be": [(codecs.BOM_UTF32_BE, None)], } def _check_bom(self): # Normalize our encoding name enc = re.sub("[ -]", "", self.encoding.lower()) # Look up our encoding in the BOM table. bom_info = self._BOM_TABLE.get(enc) if bom_info: # Read a prefix, to check against the BOM(s) bytes = self.stream.read(16) self.stream.seek(0) # Check for each possible BOM. for (bom, new_encoding) in bom_info: if bytes.startswith(bom): if new_encoding: self.encoding = new_encoding return len(bom) return None __all__ = [ "path", "PathPointer", "FileSystemPathPointer", "BufferedGzipFile", "GzipFileSystemPathPointer", "GzipFileSystemPathPointer", "find", "retrieve", "FORMATS", "AUTO_FORMATS", "load", "show_cfg", "clear_cache", "LazyLoader", "OpenOnDemandZipFile", "GzipFileSystemPathPointer", "SeekableUnicodeStreamReader", ] nltk-3.7/nltk/decorators.py000066400000000000000000000201231420073152400160160ustar00rootroot00000000000000""" Decorator module by Michele Simionato Copyright Michele Simionato, distributed under the terms of the BSD License (see below). http://www.phyast.pitt.edu/~micheles/python/documentation.html Included in NLTK for its support of a nice memoization decorator. """ __docformat__ = "restructuredtext en" ## The basic trick is to generate the source code for the decorated function ## with the right signature and to evaluate it. ## Uncomment the statement 'print >> sys.stderr, func_src' in _decorator ## to understand what is going on. __all__ = ["decorator", "new_wrapper", "getinfo"] import sys # Hack to keep NLTK's "tokenize" module from colliding with the "tokenize" in # the Python standard library. 
OLD_SYS_PATH = sys.path[:] sys.path = [p for p in sys.path if p and "nltk" not in str(p)] import inspect sys.path = OLD_SYS_PATH def __legacysignature(signature): """ For retrocompatibility reasons, we don't use a standard Signature. Instead, we use the string generated by this method. Basically, from a Signature we create a string and remove the default values. """ listsignature = str(signature)[1:-1].split(",") for counter, param in enumerate(listsignature): if param.count("=") > 0: listsignature[counter] = param[0 : param.index("=")].strip() else: listsignature[counter] = param.strip() return ", ".join(listsignature) def getinfo(func): """ Returns an info dictionary containing: - name (the name of the function : str) - argnames (the names of the arguments : list) - defaults (the values of the default arguments : tuple) - signature (the signature : str) - fullsignature (the full signature : Signature) - doc (the docstring : str) - module (the module name : str) - dict (the function __dict__ : str) >>> def f(self, x=1, y=2, *args, **kw): pass >>> info = getinfo(f) >>> info["name"] 'f' >>> info["argnames"] ['self', 'x', 'y', 'args', 'kw'] >>> info["defaults"] (1, 2) >>> info["signature"] 'self, x, y, *args, **kw' >>> info["fullsignature"] """ assert inspect.ismethod(func) or inspect.isfunction(func) argspec = inspect.getfullargspec(func) regargs, varargs, varkwargs = argspec[:3] argnames = list(regargs) if varargs: argnames.append(varargs) if varkwargs: argnames.append(varkwargs) fullsignature = inspect.signature(func) # Convert Signature to str signature = __legacysignature(fullsignature) # pypy compatibility if hasattr(func, "__closure__"): _closure = func.__closure__ _globals = func.__globals__ else: _closure = func.func_closure _globals = func.func_globals return dict( name=func.__name__, argnames=argnames, signature=signature, fullsignature=fullsignature, defaults=func.__defaults__, doc=func.__doc__, module=func.__module__, dict=func.__dict__, globals=_globals, closure=_closure, ) def update_wrapper(wrapper, model, infodict=None): "akin to functools.update_wrapper" infodict = infodict or getinfo(model) wrapper.__name__ = infodict["name"] wrapper.__doc__ = infodict["doc"] wrapper.__module__ = infodict["module"] wrapper.__dict__.update(infodict["dict"]) wrapper.__defaults__ = infodict["defaults"] wrapper.undecorated = model return wrapper def new_wrapper(wrapper, model): """ An improvement over functools.update_wrapper. The wrapper is a generic callable object. It works by generating a copy of the wrapper with the right signature and by updating the copy, not the original. Moreovoer, 'model' can be a dictionary with keys 'name', 'doc', 'module', 'dict', 'defaults'. """ if isinstance(model, dict): infodict = model else: # assume model is a function infodict = getinfo(model) assert ( not "_wrapper_" in infodict["argnames"] ), '"_wrapper_" is a reserved argument name!' src = "lambda %(signature)s: _wrapper_(%(signature)s)" % infodict funcopy = eval(src, dict(_wrapper_=wrapper)) return update_wrapper(funcopy, model, infodict) # helper used in decorator_factory def __call__(self, func): return new_wrapper(lambda *a, **k: self.call(func, *a, **k), func) def decorator_factory(cls): """ Take a class with a ``.caller`` method and return a callable decorator object. It works by adding a suitable __call__ method to the class; it raises a TypeError if the class already has a nontrivial __call__ method. 
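    A rough usage sketch (added for illustration; note that the code below
    actually checks for a method named ``call``, not ``caller``):

        >>> @decorator
        ... class Tracer:
        ...     def call(self, func, *args, **kw):
        ...         print("Calling %r" % func.__name__)
        ...         return func(*args, **kw)
        >>> @Tracer()
        ... def greet():
        ...     return "hi"
        >>> greet()
        Calling 'greet'
        'hi'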
""" attrs = set(dir(cls)) if "__call__" in attrs: raise TypeError( "You cannot decorate a class with a nontrivial " "__call__ method" ) if "call" not in attrs: raise TypeError("You cannot decorate a class without a " ".call method") cls.__call__ = __call__ return cls def decorator(caller): """ General purpose decorator factory: takes a caller function as input and returns a decorator with the same attributes. A caller function is any function like this:: def caller(func, *args, **kw): # do something return func(*args, **kw) Here is an example of usage: >>> @decorator ... def chatty(f, *args, **kw): ... print("Calling %r" % f.__name__) ... return f(*args, **kw) >>> chatty.__name__ 'chatty' >>> @chatty ... def f(): pass ... >>> f() Calling 'f' decorator can also take in input a class with a .caller method; in this case it converts the class into a factory of callable decorator objects. See the documentation for an example. """ if inspect.isclass(caller): return decorator_factory(caller) def _decorator(func): # the real meat is here infodict = getinfo(func) argnames = infodict["argnames"] assert not ( "_call_" in argnames or "_func_" in argnames ), "You cannot use _call_ or _func_ as argument names!" src = "lambda %(signature)s: _call_(_func_, %(signature)s)" % infodict # import sys; print >> sys.stderr, src # for debugging purposes dec_func = eval(src, dict(_func_=func, _call_=caller)) return update_wrapper(dec_func, func, infodict) return update_wrapper(_decorator, caller) def getattr_(obj, name, default_thunk): "Similar to .setdefault in dictionaries." try: return getattr(obj, name) except AttributeError: default = default_thunk() setattr(obj, name, default) return default @decorator def memoize(func, *args): dic = getattr_(func, "memoize_dic", dict) # memoize_dic is created at the first call if args in dic: return dic[args] result = func(*args) dic[args] = result return result ########################## LEGALESE ############################### ## Redistributions of source code must retain the above copyright ## notice, this list of conditions and the following disclaimer. ## Redistributions in bytecode form must reproduce the above copyright ## notice, this list of conditions and the following disclaimer in ## the documentation and/or other materials provided with the ## distribution. ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ## "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ## LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ## A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ## HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, ## INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, ## BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS ## OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ## ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR ## TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ## USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH ## DAMAGE. nltk-3.7/nltk/downloader.py000066400000000000000000002653041420073152400160230ustar00rootroot00000000000000# Natural Language Toolkit: Corpus & Model Downloader # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # URL: # For license information, see LICENSE.TXT """ The NLTK corpus and module downloader. 
This module defines several interfaces which can be used to download
corpora, models, and other data packages that can be used with NLTK.

Downloading Packages
====================

If called with no arguments, ``download()`` will display an interactive
interface which can be used to download and install new packages.  If
Tkinter is available, then a graphical interface will be shown,
otherwise a simple text interface will be provided.

Individual packages can be downloaded by calling the ``download()``
function with a single argument, giving the package identifier for the
package that should be downloaded:

    >>> download('treebank') # doctest: +SKIP
    [nltk_data] Downloading package 'treebank'...
    [nltk_data] Unzipping corpora/treebank.zip.

NLTK also provides a number of \"package collections\", consisting of
a group of related packages.  To download all packages in a collection,
simply call ``download()`` with the collection's identifier:

    >>> download('all-corpora') # doctest: +SKIP
    [nltk_data] Downloading package 'abc'...
    [nltk_data] Unzipping corpora/abc.zip.
    [nltk_data] Downloading package 'alpino'...
    [nltk_data] Unzipping corpora/alpino.zip.
    ...
    [nltk_data] Downloading package 'words'...
    [nltk_data] Unzipping corpora/words.zip.

Download Directory
==================

By default, packages are installed in either a system-wide directory
(if Python has sufficient access to write to it); or in the current
user's home directory.  However, the ``download_dir`` argument may be
used to specify a different installation target, if desired.  See
``Downloader.default_download_dir()`` for a more detailed description
of how the default download directory is chosen.

NLTK Download Server
====================

Before downloading any packages, the corpus and module downloader
contacts the NLTK download server to retrieve an index file describing
the available packages.  By default, this index file is loaded from
``https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml``.
If necessary, it is possible to create a new ``Downloader`` object,
specifying a different URL for the package index file.

Usage::

    python nltk/downloader.py [-d DATADIR] [-q] [-f] [-k] PACKAGE_IDS

or::

    python -m nltk.downloader [-d DATADIR] [-q] [-f] [-k] PACKAGE_IDS
"""
# ----------------------------------------------------------------------
"""
0 1 2 3
[label][----][label][----]
[column ][column ]

Notes
=====
Handling data files..  Some questions:

* Should the data files be kept zipped or unzipped?  I say zipped.

* Should the data files be kept in svn at all?  Advantages: history;
  automatic version numbers; 'svn up' could be used rather than the
  downloader to update the corpora.  Disadvantages: they're big, which
  makes working from svn a bit of a pain.  And we're planning to
  potentially make them much bigger.  I don't think we want people to
  have to download 400MB corpora just to use nltk from svn.

* Compromise: keep the data files in trunk/data rather than in
  trunk/nltk.  That way you can check them out in svn if you want to;
  but you don't need to, and you can use the downloader instead.

* Also: keep models in mind.  When we change the code, we'd
  potentially like the models to get updated.  This could require a
  little thought.

* So..  let's assume we have a trunk/data directory, containing a
  bunch of packages.  The packages should be kept as zip files,
  because we really shouldn't be editing them much (well -- we may
  edit models more, but they tend to be binary-ish files anyway,
  where diffs aren't that helpful).
So we'll have trunk/data, with a bunch of files like abc.zip and treebank.zip and propbank.zip. For each package we could also have eg treebank.xml and propbank.xml, describing the contents of the package (name, copyright, license, etc). Collections would also have .xml files. Finally, we would pull all these together to form a single index.xml file. Some directory structure wouldn't hurt. So how about:: /trunk/data/ ....................... root of data svn index.xml ........................ main index file src/ ............................. python scripts packages/ ........................ dir for packages corpora/ ....................... zip & xml files for corpora grammars/ ...................... zip & xml files for grammars taggers/ ....................... zip & xml files for taggers tokenizers/ .................... zip & xml files for tokenizers etc. collections/ ..................... xml files for collections Where the root (/trunk/data) would contain a makefile; and src/ would contain a script to update the info.xml file. It could also contain scripts to rebuild some of the various model files. The script that builds index.xml should probably check that each zip file expands entirely into a single subdir, whose name matches the package's uid. Changes I need to make: - in index: change "size" to "filesize" or "compressed-size" - in index: add "unzipped-size" - when checking status: check both compressed & uncompressed size. uncompressed size is important to make sure we detect a problem if something got partially unzipped. define new status values to differentiate stale vs corrupt vs corruptly-uncompressed?? (we shouldn't need to re-download the file if the zip file is ok but it didn't get uncompressed fully.) - add other fields to the index: author, license, copyright, contact, etc. the current grammars/ package would become a single new package (eg toy-grammars or book-grammars). xml file should have: - authorship info - license info - copyright info - contact info - info about what type of data/annotation it contains? - recommended corpus reader? collections can contain other collections. they can also contain multiple package types (corpora & models). Have a single 'basics' package that includes everything we talk about in the book? n.b.: there will have to be a fallback to the punkt tokenizer, in case they didn't download that model. default: unzip or not? """ import functools import itertools import os import shutil import subprocess import sys import textwrap import threading import time import warnings import zipfile from hashlib import md5 from xml.etree import ElementTree try: TKINTER = True from tkinter import Button, Canvas, Entry, Frame, IntVar, Label, Menu, TclError, Tk from tkinter.messagebox import showerror from nltk.draw.table import Table from nltk.draw.util import ShowText except ImportError: TKINTER = False TclError = ValueError from urllib.error import HTTPError, URLError from urllib.request import urlopen import nltk # urllib2 = nltk.internals.import_from_stdlib('urllib2') ###################################################################### # Directory entry objects (from the data server's index file) ###################################################################### class Package: """ A directory entry for a downloadable package. These entries are extracted from the XML index file that is downloaded by ``Downloader``. 
Each package consists of a single file; but if that file is a zip file, then it can be automatically decompressed when the package is installed. """ def __init__( self, id, url, name=None, subdir="", size=None, unzipped_size=None, checksum=None, svn_revision=None, copyright="Unknown", contact="Unknown", license="Unknown", author="Unknown", unzip=True, **kw, ): self.id = id """A unique identifier for this package.""" self.name = name or id """A string name for this package.""" self.subdir = subdir """The subdirectory where this package should be installed. E.g., ``'corpora'`` or ``'taggers'``.""" self.url = url """A URL that can be used to download this package's file.""" self.size = int(size) """The filesize (in bytes) of the package file.""" self.unzipped_size = int(unzipped_size) """The total filesize of the files contained in the package's zipfile.""" self.checksum = checksum """The MD-5 checksum of the package file.""" self.svn_revision = svn_revision """A subversion revision number for this package.""" self.copyright = copyright """Copyright holder for this package.""" self.contact = contact """Name & email of the person who should be contacted with questions about this package.""" self.license = license """License information for this package.""" self.author = author """Author of this package.""" ext = os.path.splitext(url.split("/")[-1])[1] self.filename = os.path.join(subdir, id + ext) """The filename that should be used for this package's file. It is formed by joining ``self.subdir`` with ``self.id``, and using the same extension as ``url``.""" self.unzip = bool(int(unzip)) # '0' or '1' """A flag indicating whether this corpus should be unzipped by default.""" # Include any other attributes provided by the XML file. self.__dict__.update(kw) @staticmethod def fromxml(xml): if isinstance(xml, str): xml = ElementTree.parse(xml) for key in xml.attrib: xml.attrib[key] = str(xml.attrib[key]) return Package(**xml.attrib) def __lt__(self, other): return self.id < other.id def __repr__(self): return "" % self.id class Collection: """ A directory entry for a collection of downloadable packages. These entries are extracted from the XML index file that is downloaded by ``Downloader``. """ def __init__(self, id, children, name=None, **kw): self.id = id """A unique identifier for this collection.""" self.name = name or id """A string name for this collection.""" self.children = children """A list of the ``Collections`` or ``Packages`` directly contained by this collection.""" self.packages = None """A list of ``Packages`` contained by this collection or any collections it recursively contains.""" # Include any other attributes provided by the XML file. 
self.__dict__.update(kw) @staticmethod def fromxml(xml): if isinstance(xml, str): xml = ElementTree.parse(xml) for key in xml.attrib: xml.attrib[key] = str(xml.attrib[key]) children = [child.get("ref") for child in xml.findall("item")] return Collection(children=children, **xml.attrib) def __lt__(self, other): return self.id < other.id def __repr__(self): return "" % self.id ###################################################################### # Message Passing Objects ###################################################################### class DownloaderMessage: """A status message object, used by ``incr_download`` to communicate its progress.""" class StartCollectionMessage(DownloaderMessage): """Data server has started working on a collection of packages.""" def __init__(self, collection): self.collection = collection class FinishCollectionMessage(DownloaderMessage): """Data server has finished working on a collection of packages.""" def __init__(self, collection): self.collection = collection class StartPackageMessage(DownloaderMessage): """Data server has started working on a package.""" def __init__(self, package): self.package = package class FinishPackageMessage(DownloaderMessage): """Data server has finished working on a package.""" def __init__(self, package): self.package = package class StartDownloadMessage(DownloaderMessage): """Data server has started downloading a package.""" def __init__(self, package): self.package = package class FinishDownloadMessage(DownloaderMessage): """Data server has finished downloading a package.""" def __init__(self, package): self.package = package class StartUnzipMessage(DownloaderMessage): """Data server has started unzipping a package.""" def __init__(self, package): self.package = package class FinishUnzipMessage(DownloaderMessage): """Data server has finished unzipping a package.""" def __init__(self, package): self.package = package class UpToDateMessage(DownloaderMessage): """The package download file is already up-to-date""" def __init__(self, package): self.package = package class StaleMessage(DownloaderMessage): """The package download file is out-of-date or corrupt""" def __init__(self, package): self.package = package class ErrorMessage(DownloaderMessage): """Data server encountered an error""" def __init__(self, package, message): self.package = package if isinstance(message, Exception): self.message = str(message) else: self.message = message class ProgressMessage(DownloaderMessage): """Indicates how much progress the data server has made""" def __init__(self, progress): self.progress = progress class SelectDownloadDirMessage(DownloaderMessage): """Indicates what download directory the data server is using""" def __init__(self, download_dir): self.download_dir = download_dir ###################################################################### # NLTK Data Server ###################################################################### class Downloader: """ A class used to access the NLTK data server, which can be used to download corpora and other data packages. """ # ///////////////////////////////////////////////////////////////// # Configuration # ///////////////////////////////////////////////////////////////// INDEX_TIMEOUT = 60 * 60 # 1 hour """The amount of time after which the cached copy of the data server index will be considered 'stale,' and will be re-downloaded.""" DEFAULT_URL = "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml" """The default URL for the NLTK data server's index. 
An alternative URL can be specified when creating a new ``Downloader`` object.""" # ///////////////////////////////////////////////////////////////// # Status Constants # ///////////////////////////////////////////////////////////////// INSTALLED = "installed" """A status string indicating that a package or collection is installed and up-to-date.""" NOT_INSTALLED = "not installed" """A status string indicating that a package or collection is not installed.""" STALE = "out of date" """A status string indicating that a package or collection is corrupt or out-of-date.""" PARTIAL = "partial" """A status string indicating that a collection is partially installed (i.e., only some of its packages are installed.)""" # ///////////////////////////////////////////////////////////////// # Constructor # ///////////////////////////////////////////////////////////////// def __init__(self, server_index_url=None, download_dir=None): self._url = server_index_url or self.DEFAULT_URL """The URL for the data server's index file.""" self._collections = {} """Dictionary from collection identifier to ``Collection``""" self._packages = {} """Dictionary from package identifier to ``Package``""" self._download_dir = download_dir """The default directory to which packages will be downloaded.""" self._index = None """The XML index file downloaded from the data server""" self._index_timestamp = None """Time at which ``self._index`` was downloaded. If it is more than ``INDEX_TIMEOUT`` seconds old, it will be re-downloaded.""" self._status_cache = {} """Dictionary from package/collection identifier to status string (``INSTALLED``, ``NOT_INSTALLED``, ``STALE``, or ``PARTIAL``). Cache is used for packages only, not collections.""" self._errors = None """Flag for telling if all packages got successfully downloaded or not.""" # decide where we're going to save things to. 
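        # Illustrative note (comment added; the actual result depends on the
        # platform, on nltk.data.path and on filesystem permissions).  The
        # fallback implemented by default_download_dir() below roughly means
        #
        #     Downloader().download_dir
        #     # -> the first writable entry of nltk.data.path
        #     #    (e.g. "/usr/share/nltk_data"), otherwise %APPDATA%\nltk_data
        #     #    on Windows, otherwise "~/nltk_data"
        #
        # so an explicit download_dir argument is only needed to override that.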
if self._download_dir is None: self._download_dir = self.default_download_dir() # ///////////////////////////////////////////////////////////////// # Information # ///////////////////////////////////////////////////////////////// def list( self, download_dir=None, show_packages=True, show_collections=True, header=True, more_prompt=False, skip_installed=False, ): lines = 0 # for more_prompt if download_dir is None: download_dir = self._download_dir print("Using default data directory (%s)" % download_dir) if header: print("=" * (26 + len(self._url))) print(" Data server index for <%s>" % self._url) print("=" * (26 + len(self._url))) lines += 3 # for more_prompt stale = partial = False categories = [] if show_packages: categories.append("packages") if show_collections: categories.append("collections") for category in categories: print("%s:" % category.capitalize()) lines += 1 # for more_prompt for info in sorted(getattr(self, category)(), key=str): status = self.status(info, download_dir) if status == self.INSTALLED and skip_installed: continue if status == self.STALE: stale = True if status == self.PARTIAL: partial = True prefix = { self.INSTALLED: "*", self.STALE: "-", self.PARTIAL: "P", self.NOT_INSTALLED: " ", }[status] name = textwrap.fill( "-" * 27 + (info.name or info.id), 75, subsequent_indent=27 * " " )[27:] print(" [{}] {} {}".format(prefix, info.id.ljust(20, "."), name)) lines += len(name.split("\n")) # for more_prompt if more_prompt and lines > 20: user_input = input("Hit Enter to continue: ") if user_input.lower() in ("x", "q"): return lines = 0 print() msg = "([*] marks installed packages" if stale: msg += "; [-] marks out-of-date or corrupt packages" if partial: msg += "; [P] marks partially installed collections" print(textwrap.fill(msg + ")", subsequent_indent=" ", width=76)) def packages(self): self._update_index() return self._packages.values() def corpora(self): self._update_index() return [pkg for (id, pkg) in self._packages.items() if pkg.subdir == "corpora"] def models(self): self._update_index() return [pkg for (id, pkg) in self._packages.items() if pkg.subdir != "corpora"] def collections(self): self._update_index() return self._collections.values() # ///////////////////////////////////////////////////////////////// # Downloading # ///////////////////////////////////////////////////////////////// def _info_or_id(self, info_or_id): if isinstance(info_or_id, str): return self.info(info_or_id) else: return info_or_id # [xx] When during downloading is it 'safe' to abort? Only unsafe # time is *during* an unzip -- we don't want to leave a # partially-unzipped corpus in place because we wouldn't notice # it. But if we had the exact total size of the unzipped corpus, # then that would be fine. Then we could abort anytime we want! # So this is really what we should do. That way the threaded # downloader in the gui can just kill the download thread anytime # it wants. def incr_download(self, info_or_id, download_dir=None, force=False): # If they didn't specify a download_dir, then use the default one. if download_dir is None: download_dir = self._download_dir yield SelectDownloadDirMessage(download_dir) # If they gave us a list of ids, then download each one. if isinstance(info_or_id, (list, tuple)): yield from self._download_list(info_or_id, download_dir, force) return # Look up the requested collection or package. 
try: info = self._info_or_id(info_or_id) except (OSError, ValueError) as e: yield ErrorMessage(None, f"Error loading {info_or_id}: {e}") return # Handle collections. if isinstance(info, Collection): yield StartCollectionMessage(info) yield from self.incr_download(info.children, download_dir, force) yield FinishCollectionMessage(info) # Handle Packages (delegate to a helper function). else: yield from self._download_package(info, download_dir, force) def _num_packages(self, item): if isinstance(item, Package): return 1 else: return len(item.packages) def _download_list(self, items, download_dir, force): # Look up the requested items. for i in range(len(items)): try: items[i] = self._info_or_id(items[i]) except (OSError, ValueError) as e: yield ErrorMessage(items[i], e) return # Download each item, re-scaling their progress. num_packages = sum(self._num_packages(item) for item in items) progress = 0 for i, item in enumerate(items): if isinstance(item, Package): delta = 1.0 / num_packages else: delta = len(item.packages) / num_packages for msg in self.incr_download(item, download_dir, force): if isinstance(msg, ProgressMessage): yield ProgressMessage(progress + msg.progress * delta) else: yield msg progress += 100 * delta def _download_package(self, info, download_dir, force): yield StartPackageMessage(info) yield ProgressMessage(0) # Do we already have the current version? status = self.status(info, download_dir) if not force and status == self.INSTALLED: yield UpToDateMessage(info) yield ProgressMessage(100) yield FinishPackageMessage(info) return # Remove the package from our status cache self._status_cache.pop(info.id, None) # Check for (and remove) any old/stale version. filepath = os.path.join(download_dir, info.filename) if os.path.exists(filepath): if status == self.STALE: yield StaleMessage(info) os.remove(filepath) # Ensure the download_dir exists if not os.path.exists(download_dir): os.makedirs(download_dir) if not os.path.exists(os.path.join(download_dir, info.subdir)): os.makedirs(os.path.join(download_dir, info.subdir)) # Download the file. This will raise an IOError if the url # is not found. yield StartDownloadMessage(info) yield ProgressMessage(5) try: infile = urlopen(info.url) with open(filepath, "wb") as outfile: num_blocks = max(1, info.size / (1024 * 16)) for block in itertools.count(): s = infile.read(1024 * 16) # 16k blocks. outfile.write(s) if not s: break if block % 2 == 0: # how often? yield ProgressMessage(min(80, 5 + 75 * (block / num_blocks))) infile.close() except OSError as e: yield ErrorMessage( info, "Error downloading %r from <%s>:" "\n %s" % (info.id, info.url, e), ) return yield FinishDownloadMessage(info) yield ProgressMessage(80) # If it's a zipfile, uncompress it. if info.filename.endswith(".zip"): zipdir = os.path.join(download_dir, info.subdir) # Unzip if we're unzipping by default; *or* if it's already # been unzipped (presumably a previous version). if info.unzip or os.path.exists(os.path.join(zipdir, info.id)): yield StartUnzipMessage(info) for msg in _unzip_iter(filepath, zipdir, verbose=False): # Somewhat of a hack, but we need a proper package reference msg.package = info yield msg yield FinishUnzipMessage(info) yield FinishPackageMessage(info) def download( self, info_or_id=None, download_dir=None, quiet=False, force=False, prefix="[nltk_data] ", halt_on_error=True, raise_on_error=False, print_error_to=sys.stderr, ): print_to = functools.partial(print, file=print_error_to) # If no info or id is given, then use the interactive shell. 
if info_or_id is None: # [xx] hmm -- changing self._download_dir here seems like # the wrong thing to do. Maybe the _interactive_download # function should make a new copy of self to use? if download_dir is not None: self._download_dir = download_dir self._interactive_download() return True else: # Define a helper function for displaying output: def show(s, prefix2=""): print_to( textwrap.fill( s, initial_indent=prefix + prefix2, subsequent_indent=prefix + prefix2 + " " * 4, ) ) for msg in self.incr_download(info_or_id, download_dir, force): # Error messages if isinstance(msg, ErrorMessage): show(msg.message) if raise_on_error: raise ValueError(msg.message) if halt_on_error: return False self._errors = True if not quiet: print_to("Error installing package. Retry? [n/y/e]") choice = input().strip() if choice in ["y", "Y"]: if not self.download( msg.package.id, download_dir, quiet, force, prefix, halt_on_error, raise_on_error, ): return False elif choice in ["e", "E"]: return False # All other messages if not quiet: # Collection downloading messages: if isinstance(msg, StartCollectionMessage): show("Downloading collection %r" % msg.collection.id) prefix += " | " print_to(prefix) elif isinstance(msg, FinishCollectionMessage): print_to(prefix) prefix = prefix[:-4] if self._errors: show( "Downloaded collection %r with errors" % msg.collection.id ) else: show("Done downloading collection %s" % msg.collection.id) # Package downloading messages: elif isinstance(msg, StartPackageMessage): show( "Downloading package %s to %s..." % (msg.package.id, download_dir) ) elif isinstance(msg, UpToDateMessage): show("Package %s is already up-to-date!" % msg.package.id, " ") # elif isinstance(msg, StaleMessage): # show('Package %s is out-of-date or corrupt' % # msg.package.id, ' ') elif isinstance(msg, StartUnzipMessage): show("Unzipping %s." % msg.package.filename, " ") # Data directory message: elif isinstance(msg, SelectDownloadDirMessage): download_dir = msg.download_dir return True def is_stale(self, info_or_id, download_dir=None): return self.status(info_or_id, download_dir) == self.STALE def is_installed(self, info_or_id, download_dir=None): return self.status(info_or_id, download_dir) == self.INSTALLED def clear_status_cache(self, id=None): if id is None: self._status_cache.clear() else: self._status_cache.pop(id, None) def status(self, info_or_id, download_dir=None): """ Return a constant describing the status of the given package or collection. Status can be one of ``INSTALLED``, ``NOT_INSTALLED``, ``STALE``, or ``PARTIAL``. """ if download_dir is None: download_dir = self._download_dir info = self._info_or_id(info_or_id) # Handle collections: if isinstance(info, Collection): pkg_status = [self.status(pkg.id) for pkg in info.packages] if self.STALE in pkg_status: return self.STALE elif self.PARTIAL in pkg_status: return self.PARTIAL elif self.INSTALLED in pkg_status and self.NOT_INSTALLED in pkg_status: return self.PARTIAL elif self.NOT_INSTALLED in pkg_status: return self.NOT_INSTALLED else: return self.INSTALLED # Handle packages: else: filepath = os.path.join(download_dir, info.filename) if download_dir != self._download_dir: return self._pkg_status(info, filepath) else: if info.id not in self._status_cache: self._status_cache[info.id] = self._pkg_status(info, filepath) return self._status_cache[info.id] def _pkg_status(self, info, filepath): if not os.path.exists(filepath): return self.NOT_INSTALLED # Check if the file has the correct size. 
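        # Summarizing the checks below (comment added for illustration):
        #
        #     missing file                     -> NOT_INSTALLED (handled above)
        #     wrong file size                  -> STALE (partial or failed download)
        #     wrong MD5 checksum               -> STALE (corrupt download)
        #     zipfile only partially unzipped  -> STALE
        #     otherwise                        -> INSTALLED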
try: filestat = os.stat(filepath) except OSError: return self.NOT_INSTALLED if filestat.st_size != int(info.size): return self.STALE # Check if the file's checksum matches if md5_hexdigest(filepath) != info.checksum: return self.STALE # If it's a zipfile, and it's been at least partially # unzipped, then check if it's been fully unzipped. if filepath.endswith(".zip"): unzipdir = filepath[:-4] if not os.path.exists(unzipdir): return self.INSTALLED # but not unzipped -- ok! if not os.path.isdir(unzipdir): return self.STALE unzipped_size = sum( os.stat(os.path.join(d, f)).st_size for d, _, files in os.walk(unzipdir) for f in files ) if unzipped_size != info.unzipped_size: return self.STALE # Otherwise, everything looks good. return self.INSTALLED def update(self, quiet=False, prefix="[nltk_data] "): """ Re-download any packages whose status is STALE. """ self.clear_status_cache() for pkg in self.packages(): if self.status(pkg) == self.STALE: self.download(pkg, quiet=quiet, prefix=prefix) # ///////////////////////////////////////////////////////////////// # Index # ///////////////////////////////////////////////////////////////// def _update_index(self, url=None): """A helper function that ensures that self._index is up-to-date. If the index is older than self.INDEX_TIMEOUT, then download it again.""" # Check if the index is already up-to-date. If so, do nothing. if not ( self._index is None or url is not None or time.time() - self._index_timestamp > self.INDEX_TIMEOUT ): return # If a URL was specified, then update our URL. self._url = url or self._url # Download the index file. self._index = nltk.internals.ElementWrapper( ElementTree.parse(urlopen(self._url)).getroot() ) self._index_timestamp = time.time() # Build a dictionary of packages. packages = [Package.fromxml(p) for p in self._index.findall("packages/package")] self._packages = {p.id: p for p in packages} # Build a dictionary of collections. collections = [ Collection.fromxml(c) for c in self._index.findall("collections/collection") ] self._collections = {c.id: c for c in collections} # Replace identifiers with actual children in collection.children. for collection in self._collections.values(): for i, child_id in enumerate(collection.children): if child_id in self._packages: collection.children[i] = self._packages[child_id] elif child_id in self._collections: collection.children[i] = self._collections[child_id] else: print( "removing collection member with no package: {}".format( child_id ) ) del collection.children[i] # Fill in collection.packages for each collection. for collection in self._collections.values(): packages = {} queue = [collection] for child in queue: if isinstance(child, Collection): queue.extend(child.children) elif isinstance(child, Package): packages[child.id] = child else: pass collection.packages = packages.values() # Flush the status cache self._status_cache.clear() def index(self): """ Return the XML index describing the packages available from the data server. If necessary, this index will be downloaded from the data server. 
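        A usage sketch (this contacts the data server, so it is skipped
        as a doctest)::

            >>> d = Downloader()
            >>> index = d.index()  # doctest: +SKIP
            >>> pkg_ids = [p.get("id") for p in index.findall("packages/package")]  # doctest: +SKIP
            >>> "treebank" in pkg_ids  # doctest: +SKIP
            True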
""" self._update_index() return self._index def info(self, id): """Return the ``Package`` or ``Collection`` record for the given item.""" self._update_index() if id in self._packages: return self._packages[id] if id in self._collections: return self._collections[id] raise ValueError("Package %r not found in index" % id) def xmlinfo(self, id): """Return the XML info record for the given item""" self._update_index() for package in self._index.findall("packages/package"): if package.get("id") == id: return package for collection in self._index.findall("collections/collection"): if collection.get("id") == id: return collection raise ValueError("Package %r not found in index" % id) # ///////////////////////////////////////////////////////////////// # URL & Data Directory # ///////////////////////////////////////////////////////////////// def _get_url(self): """The URL for the data server's index file.""" return self._url def _set_url(self, url): """ Set a new URL for the data server. If we're unable to contact the given url, then the original url is kept. """ original_url = self._url try: self._update_index(url) except: self._url = original_url raise url = property(_get_url, _set_url) def default_download_dir(self): """ Return the directory to which packages will be downloaded by default. This value can be overridden using the constructor, or on a case-by-case basis using the ``download_dir`` argument when calling ``download()``. On Windows, the default download directory is ``PYTHONHOME/lib/nltk``, where *PYTHONHOME* is the directory containing Python, e.g. ``C:\\Python25``. On all other platforms, the default directory is the first of the following which exists or which can be created with write permission: ``/usr/share/nltk_data``, ``/usr/local/share/nltk_data``, ``/usr/lib/nltk_data``, ``/usr/local/lib/nltk_data``, ``~/nltk_data``. """ # Check if we are on GAE where we cannot write into filesystem. if "APPENGINE_RUNTIME" in os.environ: return # Check if we have sufficient permissions to install in a # variety of system-wide locations. for nltkdir in nltk.data.path: if os.path.exists(nltkdir) and nltk.internals.is_writable(nltkdir): return nltkdir # On Windows, use %APPDATA% if sys.platform == "win32" and "APPDATA" in os.environ: homedir = os.environ["APPDATA"] # Otherwise, install in the user's home directory. else: homedir = os.path.expanduser("~/") if homedir == "~/": raise ValueError("Could not find a default download directory") # append "nltk_data" to the home directory return os.path.join(homedir, "nltk_data") def _get_download_dir(self): """ The default directory to which packages will be downloaded. This defaults to the value returned by ``default_download_dir()``. To override this default on a case-by-case basis, use the ``download_dir`` argument when calling ``download()``. """ return self._download_dir def _set_download_dir(self, download_dir): self._download_dir = download_dir # Clear the status cache. self._status_cache.clear() download_dir = property(_get_download_dir, _set_download_dir) # ///////////////////////////////////////////////////////////////// # Interactive Shell # ///////////////////////////////////////////////////////////////// def _interactive_download(self): # Try the GUI first; if that doesn't work, try the simple # interactive shell. 
if TKINTER: try: DownloaderGUI(self).mainloop() except TclError: DownloaderShell(self).run() else: DownloaderShell(self).run() class DownloaderShell: def __init__(self, dataserver): self._ds = dataserver def _simple_interactive_menu(self, *options): print("-" * 75) spc = (68 - sum(len(o) for o in options)) // (len(options) - 1) * " " print(" " + spc.join(options)) print("-" * 75) def run(self): print("NLTK Downloader") while True: self._simple_interactive_menu( "d) Download", "l) List", " u) Update", "c) Config", "h) Help", "q) Quit", ) user_input = input("Downloader> ").strip() if not user_input: print() continue command = user_input.lower().split()[0] args = user_input.split()[1:] try: if command == "l": print() self._ds.list(self._ds.download_dir, header=False, more_prompt=True) elif command == "h": self._simple_interactive_help() elif command == "c": self._simple_interactive_config() elif command in ("q", "x"): return elif command == "d": self._simple_interactive_download(args) elif command == "u": self._simple_interactive_update() else: print("Command %r unrecognized" % user_input) except HTTPError as e: print("Error reading from server: %s" % e) except URLError as e: print("Error connecting to server: %s" % e.reason) # try checking if user_input is a package name, & # downloading it? print() def _simple_interactive_download(self, args): if args: for arg in args: try: self._ds.download(arg, prefix=" ") except (OSError, ValueError) as e: print(e) else: while True: print() print("Download which package (l=list; x=cancel)?") user_input = input(" Identifier> ") if user_input.lower() == "l": self._ds.list( self._ds.download_dir, header=False, more_prompt=True, skip_installed=True, ) continue elif user_input.lower() in ("x", "q", ""): return elif user_input: for id in user_input.split(): try: self._ds.download(id, prefix=" ") except (OSError, ValueError) as e: print(e) break def _simple_interactive_update(self): while True: stale_packages = [] stale = partial = False for info in sorted(getattr(self._ds, "packages")(), key=str): if self._ds.status(info) == self._ds.STALE: stale_packages.append((info.id, info.name)) print() if stale_packages: print("Will update following packages (o=ok; x=cancel)") for pid, pname in stale_packages: name = textwrap.fill( "-" * 27 + (pname), 75, subsequent_indent=27 * " " )[27:] print(" [ ] {} {}".format(pid.ljust(20, "."), name)) print() user_input = input(" Identifier> ") if user_input.lower() == "o": for pid, pname in stale_packages: try: self._ds.download(pid, prefix=" ") except (OSError, ValueError) as e: print(e) break elif user_input.lower() in ("x", "q", ""): return else: print("Nothing to update.") return def _simple_interactive_help(self): print() print("Commands:") print( " d) Download a package or collection u) Update out of date packages" ) print(" l) List packages & collections h) Help") print(" c) View & Modify Configuration q) Quit") def _show_config(self): print() print("Data Server:") print(" - URL: <%s>" % self._ds.url) print(" - %d Package Collections Available" % len(self._ds.collections())) print(" - %d Individual Packages Available" % len(self._ds.packages())) print() print("Local Machine:") print(" - Data directory: %s" % self._ds.download_dir) def _simple_interactive_config(self): self._show_config() while True: print() self._simple_interactive_menu( "s) Show Config", "u) Set Server URL", "d) Set Data Dir", "m) Main Menu" ) user_input = input("Config> ").strip().lower() if user_input == "s": self._show_config() elif user_input == "d": 
new_dl_dir = input(" New Directory> ").strip() if new_dl_dir in ("", "x", "q", "X", "Q"): print(" Cancelled!") elif os.path.isdir(new_dl_dir): self._ds.download_dir = new_dl_dir else: print("Directory %r not found! Create it first." % new_dl_dir) elif user_input == "u": new_url = input(" New URL> ").strip() if new_url in ("", "x", "q", "X", "Q"): print(" Cancelled!") else: if not new_url.startswith(("http://", "https://")): new_url = "http://" + new_url try: self._ds.url = new_url except Exception as e: print(f"Error reading <{new_url!r}>:\n {e}") elif user_input == "m": break class DownloaderGUI: """ Graphical interface for downloading packages from the NLTK data server. """ # ///////////////////////////////////////////////////////////////// # Column Configuration # ///////////////////////////////////////////////////////////////// COLUMNS = [ "", "Identifier", "Name", "Size", "Status", "Unzipped Size", "Copyright", "Contact", "License", "Author", "Subdir", "Checksum", ] """A list of the names of columns. This controls the order in which the columns will appear. If this is edited, then ``_package_to_columns()`` may need to be edited to match.""" COLUMN_WEIGHTS = {"": 0, "Name": 5, "Size": 0, "Status": 0} """A dictionary specifying how columns should be resized when the table is resized. Columns with weight 0 will not be resized at all; and columns with high weight will be resized more. Default weight (for columns not explicitly listed) is 1.""" COLUMN_WIDTHS = { "": 1, "Identifier": 20, "Name": 45, "Size": 10, "Unzipped Size": 10, "Status": 12, } """A dictionary specifying how wide each column should be, in characters. The default width (for columns not explicitly listed) is specified by ``DEFAULT_COLUMN_WIDTH``.""" DEFAULT_COLUMN_WIDTH = 30 """The default width for columns that are not explicitly listed in ``COLUMN_WIDTHS``.""" INITIAL_COLUMNS = ["", "Identifier", "Name", "Size", "Status"] """The set of columns that should be displayed by default.""" # Perform a few import-time sanity checks to make sure that the # column configuration variables are defined consistently: for c in COLUMN_WEIGHTS: assert c in COLUMNS for c in COLUMN_WIDTHS: assert c in COLUMNS for c in INITIAL_COLUMNS: assert c in COLUMNS # ///////////////////////////////////////////////////////////////// # Color Configuration # ///////////////////////////////////////////////////////////////// _BACKDROP_COLOR = ("#000", "#ccc") _ROW_COLOR = { Downloader.INSTALLED: ("#afa", "#080"), Downloader.PARTIAL: ("#ffa", "#880"), Downloader.STALE: ("#faa", "#800"), Downloader.NOT_INSTALLED: ("#fff", "#888"), } _MARK_COLOR = ("#000", "#ccc") # _FRONT_TAB_COLOR = ('#ccf', '#008') # _BACK_TAB_COLOR = ('#88a', '#448') _FRONT_TAB_COLOR = ("#fff", "#45c") _BACK_TAB_COLOR = ("#aaa", "#67a") _PROGRESS_COLOR = ("#f00", "#aaa") _TAB_FONT = "helvetica -16 bold" # ///////////////////////////////////////////////////////////////// # Constructor # ///////////////////////////////////////////////////////////////// def __init__(self, dataserver, use_threads=True): self._ds = dataserver self._use_threads = use_threads # For the threaded downloader: self._download_lock = threading.Lock() self._download_msg_queue = [] self._download_abort_queue = [] self._downloading = False # For tkinter after callbacks: self._afterid = {} # A message log. self._log_messages = [] self._log_indent = 0 self._log("NLTK Downloader Started!") # Create the main window. 
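# The main Tk window: fix its initial position, set the title and backdrop colour, and install the destroy bindings straight away so the window can still be closed if the table initialisation below fails.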
top = self.top = Tk() top.geometry("+50+50") top.title("NLTK Downloader") top.configure(background=self._BACKDROP_COLOR[1]) # Set up some bindings now, in case anything goes wrong. top.bind("", self.destroy) top.bind("", self.destroy) self._destroyed = False self._column_vars = {} # Initialize the GUI. self._init_widgets() self._init_menu() try: self._fill_table() except HTTPError as e: showerror("Error reading from server", e) except URLError as e: showerror("Error connecting to server", e.reason) self._show_info() self._select_columns() self._table.select(0) # Make sure we get notified when we're destroyed, so we can # cancel any download in progress. self._table.bind("", self._destroy) def _log(self, msg): self._log_messages.append( "{} {}{}".format(time.ctime(), " | " * self._log_indent, msg) ) # ///////////////////////////////////////////////////////////////// # Internals # ///////////////////////////////////////////////////////////////// def _init_widgets(self): # Create the top-level frame structures f1 = Frame(self.top, relief="raised", border=2, padx=8, pady=0) f1.pack(sid="top", expand=True, fill="both") f1.grid_rowconfigure(2, weight=1) f1.grid_columnconfigure(0, weight=1) Frame(f1, height=8).grid(column=0, row=0) # spacer tabframe = Frame(f1) tabframe.grid(column=0, row=1, sticky="news") tableframe = Frame(f1) tableframe.grid(column=0, row=2, sticky="news") buttonframe = Frame(f1) buttonframe.grid(column=0, row=3, sticky="news") Frame(f1, height=8).grid(column=0, row=4) # spacer infoframe = Frame(f1) infoframe.grid(column=0, row=5, sticky="news") Frame(f1, height=8).grid(column=0, row=6) # spacer progressframe = Frame( self.top, padx=3, pady=3, background=self._BACKDROP_COLOR[1] ) progressframe.pack(side="bottom", fill="x") self.top["border"] = 0 self.top["highlightthickness"] = 0 # Create the tabs self._tab_names = ["Collections", "Corpora", "Models", "All Packages"] self._tabs = {} for i, tab in enumerate(self._tab_names): label = Label(tabframe, text=tab, font=self._TAB_FONT) label.pack(side="left", padx=((i + 1) % 2) * 10) label.bind("", self._select_tab) self._tabs[tab.lower()] = label # Create the table. 
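# The table is driven by the COLUMNS / COLUMN_WEIGHTS / COLUMN_WIDTHS class attributes; column 0 is the unnamed mark column used to flag packages for download, and the key bindings let the user mark rows, switch tabs and start downloads from the keyboard.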
column_weights = [self.COLUMN_WEIGHTS.get(column, 1) for column in self.COLUMNS] self._table = Table( tableframe, self.COLUMNS, column_weights=column_weights, highlightthickness=0, listbox_height=16, reprfunc=self._table_reprfunc, ) self._table.columnconfig(0, foreground=self._MARK_COLOR[0]) # marked for i, column in enumerate(self.COLUMNS): width = self.COLUMN_WIDTHS.get(column, self.DEFAULT_COLUMN_WIDTH) self._table.columnconfig(i, width=width) self._table.pack(expand=True, fill="both") self._table.focus() self._table.bind_to_listboxes("", self._download) self._table.bind("", self._table_mark) self._table.bind("", self._download) self._table.bind("", self._prev_tab) self._table.bind("", self._next_tab) self._table.bind("", self._mark_all) # Create entry boxes for URL & download_dir infoframe.grid_columnconfigure(1, weight=1) info = [ ("url", "Server Index:", self._set_url), ("download_dir", "Download Directory:", self._set_download_dir), ] self._info = {} for (i, (key, label, callback)) in enumerate(info): Label(infoframe, text=label).grid(column=0, row=i, sticky="e") entry = Entry( infoframe, font="courier", relief="groove", disabledforeground="black" ) self._info[key] = (entry, callback) entry.bind("", self._info_save) entry.bind("", lambda e, key=key: self._info_edit(key)) entry.grid(column=1, row=i, sticky="ew") # If the user edits url or download_dir, and then clicks outside # the entry box, then save their results. self.top.bind("", self._info_save) # Create Download & Refresh buttons. self._download_button = Button( buttonframe, text="Download", command=self._download, width=8 ) self._download_button.pack(side="left") self._refresh_button = Button( buttonframe, text="Refresh", command=self._refresh, width=8 ) self._refresh_button.pack(side="right") # Create Progress bar self._progresslabel = Label( progressframe, text="", foreground=self._BACKDROP_COLOR[0], background=self._BACKDROP_COLOR[1], ) self._progressbar = Canvas( progressframe, width=200, height=16, background=self._PROGRESS_COLOR[1], relief="sunken", border=1, ) self._init_progressbar() self._progressbar.pack(side="right") self._progresslabel.pack(side="left") def _init_menu(self): menubar = Menu(self.top) filemenu = Menu(menubar, tearoff=0) filemenu.add_command( label="Download", underline=0, command=self._download, accelerator="Return" ) filemenu.add_separator() filemenu.add_command( label="Change Server Index", underline=7, command=lambda: self._info_edit("url"), ) filemenu.add_command( label="Change Download Directory", underline=0, command=lambda: self._info_edit("download_dir"), ) filemenu.add_separator() filemenu.add_command(label="Show Log", underline=5, command=self._show_log) filemenu.add_separator() filemenu.add_command( label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x" ) menubar.add_cascade(label="File", underline=0, menu=filemenu) # Create a menu to control which columns of the table are # shown. n.b.: we never hide the first two columns (mark and # identifier). viewmenu = Menu(menubar, tearoff=0) for column in self._table.column_names[2:]: var = IntVar(self.top) assert column not in self._column_vars self._column_vars[column] = var if column in self.INITIAL_COLUMNS: var.set(1) viewmenu.add_checkbutton( label=column, underline=0, variable=var, command=self._select_columns ) menubar.add_cascade(label="View", underline=0, menu=viewmenu) # Create a sort menu # [xx] this should be selectbuttons; and it should include # reversed sorts as options. 
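# Each column except the mark column gets both an ascending and a descending sort entry; the default argument in the lambda (c=column) binds the column name at menu-creation time rather than at click time.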
sortmenu = Menu(menubar, tearoff=0) for column in self._table.column_names[1:]: sortmenu.add_command( label="Sort by %s" % column, command=(lambda c=column: self._table.sort_by(c, "ascending")), ) sortmenu.add_separator() # sortmenu.add_command(label='Descending Sort:') for column in self._table.column_names[1:]: sortmenu.add_command( label="Reverse sort by %s" % column, command=(lambda c=column: self._table.sort_by(c, "descending")), ) menubar.add_cascade(label="Sort", underline=0, menu=sortmenu) helpmenu = Menu(menubar, tearoff=0) helpmenu.add_command(label="About", underline=0, command=self.about) helpmenu.add_command( label="Instructions", underline=0, command=self.help, accelerator="F1" ) menubar.add_cascade(label="Help", underline=0, menu=helpmenu) self.top.bind("", self.help) self.top.config(menu=menubar) def _select_columns(self): for (column, var) in self._column_vars.items(): if var.get(): self._table.show_column(column) else: self._table.hide_column(column) def _refresh(self): self._ds.clear_status_cache() try: self._fill_table() except HTTPError as e: showerror("Error reading from server", e) except URLError as e: showerror("Error connecting to server", e.reason) self._table.select(0) def _info_edit(self, info_key): self._info_save() # just in case. (entry, callback) = self._info[info_key] entry["state"] = "normal" entry["relief"] = "sunken" entry.focus() def _info_save(self, e=None): focus = self._table for entry, callback in self._info.values(): if entry["state"] == "disabled": continue if e is not None and e.widget is entry and e.keysym != "Return": focus = entry else: entry["state"] = "disabled" entry["relief"] = "groove" callback(entry.get()) focus.focus() def _table_reprfunc(self, row, col, val): if self._table.column_names[col].endswith("Size"): if isinstance(val, str): return " %s" % val elif val < 1024 ** 2: return " %.1f KB" % (val / 1024.0 ** 1) elif val < 1024 ** 3: return " %.1f MB" % (val / 1024.0 ** 2) else: return " %.1f GB" % (val / 1024.0 ** 3) if col in (0, ""): return str(val) else: return " %s" % val def _set_url(self, url): if url == self._ds.url: return try: self._ds.url = url self._fill_table() except OSError as e: showerror("Error Setting Server Index", str(e)) self._show_info() def _set_download_dir(self, download_dir): if self._ds.download_dir == download_dir: return # check if the dir exists, and if not, ask if we should create it? 
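# Package status (installed, partial, stale, not installed) is relative to the download directory, so changing it invalidates the cached statuses and the table has to be refilled.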
# Clear our status cache, & re-check what's installed self._ds.download_dir = download_dir try: self._fill_table() except HTTPError as e: showerror("Error reading from server", e) except URLError as e: showerror("Error connecting to server", e.reason) self._show_info() def _show_info(self): print("showing info", self._ds.url) for entry, cb in self._info.values(): entry["state"] = "normal" entry.delete(0, "end") self._info["url"][0].insert(0, self._ds.url) self._info["download_dir"][0].insert(0, self._ds.download_dir) for entry, cb in self._info.values(): entry["state"] = "disabled" def _prev_tab(self, *e): for i, tab in enumerate(self._tab_names): if tab.lower() == self._tab and i > 0: self._tab = self._tab_names[i - 1].lower() try: return self._fill_table() except HTTPError as e: showerror("Error reading from server", e) except URLError as e: showerror("Error connecting to server", e.reason) def _next_tab(self, *e): for i, tab in enumerate(self._tab_names): if tab.lower() == self._tab and i < (len(self._tabs) - 1): self._tab = self._tab_names[i + 1].lower() try: return self._fill_table() except HTTPError as e: showerror("Error reading from server", e) except URLError as e: showerror("Error connecting to server", e.reason) def _select_tab(self, event): self._tab = event.widget["text"].lower() try: self._fill_table() except HTTPError as e: showerror("Error reading from server", e) except URLError as e: showerror("Error connecting to server", e.reason) _tab = "collections" # _tab = 'corpora' _rows = None def _fill_table(self): selected_row = self._table.selected_row() self._table.clear() if self._tab == "all packages": items = self._ds.packages() elif self._tab == "corpora": items = self._ds.corpora() elif self._tab == "models": items = self._ds.models() elif self._tab == "collections": items = self._ds.collections() else: assert 0, "bad tab value %r" % self._tab rows = [self._package_to_columns(item) for item in items] self._table.extend(rows) # Highlight the active tab. for tab, label in self._tabs.items(): if tab == self._tab: label.configure( foreground=self._FRONT_TAB_COLOR[0], background=self._FRONT_TAB_COLOR[1], ) else: label.configure( foreground=self._BACK_TAB_COLOR[0], background=self._BACK_TAB_COLOR[1], ) self._table.sort_by("Identifier", order="ascending") self._color_table() self._table.select(selected_row) # This is a hack, because the scrollbar isn't updating its # position right -- I'm not sure what the underlying cause is # though. (This is on OS X w/ python 2.5) The length of # delay that's necessary seems to depend on how fast the # comptuer is. :-/ self.top.after(150, self._table._scrollbar.set, *self._table._mlb.yview()) self.top.after(300, self._table._scrollbar.set, *self._table._mlb.yview()) def _update_table_status(self): for row_num in range(len(self._table)): status = self._ds.status(self._table[row_num, "Identifier"]) self._table[row_num, "Status"] = status self._color_table() def _download(self, *e): # If we're using threads, then delegate to the threaded # downloader instead. 
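# Non-threaded path: download every marked row, or the selected row if nothing is marked, by stepping through the messages yielded by incr_download() in timer callbacks (_download_cb), so the GUI keeps handling events between steps.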
if self._use_threads: return self._download_threaded(*e) marked = [ self._table[row, "Identifier"] for row in range(len(self._table)) if self._table[row, 0] != "" ] selection = self._table.selected_row() if not marked and selection is not None: marked = [self._table[selection, "Identifier"]] download_iter = self._ds.incr_download(marked, self._ds.download_dir) self._log_indent = 0 self._download_cb(download_iter, marked) _DL_DELAY = 10 def _download_cb(self, download_iter, ids): try: msg = next(download_iter) except StopIteration: # self._fill_table(sort=False) self._update_table_status() afterid = self.top.after(10, self._show_progress, 0) self._afterid["_download_cb"] = afterid return def show(s): self._progresslabel["text"] = s self._log(s) if isinstance(msg, ProgressMessage): self._show_progress(msg.progress) elif isinstance(msg, ErrorMessage): show(msg.message) if msg.package is not None: self._select(msg.package.id) self._show_progress(None) return # halt progress. elif isinstance(msg, StartCollectionMessage): show("Downloading collection %s" % msg.collection.id) self._log_indent += 1 elif isinstance(msg, StartPackageMessage): show("Downloading package %s" % msg.package.id) elif isinstance(msg, UpToDateMessage): show("Package %s is up-to-date!" % msg.package.id) # elif isinstance(msg, StaleMessage): # show('Package %s is out-of-date or corrupt' % msg.package.id) elif isinstance(msg, FinishDownloadMessage): show("Finished downloading %r." % msg.package.id) elif isinstance(msg, StartUnzipMessage): show("Unzipping %s" % msg.package.filename) elif isinstance(msg, FinishCollectionMessage): self._log_indent -= 1 show("Finished downloading collection %r." % msg.collection.id) self._clear_mark(msg.collection.id) elif isinstance(msg, FinishPackageMessage): self._clear_mark(msg.package.id) afterid = self.top.after(self._DL_DELAY, self._download_cb, download_iter, ids) self._afterid["_download_cb"] = afterid def _select(self, id): for row in range(len(self._table)): if self._table[row, "Identifier"] == id: self._table.select(row) return def _color_table(self): # Color rows according to status. for row in range(len(self._table)): bg, sbg = self._ROW_COLOR[self._table[row, "Status"]] fg, sfg = ("black", "white") self._table.rowconfig( row, foreground=fg, selectforeground=sfg, background=bg, selectbackground=sbg, ) # Color the marked column self._table.itemconfigure( row, 0, foreground=self._MARK_COLOR[0], background=self._MARK_COLOR[1] ) def _clear_mark(self, id): for row in range(len(self._table)): if self._table[row, "Identifier"] == id: self._table[row, 0] = "" def _mark_all(self, *e): for row in range(len(self._table)): self._table[row, 0] = "X" def _table_mark(self, *e): selection = self._table.selected_row() if selection >= 0: if self._table[selection][0] != "": self._table[selection, 0] = "" else: self._table[selection, 0] = "X" self._table.select(delta=1) def _show_log(self): text = "\n".join(self._log_messages) ShowText(self.top, "NLTK Downloader Log", text) def _package_to_columns(self, pkg): """ Given a package, return a list of values describing that package, one for each column in ``self.COLUMNS``. 
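The mark column (column 0) is always returned empty; the Identifier column comes from ``pkg.id``; the Status column is computed with the data server's ``status()`` method; and every other column is read from the package attribute whose name matches the lower-cased column header (spaces replaced by underscores), defaulting to 'n/a' if the attribute is missing.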
""" row = [] for column_index, column_name in enumerate(self.COLUMNS): if column_index == 0: # Mark: row.append("") elif column_name == "Identifier": row.append(pkg.id) elif column_name == "Status": row.append(self._ds.status(pkg)) else: attr = column_name.lower().replace(" ", "_") row.append(getattr(pkg, attr, "n/a")) return row # ///////////////////////////////////////////////////////////////// # External Interface # ///////////////////////////////////////////////////////////////// def destroy(self, *e): if self._destroyed: return self.top.destroy() self._destroyed = True def _destroy(self, *e): if self.top is not None: for afterid in self._afterid.values(): self.top.after_cancel(afterid) # Abort any download in progress. if self._downloading and self._use_threads: self._abort_download() # Make sure the garbage collector destroys these now; # otherwise, they may get destroyed when we're not in the main # thread, which would make Tkinter unhappy. self._column_vars.clear() def mainloop(self, *args, **kwargs): self.top.mainloop(*args, **kwargs) # ///////////////////////////////////////////////////////////////// # HELP # ///////////////////////////////////////////////////////////////// HELP = textwrap.dedent( """\ This tool can be used to download a variety of corpora and models that can be used with NLTK. Each corpus or model is distributed in a single zip file, known as a \"package file.\" You can download packages individually, or you can download pre-defined collections of packages. When you download a package, it will be saved to the \"download directory.\" A default download directory is chosen when you run the downloader; but you may also select a different download directory. On Windows, the default download directory is \"package.\" The NLTK downloader can be used to download a variety of corpora, models, and other data packages. Keyboard shortcuts:: [return]\t Download [up]\t Select previous package [down]\t Select next package [left]\t Select previous tab [right]\t Select next tab """ ) def help(self, *e): # The default font's not very legible; try using 'fixed' instead. 
try: ShowText( self.top, "Help: NLTK Downloader", self.HELP.strip(), width=75, font="fixed", ) except: ShowText(self.top, "Help: NLTK Downloader", self.HELP.strip(), width=75) def about(self, *e): ABOUT = "NLTK Downloader\n" + "Written by Edward Loper" TITLE = "About: NLTK Downloader" try: from tkinter.messagebox import Message Message(message=ABOUT, title=TITLE).show() except ImportError: ShowText(self.top, TITLE, ABOUT) # ///////////////////////////////////////////////////////////////// # Progress Bar # ///////////////////////////////////////////////////////////////// _gradient_width = 5 def _init_progressbar(self): c = self._progressbar width, height = int(c["width"]), int(c["height"]) for i in range(0, (int(c["width"]) * 2) // self._gradient_width): c.create_line( i * self._gradient_width + 20, -20, i * self._gradient_width - height - 20, height + 20, width=self._gradient_width, fill="#%02x0000" % (80 + abs(i % 6 - 3) * 12), ) c.addtag_all("gradient") c.itemconfig("gradient", state="hidden") # This is used to display progress c.addtag_withtag( "redbox", c.create_rectangle(0, 0, 0, 0, fill=self._PROGRESS_COLOR[0]) ) def _show_progress(self, percent): c = self._progressbar if percent is None: c.coords("redbox", 0, 0, 0, 0) c.itemconfig("gradient", state="hidden") else: width, height = int(c["width"]), int(c["height"]) x = percent * int(width) // 100 + 1 c.coords("redbox", 0, 0, x, height + 1) def _progress_alive(self): c = self._progressbar if not self._downloading: c.itemconfig("gradient", state="hidden") else: c.itemconfig("gradient", state="normal") x1, y1, x2, y2 = c.bbox("gradient") if x1 <= -100: c.move("gradient", (self._gradient_width * 6) - 4, 0) else: c.move("gradient", -4, 0) afterid = self.top.after(200, self._progress_alive) self._afterid["_progress_alive"] = afterid # ///////////////////////////////////////////////////////////////// # Threaded downloader # ///////////////////////////////////////////////////////////////// def _download_threaded(self, *e): # If the user tries to start a new download while we're already # downloading something, then abort the current download instead. if self._downloading: self._abort_download() return # Change the 'download' button to an 'abort' button. self._download_button["text"] = "Cancel" marked = [ self._table[row, "Identifier"] for row in range(len(self._table)) if self._table[row, 0] != "" ] selection = self._table.selected_row() if not marked and selection is not None: marked = [self._table[selection, "Identifier"]] # Create a new data server object for the download operation, # just in case the user modifies our data server during the # download (e.g., clicking 'refresh' or editing the index url). ds = Downloader(self._ds.url, self._ds.download_dir) # Start downloading in a separate thread. assert self._download_msg_queue == [] assert self._download_abort_queue == [] self._DownloadThread( ds, marked, self._download_lock, self._download_msg_queue, self._download_abort_queue, ).start() # Monitor the download message queue & display its progress. self._log_indent = 0 self._downloading = True self._monitor_message_queue() # Display an indication that we're still alive and well by # cycling the progress bar. 
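# _progress_alive() animates the striped gradient on the progress bar, rescheduling itself with top.after(200, ...) for as long as a download is in progress.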
self._progress_alive() def _abort_download(self): if self._downloading: self._download_lock.acquire() self._download_abort_queue.append("abort") self._download_lock.release() class _DownloadThread(threading.Thread): def __init__(self, data_server, items, lock, message_queue, abort): self.data_server = data_server self.items = items self.lock = lock self.message_queue = message_queue self.abort = abort threading.Thread.__init__(self) def run(self): for msg in self.data_server.incr_download(self.items): self.lock.acquire() self.message_queue.append(msg) # Check if we've been told to kill ourselves: if self.abort: self.message_queue.append("aborted") self.lock.release() return self.lock.release() self.lock.acquire() self.message_queue.append("finished") self.lock.release() _MONITOR_QUEUE_DELAY = 100 def _monitor_message_queue(self): def show(s): self._progresslabel["text"] = s self._log(s) # Try to acquire the lock; if it's busy, then just try again later. if not self._download_lock.acquire(): return for msg in self._download_msg_queue: # Done downloading? if msg == "finished" or msg == "aborted": # self._fill_table(sort=False) self._update_table_status() self._downloading = False self._download_button["text"] = "Download" del self._download_msg_queue[:] del self._download_abort_queue[:] self._download_lock.release() if msg == "aborted": show("Download aborted!") self._show_progress(None) else: afterid = self.top.after(100, self._show_progress, None) self._afterid["_monitor_message_queue"] = afterid return # All other messages elif isinstance(msg, ProgressMessage): self._show_progress(msg.progress) elif isinstance(msg, ErrorMessage): show(msg.message) if msg.package is not None: self._select(msg.package.id) self._show_progress(None) self._downloading = False return # halt progress. elif isinstance(msg, StartCollectionMessage): show("Downloading collection %r" % msg.collection.id) self._log_indent += 1 elif isinstance(msg, StartPackageMessage): self._ds.clear_status_cache(msg.package.id) show("Downloading package %r" % msg.package.id) elif isinstance(msg, UpToDateMessage): show("Package %s is up-to-date!" % msg.package.id) # elif isinstance(msg, StaleMessage): # show('Package %s is out-of-date or corrupt; updating it' % # msg.package.id) elif isinstance(msg, FinishDownloadMessage): show("Finished downloading %r." % msg.package.id) elif isinstance(msg, StartUnzipMessage): show("Unzipping %s" % msg.package.filename) elif isinstance(msg, FinishUnzipMessage): show("Finished installing %s" % msg.package.id) elif isinstance(msg, FinishCollectionMessage): self._log_indent -= 1 show("Finished downloading collection %r." % msg.collection.id) self._clear_mark(msg.collection.id) elif isinstance(msg, FinishPackageMessage): self._update_table_status() self._clear_mark(msg.package.id) # Let the user know when we're aborting a download (but # waiting for a good point to abort it, so we don't end up # with a partially unzipped package or anything like that). if self._download_abort_queue: self._progresslabel["text"] = "Aborting download..." # Clear the message queue and then release the lock del self._download_msg_queue[:] self._download_lock.release() # Check the queue again after MONITOR_QUEUE_DELAY msec. 
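# Polling from the main thread, rather than letting the worker thread update widgets directly, keeps every Tkinter call on the thread that owns the GUI.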
afterid = self.top.after(self._MONITOR_QUEUE_DELAY, self._monitor_message_queue) self._afterid["_monitor_message_queue"] = afterid ###################################################################### # Helper Functions ###################################################################### # [xx] It may make sense to move these to nltk.internals. def md5_hexdigest(file): """ Calculate and return the MD5 checksum for a given file. ``file`` may either be a filename or an open stream. """ if isinstance(file, str): with open(file, "rb") as infile: return _md5_hexdigest(infile) return _md5_hexdigest(file) def _md5_hexdigest(fp): md5_digest = md5() while True: block = fp.read(1024 * 16) # 16k blocks if not block: break md5_digest.update(block) return md5_digest.hexdigest() # change this to periodically yield progress messages? # [xx] get rid of topdir parameter -- we should be checking # this when we build the index, anyway. def unzip(filename, root, verbose=True): """ Extract the contents of the zip file ``filename`` into the directory ``root``. """ for message in _unzip_iter(filename, root, verbose): if isinstance(message, ErrorMessage): raise Exception(message) def _unzip_iter(filename, root, verbose=True): if verbose: sys.stdout.write("Unzipping %s" % os.path.split(filename)[1]) sys.stdout.flush() try: zf = zipfile.ZipFile(filename) except zipfile.error as e: yield ErrorMessage(filename, "Error with downloaded zip file") return except Exception as e: yield ErrorMessage(filename, e) return zf.extractall(root) if verbose: print() ###################################################################### # Index Builder ###################################################################### # This may move to a different file sometime. def build_index(root, base_url): """ Create a new data.xml index file, by combining the xml description files for various packages and collections. ``root`` should be the path to a directory containing the package xml and zip files; and the collection xml files. The ``root`` directory is expected to have the following subdirectories:: root/ packages/ .................. subdirectory for packages corpora/ ................. zip & xml files for corpora grammars/ ................ zip & xml files for grammars taggers/ ................. zip & xml files for taggers tokenizers/ .............. zip & xml files for tokenizers etc. collections/ ............... xml files for collections For each package, there should be two files: ``package.zip`` (where *package* is the package name) which contains the package itself as a compressed zip file; and ``package.xml``, which is an xml description of the package. The zipfile ``package.zip`` should expand to a single subdirectory named ``package/``. The base filename ``package`` must match the identifier given in the package's xml file. For each collection, there should be a single file ``collection.zip`` describing the collection, where *collection* is the name of the collection. All identifiers (for both packages and collections) must be unique. """ # Find all packages. packages = [] for pkg_xml, zf, subdir in _find_packages(os.path.join(root, "packages")): zipstat = os.stat(zf.filename) url = f"{base_url}/{subdir}/{os.path.split(zf.filename)[1]}" unzipped_size = sum(zf_info.file_size for zf_info in zf.infolist()) # Fill in several fields of the package xml with calculated values. 
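# size, unzipped_size, checksum and subdir are recomputed from the zip file itself, so the generated index always agrees with the packages it describes; an explicit url attribute in the package xml takes precedence over the one derived from base_url.
# Illustrative use of build_index(), assuming this module is importable as nltk.downloader (the paths and URL below are hypothetical):
#   >>> from nltk.downloader import build_index
#   >>> from xml.etree import ElementTree
#   >>> index = build_index('/path/to/data_repo', 'https://example.org/nltk_data')
#   >>> ElementTree.ElementTree(index).write('index.xml', encoding='utf-8')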
pkg_xml.set("unzipped_size", "%s" % unzipped_size) pkg_xml.set("size", "%s" % zipstat.st_size) pkg_xml.set("checksum", "%s" % md5_hexdigest(zf.filename)) pkg_xml.set("subdir", subdir) # pkg_xml.set('svn_revision', _svn_revision(zf.filename)) if not pkg_xml.get("url"): pkg_xml.set("url", url) # Record the package. packages.append(pkg_xml) # Find all collections collections = list(_find_collections(os.path.join(root, "collections"))) # Check that all UIDs are unique uids = set() for item in packages + collections: if item.get("id") in uids: raise ValueError("Duplicate UID: %s" % item.get("id")) uids.add(item.get("id")) # Put it all together top_elt = ElementTree.Element("nltk_data") top_elt.append(ElementTree.Element("packages")) top_elt[0].extend(sorted(packages, key=lambda package: package.get("id"))) top_elt.append(ElementTree.Element("collections")) top_elt[1].extend(sorted(collections, key=lambda collection: collection.get("id"))) _indent_xml(top_elt) return top_elt def _indent_xml(xml, prefix=""): """ Helper for ``build_index()``: Given an XML ``ElementTree``, modify it (and its descendents) ``text`` and ``tail`` attributes to generate an indented tree, where each nested element is indented by 2 spaces with respect to its parent. """ if len(xml) > 0: xml.text = (xml.text or "").strip() + "\n" + prefix + " " for child in xml: _indent_xml(child, prefix + " ") for child in xml[:-1]: child.tail = (child.tail or "").strip() + "\n" + prefix + " " xml[-1].tail = (xml[-1].tail or "").strip() + "\n" + prefix def _check_package(pkg_xml, zipfilename, zf): """ Helper for ``build_index()``: Perform some checks to make sure that the given package is consistent. """ # The filename must patch the id given in the XML file. uid = os.path.splitext(os.path.split(zipfilename)[1])[0] if pkg_xml.get("id") != uid: raise ValueError( "package identifier mismatch ({} vs {})".format(pkg_xml.get("id"), uid) ) # Zip file must expand to a subdir whose name matches uid. if sum((name != uid and not name.startswith(uid + "/")) for name in zf.namelist()): raise ValueError( "Zipfile %s.zip does not expand to a single " "subdirectory %s/" % (uid, uid) ) # update for git? def _svn_revision(filename): """ Helper for ``build_index()``: Calculate the subversion revision number for a given file (by using ``subprocess`` to run ``svn``). """ p = subprocess.Popen( ["svn", "status", "-v", filename], stdout=subprocess.PIPE, stderr=subprocess.PIPE, ) (stdout, stderr) = p.communicate() if p.returncode != 0 or stderr or not stdout: raise ValueError( "Error determining svn_revision for %s: %s" % (os.path.split(filename)[1], textwrap.fill(stderr)) ) return stdout.split()[2] def _find_collections(root): """ Helper for ``build_index()``: Yield a list of ElementTree.Element objects, each holding the xml for a single package collection. """ for dirname, _subdirs, files in os.walk(root): for filename in files: if filename.endswith(".xml"): xmlfile = os.path.join(dirname, filename) yield ElementTree.parse(xmlfile).getroot() def _find_packages(root): """ Helper for ``build_index()``: Yield a list of tuples ``(pkg_xml, zf, subdir)``, where: - ``pkg_xml`` is an ``ElementTree.Element`` holding the xml for a package - ``zf`` is a ``zipfile.ZipFile`` for the package's contents. - ``subdir`` is the subdirectory (relative to ``root``) where the package was found (e.g. 'corpora' or 'grammars'). """ from nltk.corpus.reader.util import _path_from # Find all packages. 
packages = [] for dirname, subdirs, files in os.walk(root): relpath = "/".join(_path_from(root, dirname)) for filename in files: if filename.endswith(".xml"): xmlfilename = os.path.join(dirname, filename) zipfilename = xmlfilename[:-4] + ".zip" try: zf = zipfile.ZipFile(zipfilename) except Exception as e: raise ValueError(f"Error reading file {zipfilename!r}!\n{e}") from e try: pkg_xml = ElementTree.parse(xmlfilename).getroot() except Exception as e: raise ValueError(f"Error reading file {xmlfilename!r}!\n{e}") from e # Check that the UID matches the filename uid = os.path.split(xmlfilename[:-4])[1] if pkg_xml.get("id") != uid: raise ValueError( "package identifier mismatch (%s " "vs %s)" % (pkg_xml.get("id"), uid) ) # Check that the zipfile expands to a subdir whose # name matches the uid. if sum( (name != uid and not name.startswith(uid + "/")) for name in zf.namelist() ): raise ValueError( "Zipfile %s.zip does not expand to a " "single subdirectory %s/" % (uid, uid) ) yield pkg_xml, zf, relpath elif filename.endswith(".zip"): # Warn user in case a .xml does not exist for a .zip resourcename = os.path.splitext(filename)[0] xmlfilename = os.path.join(dirname, resourcename + ".xml") if not os.path.exists(xmlfilename): warnings.warn( f"{filename} exists, but {resourcename + '.xml'} cannot be found! " f"This could mean that {resourcename} can not be downloaded.", stacklevel=2, ) # Don't recurse into svn subdirectories: try: subdirs.remove(".svn") except ValueError: pass ###################################################################### # Main: ###################################################################### # There should be a command-line interface # Aliases _downloader = Downloader() download = _downloader.download def download_shell(): DownloaderShell(_downloader).run() def download_gui(): DownloaderGUI(_downloader).mainloop() def update(): _downloader.update() if __name__ == "__main__": from optparse import OptionParser parser = OptionParser() parser.add_option( "-d", "--dir", dest="dir", help="download package to directory DIR", metavar="DIR", ) parser.add_option( "-q", "--quiet", dest="quiet", action="store_true", default=False, help="work quietly", ) parser.add_option( "-f", "--force", dest="force", action="store_true", default=False, help="download even if already installed", ) parser.add_option( "-e", "--exit-on-error", dest="halt_on_error", action="store_true", default=False, help="exit if an error occurs", ) parser.add_option( "-u", "--url", dest="server_index_url", default=os.environ.get("NLTK_DOWNLOAD_URL"), help="download server index url", ) (options, args) = parser.parse_args() downloader = Downloader(server_index_url=options.server_index_url) if args: for pkg_id in args: rv = downloader.download( info_or_id=pkg_id, download_dir=options.dir, quiet=options.quiet, force=options.force, halt_on_error=options.halt_on_error, ) if rv == False and options.halt_on_error: break else: downloader.download( download_dir=options.dir, quiet=options.quiet, force=options.force, halt_on_error=options.halt_on_error, ) nltk-3.7/nltk/draw/000077500000000000000000000000001420073152400142365ustar00rootroot00000000000000nltk-3.7/nltk/draw/__init__.py000066400000000000000000000014171420073152400163520ustar00rootroot00000000000000# Natural Language Toolkit: graphical representations package # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # Steven Bird # URL: # For license information, see LICENSE.TXT # Import Tkinter-based modules if Tkinter is installed try: import tkinter 
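# tkinter ships with most CPython builds but can be missing (for example on minimal Linux installs); in that case the drawing modules are simply not loaded and a warning is issued instead of raising an ImportError.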
except ImportError: import warnings warnings.warn("nltk.draw package not loaded (please install Tkinter library).") else: from nltk.draw.cfg import ProductionList, CFGEditor, CFGDemo from nltk.draw.tree import ( TreeSegmentWidget, tree_to_treesegment, TreeWidget, TreeView, draw_trees, ) from nltk.draw.table import Table from nltk.draw.dispersion import dispersion_plot nltk-3.7/nltk/draw/cfg.py000066400000000000000000000723571420073152400153650ustar00rootroot00000000000000# Natural Language Toolkit: CFG visualization # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # URL: # For license information, see LICENSE.TXT """ Visualization tools for CFGs. """ # Idea for a nice demo: # - 3 panes: grammar, treelet, working area # - grammar is a list of productions # - when you select a production, the treelet that it licenses appears # in the treelet area # - the working area has the text on the bottom, and S at top. When # you select a production, it shows (ghosted) the locations where # that production's treelet could be attached to either the text # or the tree rooted at S. # - the user can drag the treelet onto one of those (or click on them?) # - the user can delete pieces of the tree from the working area # (right click?) # - connecting top to bottom? drag one NP onto another? # # +-------------------------------------------------------------+ # | S -> NP VP | S | # |[NP -> Det N ]| / \ | # | ... | NP VP | # | N -> 'dog' | | # | N -> 'cat' | | # | ... | | # +--------------+ | # | NP | Det N | # | / \ | | | | # | Det N | the cat saw the dog | # | | | # +--------------+----------------------------------------------+ # # Operations: # - connect a new treelet -- drag or click shadow # - delete a treelet -- right click # - if only connected to top, delete everything below # - if only connected to bottom, delete everything above # - connect top & bottom -- drag a leaf to a root or a root to a leaf # - disconnect top & bottom -- right click # - if connected to top & bottom, then disconnect import re from tkinter import ( Button, Canvas, Entry, Frame, IntVar, Label, Scrollbar, Text, Tk, Toplevel, ) from nltk.draw.tree import TreeSegmentWidget, tree_to_treesegment from nltk.draw.util import ( CanvasFrame, ColorizedList, ShowText, SymbolWidget, TextWidget, ) from nltk.grammar import CFG, Nonterminal, _read_cfg_production, nonterminals from nltk.tree import Tree ###################################################################### # Production List ###################################################################### class ProductionList(ColorizedList): ARROW = SymbolWidget.SYMBOLS["rightarrow"] def _init_colortags(self, textwidget, options): textwidget.tag_config("terminal", foreground="#006000") textwidget.tag_config("arrow", font="symbol", underline="0") textwidget.tag_config( "nonterminal", foreground="blue", font=("helvetica", -12, "bold") ) def _item_repr(self, item): contents = [] contents.append(("%s\t" % item.lhs(), "nonterminal")) contents.append((self.ARROW, "arrow")) for elt in item.rhs(): if isinstance(elt, Nonterminal): contents.append((" %s" % elt.symbol(), "nonterminal")) else: contents.append((" %r" % elt, "terminal")) return contents ###################################################################### # CFG Editor ###################################################################### _CFGEditor_HELP = """ The CFG Editor can be used to create or modify context free grammars. A context free grammar consists of a start symbol and a list of productions. 
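For example, a small grammar might contain the two productions "S -> NP VP" and "NP -> 'the' N".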
The start symbol is specified by the text entry field in the upper right hand corner of the editor; and the list of productions are specified in the main text editing box. Every non-blank line specifies a single production. Each production has the form "LHS -> RHS," where LHS is a single nonterminal, and RHS is a list of nonterminals and terminals. Nonterminals must be a single word, such as S or NP or NP_subj. Currently, nonterminals must consists of alphanumeric characters and underscores (_). Nonterminals are colored blue. If you place the mouse over any nonterminal, then all occurrences of that nonterminal will be highlighted. Terminals must be surrounded by single quotes (') or double quotes(\"). For example, "dog" and "New York" are terminals. Currently, the string within the quotes must consist of alphanumeric characters, underscores, and spaces. To enter a new production, go to a blank line, and type a nonterminal, followed by an arrow (->), followed by a sequence of terminals and nonterminals. Note that "->" (dash + greater-than) is automatically converted to an arrow symbol. When you move your cursor to a different line, your production will automatically be colorized. If there are any errors, they will be highlighted in red. Note that the order of the productions is significant for some algorithms. To re-order the productions, use cut and paste to move them. Use the buttons at the bottom of the window when you are done editing the CFG: - Ok: apply the new CFG, and exit the editor. - Apply: apply the new CFG, and do not exit the editor. - Reset: revert to the original CFG, and do not exit the editor. - Cancel: revert to the original CFG, and exit the editor. """ class CFGEditor: """ A dialog window for creating and editing context free grammars. ``CFGEditor`` imposes the following restrictions: - All nonterminals must be strings consisting of word characters. - All terminals must be strings consisting of word characters and space characters. """ # Regular expressions used by _analyze_line. Precompile them, so # we can process the text faster. ARROW = SymbolWidget.SYMBOLS["rightarrow"] _LHS_RE = re.compile(r"(^\s*\w+\s*)(->|(" + ARROW + "))") _ARROW_RE = re.compile(r"\s*(->|(" + ARROW + r"))\s*") _PRODUCTION_RE = re.compile( r"(^\s*\w+\s*)" + "(->|(" # LHS + ARROW + r"))\s*" + r"((\w+|'[\w ]*'|\"[\w ]*\"|\|)\s*)*$" # arrow ) # RHS _TOKEN_RE = re.compile("\\w+|->|'[\\w ]+'|\"[\\w ]+\"|(" + ARROW + ")") _BOLD = ("helvetica", -12, "bold") def __init__(self, parent, cfg=None, set_cfg_callback=None): self._parent = parent if cfg is not None: self._cfg = cfg else: self._cfg = CFG(Nonterminal("S"), []) self._set_cfg_callback = set_cfg_callback self._highlight_matching_nonterminals = 1 # Create the top-level window. 
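# The editor is a Toplevel dialog with three stacked areas: the start-symbol entry at the top, the production text box (with scrollbar) in the middle, and the Ok / Apply / Reset / Cancel / Help buttons at the bottom.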
self._top = Toplevel(parent) self._init_bindings() self._init_startframe() self._startframe.pack(side="top", fill="x", expand=0) self._init_prodframe() self._prodframe.pack(side="top", fill="both", expand=1) self._init_buttons() self._buttonframe.pack(side="bottom", fill="x", expand=0) self._textwidget.focus() def _init_startframe(self): frame = self._startframe = Frame(self._top) self._start = Entry(frame) self._start.pack(side="right") Label(frame, text="Start Symbol:").pack(side="right") Label(frame, text="Productions:").pack(side="left") self._start.insert(0, self._cfg.start().symbol()) def _init_buttons(self): frame = self._buttonframe = Frame(self._top) Button(frame, text="Ok", command=self._ok, underline=0, takefocus=0).pack( side="left" ) Button(frame, text="Apply", command=self._apply, underline=0, takefocus=0).pack( side="left" ) Button(frame, text="Reset", command=self._reset, underline=0, takefocus=0).pack( side="left" ) Button( frame, text="Cancel", command=self._cancel, underline=0, takefocus=0 ).pack(side="left") Button(frame, text="Help", command=self._help, underline=0, takefocus=0).pack( side="right" ) def _init_bindings(self): self._top.title("CFG Editor") self._top.bind("", self._cancel) self._top.bind("", self._cancel) self._top.bind("", self._cancel) # self._top.bind('', self._cancel) self._top.bind("", self._cancel) self._top.bind("", self._cancel) # self._top.bind('', self._cancel) self._top.bind("", self._cancel) self._top.bind("", self._ok) self._top.bind("", self._ok) self._top.bind("", self._apply) self._top.bind("", self._apply) self._top.bind("", self._reset) self._top.bind("", self._reset) self._top.bind("", self._help) self._top.bind("", self._help) self._top.bind("", self._help) def _init_prodframe(self): self._prodframe = Frame(self._top) # Create the basic Text widget & scrollbar. self._textwidget = Text( self._prodframe, background="#e0e0e0", exportselection=1 ) self._textscroll = Scrollbar(self._prodframe, takefocus=0, orient="vertical") self._textwidget.config(yscrollcommand=self._textscroll.set) self._textscroll.config(command=self._textwidget.yview) self._textscroll.pack(side="right", fill="y") self._textwidget.pack(expand=1, fill="both", side="left") # Initialize the colorization tags. Each nonterminal gets its # own tag, so they aren't listed here. self._textwidget.tag_config("terminal", foreground="#006000") self._textwidget.tag_config("arrow", font="symbol") self._textwidget.tag_config("error", background="red") # Keep track of what line they're on. We use that to remember # to re-analyze a line whenever they leave it. self._linenum = 0 # Expand "->" to an arrow. self._top.bind(">", self._replace_arrows) # Re-colorize lines when appropriate. self._top.bind("<>", self._analyze) self._top.bind("", self._check_analyze) self._top.bind("", self._check_analyze) # Tab cycles focus. (why doesn't this work??) 
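# Productions that share a left-hand side are merged into a single 'LHS -> rhs1 | rhs2 | ...' line before being written into the text widget, and the whole buffer is then colorized by _analyze().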
def cycle(e, textwidget=self._textwidget): textwidget.tk_focusNext().focus() self._textwidget.bind("", cycle) prod_tuples = [(p.lhs(), [p.rhs()]) for p in self._cfg.productions()] for i in range(len(prod_tuples) - 1, 0, -1): if prod_tuples[i][0] == prod_tuples[i - 1][0]: if () in prod_tuples[i][1]: continue if () in prod_tuples[i - 1][1]: continue print(prod_tuples[i - 1][1]) print(prod_tuples[i][1]) prod_tuples[i - 1][1].extend(prod_tuples[i][1]) del prod_tuples[i] for lhs, rhss in prod_tuples: print(lhs, rhss) s = "%s ->" % lhs for rhs in rhss: for elt in rhs: if isinstance(elt, Nonterminal): s += " %s" % elt else: s += " %r" % elt s += " |" s = s[:-2] + "\n" self._textwidget.insert("end", s) self._analyze() # # Add the producitons to the text widget, and colorize them. # prod_by_lhs = {} # for prod in self._cfg.productions(): # if len(prod.rhs()) > 0: # prod_by_lhs.setdefault(prod.lhs(),[]).append(prod) # for (lhs, prods) in prod_by_lhs.items(): # self._textwidget.insert('end', '%s ->' % lhs) # self._textwidget.insert('end', self._rhs(prods[0])) # for prod in prods[1:]: # print '\t|'+self._rhs(prod), # self._textwidget.insert('end', '\t|'+self._rhs(prod)) # print # self._textwidget.insert('end', '\n') # for prod in self._cfg.productions(): # if len(prod.rhs()) == 0: # self._textwidget.insert('end', '%s' % prod) # self._analyze() # def _rhs(self, prod): # s = '' # for elt in prod.rhs(): # if isinstance(elt, Nonterminal): s += ' %s' % elt.symbol() # else: s += ' %r' % elt # return s def _clear_tags(self, linenum): """ Remove all tags (except ``arrow`` and ``sel``) from the given line of the text widget used for editing the productions. """ start = "%d.0" % linenum end = "%d.end" % linenum for tag in self._textwidget.tag_names(): if tag not in ("arrow", "sel"): self._textwidget.tag_remove(tag, start, end) def _check_analyze(self, *e): """ Check if we've moved to a new line. If we have, then remove all colorization from the line we moved to, and re-colorize the line that we moved from. """ linenum = int(self._textwidget.index("insert").split(".")[0]) if linenum != self._linenum: self._clear_tags(linenum) self._analyze_line(self._linenum) self._linenum = linenum def _replace_arrows(self, *e): """ Replace any ``'->'`` text strings with arrows (char \\256, in symbol font). This searches the whole buffer, but is fast enough to be done anytime they press '>'. """ arrow = "1.0" while True: arrow = self._textwidget.search("->", arrow, "end+1char") if arrow == "": break self._textwidget.delete(arrow, arrow + "+2char") self._textwidget.insert(arrow, self.ARROW, "arrow") self._textwidget.insert(arrow, "\t") arrow = "1.0" while True: arrow = self._textwidget.search(self.ARROW, arrow + "+1char", "end+1char") if arrow == "": break self._textwidget.tag_add("arrow", arrow, arrow + "+1char") def _analyze_token(self, match, linenum): """ Given a line number and a regexp match for a token on that line, colorize the token. Note that the regexp match gives us the token's text, start index (on the line), and end index (on the line). """ # What type of token is it? if match.group()[0] in "'\"": tag = "terminal" elif match.group() in ("->", self.ARROW): tag = "arrow" else: # If it's a nonterminal, then set up new bindings, so we # can highlight all instances of that nonterminal when we # put the mouse over it. 
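# Each distinct nonterminal gets its own text tag ('nonterminal_' + name); Enter/Leave bindings on that tag then highlight every occurrence of that nonterminal when the mouse is over any one of them.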
tag = "nonterminal_" + match.group() if tag not in self._textwidget.tag_names(): self._init_nonterminal_tag(tag) start = "%d.%d" % (linenum, match.start()) end = "%d.%d" % (linenum, match.end()) self._textwidget.tag_add(tag, start, end) def _init_nonterminal_tag(self, tag, foreground="blue"): self._textwidget.tag_config(tag, foreground=foreground, font=CFGEditor._BOLD) if not self._highlight_matching_nonterminals: return def enter(e, textwidget=self._textwidget, tag=tag): textwidget.tag_config(tag, background="#80ff80") def leave(e, textwidget=self._textwidget, tag=tag): textwidget.tag_config(tag, background="") self._textwidget.tag_bind(tag, "", enter) self._textwidget.tag_bind(tag, "", leave) def _analyze_line(self, linenum): """ Colorize a given line. """ # Get rid of any tags that were previously on the line. self._clear_tags(linenum) # Get the line line's text string. line = self._textwidget.get(repr(linenum) + ".0", repr(linenum) + ".end") # If it's a valid production, then colorize each token. if CFGEditor._PRODUCTION_RE.match(line): # It's valid; Use _TOKEN_RE to tokenize the production, # and call analyze_token on each token. def analyze_token(match, self=self, linenum=linenum): self._analyze_token(match, linenum) return "" CFGEditor._TOKEN_RE.sub(analyze_token, line) elif line.strip() != "": # It's invalid; show the user where the error is. self._mark_error(linenum, line) def _mark_error(self, linenum, line): """ Mark the location of an error in a line. """ arrowmatch = CFGEditor._ARROW_RE.search(line) if not arrowmatch: # If there's no arrow at all, highlight the whole line. start = "%d.0" % linenum end = "%d.end" % linenum elif not CFGEditor._LHS_RE.match(line): # Otherwise, if the LHS is bad, highlight it. start = "%d.0" % linenum end = "%d.%d" % (linenum, arrowmatch.start()) else: # Otherwise, highlight the RHS. start = "%d.%d" % (linenum, arrowmatch.end()) end = "%d.end" % linenum # If we're highlighting 0 chars, highlight the whole line. if self._textwidget.compare(start, "==", end): start = "%d.0" % linenum end = "%d.end" % linenum self._textwidget.tag_add("error", start, end) def _analyze(self, *e): """ Replace ``->`` with arrows, and colorize the entire buffer. """ self._replace_arrows() numlines = int(self._textwidget.index("end").split(".")[0]) for linenum in range(1, numlines + 1): # line numbers start at 1. self._analyze_line(linenum) def _parse_productions(self): """ Parse the current contents of the textwidget buffer, to create a list of productions. """ productions = [] # Get the text, normalize it, and split it into lines. 
text = self._textwidget.get("1.0", "end") text = re.sub(self.ARROW, "->", text) text = re.sub("\t", " ", text) lines = text.split("\n") # Convert each line to a CFG production for line in lines: line = line.strip() if line == "": continue productions += _read_cfg_production(line) # if line.strip() == '': continue # if not CFGEditor._PRODUCTION_RE.match(line): # raise ValueError('Bad production string %r' % line) # # (lhs_str, rhs_str) = line.split('->') # lhs = Nonterminal(lhs_str.strip()) # rhs = [] # def parse_token(match, rhs=rhs): # token = match.group() # if token[0] in "'\"": rhs.append(token[1:-1]) # else: rhs.append(Nonterminal(token)) # return '' # CFGEditor._TOKEN_RE.sub(parse_token, rhs_str) # # productions.append(Production(lhs, *rhs)) return productions def _destroy(self, *e): if self._top is None: return self._top.destroy() self._top = None def _ok(self, *e): self._apply() self._destroy() def _apply(self, *e): productions = self._parse_productions() start = Nonterminal(self._start.get()) cfg = CFG(start, productions) if self._set_cfg_callback is not None: self._set_cfg_callback(cfg) def _reset(self, *e): self._textwidget.delete("1.0", "end") for production in self._cfg.productions(): self._textwidget.insert("end", "%s\n" % production) self._analyze() if self._set_cfg_callback is not None: self._set_cfg_callback(self._cfg) def _cancel(self, *e): try: self._reset() except: pass self._destroy() def _help(self, *e): # The default font's not very legible; try using 'fixed' instead. try: ShowText( self._parent, "Help: Chart Parser Demo", (_CFGEditor_HELP).strip(), width=75, font="fixed", ) except: ShowText( self._parent, "Help: Chart Parser Demo", (_CFGEditor_HELP).strip(), width=75, ) ###################################################################### # New Demo (built tree based on cfg) ###################################################################### class CFGDemo: def __init__(self, grammar, text): self._grammar = grammar self._text = text # Set up the main window. 
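# Three panes, matching the sketch at the top of this file: the production list on the left, the treelet preview canvas beneath it, and the editable workspace on the right.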
self._top = Tk() self._top.title("Context Free Grammar Demo") # Base font size self._size = IntVar(self._top) self._size.set(12) # = medium # Set up the key bindings self._init_bindings(self._top) # Create the basic frames frame1 = Frame(self._top) frame1.pack(side="left", fill="y", expand=0) self._init_menubar(self._top) self._init_buttons(self._top) self._init_grammar(frame1) self._init_treelet(frame1) self._init_workspace(self._top) # ////////////////////////////////////////////////// # Initialization # ////////////////////////////////////////////////// def _init_bindings(self, top): top.bind("", self.destroy) def _init_menubar(self, parent): pass def _init_buttons(self, parent): pass def _init_grammar(self, parent): self._prodlist = ProductionList(parent, self._grammar, width=20) self._prodlist.pack(side="top", fill="both", expand=1) self._prodlist.focus() self._prodlist.add_callback("select", self._selectprod_cb) self._prodlist.add_callback("move", self._selectprod_cb) def _init_treelet(self, parent): self._treelet_canvas = Canvas(parent, background="white") self._treelet_canvas.pack(side="bottom", fill="x") self._treelet = None def _init_workspace(self, parent): self._workspace = CanvasFrame(parent, background="white") self._workspace.pack(side="right", fill="both", expand=1) self._tree = None self.reset_workspace() # ////////////////////////////////////////////////// # Workspace # ////////////////////////////////////////////////// def reset_workspace(self): c = self._workspace.canvas() fontsize = int(self._size.get()) node_font = ("helvetica", -(fontsize + 4), "bold") leaf_font = ("helvetica", -(fontsize + 2)) # Remove the old tree if self._tree is not None: self._workspace.remove_widget(self._tree) # The root of the tree. start = self._grammar.start().symbol() rootnode = TextWidget(c, start, font=node_font, draggable=1) # The leaves of the tree. leaves = [] for word in self._text: leaves.append(TextWidget(c, word, font=leaf_font, draggable=1)) # Put it all together into one tree self._tree = TreeSegmentWidget(c, rootnode, leaves, color="white") # Add it to the workspace. self._workspace.add_widget(self._tree) # Move the leaves to the bottom of the workspace. for leaf in leaves: leaf.move(0, 100) # self._nodes = {start:1} # self._leaves = dict([(l,1) for l in leaves]) def workspace_markprod(self, production): pass def _markproduction(self, prod, tree=None): if tree is None: tree = self._tree for i in range(len(tree.subtrees()) - len(prod.rhs())): if tree["color", i] == "white": self._markproduction # FIXME: Is this necessary at all? for j, node in enumerate(prod.rhs()): widget = tree.subtrees()[i + j] if ( isinstance(node, Nonterminal) and isinstance(widget, TreeSegmentWidget) and node.symbol == widget.label().text() ): pass # matching nonterminal elif ( isinstance(node, str) and isinstance(widget, TextWidget) and node == widget.text() ): pass # matching nonterminal else: break else: # Everything matched! print("MATCH AT", i) # ////////////////////////////////////////////////// # Grammar # ////////////////////////////////////////////////// def _selectprod_cb(self, production): canvas = self._treelet_canvas self._prodlist.highlight(production) if self._treelet is not None: self._treelet.destroy() # Convert the production to a tree. rhs = production.rhs() for (i, elt) in enumerate(rhs): if isinstance(elt, Nonterminal): elt = Tree(elt) tree = Tree(production.lhs().symbol(), *rhs) # Draw the tree in the treelet area. 
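# The selected production is rendered as a one-level tree (its LHS above its RHS symbols), centred on the treelet canvas; _markproduction() then looks for places in the workspace tree where that treelet could attach.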
fontsize = int(self._size.get()) node_font = ("helvetica", -(fontsize + 4), "bold") leaf_font = ("helvetica", -(fontsize + 2)) self._treelet = tree_to_treesegment( canvas, tree, node_font=node_font, leaf_font=leaf_font ) self._treelet["draggable"] = 1 # Center the treelet. (x1, y1, x2, y2) = self._treelet.bbox() w, h = int(canvas["width"]), int(canvas["height"]) self._treelet.move((w - x1 - x2) / 2, (h - y1 - y2) / 2) # Mark the places where we can add it to the workspace. self._markproduction(production) def destroy(self, *args): self._top.destroy() def mainloop(self, *args, **kwargs): self._top.mainloop(*args, **kwargs) def demo2(): from nltk import CFG, Nonterminal, Production nonterminals = "S VP NP PP P N Name V Det" (S, VP, NP, PP, P, N, Name, V, Det) = (Nonterminal(s) for s in nonterminals.split()) productions = ( # Syntactic Productions Production(S, [NP, VP]), Production(NP, [Det, N]), Production(NP, [NP, PP]), Production(VP, [VP, PP]), Production(VP, [V, NP, PP]), Production(VP, [V, NP]), Production(PP, [P, NP]), Production(PP, []), Production(PP, ["up", "over", NP]), # Lexical Productions Production(NP, ["I"]), Production(Det, ["the"]), Production(Det, ["a"]), Production(N, ["man"]), Production(V, ["saw"]), Production(P, ["in"]), Production(P, ["with"]), Production(N, ["park"]), Production(N, ["dog"]), Production(N, ["statue"]), Production(Det, ["my"]), ) grammar = CFG(S, productions) text = "I saw a man in the park".split() d = CFGDemo(grammar, text) d.mainloop() ###################################################################### # Old Demo ###################################################################### def demo(): from nltk import CFG, Nonterminal nonterminals = "S VP NP PP P N Name V Det" (S, VP, NP, PP, P, N, Name, V, Det) = (Nonterminal(s) for s in nonterminals.split()) grammar = CFG.fromstring( """ S -> NP VP PP -> P NP NP -> Det N NP -> NP PP VP -> V NP VP -> VP PP Det -> 'a' Det -> 'the' Det -> 'my' NP -> 'I' N -> 'dog' N -> 'man' N -> 'park' N -> 'statue' V -> 'saw' P -> 'in' P -> 'up' P -> 'over' P -> 'with' """ ) def cb(grammar): print(grammar) top = Tk() editor = CFGEditor(top, grammar, cb) Label(top, text="\nTesting CFG Editor\n").pack() Button(top, text="Quit", command=top.destroy).pack() top.mainloop() def demo3(): from nltk import Production (S, VP, NP, PP, P, N, Name, V, Det) = nonterminals( "S, VP, NP, PP, P, N, Name, V, Det" ) productions = ( # Syntactic Productions Production(S, [NP, VP]), Production(NP, [Det, N]), Production(NP, [NP, PP]), Production(VP, [VP, PP]), Production(VP, [V, NP, PP]), Production(VP, [V, NP]), Production(PP, [P, NP]), Production(PP, []), Production(PP, ["up", "over", NP]), # Lexical Productions Production(NP, ["I"]), Production(Det, ["the"]), Production(Det, ["a"]), Production(N, ["man"]), Production(V, ["saw"]), Production(P, ["in"]), Production(P, ["with"]), Production(N, ["park"]), Production(N, ["dog"]), Production(N, ["statue"]), Production(Det, ["my"]), ) t = Tk() def destroy(e, t=t): t.destroy() t.bind("q", destroy) p = ProductionList(t, productions) p.pack(expand=1, fill="both") p.add_callback("select", p.markonly) p.add_callback("move", p.markonly) p.focus() p.mark(productions[2]) p.mark(productions[8]) if __name__ == "__main__": demo() nltk-3.7/nltk/draw/dispersion.py000066400000000000000000000033331420073152400167710ustar00rootroot00000000000000# Natural Language Toolkit: Dispersion Plots # # Copyright (C) 2001-2022 NLTK Project # Author: Steven Bird # URL: # For license information, see LICENSE.TXT """ A utility 
for displaying lexical dispersion. """ def dispersion_plot(text, words, ignore_case=False, title="Lexical Dispersion Plot"): """ Generate a lexical dispersion plot. :param text: The source text :type text: list(str) or enum(str) :param words: The target words :type words: list of str :param ignore_case: flag to set if case should be ignored when searching text :type ignore_case: bool """ try: from matplotlib import pylab except ImportError as e: raise ValueError( "The plot function requires matplotlib to be installed." "See https://matplotlib.org/" ) from e text = list(text) words.reverse() if ignore_case: words_to_comp = list(map(str.lower, words)) text_to_comp = list(map(str.lower, text)) else: words_to_comp = words text_to_comp = text points = [ (x, y) for x in range(len(text_to_comp)) for y in range(len(words_to_comp)) if text_to_comp[x] == words_to_comp[y] ] if points: x, y = list(zip(*points)) else: x = y = () pylab.plot(x, y, "b|", scalex=0.1) pylab.yticks(list(range(len(words))), words, color="b") pylab.ylim(-1, len(words)) pylab.title(title) pylab.xlabel("Word Offset") pylab.show() if __name__ == "__main__": from nltk.corpus import gutenberg words = ["Elinor", "Marianne", "Edward", "Willoughby"] dispersion_plot(gutenberg.words("austen-sense.txt"), words) nltk-3.7/nltk/draw/table.py000066400000000000000000001274521420073152400157120ustar00rootroot00000000000000# Natural Language Toolkit: Table widget # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # URL: # For license information, see LICENSE.TXT """ Tkinter widgets for displaying multi-column listboxes and tables. """ import operator from tkinter import Frame, Label, Listbox, Scrollbar, Tk ###################################################################### # Multi-Column Listbox ###################################################################### class MultiListbox(Frame): """ A multi-column listbox, where the current selection applies to an entire row. Based on the MultiListbox Tkinter widget recipe from the Python Cookbook (https://code.activestate.com/recipes/52266/) For the most part, ``MultiListbox`` methods delegate to its contained listboxes. For any methods that do not have docstrings, see ``Tkinter.Listbox`` for a description of what that method does. """ # ///////////////////////////////////////////////////////////////// # Configuration # ///////////////////////////////////////////////////////////////// #: Default configuration values for the frame. FRAME_CONFIG = dict(background="#888", takefocus=True, highlightthickness=1) #: Default configurations for the column labels. LABEL_CONFIG = dict( borderwidth=1, relief="raised", font="helvetica -16 bold", background="#444", foreground="white", ) #: Default configuration for the column listboxes. LISTBOX_CONFIG = dict( borderwidth=1, selectborderwidth=0, highlightthickness=0, exportselection=False, selectbackground="#888", activestyle="none", takefocus=False, ) # ///////////////////////////////////////////////////////////////// # Constructor # ///////////////////////////////////////////////////////////////// def __init__(self, master, columns, column_weights=None, cnf={}, **kw): """ Construct a new multi-column listbox widget. :param master: The widget that should contain the new multi-column listbox. :param columns: Specifies what columns should be included in the new multi-column listbox. If ``columns`` is an integer, the it is the number of columns to include. 
If it is a list, then its length indicates the number of columns to include; and each element of the list will be used as a label for the corresponding column. :param cnf, kw: Configuration parameters for this widget. Use ``label_*`` to configure all labels; and ``listbox_*`` to configure all listboxes. E.g.: >>> mlb = MultiListbox(master, 5, label_foreground='red') """ # If columns was specified as an int, convert it to a list. if isinstance(columns, int): columns = list(range(columns)) include_labels = False else: include_labels = True if len(columns) == 0: raise ValueError("Expected at least one column") # Instance variables self._column_names = tuple(columns) self._listboxes = [] self._labels = [] # Pick a default value for column_weights, if none was specified. if column_weights is None: column_weights = [1] * len(columns) elif len(column_weights) != len(columns): raise ValueError("Expected one column_weight for each column") self._column_weights = column_weights # Configure our widgets. Frame.__init__(self, master, **self.FRAME_CONFIG) self.grid_rowconfigure(1, weight=1) for i, label in enumerate(self._column_names): self.grid_columnconfigure(i, weight=column_weights[i]) # Create a label for the column if include_labels: l = Label(self, text=label, **self.LABEL_CONFIG) self._labels.append(l) l.grid(column=i, row=0, sticky="news", padx=0, pady=0) l.column_index = i # Create a listbox for the column lb = Listbox(self, **self.LISTBOX_CONFIG) self._listboxes.append(lb) lb.grid(column=i, row=1, sticky="news", padx=0, pady=0) lb.column_index = i # Clicking or dragging selects: lb.bind("", self._select) lb.bind("", self._select) # Scroll wheel scrolls: lb.bind("", lambda e: self._scroll(-1)) lb.bind("", lambda e: self._scroll(+1)) lb.bind("", lambda e: self._scroll(e.delta)) # Button 2 can be used to scan: lb.bind("", lambda e: self.scan_mark(e.x, e.y)) lb.bind("", lambda e: self.scan_dragto(e.x, e.y)) # Dragging outside the window has no effect (disable # the default listbox behavior, which scrolls): lb.bind("", lambda e: "break") # Columns can be resized by dragging them: l.bind("", self._resize_column) # Columns can be resized by dragging them. (This binding is # used if they click on the grid between columns:) self.bind("", self._resize_column) # Set up key bindings for the widget: self.bind("", lambda e: self.select(delta=-1)) self.bind("", lambda e: self.select(delta=1)) self.bind("", lambda e: self.select(delta=-self._pagesize())) self.bind("", lambda e: self.select(delta=self._pagesize())) # Configuration customizations self.configure(cnf, **kw) # ///////////////////////////////////////////////////////////////// # Column Resizing # ///////////////////////////////////////////////////////////////// def _resize_column(self, event): """ Callback used to resize a column of the table. Return ``True`` if the column is actually getting resized (if the user clicked on the far left or far right 5 pixels of a label); and ``False`` otherwies. """ # If we're already waiting for a button release, then ignore # the new button press. if event.widget.bind(""): return False # Decide which column (if any) to resize. 
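# --- Illustrative aside (not part of nltk/draw/table.py) ---------------------
# A minimal sketch of driving MultiListbox directly.  The column names double
# as header labels, each inserted row supplies one cell per column, and the
# label_*/listbox_* keyword prefixes configure all headers/columns at once.
from tkinter import Tk

if __name__ == "__main__":
    root = Tk()
    mlb = MultiListbox(root, ["Word", "Tag"], label_foreground="white")
    mlb.pack(expand=True, fill="both")
    mlb.insert("end", ("dog", "NN"), ("saw", "VBD"), ("park", "NN"))
    mlb.select(0)                                 # highlight the first row
    mlb.configure(listbox_font="helvetica -12")   # applies to every column
    root.mainloop()
# -----------------------------------------------------------------------------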
self._resize_column_index = None if event.widget is self: for i, lb in enumerate(self._listboxes): if abs(event.x - (lb.winfo_x() + lb.winfo_width())) < 10: self._resize_column_index = i elif event.x > (event.widget.winfo_width() - 5): self._resize_column_index = event.widget.column_index elif event.x < 5 and event.widget.column_index != 0: self._resize_column_index = event.widget.column_index - 1 # Bind callbacks that are used to resize it. if self._resize_column_index is not None: event.widget.bind("", self._resize_column_motion_cb) event.widget.bind( "" % event.num, self._resize_column_buttonrelease_cb ) return True else: return False def _resize_column_motion_cb(self, event): lb = self._listboxes[self._resize_column_index] charwidth = lb.winfo_width() / lb["width"] x1 = event.x + event.widget.winfo_x() x2 = lb.winfo_x() + lb.winfo_width() lb["width"] = max(3, lb["width"] + (x1 - x2) // charwidth) def _resize_column_buttonrelease_cb(self, event): event.widget.unbind("" % event.num) event.widget.unbind("") # ///////////////////////////////////////////////////////////////// # Properties # ///////////////////////////////////////////////////////////////// @property def column_names(self): """ A tuple containing the names of the columns used by this multi-column listbox. """ return self._column_names @property def column_labels(self): """ A tuple containing the ``Tkinter.Label`` widgets used to display the label of each column. If this multi-column listbox was created without labels, then this will be an empty tuple. These widgets will all be augmented with a ``column_index`` attribute, which can be used to determine which column they correspond to. This can be convenient, e.g., when defining callbacks for bound events. """ return tuple(self._labels) @property def listboxes(self): """ A tuple containing the ``Tkinter.Listbox`` widgets used to display individual columns. These widgets will all be augmented with a ``column_index`` attribute, which can be used to determine which column they correspond to. This can be convenient, e.g., when defining callbacks for bound events. """ return tuple(self._listboxes) # ///////////////////////////////////////////////////////////////// # Mouse & Keyboard Callback Functions # ///////////////////////////////////////////////////////////////// def _select(self, e): i = e.widget.nearest(e.y) self.selection_clear(0, "end") self.selection_set(i) self.activate(i) self.focus() def _scroll(self, delta): for lb in self._listboxes: lb.yview_scroll(delta, "unit") return "break" def _pagesize(self): """:return: The number of rows that makes up one page""" return int(self.index("@0,1000000")) - int(self.index("@0,0")) # ///////////////////////////////////////////////////////////////// # Row selection # ///////////////////////////////////////////////////////////////// def select(self, index=None, delta=None, see=True): """ Set the selected row. If ``index`` is specified, then select row ``index``. Otherwise, if ``delta`` is specified, then move the current selection by ``delta`` (negative numbers for up, positive numbers for down). This will not move the selection past the top or the bottom of the list. :param see: If true, then call ``self.see()`` with the newly selected index, to ensure that it is visible. """ if (index is not None) and (delta is not None): raise ValueError("specify index or delta, but not both") # If delta was given, then calculate index. 
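# --- Illustrative aside (not part of nltk/draw/table.py) ---------------------
# The index arithmetic used by select() below, shown as plain Python: a delta
# move is clamped so the selection never runs past the first or last row.
def clamp_selection(current, delta, size):
    index = (current if current is not None else -1) + delta
    return min(max(index, 0), size - 1)

if __name__ == "__main__":
    print(clamp_selection(0, -1, 10))   # 0  (cannot move above the top)
    print(clamp_selection(8, +5, 10))   # 9  (cannot move past the bottom)
# -----------------------------------------------------------------------------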
if delta is not None: if len(self.curselection()) == 0: index = -1 + delta else: index = int(self.curselection()[0]) + delta # Clear all selected rows. self.selection_clear(0, "end") # Select the specified index if index is not None: index = min(max(index, 0), self.size() - 1) # self.activate(index) self.selection_set(index) if see: self.see(index) # ///////////////////////////////////////////////////////////////// # Configuration # ///////////////////////////////////////////////////////////////// def configure(self, cnf={}, **kw): """ Configure this widget. Use ``label_*`` to configure all labels; and ``listbox_*`` to configure all listboxes. E.g.: >>> mlb = MultiListbox(master, 5) >>> mlb.configure(label_foreground='red') >>> mlb.configure(listbox_foreground='red') """ cnf = dict(list(cnf.items()) + list(kw.items())) for (key, val) in list(cnf.items()): if key.startswith("label_") or key.startswith("label-"): for label in self._labels: label.configure({key[6:]: val}) elif key.startswith("listbox_") or key.startswith("listbox-"): for listbox in self._listboxes: listbox.configure({key[8:]: val}) else: Frame.configure(self, {key: val}) def __setitem__(self, key, val): """ Configure this widget. This is equivalent to ``self.configure({key,val``)}. See ``configure()``. """ self.configure({key: val}) def rowconfigure(self, row_index, cnf={}, **kw): """ Configure all table cells in the given row. Valid keyword arguments are: ``background``, ``bg``, ``foreground``, ``fg``, ``selectbackground``, ``selectforeground``. """ for lb in self._listboxes: lb.itemconfigure(row_index, cnf, **kw) def columnconfigure(self, col_index, cnf={}, **kw): """ Configure all table cells in the given column. Valid keyword arguments are: ``background``, ``bg``, ``foreground``, ``fg``, ``selectbackground``, ``selectforeground``. """ lb = self._listboxes[col_index] cnf = dict(list(cnf.items()) + list(kw.items())) for (key, val) in list(cnf.items()): if key in ( "background", "bg", "foreground", "fg", "selectbackground", "selectforeground", ): for i in range(lb.size()): lb.itemconfigure(i, {key: val}) else: lb.configure({key: val}) def itemconfigure(self, row_index, col_index, cnf=None, **kw): """ Configure the table cell at the given row and column. Valid keyword arguments are: ``background``, ``bg``, ``foreground``, ``fg``, ``selectbackground``, ``selectforeground``. """ lb = self._listboxes[col_index] return lb.itemconfigure(row_index, cnf, **kw) # ///////////////////////////////////////////////////////////////// # Value Access # ///////////////////////////////////////////////////////////////// def insert(self, index, *rows): """ Insert the given row or rows into the table, at the given index. Each row value should be a tuple of cell values, one for each column in the row. Index may be an integer or any of the special strings (such as ``'end'``) accepted by ``Tkinter.Listbox``. """ for elt in rows: if len(elt) != len(self._column_names): raise ValueError( "rows should be tuples whose length " "is equal to the number of columns" ) for (lb, elts) in zip(self._listboxes, list(zip(*rows))): lb.insert(index, *elts) def get(self, first, last=None): """ Return the value(s) of the specified row(s). If ``last`` is not specified, then return a single row value; otherwise, return a list of row values. Each row value is a tuple of cell values, one for each column in the row. 
""" values = [lb.get(first, last) for lb in self._listboxes] if last: return [tuple(row) for row in zip(*values)] else: return tuple(values) def bbox(self, row, col): """ Return the bounding box for the given table cell, relative to this widget's top-left corner. The bounding box is a tuple of integers ``(left, top, width, height)``. """ dx, dy, _, _ = self.grid_bbox(row=0, column=col) x, y, w, h = self._listboxes[col].bbox(row) return int(x) + int(dx), int(y) + int(dy), int(w), int(h) # ///////////////////////////////////////////////////////////////// # Hide/Show Columns # ///////////////////////////////////////////////////////////////// def hide_column(self, col_index): """ Hide the given column. The column's state is still maintained: its values will still be returned by ``get()``, and you must supply its values when calling ``insert()``. It is safe to call this on a column that is already hidden. :see: ``show_column()`` """ if self._labels: self._labels[col_index].grid_forget() self.listboxes[col_index].grid_forget() self.grid_columnconfigure(col_index, weight=0) def show_column(self, col_index): """ Display a column that has been hidden using ``hide_column()``. It is safe to call this on a column that is not hidden. """ weight = self._column_weights[col_index] if self._labels: self._labels[col_index].grid( column=col_index, row=0, sticky="news", padx=0, pady=0 ) self._listboxes[col_index].grid( column=col_index, row=1, sticky="news", padx=0, pady=0 ) self.grid_columnconfigure(col_index, weight=weight) # ///////////////////////////////////////////////////////////////// # Binding Methods # ///////////////////////////////////////////////////////////////// def bind_to_labels(self, sequence=None, func=None, add=None): """ Add a binding to each ``Tkinter.Label`` widget in this mult-column listbox that will call ``func`` in response to the event sequence. :return: A list of the identifiers of replaced binding functions (if any), allowing for their deletion (to prevent a memory leak). """ return [label.bind(sequence, func, add) for label in self.column_labels] def bind_to_listboxes(self, sequence=None, func=None, add=None): """ Add a binding to each ``Tkinter.Listbox`` widget in this mult-column listbox that will call ``func`` in response to the event sequence. :return: A list of the identifiers of replaced binding functions (if any), allowing for their deletion (to prevent a memory leak). """ for listbox in self.listboxes: listbox.bind(sequence, func, add) def bind_to_columns(self, sequence=None, func=None, add=None): """ Add a binding to each ``Tkinter.Label`` and ``Tkinter.Listbox`` widget in this mult-column listbox that will call ``func`` in response to the event sequence. :return: A list of the identifiers of replaced binding functions (if any), allowing for their deletion (to prevent a memory leak). 
""" return self.bind_to_labels(sequence, func, add) + self.bind_to_listboxes( sequence, func, add ) # ///////////////////////////////////////////////////////////////// # Simple Delegation # ///////////////////////////////////////////////////////////////// # These methods delegate to the first listbox: def curselection(self, *args, **kwargs): return self._listboxes[0].curselection(*args, **kwargs) def selection_includes(self, *args, **kwargs): return self._listboxes[0].selection_includes(*args, **kwargs) def itemcget(self, *args, **kwargs): return self._listboxes[0].itemcget(*args, **kwargs) def size(self, *args, **kwargs): return self._listboxes[0].size(*args, **kwargs) def index(self, *args, **kwargs): return self._listboxes[0].index(*args, **kwargs) def nearest(self, *args, **kwargs): return self._listboxes[0].nearest(*args, **kwargs) # These methods delegate to each listbox (and return None): def activate(self, *args, **kwargs): for lb in self._listboxes: lb.activate(*args, **kwargs) def delete(self, *args, **kwargs): for lb in self._listboxes: lb.delete(*args, **kwargs) def scan_mark(self, *args, **kwargs): for lb in self._listboxes: lb.scan_mark(*args, **kwargs) def scan_dragto(self, *args, **kwargs): for lb in self._listboxes: lb.scan_dragto(*args, **kwargs) def see(self, *args, **kwargs): for lb in self._listboxes: lb.see(*args, **kwargs) def selection_anchor(self, *args, **kwargs): for lb in self._listboxes: lb.selection_anchor(*args, **kwargs) def selection_clear(self, *args, **kwargs): for lb in self._listboxes: lb.selection_clear(*args, **kwargs) def selection_set(self, *args, **kwargs): for lb in self._listboxes: lb.selection_set(*args, **kwargs) def yview(self, *args, **kwargs): for lb in self._listboxes: v = lb.yview(*args, **kwargs) return v # if called with no arguments def yview_moveto(self, *args, **kwargs): for lb in self._listboxes: lb.yview_moveto(*args, **kwargs) def yview_scroll(self, *args, **kwargs): for lb in self._listboxes: lb.yview_scroll(*args, **kwargs) # ///////////////////////////////////////////////////////////////// # Aliases # ///////////////////////////////////////////////////////////////// itemconfig = itemconfigure rowconfig = rowconfigure columnconfig = columnconfigure select_anchor = selection_anchor select_clear = selection_clear select_includes = selection_includes select_set = selection_set # ///////////////////////////////////////////////////////////////// # These listbox methods are not defined for multi-listbox # ///////////////////////////////////////////////////////////////// # def xview(self, *what): pass # def xview_moveto(self, fraction): pass # def xview_scroll(self, number, what): pass ###################################################################### # Table ###################################################################### class Table: """ A display widget for a table of values, based on a ``MultiListbox`` widget. For many purposes, ``Table`` can be treated as a list-of-lists. E.g., table[i] is a list of the values for row i; and table.append(row) adds a new row with the given list of values. Individual cells can be accessed using table[i,j], which refers to the j-th column of the i-th row. This can be used to both read and write values from the table. E.g.: >>> table[i,j] = 'hello' The column (j) can be given either as an index number, or as a column name. 
E.g., the following prints the value in the 3rd row for the 'First Name' column: >>> print(table[3, 'First Name']) John You can configure the colors for individual rows, columns, or cells using ``rowconfig()``, ``columnconfig()``, and ``itemconfig()``. The color configuration for each row will be preserved if the table is modified; however, when new rows are added, any color configurations that have been made for *columns* will not be applied to the new row. Note: Although ``Table`` acts like a widget in some ways (e.g., it defines ``grid()``, ``pack()``, and ``bind()``), it is not itself a widget; it just contains one. This is because widgets need to define ``__getitem__()``, ``__setitem__()``, and ``__nonzero__()`` in a way that's incompatible with the fact that ``Table`` behaves as a list-of-lists. :ivar _mlb: The multi-column listbox used to display this table's data. :ivar _rows: A list-of-lists used to hold the cell values of this table. Each element of _rows is a row value, i.e., a list of cell values, one for each column in the row. """ def __init__( self, master, column_names, rows=None, column_weights=None, scrollbar=True, click_to_sort=True, reprfunc=None, cnf={}, **kw ): """ Construct a new Table widget. :type master: Tkinter.Widget :param master: The widget that should contain the new table. :type column_names: list(str) :param column_names: A list of names for the columns; these names will be used to create labels for each column; and can be used as an index when reading or writing cell values from the table. :type rows: list(list) :param rows: A list of row values used to initialize the table. Each row value should be a tuple of cell values, one for each column in the row. :type scrollbar: bool :param scrollbar: If true, then create a scrollbar for the new table widget. :type click_to_sort: bool :param click_to_sort: If true, then create bindings that will sort the table's rows by a given column's values if the user clicks on that colum's label. :type reprfunc: function :param reprfunc: If specified, then use this function to convert each table cell value to a string suitable for display. ``reprfunc`` has the following signature: reprfunc(row_index, col_index, cell_value) -> str (Note that the column is specified by index, not by name.) :param cnf, kw: Configuration parameters for this widget's contained ``MultiListbox``. See ``MultiListbox.__init__()`` for details. """ self._num_columns = len(column_names) self._reprfunc = reprfunc self._frame = Frame(master) self._column_name_to_index = {c: i for (i, c) in enumerate(column_names)} # Make a copy of the rows & check that it's valid. if rows is None: self._rows = [] else: self._rows = [[v for v in row] for row in rows] for row in self._rows: self._checkrow(row) # Create our multi-list box. self._mlb = MultiListbox(self._frame, column_names, column_weights, cnf, **kw) self._mlb.pack(side="left", expand=True, fill="both") # Optional scrollbar if scrollbar: sb = Scrollbar(self._frame, orient="vertical", command=self._mlb.yview) self._mlb.listboxes[0]["yscrollcommand"] = sb.set # for listbox in self._mlb.listboxes: # listbox['yscrollcommand'] = sb.set sb.pack(side="right", fill="y") self._scrollbar = sb # Set up sorting self._sortkey = None if click_to_sort: for i, l in enumerate(self._mlb.column_labels): l.bind("", self._sort) # Fill in our multi-list box. 
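# --- Illustrative aside (not part of nltk/draw/table.py) ---------------------
# A minimal sketch of the Table API described above: rows behave like list
# elements, cells are addressable by (row, column-name), and the optional
# reprfunc hook controls how each cell is rendered.
from tkinter import Tk

def render_cell(row, col, value):
    # reprfunc(row_index, col_index, cell_value) -> str
    return f"{value:.2f}" if isinstance(value, float) else str(value)

if __name__ == "__main__":
    root = Tk()
    table = Table(root, ["Word", "Score"],
                  rows=[["dog", 0.3333], ["park", 0.5]],
                  reprfunc=render_cell)
    table.pack(expand=True, fill="both")
    print(table[0])               # ('dog', 0.3333) -- raw value, not the repr
    print(table[1, "Word"])       # park
    table[0, "Score"] = 0.75      # cells are writable too
    table.append(["saw", 0.1])    # rows behave like list elements
    root.mainloop()
# -----------------------------------------------------------------------------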
self._fill_table() # ///////////////////////////////////////////////////////////////// # { Widget-like Methods # ///////////////////////////////////////////////////////////////// # These all just delegate to either our frame or our MLB. def pack(self, *args, **kwargs): """Position this table's main frame widget in its parent widget. See ``Tkinter.Frame.pack()`` for more info.""" self._frame.pack(*args, **kwargs) def grid(self, *args, **kwargs): """Position this table's main frame widget in its parent widget. See ``Tkinter.Frame.grid()`` for more info.""" self._frame.grid(*args, **kwargs) def focus(self): """Direct (keyboard) input foxus to this widget.""" self._mlb.focus() def bind(self, sequence=None, func=None, add=None): """Add a binding to this table's main frame that will call ``func`` in response to the event sequence.""" self._mlb.bind(sequence, func, add) def rowconfigure(self, row_index, cnf={}, **kw): """:see: ``MultiListbox.rowconfigure()``""" self._mlb.rowconfigure(row_index, cnf, **kw) def columnconfigure(self, col_index, cnf={}, **kw): """:see: ``MultiListbox.columnconfigure()``""" col_index = self.column_index(col_index) self._mlb.columnconfigure(col_index, cnf, **kw) def itemconfigure(self, row_index, col_index, cnf=None, **kw): """:see: ``MultiListbox.itemconfigure()``""" col_index = self.column_index(col_index) return self._mlb.itemconfigure(row_index, col_index, cnf, **kw) def bind_to_labels(self, sequence=None, func=None, add=None): """:see: ``MultiListbox.bind_to_labels()``""" return self._mlb.bind_to_labels(sequence, func, add) def bind_to_listboxes(self, sequence=None, func=None, add=None): """:see: ``MultiListbox.bind_to_listboxes()``""" return self._mlb.bind_to_listboxes(sequence, func, add) def bind_to_columns(self, sequence=None, func=None, add=None): """:see: ``MultiListbox.bind_to_columns()``""" return self._mlb.bind_to_columns(sequence, func, add) rowconfig = rowconfigure columnconfig = columnconfigure itemconfig = itemconfigure # ///////////////////////////////////////////////////////////////// # { Table as list-of-lists # ///////////////////////////////////////////////////////////////// def insert(self, row_index, rowvalue): """ Insert a new row into the table, so that its row index will be ``row_index``. If the table contains any rows whose row index is greater than or equal to ``row_index``, then they will be shifted down. :param rowvalue: A tuple of cell values, one for each column in the new row. """ self._checkrow(rowvalue) self._rows.insert(row_index, rowvalue) if self._reprfunc is not None: rowvalue = [ self._reprfunc(row_index, j, v) for (j, v) in enumerate(rowvalue) ] self._mlb.insert(row_index, rowvalue) if self._DEBUG: self._check_table_vs_mlb() def extend(self, rowvalues): """ Add new rows at the end of the table. :param rowvalues: A list of row values used to initialize the table. Each row value should be a tuple of cell values, one for each column in the row. """ for rowvalue in rowvalues: self.append(rowvalue) if self._DEBUG: self._check_table_vs_mlb() def append(self, rowvalue): """ Add a new row to the end of the table. :param rowvalue: A tuple of cell values, one for each column in the new row. """ self.insert(len(self._rows), rowvalue) if self._DEBUG: self._check_table_vs_mlb() def clear(self): """ Delete all rows in this table. """ self._rows = [] self._mlb.delete(0, "end") if self._DEBUG: self._check_table_vs_mlb() def __getitem__(self, index): """ Return the value of a row or a cell in this table. 
If ``index`` is an integer, then the row value for the ``index``th row. This row value consists of a tuple of cell values, one for each column in the row. If ``index`` is a tuple of two integers, ``(i,j)``, then return the value of the cell in the ``i``th row and the ``j``th column. """ if isinstance(index, slice): raise ValueError("Slicing not supported") elif isinstance(index, tuple) and len(index) == 2: return self._rows[index[0]][self.column_index(index[1])] else: return tuple(self._rows[index]) def __setitem__(self, index, val): """ Replace the value of a row or a cell in this table with ``val``. If ``index`` is an integer, then ``val`` should be a row value (i.e., a tuple of cell values, one for each column). In this case, the values of the ``index``th row of the table will be replaced with the values in ``val``. If ``index`` is a tuple of integers, ``(i,j)``, then replace the value of the cell in the ``i``th row and ``j``th column with ``val``. """ if isinstance(index, slice): raise ValueError("Slicing not supported") # table[i,j] = val elif isinstance(index, tuple) and len(index) == 2: i, j = index[0], self.column_index(index[1]) config_cookie = self._save_config_info([i]) self._rows[i][j] = val if self._reprfunc is not None: val = self._reprfunc(i, j, val) self._mlb.listboxes[j].insert(i, val) self._mlb.listboxes[j].delete(i + 1) self._restore_config_info(config_cookie) # table[i] = val else: config_cookie = self._save_config_info([index]) self._checkrow(val) self._rows[index] = list(val) if self._reprfunc is not None: val = [self._reprfunc(index, j, v) for (j, v) in enumerate(val)] self._mlb.insert(index, val) self._mlb.delete(index + 1) self._restore_config_info(config_cookie) def __delitem__(self, row_index): """ Delete the ``row_index``th row from this table. """ if isinstance(row_index, slice): raise ValueError("Slicing not supported") if isinstance(row_index, tuple) and len(row_index) == 2: raise ValueError("Cannot delete a single cell!") del self._rows[row_index] self._mlb.delete(row_index) if self._DEBUG: self._check_table_vs_mlb() def __len__(self): """ :return: the number of rows in this table. """ return len(self._rows) def _checkrow(self, rowvalue): """ Helper function: check that a given row value has the correct number of elements; and if not, raise an exception. """ if len(rowvalue) != self._num_columns: raise ValueError( "Row %r has %d columns; expected %d" % (rowvalue, len(rowvalue), self._num_columns) ) # ///////////////////////////////////////////////////////////////// # Columns # ///////////////////////////////////////////////////////////////// @property def column_names(self): """A list of the names of the columns in this table.""" return self._mlb.column_names def column_index(self, i): """ If ``i`` is a valid column index integer, then return it as is. Otherwise, check if ``i`` is used as the name for any column; if so, return that column's index. Otherwise, raise a ``KeyError`` exception. """ if isinstance(i, int) and 0 <= i < self._num_columns: return i else: # This raises a key error if the column is not found. 
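# --- Illustrative aside (not part of nltk/draw/table.py) ---------------------
# Columns can be referenced by position or by name throughout the Table API;
# the sorting and hide/show methods defined just below operate on the display
# only and keep the underlying row data intact.
from tkinter import Tk

if __name__ == "__main__":
    root = Tk()
    table = Table(root, ["Word", "Count"],
                  rows=[["park", 3], ["dog", 7], ["saw", 5]])
    table.pack(expand=True, fill="both")
    print(table.column_index("Count"))      # 1
    table.sort_by("Count", order="descending")
    print(table[0])                         # ('dog', 7) once sorted
    table.hide_column("Count")              # data is kept, column is not drawn
    table.show_column(1)                    # indices work as well as names
    root.mainloop()
# -----------------------------------------------------------------------------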
return self._column_name_to_index[i] def hide_column(self, column_index): """:see: ``MultiListbox.hide_column()``""" self._mlb.hide_column(self.column_index(column_index)) def show_column(self, column_index): """:see: ``MultiListbox.show_column()``""" self._mlb.show_column(self.column_index(column_index)) # ///////////////////////////////////////////////////////////////// # Selection # ///////////////////////////////////////////////////////////////// def selected_row(self): """ Return the index of the currently selected row, or None if no row is selected. To get the row value itself, use ``table[table.selected_row()]``. """ sel = self._mlb.curselection() if sel: return int(sel[0]) else: return None def select(self, index=None, delta=None, see=True): """:see: ``MultiListbox.select()``""" self._mlb.select(index, delta, see) # ///////////////////////////////////////////////////////////////// # Sorting # ///////////////////////////////////////////////////////////////// def sort_by(self, column_index, order="toggle"): """ Sort the rows in this table, using the specified column's values as a sort key. :param column_index: Specifies which column to sort, using either a column index (int) or a column's label name (str). :param order: Specifies whether to sort the values in ascending or descending order: - ``'ascending'``: Sort from least to greatest. - ``'descending'``: Sort from greatest to least. - ``'toggle'``: If the most recent call to ``sort_by()`` sorted the table by the same column (``column_index``), then reverse the rows; otherwise sort in ascending order. """ if order not in ("ascending", "descending", "toggle"): raise ValueError( 'sort_by(): order should be "ascending", ' '"descending", or "toggle".' ) column_index = self.column_index(column_index) config_cookie = self._save_config_info(index_by_id=True) # Sort the rows. if order == "toggle" and column_index == self._sortkey: self._rows.reverse() else: self._rows.sort( key=operator.itemgetter(column_index), reverse=(order == "descending") ) self._sortkey = column_index # Redraw the table. self._fill_table() self._restore_config_info(config_cookie, index_by_id=True, see=True) if self._DEBUG: self._check_table_vs_mlb() def _sort(self, event): """Event handler for clicking on a column label -- sort by that column.""" column_index = event.widget.column_index # If they click on the far-left of far-right of a column's # label, then resize rather than sorting. if self._mlb._resize_column(event): return "continue" # Otherwise, sort. else: self.sort_by(column_index) return "continue" # ///////////////////////////////////////////////////////////////// # { Table Drawing Helpers # ///////////////////////////////////////////////////////////////// def _fill_table(self, save_config=True): """ Re-draw the table from scratch, by clearing out the table's multi-column listbox; and then filling it in with values from ``self._rows``. Note that any cell-, row-, or column-specific color configuration that has been done will be lost. The selection will also be lost -- i.e., no row will be selected after this call completes. 
""" self._mlb.delete(0, "end") for i, row in enumerate(self._rows): if self._reprfunc is not None: row = [self._reprfunc(i, j, v) for (j, v) in enumerate(row)] self._mlb.insert("end", row) def _get_itemconfig(self, r, c): return { k: self._mlb.itemconfig(r, c, k)[-1] for k in ( "foreground", "selectforeground", "background", "selectbackground", ) } def _save_config_info(self, row_indices=None, index_by_id=False): """ Return a 'cookie' containing information about which row is selected, and what color configurations have been applied. this information can the be re-applied to the table (after making modifications) using ``_restore_config_info()``. Color configuration information will be saved for any rows in ``row_indices``, or in the entire table, if ``row_indices=None``. If ``index_by_id=True``, the the cookie will associate rows with their configuration information based on the rows' python id. This is useful when performing operations that re-arrange the rows (e.g. ``sort``). If ``index_by_id=False``, then it is assumed that all rows will be in the same order when ``_restore_config_info()`` is called. """ # Default value for row_indices is all rows. if row_indices is None: row_indices = list(range(len(self._rows))) # Look up our current selection. selection = self.selected_row() if index_by_id and selection is not None: selection = id(self._rows[selection]) # Look up the color configuration info for each row. if index_by_id: config = { id(self._rows[r]): [ self._get_itemconfig(r, c) for c in range(self._num_columns) ] for r in row_indices } else: config = { r: [self._get_itemconfig(r, c) for c in range(self._num_columns)] for r in row_indices } return selection, config def _restore_config_info(self, cookie, index_by_id=False, see=False): """ Restore selection & color configuration information that was saved using ``_save_config_info``. """ selection, config = cookie # Clear the selection. if selection is None: self._mlb.selection_clear(0, "end") # Restore selection & color config if index_by_id: for r, row in enumerate(self._rows): if id(row) in config: for c in range(self._num_columns): self._mlb.itemconfigure(r, c, config[id(row)][c]) if id(row) == selection: self._mlb.select(r, see=see) else: if selection is not None: self._mlb.select(selection, see=see) for r in config: for c in range(self._num_columns): self._mlb.itemconfigure(r, c, config[r][c]) # ///////////////////////////////////////////////////////////////// # Debugging (Invariant Checker) # ///////////////////////////////////////////////////////////////// _DEBUG = False """If true, then run ``_check_table_vs_mlb()`` after any operation that modifies the table.""" def _check_table_vs_mlb(self): """ Verify that the contents of the table's ``_rows`` variable match the contents of its multi-listbox (``_mlb``). This is just included for debugging purposes, to make sure that the list-modifying operations are working correctly. 
""" for col in self._mlb.listboxes: assert len(self) == col.size() for row in self: assert len(row) == self._num_columns assert self._num_columns == len(self._mlb.column_names) # assert self._column_names == self._mlb.column_names for i, row in enumerate(self): for j, cell in enumerate(row): if self._reprfunc is not None: cell = self._reprfunc(i, j, cell) assert self._mlb.get(i)[j] == cell ###################################################################### # Demo/Test Function ###################################################################### # update this to use new WordNet API def demo(): root = Tk() root.bind("", lambda e: root.destroy()) table = Table( root, "Word Synset Hypernym Hyponym".split(), column_weights=[0, 1, 1, 1], reprfunc=(lambda i, j, s: " %s" % s), ) table.pack(expand=True, fill="both") from nltk.corpus import brown, wordnet for word, pos in sorted(set(brown.tagged_words()[:500])): if pos[0] != "N": continue word = word.lower() for synset in wordnet.synsets(word): try: hyper_def = synset.hypernyms()[0].definition() except: hyper_def = "*none*" try: hypo_def = synset.hypernyms()[0].definition() except: hypo_def = "*none*" table.append([word, synset.definition(), hyper_def, hypo_def]) table.columnconfig("Word", background="#afa") table.columnconfig("Synset", background="#efe") table.columnconfig("Hypernym", background="#fee") table.columnconfig("Hyponym", background="#ffe") for row in range(len(table)): for column in ("Hypernym", "Hyponym"): if table[row, column] == "*none*": table.itemconfig( row, column, foreground="#666", selectforeground="#666" ) root.mainloop() if __name__ == "__main__": demo() nltk-3.7/nltk/draw/tree.py000066400000000000000000001124021420073152400155470ustar00rootroot00000000000000# Natural Language Toolkit: Graphical Representations for Trees # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # URL: # For license information, see LICENSE.TXT """ Graphically display a Tree. """ from tkinter import IntVar, Menu, Tk from nltk.draw.util import ( BoxWidget, CanvasFrame, CanvasWidget, OvalWidget, ParenWidget, TextWidget, ) from nltk.tree import Tree from nltk.util import in_idle ##////////////////////////////////////////////////////// ## Tree Segment ##////////////////////////////////////////////////////// class TreeSegmentWidget(CanvasWidget): """ A canvas widget that displays a single segment of a hierarchical tree. Each ``TreeSegmentWidget`` connects a single "node widget" to a sequence of zero or more "subtree widgets". By default, the bottom of the node is connected to the top of each subtree by a single line. However, if the ``roof`` attribute is set, then a single triangular "roof" will connect the node to all of its children. Attributes: - ``roof``: What sort of connection to draw between the node and its subtrees. If ``roof`` is true, draw a single triangular "roof" over the subtrees. If ``roof`` is false, draw a line between each subtree and the node. Default value is false. - ``xspace``: The amount of horizontal space to leave between subtrees when managing this widget. Default value is 10. - ``yspace``: The amount of space to place between the node and its children when managing this widget. Default value is 15. - ``color``: The color of the lines connecting the node to its subtrees; and of the outline of the triangular roof. Default value is ``'#006060'``. - ``fill``: The fill color for the triangular roof. Default value is ``''`` (no fill). 
- ``width``: The width of the lines connecting the node to its subtrees; and of the outline of the triangular roof. Default value is 1. - ``orientation``: Determines whether the tree branches downwards or rightwards. Possible values are ``'horizontal'`` and ``'vertical'``. The default value is ``'vertical'`` (i.e., branch downwards). - ``draggable``: whether the widget can be dragged by the user. """ def __init__(self, canvas, label, subtrees, **attribs): """ :type node: :type subtrees: list(CanvasWidgetI) """ self._label = label self._subtrees = subtrees # Attributes self._horizontal = 0 self._roof = 0 self._xspace = 10 self._yspace = 15 self._ordered = False # Create canvas objects. self._lines = [canvas.create_line(0, 0, 0, 0, fill="#006060") for c in subtrees] self._polygon = canvas.create_polygon( 0, 0, fill="", state="hidden", outline="#006060" ) # Register child widgets (label + subtrees) self._add_child_widget(label) for subtree in subtrees: self._add_child_widget(subtree) # Are we currently managing? self._managing = False CanvasWidget.__init__(self, canvas, **attribs) def __setitem__(self, attr, value): canvas = self.canvas() if attr == "roof": self._roof = value if self._roof: for l in self._lines: canvas.itemconfig(l, state="hidden") canvas.itemconfig(self._polygon, state="normal") else: for l in self._lines: canvas.itemconfig(l, state="normal") canvas.itemconfig(self._polygon, state="hidden") elif attr == "orientation": if value == "horizontal": self._horizontal = 1 elif value == "vertical": self._horizontal = 0 else: raise ValueError("orientation must be horizontal or vertical") elif attr == "color": for l in self._lines: canvas.itemconfig(l, fill=value) canvas.itemconfig(self._polygon, outline=value) elif isinstance(attr, tuple) and attr[0] == "color": # Set the color of an individual line. l = self._lines[int(attr[1])] canvas.itemconfig(l, fill=value) elif attr == "fill": canvas.itemconfig(self._polygon, fill=value) elif attr == "width": canvas.itemconfig(self._polygon, {attr: value}) for l in self._lines: canvas.itemconfig(l, {attr: value}) elif attr in ("xspace", "yspace"): if attr == "xspace": self._xspace = value elif attr == "yspace": self._yspace = value self.update(self._label) elif attr == "ordered": self._ordered = value else: CanvasWidget.__setitem__(self, attr, value) def __getitem__(self, attr): if attr == "roof": return self._roof elif attr == "width": return self.canvas().itemcget(self._polygon, attr) elif attr == "color": return self.canvas().itemcget(self._polygon, "outline") elif isinstance(attr, tuple) and attr[0] == "color": l = self._lines[int(attr[1])] return self.canvas().itemcget(l, "fill") elif attr == "xspace": return self._xspace elif attr == "yspace": return self._yspace elif attr == "orientation": if self._horizontal: return "horizontal" else: return "vertical" elif attr == "ordered": return self._ordered else: return CanvasWidget.__getitem__(self, attr) def label(self): return self._label def subtrees(self): return self._subtrees[:] def set_label(self, label): """ Set the node label to ``label``. """ self._remove_child_widget(self._label) self._add_child_widget(label) self._label = label self.update(self._label) def replace_child(self, oldchild, newchild): """ Replace the child ``oldchild`` with ``newchild``. 
""" index = self._subtrees.index(oldchild) self._subtrees[index] = newchild self._remove_child_widget(oldchild) self._add_child_widget(newchild) self.update(newchild) def remove_child(self, child): index = self._subtrees.index(child) del self._subtrees[index] self._remove_child_widget(child) self.canvas().delete(self._lines.pop()) self.update(self._label) def insert_child(self, index, child): canvas = self.canvas() self._subtrees.insert(index, child) self._add_child_widget(child) self._lines.append(canvas.create_line(0, 0, 0, 0, fill="#006060")) self.update(self._label) # but.. lines??? def _tags(self): if self._roof: return [self._polygon] else: return self._lines def _subtree_top(self, child): if isinstance(child, TreeSegmentWidget): bbox = child.label().bbox() else: bbox = child.bbox() if self._horizontal: return (bbox[0], (bbox[1] + bbox[3]) / 2.0) else: return ((bbox[0] + bbox[2]) / 2.0, bbox[1]) def _node_bottom(self): bbox = self._label.bbox() if self._horizontal: return (bbox[2], (bbox[1] + bbox[3]) / 2.0) else: return ((bbox[0] + bbox[2]) / 2.0, bbox[3]) def _update(self, child): if len(self._subtrees) == 0: return if self._label.bbox() is None: return # [XX] ??? # Which lines need to be redrawn? if child is self._label: need_update = self._subtrees else: need_update = [child] if self._ordered and not self._managing: need_update = self._maintain_order(child) # Update the polygon. (nodex, nodey) = self._node_bottom() (xmin, ymin, xmax, ymax) = self._subtrees[0].bbox() for subtree in self._subtrees[1:]: bbox = subtree.bbox() xmin = min(xmin, bbox[0]) ymin = min(ymin, bbox[1]) xmax = max(xmax, bbox[2]) ymax = max(ymax, bbox[3]) if self._horizontal: self.canvas().coords( self._polygon, nodex, nodey, xmin, ymin, xmin, ymax, nodex, nodey ) else: self.canvas().coords( self._polygon, nodex, nodey, xmin, ymin, xmax, ymin, nodex, nodey ) # Redraw all lines that need it. for subtree in need_update: (nodex, nodey) = self._node_bottom() line = self._lines[self._subtrees.index(subtree)] (subtreex, subtreey) = self._subtree_top(subtree) self.canvas().coords(line, nodex, nodey, subtreex, subtreey) def _maintain_order(self, child): if self._horizontal: return self._maintain_order_horizontal(child) else: return self._maintain_order_vertical(child) def _maintain_order_vertical(self, child): (left, top, right, bot) = child.bbox() if child is self._label: # Check all the leaves for subtree in self._subtrees: (x1, y1, x2, y2) = subtree.bbox() if bot + self._yspace > y1: subtree.move(0, bot + self._yspace - y1) return self._subtrees else: moved = [child] index = self._subtrees.index(child) # Check leaves to our right. x = right + self._xspace for i in range(index + 1, len(self._subtrees)): (x1, y1, x2, y2) = self._subtrees[i].bbox() if x > x1: self._subtrees[i].move(x - x1, 0) x += x2 - x1 + self._xspace moved.append(self._subtrees[i]) # Check leaves to our left. 
x = left - self._xspace for i in range(index - 1, -1, -1): (x1, y1, x2, y2) = self._subtrees[i].bbox() if x < x2: self._subtrees[i].move(x - x2, 0) x -= x2 - x1 + self._xspace moved.append(self._subtrees[i]) # Check the node (x1, y1, x2, y2) = self._label.bbox() if y2 > top - self._yspace: self._label.move(0, top - self._yspace - y2) moved = self._subtrees # Return a list of the nodes we moved return moved def _maintain_order_horizontal(self, child): (left, top, right, bot) = child.bbox() if child is self._label: # Check all the leaves for subtree in self._subtrees: (x1, y1, x2, y2) = subtree.bbox() if right + self._xspace > x1: subtree.move(right + self._xspace - x1) return self._subtrees else: moved = [child] index = self._subtrees.index(child) # Check leaves below us. y = bot + self._yspace for i in range(index + 1, len(self._subtrees)): (x1, y1, x2, y2) = self._subtrees[i].bbox() if y > y1: self._subtrees[i].move(0, y - y1) y += y2 - y1 + self._yspace moved.append(self._subtrees[i]) # Check leaves above us y = top - self._yspace for i in range(index - 1, -1, -1): (x1, y1, x2, y2) = self._subtrees[i].bbox() if y < y2: self._subtrees[i].move(0, y - y2) y -= y2 - y1 + self._yspace moved.append(self._subtrees[i]) # Check the node (x1, y1, x2, y2) = self._label.bbox() if x2 > left - self._xspace: self._label.move(left - self._xspace - x2, 0) moved = self._subtrees # Return a list of the nodes we moved return moved def _manage_horizontal(self): (nodex, nodey) = self._node_bottom() # Put the subtrees in a line. y = 20 for subtree in self._subtrees: subtree_bbox = subtree.bbox() dx = nodex - subtree_bbox[0] + self._xspace dy = y - subtree_bbox[1] subtree.move(dx, dy) y += subtree_bbox[3] - subtree_bbox[1] + self._yspace # Find the center of their tops. center = 0.0 for subtree in self._subtrees: center += self._subtree_top(subtree)[1] center /= len(self._subtrees) # Center the subtrees with the node. for subtree in self._subtrees: subtree.move(0, nodey - center) def _manage_vertical(self): (nodex, nodey) = self._node_bottom() # Put the subtrees in a line. x = 0 for subtree in self._subtrees: subtree_bbox = subtree.bbox() dy = nodey - subtree_bbox[1] + self._yspace dx = x - subtree_bbox[0] subtree.move(dx, dy) x += subtree_bbox[2] - subtree_bbox[0] + self._xspace # Find the center of their tops. center = 0.0 for subtree in self._subtrees: center += self._subtree_top(subtree)[0] / len(self._subtrees) # Center the subtrees with the node. for subtree in self._subtrees: subtree.move(nodex - center, 0) def _manage(self): self._managing = True (nodex, nodey) = self._node_bottom() if len(self._subtrees) == 0: return if self._horizontal: self._manage_horizontal() else: self._manage_vertical() # Update lines to subtrees. for subtree in self._subtrees: self._update(subtree) self._managing = False def __repr__(self): return f"[TreeSeg {self._label}: {self._subtrees}]" def _tree_to_treeseg( canvas, t, make_node, make_leaf, tree_attribs, node_attribs, leaf_attribs, loc_attribs, ): if isinstance(t, Tree): label = make_node(canvas, t.label(), **node_attribs) subtrees = [ _tree_to_treeseg( canvas, child, make_node, make_leaf, tree_attribs, node_attribs, leaf_attribs, loc_attribs, ) for child in t ] return TreeSegmentWidget(canvas, label, subtrees, **tree_attribs) else: return make_leaf(canvas, t, **leaf_attribs) def tree_to_treesegment( canvas, t, make_node=TextWidget, make_leaf=TextWidget, **attribs ): """ Convert a Tree into a ``TreeSegmentWidget``. 
:param make_node: A ``CanvasWidget`` constructor or a function that creates ``CanvasWidgets``. ``make_node`` is used to convert the Tree's nodes into ``CanvasWidgets``. If no constructor is specified, then ``TextWidget`` will be used. :param make_leaf: A ``CanvasWidget`` constructor or a function that creates ``CanvasWidgets``. ``make_leaf`` is used to convert the Tree's leafs into ``CanvasWidgets``. If no constructor is specified, then ``TextWidget`` will be used. :param attribs: Attributes for the canvas widgets that make up the returned ``TreeSegmentWidget``. Any attribute beginning with ``'tree_'`` will be passed to all ``TreeSegmentWidgets`` (with the ``'tree_'`` prefix removed. Any attribute beginning with ``'node_'`` will be passed to all nodes. Any attribute beginning with ``'leaf_'`` will be passed to all leaves. And any attribute beginning with ``'loc_'`` will be passed to all text locations (for Trees). """ # Process attribs. tree_attribs = {} node_attribs = {} leaf_attribs = {} loc_attribs = {} for (key, value) in list(attribs.items()): if key[:5] == "tree_": tree_attribs[key[5:]] = value elif key[:5] == "node_": node_attribs[key[5:]] = value elif key[:5] == "leaf_": leaf_attribs[key[5:]] = value elif key[:4] == "loc_": loc_attribs[key[4:]] = value else: raise ValueError("Bad attribute: %s" % key) return _tree_to_treeseg( canvas, t, make_node, make_leaf, tree_attribs, node_attribs, leaf_attribs, loc_attribs, ) ##////////////////////////////////////////////////////// ## Tree Widget ##////////////////////////////////////////////////////// class TreeWidget(CanvasWidget): """ A canvas widget that displays a single Tree. ``TreeWidget`` manages a group of ``TreeSegmentWidgets`` that are used to display a Tree. Attributes: - ``node_attr``: Sets the attribute ``attr`` on all of the node widgets for this ``TreeWidget``. - ``node_attr``: Sets the attribute ``attr`` on all of the leaf widgets for this ``TreeWidget``. - ``loc_attr``: Sets the attribute ``attr`` on all of the location widgets for this ``TreeWidget`` (if it was built from a Tree). Note that a location widget is a ``TextWidget``. - ``xspace``: The amount of horizontal space to leave between subtrees when managing this widget. Default value is 10. - ``yspace``: The amount of space to place between the node and its children when managing this widget. Default value is 15. - ``line_color``: The color of the lines connecting each expanded node to its subtrees. - ``roof_color``: The color of the outline of the triangular roof for collapsed trees. - ``roof_fill``: The fill color for the triangular roof for collapsed trees. - ``width`` - ``orientation``: Determines whether the tree branches downwards or rightwards. Possible values are ``'horizontal'`` and ``'vertical'``. The default value is ``'vertical'`` (i.e., branch downwards). - ``shapeable``: whether the subtrees can be independently dragged by the user. THIS property simply sets the ``DRAGGABLE`` property on all of the ``TreeWidget``'s tree segments. - ``draggable``: whether the widget can be dragged by the user. """ def __init__( self, canvas, t, make_node=TextWidget, make_leaf=TextWidget, **attribs ): # Node & leaf canvas widget constructors self._make_node = make_node self._make_leaf = make_leaf self._tree = t # Attributes. 
self._nodeattribs = {} self._leafattribs = {} self._locattribs = {"color": "#008000"} self._line_color = "#008080" self._line_width = 1 self._roof_color = "#008080" self._roof_fill = "#c0c0c0" self._shapeable = False self._xspace = 10 self._yspace = 10 self._orientation = "vertical" self._ordered = False # Build trees. self._keys = {} # treeseg -> key self._expanded_trees = {} self._collapsed_trees = {} self._nodes = [] self._leaves = [] # self._locs = [] self._make_collapsed_trees(canvas, t, ()) self._treeseg = self._make_expanded_tree(canvas, t, ()) self._add_child_widget(self._treeseg) CanvasWidget.__init__(self, canvas, **attribs) def expanded_tree(self, *path_to_tree): """ Return the ``TreeSegmentWidget`` for the specified subtree. :param path_to_tree: A list of indices i1, i2, ..., in, where the desired widget is the widget corresponding to ``tree.children()[i1].children()[i2]....children()[in]``. For the root, the path is ``()``. """ return self._expanded_trees[path_to_tree] def collapsed_tree(self, *path_to_tree): """ Return the ``TreeSegmentWidget`` for the specified subtree. :param path_to_tree: A list of indices i1, i2, ..., in, where the desired widget is the widget corresponding to ``tree.children()[i1].children()[i2]....children()[in]``. For the root, the path is ``()``. """ return self._collapsed_trees[path_to_tree] def bind_click_trees(self, callback, button=1): """ Add a binding to all tree segments. """ for tseg in list(self._expanded_trees.values()): tseg.bind_click(callback, button) for tseg in list(self._collapsed_trees.values()): tseg.bind_click(callback, button) def bind_drag_trees(self, callback, button=1): """ Add a binding to all tree segments. """ for tseg in list(self._expanded_trees.values()): tseg.bind_drag(callback, button) for tseg in list(self._collapsed_trees.values()): tseg.bind_drag(callback, button) def bind_click_leaves(self, callback, button=1): """ Add a binding to all leaves. """ for leaf in self._leaves: leaf.bind_click(callback, button) for leaf in self._leaves: leaf.bind_click(callback, button) def bind_drag_leaves(self, callback, button=1): """ Add a binding to all leaves. """ for leaf in self._leaves: leaf.bind_drag(callback, button) for leaf in self._leaves: leaf.bind_drag(callback, button) def bind_click_nodes(self, callback, button=1): """ Add a binding to all nodes. """ for node in self._nodes: node.bind_click(callback, button) for node in self._nodes: node.bind_click(callback, button) def bind_drag_nodes(self, callback, button=1): """ Add a binding to all nodes. """ for node in self._nodes: node.bind_drag(callback, button) for node in self._nodes: node.bind_drag(callback, button) def _make_collapsed_trees(self, canvas, t, key): if not isinstance(t, Tree): return make_node = self._make_node make_leaf = self._make_leaf node = make_node(canvas, t.label(), **self._nodeattribs) self._nodes.append(node) leaves = [make_leaf(canvas, l, **self._leafattribs) for l in t.leaves()] self._leaves += leaves treeseg = TreeSegmentWidget( canvas, node, leaves, roof=1, color=self._roof_color, fill=self._roof_fill, width=self._line_width, ) self._collapsed_trees[key] = treeseg self._keys[treeseg] = key # self._add_child_widget(treeseg) treeseg.hide() # Build trees for children. 
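# --- Illustrative aside (not part of nltk/draw/tree.py) ----------------------
# A minimal sketch of putting a TreeWidget on a CanvasFrame by hand, using the
# classes imported at the top of this module (Tk, CanvasFrame, Tree).  Clicking
# a node toggles between the expanded segment and its collapsed "roof" form.
if __name__ == "__main__":
    t = Tree.fromstring("(S (NP I) (VP (V saw) (NP (Det a) (N dog))))")
    top = Tk()
    cf = CanvasFrame(top, background="white")
    cf.pack(expand=1, fill="both")
    widget = TreeWidget(cf.canvas(), t, draggable=1,
                        node_color="#004080", leaf_color="#008040")
    widget.bind_click_trees(widget.toggle_collapsed)
    cf.add_widget(widget, 10, 10)
    top.mainloop()
# -----------------------------------------------------------------------------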
for i in range(len(t)): child = t[i] self._make_collapsed_trees(canvas, child, key + (i,)) def _make_expanded_tree(self, canvas, t, key): make_node = self._make_node make_leaf = self._make_leaf if isinstance(t, Tree): node = make_node(canvas, t.label(), **self._nodeattribs) self._nodes.append(node) children = t subtrees = [ self._make_expanded_tree(canvas, children[i], key + (i,)) for i in range(len(children)) ] treeseg = TreeSegmentWidget( canvas, node, subtrees, color=self._line_color, width=self._line_width ) self._expanded_trees[key] = treeseg self._keys[treeseg] = key return treeseg else: leaf = make_leaf(canvas, t, **self._leafattribs) self._leaves.append(leaf) return leaf def __setitem__(self, attr, value): if attr[:5] == "node_": for node in self._nodes: node[attr[5:]] = value elif attr[:5] == "leaf_": for leaf in self._leaves: leaf[attr[5:]] = value elif attr == "line_color": self._line_color = value for tseg in list(self._expanded_trees.values()): tseg["color"] = value elif attr == "line_width": self._line_width = value for tseg in list(self._expanded_trees.values()): tseg["width"] = value for tseg in list(self._collapsed_trees.values()): tseg["width"] = value elif attr == "roof_color": self._roof_color = value for tseg in list(self._collapsed_trees.values()): tseg["color"] = value elif attr == "roof_fill": self._roof_fill = value for tseg in list(self._collapsed_trees.values()): tseg["fill"] = value elif attr == "shapeable": self._shapeable = value for tseg in list(self._expanded_trees.values()): tseg["draggable"] = value for tseg in list(self._collapsed_trees.values()): tseg["draggable"] = value for leaf in self._leaves: leaf["draggable"] = value elif attr == "xspace": self._xspace = value for tseg in list(self._expanded_trees.values()): tseg["xspace"] = value for tseg in list(self._collapsed_trees.values()): tseg["xspace"] = value self.manage() elif attr == "yspace": self._yspace = value for tseg in list(self._expanded_trees.values()): tseg["yspace"] = value for tseg in list(self._collapsed_trees.values()): tseg["yspace"] = value self.manage() elif attr == "orientation": self._orientation = value for tseg in list(self._expanded_trees.values()): tseg["orientation"] = value for tseg in list(self._collapsed_trees.values()): tseg["orientation"] = value self.manage() elif attr == "ordered": self._ordered = value for tseg in list(self._expanded_trees.values()): tseg["ordered"] = value for tseg in list(self._collapsed_trees.values()): tseg["ordered"] = value else: CanvasWidget.__setitem__(self, attr, value) def __getitem__(self, attr): if attr[:5] == "node_": return self._nodeattribs.get(attr[5:], None) elif attr[:5] == "leaf_": return self._leafattribs.get(attr[5:], None) elif attr[:4] == "loc_": return self._locattribs.get(attr[4:], None) elif attr == "line_color": return self._line_color elif attr == "line_width": return self._line_width elif attr == "roof_color": return self._roof_color elif attr == "roof_fill": return self._roof_fill elif attr == "shapeable": return self._shapeable elif attr == "xspace": return self._xspace elif attr == "yspace": return self._yspace elif attr == "orientation": return self._orientation else: return CanvasWidget.__getitem__(self, attr) def _tags(self): return [] def _manage(self): segs = list(self._expanded_trees.values()) + list( self._collapsed_trees.values() ) for tseg in segs: if tseg.hidden(): tseg.show() tseg.manage() tseg.hide() def toggle_collapsed(self, treeseg): """ Collapse/expand a tree. 
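# ----------------------------------------------------------------------
# Illustrative sketch (not part of the original module): changing an
# existing TreeWidget's appearance through ``__setitem__``.  The
# ``line_*``/``roof_*``/spacing keys fan out to the tree segments, while
# ``node_*``/``leaf_*`` keys fan out to the node and leaf widgets.  The
# attribute values below are hypothetical.
def _example_treewidget_attributes(widget):
    # ``widget`` is assumed to be an already-constructed TreeWidget.
    widget["line_color"] = "gray40"       # connector lines of expanded segments
    widget["roof_color"] = "gray40"       # outline of collapsed roofs
    widget["roof_fill"] = "lightyellow"   # fill of collapsed roofs
    widget["node_color"] = "blue4"        # 'color' on every node widget
    widget["leaf_color"] = "green4"       # 'color' on every leaf widget
    widget["xspace"] = 20                 # horizontal gap; triggers a re-layout
    widget["orientation"] = "horizontal"  # branch rightwards instead of down
    widget["shapeable"] = True            # let the user drag individual subtrees
# ----------------------------------------------------------------------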
""" old_treeseg = treeseg if old_treeseg["roof"]: new_treeseg = self._expanded_trees[self._keys[old_treeseg]] else: new_treeseg = self._collapsed_trees[self._keys[old_treeseg]] # Replace the old tree with the new tree. if old_treeseg.parent() is self: self._remove_child_widget(old_treeseg) self._add_child_widget(new_treeseg) self._treeseg = new_treeseg else: old_treeseg.parent().replace_child(old_treeseg, new_treeseg) # Move the new tree to where the old tree was. Show it first, # so we can find its bounding box. new_treeseg.show() (newx, newy) = new_treeseg.label().bbox()[:2] (oldx, oldy) = old_treeseg.label().bbox()[:2] new_treeseg.move(oldx - newx, oldy - newy) # Hide the old tree old_treeseg.hide() # We could do parent.manage() here instead, if we wanted. new_treeseg.parent().update(new_treeseg) ##////////////////////////////////////////////////////// ## draw_trees ##////////////////////////////////////////////////////// class TreeView: def __init__(self, *trees): from math import ceil, sqrt self._trees = trees self._top = Tk() self._top.title("NLTK") self._top.bind("", self.destroy) self._top.bind("", self.destroy) cf = self._cframe = CanvasFrame(self._top) self._top.bind("", self._cframe.print_to_file) # Size is variable. self._size = IntVar(self._top) self._size.set(12) bold = ("helvetica", -self._size.get(), "bold") helv = ("helvetica", -self._size.get()) # Lay the trees out in a square. self._width = int(ceil(sqrt(len(trees)))) self._widgets = [] for i in range(len(trees)): widget = TreeWidget( cf.canvas(), trees[i], node_font=bold, leaf_color="#008040", node_color="#004080", roof_color="#004040", roof_fill="white", line_color="#004040", draggable=1, leaf_font=helv, ) widget.bind_click_trees(widget.toggle_collapsed) self._widgets.append(widget) cf.add_widget(widget, 0, 0) self._layout() self._cframe.pack(expand=1, fill="both") self._init_menubar() def _layout(self): i = x = y = ymax = 0 width = self._width for i in range(len(self._widgets)): widget = self._widgets[i] (oldx, oldy) = widget.bbox()[:2] if i % width == 0: y = ymax x = 0 widget.move(x - oldx, y - oldy) x = widget.bbox()[2] + 10 ymax = max(ymax, widget.bbox()[3] + 10) def _init_menubar(self): menubar = Menu(self._top) filemenu = Menu(menubar, tearoff=0) filemenu.add_command( label="Print to Postscript", underline=0, command=self._cframe.print_to_file, accelerator="Ctrl-p", ) filemenu.add_command( label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x" ) menubar.add_cascade(label="File", underline=0, menu=filemenu) zoommenu = Menu(menubar, tearoff=0) zoommenu.add_radiobutton( label="Tiny", variable=self._size, underline=0, value=10, command=self.resize, ) zoommenu.add_radiobutton( label="Small", variable=self._size, underline=0, value=12, command=self.resize, ) zoommenu.add_radiobutton( label="Medium", variable=self._size, underline=0, value=14, command=self.resize, ) zoommenu.add_radiobutton( label="Large", variable=self._size, underline=0, value=28, command=self.resize, ) zoommenu.add_radiobutton( label="Huge", variable=self._size, underline=0, value=50, command=self.resize, ) menubar.add_cascade(label="Zoom", underline=0, menu=zoommenu) self._top.config(menu=menubar) def resize(self, *e): bold = ("helvetica", -self._size.get(), "bold") helv = ("helvetica", -self._size.get()) xspace = self._size.get() yspace = self._size.get() for widget in self._widgets: widget["node_font"] = bold widget["leaf_font"] = helv widget["xspace"] = xspace widget["yspace"] = yspace if self._size.get() < 20: widget["line_width"] 
= 1 elif self._size.get() < 30: widget["line_width"] = 2 else: widget["line_width"] = 3 self._layout() def destroy(self, *e): if self._top is None: return self._top.destroy() self._top = None def mainloop(self, *args, **kwargs): """ Enter the Tkinter mainloop. This function must be called if this demo is created from a non-interactive program (e.g. from a secript); otherwise, the demo will close as soon as the script completes. """ if in_idle(): return self._top.mainloop(*args, **kwargs) def draw_trees(*trees): """ Open a new window containing a graphical diagram of the given trees. :rtype: None """ TreeView(*trees).mainloop() return ##////////////////////////////////////////////////////// ## Demo Code ##////////////////////////////////////////////////////// def demo(): import random def fill(cw): cw["fill"] = "#%06d" % random.randint(0, 999999) cf = CanvasFrame(width=550, height=450, closeenough=2) t = Tree.fromstring( """ (S (NP the very big cat) (VP (Adv sorta) (V saw) (NP (Det the) (N dog))))""" ) tc = TreeWidget( cf.canvas(), t, draggable=1, node_font=("helvetica", -14, "bold"), leaf_font=("helvetica", -12, "italic"), roof_fill="white", roof_color="black", leaf_color="green4", node_color="blue2", ) cf.add_widget(tc, 10, 10) def boxit(canvas, text): big = ("helvetica", -16, "bold") return BoxWidget(canvas, TextWidget(canvas, text, font=big), fill="green") def ovalit(canvas, text): return OvalWidget(canvas, TextWidget(canvas, text), fill="cyan") treetok = Tree.fromstring("(S (NP this tree) (VP (V is) (AdjP shapeable)))") tc2 = TreeWidget(cf.canvas(), treetok, boxit, ovalit, shapeable=1) def color(node): node["color"] = "#%04d00" % random.randint(0, 9999) def color2(treeseg): treeseg.label()["fill"] = "#%06d" % random.randint(0, 9999) treeseg.label().child()["color"] = "white" tc.bind_click_trees(tc.toggle_collapsed) tc2.bind_click_trees(tc2.toggle_collapsed) tc.bind_click_nodes(color, 3) tc2.expanded_tree(1).bind_click(color2, 3) tc2.expanded_tree().bind_click(color2, 3) paren = ParenWidget(cf.canvas(), tc2) cf.add_widget(paren, tc.bbox()[2] + 10, 10) tree3 = Tree.fromstring( """ (S (NP this tree) (AUX was) (VP (V built) (PP (P with) (NP (N tree_to_treesegment)))))""" ) tc3 = tree_to_treesegment( cf.canvas(), tree3, tree_color="green4", tree_xspace=2, tree_width=2 ) tc3["draggable"] = 1 cf.add_widget(tc3, 10, tc.bbox()[3] + 10) def orientswitch(treewidget): if treewidget["orientation"] == "horizontal": treewidget.expanded_tree(1, 1).subtrees()[0].set_text("vertical") treewidget.collapsed_tree(1, 1).subtrees()[0].set_text("vertical") treewidget.collapsed_tree(1).subtrees()[1].set_text("vertical") treewidget.collapsed_tree().subtrees()[3].set_text("vertical") treewidget["orientation"] = "vertical" else: treewidget.expanded_tree(1, 1).subtrees()[0].set_text("horizontal") treewidget.collapsed_tree(1, 1).subtrees()[0].set_text("horizontal") treewidget.collapsed_tree(1).subtrees()[1].set_text("horizontal") treewidget.collapsed_tree().subtrees()[3].set_text("horizontal") treewidget["orientation"] = "horizontal" text = """ Try clicking, right clicking, and dragging different elements of each of the trees. The top-left tree is a TreeWidget built from a Tree. The top-right is a TreeWidget built from a Tree, using non-default widget constructors for the nodes & leaves (BoxWidget and OvalWidget). 
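# ----------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the simplest
# entry point is ``draw_trees``, which lays the given trees out in a grid
# inside a TreeView window (File/Zoom menus, Ctrl-p to print).  Assumes a
# Tk display; the trees are hypothetical.
def _example_draw_trees():
    from nltk.tree import Tree

    t1 = Tree.fromstring("(S (NP I) (VP (V saw) (NP him)))")
    t2 = Tree.fromstring("(S (NP you) (VP (V heard) (NP her)))")
    draw_trees(t1, t2)  # blocks in the Tk mainloop until the window closes
# ----------------------------------------------------------------------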
The bottom-left tree is built from tree_to_treesegment.""" twidget = TextWidget(cf.canvas(), text.strip()) textbox = BoxWidget(cf.canvas(), twidget, fill="white", draggable=1) cf.add_widget(textbox, tc3.bbox()[2] + 10, tc2.bbox()[3] + 10) tree4 = Tree.fromstring("(S (NP this tree) (VP (V is) (Adj horizontal)))") tc4 = TreeWidget( cf.canvas(), tree4, draggable=1, line_color="brown2", roof_color="brown2", node_font=("helvetica", -12, "bold"), node_color="brown4", orientation="horizontal", ) tc4.manage() cf.add_widget(tc4, tc3.bbox()[2] + 10, textbox.bbox()[3] + 10) tc4.bind_click(orientswitch) tc4.bind_click_trees(tc4.toggle_collapsed, 3) # Run mainloop cf.mainloop() if __name__ == "__main__": demo() nltk-3.7/nltk/draw/util.py000066400000000000000000002543661420073152400156050ustar00rootroot00000000000000# Natural Language Toolkit: Drawing utilities # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # URL: # For license information, see LICENSE.TXT """ Tools for graphically displaying and interacting with the objects and processing classes defined by the Toolkit. These tools are primarily intended to help students visualize the objects that they create. The graphical tools are typically built using "canvas widgets", each of which encapsulates the graphical elements and bindings used to display a complex object on a Tkinter ``Canvas``. For example, NLTK defines canvas widgets for displaying trees and directed graphs, as well as a number of simpler widgets. These canvas widgets make it easier to build new graphical tools and demos. See the class documentation for ``CanvasWidget`` for more information. The ``nltk.draw`` module defines the abstract ``CanvasWidget`` base class, and a number of simple canvas widgets. The remaining canvas widgets are defined by submodules, such as ``nltk.draw.tree``. The ``nltk.draw`` module also defines ``CanvasFrame``, which encapsulates a ``Canvas`` and its scrollbars. It uses a ``ScrollWatcherWidget`` to ensure that all canvas widgets contained on its canvas are within the scroll region. Acknowledgements: Many of the ideas behind the canvas widget system are derived from ``CLIG``, a Tk-based grapher for linguistic data structures. For more information, see the CLIG homepage (http://www.ags.uni-sb.de/~konrad/clig.html). """ from abc import ABCMeta, abstractmethod from tkinter import ( RAISED, Button, Canvas, Entry, Frame, Label, Menu, Menubutton, Scrollbar, StringVar, Text, Tk, Toplevel, Widget, ) from tkinter.filedialog import asksaveasfilename from nltk.util import in_idle ##////////////////////////////////////////////////////// ## CanvasWidget ##////////////////////////////////////////////////////// class CanvasWidget(metaclass=ABCMeta): """ A collection of graphical elements and bindings used to display a complex object on a Tkinter ``Canvas``. A canvas widget is responsible for managing the ``Canvas`` tags and callback bindings necessary to display and interact with the object. Canvas widgets are often organized into hierarchies, where parent canvas widgets control aspects of their child widgets. Each canvas widget is bound to a single ``Canvas``. This ``Canvas`` is specified as the first argument to the ``CanvasWidget``'s constructor. Attributes. Each canvas widget can support a variety of "attributes", which control how the canvas widget is displayed. Some typical examples attributes are ``color``, ``font``, and ``radius``. Each attribute has a default value. 
This default value can be overridden in the constructor, using keyword arguments of the form ``attribute=value``: >>> from nltk.draw.util import TextWidget >>> cn = TextWidget(c, 'test', color='red') Attribute values can also be changed after a canvas widget has been constructed, using the ``__setitem__`` operator: >>> cn['font'] = 'times' The current value of an attribute value can be queried using the ``__getitem__`` operator: >>> cn['color'] red For a list of the attributes supported by a type of canvas widget, see its class documentation. Interaction. The attribute ``'draggable'`` controls whether the user can drag a canvas widget around the canvas. By default, canvas widgets are not draggable. ``CanvasWidget`` provides callback support for two types of user interaction: clicking and dragging. The method ``bind_click`` registers a callback function that is called whenever the canvas widget is clicked. The method ``bind_drag`` registers a callback function that is called after the canvas widget is dragged. If the user clicks or drags a canvas widget with no registered callback function, then the interaction event will propagate to its parent. For each canvas widget, only one callback function may be registered for an interaction event. Callback functions can be deregistered with the ``unbind_click`` and ``unbind_drag`` methods. Subclassing. ``CanvasWidget`` is an abstract class. Subclasses are required to implement the following methods: - ``__init__``: Builds a new canvas widget. It must perform the following three tasks (in order): - Create any new graphical elements. - Call ``_add_child_widget`` on each child widget. - Call the ``CanvasWidget`` constructor. - ``_tags``: Returns a list of the canvas tags for all graphical elements managed by this canvas widget, not including graphical elements managed by its child widgets. - ``_manage``: Arranges the child widgets of this canvas widget. This is typically only called when the canvas widget is created. - ``_update``: Update this canvas widget in response to a change in a single child. For a ``CanvasWidget`` with no child widgets, the default definitions for ``_manage`` and ``_update`` may be used. If a subclass defines any attributes, then it should implement ``__getitem__`` and ``__setitem__``. If either of these methods is called with an unknown attribute, then they should propagate the request to ``CanvasWidget``. Most subclasses implement a number of additional methods that modify the ``CanvasWidget`` in some way. These methods must call ``parent.update(self)`` after making any changes to the canvas widget's graphical elements. The canvas widget must also call ``parent.update(self)`` after changing any attribute value that affects the shape or position of the canvas widget's graphical elements. :type __canvas: Tkinter.Canvas :ivar __canvas: This ``CanvasWidget``'s canvas. :type __parent: CanvasWidget or None :ivar __parent: This ``CanvasWidget``'s hierarchical parent widget. :type __children: list(CanvasWidget) :ivar __children: This ``CanvasWidget``'s hierarchical child widgets. :type __updating: bool :ivar __updating: Is this canvas widget currently performing an update? If it is, then it will ignore any new update requests from child widgets. :type __draggable: bool :ivar __draggable: Is this canvas widget draggable? :type __press: event :ivar __press: The ButtonPress event that we're currently handling. 
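# ----------------------------------------------------------------------
# Illustrative sketch (not part of the original module): a minimal
# ``CanvasWidget`` subclass following the three-step recipe described
# above (create graphics, register children, then call the base
# constructor) and reporting its canvas tags via ``_tags``.  The class
# name ``CrossWidget`` is hypothetical; assumes a Tk display.
def _example_custom_widget():
    from nltk.draw.util import CanvasFrame, CanvasWidget

    class CrossWidget(CanvasWidget):
        """Hypothetical widget that draws an X of a given size."""

        def __init__(self, canvas, size=20, **attribs):
            # 1. Create the graphical elements.
            self._line1 = canvas.create_line(0, 0, size, size)
            self._line2 = canvas.create_line(0, size, size, 0)
            # 2. No child widgets, so no _add_child_widget() calls.
            # 3. Call the CanvasWidget constructor last.
            CanvasWidget.__init__(self, canvas, **attribs)

        def _tags(self):
            # Every canvas tag managed directly by this widget.
            return [self._line1, self._line2]

    cf = CanvasFrame(width=200, height=200)
    cw = CrossWidget(cf.canvas(), size=30, draggable=1)
    cf.add_widget(cw, 20, 20)
    cf.mainloop()
# ----------------------------------------------------------------------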
:type __drag_x: int :ivar __drag_x: Where it's been moved to (to find dx) :type __drag_y: int :ivar __drag_y: Where it's been moved to (to find dy) :type __callbacks: dictionary :ivar __callbacks: Registered callbacks. Currently, four keys are used: ``1``, ``2``, ``3``, and ``'drag'``. The values are callback functions. Each callback function takes a single argument, which is the ``CanvasWidget`` that triggered the callback. """ def __init__(self, canvas, parent=None, **attribs): """ Create a new canvas widget. This constructor should only be called by subclass constructors; and it should be called only "after" the subclass has constructed all graphical canvas objects and registered all child widgets. :param canvas: This canvas widget's canvas. :type canvas: Tkinter.Canvas :param parent: This canvas widget's hierarchical parent. :type parent: CanvasWidget :param attribs: The new canvas widget's attributes. """ if self.__class__ == CanvasWidget: raise TypeError("CanvasWidget is an abstract base class") if not isinstance(canvas, Canvas): raise TypeError("Expected a canvas!") self.__canvas = canvas self.__parent = parent # If the subclass constructor called _add_child_widget, then # self.__children will already exist. if not hasattr(self, "_CanvasWidget__children"): self.__children = [] # Is this widget hidden? self.__hidden = 0 # Update control (prevents infinite loops) self.__updating = 0 # Button-press and drag callback handling. self.__press = None self.__drag_x = self.__drag_y = 0 self.__callbacks = {} self.__draggable = 0 # Set up attributes. for (attr, value) in list(attribs.items()): self[attr] = value # Manage this canvas widget self._manage() # Register any new bindings for tag in self._tags(): self.__canvas.tag_bind(tag, "", self.__press_cb) self.__canvas.tag_bind(tag, "", self.__press_cb) self.__canvas.tag_bind(tag, "", self.__press_cb) ##////////////////////////////////////////////////////// ## Inherited methods. ##////////////////////////////////////////////////////// def bbox(self): """ :return: A bounding box for this ``CanvasWidget``. The bounding box is a tuple of four coordinates, *(xmin, ymin, xmax, ymax)*, for a rectangle which encloses all of the canvas widget's graphical elements. Bounding box coordinates are specified with respect to the coordinate space of the ``Canvas``. :rtype: tuple(int, int, int, int) """ if self.__hidden: return (0, 0, 0, 0) if len(self.tags()) == 0: raise ValueError("No tags") return self.__canvas.bbox(*self.tags()) def width(self): """ :return: The width of this canvas widget's bounding box, in its ``Canvas``'s coordinate space. :rtype: int """ if len(self.tags()) == 0: raise ValueError("No tags") bbox = self.__canvas.bbox(*self.tags()) return bbox[2] - bbox[0] def height(self): """ :return: The height of this canvas widget's bounding box, in its ``Canvas``'s coordinate space. :rtype: int """ if len(self.tags()) == 0: raise ValueError("No tags") bbox = self.__canvas.bbox(*self.tags()) return bbox[3] - bbox[1] def parent(self): """ :return: The hierarchical parent of this canvas widget. ``self`` is considered a subpart of its parent for purposes of user interaction. :rtype: CanvasWidget or None """ return self.__parent def child_widgets(self): """ :return: A list of the hierarchical children of this canvas widget. These children are considered part of ``self`` for purposes of user interaction. :rtype: list of CanvasWidget """ return self.__children def canvas(self): """ :return: The canvas that this canvas widget is bound to. 
:rtype: Tkinter.Canvas """ return self.__canvas def move(self, dx, dy): """ Move this canvas widget by a given distance. In particular, shift the canvas widget right by ``dx`` pixels, and down by ``dy`` pixels. Both ``dx`` and ``dy`` may be negative, resulting in leftward or upward movement. :type dx: int :param dx: The number of pixels to move this canvas widget rightwards. :type dy: int :param dy: The number of pixels to move this canvas widget downwards. :rtype: None """ if dx == dy == 0: return for tag in self.tags(): self.__canvas.move(tag, dx, dy) if self.__parent: self.__parent.update(self) def moveto(self, x, y, anchor="NW"): """ Move this canvas widget to the given location. In particular, shift the canvas widget such that the corner or side of the bounding box specified by ``anchor`` is at location (``x``, ``y``). :param x,y: The location that the canvas widget should be moved to. :param anchor: The corner or side of the canvas widget that should be moved to the specified location. ``'N'`` specifies the top center; ``'NE'`` specifies the top right corner; etc. """ x1, y1, x2, y2 = self.bbox() if anchor == "NW": self.move(x - x1, y - y1) if anchor == "N": self.move(x - x1 / 2 - x2 / 2, y - y1) if anchor == "NE": self.move(x - x2, y - y1) if anchor == "E": self.move(x - x2, y - y1 / 2 - y2 / 2) if anchor == "SE": self.move(x - x2, y - y2) if anchor == "S": self.move(x - x1 / 2 - x2 / 2, y - y2) if anchor == "SW": self.move(x - x1, y - y2) if anchor == "W": self.move(x - x1, y - y1 / 2 - y2 / 2) def destroy(self): """ Remove this ``CanvasWidget`` from its ``Canvas``. After a ``CanvasWidget`` has been destroyed, it should not be accessed. Note that you only need to destroy a top-level ``CanvasWidget``; its child widgets will be destroyed automatically. If you destroy a non-top-level ``CanvasWidget``, then the entire top-level widget will be destroyed. :raise ValueError: if this ``CanvasWidget`` has a parent. :rtype: None """ if self.__parent is not None: self.__parent.destroy() return for tag in self.tags(): self.__canvas.tag_unbind(tag, "") self.__canvas.tag_unbind(tag, "") self.__canvas.tag_unbind(tag, "") self.__canvas.delete(*self.tags()) self.__canvas = None def update(self, child): """ Update the graphical display of this canvas widget, and all of its ancestors, in response to a change in one of this canvas widget's children. :param child: The child widget that changed. :type child: CanvasWidget """ if self.__hidden or child.__hidden: return # If we're already updating, then do nothing. This prevents # infinite loops when _update modifies its children. if self.__updating: return self.__updating = 1 # Update this CanvasWidget. self._update(child) # Propagate update request to the parent. if self.__parent: self.__parent.update(self) # We're done updating. self.__updating = 0 def manage(self): """ Arrange this canvas widget and all of its descendants. :rtype: None """ if self.__hidden: return for child in self.__children: child.manage() self._manage() def tags(self): """ :return: a list of the canvas tags for all graphical elements managed by this canvas widget, including graphical elements managed by its child widgets. :rtype: list of int """ if self.__canvas is None: raise ValueError("Attempt to access a destroyed canvas widget") tags = [] tags += self._tags() for child in self.__children: tags += child.tags() return tags def __setitem__(self, attr, value): """ Set the value of the attribute ``attr`` to ``value``. 
See the class documentation for a list of attributes supported by this canvas widget. :rtype: None """ if attr == "draggable": self.__draggable = value else: raise ValueError("Unknown attribute %r" % attr) def __getitem__(self, attr): """ :return: the value of the attribute ``attr``. See the class documentation for a list of attributes supported by this canvas widget. :rtype: (any) """ if attr == "draggable": return self.__draggable else: raise ValueError("Unknown attribute %r" % attr) def __repr__(self): """ :return: a string representation of this canvas widget. :rtype: str """ return "<%s>" % self.__class__.__name__ def hide(self): """ Temporarily hide this canvas widget. :rtype: None """ self.__hidden = 1 for tag in self.tags(): self.__canvas.itemconfig(tag, state="hidden") def show(self): """ Show a hidden canvas widget. :rtype: None """ self.__hidden = 0 for tag in self.tags(): self.__canvas.itemconfig(tag, state="normal") def hidden(self): """ :return: True if this canvas widget is hidden. :rtype: bool """ return self.__hidden ##////////////////////////////////////////////////////// ## Callback interface ##////////////////////////////////////////////////////// def bind_click(self, callback, button=1): """ Register a new callback that will be called whenever this ``CanvasWidget`` is clicked on. :type callback: function :param callback: The callback function that will be called whenever this ``CanvasWidget`` is clicked. This function will be called with this ``CanvasWidget`` as its argument. :type button: int :param button: Which button the user should use to click on this ``CanvasWidget``. Typically, this should be 1 (left button), 3 (right button), or 2 (middle button). """ self.__callbacks[button] = callback def bind_drag(self, callback): """ Register a new callback that will be called after this ``CanvasWidget`` is dragged. This implicitly makes this ``CanvasWidget`` draggable. :type callback: function :param callback: The callback function that will be called whenever this ``CanvasWidget`` is clicked. This function will be called with this ``CanvasWidget`` as its argument. """ self.__draggable = 1 self.__callbacks["drag"] = callback def unbind_click(self, button=1): """ Remove a callback that was registered with ``bind_click``. :type button: int :param button: Which button the user should use to click on this ``CanvasWidget``. Typically, this should be 1 (left button), 3 (right button), or 2 (middle button). """ try: del self.__callbacks[button] except: pass def unbind_drag(self): """ Remove a callback that was registered with ``bind_drag``. """ try: del self.__callbacks["drag"] except: pass ##////////////////////////////////////////////////////// ## Callback internals ##////////////////////////////////////////////////////// def __press_cb(self, event): """ Handle a button-press event: - record the button press event in ``self.__press`` - register a button-release callback. - if this CanvasWidget or any of its ancestors are draggable, then register the appropriate motion callback. """ # If we're already waiting for a button release, then ignore # this new button press. if ( self.__canvas.bind("") or self.__canvas.bind("") or self.__canvas.bind("") ): return # Unbind motion (just in case; this shouldn't be necessary) self.__canvas.unbind("") # Record the button press event. self.__press = event # If any ancestor is draggable, set up a motion callback. 
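# ----------------------------------------------------------------------
# Illustrative sketch (not part of the original module): registering the
# click and drag callbacks documented above.  Each callback receives the
# CanvasWidget that triggered it; ``bind_drag`` also makes the widget
# draggable.  Assumes a Tk display; the callback names are hypothetical.
def _example_click_and_drag():
    from nltk.draw.util import CanvasFrame, TextWidget

    def on_click(widget):
        print("clicked", widget)

    def on_drag(widget):
        print("dragged to", widget.bbox())

    cf = CanvasFrame(width=300, height=150)
    tw = TextWidget(cf.canvas(), "click or drag me", color="blue")
    tw.bind_click(on_click, button=1)  # left button
    tw.bind_click(on_click, button=3)  # right button
    tw.bind_drag(on_drag)              # implicitly sets 'draggable'
    cf.add_widget(tw, 10, 10)
    cf.mainloop()
# ----------------------------------------------------------------------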
# (Only if they pressed button number 1) if event.num == 1: widget = self while widget is not None: if widget["draggable"]: widget.__start_drag(event) break widget = widget.parent() # Set up the button release callback. self.__canvas.bind("" % event.num, self.__release_cb) def __start_drag(self, event): """ Begin dragging this object: - register a motion callback - record the drag coordinates """ self.__canvas.bind("", self.__motion_cb) self.__drag_x = event.x self.__drag_y = event.y def __motion_cb(self, event): """ Handle a motion event: - move this object to the new location - record the new drag coordinates """ self.move(event.x - self.__drag_x, event.y - self.__drag_y) self.__drag_x = event.x self.__drag_y = event.y def __release_cb(self, event): """ Handle a release callback: - unregister motion & button release callbacks. - decide whether they clicked, dragged, or cancelled - call the appropriate handler. """ # Unbind the button release & motion callbacks. self.__canvas.unbind("" % event.num) self.__canvas.unbind("") # Is it a click or a drag? if ( event.time - self.__press.time < 100 and abs(event.x - self.__press.x) + abs(event.y - self.__press.y) < 5 ): # Move it back, if we were dragging. if self.__draggable and event.num == 1: self.move( self.__press.x - self.__drag_x, self.__press.y - self.__drag_y ) self.__click(event.num) elif event.num == 1: self.__drag() self.__press = None def __drag(self): """ If this ``CanvasWidget`` has a drag callback, then call it; otherwise, find the closest ancestor with a drag callback, and call it. If no ancestors have a drag callback, do nothing. """ if self.__draggable: if "drag" in self.__callbacks: cb = self.__callbacks["drag"] try: cb(self) except: print("Error in drag callback for %r" % self) elif self.__parent is not None: self.__parent.__drag() def __click(self, button): """ If this ``CanvasWidget`` has a drag callback, then call it; otherwise, find the closest ancestor with a click callback, and call it. If no ancestors have a click callback, do nothing. """ if button in self.__callbacks: cb = self.__callbacks[button] # try: cb(self) # except: # print('Error in click callback for %r' % self) # raise elif self.__parent is not None: self.__parent.__click(button) ##////////////////////////////////////////////////////// ## Child/parent Handling ##////////////////////////////////////////////////////// def _add_child_widget(self, child): """ Register a hierarchical child widget. The child will be considered part of this canvas widget for purposes of user interaction. ``_add_child_widget`` has two direct effects: - It sets ``child``'s parent to this canvas widget. - It adds ``child`` to the list of canvas widgets returned by the ``child_widgets`` member function. :param child: The new child widget. ``child`` must not already have a parent. :type child: CanvasWidget """ if not hasattr(self, "_CanvasWidget__children"): self.__children = [] if child.__parent is not None: raise ValueError(f"{child} already has a parent") child.__parent = self self.__children.append(child) def _remove_child_widget(self, child): """ Remove a hierarchical child widget. This child will no longer be considered part of this canvas widget for purposes of user interaction. ``_add_child_widget`` has two direct effects: - It sets ``child``'s parent to None. - It removes ``child`` from the list of canvas widgets returned by the ``child_widgets`` member function. :param child: The child widget to remove. ``child`` must be a child of this canvas widget. 
:type child: CanvasWidget """ self.__children.remove(child) child.__parent = None ##////////////////////////////////////////////////////// ## Defined by subclass ##////////////////////////////////////////////////////// @abstractmethod def _tags(self): """ :return: a list of canvas tags for all graphical elements managed by this canvas widget, not including graphical elements managed by its child widgets. :rtype: list of int """ def _manage(self): """ Arrange the child widgets of this canvas widget. This method is called when the canvas widget is initially created. It is also called if the user calls the ``manage`` method on this canvas widget or any of its ancestors. :rtype: None """ def _update(self, child): """ Update this canvas widget in response to a change in one of its children. :param child: The child that changed. :type child: CanvasWidget :rtype: None """ ##////////////////////////////////////////////////////// ## Basic widgets. ##////////////////////////////////////////////////////// class TextWidget(CanvasWidget): """ A canvas widget that displays a single string of text. Attributes: - ``color``: the color of the text. - ``font``: the font used to display the text. - ``justify``: justification for multi-line texts. Valid values are ``left``, ``center``, and ``right``. - ``width``: the width of the text. If the text is wider than this width, it will be line-wrapped at whitespace. - ``draggable``: whether the text can be dragged by the user. """ def __init__(self, canvas, text, **attribs): """ Create a new text widget. :type canvas: Tkinter.Canvas :param canvas: This canvas widget's canvas. :type text: str :param text: The string of text to display. :param attribs: The new canvas widget's attributes. """ self._text = text self._tag = canvas.create_text(1, 1, text=text) CanvasWidget.__init__(self, canvas, **attribs) def __setitem__(self, attr, value): if attr in ("color", "font", "justify", "width"): if attr == "color": attr = "fill" self.canvas().itemconfig(self._tag, {attr: value}) else: CanvasWidget.__setitem__(self, attr, value) def __getitem__(self, attr): if attr == "width": return int(self.canvas().itemcget(self._tag, attr)) elif attr in ("color", "font", "justify"): if attr == "color": attr = "fill" return self.canvas().itemcget(self._tag, attr) else: return CanvasWidget.__getitem__(self, attr) def _tags(self): return [self._tag] def text(self): """ :return: The text displayed by this text widget. :rtype: str """ return self.canvas().itemcget(self._tag, "TEXT") def set_text(self, text): """ Change the text that is displayed by this text widget. :type text: str :param text: The string of text to display. :rtype: None """ self.canvas().itemconfig(self._tag, text=text) if self.parent() is not None: self.parent().update(self) def __repr__(self): return "[Text: %r]" % self._text class SymbolWidget(TextWidget): """ A canvas widget that displays special symbols, such as the negation sign and the exists operator. Symbols are specified by name. Currently, the following symbol names are defined: ``neg``, ``disj``, ``conj``, ``lambda``, ``merge``, ``forall``, ``exists``, ``subseteq``, ``subset``, ``notsubset``, ``emptyset``, ``imp``, ``rightarrow``, ``equal``, ``notequal``, ``epsilon``. Attributes: - ``color``: the color of the text. - ``draggable``: whether the text can be dragged by the user. :cvar SYMBOLS: A dictionary mapping from symbols to the character in the ``symbol`` font used to render them. 
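# ----------------------------------------------------------------------
# Illustrative sketch (not part of the original module): displaying a
# named logical symbol and switching it later with ``set_symbol``.
# ``SymbolWidget.symbolsheet()`` can be used to browse the symbol font
# when extending the SYMBOLS table.  Assumes a Tk display.
def _example_symbol_widget():
    from nltk.draw.util import CanvasFrame, SymbolWidget

    cf = CanvasFrame(width=200, height=100)
    sym = SymbolWidget(cf.canvas(), "forall", color="purple")
    cf.add_widget(sym, 20, 20)
    sym.set_symbol("exists")  # swap the universal for the existential quantifier
    cf.mainloop()
# ----------------------------------------------------------------------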
""" SYMBOLS = { "neg": "\330", "disj": "\332", "conj": "\331", "lambda": "\154", "merge": "\304", "forall": "\042", "exists": "\044", "subseteq": "\315", "subset": "\314", "notsubset": "\313", "emptyset": "\306", "imp": "\336", "rightarrow": chr(222), #'\256', "equal": "\75", "notequal": "\271", "intersection": "\307", "union": "\310", "epsilon": "e", } def __init__(self, canvas, symbol, **attribs): """ Create a new symbol widget. :type canvas: Tkinter.Canvas :param canvas: This canvas widget's canvas. :type symbol: str :param symbol: The name of the symbol to display. :param attribs: The new canvas widget's attributes. """ attribs["font"] = "symbol" TextWidget.__init__(self, canvas, "", **attribs) self.set_symbol(symbol) def symbol(self): """ :return: the name of the symbol that is displayed by this symbol widget. :rtype: str """ return self._symbol def set_symbol(self, symbol): """ Change the symbol that is displayed by this symbol widget. :type symbol: str :param symbol: The name of the symbol to display. """ if symbol not in SymbolWidget.SYMBOLS: raise ValueError("Unknown symbol: %s" % symbol) self._symbol = symbol self.set_text(SymbolWidget.SYMBOLS[symbol]) def __repr__(self): return "[Symbol: %r]" % self._symbol @staticmethod def symbolsheet(size=20): """ Open a new Tkinter window that displays the entire alphabet for the symbol font. This is useful for constructing the ``SymbolWidget.SYMBOLS`` dictionary. """ top = Tk() def destroy(e, top=top): top.destroy() top.bind("q", destroy) Button(top, text="Quit", command=top.destroy).pack(side="bottom") text = Text(top, font=("helvetica", -size), width=20, height=30) text.pack(side="left") sb = Scrollbar(top, command=text.yview) text["yscrollcommand"] = sb.set sb.pack(side="right", fill="y") text.tag_config("symbol", font=("symbol", -size)) for i in range(256): if i in (0, 10): continue # null and newline for k, v in list(SymbolWidget.SYMBOLS.items()): if v == chr(i): text.insert("end", "%-10s\t" % k) break else: text.insert("end", "%-10d \t" % i) text.insert("end", "[%s]\n" % chr(i), "symbol") top.mainloop() class AbstractContainerWidget(CanvasWidget): """ An abstract class for canvas widgets that contain a single child, such as ``BoxWidget`` and ``OvalWidget``. Subclasses must define a constructor, which should create any new graphical elements and then call the ``AbstractCanvasContainer`` constructor. Subclasses must also define the ``_update`` method and the ``_tags`` method; and any subclasses that define attributes should define ``__setitem__`` and ``__getitem__``. """ def __init__(self, canvas, child, **attribs): """ Create a new container widget. This constructor should only be called by subclass constructors. :type canvas: Tkinter.Canvas :param canvas: This canvas widget's canvas. :param child: The container's child widget. ``child`` must not have a parent. :type child: CanvasWidget :param attribs: The new canvas widget's attributes. """ self._child = child self._add_child_widget(child) CanvasWidget.__init__(self, canvas, **attribs) def _manage(self): self._update(self._child) def child(self): """ :return: The child widget contained by this container widget. :rtype: CanvasWidget """ return self._child def set_child(self, child): """ Change the child widget contained by this container widget. :param child: The new child widget. ``child`` must not have a parent. 
:type child: CanvasWidget :rtype: None """ self._remove_child_widget(self._child) self._add_child_widget(child) self._child = child self.update(child) def __repr__(self): name = self.__class__.__name__ if name[-6:] == "Widget": name = name[:-6] return f"[{name}: {self._child!r}]" class BoxWidget(AbstractContainerWidget): """ A canvas widget that places a box around a child widget. Attributes: - ``fill``: The color used to fill the interior of the box. - ``outline``: The color used to draw the outline of the box. - ``width``: The width of the outline of the box. - ``margin``: The number of pixels space left between the child and the box. - ``draggable``: whether the text can be dragged by the user. """ def __init__(self, canvas, child, **attribs): """ Create a new box widget. :type canvas: Tkinter.Canvas :param canvas: This canvas widget's canvas. :param child: The child widget. ``child`` must not have a parent. :type child: CanvasWidget :param attribs: The new canvas widget's attributes. """ self._child = child self._margin = 1 self._box = canvas.create_rectangle(1, 1, 1, 1) canvas.tag_lower(self._box) AbstractContainerWidget.__init__(self, canvas, child, **attribs) def __setitem__(self, attr, value): if attr == "margin": self._margin = value elif attr in ("outline", "fill", "width"): self.canvas().itemconfig(self._box, {attr: value}) else: CanvasWidget.__setitem__(self, attr, value) def __getitem__(self, attr): if attr == "margin": return self._margin elif attr == "width": return float(self.canvas().itemcget(self._box, attr)) elif attr in ("outline", "fill", "width"): return self.canvas().itemcget(self._box, attr) else: return CanvasWidget.__getitem__(self, attr) def _update(self, child): (x1, y1, x2, y2) = child.bbox() margin = self._margin + self["width"] / 2 self.canvas().coords( self._box, x1 - margin, y1 - margin, x2 + margin, y2 + margin ) def _tags(self): return [self._box] class OvalWidget(AbstractContainerWidget): """ A canvas widget that places a oval around a child widget. Attributes: - ``fill``: The color used to fill the interior of the oval. - ``outline``: The color used to draw the outline of the oval. - ``width``: The width of the outline of the oval. - ``margin``: The number of pixels space left between the child and the oval. - ``draggable``: whether the text can be dragged by the user. - ``double``: If true, then a double-oval is drawn. """ def __init__(self, canvas, child, **attribs): """ Create a new oval widget. :type canvas: Tkinter.Canvas :param canvas: This canvas widget's canvas. :param child: The child widget. ``child`` must not have a parent. :type child: CanvasWidget :param attribs: The new canvas widget's attributes. """ self._child = child self._margin = 1 self._oval = canvas.create_oval(1, 1, 1, 1) self._circle = attribs.pop("circle", False) self._double = attribs.pop("double", False) if self._double: self._oval2 = canvas.create_oval(1, 1, 1, 1) else: self._oval2 = None canvas.tag_lower(self._oval) AbstractContainerWidget.__init__(self, canvas, child, **attribs) def __setitem__(self, attr, value): c = self.canvas() if attr == "margin": self._margin = value elif attr == "double": if value == True and self._oval2 is None: # Copy attributes & position from self._oval. 
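# ----------------------------------------------------------------------
# Illustrative sketch (not part of the original module): wrapping text
# widgets in the Box and Oval containers documented above.  ``margin``
# controls the padding around the child; ``double=True`` draws a second,
# inner oval.  Assumes a Tk display; colours and values are hypothetical.
def _example_containers():
    from nltk.draw.util import BoxWidget, CanvasFrame, OvalWidget, TextWidget

    cf = CanvasFrame(width=300, height=150)
    box = BoxWidget(
        cf.canvas(),
        TextWidget(cf.canvas(), "boxed"),
        outline="red",
        fill="lightyellow",
        margin=5,
    )
    oval = OvalWidget(
        cf.canvas(),
        TextWidget(cf.canvas(), "doubled oval"),
        fill="cyan",
        double=True,
    )
    cf.add_widget(box, 10, 10)
    cf.add_widget(oval, 10, 60)
    cf.mainloop()
# ----------------------------------------------------------------------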
x1, y1, x2, y2 = c.bbox(self._oval) w = self["width"] * 2 self._oval2 = c.create_oval( x1 - w, y1 - w, x2 + w, y2 + w, outline=c.itemcget(self._oval, "outline"), width=c.itemcget(self._oval, "width"), ) c.tag_lower(self._oval2) if value == False and self._oval2 is not None: c.delete(self._oval2) self._oval2 = None elif attr in ("outline", "fill", "width"): c.itemconfig(self._oval, {attr: value}) if self._oval2 is not None and attr != "fill": c.itemconfig(self._oval2, {attr: value}) if self._oval2 is not None and attr != "fill": self.canvas().itemconfig(self._oval2, {attr: value}) else: CanvasWidget.__setitem__(self, attr, value) def __getitem__(self, attr): if attr == "margin": return self._margin elif attr == "double": return self._double is not None elif attr == "width": return float(self.canvas().itemcget(self._oval, attr)) elif attr in ("outline", "fill", "width"): return self.canvas().itemcget(self._oval, attr) else: return CanvasWidget.__getitem__(self, attr) # The ratio between inscribed & circumscribed ovals RATIO = 1.4142135623730949 def _update(self, child): R = OvalWidget.RATIO (x1, y1, x2, y2) = child.bbox() margin = self._margin # If we're a circle, pretend our contents are square. if self._circle: dx, dy = abs(x1 - x2), abs(y1 - y2) if dx > dy: y = (y1 + y2) / 2 y1, y2 = y - dx / 2, y + dx / 2 elif dy > dx: x = (x1 + x2) / 2 x1, x2 = x - dy / 2, x + dy / 2 # Find the four corners. left = int((x1 * (1 + R) + x2 * (1 - R)) / 2) right = left + int((x2 - x1) * R) top = int((y1 * (1 + R) + y2 * (1 - R)) / 2) bot = top + int((y2 - y1) * R) self.canvas().coords( self._oval, left - margin, top - margin, right + margin, bot + margin ) if self._oval2 is not None: self.canvas().coords( self._oval2, left - margin + 2, top - margin + 2, right + margin - 2, bot + margin - 2, ) def _tags(self): if self._oval2 is None: return [self._oval] else: return [self._oval, self._oval2] class ParenWidget(AbstractContainerWidget): """ A canvas widget that places a pair of parenthases around a child widget. Attributes: - ``color``: The color used to draw the parenthases. - ``width``: The width of the parenthases. - ``draggable``: whether the text can be dragged by the user. """ def __init__(self, canvas, child, **attribs): """ Create a new parenthasis widget. :type canvas: Tkinter.Canvas :param canvas: This canvas widget's canvas. :param child: The child widget. ``child`` must not have a parent. :type child: CanvasWidget :param attribs: The new canvas widget's attributes. 
""" self._child = child self._oparen = canvas.create_arc(1, 1, 1, 1, style="arc", start=90, extent=180) self._cparen = canvas.create_arc(1, 1, 1, 1, style="arc", start=-90, extent=180) AbstractContainerWidget.__init__(self, canvas, child, **attribs) def __setitem__(self, attr, value): if attr == "color": self.canvas().itemconfig(self._oparen, outline=value) self.canvas().itemconfig(self._cparen, outline=value) elif attr == "width": self.canvas().itemconfig(self._oparen, width=value) self.canvas().itemconfig(self._cparen, width=value) else: CanvasWidget.__setitem__(self, attr, value) def __getitem__(self, attr): if attr == "color": return self.canvas().itemcget(self._oparen, "outline") elif attr == "width": return self.canvas().itemcget(self._oparen, "width") else: return CanvasWidget.__getitem__(self, attr) def _update(self, child): (x1, y1, x2, y2) = child.bbox() width = max((y2 - y1) / 6, 4) self.canvas().coords(self._oparen, x1 - width, y1, x1 + width, y2) self.canvas().coords(self._cparen, x2 - width, y1, x2 + width, y2) def _tags(self): return [self._oparen, self._cparen] class BracketWidget(AbstractContainerWidget): """ A canvas widget that places a pair of brackets around a child widget. Attributes: - ``color``: The color used to draw the brackets. - ``width``: The width of the brackets. - ``draggable``: whether the text can be dragged by the user. """ def __init__(self, canvas, child, **attribs): """ Create a new bracket widget. :type canvas: Tkinter.Canvas :param canvas: This canvas widget's canvas. :param child: The child widget. ``child`` must not have a parent. :type child: CanvasWidget :param attribs: The new canvas widget's attributes. """ self._child = child self._obrack = canvas.create_line(1, 1, 1, 1, 1, 1, 1, 1) self._cbrack = canvas.create_line(1, 1, 1, 1, 1, 1, 1, 1) AbstractContainerWidget.__init__(self, canvas, child, **attribs) def __setitem__(self, attr, value): if attr == "color": self.canvas().itemconfig(self._obrack, fill=value) self.canvas().itemconfig(self._cbrack, fill=value) elif attr == "width": self.canvas().itemconfig(self._obrack, width=value) self.canvas().itemconfig(self._cbrack, width=value) else: CanvasWidget.__setitem__(self, attr, value) def __getitem__(self, attr): if attr == "color": return self.canvas().itemcget(self._obrack, "outline") elif attr == "width": return self.canvas().itemcget(self._obrack, "width") else: return CanvasWidget.__getitem__(self, attr) def _update(self, child): (x1, y1, x2, y2) = child.bbox() width = max((y2 - y1) / 8, 2) self.canvas().coords( self._obrack, x1, y1, x1 - width, y1, x1 - width, y2, x1, y2 ) self.canvas().coords( self._cbrack, x2, y1, x2 + width, y1, x2 + width, y2, x2, y2 ) def _tags(self): return [self._obrack, self._cbrack] class SequenceWidget(CanvasWidget): """ A canvas widget that keeps a list of canvas widgets in a horizontal line. Attributes: - ``align``: The vertical alignment of the children. Possible values are ``'top'``, ``'center'``, and ``'bottom'``. By default, children are center-aligned. - ``space``: The amount of horizontal space to place between children. By default, one pixel of space is used. - ``ordered``: If true, then keep the children in their original order. """ def __init__(self, canvas, *children, **attribs): """ Create a new sequence widget. :type canvas: Tkinter.Canvas :param canvas: This canvas widget's canvas. :param children: The widgets that should be aligned horizontally. Each child must not have a parent. 
:type children: list(CanvasWidget) :param attribs: The new canvas widget's attributes. """ self._align = "center" self._space = 1 self._ordered = False self._children = list(children) for child in children: self._add_child_widget(child) CanvasWidget.__init__(self, canvas, **attribs) def __setitem__(self, attr, value): if attr == "align": if value not in ("top", "bottom", "center"): raise ValueError("Bad alignment: %r" % value) self._align = value elif attr == "space": self._space = value elif attr == "ordered": self._ordered = value else: CanvasWidget.__setitem__(self, attr, value) def __getitem__(self, attr): if attr == "align": return self._align elif attr == "space": return self._space elif attr == "ordered": return self._ordered else: return CanvasWidget.__getitem__(self, attr) def _tags(self): return [] def _yalign(self, top, bot): if self._align == "top": return top if self._align == "bottom": return bot if self._align == "center": return (top + bot) / 2 def _update(self, child): # Align all children with child. (left, top, right, bot) = child.bbox() y = self._yalign(top, bot) for c in self._children: (x1, y1, x2, y2) = c.bbox() c.move(0, y - self._yalign(y1, y2)) if self._ordered and len(self._children) > 1: index = self._children.index(child) x = right + self._space for i in range(index + 1, len(self._children)): (x1, y1, x2, y2) = self._children[i].bbox() if x > x1: self._children[i].move(x - x1, 0) x += x2 - x1 + self._space x = left - self._space for i in range(index - 1, -1, -1): (x1, y1, x2, y2) = self._children[i].bbox() if x < x2: self._children[i].move(x - x2, 0) x -= x2 - x1 + self._space def _manage(self): if len(self._children) == 0: return child = self._children[0] # Align all children with child. (left, top, right, bot) = child.bbox() y = self._yalign(top, bot) index = self._children.index(child) # Line up children to the right of child. x = right + self._space for i in range(index + 1, len(self._children)): (x1, y1, x2, y2) = self._children[i].bbox() self._children[i].move(x - x1, y - self._yalign(y1, y2)) x += x2 - x1 + self._space # Line up children to the left of child. x = left - self._space for i in range(index - 1, -1, -1): (x1, y1, x2, y2) = self._children[i].bbox() self._children[i].move(x - x2, y - self._yalign(y1, y2)) x -= x2 - x1 + self._space def __repr__(self): return "[Sequence: " + repr(self._children)[1:-1] + "]" # Provide an alias for the child_widgets() member. children = CanvasWidget.child_widgets def replace_child(self, oldchild, newchild): """ Replace the child canvas widget ``oldchild`` with ``newchild``. ``newchild`` must not have a parent. ``oldchild``'s parent will be set to None. :type oldchild: CanvasWidget :param oldchild: The child canvas widget to remove. :type newchild: CanvasWidget :param newchild: The canvas widget that should replace ``oldchild``. """ index = self._children.index(oldchild) self._children[index] = newchild self._remove_child_widget(oldchild) self._add_child_widget(newchild) self.update(newchild) def remove_child(self, child): """ Remove the given child canvas widget. ``child``'s parent will be set to None. :type child: CanvasWidget :param child: The child canvas widget to remove. """ index = self._children.index(child) del self._children[index] self._remove_child_widget(child) if len(self._children) > 0: self.update(self._children[0]) def insert_child(self, index, child): """ Insert a child canvas widget before a given index. :type child: CanvasWidget :param child: The canvas widget that should be inserted. 
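# ----------------------------------------------------------------------
# Illustrative sketch (not part of the original module): lining up several
# widgets horizontally with a SequenceWidget, using the ``align`` and
# ``space`` attributes documented above.  Assumes a Tk display; the words
# and spacing are hypothetical.
def _example_sequence_widget():
    from nltk.draw.util import CanvasFrame, SequenceWidget, TextWidget

    cf = CanvasFrame(width=400, height=100)
    words = [TextWidget(cf.canvas(), w) for w in "the quick brown fox".split()]
    seq = SequenceWidget(cf.canvas(), *words, align="bottom", space=10)
    cf.add_widget(seq, 10, 10)

    # Children can be swapped in place; the replacement must be parentless.
    seq.replace_child(words[1], TextWidget(cf.canvas(), "slow"))
    cf.mainloop()
# ----------------------------------------------------------------------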
:type index: int :param index: The index where the child widget should be inserted. In particular, the index of ``child`` will be ``index``; and the index of any children whose indices were greater than equal to ``index`` before ``child`` was inserted will be incremented by one. """ self._children.insert(index, child) self._add_child_widget(child) class StackWidget(CanvasWidget): """ A canvas widget that keeps a list of canvas widgets in a vertical line. Attributes: - ``align``: The horizontal alignment of the children. Possible values are ``'left'``, ``'center'``, and ``'right'``. By default, children are center-aligned. - ``space``: The amount of vertical space to place between children. By default, one pixel of space is used. - ``ordered``: If true, then keep the children in their original order. """ def __init__(self, canvas, *children, **attribs): """ Create a new stack widget. :type canvas: Tkinter.Canvas :param canvas: This canvas widget's canvas. :param children: The widgets that should be aligned vertically. Each child must not have a parent. :type children: list(CanvasWidget) :param attribs: The new canvas widget's attributes. """ self._align = "center" self._space = 1 self._ordered = False self._children = list(children) for child in children: self._add_child_widget(child) CanvasWidget.__init__(self, canvas, **attribs) def __setitem__(self, attr, value): if attr == "align": if value not in ("left", "right", "center"): raise ValueError("Bad alignment: %r" % value) self._align = value elif attr == "space": self._space = value elif attr == "ordered": self._ordered = value else: CanvasWidget.__setitem__(self, attr, value) def __getitem__(self, attr): if attr == "align": return self._align elif attr == "space": return self._space elif attr == "ordered": return self._ordered else: return CanvasWidget.__getitem__(self, attr) def _tags(self): return [] def _xalign(self, left, right): if self._align == "left": return left if self._align == "right": return right if self._align == "center": return (left + right) / 2 def _update(self, child): # Align all children with child. (left, top, right, bot) = child.bbox() x = self._xalign(left, right) for c in self._children: (x1, y1, x2, y2) = c.bbox() c.move(x - self._xalign(x1, x2), 0) if self._ordered and len(self._children) > 1: index = self._children.index(child) y = bot + self._space for i in range(index + 1, len(self._children)): (x1, y1, x2, y2) = self._children[i].bbox() if y > y1: self._children[i].move(0, y - y1) y += y2 - y1 + self._space y = top - self._space for i in range(index - 1, -1, -1): (x1, y1, x2, y2) = self._children[i].bbox() if y < y2: self._children[i].move(0, y - y2) y -= y2 - y1 + self._space def _manage(self): if len(self._children) == 0: return child = self._children[0] # Align all children with child. (left, top, right, bot) = child.bbox() x = self._xalign(left, right) index = self._children.index(child) # Line up children below the child. y = bot + self._space for i in range(index + 1, len(self._children)): (x1, y1, x2, y2) = self._children[i].bbox() self._children[i].move(x - self._xalign(x1, x2), y - y1) y += y2 - y1 + self._space # Line up children above the child. y = top - self._space for i in range(index - 1, -1, -1): (x1, y1, x2, y2) = self._children[i].bbox() self._children[i].move(x - self._xalign(x1, x2), y - y2) y -= y2 - y1 + self._space def __repr__(self): return "[Stack: " + repr(self._children)[1:-1] + "]" # Provide an alias for the child_widgets() member. 
children = CanvasWidget.child_widgets def replace_child(self, oldchild, newchild): """ Replace the child canvas widget ``oldchild`` with ``newchild``. ``newchild`` must not have a parent. ``oldchild``'s parent will be set to None. :type oldchild: CanvasWidget :param oldchild: The child canvas widget to remove. :type newchild: CanvasWidget :param newchild: The canvas widget that should replace ``oldchild``. """ index = self._children.index(oldchild) self._children[index] = newchild self._remove_child_widget(oldchild) self._add_child_widget(newchild) self.update(newchild) def remove_child(self, child): """ Remove the given child canvas widget. ``child``'s parent will be set to None. :type child: CanvasWidget :param child: The child canvas widget to remove. """ index = self._children.index(child) del self._children[index] self._remove_child_widget(child) if len(self._children) > 0: self.update(self._children[0]) def insert_child(self, index, child): """ Insert a child canvas widget before a given index. :type child: CanvasWidget :param child: The canvas widget that should be inserted. :type index: int :param index: The index where the child widget should be inserted. In particular, the index of ``child`` will be ``index``; and the index of any children whose indices were greater than equal to ``index`` before ``child`` was inserted will be incremented by one. """ self._children.insert(index, child) self._add_child_widget(child) class SpaceWidget(CanvasWidget): """ A canvas widget that takes up space but does not display anything. A ``SpaceWidget`` can be used to add space between elements. Each space widget is characterized by a width and a height. If you wish to only create horizontal space, then use a height of zero; and if you wish to only create vertical space, use a width of zero. """ def __init__(self, canvas, width, height, **attribs): """ Create a new space widget. :type canvas: Tkinter.Canvas :param canvas: This canvas widget's canvas. :type width: int :param width: The width of the new space widget. :type height: int :param height: The height of the new space widget. :param attribs: The new canvas widget's attributes. """ # For some reason, if width > 4: width -= 4 if height > 4: height -= 4 self._tag = canvas.create_line(1, 1, width, height, fill="") CanvasWidget.__init__(self, canvas, **attribs) # note: width() and height() are already defined by CanvasWidget. def set_width(self, width): """ Change the width of this space widget. :param width: The new width. :type width: int :rtype: None """ [x1, y1, x2, y2] = self.bbox() self.canvas().coords(self._tag, x1, y1, x1 + width, y2) def set_height(self, height): """ Change the height of this space widget. :param height: The new height. :type height: int :rtype: None """ [x1, y1, x2, y2] = self.bbox() self.canvas().coords(self._tag, x1, y1, x2, y1 + height) def _tags(self): return [self._tag] def __repr__(self): return "[Space]" class ScrollWatcherWidget(CanvasWidget): """ A special canvas widget that adjusts its ``Canvas``'s scrollregion to always include the bounding boxes of all of its children. The scroll-watcher widget will only increase the size of the ``Canvas``'s scrollregion; it will never decrease it. """ def __init__(self, canvas, *children, **attribs): """ Create a new scroll-watcher widget. :type canvas: Tkinter.Canvas :param canvas: This canvas widget's canvas. :type children: list(CanvasWidget) :param children: The canvas widgets watched by the scroll-watcher. 
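# ----------------------------------------------------------------------
# Illustrative sketch (not part of the original module): stacking widgets
# vertically with StackWidget and inserting extra vertical space with a
# zero-width SpaceWidget, as described above.  Assumes a Tk display; the
# texts and sizes are hypothetical.
def _example_stack_widget():
    from nltk.draw.util import CanvasFrame, SpaceWidget, StackWidget, TextWidget

    cf = CanvasFrame(width=250, height=200)
    stack = StackWidget(
        cf.canvas(),
        TextWidget(cf.canvas(), "heading"),
        SpaceWidget(cf.canvas(), 0, 12),  # width 0: vertical space only
        TextWidget(cf.canvas(), "first line"),
        TextWidget(cf.canvas(), "second line"),
        align="left",
        space=2,
    )
    cf.add_widget(stack, 10, 10)
    cf.mainloop()
# ----------------------------------------------------------------------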
The scroll-watcher will ensure that these canvas widgets are always contained in their canvas's scrollregion. :param attribs: The new canvas widget's attributes. """ for child in children: self._add_child_widget(child) CanvasWidget.__init__(self, canvas, **attribs) def add_child(self, canvaswidget): """ Add a new canvas widget to the scroll-watcher. The scroll-watcher will ensure that the new canvas widget is always contained in its canvas's scrollregion. :param canvaswidget: The new canvas widget. :type canvaswidget: CanvasWidget :rtype: None """ self._add_child_widget(canvaswidget) self.update(canvaswidget) def remove_child(self, canvaswidget): """ Remove a canvas widget from the scroll-watcher. The scroll-watcher will no longer ensure that the new canvas widget is always contained in its canvas's scrollregion. :param canvaswidget: The canvas widget to remove. :type canvaswidget: CanvasWidget :rtype: None """ self._remove_child_widget(canvaswidget) def _tags(self): return [] def _update(self, child): self._adjust_scrollregion() def _adjust_scrollregion(self): """ Adjust the scrollregion of this scroll-watcher's ``Canvas`` to include the bounding boxes of all of its children. """ bbox = self.bbox() canvas = self.canvas() scrollregion = [int(n) for n in canvas["scrollregion"].split()] if len(scrollregion) != 4: return if ( bbox[0] < scrollregion[0] or bbox[1] < scrollregion[1] or bbox[2] > scrollregion[2] or bbox[3] > scrollregion[3] ): scrollregion = "%d %d %d %d" % ( min(bbox[0], scrollregion[0]), min(bbox[1], scrollregion[1]), max(bbox[2], scrollregion[2]), max(bbox[3], scrollregion[3]), ) canvas["scrollregion"] = scrollregion ##////////////////////////////////////////////////////// ## Canvas Frame ##////////////////////////////////////////////////////// class CanvasFrame: """ A ``Tkinter`` frame containing a canvas and scrollbars. ``CanvasFrame`` uses a ``ScrollWatcherWidget`` to ensure that all of the canvas widgets contained on its canvas are within its scrollregion. In order for ``CanvasFrame`` to make these checks, all canvas widgets must be registered with ``add_widget`` when they are added to the canvas; and destroyed with ``destroy_widget`` when they are no longer needed. If a ``CanvasFrame`` is created with no parent, then it will create its own main window, including a "Done" button and a "Print" button. """ def __init__(self, parent=None, **kw): """ Create a new ``CanvasFrame``. :type parent: Tkinter.BaseWidget or Tkinter.Tk :param parent: The parent ``Tkinter`` widget. If no parent is specified, then ``CanvasFrame`` will create a new main window. :param kw: Keyword arguments for the new ``Canvas``. See the documentation for ``Tkinter.Canvas`` for more information. """ # If no parent was given, set up a top-level window. 
if parent is None: self._parent = Tk() self._parent.title("NLTK") self._parent.bind("", lambda e: self.print_to_file()) self._parent.bind("", self.destroy) self._parent.bind("", self.destroy) else: self._parent = parent # Create a frame for the canvas & scrollbars self._frame = frame = Frame(self._parent) self._canvas = canvas = Canvas(frame, **kw) xscrollbar = Scrollbar(self._frame, orient="horizontal") yscrollbar = Scrollbar(self._frame, orient="vertical") xscrollbar["command"] = canvas.xview yscrollbar["command"] = canvas.yview canvas["xscrollcommand"] = xscrollbar.set canvas["yscrollcommand"] = yscrollbar.set yscrollbar.pack(fill="y", side="right") xscrollbar.pack(fill="x", side="bottom") canvas.pack(expand=1, fill="both", side="left") # Set initial scroll region. scrollregion = "0 0 {} {}".format(canvas["width"], canvas["height"]) canvas["scrollregion"] = scrollregion self._scrollwatcher = ScrollWatcherWidget(canvas) # If no parent was given, pack the frame, and add a menu. if parent is None: self.pack(expand=1, fill="both") self._init_menubar() def _init_menubar(self): menubar = Menu(self._parent) filemenu = Menu(menubar, tearoff=0) filemenu.add_command( label="Print to Postscript", underline=0, command=self.print_to_file, accelerator="Ctrl-p", ) filemenu.add_command( label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x" ) menubar.add_cascade(label="File", underline=0, menu=filemenu) self._parent.config(menu=menubar) def print_to_file(self, filename=None): """ Print the contents of this ``CanvasFrame`` to a postscript file. If no filename is given, then prompt the user for one. :param filename: The name of the file to print the tree to. :type filename: str :rtype: None """ if filename is None: ftypes = [("Postscript files", ".ps"), ("All files", "*")] filename = asksaveasfilename(filetypes=ftypes, defaultextension=".ps") if not filename: return (x0, y0, w, h) = self.scrollregion() postscript = self._canvas.postscript( x=x0, y=y0, width=w + 2, height=h + 2, pagewidth=w + 2, # points = 1/72 inch pageheight=h + 2, # points = 1/72 inch pagex=0, pagey=0, ) # workaround for bug in Tk font handling postscript = postscript.replace(" 0 scalefont ", " 9 scalefont ") with open(filename, "wb") as f: f.write(postscript.encode("utf8")) def scrollregion(self): """ :return: The current scroll region for the canvas managed by this ``CanvasFrame``. :rtype: 4-tuple of int """ (x1, y1, x2, y2) = self._canvas["scrollregion"].split() return (int(x1), int(y1), int(x2), int(y2)) def canvas(self): """ :return: The canvas managed by this ``CanvasFrame``. :rtype: Tkinter.Canvas """ return self._canvas def add_widget(self, canvaswidget, x=None, y=None): """ Register a canvas widget with this ``CanvasFrame``. The ``CanvasFrame`` will ensure that this canvas widget is always within the ``Canvas``'s scrollregion. If no coordinates are given for the canvas widget, then the ``CanvasFrame`` will attempt to find a clear area of the canvas for it. :type canvaswidget: CanvasWidget :param canvaswidget: The new canvas widget. ``canvaswidget`` must have been created on this ``CanvasFrame``'s canvas. :type x: int :param x: The initial x coordinate for the upper left hand corner of ``canvaswidget``, in the canvas's coordinate space. :type y: int :param y: The initial y coordinate for the upper left hand corner of ``canvaswidget``, in the canvas's coordinate space. 
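For example, a minimal sketch (requires a Tk display; the widget and variable names here are illustrative)::

            cf = CanvasFrame(width=300, height=200)
            cw = TextWidget(cf.canvas(), 'Hello')
            cf.add_widget(cw, 10, 10)   # place cw's upper-left corner at (10, 10)
            cf.mainloop()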
""" if x is None or y is None: (x, y) = self._find_room(canvaswidget, x, y) # Move to (x,y) (x1, y1, x2, y2) = canvaswidget.bbox() canvaswidget.move(x - x1, y - y1) # Register with scrollwatcher. self._scrollwatcher.add_child(canvaswidget) def _find_room(self, widget, desired_x, desired_y): """ Try to find a space for a given widget. """ (left, top, right, bot) = self.scrollregion() w = widget.width() h = widget.height() if w >= (right - left): return (0, 0) if h >= (bot - top): return (0, 0) # Move the widget out of the way, for now. (x1, y1, x2, y2) = widget.bbox() widget.move(left - x2 - 50, top - y2 - 50) if desired_x is not None: x = desired_x for y in range(top, bot - h, int((bot - top - h) / 10)): if not self._canvas.find_overlapping( x - 5, y - 5, x + w + 5, y + h + 5 ): return (x, y) if desired_y is not None: y = desired_y for x in range(left, right - w, int((right - left - w) / 10)): if not self._canvas.find_overlapping( x - 5, y - 5, x + w + 5, y + h + 5 ): return (x, y) for y in range(top, bot - h, int((bot - top - h) / 10)): for x in range(left, right - w, int((right - left - w) / 10)): if not self._canvas.find_overlapping( x - 5, y - 5, x + w + 5, y + h + 5 ): return (x, y) return (0, 0) def destroy_widget(self, canvaswidget): """ Remove a canvas widget from this ``CanvasFrame``. This deregisters the canvas widget, and destroys it. """ self.remove_widget(canvaswidget) canvaswidget.destroy() def remove_widget(self, canvaswidget): # Deregister with scrollwatcher. self._scrollwatcher.remove_child(canvaswidget) def pack(self, cnf={}, **kw): """ Pack this ``CanvasFrame``. See the documentation for ``Tkinter.Pack`` for more information. """ self._frame.pack(cnf, **kw) # Adjust to be big enough for kids? def destroy(self, *e): """ Destroy this ``CanvasFrame``. If this ``CanvasFrame`` created a top-level window, then this will close that window. """ if self._parent is None: return self._parent.destroy() self._parent = None def mainloop(self, *args, **kwargs): """ Enter the Tkinter mainloop. This function must be called if this frame is created from a non-interactive program (e.g. from a secript); otherwise, the frame will close as soon as the script completes. """ if in_idle(): return self._parent.mainloop(*args, **kwargs) ##////////////////////////////////////////////////////// ## Text display ##////////////////////////////////////////////////////// class ShowText: """ A ``Tkinter`` window used to display a text. ``ShowText`` is typically used by graphical tools to display help text, or similar information. """ def __init__(self, root, title, text, width=None, height=None, **textbox_options): if width is None or height is None: (width, height) = self.find_dimentions(text, width, height) # Create the main window. if root is None: self._top = top = Tk() else: self._top = top = Toplevel(root) top.title(title) b = Button(top, text="Ok", command=self.destroy) b.pack(side="bottom") tbf = Frame(top) tbf.pack(expand=1, fill="both") scrollbar = Scrollbar(tbf, orient="vertical") scrollbar.pack(side="right", fill="y") textbox = Text(tbf, wrap="word", width=width, height=height, **textbox_options) textbox.insert("end", text) textbox["state"] = "disabled" textbox.pack(side="left", expand=1, fill="both") scrollbar["command"] = textbox.yview textbox["yscrollcommand"] = scrollbar.set # Make it easy to close the window. 
top.bind("q", self.destroy) top.bind("x", self.destroy) top.bind("c", self.destroy) top.bind("", self.destroy) top.bind("", self.destroy) # Focus the scrollbar, so they can use up/down, etc. scrollbar.focus() def find_dimentions(self, text, width, height): lines = text.split("\n") if width is None: maxwidth = max(len(line) for line in lines) width = min(maxwidth, 80) # Now, find height. height = 0 for line in lines: while len(line) > width: brk = line[:width].rfind(" ") line = line[brk:] height += 1 height += 1 height = min(height, 25) return (width, height) def destroy(self, *e): if self._top is None: return self._top.destroy() self._top = None def mainloop(self, *args, **kwargs): """ Enter the Tkinter mainloop. This function must be called if this window is created from a non-interactive program (e.g. from a secript); otherwise, the window will close as soon as the script completes. """ if in_idle(): return self._top.mainloop(*args, **kwargs) ##////////////////////////////////////////////////////// ## Entry dialog ##////////////////////////////////////////////////////// class EntryDialog: """ A dialog box for entering """ def __init__( self, parent, original_text="", instructions="", set_callback=None, title=None ): self._parent = parent self._original_text = original_text self._set_callback = set_callback width = int(max(30, len(original_text) * 3 / 2)) self._top = Toplevel(parent) if title: self._top.title(title) # The text entry box. entryframe = Frame(self._top) entryframe.pack(expand=1, fill="both", padx=5, pady=5, ipady=10) if instructions: l = Label(entryframe, text=instructions) l.pack(side="top", anchor="w", padx=30) self._entry = Entry(entryframe, width=width) self._entry.pack(expand=1, fill="x", padx=30) self._entry.insert(0, original_text) # A divider divider = Frame(self._top, borderwidth=1, relief="sunken") divider.pack(fill="x", ipady=1, padx=10) # The buttons. buttons = Frame(self._top) buttons.pack(expand=0, fill="x", padx=5, pady=5) b = Button(buttons, text="Cancel", command=self._cancel, width=8) b.pack(side="right", padx=5) b = Button(buttons, text="Ok", command=self._ok, width=8, default="active") b.pack(side="left", padx=5) b = Button(buttons, text="Apply", command=self._apply, width=8) b.pack(side="left") self._top.bind("", self._ok) self._top.bind("", self._cancel) self._top.bind("", self._cancel) self._entry.focus() def _reset(self, *e): self._entry.delete(0, "end") self._entry.insert(0, self._original_text) if self._set_callback: self._set_callback(self._original_text) def _cancel(self, *e): try: self._reset() except: pass self._destroy() def _ok(self, *e): self._apply() self._destroy() def _apply(self, *e): if self._set_callback: self._set_callback(self._entry.get()) def _destroy(self, *e): if self._top is None: return self._top.destroy() self._top = None ##////////////////////////////////////////////////////// ## Colorized List ##////////////////////////////////////////////////////// class ColorizedList: """ An abstract base class for displaying a colorized list of items. Subclasses should define: - ``_init_colortags``, which sets up Text color tags that will be used by the list. - ``_item_repr``, which returns a list of (text,colortag) tuples that make up the colorized representation of the item. :note: Typically, you will want to register a callback for ``'select'`` that calls ``mark`` on the given item. """ def __init__(self, parent, items=[], **options): """ Construct a new list. 
:param parent: The Tk widget that contains the colorized list :param items: The initial contents of the colorized list. :param options: """ self._parent = parent self._callbacks = {} # Which items are marked? self._marks = {} # Initialize the Tkinter frames. self._init_itemframe(options.copy()) # Set up key & mouse bindings. self._textwidget.bind("", self._keypress) self._textwidget.bind("", self._buttonpress) # Fill in the given CFG's items. self._items = None self.set(items) # //////////////////////////////////////////////////////////// # Abstract methods # //////////////////////////////////////////////////////////// @abstractmethod def _init_colortags(self, textwidget, options): """ Set up any colortags that will be used by this colorized list. E.g.: >>> textwidget.tag_config('terminal', foreground='black') """ @abstractmethod def _item_repr(self, item): """ Return a list of (text, colortag) tuples that make up the colorized representation of the item. Colorized representations may not span multiple lines. I.e., the text strings returned may not contain newline characters. """ # //////////////////////////////////////////////////////////// # Item Access # //////////////////////////////////////////////////////////// def get(self, index=None): """ :return: A list of the items contained by this list. """ if index is None: return self._items[:] else: return self._items[index] def set(self, items): """ Modify the list of items contained by this list. """ items = list(items) if self._items == items: return self._items = list(items) self._textwidget["state"] = "normal" self._textwidget.delete("1.0", "end") for item in items: for (text, colortag) in self._item_repr(item): assert "\n" not in text, "item repr may not contain newline" self._textwidget.insert("end", text, colortag) self._textwidget.insert("end", "\n") # Remove the final newline self._textwidget.delete("end-1char", "end") self._textwidget.mark_set("insert", "1.0") self._textwidget["state"] = "disabled" # Clear all marks self._marks.clear() def unmark(self, item=None): """ Remove highlighting from the given item; or from every item, if no item is given. :raise ValueError: If ``item`` is not contained in the list. :raise KeyError: If ``item`` is not marked. """ if item is None: self._marks.clear() self._textwidget.tag_remove("highlight", "1.0", "end+1char") else: index = self._items.index(item) del self._marks[item] (start, end) = ("%d.0" % (index + 1), "%d.0" % (index + 2)) self._textwidget.tag_remove("highlight", start, end) def mark(self, item): """ Highlight the given item. :raise ValueError: If ``item`` is not contained in the list. """ self._marks[item] = 1 index = self._items.index(item) (start, end) = ("%d.0" % (index + 1), "%d.0" % (index + 2)) self._textwidget.tag_add("highlight", start, end) def markonly(self, item): """ Remove any current highlighting, and mark the given item. :raise ValueError: If ``item`` is not contained in the list. """ self.unmark() self.mark(item) def view(self, item): """ Adjust the view such that the given item is visible. If the item is already visible, then do nothing. """ index = self._items.index(item) self._textwidget.see("%d.0" % (index + 1)) # //////////////////////////////////////////////////////////// # Callbacks # //////////////////////////////////////////////////////////// def add_callback(self, event, func): """ Register a callback function with the list. This function will be called whenever the given event occurs. :param event: The event that will trigger the callback function. 
Valid events are: click1, click2, click3, space, return, select, up, down, next, prior, move :param func: The function that should be called when the event occurs. ``func`` will be called with a single item as its argument. (The item selected or the item moved to). """ if event == "select": events = ["click1", "space", "return"] elif event == "move": events = ["up", "down", "next", "prior"] else: events = [event] for e in events: self._callbacks.setdefault(e, {})[func] = 1 def remove_callback(self, event, func=None): """ Deregister a callback function. If ``func`` is None, then all callbacks are removed for the given event. """ if event is None: events = list(self._callbacks.keys()) elif event == "select": events = ["click1", "space", "return"] elif event == "move": events = ["up", "down", "next", "prior"] else: events = [event] for e in events: if func is None: del self._callbacks[e] else: try: del self._callbacks[e][func] except: pass # //////////////////////////////////////////////////////////// # Tkinter Methods # //////////////////////////////////////////////////////////// def pack(self, cnf={}, **kw): # "@include: Tkinter.Pack.pack" self._itemframe.pack(cnf, **kw) def grid(self, cnf={}, **kw): # "@include: Tkinter.Grid.grid" self._itemframe.grid(cnf, **kw) def focus(self): # "@include: Tkinter.Widget.focus" self._textwidget.focus() # //////////////////////////////////////////////////////////// # Internal Methods # //////////////////////////////////////////////////////////// def _init_itemframe(self, options): self._itemframe = Frame(self._parent) # Create the basic Text widget & scrollbar. options.setdefault("background", "#e0e0e0") self._textwidget = Text(self._itemframe, **options) self._textscroll = Scrollbar(self._itemframe, takefocus=0, orient="vertical") self._textwidget.config(yscrollcommand=self._textscroll.set) self._textscroll.config(command=self._textwidget.yview) self._textscroll.pack(side="right", fill="y") self._textwidget.pack(expand=1, fill="both", side="left") # Initialize the colorization tags self._textwidget.tag_config( "highlight", background="#e0ffff", border="1", relief="raised" ) self._init_colortags(self._textwidget, options) # How do I want to mark keyboard selection?
self._textwidget.tag_config("sel", foreground="") self._textwidget.tag_config( "sel", foreground="", background="", border="", underline=1 ) self._textwidget.tag_lower("highlight", "sel") def _fire_callback(self, event, itemnum): if event not in self._callbacks: return if 0 <= itemnum < len(self._items): item = self._items[itemnum] else: item = None for cb_func in list(self._callbacks[event].keys()): cb_func(item) def _buttonpress(self, event): clickloc = "@%d,%d" % (event.x, event.y) insert_point = self._textwidget.index(clickloc) itemnum = int(insert_point.split(".")[0]) - 1 self._fire_callback("click%d" % event.num, itemnum) def _keypress(self, event): if event.keysym == "Return" or event.keysym == "space": insert_point = self._textwidget.index("insert") itemnum = int(insert_point.split(".")[0]) - 1 self._fire_callback(event.keysym.lower(), itemnum) return elif event.keysym == "Down": delta = "+1line" elif event.keysym == "Up": delta = "-1line" elif event.keysym == "Next": delta = "+10lines" elif event.keysym == "Prior": delta = "-10lines" else: return "continue" self._textwidget.mark_set("insert", "insert" + delta) self._textwidget.see("insert") self._textwidget.tag_remove("sel", "1.0", "end+1char") self._textwidget.tag_add("sel", "insert linestart", "insert lineend") insert_point = self._textwidget.index("insert") itemnum = int(insert_point.split(".")[0]) - 1 self._fire_callback(event.keysym.lower(), itemnum) return "break" ##////////////////////////////////////////////////////// ## Improved OptionMenu ##////////////////////////////////////////////////////// class MutableOptionMenu(Menubutton): def __init__(self, master, values, **options): self._callback = options.get("command") if "command" in options: del options["command"] # Create a variable self._variable = variable = StringVar() if len(values) > 0: variable.set(values[0]) kw = { "borderwidth": 2, "textvariable": variable, "indicatoron": 1, "relief": RAISED, "anchor": "c", "highlightthickness": 2, } kw.update(options) Widget.__init__(self, master, "menubutton", kw) self.widgetName = "tk_optionMenu" self._menu = Menu(self, name="menu", tearoff=0) self.menuname = self._menu._w self._values = [] for value in values: self.add(value) self["menu"] = self._menu def add(self, value): if value in self._values: return def set(value=value): self.set(value) self._menu.add_command(label=value, command=set) self._values.append(value) def set(self, value): self._variable.set(value) if self._callback: self._callback(value) def remove(self, value): # Might raise indexerror: pass to parent. i = self._values.index(value) del self._values[i] self._menu.delete(i, i) def __getitem__(self, name): if name == "menu": return self.__menu return Widget.__getitem__(self, name) def destroy(self): """Destroy this widget and the associated menu.""" Menubutton.destroy(self) self._menu = None ##////////////////////////////////////////////////////// ## Test code. ##////////////////////////////////////////////////////// def demo(): """ A simple demonstration showing how to use canvas widgets. 
""" def fill(cw): from random import randint cw["fill"] = "#00%04d" % randint(0, 9999) def color(cw): from random import randint cw["color"] = "#ff%04d" % randint(0, 9999) cf = CanvasFrame(closeenough=10, width=300, height=300) c = cf.canvas() ct3 = TextWidget(c, "hiya there", draggable=1) ct2 = TextWidget(c, "o o\n||\n___\n U", draggable=1, justify="center") co = OvalWidget(c, ct2, outline="red") ct = TextWidget(c, "o o\n||\n\\___/", draggable=1, justify="center") cp = ParenWidget(c, ct, color="red") cb = BoxWidget(c, cp, fill="cyan", draggable=1, width=3, margin=10) equation = SequenceWidget( c, SymbolWidget(c, "forall"), TextWidget(c, "x"), SymbolWidget(c, "exists"), TextWidget(c, "y: "), TextWidget(c, "x"), SymbolWidget(c, "notequal"), TextWidget(c, "y"), ) space = SpaceWidget(c, 0, 30) cstack = StackWidget(c, cb, ct3, space, co, equation, align="center") prompt_msg = TextWidget( c, "try clicking\nand dragging", draggable=1, justify="center" ) cs = SequenceWidget(c, cstack, prompt_msg) zz = BracketWidget(c, cs, color="green4", width=3) cf.add_widget(zz, 60, 30) cb.bind_click(fill) ct.bind_click(color) co.bind_click(fill) ct2.bind_click(color) ct3.bind_click(color) cf.mainloop() # ShowText(None, 'title', ((('this is text'*150)+'\n')*5)) if __name__ == "__main__": demo() nltk-3.7/nltk/featstruct.py000066400000000000000000003117271420073152400160520ustar00rootroot00000000000000# Natural Language Toolkit: Feature Structures # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper , # Rob Speer, # Steven Bird # URL: # For license information, see LICENSE.TXT """ Basic data classes for representing feature structures, and for performing basic operations on those feature structures. A feature structure is a mapping from feature identifiers to feature values, where each feature value is either a basic value (such as a string or an integer), or a nested feature structure. There are two types of feature structure, implemented by two subclasses of ``FeatStruct``: - feature dictionaries, implemented by ``FeatDict``, act like Python dictionaries. Feature identifiers may be strings or instances of the ``Feature`` class. - feature lists, implemented by ``FeatList``, act like Python lists. Feature identifiers are integers. Feature structures are typically used to represent partial information about objects. A feature identifier that is not mapped to a value stands for a feature whose value is unknown (*not* a feature without a value). Two feature structures that represent (potentially overlapping) information about the same object can be combined by unification. When two inconsistent feature structures are unified, the unification fails and returns None. Features can be specified using "feature paths", or tuples of feature identifiers that specify path through the nested feature structures to a value. Feature structures may contain reentrant feature values. A "reentrant feature value" is a single feature value that can be accessed via multiple feature paths. Unification preserves the reentrance relations imposed by both of the unified feature structures. In the feature structure resulting from unification, any modifications to a reentrant feature value will be visible using any of its feature paths. Feature structure variables are encoded using the ``nltk.sem.Variable`` class. The variables' values are tracked using a bindings dictionary, which maps variables to their values. 
When two feature structures are unified, a fresh bindings dictionary is created to track their values; and before unification completes, all bound variables are replaced by their values. Thus, the bindings dictionaries are usually strictly internal to the unification process. However, it is possible to track the bindings of variables if you choose to, by supplying your own initial bindings dictionary to the ``unify()`` function. When unbound variables are unified with one another, they become aliased. This is encoded by binding one variable to the other. Lightweight Feature Structures ============================== Many of the functions defined by ``nltk.featstruct`` can be applied directly to simple Python dictionaries and lists, rather than to full-fledged ``FeatDict`` and ``FeatList`` objects. In other words, Python ``dicts`` and ``lists`` can be used as "light-weight" feature structures. >>> from nltk.featstruct import unify >>> unify(dict(x=1, y=dict()), dict(a='a', y=dict(b='b'))) # doctest: +SKIP {'y': {'b': 'b'}, 'x': 1, 'a': 'a'} However, you should keep in mind the following caveats: - Python dictionaries & lists ignore reentrance when checking for equality between values. But two FeatStructs with different reentrances are considered nonequal, even if all their base values are equal. - FeatStructs can be easily frozen, allowing them to be used as keys in hash tables. Python dictionaries and lists can not. - FeatStructs display reentrance in their string representations; Python dictionaries and lists do not. - FeatStructs may *not* be mixed with Python dictionaries and lists (e.g., when performing unification). - FeatStructs provide a number of useful methods, such as ``walk()`` and ``cyclic()``, which are not available for Python dicts and lists. In general, if your feature structures will contain any reentrances, or if you plan to use them as dictionary keys, it is strongly recommended that you use full-fledged ``FeatStruct`` objects. """ import copy import re from functools import total_ordering from nltk.internals import raise_unorderable_types, read_str from nltk.sem.logic import ( Expression, LogicalExpressionException, LogicParser, SubstituteBindingsI, Variable, ) ###################################################################### # Feature Structure ###################################################################### @total_ordering class FeatStruct(SubstituteBindingsI): """ A mapping from feature identifiers to feature values, where each feature value is either a basic value (such as a string or an integer), or a nested feature structure. There are two types of feature structure: - feature dictionaries, implemented by ``FeatDict``, act like Python dictionaries. Feature identifiers may be strings or instances of the ``Feature`` class. - feature lists, implemented by ``FeatList``, act like Python lists. Feature identifiers are integers. Feature structures may be indexed using either simple feature identifiers or 'feature paths.' A feature path is a sequence of feature identifiers that stand for a corresponding sequence of indexing operations. In particular, ``fstruct[(f1,f2,...,fn)]`` is equivalent to ``fstruct[f1][f2]...[fn]``. Feature structures may contain reentrant feature structures. A "reentrant feature structure" is a single feature structure object that can be accessed via multiple feature paths. Feature structures may also be cyclic. A feature structure is "cyclic" if there is any feature path from the feature structure to itself. 
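    For example, an illustrative sketch of feature paths and reentrance
    (the feature names are arbitrary):

        >>> from nltk.featstruct import FeatStruct
        >>> fs = FeatStruct("[A='a', B=(1)[C='c'], D->(1)]")
        >>> fs['B'] is fs['D']        # 'B' and 'D' share one reentrant value
        True
        >>> fs[('B', 'C')]            # indexing with a feature path
        'c'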
Two feature structures are considered equal if they assign the same values to all features, and have the same reentrancies. By default, feature structures are mutable. They may be made immutable with the ``freeze()`` method. Once they have been frozen, they may be hashed, and thus used as dictionary keys. """ _frozen = False """:ivar: A flag indicating whether this feature structure is frozen or not. Once this flag is set, it should never be un-set; and no further modification should be made to this feature structure.""" ##//////////////////////////////////////////////////////////// # { Constructor ##//////////////////////////////////////////////////////////// def __new__(cls, features=None, **morefeatures): """ Construct and return a new feature structure. If this constructor is called directly, then the returned feature structure will be an instance of either the ``FeatDict`` class or the ``FeatList`` class. :param features: The initial feature values for this feature structure: - FeatStruct(string) -> FeatStructReader().read(string) - FeatStruct(mapping) -> FeatDict(mapping) - FeatStruct(sequence) -> FeatList(sequence) - FeatStruct() -> FeatDict() :param morefeatures: If ``features`` is a mapping or None, then ``morefeatures`` provides additional features for the ``FeatDict`` constructor. """ # If the FeatStruct constructor is called directly, then decide # whether to create a FeatDict or a FeatList, based on the # contents of the `features` argument. if cls is FeatStruct: if features is None: return FeatDict.__new__(FeatDict, **morefeatures) elif _is_mapping(features): return FeatDict.__new__(FeatDict, features, **morefeatures) elif morefeatures: raise TypeError( "Keyword arguments may only be specified " "if features is None or is a mapping." ) if isinstance(features, str): if FeatStructReader._START_FDICT_RE.match(features): return FeatDict.__new__(FeatDict, features, **morefeatures) else: return FeatList.__new__(FeatList, features, **morefeatures) elif _is_sequence(features): return FeatList.__new__(FeatList, features) else: raise TypeError("Expected string or mapping or sequence") # Otherwise, construct the object as normal. else: return super().__new__(cls, features, **morefeatures) ##//////////////////////////////////////////////////////////// # { Uniform Accessor Methods ##//////////////////////////////////////////////////////////// # These helper functions allow the methods defined by FeatStruct # to treat all feature structures as mappings, even if they're # really lists. (Lists are treated as mappings from ints to vals) def _keys(self): """Return an iterable of the feature identifiers used by this FeatStruct.""" raise NotImplementedError() # Implemented by subclasses. def _values(self): """Return an iterable of the feature values directly defined by this FeatStruct.""" raise NotImplementedError() # Implemented by subclasses. def _items(self): """Return an iterable of (fid,fval) pairs, where fid is a feature identifier and fval is the corresponding feature value, for all features defined by this FeatStruct.""" raise NotImplementedError() # Implemented by subclasses. ##//////////////////////////////////////////////////////////// # { Equality & Hashing ##//////////////////////////////////////////////////////////// def equal_values(self, other, check_reentrance=False): """ Return True if ``self`` and ``other`` assign the same value to to every feature. 
In particular, return true if ``self[p]==other[p]`` for every feature path *p* such that ``self[p]`` or ``other[p]`` is a base value (i.e., not a nested feature structure). :param check_reentrance: If True, then also return False if there is any difference between the reentrances of ``self`` and ``other``. :note: the ``==`` is equivalent to ``equal_values()`` with ``check_reentrance=True``. """ return self._equal(other, check_reentrance, set(), set(), set()) def __eq__(self, other): """ Return true if ``self`` and ``other`` are both feature structures, assign the same values to all features, and contain the same reentrances. I.e., return ``self.equal_values(other, check_reentrance=True)``. :see: ``equal_values()`` """ return self._equal(other, True, set(), set(), set()) def __ne__(self, other): return not self == other def __lt__(self, other): if not isinstance(other, FeatStruct): # raise_unorderable_types("<", self, other) # Sometimes feature values can be pure strings, # so we need to be able to compare with non-featstructs: return self.__class__.__name__ < other.__class__.__name__ else: return len(self) < len(other) def __hash__(self): """ If this feature structure is frozen, return its hash value; otherwise, raise ``TypeError``. """ if not self._frozen: raise TypeError("FeatStructs must be frozen before they " "can be hashed.") try: return self._hash except AttributeError: self._hash = self._calculate_hashvalue(set()) return self._hash def _equal( self, other, check_reentrance, visited_self, visited_other, visited_pairs ): """ Return True iff self and other have equal values. :param visited_self: A set containing the ids of all ``self`` feature structures we've already visited. :param visited_other: A set containing the ids of all ``other`` feature structures we've already visited. :param visited_pairs: A set containing ``(selfid, otherid)`` pairs for all pairs of feature structures we've already visited. """ # If we're the same object, then we're equal. if self is other: return True # If we have different classes, we're definitely not equal. if self.__class__ != other.__class__: return False # If we define different features, we're definitely not equal. # (Perform len test first because it's faster -- we should # do profiling to see if this actually helps) if len(self) != len(other): return False if set(self._keys()) != set(other._keys()): return False # If we're checking reentrance, then any time we revisit a # structure, make sure that it was paired with the same # feature structure that it is now. Note: if check_reentrance, # then visited_pairs will never contain two pairs whose first # values are equal, or two pairs whose second values are equal. if check_reentrance: if id(self) in visited_self or id(other) in visited_other: return (id(self), id(other)) in visited_pairs # If we're not checking reentrance, then we still need to deal # with cycles. If we encounter the same (self, other) pair a # second time, then we won't learn anything more by examining # their children a second time, so just return true. else: if (id(self), id(other)) in visited_pairs: return True # Keep track of which nodes we've visited. visited_self.add(id(self)) visited_other.add(id(other)) visited_pairs.add((id(self), id(other))) # Now we have to check all values. If any of them don't match, # then return false. 
for (fname, self_fval) in self._items(): other_fval = other[fname] if isinstance(self_fval, FeatStruct): if not self_fval._equal( other_fval, check_reentrance, visited_self, visited_other, visited_pairs, ): return False else: if self_fval != other_fval: return False # Everything matched up; return true. return True def _calculate_hashvalue(self, visited): """ Return a hash value for this feature structure. :require: ``self`` must be frozen. :param visited: A set containing the ids of all feature structures we've already visited while hashing. """ if id(self) in visited: return 1 visited.add(id(self)) hashval = 5831 for (fname, fval) in sorted(self._items()): hashval *= 37 hashval += hash(fname) hashval *= 37 if isinstance(fval, FeatStruct): hashval += fval._calculate_hashvalue(visited) else: hashval += hash(fval) # Convert to a 32 bit int. hashval = int(hashval & 0x7FFFFFFF) return hashval ##//////////////////////////////////////////////////////////// # { Freezing ##//////////////////////////////////////////////////////////// #: Error message used by mutating methods when called on a frozen #: feature structure. _FROZEN_ERROR = "Frozen FeatStructs may not be modified." def freeze(self): """ Make this feature structure, and any feature structures it contains, immutable. Note: this method does not attempt to 'freeze' any feature value that is not a ``FeatStruct``; it is recommended that you use only immutable feature values. """ if self._frozen: return self._freeze(set()) def frozen(self): """ Return True if this feature structure is immutable. Feature structures can be made immutable with the ``freeze()`` method. Immutable feature structures may not be made mutable again, but new mutable copies can be produced with the ``copy()`` method. """ return self._frozen def _freeze(self, visited): """ Make this feature structure, and any feature structure it contains, immutable. :param visited: A set containing the ids of all feature structures we've already visited while freezing. """ if id(self) in visited: return visited.add(id(self)) self._frozen = True for (fname, fval) in sorted(self._items()): if isinstance(fval, FeatStruct): fval._freeze(visited) ##//////////////////////////////////////////////////////////// # { Copying ##//////////////////////////////////////////////////////////// def copy(self, deep=True): """ Return a new copy of ``self``. The new copy will not be frozen. :param deep: If true, create a deep copy; if false, create a shallow copy. """ if deep: return copy.deepcopy(self) else: return self.__class__(self) # Subclasses should define __deepcopy__ to ensure that the new # copy will not be frozen. def __deepcopy__(self, memo): raise NotImplementedError() # Implemented by subclasses. ##//////////////////////////////////////////////////////////// # { Structural Information ##//////////////////////////////////////////////////////////// def cyclic(self): """ Return True if this feature structure contains itself. """ return self._find_reentrances({})[id(self)] def walk(self): """ Return an iterator that generates this feature structure, and each feature structure it contains. Each feature structure will be generated exactly once. """ return self._walk(set()) def _walk(self, visited): """ Return an iterator that generates this feature structure, and each feature structure it contains. :param visited: A set containing the ids of all feature structures we've already visited while freezing. """ raise NotImplementedError() # Implemented by subclasses. 
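    # (The concrete generator defined below supersedes the stub defined just above.)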
def _walk(self, visited): if id(self) in visited: return visited.add(id(self)) yield self for fval in self._values(): if isinstance(fval, FeatStruct): yield from fval._walk(visited) # Walk through the feature tree. The first time we see a feature # value, map it to False (not reentrant). If we see a feature # value more than once, then map it to True (reentrant). def _find_reentrances(self, reentrances): """ Return a dictionary that maps from the ``id`` of each feature structure contained in ``self`` (including ``self``) to a boolean value, indicating whether it is reentrant or not. """ if id(self) in reentrances: # We've seen it more than once. reentrances[id(self)] = True else: # This is the first time we've seen it. reentrances[id(self)] = False # Recurse to contained feature structures. for fval in self._values(): if isinstance(fval, FeatStruct): fval._find_reentrances(reentrances) return reentrances ##//////////////////////////////////////////////////////////// # { Variables & Bindings ##//////////////////////////////////////////////////////////// def substitute_bindings(self, bindings): """:see: ``nltk.featstruct.substitute_bindings()``""" return substitute_bindings(self, bindings) def retract_bindings(self, bindings): """:see: ``nltk.featstruct.retract_bindings()``""" return retract_bindings(self, bindings) def variables(self): """:see: ``nltk.featstruct.find_variables()``""" return find_variables(self) def rename_variables(self, vars=None, used_vars=(), new_vars=None): """:see: ``nltk.featstruct.rename_variables()``""" return rename_variables(self, vars, used_vars, new_vars) def remove_variables(self): """ Return the feature structure that is obtained by deleting any feature whose value is a ``Variable``. :rtype: FeatStruct """ return remove_variables(self) ##//////////////////////////////////////////////////////////// # { Unification ##//////////////////////////////////////////////////////////// def unify(self, other, bindings=None, trace=False, fail=None, rename_vars=True): return unify(self, other, bindings, trace, fail, rename_vars) def subsumes(self, other): """ Return True if ``self`` subsumes ``other``. I.e., return true If unifying ``self`` with ``other`` would result in a feature structure equal to ``other``. """ return subsumes(self, other) ##//////////////////////////////////////////////////////////// # { String Representations ##//////////////////////////////////////////////////////////// def __repr__(self): """ Display a single-line representation of this feature structure, suitable for embedding in other representations. """ return self._repr(self._find_reentrances({}), {}) def _repr(self, reentrances, reentrance_ids): """ Return a string representation of this feature structure. :param reentrances: A dictionary that maps from the ``id`` of each feature value in self, indicating whether that value is reentrant or not. :param reentrance_ids: A dictionary mapping from each ``id`` of a feature value to a unique identifier. This is modified by ``repr``: the first time a reentrant feature value is displayed, an identifier is added to ``reentrance_ids`` for it. """ raise NotImplementedError() # Mutation: disable if frozen. _FROZEN_ERROR = "Frozen FeatStructs may not be modified." _FROZEN_NOTICE = "\n%sIf self is frozen, raise ValueError." def _check_frozen(method, indent=""): """ Given a method function, return a new method function that first checks if ``self._frozen`` is true; and if so, raises ``ValueError`` with an appropriate message. 
Otherwise, call the method and return its result. """ def wrapped(self, *args, **kwargs): if self._frozen: raise ValueError(_FROZEN_ERROR) else: return method(self, *args, **kwargs) wrapped.__name__ = method.__name__ wrapped.__doc__ = (method.__doc__ or "") + (_FROZEN_NOTICE % indent) return wrapped ###################################################################### # Feature Dictionary ###################################################################### class FeatDict(FeatStruct, dict): """ A feature structure that acts like a Python dictionary. I.e., a mapping from feature identifiers to feature values, where a feature identifier can be a string or a ``Feature``; and where a feature value can be either a basic value (such as a string or an integer), or a nested feature structure. A feature identifiers for a ``FeatDict`` is sometimes called a "feature name". Two feature dicts are considered equal if they assign the same values to all features, and have the same reentrances. :see: ``FeatStruct`` for information about feature paths, reentrance, cyclic feature structures, mutability, freezing, and hashing. """ def __init__(self, features=None, **morefeatures): """ Create a new feature dictionary, with the specified features. :param features: The initial value for this feature dictionary. If ``features`` is a ``FeatStruct``, then its features are copied (shallow copy). If ``features`` is a dict, then a feature is created for each item, mapping its key to its value. If ``features`` is a string, then it is processed using ``FeatStructReader``. If ``features`` is a list of tuples ``(name, val)``, then a feature is created for each tuple. :param morefeatures: Additional features for the new feature dictionary. If a feature is listed under both ``features`` and ``morefeatures``, then the value from ``morefeatures`` will be used. """ if isinstance(features, str): FeatStructReader().fromstring(features, self) self.update(**morefeatures) else: # update() checks the types of features. self.update(features, **morefeatures) # //////////////////////////////////////////////////////////// # { Dict methods # //////////////////////////////////////////////////////////// _INDEX_ERROR = "Expected feature name or path. Got %r." 
def __getitem__(self, name_or_path): """If the feature with the given name or path exists, return its value; otherwise, raise ``KeyError``.""" if isinstance(name_or_path, (str, Feature)): return dict.__getitem__(self, name_or_path) elif isinstance(name_or_path, tuple): try: val = self for fid in name_or_path: if not isinstance(val, FeatStruct): raise KeyError # path contains base value val = val[fid] return val except (KeyError, IndexError) as e: raise KeyError(name_or_path) from e else: raise TypeError(self._INDEX_ERROR % name_or_path) def get(self, name_or_path, default=None): """If the feature with the given name or path exists, return its value; otherwise, return ``default``.""" try: return self[name_or_path] except KeyError: return default def __contains__(self, name_or_path): """Return true if a feature with the given name or path exists.""" try: self[name_or_path] return True except KeyError: return False def has_key(self, name_or_path): """Return true if a feature with the given name or path exists.""" return name_or_path in self def __delitem__(self, name_or_path): """If the feature with the given name or path exists, delete its value; otherwise, raise ``KeyError``.""" if self._frozen: raise ValueError(_FROZEN_ERROR) if isinstance(name_or_path, (str, Feature)): return dict.__delitem__(self, name_or_path) elif isinstance(name_or_path, tuple): if len(name_or_path) == 0: raise ValueError("The path () can not be set") else: parent = self[name_or_path[:-1]] if not isinstance(parent, FeatStruct): raise KeyError(name_or_path) # path contains base value del parent[name_or_path[-1]] else: raise TypeError(self._INDEX_ERROR % name_or_path) def __setitem__(self, name_or_path, value): """Set the value for the feature with the given name or path to ``value``. 
If ``name_or_path`` is an invalid path, raise ``KeyError``.""" if self._frozen: raise ValueError(_FROZEN_ERROR) if isinstance(name_or_path, (str, Feature)): return dict.__setitem__(self, name_or_path, value) elif isinstance(name_or_path, tuple): if len(name_or_path) == 0: raise ValueError("The path () can not be set") else: parent = self[name_or_path[:-1]] if not isinstance(parent, FeatStruct): raise KeyError(name_or_path) # path contains base value parent[name_or_path[-1]] = value else: raise TypeError(self._INDEX_ERROR % name_or_path) clear = _check_frozen(dict.clear) pop = _check_frozen(dict.pop) popitem = _check_frozen(dict.popitem) setdefault = _check_frozen(dict.setdefault) def update(self, features=None, **morefeatures): if self._frozen: raise ValueError(_FROZEN_ERROR) if features is None: items = () elif hasattr(features, "items") and callable(features.items): items = features.items() elif hasattr(features, "__iter__"): items = features else: raise ValueError("Expected mapping or list of tuples") for key, val in items: if not isinstance(key, (str, Feature)): raise TypeError("Feature names must be strings") self[key] = val for key, val in morefeatures.items(): if not isinstance(key, (str, Feature)): raise TypeError("Feature names must be strings") self[key] = val ##//////////////////////////////////////////////////////////// # { Copying ##//////////////////////////////////////////////////////////// def __deepcopy__(self, memo): memo[id(self)] = selfcopy = self.__class__() for (key, val) in self._items(): selfcopy[copy.deepcopy(key, memo)] = copy.deepcopy(val, memo) return selfcopy ##//////////////////////////////////////////////////////////// # { Uniform Accessor Methods ##//////////////////////////////////////////////////////////// def _keys(self): return self.keys() def _values(self): return self.values() def _items(self): return self.items() ##//////////////////////////////////////////////////////////// # { String Representations ##//////////////////////////////////////////////////////////// def __str__(self): """ Display a multi-line representation of this feature dictionary as an FVM (feature value matrix). """ return "\n".join(self._str(self._find_reentrances({}), {})) def _repr(self, reentrances, reentrance_ids): segments = [] prefix = "" suffix = "" # If this is the first time we've seen a reentrant structure, # then assign it a unique identifier. if reentrances[id(self)]: assert id(self) not in reentrance_ids reentrance_ids[id(self)] = repr(len(reentrance_ids) + 1) # sorting note: keys are unique strings, so we'll never fall # through to comparing values. for (fname, fval) in sorted(self.items()): display = getattr(fname, "display", None) if id(fval) in reentrance_ids: segments.append(f"{fname}->({reentrance_ids[id(fval)]})") elif ( display == "prefix" and not prefix and isinstance(fval, (Variable, str)) ): prefix = "%s" % fval elif display == "slash" and not suffix: if isinstance(fval, Variable): suffix = "/%s" % fval.name else: suffix = "/%s" % repr(fval) elif isinstance(fval, Variable): segments.append(f"{fname}={fval.name}") elif fval is True: segments.append("+%s" % fname) elif fval is False: segments.append("-%s" % fname) elif isinstance(fval, Expression): segments.append(f"{fname}=<{fval}>") elif not isinstance(fval, FeatStruct): segments.append(f"{fname}={repr(fval)}") else: fval_repr = fval._repr(reentrances, reentrance_ids) segments.append(f"{fname}={fval_repr}") # If it's reentrant, then add on an identifier tag. 
if reentrances[id(self)]: prefix = f"({reentrance_ids[id(self)]}){prefix}" return "{}[{}]{}".format(prefix, ", ".join(segments), suffix) def _str(self, reentrances, reentrance_ids): """ :return: A list of lines composing a string representation of this feature dictionary. :param reentrances: A dictionary that maps from the ``id`` of each feature value in self, indicating whether that value is reentrant or not. :param reentrance_ids: A dictionary mapping from each ``id`` of a feature value to a unique identifier. This is modified by ``repr``: the first time a reentrant feature value is displayed, an identifier is added to ``reentrance_ids`` for it. """ # If this is the first time we've seen a reentrant structure, # then tack on an id string. if reentrances[id(self)]: assert id(self) not in reentrance_ids reentrance_ids[id(self)] = repr(len(reentrance_ids) + 1) # Special case: empty feature dict. if len(self) == 0: if reentrances[id(self)]: return ["(%s) []" % reentrance_ids[id(self)]] else: return ["[]"] # What's the longest feature name? Use this to align names. maxfnamelen = max(len("%s" % k) for k in self.keys()) lines = [] # sorting note: keys are unique strings, so we'll never fall # through to comparing values. for (fname, fval) in sorted(self.items()): fname = ("%s" % fname).ljust(maxfnamelen) if isinstance(fval, Variable): lines.append(f"{fname} = {fval.name}") elif isinstance(fval, Expression): lines.append(f"{fname} = <{fval}>") elif isinstance(fval, FeatList): fval_repr = fval._repr(reentrances, reentrance_ids) lines.append(f"{fname} = {repr(fval_repr)}") elif not isinstance(fval, FeatDict): # It's not a nested feature structure -- just print it. lines.append(f"{fname} = {repr(fval)}") elif id(fval) in reentrance_ids: # It's a feature structure we've seen before -- print # the reentrance id. lines.append(f"{fname} -> ({reentrance_ids[id(fval)]})") else: # It's a new feature structure. Separate it from # other values by a blank line. if lines and lines[-1] != "": lines.append("") # Recursively print the feature's value (fval). fval_lines = fval._str(reentrances, reentrance_ids) # Indent each line to make room for fname. fval_lines = [(" " * (maxfnamelen + 3)) + l for l in fval_lines] # Pick which line we'll display fname on, & splice it in. nameline = (len(fval_lines) - 1) // 2 fval_lines[nameline] = ( fname + " =" + fval_lines[nameline][maxfnamelen + 2 :] ) # Add the feature structure to the output. lines += fval_lines # Separate FeatStructs by a blank line. lines.append("") # Get rid of any excess blank lines. if lines[-1] == "": lines.pop() # Add brackets around everything. maxlen = max(len(line) for line in lines) lines = ["[ {}{} ]".format(line, " " * (maxlen - len(line))) for line in lines] # If it's reentrant, then add on an identifier tag. if reentrances[id(self)]: idstr = "(%s) " % reentrance_ids[id(self)] lines = [(" " * len(idstr)) + l for l in lines] idline = (len(lines) - 1) // 2 lines[idline] = idstr + lines[idline][len(idstr) :] return lines ###################################################################### # Feature List ###################################################################### class FeatList(FeatStruct, list): """ A list of feature values, where each feature value is either a basic value (such as a string or an integer), or a nested feature structure. Feature lists may contain reentrant feature values. A "reentrant feature value" is a single feature value that can be accessed via multiple feature paths. Feature lists may also be cyclic. 
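    For example, an illustrative sketch (the values are arbitrary):

        >>> from nltk.featstruct import FeatList
        >>> fl = FeatList(['x', FeatList(['y', 'z'])])
        >>> fl[(1, 0)]                # feature paths use integer identifiers
        'y'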
Two feature lists are considered equal if they assign the same values to all features, and have the same reentrances. :see: ``FeatStruct`` for information about feature paths, reentrance, cyclic feature structures, mutability, freezing, and hashing. """ def __init__(self, features=()): """ Create a new feature list, with the specified features. :param features: The initial list of features for this feature list. If ``features`` is a string, then it is parsed using ``FeatStructReader``. Otherwise, it should be a sequence of basic values and nested feature structures. """ if isinstance(features, str): FeatStructReader().fromstring(features, self) else: list.__init__(self, features) # //////////////////////////////////////////////////////////// # { List methods # //////////////////////////////////////////////////////////// _INDEX_ERROR = "Expected int or feature path. Got %r." def __getitem__(self, name_or_path): if isinstance(name_or_path, int): return list.__getitem__(self, name_or_path) elif isinstance(name_or_path, tuple): try: val = self for fid in name_or_path: if not isinstance(val, FeatStruct): raise KeyError # path contains base value val = val[fid] return val except (KeyError, IndexError) as e: raise KeyError(name_or_path) from e else: raise TypeError(self._INDEX_ERROR % name_or_path) def __delitem__(self, name_or_path): """If the feature with the given name or path exists, delete its value; otherwise, raise ``KeyError``.""" if self._frozen: raise ValueError(_FROZEN_ERROR) if isinstance(name_or_path, (int, slice)): return list.__delitem__(self, name_or_path) elif isinstance(name_or_path, tuple): if len(name_or_path) == 0: raise ValueError("The path () can not be set") else: parent = self[name_or_path[:-1]] if not isinstance(parent, FeatStruct): raise KeyError(name_or_path) # path contains base value del parent[name_or_path[-1]] else: raise TypeError(self._INDEX_ERROR % name_or_path) def __setitem__(self, name_or_path, value): """Set the value for the feature with the given name or path to ``value``.
If ``name_or_path`` is an invalid path, raise ``KeyError``.""" if self._frozen: raise ValueError(_FROZEN_ERROR) if isinstance(name_or_path, (int, slice)): return list.__setitem__(self, name_or_path, value) elif isinstance(name_or_path, tuple): if len(name_or_path) == 0: raise ValueError("The path () can not be set") else: parent = self[name_or_path[:-1]] if not isinstance(parent, FeatStruct): raise KeyError(name_or_path) # path contains base value parent[name_or_path[-1]] = value else: raise TypeError(self._INDEX_ERROR % name_or_path) # __delslice__ = _check_frozen(list.__delslice__, ' ') # __setslice__ = _check_frozen(list.__setslice__, ' ') __iadd__ = _check_frozen(list.__iadd__) __imul__ = _check_frozen(list.__imul__) append = _check_frozen(list.append) extend = _check_frozen(list.extend) insert = _check_frozen(list.insert) pop = _check_frozen(list.pop) remove = _check_frozen(list.remove) reverse = _check_frozen(list.reverse) sort = _check_frozen(list.sort) ##//////////////////////////////////////////////////////////// # { Copying ##//////////////////////////////////////////////////////////// def __deepcopy__(self, memo): memo[id(self)] = selfcopy = self.__class__() selfcopy.extend(copy.deepcopy(fval, memo) for fval in self) return selfcopy ##//////////////////////////////////////////////////////////// # { Uniform Accessor Methods ##//////////////////////////////////////////////////////////// def _keys(self): return list(range(len(self))) def _values(self): return self def _items(self): return enumerate(self) ##//////////////////////////////////////////////////////////// # { String Representations ##//////////////////////////////////////////////////////////// # Special handling for: reentrances, variables, expressions. def _repr(self, reentrances, reentrance_ids): # If this is the first time we've seen a reentrant structure, # then assign it a unique identifier. if reentrances[id(self)]: assert id(self) not in reentrance_ids reentrance_ids[id(self)] = repr(len(reentrance_ids) + 1) prefix = "(%s)" % reentrance_ids[id(self)] else: prefix = "" segments = [] for fval in self: if id(fval) in reentrance_ids: segments.append("->(%s)" % reentrance_ids[id(fval)]) elif isinstance(fval, Variable): segments.append(fval.name) elif isinstance(fval, Expression): segments.append("%s" % fval) elif isinstance(fval, FeatStruct): segments.append(fval._repr(reentrances, reentrance_ids)) else: segments.append("%s" % repr(fval)) return "{}[{}]".format(prefix, ", ".join(segments)) ###################################################################### # Variables & Bindings ###################################################################### def substitute_bindings(fstruct, bindings, fs_class="default"): """ Return the feature structure that is obtained by replacing each variable bound by ``bindings`` with its binding. If a variable is aliased to a bound variable, then it will be replaced by that variable's value. If a variable is aliased to an unbound variable, then it will be replaced by that variable. :type bindings: dict(Variable -> any) :param bindings: A dictionary mapping from variables to values. 
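    For example, using a light-weight (plain ``dict``) feature structure
    (an illustrative sketch; the names and values are arbitrary):

        >>> from nltk.featstruct import substitute_bindings
        >>> from nltk.sem.logic import Variable
        >>> substitute_bindings({'a': Variable('?x'), 'b': 2}, {Variable('?x'): 3})
        {'a': 3, 'b': 2}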
""" if fs_class == "default": fs_class = _default_fs_class(fstruct) fstruct = copy.deepcopy(fstruct) _substitute_bindings(fstruct, bindings, fs_class, set()) return fstruct def _substitute_bindings(fstruct, bindings, fs_class, visited): # Visit each node only once: if id(fstruct) in visited: return visited.add(id(fstruct)) if _is_mapping(fstruct): items = fstruct.items() elif _is_sequence(fstruct): items = enumerate(fstruct) else: raise ValueError("Expected mapping or sequence") for (fname, fval) in items: while isinstance(fval, Variable) and fval in bindings: fval = fstruct[fname] = bindings[fval] if isinstance(fval, fs_class): _substitute_bindings(fval, bindings, fs_class, visited) elif isinstance(fval, SubstituteBindingsI): fstruct[fname] = fval.substitute_bindings(bindings) def retract_bindings(fstruct, bindings, fs_class="default"): """ Return the feature structure that is obtained by replacing each feature structure value that is bound by ``bindings`` with the variable that binds it. A feature structure value must be identical to a bound value (i.e., have equal id) to be replaced. ``bindings`` is modified to point to this new feature structure, rather than the original feature structure. Feature structure values in ``bindings`` may be modified if they are contained in ``fstruct``. """ if fs_class == "default": fs_class = _default_fs_class(fstruct) (fstruct, new_bindings) = copy.deepcopy((fstruct, bindings)) bindings.update(new_bindings) inv_bindings = {id(val): var for (var, val) in bindings.items()} _retract_bindings(fstruct, inv_bindings, fs_class, set()) return fstruct def _retract_bindings(fstruct, inv_bindings, fs_class, visited): # Visit each node only once: if id(fstruct) in visited: return visited.add(id(fstruct)) if _is_mapping(fstruct): items = fstruct.items() elif _is_sequence(fstruct): items = enumerate(fstruct) else: raise ValueError("Expected mapping or sequence") for (fname, fval) in items: if isinstance(fval, fs_class): if id(fval) in inv_bindings: fstruct[fname] = inv_bindings[id(fval)] _retract_bindings(fval, inv_bindings, fs_class, visited) def find_variables(fstruct, fs_class="default"): """ :return: The set of variables used by this feature structure. :rtype: set(Variable) """ if fs_class == "default": fs_class = _default_fs_class(fstruct) return _variables(fstruct, set(), fs_class, set()) def _variables(fstruct, vars, fs_class, visited): # Visit each node only once: if id(fstruct) in visited: return visited.add(id(fstruct)) if _is_mapping(fstruct): items = fstruct.items() elif _is_sequence(fstruct): items = enumerate(fstruct) else: raise ValueError("Expected mapping or sequence") for (fname, fval) in items: if isinstance(fval, Variable): vars.add(fval) elif isinstance(fval, fs_class): _variables(fval, vars, fs_class, visited) elif isinstance(fval, SubstituteBindingsI): vars.update(fval.variables()) return vars def rename_variables( fstruct, vars=None, used_vars=(), new_vars=None, fs_class="default" ): """ Return the feature structure that is obtained by replacing any of this feature structure's variables that are in ``vars`` with new variables. The names for these new variables will be names that are not used by any variable in ``vars``, or in ``used_vars``, or in this feature structure. :type vars: set :param vars: The set of variables that should be renamed. If not specified, ``find_variables(fstruct)`` is used; i.e., all variables will be given new names. 
:type used_vars: set :param used_vars: A set of variables whose names should not be used by the new variables. :type new_vars: dict(Variable -> Variable) :param new_vars: A dictionary that is used to hold the mapping from old variables to new variables. For each variable *v* in this feature structure: - If ``new_vars`` maps *v* to *v'*, then *v* will be replaced by *v'*. - If ``new_vars`` does not contain *v*, but ``vars`` does contain *v*, then a new entry will be added to ``new_vars``, mapping *v* to the new variable that is used to replace it. To consistently rename the variables in a set of feature structures, simply apply rename_variables to each one, using the same dictionary: >>> from nltk.featstruct import FeatStruct >>> fstruct1 = FeatStruct('[subj=[agr=[gender=?y]], obj=[agr=[gender=?y]]]') >>> fstruct2 = FeatStruct('[subj=[agr=[number=?z,gender=?y]], obj=[agr=[number=?z,gender=?y]]]') >>> new_vars = {} # Maps old vars to alpha-renamed vars >>> fstruct1.rename_variables(new_vars=new_vars) [obj=[agr=[gender=?y2]], subj=[agr=[gender=?y2]]] >>> fstruct2.rename_variables(new_vars=new_vars) [obj=[agr=[gender=?y2, number=?z2]], subj=[agr=[gender=?y2, number=?z2]]] If new_vars is not specified, then an empty dictionary is used. """ if fs_class == "default": fs_class = _default_fs_class(fstruct) # Default values: if new_vars is None: new_vars = {} if vars is None: vars = find_variables(fstruct, fs_class) else: vars = set(vars) # Add our own variables to used_vars. used_vars = find_variables(fstruct, fs_class).union(used_vars) # Copy ourselves, and rename variables in the copy. return _rename_variables( copy.deepcopy(fstruct), vars, used_vars, new_vars, fs_class, set() ) def _rename_variables(fstruct, vars, used_vars, new_vars, fs_class, visited): if id(fstruct) in visited: return visited.add(id(fstruct)) if _is_mapping(fstruct): items = fstruct.items() elif _is_sequence(fstruct): items = enumerate(fstruct) else: raise ValueError("Expected mapping or sequence") for (fname, fval) in items: if isinstance(fval, Variable): # If it's in new_vars, then rebind it. if fval in new_vars: fstruct[fname] = new_vars[fval] # If it's in vars, pick a new name for it. elif fval in vars: new_vars[fval] = _rename_variable(fval, used_vars) fstruct[fname] = new_vars[fval] used_vars.add(new_vars[fval]) elif isinstance(fval, fs_class): _rename_variables(fval, vars, used_vars, new_vars, fs_class, visited) elif isinstance(fval, SubstituteBindingsI): # Pick new names for any variables in `vars` for var in fval.variables(): if var in vars and var not in new_vars: new_vars[var] = _rename_variable(var, used_vars) used_vars.add(new_vars[var]) # Replace all variables in `new_vars`. fstruct[fname] = fval.substitute_bindings(new_vars) return fstruct def _rename_variable(var, used_vars): name, n = re.sub(r"\d+$", "", var.name), 2 if not name: name = "?" while Variable(f"{name}{n}") in used_vars: n += 1 return Variable(f"{name}{n}") def remove_variables(fstruct, fs_class="default"): """ :rtype: FeatStruct :return: The feature structure that is obtained by deleting all features whose values are ``Variables``. 
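An illustrative doctest (added as an example; the repr shown is assumed):

>>> from nltk.featstruct import FeatStruct, remove_variables
>>> remove_variables(FeatStruct('[cat=NP, agr=?x]'))
[cat='NP']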
""" if fs_class == "default": fs_class = _default_fs_class(fstruct) return _remove_variables(copy.deepcopy(fstruct), fs_class, set()) def _remove_variables(fstruct, fs_class, visited): if id(fstruct) in visited: return visited.add(id(fstruct)) if _is_mapping(fstruct): items = list(fstruct.items()) elif _is_sequence(fstruct): items = list(enumerate(fstruct)) else: raise ValueError("Expected mapping or sequence") for (fname, fval) in items: if isinstance(fval, Variable): del fstruct[fname] elif isinstance(fval, fs_class): _remove_variables(fval, fs_class, visited) return fstruct ###################################################################### # Unification ###################################################################### class _UnificationFailure: def __repr__(self): return "nltk.featstruct.UnificationFailure" UnificationFailure = _UnificationFailure() """A unique value used to indicate unification failure. It can be returned by ``Feature.unify_base_values()`` or by custom ``fail()`` functions to indicate that unificaiton should fail.""" # The basic unification algorithm: # 1. Make copies of self and other (preserving reentrance) # 2. Destructively unify self and other # 3. Apply forward pointers, to preserve reentrance. # 4. Replace bound variables with their values. def unify( fstruct1, fstruct2, bindings=None, trace=False, fail=None, rename_vars=True, fs_class="default", ): """ Unify ``fstruct1`` with ``fstruct2``, and return the resulting feature structure. This unified feature structure is the minimal feature structure that contains all feature value assignments from both ``fstruct1`` and ``fstruct2``, and that preserves all reentrancies. If no such feature structure exists (because ``fstruct1`` and ``fstruct2`` specify incompatible values for some feature), then unification fails, and ``unify`` returns None. Bound variables are replaced by their values. Aliased variables are replaced by their representative variable (if unbound) or the value of their representative variable (if bound). I.e., if variable *v* is in ``bindings``, then *v* is replaced by ``bindings[v]``. This will be repeated until the variable is replaced by an unbound variable or a non-variable value. Unbound variables are bound when they are unified with values; and aliased when they are unified with variables. I.e., if variable *v* is not in ``bindings``, and is unified with a variable or value *x*, then ``bindings[v]`` is set to *x*. If ``bindings`` is unspecified, then all variables are assumed to be unbound. I.e., ``bindings`` defaults to an empty dict. >>> from nltk.featstruct import FeatStruct >>> FeatStruct('[a=?x]').unify(FeatStruct('[b=?x]')) [a=?x, b=?x2] :type bindings: dict(Variable -> any) :param bindings: A set of variable bindings to be used and updated during unification. :type trace: bool :param trace: If true, generate trace output. :type rename_vars: bool :param rename_vars: If True, then rename any variables in ``fstruct2`` that are also used in ``fstruct1``, in order to avoid collisions on variable names. """ # Decide which class(es) will be treated as feature structures, # for the purposes of unification. if fs_class == "default": fs_class = _default_fs_class(fstruct1) if _default_fs_class(fstruct2) != fs_class: raise ValueError( "Mixing FeatStruct objects with Python " "dicts and lists is not supported." ) assert isinstance(fstruct1, fs_class) assert isinstance(fstruct2, fs_class) # If bindings are unspecified, use an empty set of bindings. 
user_bindings = bindings is not None if bindings is None: bindings = {} # Make copies of fstruct1 and fstruct2 (since the unification # algorithm is destructive). Do it all at once, to preserve # reentrance links between fstruct1 and fstruct2. Copy bindings # as well, in case there are any bound vars that contain parts # of fstruct1 or fstruct2. (fstruct1copy, fstruct2copy, bindings_copy) = copy.deepcopy( (fstruct1, fstruct2, bindings) ) # Copy the bindings back to the original bindings dict. bindings.update(bindings_copy) if rename_vars: vars1 = find_variables(fstruct1copy, fs_class) vars2 = find_variables(fstruct2copy, fs_class) _rename_variables(fstruct2copy, vars1, vars2, {}, fs_class, set()) # Do the actual unification. If it fails, return None. forward = {} if trace: _trace_unify_start((), fstruct1copy, fstruct2copy) try: result = _destructively_unify( fstruct1copy, fstruct2copy, bindings, forward, trace, fail, fs_class, () ) except _UnificationFailureError: return None # _destructively_unify might return UnificationFailure, e.g. if we # tried to unify a mapping with a sequence. if result is UnificationFailure: if fail is None: return None else: return fail(fstruct1copy, fstruct2copy, ()) # Replace any feature structure that has a forward pointer # with the target of its forward pointer. result = _apply_forwards(result, forward, fs_class, set()) if user_bindings: _apply_forwards_to_bindings(forward, bindings) # Replace bound vars with values. _resolve_aliases(bindings) _substitute_bindings(result, bindings, fs_class, set()) # Return the result. if trace: _trace_unify_succeed((), result) if trace: _trace_bindings((), bindings) return result class _UnificationFailureError(Exception): """An exception that is used by ``_destructively_unify`` to abort unification when a failure is encountered.""" def _destructively_unify( fstruct1, fstruct2, bindings, forward, trace, fail, fs_class, path ): """ Attempt to unify ``fstruct1`` and ``fstruct2`` by modifying them in-place. If the unification succeeds, then ``fstruct1`` will contain the unified value, the value of ``fstruct2`` is undefined, and forward[id(fstruct2)] is set to fstruct1. If the unification fails, then a _UnificationFailureError is raised, and the values of ``fstruct1`` and ``fstruct2`` are undefined. :param bindings: A dictionary mapping variables to values. :param forward: A dictionary mapping feature structures ids to replacement structures. When two feature structures are merged, a mapping from one to the other will be added to the forward dictionary; and changes will be made only to the target of the forward dictionary. ``_destructively_unify`` will always 'follow' any links in the forward dictionary for fstruct1 and fstruct2 before actually unifying them. :param trace: If true, generate trace output :param path: The feature path that led us to this unification step. Used for trace output. """ # If fstruct1 is already identical to fstruct2, we're done. # Note: this, together with the forward pointers, ensures # that unification will terminate even for cyclic structures. if fstruct1 is fstruct2: if trace: _trace_unify_identity(path, fstruct1) return fstruct1 # Set fstruct2's forward pointer to point to fstruct1; this makes # fstruct1 the canonical copy for fstruct2. Note that we need to # do this before we recurse into any child structures, in case # they're cyclic. 
forward[id(fstruct2)] = fstruct1 # Unifying two mappings: if _is_mapping(fstruct1) and _is_mapping(fstruct2): for fname in fstruct1: if getattr(fname, "default", None) is not None: fstruct2.setdefault(fname, fname.default) for fname in fstruct2: if getattr(fname, "default", None) is not None: fstruct1.setdefault(fname, fname.default) # Unify any values that are defined in both fstruct1 and # fstruct2. Copy any values that are defined in fstruct2 but # not in fstruct1 to fstruct1. Note: sorting fstruct2's # features isn't actually necessary; but we do it to give # deterministic behavior, e.g. for tracing. for fname, fval2 in sorted(fstruct2.items()): if fname in fstruct1: fstruct1[fname] = _unify_feature_values( fname, fstruct1[fname], fval2, bindings, forward, trace, fail, fs_class, path + (fname,), ) else: fstruct1[fname] = fval2 return fstruct1 # Contains the unified value. # Unifying two sequences: elif _is_sequence(fstruct1) and _is_sequence(fstruct2): # If the lengths don't match, fail. if len(fstruct1) != len(fstruct2): return UnificationFailure # Unify corresponding values in fstruct1 and fstruct2. for findex in range(len(fstruct1)): fstruct1[findex] = _unify_feature_values( findex, fstruct1[findex], fstruct2[findex], bindings, forward, trace, fail, fs_class, path + (findex,), ) return fstruct1 # Contains the unified value. # Unifying sequence & mapping: fail. The failure function # doesn't get a chance to recover in this case. elif (_is_sequence(fstruct1) or _is_mapping(fstruct1)) and ( _is_sequence(fstruct2) or _is_mapping(fstruct2) ): return UnificationFailure # Unifying anything else: not allowed! raise TypeError("Expected mappings or sequences") def _unify_feature_values( fname, fval1, fval2, bindings, forward, trace, fail, fs_class, fpath ): """ Attempt to unify ``fval1`` and and ``fval2``, and return the resulting unified value. The method of unification will depend on the types of ``fval1`` and ``fval2``: 1. If they're both feature structures, then destructively unify them (see ``_destructively_unify()``. 2. If they're both unbound variables, then alias one variable to the other (by setting bindings[v2]=v1). 3. If one is an unbound variable, and the other is a value, then bind the unbound variable to the value. 4. If one is a feature structure, and the other is a base value, then fail. 5. If they're both base values, then unify them. By default, this will succeed if they are equal, and fail otherwise. """ if trace: _trace_unify_start(fpath, fval1, fval2) # Look up the "canonical" copy of fval1 and fval2 while id(fval1) in forward: fval1 = forward[id(fval1)] while id(fval2) in forward: fval2 = forward[id(fval2)] # If fval1 or fval2 is a bound variable, then # replace it by the variable's bound value. This # includes aliased variables, which are encoded as # variables bound to other variables. 
fvar1 = fvar2 = None while isinstance(fval1, Variable) and fval1 in bindings: fvar1 = fval1 fval1 = bindings[fval1] while isinstance(fval2, Variable) and fval2 in bindings: fvar2 = fval2 fval2 = bindings[fval2] # Case 1: Two feature structures (recursive case) if isinstance(fval1, fs_class) and isinstance(fval2, fs_class): result = _destructively_unify( fval1, fval2, bindings, forward, trace, fail, fs_class, fpath ) # Case 2: Two unbound variables (create alias) elif isinstance(fval1, Variable) and isinstance(fval2, Variable): if fval1 != fval2: bindings[fval2] = fval1 result = fval1 # Case 3: An unbound variable and a value (bind) elif isinstance(fval1, Variable): bindings[fval1] = fval2 result = fval1 elif isinstance(fval2, Variable): bindings[fval2] = fval1 result = fval2 # Case 4: A feature structure & a base value (fail) elif isinstance(fval1, fs_class) or isinstance(fval2, fs_class): result = UnificationFailure # Case 5: Two base values else: # Case 5a: Feature defines a custom unification method for base values if isinstance(fname, Feature): result = fname.unify_base_values(fval1, fval2, bindings) # Case 5b: Feature value defines custom unification method elif isinstance(fval1, CustomFeatureValue): result = fval1.unify(fval2) # Sanity check: unify value should be symmetric if isinstance(fval2, CustomFeatureValue) and result != fval2.unify(fval1): raise AssertionError( "CustomFeatureValue objects %r and %r disagree " "about unification value: %r vs. %r" % (fval1, fval2, result, fval2.unify(fval1)) ) elif isinstance(fval2, CustomFeatureValue): result = fval2.unify(fval1) # Case 5c: Simple values -- check if they're equal. else: if fval1 == fval2: result = fval1 else: result = UnificationFailure # If either value was a bound variable, then update the # bindings. (This is really only necessary if fname is a # Feature or if either value is a CustomFeatureValue.) if result is not UnificationFailure: if fvar1 is not None: bindings[fvar1] = result result = fvar1 if fvar2 is not None and fvar2 != fvar1: bindings[fvar2] = result result = fvar2 # If we unification failed, call the failure function; it # might decide to continue anyway. if result is UnificationFailure: if fail is not None: result = fail(fval1, fval2, fpath) if trace: _trace_unify_fail(fpath[:-1], result) if result is UnificationFailure: raise _UnificationFailureError # Normalize the result. if isinstance(result, fs_class): result = _apply_forwards(result, forward, fs_class, set()) if trace: _trace_unify_succeed(fpath, result) if trace and isinstance(result, fs_class): _trace_bindings(fpath, bindings) return result def _apply_forwards_to_bindings(forward, bindings): """ Replace any feature structure that has a forward pointer with the target of its forward pointer (to preserve reentrancy). """ for (var, value) in bindings.items(): while id(value) in forward: value = forward[id(value)] bindings[var] = value def _apply_forwards(fstruct, forward, fs_class, visited): """ Replace any feature structure that has a forward pointer with the target of its forward pointer (to preserve reentrancy). """ # Follow our own forwards pointers (if any) while id(fstruct) in forward: fstruct = forward[id(fstruct)] # Visit each node only once: if id(fstruct) in visited: return visited.add(id(fstruct)) if _is_mapping(fstruct): items = fstruct.items() elif _is_sequence(fstruct): items = enumerate(fstruct) else: raise ValueError("Expected mapping or sequence") for fname, fval in items: if isinstance(fval, fs_class): # Replace w/ forwarded value. 
while id(fval) in forward: fval = forward[id(fval)] fstruct[fname] = fval # Recurse to child. _apply_forwards(fval, forward, fs_class, visited) return fstruct def _resolve_aliases(bindings): """ Replace any bound aliased vars with their binding; and replace any unbound aliased vars with their representative var. """ for (var, value) in bindings.items(): while isinstance(value, Variable) and value in bindings: value = bindings[var] = bindings[value] def _trace_unify_start(path, fval1, fval2): if path == (): print("\nUnification trace:") else: fullname = ".".join("%s" % n for n in path) print(" " + "| " * (len(path) - 1) + "|") print(" " + "| " * (len(path) - 1) + "| Unify feature: %s" % fullname) print(" " + "| " * len(path) + " / " + _trace_valrepr(fval1)) print(" " + "| " * len(path) + "|\\ " + _trace_valrepr(fval2)) def _trace_unify_identity(path, fval1): print(" " + "| " * len(path) + "|") print(" " + "| " * len(path) + "| (identical objects)") print(" " + "| " * len(path) + "|") print(" " + "| " * len(path) + "+-->" + repr(fval1)) def _trace_unify_fail(path, result): if result is UnificationFailure: resume = "" else: resume = " (nonfatal)" print(" " + "| " * len(path) + "| |") print(" " + "X " * len(path) + "X X <-- FAIL" + resume) def _trace_unify_succeed(path, fval1): # Print the result. print(" " + "| " * len(path) + "|") print(" " + "| " * len(path) + "+-->" + repr(fval1)) def _trace_bindings(path, bindings): # Print the bindings (if any). if len(bindings) > 0: binditems = sorted(bindings.items(), key=lambda v: v[0].name) bindstr = "{%s}" % ", ".join( f"{var}: {_trace_valrepr(val)}" for (var, val) in binditems ) print(" " + "| " * len(path) + " Bindings: " + bindstr) def _trace_valrepr(val): if isinstance(val, Variable): return "%s" % val else: return "%s" % repr(val) def subsumes(fstruct1, fstruct2): """ Return True if ``fstruct1`` subsumes ``fstruct2``. I.e., return true if unifying ``fstruct1`` with ``fstruct2`` would result in a feature structure equal to ``fstruct2.`` :rtype: bool """ return fstruct2 == unify(fstruct1, fstruct2) def conflicts(fstruct1, fstruct2, trace=0): """ Return a list of the feature paths of all features which are assigned incompatible values by ``fstruct1`` and ``fstruct2``. :rtype: list(tuple) """ conflict_list = [] def add_conflict(fval1, fval2, path): conflict_list.append(path) return fval1 unify(fstruct1, fstruct2, fail=add_conflict, trace=trace) return conflict_list ###################################################################### # Helper Functions ###################################################################### def _is_mapping(v): return hasattr(v, "__contains__") and hasattr(v, "keys") def _is_sequence(v): return hasattr(v, "__iter__") and hasattr(v, "__len__") and not isinstance(v, str) def _default_fs_class(obj): if isinstance(obj, FeatStruct): return FeatStruct if isinstance(obj, (dict, list)): return (dict, list) else: raise ValueError( "To unify objects of type %s, you must specify " "fs_class explicitly." % obj.__class__.__name__ ) ###################################################################### # FeatureValueSet & FeatureValueTuple ###################################################################### class SubstituteBindingsSequence(SubstituteBindingsI): """ A mixin class for sequence classes that distributes variables() and substitute_bindings() over the object's elements. 
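A small illustrative example (added here as a sketch; ``FeatureValueTuple`` is one of the concrete subclasses defined below, and the repr shown is assumed):

>>> from nltk.featstruct import FeatureValueTuple
>>> from nltk.sem.logic import Variable
>>> FeatureValueTuple(['a', Variable('?x')]).variables()
[Variable('?x')]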
""" def variables(self): return [elt for elt in self if isinstance(elt, Variable)] + sum( ( list(elt.variables()) for elt in self if isinstance(elt, SubstituteBindingsI) ), [], ) def substitute_bindings(self, bindings): return self.__class__([self.subst(v, bindings) for v in self]) def subst(self, v, bindings): if isinstance(v, SubstituteBindingsI): return v.substitute_bindings(bindings) else: return bindings.get(v, v) class FeatureValueTuple(SubstituteBindingsSequence, tuple): """ A base feature value that is a tuple of other base feature values. FeatureValueTuple implements ``SubstituteBindingsI``, so it any variable substitutions will be propagated to the elements contained by the set. A ``FeatureValueTuple`` is immutable. """ def __repr__(self): # [xx] really use %s here? if len(self) == 0: return "()" return "(%s)" % ", ".join(f"{b}" for b in self) class FeatureValueSet(SubstituteBindingsSequence, frozenset): """ A base feature value that is a set of other base feature values. FeatureValueSet implements ``SubstituteBindingsI``, so it any variable substitutions will be propagated to the elements contained by the set. A ``FeatureValueSet`` is immutable. """ def __repr__(self): # [xx] really use %s here? if len(self) == 0: return "{/}" # distinguish from dict. # n.b., we sort the string reprs of our elements, to ensure # that our own repr is deterministic. return "{%s}" % ", ".join(sorted(f"{b}" for b in self)) __str__ = __repr__ class FeatureValueUnion(SubstituteBindingsSequence, frozenset): """ A base feature value that represents the union of two or more ``FeatureValueSet`` or ``Variable``. """ def __new__(cls, values): # If values contains FeatureValueUnions, then collapse them. values = _flatten(values, FeatureValueUnion) # If the resulting list contains no variables, then # use a simple FeatureValueSet instead. if sum(isinstance(v, Variable) for v in values) == 0: values = _flatten(values, FeatureValueSet) return FeatureValueSet(values) # If we contain a single variable, return that variable. if len(values) == 1: return list(values)[0] # Otherwise, build the FeatureValueUnion. return frozenset.__new__(cls, values) def __repr__(self): # n.b., we sort the string reprs of our elements, to ensure # that our own repr is deterministic. also, note that len(self) # is guaranteed to be 2 or more. return "{%s}" % "+".join(sorted(f"{b}" for b in self)) class FeatureValueConcat(SubstituteBindingsSequence, tuple): """ A base feature value that represents the concatenation of two or more ``FeatureValueTuple`` or ``Variable``. """ def __new__(cls, values): # If values contains FeatureValueConcats, then collapse them. values = _flatten(values, FeatureValueConcat) # If the resulting list contains no variables, then # use a simple FeatureValueTuple instead. if sum(isinstance(v, Variable) for v in values) == 0: values = _flatten(values, FeatureValueTuple) return FeatureValueTuple(values) # If we contain a single variable, return that variable. if len(values) == 1: return list(values)[0] # Otherwise, build the FeatureValueConcat. return tuple.__new__(cls, values) def __repr__(self): # n.b.: len(self) is guaranteed to be 2 or more. return "(%s)" % "+".join(f"{b}" for b in self) def _flatten(lst, cls): """ Helper function -- return a copy of list, with all elements of type ``cls`` spliced in rather than appended in. 
""" result = [] for elt in lst: if isinstance(elt, cls): result.extend(elt) else: result.append(elt) return result ###################################################################### # Specialized Features ###################################################################### @total_ordering class Feature: """ A feature identifier that's specialized to put additional constraints, default values, etc. """ def __init__(self, name, default=None, display=None): assert display in (None, "prefix", "slash") self._name = name # [xx] rename to .identifier? self._default = default # [xx] not implemented yet. self._display = display if self._display == "prefix": self._sortkey = (-1, self._name) elif self._display == "slash": self._sortkey = (1, self._name) else: self._sortkey = (0, self._name) @property def name(self): """The name of this feature.""" return self._name @property def default(self): """Default value for this feature.""" return self._default @property def display(self): """Custom display location: can be prefix, or slash.""" return self._display def __repr__(self): return "*%s*" % self.name def __lt__(self, other): if isinstance(other, str): return True if not isinstance(other, Feature): raise_unorderable_types("<", self, other) return self._sortkey < other._sortkey def __eq__(self, other): return type(self) == type(other) and self._name == other._name def __ne__(self, other): return not self == other def __hash__(self): return hash(self._name) # //////////////////////////////////////////////////////////// # These can be overridden by subclasses: # //////////////////////////////////////////////////////////// def read_value(self, s, position, reentrances, parser): return parser.read_value(s, position, reentrances) def unify_base_values(self, fval1, fval2, bindings): """ If possible, return a single value.. If not, return the value ``UnificationFailure``. """ if fval1 == fval2: return fval1 else: return UnificationFailure class SlashFeature(Feature): def read_value(self, s, position, reentrances, parser): return parser.read_partial(s, position, reentrances) class RangeFeature(Feature): RANGE_RE = re.compile(r"(-?\d+):(-?\d+)") def read_value(self, s, position, reentrances, parser): m = self.RANGE_RE.match(s, position) if not m: raise ValueError("range", position) return (int(m.group(1)), int(m.group(2))), m.end() def unify_base_values(self, fval1, fval2, bindings): if fval1 is None: return fval2 if fval2 is None: return fval1 rng = max(fval1[0], fval2[0]), min(fval1[1], fval2[1]) if rng[1] < rng[0]: return UnificationFailure return rng SLASH = SlashFeature("slash", default=False, display="slash") TYPE = Feature("type", display="prefix") ###################################################################### # Specialized Feature Values ###################################################################### @total_ordering class CustomFeatureValue: """ An abstract base class for base values that define a custom unification method. The custom unification method of ``CustomFeatureValue`` will be used during unification if: - The ``CustomFeatureValue`` is unified with another base value. - The ``CustomFeatureValue`` is not the value of a customized ``Feature`` (which defines its own unification method). If two ``CustomFeatureValue`` objects are unified with one another during feature structure unification, then the unified base values they return *must* be equal; otherwise, an ``AssertionError`` will be raised. Subclasses must define ``unify()``, ``__eq__()`` and ``__lt__()``. 
Subclasses may also wish to define ``__hash__()``. """ def unify(self, other): """ If this base value unifies with ``other``, then return the unified value. Otherwise, return ``UnificationFailure``. """ raise NotImplementedError("abstract base class") def __eq__(self, other): raise NotImplementedError("abstract base class") def __ne__(self, other): return not self == other def __lt__(self, other): raise NotImplementedError("abstract base class") def __hash__(self): raise TypeError("%s objects are unhashable" % self.__class__.__name__) ###################################################################### # Feature Structure Reader ###################################################################### class FeatStructReader: def __init__( self, features=(SLASH, TYPE), fdict_class=FeatStruct, flist_class=FeatList, logic_parser=None, ): self._features = {f.name: f for f in features} self._fdict_class = fdict_class self._flist_class = flist_class self._prefix_feature = None self._slash_feature = None for feature in features: if feature.display == "slash": if self._slash_feature: raise ValueError("Multiple features w/ display=slash") self._slash_feature = feature if feature.display == "prefix": if self._prefix_feature: raise ValueError("Multiple features w/ display=prefix") self._prefix_feature = feature self._features_with_defaults = [ feature for feature in features if feature.default is not None ] if logic_parser is None: logic_parser = LogicParser() self._logic_parser = logic_parser def fromstring(self, s, fstruct=None): """ Convert a string representation of a feature structure (as displayed by repr) into a ``FeatStruct``. This process imposes the following restrictions on the string representation: - Feature names cannot contain any of the following: whitespace, parentheses, quote marks, equals signs, dashes, commas, and square brackets. Feature names may not begin with plus signs or minus signs. - Only the following basic feature values are supported: strings, integers, variables, None, and unquoted alphanumeric strings. - For reentrant values, the first mention must specify a reentrance identifier and a value; and any subsequent mentions must use arrows (``'->'``) to reference the reentrance identifier. """ s = s.strip() value, position = self.read_partial(s, 0, {}, fstruct) if position != len(s): self._error(s, "end of string", position) return value _START_FSTRUCT_RE = re.compile(r"\s*(?:\((\d+)\)\s*)?(\??[\w-]+)?(\[)") _END_FSTRUCT_RE = re.compile(r"\s*]\s*") _SLASH_RE = re.compile(r"/") _FEATURE_NAME_RE = re.compile(r'\s*([+-]?)([^\s\(\)<>"\'\-=\[\],]+)\s*') _REENTRANCE_RE = re.compile(r"\s*->\s*") _TARGET_RE = re.compile(r"\s*\((\d+)\)\s*") _ASSIGN_RE = re.compile(r"\s*=\s*") _COMMA_RE = re.compile(r"\s*,\s*") _BARE_PREFIX_RE = re.compile(r"\s*(?:\((\d+)\)\s*)?(\??[\w-]+\s*)()") # This one is used to distinguish fdicts from flists: _START_FDICT_RE = re.compile( r"(%s)|(%s\s*(%s\s*(=|->)|[+-]%s|\]))" % ( _BARE_PREFIX_RE.pattern, _START_FSTRUCT_RE.pattern, _FEATURE_NAME_RE.pattern, _FEATURE_NAME_RE.pattern, ) ) def read_partial(self, s, position=0, reentrances=None, fstruct=None): """ Helper function that reads in a feature structure. :param s: The string to read. :param position: The position in the string to start parsing. :param reentrances: A dictionary from reentrance ids to values. Defaults to an empty dictionary. :return: A tuple (val, pos) of the feature structure created by parsing and the position where the parsed feature structure ends.
:rtype: tuple(FeatStruct, int) """ if reentrances is None: reentrances = {} try: return self._read_partial(s, position, reentrances, fstruct) except ValueError as e: if len(e.args) != 2: raise self._error(s, *e.args) def _read_partial(self, s, position, reentrances, fstruct=None): # Create the new feature structure if fstruct is None: if self._START_FDICT_RE.match(s, position): fstruct = self._fdict_class() else: fstruct = self._flist_class() # Read up to the open bracket. match = self._START_FSTRUCT_RE.match(s, position) if not match: match = self._BARE_PREFIX_RE.match(s, position) if not match: raise ValueError("open bracket or identifier", position) position = match.end() # If there was an identifier, record it. if match.group(1): identifier = match.group(1) if identifier in reentrances: raise ValueError("new identifier", match.start(1)) reentrances[identifier] = fstruct if isinstance(fstruct, FeatDict): fstruct.clear() return self._read_partial_featdict(s, position, match, reentrances, fstruct) else: del fstruct[:] return self._read_partial_featlist(s, position, match, reentrances, fstruct) def _read_partial_featlist(self, s, position, match, reentrances, fstruct): # Prefix features are not allowed: if match.group(2): raise ValueError("open bracket") # Bare prefixes are not allowed: if not match.group(3): raise ValueError("open bracket") # Build a list of the features defined by the structure. while position < len(s): # Check for the close bracket. match = self._END_FSTRUCT_RE.match(s, position) if match is not None: return fstruct, match.end() # Reentrances have the form "-> (target)" match = self._REENTRANCE_RE.match(s, position) if match: position = match.end() match = self._TARGET_RE.match(s, position) if not match: raise ValueError("identifier", position) target = match.group(1) if target not in reentrances: raise ValueError("bound identifier", position) position = match.end() fstruct.append(reentrances[target]) # Anything else is a value. else: value, position = self._read_value(0, s, position, reentrances) fstruct.append(value) # If there's a close bracket, handle it at the top of the loop. if self._END_FSTRUCT_RE.match(s, position): continue # Otherwise, there should be a comma match = self._COMMA_RE.match(s, position) if match is None: raise ValueError("comma", position) position = match.end() # We never saw a close bracket. raise ValueError("close bracket", position) def _read_partial_featdict(self, s, position, match, reentrances, fstruct): # If there was a prefix feature, record it. if match.group(2): if self._prefix_feature is None: raise ValueError("open bracket or identifier", match.start(2)) prefixval = match.group(2).strip() if prefixval.startswith("?"): prefixval = Variable(prefixval) fstruct[self._prefix_feature] = prefixval # If group 3 is empty, then we just have a bare prefix, so # we're done. if not match.group(3): return self._finalize(s, match.end(), reentrances, fstruct) # Build a list of the features defined by the structure. # Each feature has one of the following forms: # name = value # name -> (target) # +name # -name while position < len(s): # Use these variables to hold info about each feature: name = value = None # Check for the close bracket.
match = self._END_FSTRUCT_RE.match(s, position) if match is not None: return self._finalize(s, match.end(), reentrances, fstruct) # Get the feature name's name match = self._FEATURE_NAME_RE.match(s, position) if match is None: raise ValueError("feature name", position) name = match.group(2) position = match.end() # Check if it's a special feature. if name[0] == "*" and name[-1] == "*": name = self._features.get(name[1:-1]) if name is None: raise ValueError("known special feature", match.start(2)) # Check if this feature has a value already. if name in fstruct: raise ValueError("new name", match.start(2)) # Boolean value ("+name" or "-name") if match.group(1) == "+": value = True if match.group(1) == "-": value = False # Reentrance link ("-> (target)") if value is None: match = self._REENTRANCE_RE.match(s, position) if match is not None: position = match.end() match = self._TARGET_RE.match(s, position) if not match: raise ValueError("identifier", position) target = match.group(1) if target not in reentrances: raise ValueError("bound identifier", position) position = match.end() value = reentrances[target] # Assignment ("= value"). if value is None: match = self._ASSIGN_RE.match(s, position) if match: position = match.end() value, position = self._read_value(name, s, position, reentrances) # None of the above: error. else: raise ValueError("equals sign", position) # Store the value. fstruct[name] = value # If there's a close bracket, handle it at the top of the loop. if self._END_FSTRUCT_RE.match(s, position): continue # Otherwise, there should be a comma match = self._COMMA_RE.match(s, position) if match is None: raise ValueError("comma", position) position = match.end() # We never saw a close bracket. raise ValueError("close bracket", position) def _finalize(self, s, pos, reentrances, fstruct): """ Called when we see the close brace -- checks for a slash feature, and adds in default values. """ # Add the slash feature (if any) match = self._SLASH_RE.match(s, pos) if match: name = self._slash_feature v, pos = self._read_value(name, s, match.end(), reentrances) fstruct[name] = v ## Add any default features. -- handle in unficiation instead? # for feature in self._features_with_defaults: # fstruct.setdefault(feature, feature.default) # Return the value. return fstruct, pos def _read_value(self, name, s, position, reentrances): if isinstance(name, Feature): return name.read_value(s, position, reentrances, self) else: return self.read_value(s, position, reentrances) def read_value(self, s, position, reentrances): for (handler, regexp) in self.VALUE_HANDLERS: match = regexp.match(s, position) if match: handler_func = getattr(self, handler) return handler_func(s, position, reentrances, match) raise ValueError("value", position) def _error(self, s, expected, position): lines = s.split("\n") while position > len(lines[0]): position -= len(lines.pop(0)) + 1 # +1 for the newline. estr = ( "Error parsing feature structure\n " + lines[0] + "\n " + " " * position + "^ " + "Expected %s" % expected ) raise ValueError(estr) # //////////////////////////////////////////////////////////// # { Value Readers # //////////////////////////////////////////////////////////// #: A table indicating how feature values should be processed. Each #: entry in the table is a pair (handler, regexp). The first entry #: with a matching regexp will have its handler called. Handlers #: should have the following signature:: #: #: def handler(s, position, reentrances, match): ... 
#: # and should return a tuple (value, position), where position is #: the string position where the value ended. (n.b.: order is #: important here!) VALUE_HANDLERS = [ ("read_fstruct_value", _START_FSTRUCT_RE), ("read_var_value", re.compile(r"\?[a-zA-Z_][a-zA-Z0-9_]*")), ("read_str_value", re.compile("[uU]?[rR]?(['\"])")), ("read_int_value", re.compile(r"-?\d+")), ("read_sym_value", re.compile(r"[a-zA-Z_][a-zA-Z0-9_]*")), ( "read_app_value", re.compile(r"<(app)\((\?[a-z][a-z]*)\s*," r"\s*(\?[a-z][a-z]*)\)>"), ), # ('read_logic_value', re.compile(r'<([^>]*)>')), # lazily match any character after '<' until we hit a '>' not preceded by '-' ("read_logic_value", re.compile(r"<(.*?)(?<!-)>")), ("read_set_value", re.compile(r"{")), ("read_tuple_value", re.compile(r"\(")), ] def read_fstruct_value(self, s, position, reentrances, match): return self.read_partial(s, position, reentrances) def read_str_value(self, s, position, reentrances, match): return read_str(s, position) def read_int_value(self, s, position, reentrances, match): return int(match.group()), match.end() # Note: the '?' is included in the variable name. def read_var_value(self, s, position, reentrances, match): return Variable(match.group()), match.end() _SYM_CONSTS = {"None": None, "True": True, "False": False} def read_sym_value(self, s, position, reentrances, match): val, end = match.group(), match.end() return self._SYM_CONSTS.get(val, val), end def read_app_value(self, s, position, reentrances, match): """Mainly included for backwards compat.""" return self._logic_parser.parse("%s(%s)" % match.group(2, 3)), match.end() def read_logic_value(self, s, position, reentrances, match): try: try: expr = self._logic_parser.parse(match.group(1)) except LogicalExpressionException as e: raise ValueError from e return expr, match.end() except ValueError as e: raise ValueError("logic expression", match.start(1)) from e def read_tuple_value(self, s, position, reentrances, match): return self._read_seq_value( s, position, reentrances, match, ")", FeatureValueTuple, FeatureValueConcat ) def read_set_value(self, s, position, reentrances, match): return self._read_seq_value( s, position, reentrances, match, "}", FeatureValueSet, FeatureValueUnion ) def _read_seq_value( self, s, position, reentrances, match, close_paren, seq_class, plus_class ): """ Helper function used by read_tuple_value and read_set_value. """ cp = re.escape(close_paren) position = match.end() # Special syntax of empty tuples: m = re.compile(r"\s*/?\s*%s" % cp).match(s, position) if m: return seq_class(), m.end() # Read values: values = [] seen_plus = False while True: # Close paren: return value. m = re.compile(r"\s*%s" % cp).match(s, position) if m: if seen_plus: return plus_class(values), m.end() else: return seq_class(values), m.end() # Read the next value. val, position = self.read_value(s, position, reentrances) values.append(val) # Comma or looking at close paren m = re.compile(r"\s*(,|\+|(?=%s))\s*" % cp).match(s, position) if not m: raise ValueError("',' or '+' or '%s'" % cp, position) if m.group(1) == "+": seen_plus = True position = m.end() ###################################################################### # { Demo ###################################################################### def display_unification(fs1, fs2, indent="  "): # Print the two input feature structures, side by side.
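# Illustrative usage (an assumed example, not part of the original source):
#
#     >>> from nltk.featstruct import FeatStruct, display_unification
#     >>> display_unification(FeatStruct('[number=sing]'), FeatStruct('[person=3]'))  # doctest: +SKIP
#
# This prints the two structures side by side, an ASCII "UNIFY" arrow, and the
# unified result (or "(FAILED)" if unification is impossible).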
fs1_lines = ("%s" % fs1).split("\n") fs2_lines = ("%s" % fs2).split("\n") if len(fs1_lines) > len(fs2_lines): blankline = "[" + " " * (len(fs2_lines[0]) - 2) + "]" fs2_lines += [blankline] * len(fs1_lines) else: blankline = "[" + " " * (len(fs1_lines[0]) - 2) + "]" fs1_lines += [blankline] * len(fs2_lines) for (fs1_line, fs2_line) in zip(fs1_lines, fs2_lines): print(indent + fs1_line + " " + fs2_line) print(indent + "-" * len(fs1_lines[0]) + " " + "-" * len(fs2_lines[0])) linelen = len(fs1_lines[0]) * 2 + 3 print(indent + "| |".center(linelen)) print(indent + "+-----UNIFY-----+".center(linelen)) print(indent + "|".center(linelen)) print(indent + "V".center(linelen)) bindings = {} result = fs1.unify(fs2, bindings) if result is None: print(indent + "(FAILED)".center(linelen)) else: print( "\n".join(indent + l.center(linelen) for l in ("%s" % result).split("\n")) ) if bindings and len(bindings.bound_variables()) > 0: print(repr(bindings).center(linelen)) return result def interactive_demo(trace=False): import random import sys HELP = """ 1-%d: Select the corresponding feature structure q: Quit t: Turn tracing on or off l: List all feature structures ?: Help """ print( """ This demo will repeatedly present you with a list of feature structures, and ask you to choose two for unification. Whenever a new feature structure is generated, it is added to the list of choices that you can pick from. However, since this can be a large number of feature structures, the demo will only print out a random subset for you to choose between at a given time. If you want to see the complete lists, type "l". For a list of valid commands, type "?". """ ) print('Press "Enter" to continue...') sys.stdin.readline() fstruct_strings = [ "[agr=[number=sing, gender=masc]]", "[agr=[gender=masc, person=3]]", "[agr=[gender=fem, person=3]]", "[subj=[agr=(1)[]], agr->(1)]", "[obj=?x]", "[subj=?x]", "[/=None]", "[/=NP]", "[cat=NP]", "[cat=VP]", "[cat=PP]", "[subj=[agr=[gender=?y]], obj=[agr=[gender=?y]]]", "[gender=masc, agr=?C]", "[gender=?S, agr=[gender=?S,person=3]]", ] all_fstructs = [ (i, FeatStruct(fstruct_strings[i])) for i in range(len(fstruct_strings)) ] def list_fstructs(fstructs): for i, fstruct in fstructs: print() lines = ("%s" % fstruct).split("\n") print("%3d: %s" % (i + 1, lines[0])) for line in lines[1:]: print(" " + line) print() while True: # Pick 5 feature structures at random from the master list. 
MAX_CHOICES = 5 if len(all_fstructs) > MAX_CHOICES: fstructs = sorted(random.sample(all_fstructs, MAX_CHOICES)) else: fstructs = all_fstructs print("_" * 75) print("Choose two feature structures to unify:") list_fstructs(fstructs) selected = [None, None] for (nth, i) in (("First", 0), ("Second", 1)): while selected[i] is None: print( ( "%s feature structure (1-%d,q,t,l,?): " % (nth, len(all_fstructs)) ), end=" ", ) try: input = sys.stdin.readline().strip() if input in ("q", "Q", "x", "X"): return if input in ("t", "T"): trace = not trace print(" Trace = %s" % trace) continue if input in ("h", "H", "?"): print(HELP % len(fstructs)) continue if input in ("l", "L"): list_fstructs(all_fstructs) continue num = int(input) - 1 selected[i] = all_fstructs[num][1] print() except: print("Bad sentence number") continue if trace: result = selected[0].unify(selected[1], trace=1) else: result = display_unification(selected[0], selected[1]) if result is not None: for i, fstruct in all_fstructs: if repr(result) == repr(fstruct): break else: all_fstructs.append((len(all_fstructs), result)) print('\nType "Enter" to continue unifying; or "q" to quit.') input = sys.stdin.readline().strip() if input in ("q", "Q", "x", "X"): return def demo(trace=False): """ Just for testing """ # import random # processor breaks with values like '3rd' fstruct_strings = [ "[agr=[number=sing, gender=masc]]", "[agr=[gender=masc, person=3]]", "[agr=[gender=fem, person=3]]", "[subj=[agr=(1)[]], agr->(1)]", "[obj=?x]", "[subj=?x]", "[/=None]", "[/=NP]", "[cat=NP]", "[cat=VP]", "[cat=PP]", "[subj=[agr=[gender=?y]], obj=[agr=[gender=?y]]]", "[gender=masc, agr=?C]", "[gender=?S, agr=[gender=?S,person=3]]", ] all_fstructs = [FeatStruct(fss) for fss in fstruct_strings] # MAX_CHOICES = 5 # if len(all_fstructs) > MAX_CHOICES: # fstructs = random.sample(all_fstructs, MAX_CHOICES) # fstructs.sort() # else: # fstructs = all_fstructs for fs1 in all_fstructs: for fs2 in all_fstructs: print( "\n*******************\nfs1 is:\n%s\n\nfs2 is:\n%s\n\nresult is:\n%s" % (fs1, fs2, unify(fs1, fs2)) ) if __name__ == "__main__": demo() __all__ = [ "FeatStruct", "FeatDict", "FeatList", "unify", "subsumes", "conflicts", "Feature", "SlashFeature", "RangeFeature", "SLASH", "TYPE", "FeatStructReader", ] nltk-3.7/nltk/grammar.py000066400000000000000000001601271420073152400153100ustar00rootroot00000000000000# Natural Language Toolkit: Context Free Grammars # # Copyright (C) 2001-2022 NLTK Project # Author: Steven Bird # Edward Loper # Jason Narad # Peter Ljunglöf # Tom Aarsen <> # URL: # For license information, see LICENSE.TXT # """ Basic data classes for representing context free grammars. A "grammar" specifies which trees can represent the structure of a given text. Each of these trees is called a "parse tree" for the text (or simply a "parse"). In a "context free" grammar, the set of parse trees for any piece of a text can depend only on that piece, and not on the rest of the text (i.e., the piece's context). Context free grammars are often used to find possible syntactic structures for sentences. In this context, the leaves of a parse tree are word tokens; and the node values are phrasal categories, such as ``NP`` and ``VP``. The ``CFG`` class is used to encode context free grammars. Each ``CFG`` consists of a start symbol and a set of productions. The "start symbol" specifies the root node value for parse trees. For example, the start symbol for syntactic parsing is usually ``S``. 
Start symbols are encoded using the ``Nonterminal`` class, which is discussed below. A Grammar's "productions" specify what parent-child relationships a parse tree can contain. Each production specifies that a particular node can be the parent of a particular set of children. For example, the production ``<S> -> <NP> <VP>`` specifies that an ``S`` node can be the parent of an ``NP`` node and a ``VP`` node. Grammar productions are implemented by the ``Production`` class. Each ``Production`` consists of a left hand side and a right hand side. The "left hand side" is a ``Nonterminal`` that specifies the node type for a potential parent; and the "right hand side" is a list that specifies allowable children for that parent. This list consists of ``Nonterminals`` and text types: each ``Nonterminal`` indicates that the corresponding child may be a ``TreeToken`` with the specified node type; and each text type indicates that the corresponding child may be a ``Token`` with that type. The ``Nonterminal`` class is used to distinguish node values from leaf values. This prevents the grammar from accidentally using a leaf value (such as the English word "A") as the node of a subtree. Within a ``CFG``, all node values are wrapped in the ``Nonterminal`` class. Note, however, that the trees that are specified by the grammar do *not* include these ``Nonterminal`` wrappers. Grammars can also be given a more procedural interpretation. According to this interpretation, a Grammar specifies any tree structure *tree* that can be produced by the following procedure: | Set tree to the start symbol | Repeat until tree contains no more nonterminal leaves: | Choose a production prod whose left hand side | lhs is a nonterminal leaf of tree. | Replace the nonterminal leaf with a subtree, whose node | value is the value wrapped by the nonterminal lhs, and | whose children are the right hand side of prod. The operation of replacing the left hand side (*lhs*) of a production with the right hand side (*rhs*) in a tree (*tree*) is known as "expanding" *lhs* to *rhs* in *tree*. """ import re from functools import total_ordering from nltk.featstruct import SLASH, TYPE, FeatDict, FeatStruct, FeatStructReader from nltk.internals import raise_unorderable_types from nltk.probability import ImmutableProbabilisticMixIn from nltk.util import invert_graph, transitive_closure ################################################################# # Nonterminal ################################################################# @total_ordering class Nonterminal: """ A non-terminal symbol for a context free grammar. ``Nonterminal`` is a wrapper class for node values; it is used by ``Production`` objects to distinguish node values from leaf values. The node value that is wrapped by a ``Nonterminal`` is known as its "symbol". Symbols are typically strings representing phrasal categories (such as ``"NP"`` or ``"VP"``). However, more complex symbol types are sometimes used (e.g., for lexicalized grammars). Since symbols are node values, they must be immutable and hashable. Two ``Nonterminals`` are considered equal if their symbols are equal. :see: ``CFG``, ``Production`` :type _symbol: any :ivar _symbol: The node value corresponding to this ``Nonterminal``. This value must be immutable and hashable. """ def __init__(self, symbol): """ Construct a new non-terminal from the given symbol. :type symbol: any :param symbol: The node value corresponding to this ``Nonterminal``. This value must be immutable and hashable.
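A short illustrative doctest (added as an example):

>>> from nltk.grammar import Nonterminal
>>> np = Nonterminal('NP')
>>> np.symbol()
'NP'
>>> np == Nonterminal('NP')
True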
""" self._symbol = symbol def symbol(self): """ Return the node value corresponding to this ``Nonterminal``. :rtype: (any) """ return self._symbol def __eq__(self, other): """ Return True if this non-terminal is equal to ``other``. In particular, return True if ``other`` is a ``Nonterminal`` and this non-terminal's symbol is equal to ``other`` 's symbol. :rtype: bool """ return type(self) == type(other) and self._symbol == other._symbol def __ne__(self, other): return not self == other def __lt__(self, other): if not isinstance(other, Nonterminal): raise_unorderable_types("<", self, other) return self._symbol < other._symbol def __hash__(self): return hash(self._symbol) def __repr__(self): """ Return a string representation for this ``Nonterminal``. :rtype: str """ if isinstance(self._symbol, str): return "%s" % self._symbol else: return "%s" % repr(self._symbol) def __str__(self): """ Return a string representation for this ``Nonterminal``. :rtype: str """ if isinstance(self._symbol, str): return "%s" % self._symbol else: return "%s" % repr(self._symbol) def __div__(self, rhs): """ Return a new nonterminal whose symbol is ``A/B``, where ``A`` is the symbol for this nonterminal, and ``B`` is the symbol for rhs. :param rhs: The nonterminal used to form the right hand side of the new nonterminal. :type rhs: Nonterminal :rtype: Nonterminal """ return Nonterminal(f"{self._symbol}/{rhs._symbol}") def __truediv__(self, rhs): """ Return a new nonterminal whose symbol is ``A/B``, where ``A`` is the symbol for this nonterminal, and ``B`` is the symbol for rhs. This function allows use of the slash ``/`` operator with the future import of division. :param rhs: The nonterminal used to form the right hand side of the new nonterminal. :type rhs: Nonterminal :rtype: Nonterminal """ return self.__div__(rhs) def nonterminals(symbols): """ Given a string containing a list of symbol names, return a list of ``Nonterminals`` constructed from those symbols. :param symbols: The symbol name string. This string can be delimited by either spaces or commas. :type symbols: str :return: A list of ``Nonterminals`` constructed from the symbol names given in ``symbols``. The ``Nonterminals`` are sorted in the same order as the symbols names. :rtype: list(Nonterminal) """ if "," in symbols: symbol_list = symbols.split(",") else: symbol_list = symbols.split() return [Nonterminal(s.strip()) for s in symbol_list] class FeatStructNonterminal(FeatDict, Nonterminal): """A feature structure that's also a nonterminal. It acts as its own symbol, and automatically freezes itself when hashed.""" def __hash__(self): self.freeze() return FeatStruct.__hash__(self) def symbol(self): return self def is_nonterminal(item): """ :return: True if the item is a ``Nonterminal``. :rtype: bool """ return isinstance(item, Nonterminal) ################################################################# # Terminals ################################################################# def is_terminal(item): """ Return True if the item is a terminal, which currently is if it is hashable and not a ``Nonterminal``. :rtype: bool """ return hasattr(item, "__hash__") and not isinstance(item, Nonterminal) ################################################################# # Productions ################################################################# @total_ordering class Production: """ A grammar production. Each production maps a single symbol on the "left-hand side" to a sequence of symbols on the "right-hand side". 
(In the case of context-free productions, the left-hand side must be a ``Nonterminal``, and the right-hand side is a sequence of terminals and ``Nonterminals``.) "terminals" can be any immutable hashable object that is not a ``Nonterminal``. Typically, terminals are strings representing words, such as ``"dog"`` or ``"under"``. :see: ``CFG`` :see: ``DependencyGrammar`` :see: ``Nonterminal`` :type _lhs: Nonterminal :ivar _lhs: The left-hand side of the production. :type _rhs: tuple(Nonterminal, terminal) :ivar _rhs: The right-hand side of the production. """ def __init__(self, lhs, rhs): """ Construct a new ``Production``. :param lhs: The left-hand side of the new ``Production``. :type lhs: Nonterminal :param rhs: The right-hand side of the new ``Production``. :type rhs: sequence(Nonterminal and terminal) """ if isinstance(rhs, str): raise TypeError( "production right hand side should be a list, " "not a string" ) self._lhs = lhs self._rhs = tuple(rhs) def lhs(self): """ Return the left-hand side of this ``Production``. :rtype: Nonterminal """ return self._lhs def rhs(self): """ Return the right-hand side of this ``Production``. :rtype: sequence(Nonterminal and terminal) """ return self._rhs def __len__(self): """ Return the length of the right-hand side. :rtype: int """ return len(self._rhs) def is_nonlexical(self): """ Return True if the right-hand side only contains ``Nonterminals``. :rtype: bool """ return all(is_nonterminal(n) for n in self._rhs) def is_lexical(self): """ Return True if the right-hand side contains at least one terminal token. :rtype: bool """ return not self.is_nonlexical() def __str__(self): """ Return a verbose string representation of the ``Production``. :rtype: str """ result = "%s -> " % repr(self._lhs) result += " ".join(repr(el) for el in self._rhs) return result def __repr__(self): """ Return a concise string representation of the ``Production``. :rtype: str """ return "%s" % self def __eq__(self, other): """ Return True if this ``Production`` is equal to ``other``. :rtype: bool """ return ( type(self) == type(other) and self._lhs == other._lhs and self._rhs == other._rhs ) def __ne__(self, other): return not self == other def __lt__(self, other): if not isinstance(other, Production): raise_unorderable_types("<", self, other) return (self._lhs, self._rhs) < (other._lhs, other._rhs) def __hash__(self): """ Return a hash value for the ``Production``. :rtype: int """ return hash((self._lhs, self._rhs)) class DependencyProduction(Production): """ A dependency grammar production. Each production maps a single head word to an unordered list of one or more modifier words. """ def __str__(self): """ Return a verbose string representation of the ``DependencyProduction``. :rtype: str """ result = f"'{self._lhs}' ->" for elt in self._rhs: result += f" '{elt}'" return result class ProbabilisticProduction(Production, ImmutableProbabilisticMixIn): """ A probabilistic context free grammar production. A PCFG ``ProbabilisticProduction`` is essentially just a ``Production`` that has an associated probability, which represents how likely it is that this production will be used. In particular, the probability of a ``ProbabilisticProduction`` records the likelihood that its right-hand side is the correct instantiation for any given occurrence of its left-hand side. :see: ``Production`` """ def __init__(self, lhs, rhs, **prob): """ Construct a new ``ProbabilisticProduction``. :param lhs: The left-hand side of the new ``ProbabilisticProduction``.
:type lhs: Nonterminal :param rhs: The right-hand side of the new ``ProbabilisticProduction``. :type rhs: sequence(Nonterminal and terminal) :param prob: Probability parameters of the new ``ProbabilisticProduction``. """ ImmutableProbabilisticMixIn.__init__(self, **prob) Production.__init__(self, lhs, rhs) def __str__(self): return super().__str__() + ( " [1.0]" if (self.prob() == 1.0) else " [%g]" % self.prob() ) def __eq__(self, other): return ( type(self) == type(other) and self._lhs == other._lhs and self._rhs == other._rhs and self.prob() == other.prob() ) def __ne__(self, other): return not self == other def __hash__(self): return hash((self._lhs, self._rhs, self.prob())) ################################################################# # Grammars ################################################################# class CFG: """ A context-free grammar. A grammar consists of a start state and a set of productions. The set of terminals and nonterminals is implicitly specified by the productions. If you need efficient key-based access to productions, you can use a subclass to implement it. """ def __init__(self, start, productions, calculate_leftcorners=True): """ Create a new context-free grammar, from the given start state and set of ``Production`` instances. :param start: The start symbol :type start: Nonterminal :param productions: The list of productions that defines the grammar :type productions: list(Production) :param calculate_leftcorners: False if we don't want to calculate the leftcorner relation. In that case, some optimized chart parsers won't work. :type calculate_leftcorners: bool """ if not is_nonterminal(start): raise TypeError( "start should be a Nonterminal object," " not a %s" % type(start).__name__ ) self._start = start self._productions = productions self._categories = {prod.lhs() for prod in productions} self._calculate_indexes() self._calculate_grammar_forms() if calculate_leftcorners: self._calculate_leftcorners() def _calculate_indexes(self): self._lhs_index = {} self._rhs_index = {} self._empty_index = {} self._lexical_index = {} for prod in self._productions: # Left hand side. lhs = prod._lhs if lhs not in self._lhs_index: self._lhs_index[lhs] = [] self._lhs_index[lhs].append(prod) if prod._rhs: # First item in right hand side. rhs0 = prod._rhs[0] if rhs0 not in self._rhs_index: self._rhs_index[rhs0] = [] self._rhs_index[rhs0].append(prod) else: # The right hand side is empty. self._empty_index[prod.lhs()] = prod # Lexical tokens in the right hand side. for token in prod._rhs: if is_terminal(token): self._lexical_index.setdefault(token, set()).add(prod) def _calculate_leftcorners(self): # Calculate leftcorner relations, for use in optimized parsing. self._immediate_leftcorner_categories = {cat: {cat} for cat in self._categories} self._immediate_leftcorner_words = {cat: set() for cat in self._categories} for prod in self.productions(): if len(prod) > 0: cat, left = prod.lhs(), prod.rhs()[0] if is_nonterminal(left): self._immediate_leftcorner_categories[cat].add(left) else: self._immediate_leftcorner_words[cat].add(left) lc = transitive_closure(self._immediate_leftcorner_categories, reflexive=True) self._leftcorners = lc self._leftcorner_parents = invert_graph(lc) nr_leftcorner_categories = sum( map(len, self._immediate_leftcorner_categories.values()) ) nr_leftcorner_words = sum(map(len, self._immediate_leftcorner_words.values())) if nr_leftcorner_words > nr_leftcorner_categories > 10000: # If the grammar is big, the leftcorner-word dictionary will be too large. 
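# ("Big" here means more than 10000 immediate leftcorner-category pairs, with even more leftcorner-word pairs.)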
# In that case it is better to calculate the relation on demand. self._leftcorner_words = None return self._leftcorner_words = {} for cat in self._leftcorners: lefts = self._leftcorners[cat] lc = self._leftcorner_words[cat] = set() for left in lefts: lc.update(self._immediate_leftcorner_words.get(left, set())) @classmethod def fromstring(cls, input, encoding=None): """ Return the grammar instance corresponding to the input string(s). :param input: a grammar, either in the form of a string or as a list of strings. """ start, productions = read_grammar( input, standard_nonterm_parser, encoding=encoding ) return cls(start, productions) def start(self): """ Return the start symbol of the grammar :rtype: Nonterminal """ return self._start # tricky to balance readability and efficiency here! # can't use set operations as they don't preserve ordering def productions(self, lhs=None, rhs=None, empty=False): """ Return the grammar productions, filtered by the left-hand side or the first item in the right-hand side. :param lhs: Only return productions with the given left-hand side. :param rhs: Only return productions with the given first item in the right-hand side. :param empty: Only return productions with an empty right-hand side. :return: A list of productions matching the given constraints. :rtype: list(Production) """ if rhs and empty: raise ValueError( "You cannot select empty and non-empty " "productions at the same time." ) # no constraints so return everything if not lhs and not rhs: if not empty: return self._productions else: return self._empty_index.values() # only lhs specified so look up its index elif lhs and not rhs: if not empty: return self._lhs_index.get(lhs, []) elif lhs in self._empty_index: return [self._empty_index[lhs]] else: return [] # only rhs specified so look up its index elif rhs and not lhs: return self._rhs_index.get(rhs, []) # intersect else: return [ prod for prod in self._lhs_index.get(lhs, []) if prod in self._rhs_index.get(rhs, []) ] def leftcorners(self, cat): """ Return the set of all nonterminals that the given nonterminal can start with, including itself. This is the reflexive, transitive closure of the immediate leftcorner relation: (A > B) iff (A -> B beta) :param cat: the parent of the leftcorners :type cat: Nonterminal :return: the set of all leftcorners :rtype: set(Nonterminal) """ return self._leftcorners.get(cat, {cat}) def is_leftcorner(self, cat, left): """ True if left is a leftcorner of cat, where left can be a terminal or a nonterminal. :param cat: the parent of the leftcorner :type cat: Nonterminal :param left: the suggested leftcorner :type left: Terminal or Nonterminal :rtype: bool """ if is_nonterminal(left): return left in self.leftcorners(cat) elif self._leftcorner_words: return left in self._leftcorner_words.get(cat, set()) else: return any( left in self._immediate_leftcorner_words.get(parent, set()) for parent in self.leftcorners(cat) ) def leftcorner_parents(self, cat): """ Return the set of all nonterminals for which the given category is a left corner. This is the inverse of the leftcorner relation. :param cat: the suggested leftcorner :type cat: Nonterminal :return: the set of all parents to the leftcorner :rtype: set(Nonterminal) """ return self._leftcorner_parents.get(cat, {cat}) def check_coverage(self, tokens): """ Check whether the grammar rules cover the given list of tokens. If not, then raise an exception. 
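A small sketch with a toy grammar::

    >>> from nltk import CFG
    >>> g = CFG.fromstring("S -> 'the' 'dog'")
    >>> g.check_coverage(['the', 'dog'])    # fully covered: returns silently
    >>> g.check_coverage(['the', 'cat'])
    Traceback (most recent call last):
    ...
    ValueError: Grammar does not cover some of the input words: "'cat'".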
:type tokens: list(str) """ missing = [tok for tok in tokens if not self._lexical_index.get(tok)] if missing: missing = ", ".join(f"{w!r}" for w in missing) raise ValueError( "Grammar does not cover some of the " "input words: %r." % missing ) def _calculate_grammar_forms(self): """ Pre-calculate of which form(s) the grammar is. """ prods = self._productions self._is_lexical = all(p.is_lexical() for p in prods) self._is_nonlexical = all(p.is_nonlexical() for p in prods if len(p) != 1) self._min_len = min(len(p) for p in prods) self._max_len = max(len(p) for p in prods) self._all_unary_are_lexical = all(p.is_lexical() for p in prods if len(p) == 1) def is_lexical(self): """ Return True if all productions are lexicalised. """ return self._is_lexical def is_nonlexical(self): """ Return True if all lexical rules are "preterminals", that is, unary rules which can be separated in a preprocessing step. This means that all productions are of the forms A -> B1 ... Bn (n>=0), or A -> "s". Note: is_lexical() and is_nonlexical() are not opposites. There are grammars which are neither, and grammars which are both. """ return self._is_nonlexical def min_len(self): """ Return the right-hand side length of the shortest grammar production. """ return self._min_len def max_len(self): """ Return the right-hand side length of the longest grammar production. """ return self._max_len def is_nonempty(self): """ Return True if there are no empty productions. """ return self._min_len > 0 def is_binarised(self): """ Return True if all productions are at most binary. Note that there can still be empty and unary productions. """ return self._max_len <= 2 def is_flexible_chomsky_normal_form(self): """ Return True if all productions are of the forms A -> B C, A -> B, or A -> "s". """ return self.is_nonempty() and self.is_nonlexical() and self.is_binarised() def is_chomsky_normal_form(self): """ Return True if the grammar is of Chomsky Normal Form, i.e. all productions are of the form A -> B C, or A -> "s". """ return self.is_flexible_chomsky_normal_form() and self._all_unary_are_lexical def chomsky_normal_form(self, new_token_padding="@$@", flexible=False): """ Returns a new Grammar that is in chomsky normal :param: new_token_padding Customise new rule formation during binarisation """ if self.is_chomsky_normal_form(): return self if self.productions(empty=True): raise ValueError( "Grammar has Empty rules. 
" "Cannot deal with them at the moment" ) # check for mixed rules for rule in self.productions(): if rule.is_lexical() and len(rule.rhs()) > 1: raise ValueError( f"Cannot handled mixed rule {rule.lhs()} => {rule.rhs()}" ) step1 = CFG.eliminate_start(self) step2 = CFG.binarize(step1, new_token_padding) if flexible: return step2 step3 = CFG.remove_unitary_rules(step2) step4 = CFG(step3.start(), list(set(step3.productions()))) return step4 @classmethod def remove_unitary_rules(cls, grammar): """ Remove nonlexical unitary rules and convert them to lexical """ result = [] unitary = [] for rule in grammar.productions(): if len(rule) == 1 and rule.is_nonlexical(): unitary.append(rule) else: result.append(rule) while unitary: rule = unitary.pop(0) for item in grammar.productions(lhs=rule.rhs()[0]): new_rule = Production(rule.lhs(), item.rhs()) if len(new_rule) != 1 or new_rule.is_lexical(): result.append(new_rule) else: unitary.append(new_rule) n_grammar = CFG(grammar.start(), result) return n_grammar @classmethod def binarize(cls, grammar, padding="@$@"): """ Convert all non-binary rules into binary by introducing new tokens. Example:: Original: A => B C D After Conversion: A => B A@$@B A@$@B => C D """ result = [] for rule in grammar.productions(): if len(rule.rhs()) > 2: # this rule needs to be broken down left_side = rule.lhs() for k in range(0, len(rule.rhs()) - 2): tsym = rule.rhs()[k] new_sym = Nonterminal(left_side.symbol() + padding + tsym.symbol()) new_production = Production(left_side, (tsym, new_sym)) left_side = new_sym result.append(new_production) last_prd = Production(left_side, rule.rhs()[-2:]) result.append(last_prd) else: result.append(rule) n_grammar = CFG(grammar.start(), result) return n_grammar @classmethod def eliminate_start(cls, grammar): """ Eliminate start rule in case it appears on RHS Example: S -> S0 S1 and S0 -> S1 S Then another rule S0_Sigma -> S is added """ start = grammar.start() result = [] need_to_add = None for rule in grammar.productions(): if start in rule.rhs(): need_to_add = True result.append(rule) if need_to_add: start = Nonterminal("S0_SIGMA") result.append(Production(start, [grammar.start()])) n_grammar = CFG(start, result) return n_grammar return grammar def __repr__(self): return "" % len(self._productions) def __str__(self): result = "Grammar with %d productions" % len(self._productions) result += " (start state = %r)" % self._start for production in self._productions: result += "\n %s" % production return result class FeatureGrammar(CFG): """ A feature-based grammar. This is equivalent to a ``CFG`` whose nonterminals are all ``FeatStructNonterminal``. A grammar consists of a start state and a set of productions. The set of terminals and nonterminals is implicitly specified by the productions. """ def __init__(self, start, productions): """ Create a new feature-based grammar, from the given start state and set of ``Productions``. :param start: The start symbol :type start: FeatStructNonterminal :param productions: The list of productions that defines the grammar :type productions: list(Production) """ CFG.__init__(self, start, productions) # The difference with CFG is that the productions are # indexed on the TYPE feature of the nonterminals. # This is calculated by the method _get_type_if_possible(). def _calculate_indexes(self): self._lhs_index = {} self._rhs_index = {} self._empty_index = {} self._empty_productions = [] self._lexical_index = {} for prod in self._productions: # Left hand side. 
lhs = self._get_type_if_possible(prod._lhs) if lhs not in self._lhs_index: self._lhs_index[lhs] = [] self._lhs_index[lhs].append(prod) if prod._rhs: # First item in right hand side. rhs0 = self._get_type_if_possible(prod._rhs[0]) if rhs0 not in self._rhs_index: self._rhs_index[rhs0] = [] self._rhs_index[rhs0].append(prod) else: # The right hand side is empty. if lhs not in self._empty_index: self._empty_index[lhs] = [] self._empty_index[lhs].append(prod) self._empty_productions.append(prod) # Lexical tokens in the right hand side. for token in prod._rhs: if is_terminal(token): self._lexical_index.setdefault(token, set()).add(prod) @classmethod def fromstring( cls, input, features=None, logic_parser=None, fstruct_reader=None, encoding=None ): """ Return a feature structure based grammar. :param input: a grammar, either in the form of a string or else as a list of strings. :param features: a tuple of features (default: SLASH, TYPE) :param logic_parser: a parser for lambda-expressions, by default, ``LogicParser()`` :param fstruct_reader: a feature structure parser (only if features and logic_parser is None) """ if features is None: features = (SLASH, TYPE) if fstruct_reader is None: fstruct_reader = FeatStructReader( features, FeatStructNonterminal, logic_parser=logic_parser ) elif logic_parser is not None: raise Exception( "'logic_parser' and 'fstruct_reader' must " "not both be set" ) start, productions = read_grammar( input, fstruct_reader.read_partial, encoding=encoding ) return cls(start, productions) def productions(self, lhs=None, rhs=None, empty=False): """ Return the grammar productions, filtered by the left-hand side or the first item in the right-hand side. :param lhs: Only return productions with the given left-hand side. :param rhs: Only return productions with the given first item in the right-hand side. :param empty: Only return productions with an empty right-hand side. :rtype: list(Production) """ if rhs and empty: raise ValueError( "You cannot select empty and non-empty " "productions at the same time." ) # no constraints so return everything if not lhs and not rhs: if empty: return self._empty_productions else: return self._productions # only lhs specified so look up its index elif lhs and not rhs: if empty: return self._empty_index.get(self._get_type_if_possible(lhs), []) else: return self._lhs_index.get(self._get_type_if_possible(lhs), []) # only rhs specified so look up its index elif rhs and not lhs: return self._rhs_index.get(self._get_type_if_possible(rhs), []) # intersect else: return [ prod for prod in self._lhs_index.get(self._get_type_if_possible(lhs), []) if prod in self._rhs_index.get(self._get_type_if_possible(rhs), []) ] def leftcorners(self, cat): """ Return the set of all words that the given category can start with. Also called the "first set" in compiler construction. """ raise NotImplementedError("Not implemented yet") def leftcorner_parents(self, cat): """ Return the set of all categories for which the given category is a left corner. """ raise NotImplementedError("Not implemented yet") def _get_type_if_possible(self, item): """ Helper function which returns the ``TYPE`` feature of the ``item``, if it exists, otherwise it returns the ``item`` itself """ if isinstance(item, dict) and TYPE in item: return FeatureValueType(item[TYPE]) else: return item @total_ordering class FeatureValueType: """ A helper class for ``FeatureGrammars``, designed to be different from ordinary strings. 
This is to stop the ``FeatStruct`` ``FOO[]`` from being compare equal to the terminal "FOO". """ def __init__(self, value): self._value = value def __repr__(self): return "<%s>" % self._value def __eq__(self, other): return type(self) == type(other) and self._value == other._value def __ne__(self, other): return not self == other def __lt__(self, other): if not isinstance(other, FeatureValueType): raise_unorderable_types("<", self, other) return self._value < other._value def __hash__(self): return hash(self._value) class DependencyGrammar: """ A dependency grammar. A DependencyGrammar consists of a set of productions. Each production specifies a head/modifier relationship between a pair of words. """ def __init__(self, productions): """ Create a new dependency grammar, from the set of ``Productions``. :param productions: The list of productions that defines the grammar :type productions: list(Production) """ self._productions = productions @classmethod def fromstring(cls, input): productions = [] for linenum, line in enumerate(input.split("\n")): line = line.strip() if line.startswith("#") or line == "": continue try: productions += _read_dependency_production(line) except ValueError as e: raise ValueError(f"Unable to parse line {linenum}: {line}") from e if len(productions) == 0: raise ValueError("No productions found!") return cls(productions) def contains(self, head, mod): """ :param head: A head word. :type head: str :param mod: A mod word, to test as a modifier of 'head'. :type mod: str :return: true if this ``DependencyGrammar`` contains a ``DependencyProduction`` mapping 'head' to 'mod'. :rtype: bool """ for production in self._productions: for possibleMod in production._rhs: if production._lhs == head and possibleMod == mod: return True return False def __contains__(self, head, mod): """ Return True if this ``DependencyGrammar`` contains a ``DependencyProduction`` mapping 'head' to 'mod'. :param head: A head word. :type head: str :param mod: A mod word, to test as a modifier of 'head'. :type mod: str :rtype: bool """ for production in self._productions: for possibleMod in production._rhs: if production._lhs == head and possibleMod == mod: return True return False # # should be rewritten, the set comp won't work in all comparisons # def contains_exactly(self, head, modlist): # for production in self._productions: # if(len(production._rhs) == len(modlist)): # if(production._lhs == head): # set1 = Set(production._rhs) # set2 = Set(modlist) # if(set1 == set2): # return True # return False def __str__(self): """ Return a verbose string representation of the ``DependencyGrammar`` :rtype: str """ str = "Dependency grammar with %d productions" % len(self._productions) for production in self._productions: str += "\n %s" % production return str def __repr__(self): """ Return a concise string representation of the ``DependencyGrammar`` """ return "Dependency grammar with %d productions" % len(self._productions) class ProbabilisticDependencyGrammar: """ """ def __init__(self, productions, events, tags): self._productions = productions self._events = events self._tags = tags def contains(self, head, mod): """ Return True if this ``DependencyGrammar`` contains a ``DependencyProduction`` mapping 'head' to 'mod'. :param head: A head word. :type head: str :param mod: A mod word, to test as a modifier of 'head'. 
:type mod: str :rtype: bool """ for production in self._productions: for possibleMod in production._rhs: if production._lhs == head and possibleMod == mod: return True return False def __str__(self): """ Return a verbose string representation of the ``ProbabilisticDependencyGrammar`` :rtype: str """ str = "Statistical dependency grammar with %d productions" % len( self._productions ) for production in self._productions: str += "\n %s" % production str += "\nEvents:" for event in self._events: str += "\n %d:%s" % (self._events[event], event) str += "\nTags:" for tag_word in self._tags: str += f"\n {tag_word}:\t({self._tags[tag_word]})" return str def __repr__(self): """ Return a concise string representation of the ``ProbabilisticDependencyGrammar`` """ return "Statistical Dependency grammar with %d productions" % len( self._productions ) class PCFG(CFG): """ A probabilistic context-free grammar. A PCFG consists of a start state and a set of productions with probabilities. The set of terminals and nonterminals is implicitly specified by the productions. PCFG productions use the ``ProbabilisticProduction`` class. ``PCFGs`` impose the constraint that the set of productions with any given left-hand-side must have probabilities that sum to 1 (allowing for a small margin of error). If you need efficient key-based access to productions, you can use a subclass to implement it. :type EPSILON: float :cvar EPSILON: The acceptable margin of error for checking that productions with a given left-hand side have probabilities that sum to 1. """ EPSILON = 0.01 def __init__(self, start, productions, calculate_leftcorners=True): """ Create a new context-free grammar, from the given start state and set of ``ProbabilisticProductions``. :param start: The start symbol :type start: Nonterminal :param productions: The list of productions that defines the grammar :type productions: list(Production) :raise ValueError: if the set of productions with any left-hand-side do not have probabilities that sum to a value within EPSILON of 1. :param calculate_leftcorners: False if we don't want to calculate the leftcorner relation. In that case, some optimized chart parsers won't work. :type calculate_leftcorners: bool """ CFG.__init__(self, start, productions, calculate_leftcorners) # Make sure that the probabilities sum to one. probs = {} for production in productions: probs[production.lhs()] = probs.get(production.lhs(), 0) + production.prob() for (lhs, p) in probs.items(): if not ((1 - PCFG.EPSILON) < p < (1 + PCFG.EPSILON)): raise ValueError("Productions for %r do not sum to 1" % lhs) @classmethod def fromstring(cls, input, encoding=None): """ Return a probabilistic context-free grammar corresponding to the input string(s). :param input: a grammar, either in the form of a string or else as a list of strings. """ start, productions = read_grammar( input, standard_nonterm_parser, probabilistic=True, encoding=encoding ) return cls(start, productions) ################################################################# # Inducing Grammars ################################################################# # Contributed by Nathan Bodenstab def induce_pcfg(start, productions): r""" Induce a PCFG grammar from a list of productions. 
The probability of a production A -> B C in a PCFG is: | count(A -> B C) | P(B, C | A) = --------------- where \* is any right hand side | count(A -> \*) :param start: The start symbol :type start: Nonterminal :param productions: The list of productions that defines the grammar :type productions: list(Production) """ # Production count: the number of times a given production occurs pcount = {} # LHS-count: counts the number of times a given lhs occurs lcount = {} for prod in productions: lcount[prod.lhs()] = lcount.get(prod.lhs(), 0) + 1 pcount[prod] = pcount.get(prod, 0) + 1 prods = [ ProbabilisticProduction(p.lhs(), p.rhs(), prob=pcount[p] / lcount[p.lhs()]) for p in pcount ] return PCFG(start, prods) ################################################################# # Helper functions for reading productions ################################################################# def _read_cfg_production(input): """ Return a list of context-free ``Productions``. """ return _read_production(input, standard_nonterm_parser) def _read_pcfg_production(input): """ Return a list of PCFG ``ProbabilisticProductions``. """ return _read_production(input, standard_nonterm_parser, probabilistic=True) def _read_fcfg_production(input, fstruct_reader): """ Return a list of feature-based ``Productions``. """ return _read_production(input, fstruct_reader) # Parsing generic grammars _ARROW_RE = re.compile(r"\s* -> \s*", re.VERBOSE) _PROBABILITY_RE = re.compile(r"( \[ [\d\.]+ \] ) \s*", re.VERBOSE) _TERMINAL_RE = re.compile(r'( "[^"]*" | \'[^\']*\' ) \s*', re.VERBOSE) _DISJUNCTION_RE = re.compile(r"\| \s*", re.VERBOSE) def _read_production(line, nonterm_parser, probabilistic=False): """ Parse a grammar rule, given as a string, and return a list of productions. """ pos = 0 # Parse the left-hand side. lhs, pos = nonterm_parser(line, pos) # Skip over the arrow. m = _ARROW_RE.match(line, pos) if not m: raise ValueError("Expected an arrow") pos = m.end() # Parse the right hand side. probabilities = [0.0] rhsides = [[]] while pos < len(line): # Probability. m = _PROBABILITY_RE.match(line, pos) if probabilistic and m: pos = m.end() probabilities[-1] = float(m.group(1)[1:-1]) if probabilities[-1] > 1.0: raise ValueError( "Production probability %f, " "should not be greater than 1.0" % (probabilities[-1],) ) # String -- add terminal. elif line[pos] in "'\"": m = _TERMINAL_RE.match(line, pos) if not m: raise ValueError("Unterminated string") rhsides[-1].append(m.group(1)[1:-1]) pos = m.end() # Vertical bar -- start new rhside. elif line[pos] == "|": m = _DISJUNCTION_RE.match(line, pos) probabilities.append(0.0) rhsides.append([]) pos = m.end() # Anything else -- nonterminal. else: nonterm, pos = nonterm_parser(line, pos) rhsides[-1].append(nonterm) if probabilistic: return [ ProbabilisticProduction(lhs, rhs, prob=probability) for (rhs, probability) in zip(rhsides, probabilities) ] else: return [Production(lhs, rhs) for rhs in rhsides] ################################################################# # Reading Phrase Structure Grammars ################################################################# def read_grammar(input, nonterm_parser, probabilistic=False, encoding=None): """ Return a pair consisting of a starting category and a list of ``Productions``. :param input: a grammar, either in the form of a string or else as a list of strings. :param nonterm_parser: a function for parsing nonterminals. It should take a ``(string, position)`` as argument and return a ``(nonterminal, position)`` as result. 
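For example, ``standard_nonterm_parser`` (defined later in this module) follows this contract::

    >>> from nltk.grammar import standard_nonterm_parser
    >>> standard_nonterm_parser('NP VP', 0)
    (NP, 3)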
:param probabilistic: are the grammar rules probabilistic? :type probabilistic: bool :param encoding: the encoding of the grammar, if it is a binary string :type encoding: str """ if encoding is not None: input = input.decode(encoding) if isinstance(input, str): lines = input.split("\n") else: lines = input start = None productions = [] continue_line = "" for linenum, line in enumerate(lines): line = continue_line + line.strip() if line.startswith("#") or line == "": continue if line.endswith("\\"): continue_line = line[:-1].rstrip() + " " continue continue_line = "" try: if line[0] == "%": directive, args = line[1:].split(None, 1) if directive == "start": start, pos = nonterm_parser(args, 0) if pos != len(args): raise ValueError("Bad argument to start directive") else: raise ValueError("Bad directive") else: # expand out the disjunctions on the RHS productions += _read_production(line, nonterm_parser, probabilistic) except ValueError as e: raise ValueError(f"Unable to parse line {linenum + 1}: {line}\n{e}") from e if not productions: raise ValueError("No productions found!") if not start: start = productions[0].lhs() return (start, productions) _STANDARD_NONTERM_RE = re.compile(r"( [\w/][\w/^<>-]* ) \s*", re.VERBOSE) def standard_nonterm_parser(string, pos): m = _STANDARD_NONTERM_RE.match(string, pos) if not m: raise ValueError("Expected a nonterminal, found: " + string[pos:]) return (Nonterminal(m.group(1)), m.end()) ################################################################# # Reading Dependency Grammars ################################################################# _READ_DG_RE = re.compile( r"""^\s* # leading whitespace ('[^']+')\s* # single-quoted lhs (?:[-=]+>)\s* # arrow (?:( # rhs: "[^"]+" # doubled-quoted terminal | '[^']+' # single-quoted terminal | \| # disjunction ) \s*) # trailing space *$""", # zero or more copies re.VERBOSE, ) _SPLIT_DG_RE = re.compile(r"""('[^']'|[-=]+>|"[^"]+"|'[^']+'|\|)""") def _read_dependency_production(s): if not _READ_DG_RE.match(s): raise ValueError("Bad production string") pieces = _SPLIT_DG_RE.split(s) pieces = [p for i, p in enumerate(pieces) if i % 2 == 1] lhside = pieces[0].strip("'\"") rhsides = [[]] for piece in pieces[2:]: if piece == "|": rhsides.append([]) else: rhsides[-1].append(piece.strip("'\"")) return [DependencyProduction(lhside, rhside) for rhside in rhsides] ################################################################# # Demonstration ################################################################# def cfg_demo(): """ A demonstration showing how ``CFGs`` can be created and used. """ from nltk import CFG, Production, nonterminals # Create some nonterminals S, NP, VP, PP = nonterminals("S, NP, VP, PP") N, V, P, Det = nonterminals("N, V, P, Det") VP_slash_NP = VP / NP print("Some nonterminals:", [S, NP, VP, PP, N, V, P, Det, VP / NP]) print(" S.symbol() =>", repr(S.symbol())) print() print(Production(S, [NP])) # Create some Grammar Productions grammar = CFG.fromstring( """ S -> NP VP PP -> P NP NP -> Det N | NP PP VP -> V NP | VP PP Det -> 'a' | 'the' N -> 'dog' | 'cat' V -> 'chased' | 'sat' P -> 'on' | 'in' """ ) print("A Grammar:", repr(grammar)) print(" grammar.start() =>", repr(grammar.start())) print(" grammar.productions() =>", end=" ") # Use string.replace(...) is to line-wrap the output. print(repr(grammar.productions()).replace(",", ",\n" + " " * 25)) print() def pcfg_demo(): """ A demonstration showing how a ``PCFG`` can be created and used. 
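A miniature version of the two steps this demo walks through (writing a
``PCFG`` by hand, and inducing one from a list of observed productions)::

    >>> from nltk import PCFG, Nonterminal, induce_pcfg
    >>> from nltk.grammar import Production
    >>> toy = PCFG.fromstring("S -> 'a' [0.4] | 'b' [0.6]")
    >>> toy.productions()
    [S -> 'a' [0.4], S -> 'b' [0.6]]
    >>> S = Nonterminal('S')
    >>> observed = [Production(S, ['a']), Production(S, ['a']), Production(S, ['b'])]
    >>> induce_pcfg(S, observed).productions()
    [S -> 'a' [0.666667], S -> 'b' [0.333333]]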
""" from nltk import induce_pcfg, treetransforms from nltk.corpus import treebank from nltk.parse import pchart toy_pcfg1 = PCFG.fromstring( """ S -> NP VP [1.0] NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15] Det -> 'the' [0.8] | 'my' [0.2] N -> 'man' [0.5] | 'telescope' [0.5] VP -> VP PP [0.1] | V NP [0.7] | V [0.2] V -> 'ate' [0.35] | 'saw' [0.65] PP -> P NP [1.0] P -> 'with' [0.61] | 'under' [0.39] """ ) toy_pcfg2 = PCFG.fromstring( """ S -> NP VP [1.0] VP -> V NP [.59] VP -> V [.40] VP -> VP PP [.01] NP -> Det N [.41] NP -> Name [.28] NP -> NP PP [.31] PP -> P NP [1.0] V -> 'saw' [.21] V -> 'ate' [.51] V -> 'ran' [.28] N -> 'boy' [.11] N -> 'cookie' [.12] N -> 'table' [.13] N -> 'telescope' [.14] N -> 'hill' [.5] Name -> 'Jack' [.52] Name -> 'Bob' [.48] P -> 'with' [.61] P -> 'under' [.39] Det -> 'the' [.41] Det -> 'a' [.31] Det -> 'my' [.28] """ ) pcfg_prods = toy_pcfg1.productions() pcfg_prod = pcfg_prods[2] print("A PCFG production:", repr(pcfg_prod)) print(" pcfg_prod.lhs() =>", repr(pcfg_prod.lhs())) print(" pcfg_prod.rhs() =>", repr(pcfg_prod.rhs())) print(" pcfg_prod.prob() =>", repr(pcfg_prod.prob())) print() grammar = toy_pcfg2 print("A PCFG grammar:", repr(grammar)) print(" grammar.start() =>", repr(grammar.start())) print(" grammar.productions() =>", end=" ") # Use .replace(...) is to line-wrap the output. print(repr(grammar.productions()).replace(",", ",\n" + " " * 26)) print() # extract productions from three trees and induce the PCFG print("Induce PCFG grammar from treebank data:") productions = [] item = treebank._fileids[0] for tree in treebank.parsed_sents(item)[:3]: # perform optional tree transformations, e.g.: tree.collapse_unary(collapsePOS=False) tree.chomsky_normal_form(horzMarkov=2) productions += tree.productions() S = Nonterminal("S") grammar = induce_pcfg(S, productions) print(grammar) print() print("Parse sentence using induced grammar:") parser = pchart.InsideChartParser(grammar) parser.trace(3) # doesn't work as tokens are different: # sent = treebank.tokenized('wsj_0001.mrg')[0] sent = treebank.parsed_sents(item)[0].leaves() print(sent) for parse in parser.parse(sent): print(parse) def fcfg_demo(): import nltk.data g = nltk.data.load("grammars/book_grammars/feat0.fcfg") print(g) print() def dg_demo(): """ A demonstration showing the creation and inspection of a ``DependencyGrammar``. """ grammar = DependencyGrammar.fromstring( """ 'scratch' -> 'cats' | 'walls' 'walls' -> 'the' 'cats' -> 'the' """ ) print(grammar) def sdg_demo(): """ A demonstration of how to read a string representation of a CoNLL format dependency tree. """ from nltk.parse import DependencyGraph dg = DependencyGraph( """ 1 Ze ze Pron Pron per|3|evofmv|nom 2 su _ _ 2 had heb V V trans|ovt|1of2of3|ev 0 ROOT _ _ 3 met met Prep Prep voor 8 mod _ _ 4 haar haar Pron Pron bez|3|ev|neut|attr 5 det _ _ 5 moeder moeder N N soort|ev|neut 3 obj1 _ _ 6 kunnen kan V V hulp|ott|1of2of3|mv 2 vc _ _ 7 gaan ga V V hulp|inf 6 vc _ _ 8 winkelen winkel V V intrans|inf 11 cnj _ _ 9 , , Punc Punc komma 8 punct _ _ 10 zwemmen zwem V V intrans|inf 11 cnj _ _ 11 of of Conj Conj neven 7 vc _ _ 12 terrassen terras N N soort|mv|neut 11 cnj _ _ 13 . . 
Punc Punc punt 12 punct _ _ """ ) tree = dg.tree() print(tree.pprint()) def demo(): cfg_demo() pcfg_demo() fcfg_demo() dg_demo() sdg_demo() if __name__ == "__main__": demo() __all__ = [ "Nonterminal", "nonterminals", "CFG", "Production", "PCFG", "ProbabilisticProduction", "DependencyGrammar", "DependencyProduction", "ProbabilisticDependencyGrammar", "induce_pcfg", "read_grammar", ] nltk-3.7/nltk/help.py000066400000000000000000000031551420073152400146070ustar00rootroot00000000000000# Natural Language Toolkit (NLTK) Help # # Copyright (C) 2001-2022 NLTK Project # Authors: Steven Bird # URL: # For license information, see LICENSE.TXT """ Provide structured access to documentation. """ import re from textwrap import wrap from nltk.data import load def brown_tagset(tagpattern=None): _format_tagset("brown_tagset", tagpattern) def claws5_tagset(tagpattern=None): _format_tagset("claws5_tagset", tagpattern) def upenn_tagset(tagpattern=None): _format_tagset("upenn_tagset", tagpattern) ##################################################################### # UTILITIES ##################################################################### def _print_entries(tags, tagdict): for tag in tags: entry = tagdict[tag] defn = [tag + ": " + entry[0]] examples = wrap( entry[1], width=75, initial_indent=" ", subsequent_indent=" " ) print("\n".join(defn + examples)) def _format_tagset(tagset, tagpattern=None): tagdict = load("help/tagsets/" + tagset + ".pickle") if not tagpattern: _print_entries(sorted(tagdict), tagdict) elif tagpattern in tagdict: _print_entries([tagpattern], tagdict) else: tagpattern = re.compile(tagpattern) tags = [tag for tag in sorted(tagdict) if tagpattern.match(tag)] if tags: _print_entries(tags, tagdict) else: print("No matching tags found.") if __name__ == "__main__": brown_tagset(r"NN.*") upenn_tagset(r".*\$") claws5_tagset("UNDEFINED") brown_tagset(r"NN") nltk-3.7/nltk/inference/000077500000000000000000000000001420073152400152375ustar00rootroot00000000000000nltk-3.7/nltk/inference/__init__.py000066400000000000000000000014261420073152400173530ustar00rootroot00000000000000# Natural Language Toolkit: Inference # # Copyright (C) 2001-2022 NLTK Project # Author: Dan Garrette # Ewan Klein # # URL: # For license information, see LICENSE.TXT """ Classes and interfaces for theorem proving and model building. """ from nltk.inference.api import ParallelProverBuilder, ParallelProverBuilderCommand from nltk.inference.discourse import ( CfgReadingCommand, DiscourseTester, DrtGlueReadingCommand, ReadingCommand, ) from nltk.inference.mace import Mace, MaceCommand from nltk.inference.prover9 import Prover9, Prover9Command from nltk.inference.resolution import ResolutionProver, ResolutionProverCommand from nltk.inference.tableau import TableauProver, TableauProverCommand nltk-3.7/nltk/inference/api.py000066400000000000000000000450021420073152400163630ustar00rootroot00000000000000# Natural Language Toolkit: Classifier Interface # # Author: Ewan Klein # Dan Garrette # # URL: # For license information, see LICENSE.TXT """ Interfaces and base classes for theorem provers and model builders. ``Prover`` is a standard interface for a theorem prover which tries to prove a goal from a list of assumptions. ``ModelBuilder`` is a standard interface for a model builder. Given just a set of assumptions. the model builder tries to build a model for the assumptions. 
Given a set of assumptions and a goal *G*, the model builder tries to find a counter-model, in the sense of a model that will satisfy the assumptions plus the negation of *G*. """ import threading import time from abc import ABCMeta, abstractmethod class Prover(metaclass=ABCMeta): """ Interface for trying to prove a goal from assumptions. Both the goal and the assumptions are constrained to be formulas of ``logic.Expression``. """ def prove(self, goal=None, assumptions=None, verbose=False): """ :return: Whether the proof was successful or not. :rtype: bool """ return self._prove(goal, assumptions, verbose)[0] @abstractmethod def _prove(self, goal=None, assumptions=None, verbose=False): """ :return: Whether the proof was successful or not, along with the proof :rtype: tuple: (bool, str) """ class ModelBuilder(metaclass=ABCMeta): """ Interface for trying to build a model of set of formulas. Open formulas are assumed to be universally quantified. Both the goal and the assumptions are constrained to be formulas of ``logic.Expression``. """ def build_model(self, goal=None, assumptions=None, verbose=False): """ Perform the actual model building. :return: Whether a model was generated :rtype: bool """ return self._build_model(goal, assumptions, verbose)[0] @abstractmethod def _build_model(self, goal=None, assumptions=None, verbose=False): """ Perform the actual model building. :return: Whether a model was generated, and the model itself :rtype: tuple(bool, sem.Valuation) """ class TheoremToolCommand(metaclass=ABCMeta): """ This class holds a goal and a list of assumptions to be used in proving or model building. """ @abstractmethod def add_assumptions(self, new_assumptions): """ Add new assumptions to the assumption list. :param new_assumptions: new assumptions :type new_assumptions: list(sem.Expression) """ @abstractmethod def retract_assumptions(self, retracted, debug=False): """ Retract assumptions from the assumption list. :param debug: If True, give warning when ``retracted`` is not present on assumptions list. :type debug: bool :param retracted: assumptions to be retracted :type retracted: list(sem.Expression) """ @abstractmethod def assumptions(self): """ List the current assumptions. :return: list of ``Expression`` """ @abstractmethod def goal(self): """ Return the goal :return: ``Expression`` """ @abstractmethod def print_assumptions(self): """ Print the list of the current assumptions. """ class ProverCommand(TheoremToolCommand): """ This class holds a ``Prover``, a goal, and a list of assumptions. When prove() is called, the ``Prover`` is executed with the goal and assumptions. """ @abstractmethod def prove(self, verbose=False): """ Perform the actual proof. """ @abstractmethod def proof(self, simplify=True): """ Return the proof string :param simplify: bool simplify the proof? :return: str """ @abstractmethod def get_prover(self): """ Return the prover object :return: ``Prover`` """ class ModelBuilderCommand(TheoremToolCommand): """ This class holds a ``ModelBuilder``, a goal, and a list of assumptions. When build_model() is called, the ``ModelBuilder`` is executed with the goal and assumptions. """ @abstractmethod def build_model(self, verbose=False): """ Perform the actual model building. :return: A model if one is generated; None otherwise. :rtype: sem.Valuation """ @abstractmethod def model(self, format=None): """ Return a string representation of the model :param simplify: bool simplify the proof? 
:return: str """ @abstractmethod def get_model_builder(self): """ Return the model builder object :return: ``ModelBuilder`` """ class BaseTheoremToolCommand(TheoremToolCommand): """ This class holds a goal and a list of assumptions to be used in proving or model building. """ def __init__(self, goal=None, assumptions=None): """ :param goal: Input expression to prove :type goal: sem.Expression :param assumptions: Input expressions to use as assumptions in the proof. :type assumptions: list(sem.Expression) """ self._goal = goal if not assumptions: self._assumptions = [] else: self._assumptions = list(assumptions) self._result = None """A holder for the result, to prevent unnecessary re-proving""" def add_assumptions(self, new_assumptions): """ Add new assumptions to the assumption list. :param new_assumptions: new assumptions :type new_assumptions: list(sem.Expression) """ self._assumptions.extend(new_assumptions) self._result = None def retract_assumptions(self, retracted, debug=False): """ Retract assumptions from the assumption list. :param debug: If True, give warning when ``retracted`` is not present on assumptions list. :type debug: bool :param retracted: assumptions to be retracted :type retracted: list(sem.Expression) """ retracted = set(retracted) result_list = list(filter(lambda a: a not in retracted, self._assumptions)) if debug and result_list == self._assumptions: print(Warning("Assumptions list has not been changed:")) self.print_assumptions() self._assumptions = result_list self._result = None def assumptions(self): """ List the current assumptions. :return: list of ``Expression`` """ return self._assumptions def goal(self): """ Return the goal :return: ``Expression`` """ return self._goal def print_assumptions(self): """ Print the list of the current assumptions. """ for a in self.assumptions(): print(a) class BaseProverCommand(BaseTheoremToolCommand, ProverCommand): """ This class holds a ``Prover``, a goal, and a list of assumptions. When prove() is called, the ``Prover`` is executed with the goal and assumptions. """ def __init__(self, prover, goal=None, assumptions=None): """ :param prover: The theorem tool to execute with the assumptions :type prover: Prover :see: ``BaseTheoremToolCommand`` """ self._prover = prover """The theorem tool to execute with the assumptions""" BaseTheoremToolCommand.__init__(self, goal, assumptions) self._proof = None def prove(self, verbose=False): """ Perform the actual proof. Store the result to prevent unnecessary re-proving. """ if self._result is None: self._result, self._proof = self._prover._prove( self.goal(), self.assumptions(), verbose ) return self._result def proof(self, simplify=True): """ Return the proof string :param simplify: bool simplify the proof? :return: str """ if self._result is None: raise LookupError("You have to call prove() first to get a proof!") else: return self.decorate_proof(self._proof, simplify) def decorate_proof(self, proof_string, simplify=True): """ Modify and return the proof string :param proof_string: str the proof to decorate :param simplify: bool simplify the proof? :return: str """ return proof_string def get_prover(self): return self._prover class BaseModelBuilderCommand(BaseTheoremToolCommand, ModelBuilderCommand): """ This class holds a ``ModelBuilder``, a goal, and a list of assumptions. When build_model() is called, the ``ModelBuilder`` is executed with the goal and assumptions. 
""" def __init__(self, modelbuilder, goal=None, assumptions=None): """ :param modelbuilder: The theorem tool to execute with the assumptions :type modelbuilder: ModelBuilder :see: ``BaseTheoremToolCommand`` """ self._modelbuilder = modelbuilder """The theorem tool to execute with the assumptions""" BaseTheoremToolCommand.__init__(self, goal, assumptions) self._model = None def build_model(self, verbose=False): """ Attempt to build a model. Store the result to prevent unnecessary re-building. """ if self._result is None: self._result, self._model = self._modelbuilder._build_model( self.goal(), self.assumptions(), verbose ) return self._result def model(self, format=None): """ Return a string representation of the model :param simplify: bool simplify the proof? :return: str """ if self._result is None: raise LookupError("You have to call build_model() first to " "get a model!") else: return self._decorate_model(self._model, format) def _decorate_model(self, valuation_str, format=None): """ :param valuation_str: str with the model builder's output :param format: str indicating the format for displaying :return: str """ return valuation_str def get_model_builder(self): return self._modelbuilder class TheoremToolCommandDecorator(TheoremToolCommand): """ A base decorator for the ``ProverCommandDecorator`` and ``ModelBuilderCommandDecorator`` classes from which decorators can extend. """ def __init__(self, command): """ :param command: ``TheoremToolCommand`` to decorate """ self._command = command # The decorator has its own versions of 'result' different from the # underlying command self._result = None def assumptions(self): return self._command.assumptions() def goal(self): return self._command.goal() def add_assumptions(self, new_assumptions): self._command.add_assumptions(new_assumptions) self._result = None def retract_assumptions(self, retracted, debug=False): self._command.retract_assumptions(retracted, debug) self._result = None def print_assumptions(self): self._command.print_assumptions() class ProverCommandDecorator(TheoremToolCommandDecorator, ProverCommand): """ A base decorator for the ``ProverCommand`` class from which other prover command decorators can extend. """ def __init__(self, proverCommand): """ :param proverCommand: ``ProverCommand`` to decorate """ TheoremToolCommandDecorator.__init__(self, proverCommand) # The decorator has its own versions of 'result' and 'proof' # because they may be different from the underlying command self._proof = None def prove(self, verbose=False): if self._result is None: prover = self.get_prover() self._result, self._proof = prover._prove( self.goal(), self.assumptions(), verbose ) return self._result def proof(self, simplify=True): """ Return the proof string :param simplify: bool simplify the proof? :return: str """ if self._result is None: raise LookupError("You have to call prove() first to get a proof!") else: return self.decorate_proof(self._proof, simplify) def decorate_proof(self, proof_string, simplify=True): """ Modify and return the proof string :param proof_string: str the proof to decorate :param simplify: bool simplify the proof? :return: str """ return self._command.decorate_proof(proof_string, simplify) def get_prover(self): return self._command.get_prover() class ModelBuilderCommandDecorator(TheoremToolCommandDecorator, ModelBuilderCommand): """ A base decorator for the ``ModelBuilderCommand`` class from which other prover command decorators can extend. 
""" def __init__(self, modelBuilderCommand): """ :param modelBuilderCommand: ``ModelBuilderCommand`` to decorate """ TheoremToolCommandDecorator.__init__(self, modelBuilderCommand) # The decorator has its own versions of 'result' and 'valuation' # because they may be different from the underlying command self._model = None def build_model(self, verbose=False): """ Attempt to build a model. Store the result to prevent unnecessary re-building. """ if self._result is None: modelbuilder = self.get_model_builder() self._result, self._model = modelbuilder._build_model( self.goal(), self.assumptions(), verbose ) return self._result def model(self, format=None): """ Return a string representation of the model :param simplify: bool simplify the proof? :return: str """ if self._result is None: raise LookupError("You have to call build_model() first to " "get a model!") else: return self._decorate_model(self._model, format) def _decorate_model(self, valuation_str, format=None): """ Modify and return the proof string :param valuation_str: str with the model builder's output :param format: str indicating the format for displaying :return: str """ return self._command._decorate_model(valuation_str, format) def get_model_builder(self): return self._command.get_prover() class ParallelProverBuilder(Prover, ModelBuilder): """ This class stores both a prover and a model builder and when either prove() or build_model() is called, then both theorem tools are run in parallel. Whichever finishes first, the prover or the model builder, is the result that will be used. """ def __init__(self, prover, modelbuilder): self._prover = prover self._modelbuilder = modelbuilder def _prove(self, goal=None, assumptions=None, verbose=False): return self._run(goal, assumptions, verbose), "" def _build_model(self, goal=None, assumptions=None, verbose=False): return not self._run(goal, assumptions, verbose), "" def _run(self, goal, assumptions, verbose): # Set up two thread, Prover and ModelBuilder to run in parallel tp_thread = TheoremToolThread( lambda: self._prover.prove(goal, assumptions, verbose), verbose, "TP" ) mb_thread = TheoremToolThread( lambda: self._modelbuilder.build_model(goal, assumptions, verbose), verbose, "MB", ) tp_thread.start() mb_thread.start() while tp_thread.is_alive() and mb_thread.is_alive(): # wait until either the prover or the model builder is done pass if tp_thread.result is not None: return tp_thread.result elif mb_thread.result is not None: return not mb_thread.result else: return None class ParallelProverBuilderCommand(BaseProverCommand, BaseModelBuilderCommand): """ This command stores both a prover and a model builder and when either prove() or build_model() is called, then both theorem tools are run in parallel. Whichever finishes first, the prover or the model builder, is the result that will be used. Because the theorem prover result is the opposite of the model builder result, we will treat self._result as meaning "proof found/no model found". 
""" def __init__(self, prover, modelbuilder, goal=None, assumptions=None): BaseProverCommand.__init__(self, prover, goal, assumptions) BaseModelBuilderCommand.__init__(self, modelbuilder, goal, assumptions) def prove(self, verbose=False): return self._run(verbose) def build_model(self, verbose=False): return not self._run(verbose) def _run(self, verbose): # Set up two thread, Prover and ModelBuilder to run in parallel tp_thread = TheoremToolThread( lambda: BaseProverCommand.prove(self, verbose), verbose, "TP" ) mb_thread = TheoremToolThread( lambda: BaseModelBuilderCommand.build_model(self, verbose), verbose, "MB" ) tp_thread.start() mb_thread.start() while tp_thread.is_alive() and mb_thread.is_alive(): # wait until either the prover or the model builder is done pass if tp_thread.result is not None: self._result = tp_thread.result elif mb_thread.result is not None: self._result = not mb_thread.result return self._result class TheoremToolThread(threading.Thread): def __init__(self, command, verbose, name=None): threading.Thread.__init__(self) self._command = command self._result = None self._verbose = verbose self._name = name def run(self): try: self._result = self._command() if self._verbose: print( "Thread %s finished with result %s at %s" % (self._name, self._result, time.localtime(time.time())) ) except Exception as e: print(e) print("Thread %s completed abnormally" % (self._name)) @property def result(self): return self._result nltk-3.7/nltk/inference/discourse.py000066400000000000000000000530301420073152400176120ustar00rootroot00000000000000# Natural Language Toolkit: Discourse Processing # # Author: Ewan Klein # Dan Garrette # # URL: # For license information, see LICENSE.TXT r""" Module for incrementally developing simple discourses, and checking for semantic ambiguity, consistency and informativeness. Many of the ideas are based on the CURT family of programs of Blackburn and Bos (see http://homepages.inf.ed.ac.uk/jbos/comsem/book1.html). Consistency checking is carried out by using the ``mace`` module to call the Mace4 model builder. Informativeness checking is carried out with a call to ``Prover.prove()`` from the ``inference`` module. ``DiscourseTester`` is a constructor for discourses. The basic data structure is a list of sentences, stored as ``self._sentences``. Each sentence in the list is assigned a "sentence ID" (``sid``) of the form ``s``\ *i*. For example:: s0: A boxer walks s1: Every boxer chases a girl Each sentence can be ambiguous between a number of readings, each of which receives a "reading ID" (``rid``) of the form ``s``\ *i* -``r``\ *j*. For example:: s0 readings: s0-r1: some x.(boxer(x) & walk(x)) s0-r0: some x.(boxerdog(x) & walk(x)) A "thread" is a list of readings, represented as a list of ``rid``\ s. Each thread receives a "thread ID" (``tid``) of the form ``d``\ *i*. For example:: d0: ['s0-r0', 's1-r0'] The set of all threads for a discourse is the Cartesian product of all the readings of the sequences of sentences. (This is not intended to scale beyond very short discourses!) The method ``readings(filter=True)`` will only show those threads which are consistent (taking into account any background assumptions). 
""" import os from abc import ABCMeta, abstractmethod from functools import reduce from operator import add, and_ from nltk.data import show_cfg from nltk.inference.mace import MaceCommand from nltk.inference.prover9 import Prover9Command from nltk.parse import load_parser from nltk.parse.malt import MaltParser from nltk.sem.drt import AnaphoraResolutionException, resolve_anaphora from nltk.sem.glue import DrtGlue from nltk.sem.logic import Expression from nltk.tag import RegexpTagger class ReadingCommand(metaclass=ABCMeta): @abstractmethod def parse_to_readings(self, sentence): """ :param sentence: the sentence to read :type sentence: str """ def process_thread(self, sentence_readings): """ This method should be used to handle dependencies between readings such as resolving anaphora. :param sentence_readings: readings to process :type sentence_readings: list(Expression) :return: the list of readings after processing :rtype: list(Expression) """ return sentence_readings @abstractmethod def combine_readings(self, readings): """ :param readings: readings to combine :type readings: list(Expression) :return: one combined reading :rtype: Expression """ @abstractmethod def to_fol(self, expression): """ Convert this expression into a First-Order Logic expression. :param expression: an expression :type expression: Expression :return: a FOL version of the input expression :rtype: Expression """ class CfgReadingCommand(ReadingCommand): def __init__(self, gramfile=None): """ :param gramfile: name of file where grammar can be loaded :type gramfile: str """ self._gramfile = ( gramfile if gramfile else "grammars/book_grammars/discourse.fcfg" ) self._parser = load_parser(self._gramfile) def parse_to_readings(self, sentence): """:see: ReadingCommand.parse_to_readings()""" from nltk.sem import root_semrep tokens = sentence.split() trees = self._parser.parse(tokens) return [root_semrep(tree) for tree in trees] def combine_readings(self, readings): """:see: ReadingCommand.combine_readings()""" return reduce(and_, readings) def to_fol(self, expression): """:see: ReadingCommand.to_fol()""" return expression class DrtGlueReadingCommand(ReadingCommand): def __init__(self, semtype_file=None, remove_duplicates=False, depparser=None): """ :param semtype_file: name of file where grammar can be loaded :param remove_duplicates: should duplicates be removed? :param depparser: the dependency parser """ if semtype_file is None: semtype_file = os.path.join( "grammars", "sample_grammars", "drt_glue.semtype" ) self._glue = DrtGlue( semtype_file=semtype_file, remove_duplicates=remove_duplicates, depparser=depparser, ) def parse_to_readings(self, sentence): """:see: ReadingCommand.parse_to_readings()""" return self._glue.parse_to_meaning(sentence) def process_thread(self, sentence_readings): """:see: ReadingCommand.process_thread()""" try: return [self.combine_readings(sentence_readings)] except AnaphoraResolutionException: return [] def combine_readings(self, readings): """:see: ReadingCommand.combine_readings()""" thread_reading = reduce(add, readings) return resolve_anaphora(thread_reading.simplify()) def to_fol(self, expression): """:see: ReadingCommand.to_fol()""" return expression.fol() class DiscourseTester: """ Check properties of an ongoing discourse. """ def __init__(self, input, reading_command=None, background=None): """ Initialize a ``DiscourseTester``. 
:param input: the discourse sentences :type input: list of str :param background: Formulas which express background assumptions :type background: list(Expression) """ self._input = input self._sentences = {"s%s" % i: sent for i, sent in enumerate(input)} self._models = None self._readings = {} self._reading_command = ( reading_command if reading_command else CfgReadingCommand() ) self._threads = {} self._filtered_threads = {} if background is not None: from nltk.sem.logic import Expression for e in background: assert isinstance(e, Expression) self._background = background else: self._background = [] ############################### # Sentences ############################### def sentences(self): """ Display the list of sentences in the current discourse. """ for id in sorted(self._sentences): print(f"{id}: {self._sentences[id]}") def add_sentence(self, sentence, informchk=False, consistchk=False): """ Add a sentence to the current discourse. Updates ``self._input`` and ``self._sentences``. :param sentence: An input sentence :type sentence: str :param informchk: if ``True``, check that the result of adding the sentence is thread-informative. Updates ``self._readings``. :param consistchk: if ``True``, check that the result of adding the sentence is thread-consistent. Updates ``self._readings``. """ # check whether the new sentence is informative (i.e. not entailed by the previous discourse) if informchk: self.readings(verbose=False) for tid in sorted(self._threads): assumptions = [reading for (rid, reading) in self.expand_threads(tid)] assumptions += self._background for sent_reading in self._get_readings(sentence): tp = Prover9Command(goal=sent_reading, assumptions=assumptions) if tp.prove(): print( "Sentence '%s' under reading '%s':" % (sentence, str(sent_reading)) ) print("Not informative relative to thread '%s'" % tid) self._input.append(sentence) self._sentences = {"s%s" % i: sent for i, sent in enumerate(self._input)} # check whether adding the new sentence to the discourse preserves consistency (i.e. a model can be found for the combined set of # of assumptions if consistchk: self.readings(verbose=False) self.models(show=False) def retract_sentence(self, sentence, verbose=True): """ Remove a sentence from the current discourse. Updates ``self._input``, ``self._sentences`` and ``self._readings``. :param sentence: An input sentence :type sentence: str :param verbose: If ``True``, report on the updated list of sentences. """ try: self._input.remove(sentence) except ValueError: print( "Retraction failed. The sentence '%s' is not part of the current discourse:" % sentence ) self.sentences() return None self._sentences = {"s%s" % i: sent for i, sent in enumerate(self._input)} self.readings(verbose=False) if verbose: print("Current sentences are ") self.sentences() def grammar(self): """ Print out the grammar in use for parsing input sentences """ show_cfg(self._reading_command._gramfile) ############################### # Readings and Threads ############################### def _get_readings(self, sentence): """ Build a list of semantic readings for a sentence. :rtype: list(Expression) """ return self._reading_command.parse_to_readings(sentence) def _construct_readings(self): """ Use ``self._sentences`` to construct a value for ``self._readings``. 
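The result maps each sentence ID to a dict of its readings, keyed by reading ID;
schematically::

    {'s0': {'s0-r0': <simplified Expression>, 's0-r1': ...},
     's1': {'s1-r0': ...}}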
""" # re-initialize self._readings in case we have retracted a sentence self._readings = {} for sid in sorted(self._sentences): sentence = self._sentences[sid] readings = self._get_readings(sentence) self._readings[sid] = { f"{sid}-r{rid}": reading.simplify() for rid, reading in enumerate(sorted(readings, key=str)) } def _construct_threads(self): """ Use ``self._readings`` to construct a value for ``self._threads`` and use the model builder to construct a value for ``self._filtered_threads`` """ thread_list = [[]] for sid in sorted(self._readings): thread_list = self.multiply(thread_list, sorted(self._readings[sid])) self._threads = {"d%s" % tid: thread for tid, thread in enumerate(thread_list)} # re-initialize the filtered threads self._filtered_threads = {} # keep the same ids, but only include threads which get models consistency_checked = self._check_consistency(self._threads) for (tid, thread) in self._threads.items(): if (tid, True) in consistency_checked: self._filtered_threads[tid] = thread def _show_readings(self, sentence=None): """ Print out the readings for the discourse (or a single sentence). """ if sentence is not None: print("The sentence '%s' has these readings:" % sentence) for r in [str(reading) for reading in (self._get_readings(sentence))]: print(" %s" % r) else: for sid in sorted(self._readings): print() print("%s readings:" % sid) print() #'-' * 30 for rid in sorted(self._readings[sid]): lf = self._readings[sid][rid] print(f"{rid}: {lf.normalize()}") def _show_threads(self, filter=False, show_thread_readings=False): """ Print out the value of ``self._threads`` or ``self._filtered_hreads`` """ threads = self._filtered_threads if filter else self._threads for tid in sorted(threads): if show_thread_readings: readings = [ self._readings[rid.split("-")[0]][rid] for rid in self._threads[tid] ] try: thread_reading = ( ": %s" % self._reading_command.combine_readings(readings).normalize() ) except Exception as e: thread_reading = ": INVALID: %s" % e.__class__.__name__ else: thread_reading = "" print("%s:" % tid, self._threads[tid], thread_reading) def readings( self, sentence=None, threaded=False, verbose=True, filter=False, show_thread_readings=False, ): """ Construct and show the readings of the discourse (or of a single sentence). :param sentence: test just this sentence :type sentence: str :param threaded: if ``True``, print out each thread ID and the corresponding thread. :param filter: if ``True``, only print out consistent thread IDs and threads. """ self._construct_readings() self._construct_threads() # if we are filtering or showing thread readings, show threads if filter or show_thread_readings: threaded = True if verbose: if not threaded: self._show_readings(sentence=sentence) else: self._show_threads( filter=filter, show_thread_readings=show_thread_readings ) def expand_threads(self, thread_id, threads=None): """ Given a thread ID, find the list of ``logic.Expression`` objects corresponding to the reading IDs in that thread. 
:param thread_id: thread ID :type thread_id: str :param threads: a mapping from thread IDs to lists of reading IDs :type threads: dict :return: A list of pairs ``(rid, reading)`` where reading is the ``logic.Expression`` associated with a reading ID :rtype: list of tuple """ if threads is None: threads = self._threads return [ (rid, self._readings[sid][rid]) for rid in threads[thread_id] for sid in rid.split("-")[:1] ] ############################### # Models and Background ############################### def _check_consistency(self, threads, show=False, verbose=False): results = [] for tid in sorted(threads): assumptions = [ reading for (rid, reading) in self.expand_threads(tid, threads=threads) ] assumptions = list( map( self._reading_command.to_fol, self._reading_command.process_thread(assumptions), ) ) if assumptions: assumptions += self._background # if Mace4 finds a model, it always seems to find it quickly mb = MaceCommand(None, assumptions, max_models=20) modelfound = mb.build_model() else: modelfound = False results.append((tid, modelfound)) if show: spacer(80) print("Model for Discourse Thread %s" % tid) spacer(80) if verbose: for a in assumptions: print(a) spacer(80) if modelfound: print(mb.model(format="cooked")) else: print("No model found!\n") return results def models(self, thread_id=None, show=True, verbose=False): """ Call Mace4 to build a model for each current discourse thread. :param thread_id: thread ID :type thread_id: str :param show: If ``True``, display the model that has been found. """ self._construct_readings() self._construct_threads() threads = {thread_id: self._threads[thread_id]} if thread_id else self._threads for (tid, modelfound) in self._check_consistency( threads, show=show, verbose=verbose ): idlist = [rid for rid in threads[tid]] if not modelfound: print(f"Inconsistent discourse: {tid} {idlist}:") for rid, reading in self.expand_threads(tid): print(f" {rid}: {reading.normalize()}") print() else: print(f"Consistent discourse: {tid} {idlist}:") for rid, reading in self.expand_threads(tid): print(f" {rid}: {reading.normalize()}") print() def add_background(self, background, verbose=False): """ Add a list of background assumptions for reasoning about the discourse. When called, this method also updates the discourse model's set of readings and threads. :param background: Formulas which contain background information :type background: list(Expression) """ from nltk.sem.logic import Expression for (count, e) in enumerate(background): assert isinstance(e, Expression) if verbose: print("Adding assumption %s to background" % count) self._background.append(e) # update the state self._construct_readings() self._construct_threads() def background(self): """ Show the current background assumptions. """ for e in self._background: print(str(e)) ############################### # Misc ############################### @staticmethod def multiply(discourse, readings): """ Multiply every thread in ``discourse`` by every reading in ``readings``. Given discourse = [['A'], ['B']], readings = ['a', 'b', 'c'] , returns [['A', 'a'], ['A', 'b'], ['A', 'c'], ['B', 'a'], ['B', 'b'], ['B', 'c']] :param discourse: the current list of readings :type discourse: list of lists :param readings: an additional list of readings :type readings: list(Expression) :rtype: A list of lists """ result = [] for sublist in discourse: for r in readings: new = [] new += sublist new.append(r) result.append(new) return result def load_fol(s): """ Temporarily duplicated from ``nltk.sem.util``. 
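# --------------------------------------------------------------------------
# Worked example for the pure helper DiscourseTester.multiply (no grammars or
# provers needed): threads are built by repeatedly crossing the current
# thread list with the reading ids of the next sentence, so sentences with
# 2 and 3 readings yield 2 * 3 = 6 candidate threads.
threads = [[]]
threads = DiscourseTester.multiply(threads, ["s0-r0", "s0-r1"])
threads = DiscourseTester.multiply(threads, ["s1-r0", "s1-r1", "s1-r2"])
print(len(threads))    # 6
print(threads[0])      # ['s0-r0', 's1-r0']
# --------------------------------------------------------------------------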
Convert a file of first order formulas into a list of ``Expression`` objects. :param s: the contents of the file :type s: str :return: a list of parsed formulas. :rtype: list(Expression) """ statements = [] for linenum, line in enumerate(s.splitlines()): line = line.strip() if line.startswith("#") or line == "": continue try: statements.append(Expression.fromstring(line)) except Exception as e: raise ValueError(f"Unable to parse line {linenum}: {line}") from e return statements ############################### # Demo ############################### def discourse_demo(reading_command=None): """ Illustrate the various methods of ``DiscourseTester`` """ dt = DiscourseTester( ["A boxer walks", "Every boxer chases a girl"], reading_command ) dt.models() print() # dt.grammar() print() dt.sentences() print() dt.readings() print() dt.readings(threaded=True) print() dt.models("d1") dt.add_sentence("John is a boxer") print() dt.sentences() print() dt.readings(threaded=True) print() dt = DiscourseTester( ["A student dances", "Every student is a person"], reading_command ) print() dt.add_sentence("No person dances", consistchk=True) print() dt.readings() print() dt.retract_sentence("No person dances", verbose=True) print() dt.models() print() dt.readings("A person dances") print() dt.add_sentence("A person dances", informchk=True) dt = DiscourseTester( ["Vincent is a boxer", "Fido is a boxer", "Vincent is married", "Fido barks"], reading_command, ) dt.readings(filter=True) import nltk.data background_file = os.path.join("grammars", "book_grammars", "background.fol") background = nltk.data.load(background_file) print() dt.add_background(background, verbose=False) dt.background() print() dt.readings(filter=True) print() dt.models() def drt_discourse_demo(reading_command=None): """ Illustrate the various methods of ``DiscourseTester`` """ dt = DiscourseTester(["every dog chases a boy", "he runs"], reading_command) dt.models() print() dt.sentences() print() dt.readings() print() dt.readings(show_thread_readings=True) print() dt.readings(filter=True, show_thread_readings=True) def spacer(num=30): print("-" * num) def demo(): discourse_demo() tagger = RegexpTagger( [ ("^(chases|runs)$", "VB"), ("^(a)$", "ex_quant"), ("^(every)$", "univ_quant"), ("^(dog|boy)$", "NN"), ("^(he)$", "PRP"), ] ) depparser = MaltParser(tagger=tagger) drt_discourse_demo( DrtGlueReadingCommand(remove_duplicates=False, depparser=depparser) ) if __name__ == "__main__": demo() nltk-3.7/nltk/inference/mace.py000066400000000000000000000271241420073152400165240ustar00rootroot00000000000000# Natural Language Toolkit: Interface to the Mace4 Model Builder # # Author: Dan Garrette # Ewan Klein # URL: # For license information, see LICENSE.TXT """ A model builder that makes use of the external 'Mace4' package. """ import os import tempfile from nltk.inference.api import BaseModelBuilderCommand, ModelBuilder from nltk.inference.prover9 import Prover9CommandParent, Prover9Parent from nltk.sem import Expression, Valuation from nltk.sem.logic import is_indvar class MaceCommand(Prover9CommandParent, BaseModelBuilderCommand): """ A ``MaceCommand`` specific to the ``Mace`` model builder. It contains a print_assumptions() method that is used to print the list of assumptions in multiple formats. """ _interpformat_bin = None def __init__(self, goal=None, assumptions=None, max_models=500, model_builder=None): """ :param goal: Input expression to prove :type goal: sem.Expression :param assumptions: Input expressions to use as assumptions in the proof. 
:type assumptions: list(sem.Expression) :param max_models: The maximum number of models that Mace will try before simply returning false. (Use 0 for no maximum.) :type max_models: int """ if model_builder is not None: assert isinstance(model_builder, Mace) else: model_builder = Mace(max_models) BaseModelBuilderCommand.__init__(self, model_builder, goal, assumptions) @property def valuation(mbc): return mbc.model("valuation") def _convert2val(self, valuation_str): """ Transform the output file into an NLTK-style Valuation. :return: A model if one is generated; None otherwise. :rtype: sem.Valuation """ valuation_standard_format = self._transform_output(valuation_str, "standard") val = [] for line in valuation_standard_format.splitlines(False): l = line.strip() if l.startswith("interpretation"): # find the number of entities in the model num_entities = int(l[l.index("(") + 1 : l.index(",")].strip()) elif l.startswith("function") and l.find("_") == -1: # replace the integer identifier with a corresponding alphabetic character name = l[l.index("(") + 1 : l.index(",")].strip() if is_indvar(name): name = name.upper() value = int(l[l.index("[") + 1 : l.index("]")].strip()) val.append((name, MaceCommand._make_model_var(value))) elif l.startswith("relation"): l = l[l.index("(") + 1 :] if "(" in l: # relation is not nullary name = l[: l.index("(")].strip() values = [ int(v.strip()) for v in l[l.index("[") + 1 : l.index("]")].split(",") ] val.append( (name, MaceCommand._make_relation_set(num_entities, values)) ) else: # relation is nullary name = l[: l.index(",")].strip() value = int(l[l.index("[") + 1 : l.index("]")].strip()) val.append((name, value == 1)) return Valuation(val) @staticmethod def _make_relation_set(num_entities, values): """ Convert a Mace4-style relation table into a dictionary. :param num_entities: the number of entities in the model; determines the row length in the table. :type num_entities: int :param values: a list of 1's and 0's that represent whether a relation holds in a Mace4 model. :type values: list of int """ r = set() for position in [pos for (pos, v) in enumerate(values) if v == 1]: r.add( tuple(MaceCommand._make_relation_tuple(position, values, num_entities)) ) return r @staticmethod def _make_relation_tuple(position, values, num_entities): if len(values) == 1: return [] else: sublist_size = len(values) // num_entities sublist_start = position // sublist_size sublist_position = int(position % sublist_size) sublist = values[ sublist_start * sublist_size : (sublist_start + 1) * sublist_size ] return [ MaceCommand._make_model_var(sublist_start) ] + MaceCommand._make_relation_tuple( sublist_position, sublist, num_entities ) @staticmethod def _make_model_var(value): """ Pick an alphabetic character as identifier for an entity in the model. :param value: where to index into the list of characters :type value: int """ letter = [ "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", ][value] num = value // 26 return letter + str(num) if num > 0 else letter def _decorate_model(self, valuation_str, format): """ Print out a Mace4 model using any Mace4 ``interpformat`` format. See https://www.cs.unm.edu/~mccune/mace4/manual/ for details. :param valuation_str: str with the model builder's output :param format: str indicating the format for displaying models. Defaults to 'standard' format. 
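# --------------------------------------------------------------------------
# The static helpers above are pure, so their behaviour can be checked
# without running Mace4 (mirrors test_make_relation_set() at the bottom of
# this module): entity 0 becomes 'a', entity 2 becomes 'c', and a unary
# relation table [1, 0, 1] over 3 entities holds of 'a' and 'c'.
print(MaceCommand._make_model_var(0))    # 'a'
print(MaceCommand._make_model_var(2))    # 'c'
print(MaceCommand._make_relation_set(num_entities=3, values=[1, 0, 1]))
# {('a',), ('c',)}
# --------------------------------------------------------------------------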
:return: str """ if not format: return valuation_str elif format == "valuation": return self._convert2val(valuation_str) else: return self._transform_output(valuation_str, format) def _transform_output(self, valuation_str, format): """ Transform the output file into any Mace4 ``interpformat`` format. :param format: Output format for displaying models. :type format: str """ if format in [ "standard", "standard2", "portable", "tabular", "raw", "cooked", "xml", "tex", ]: return self._call_interpformat(valuation_str, [format])[0] else: raise LookupError("The specified format does not exist") def _call_interpformat(self, input_str, args=[], verbose=False): """ Call the ``interpformat`` binary with the given input. :param input_str: A string whose contents are used as stdin. :param args: A list of command-line arguments. :return: A tuple (stdout, returncode) :see: ``config_prover9`` """ if self._interpformat_bin is None: self._interpformat_bin = self._modelbuilder._find_binary( "interpformat", verbose ) return self._modelbuilder._call( input_str, self._interpformat_bin, args, verbose ) class Mace(Prover9Parent, ModelBuilder): _mace4_bin = None def __init__(self, end_size=500): self._end_size = end_size """The maximum model size that Mace will try before simply returning false. (Use -1 for no maximum.)""" def _build_model(self, goal=None, assumptions=None, verbose=False): """ Use Mace4 to build a first order model. :return: ``True`` if a model was found (i.e. Mace returns value of 0), else ``False`` """ if not assumptions: assumptions = [] stdout, returncode = self._call_mace4( self.prover9_input(goal, assumptions), verbose=verbose ) return (returncode == 0, stdout) def _call_mace4(self, input_str, args=[], verbose=False): """ Call the ``mace4`` binary with the given input. :param input_str: A string whose contents are used as stdin. :param args: A list of command-line arguments. :return: A tuple (stdout, returncode) :see: ``config_prover9`` """ if self._mace4_bin is None: self._mace4_bin = self._find_binary("mace4", verbose) updated_input_str = "" if self._end_size > 0: updated_input_str += "assign(end_size, %d).\n\n" % self._end_size updated_input_str += input_str return self._call(updated_input_str, self._mace4_bin, args, verbose) def spacer(num=30): print("-" * num) def decode_result(found): """ Decode the result of model_found() :param found: The output of model_found() :type found: bool """ return {True: "Countermodel found", False: "No countermodel found", None: "None"}[ found ] def test_model_found(arguments): """ Try some proofs and exhibit the results. """ for (goal, assumptions) in arguments: g = Expression.fromstring(goal) alist = [lp.parse(a) for a in assumptions] m = MaceCommand(g, assumptions=alist, max_models=50) found = m.build_model() for a in alist: print(" %s" % a) print(f"|- {g}: {decode_result(found)}\n") def test_build_model(arguments): """ Try to build a ``nltk.sem.Valuation``. 
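# --------------------------------------------------------------------------
# End-to-end sketch (requires the mace4 and interpformat binaries, located
# the same way as Prover9): build a countermodel for a non-theorem and view
# it in several formats.  The exact model printed can vary by Mace4 version.
g = Expression.fromstring("all x.man(x)")
a = Expression.fromstring("man(Socrates)")
mc = MaceCommand(g, assumptions=[a], max_models=50)
print(mc.build_model())            # True -> a countermodel was found
print(mc.model(format="cooked"))   # human-readable Mace4 output
print(mc.valuation)                # the same model as an nltk.sem Valuation
# --------------------------------------------------------------------------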
""" g = Expression.fromstring("all x.man(x)") alist = [ Expression.fromstring(a) for a in [ "man(John)", "man(Socrates)", "man(Bill)", "some x.(-(x = John) & man(x) & sees(John,x))", "some x.(-(x = Bill) & man(x))", "all x.some y.(man(x) -> gives(Socrates,x,y))", ] ] m = MaceCommand(g, assumptions=alist) m.build_model() spacer() print("Assumptions and Goal") spacer() for a in alist: print(" %s" % a) print(f"|- {g}: {decode_result(m.build_model())}\n") spacer() # print(m.model('standard')) # print(m.model('cooked')) print("Valuation") spacer() print(m.valuation, "\n") def test_transform_output(argument_pair): """ Transform the model into various Mace4 ``interpformat`` formats. """ g = Expression.fromstring(argument_pair[0]) alist = [lp.parse(a) for a in argument_pair[1]] m = MaceCommand(g, assumptions=alist) m.build_model() for a in alist: print(" %s" % a) print(f"|- {g}: {m.build_model()}\n") for format in ["standard", "portable", "xml", "cooked"]: spacer() print("Using '%s' format" % format) spacer() print(m.model(format=format)) def test_make_relation_set(): print( MaceCommand._make_relation_set(num_entities=3, values=[1, 0, 1]) == {("c",), ("a",)} ) print( MaceCommand._make_relation_set( num_entities=3, values=[0, 0, 0, 0, 0, 0, 1, 0, 0] ) == {("c", "a")} ) print( MaceCommand._make_relation_set(num_entities=2, values=[0, 0, 1, 0, 0, 0, 1, 0]) == {("a", "b", "a"), ("b", "b", "a")} ) arguments = [ ("mortal(Socrates)", ["all x.(man(x) -> mortal(x))", "man(Socrates)"]), ("(not mortal(Socrates))", ["all x.(man(x) -> mortal(x))", "man(Socrates)"]), ] def demo(): test_model_found(arguments) test_build_model(arguments) test_transform_output(arguments[1]) if __name__ == "__main__": demo() nltk-3.7/nltk/inference/nonmonotonic.py000066400000000000000000000442651420073152400203440ustar00rootroot00000000000000# Natural Language Toolkit: Nonmonotonic Reasoning # # Author: Daniel H. Garrette # # Copyright (C) 2001-2022 NLTK Project # URL: # For license information, see LICENSE.TXT """ A module to perform nonmonotonic reasoning. The ideas and demonstrations in this module are based on "Logical Foundations of Artificial Intelligence" by Michael R. Genesereth and Nils J. Nilsson. """ from collections import defaultdict from functools import reduce from nltk.inference.api import Prover, ProverCommandDecorator from nltk.inference.prover9 import Prover9, Prover9Command from nltk.sem.logic import ( AbstractVariableExpression, AllExpression, AndExpression, ApplicationExpression, BooleanExpression, EqualityExpression, ExistsExpression, Expression, ImpExpression, NegatedExpression, Variable, VariableExpression, operator, unique_variable, ) class ProverParseError(Exception): pass def get_domain(goal, assumptions): if goal is None: all_expressions = assumptions else: all_expressions = assumptions + [-goal] return reduce(operator.or_, (a.constants() for a in all_expressions), set()) class ClosedDomainProver(ProverCommandDecorator): """ This is a prover decorator that adds domain closure assumptions before proving. 
""" def assumptions(self): assumptions = [a for a in self._command.assumptions()] goal = self._command.goal() domain = get_domain(goal, assumptions) return [self.replace_quants(ex, domain) for ex in assumptions] def goal(self): goal = self._command.goal() domain = get_domain(goal, self._command.assumptions()) return self.replace_quants(goal, domain) def replace_quants(self, ex, domain): """ Apply the closed domain assumption to the expression - Domain = union([e.free()|e.constants() for e in all_expressions]) - translate "exists x.P" to "(z=d1 | z=d2 | ... ) & P.replace(x,z)" OR "P.replace(x, d1) | P.replace(x, d2) | ..." - translate "all x.P" to "P.replace(x, d1) & P.replace(x, d2) & ..." :param ex: ``Expression`` :param domain: set of {Variable}s :return: ``Expression`` """ if isinstance(ex, AllExpression): conjuncts = [ ex.term.replace(ex.variable, VariableExpression(d)) for d in domain ] conjuncts = [self.replace_quants(c, domain) for c in conjuncts] return reduce(lambda x, y: x & y, conjuncts) elif isinstance(ex, BooleanExpression): return ex.__class__( self.replace_quants(ex.first, domain), self.replace_quants(ex.second, domain), ) elif isinstance(ex, NegatedExpression): return -self.replace_quants(ex.term, domain) elif isinstance(ex, ExistsExpression): disjuncts = [ ex.term.replace(ex.variable, VariableExpression(d)) for d in domain ] disjuncts = [self.replace_quants(d, domain) for d in disjuncts] return reduce(lambda x, y: x | y, disjuncts) else: return ex class UniqueNamesProver(ProverCommandDecorator): """ This is a prover decorator that adds unique names assumptions before proving. """ def assumptions(self): """ - Domain = union([e.free()|e.constants() for e in all_expressions]) - if "d1 = d2" cannot be proven from the premises, then add "d1 != d2" """ assumptions = self._command.assumptions() domain = list(get_domain(self._command.goal(), assumptions)) # build a dictionary of obvious equalities eq_sets = SetHolder() for a in assumptions: if isinstance(a, EqualityExpression): av = a.first.variable bv = a.second.variable # put 'a' and 'b' in the same set eq_sets[av].add(bv) new_assumptions = [] for i, a in enumerate(domain): for b in domain[i + 1 :]: # if a and b are not already in the same equality set if b not in eq_sets[a]: newEqEx = EqualityExpression( VariableExpression(a), VariableExpression(b) ) if Prover9().prove(newEqEx, assumptions): # we can prove that the names are the same entity. # remember that they are equal so we don't re-check. eq_sets[a].add(b) else: # we can't prove it, so assume unique names new_assumptions.append(-newEqEx) return assumptions + new_assumptions class SetHolder(list): """ A list of sets of Variables. """ def __getitem__(self, item): """ :param item: ``Variable`` :return: the set containing 'item' """ assert isinstance(item, Variable) for s in self: if item in s: return s # item is not found in any existing set. so create a new set new = {item} self.append(new) return new class ClosedWorldProver(ProverCommandDecorator): """ This is a prover decorator that completes predicates before proving. If the assumptions contain "P(A)", then "all x.(P(x) -> (x=A))" is the completion of "P". If the assumptions contain "all x.(ostrich(x) -> bird(x))", then "all x.(bird(x) -> ostrich(x))" is the completion of "bird". If the assumptions don't contain anything that are "P", then "all x.-P(x)" is the completion of "P". 
walk(Socrates) Socrates != Bill + all x.(walk(x) -> (x=Socrates)) ---------------- -walk(Bill) see(Socrates, John) see(John, Mary) Socrates != John John != Mary + all x.all y.(see(x,y) -> ((x=Socrates & y=John) | (x=John & y=Mary))) ---------------- -see(Socrates, Mary) all x.(ostrich(x) -> bird(x)) bird(Tweety) -ostrich(Sam) Sam != Tweety + all x.(bird(x) -> (ostrich(x) | x=Tweety)) + all x.-ostrich(x) ------------------- -bird(Sam) """ def assumptions(self): assumptions = self._command.assumptions() predicates = self._make_predicate_dict(assumptions) new_assumptions = [] for p in predicates: predHolder = predicates[p] new_sig = self._make_unique_signature(predHolder) new_sig_exs = [VariableExpression(v) for v in new_sig] disjuncts = [] # Turn the signatures into disjuncts for sig in predHolder.signatures: equality_exs = [] for v1, v2 in zip(new_sig_exs, sig): equality_exs.append(EqualityExpression(v1, v2)) disjuncts.append(reduce(lambda x, y: x & y, equality_exs)) # Turn the properties into disjuncts for prop in predHolder.properties: # replace variables from the signature with new sig variables bindings = {} for v1, v2 in zip(new_sig_exs, prop[0]): bindings[v2] = v1 disjuncts.append(prop[1].substitute_bindings(bindings)) # make the assumption if disjuncts: # disjuncts exist, so make an implication antecedent = self._make_antecedent(p, new_sig) consequent = reduce(lambda x, y: x | y, disjuncts) accum = ImpExpression(antecedent, consequent) else: # nothing has property 'p' accum = NegatedExpression(self._make_antecedent(p, new_sig)) # quantify the implication for new_sig_var in new_sig[::-1]: accum = AllExpression(new_sig_var, accum) new_assumptions.append(accum) return assumptions + new_assumptions def _make_unique_signature(self, predHolder): """ This method figures out how many arguments the predicate takes and returns a tuple containing that number of unique variables. """ return tuple(unique_variable() for i in range(predHolder.signature_len)) def _make_antecedent(self, predicate, signature): """ Return an application expression with 'predicate' as the predicate and 'signature' as the list of arguments. """ antecedent = predicate for v in signature: antecedent = antecedent(VariableExpression(v)) return antecedent def _make_predicate_dict(self, assumptions): """ Create a dictionary of predicates from the assumptions. 
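# --------------------------------------------------------------------------
# Usage sketch for predicate completion (mirrors the first example in the
# class docstring and closed_world_demo() further below; requires Prover9).
lexpr = Expression.fromstring
p1 = lexpr(r"walk(Socrates)")
p2 = lexpr(r"(Socrates != Bill)")
c = lexpr(r"-walk(Bill)")
prover = Prover9Command(c, [p1, p2])
print(prover.prove())           # False without completion
cwp = ClosedWorldProver(prover)
for a in cwp.assumptions():     # adds all x.(walk(x) -> (x = Socrates))
    print(" ", a)
print(cwp.prove())              # True once 'walk' has been completed
# --------------------------------------------------------------------------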
:param assumptions: a list of ``Expression``s :return: dict mapping ``AbstractVariableExpression`` to ``PredHolder`` """ predicates = defaultdict(PredHolder) for a in assumptions: self._map_predicates(a, predicates) return predicates def _map_predicates(self, expression, predDict): if isinstance(expression, ApplicationExpression): func, args = expression.uncurry() if isinstance(func, AbstractVariableExpression): predDict[func].append_sig(tuple(args)) elif isinstance(expression, AndExpression): self._map_predicates(expression.first, predDict) self._map_predicates(expression.second, predDict) elif isinstance(expression, AllExpression): # collect all the universally quantified variables sig = [expression.variable] term = expression.term while isinstance(term, AllExpression): sig.append(term.variable) term = term.term if isinstance(term, ImpExpression): if isinstance(term.first, ApplicationExpression) and isinstance( term.second, ApplicationExpression ): func1, args1 = term.first.uncurry() func2, args2 = term.second.uncurry() if ( isinstance(func1, AbstractVariableExpression) and isinstance(func2, AbstractVariableExpression) and sig == [v.variable for v in args1] and sig == [v.variable for v in args2] ): predDict[func2].append_prop((tuple(sig), term.first)) predDict[func1].validate_sig_len(sig) class PredHolder: """ This class will be used by a dictionary that will store information about predicates to be used by the ``ClosedWorldProver``. The 'signatures' property is a list of tuples defining signatures for which the predicate is true. For instance, 'see(john, mary)' would be result in the signature '(john,mary)' for 'see'. The second element of the pair is a list of pairs such that the first element of the pair is a tuple of variables and the second element is an expression of those variables that makes the predicate true. For instance, 'all x.all y.(see(x,y) -> know(x,y))' would result in "((x,y),('see(x,y)'))" for 'know'. 
""" def __init__(self): self.signatures = [] self.properties = [] self.signature_len = None def append_sig(self, new_sig): self.validate_sig_len(new_sig) self.signatures.append(new_sig) def append_prop(self, new_prop): self.validate_sig_len(new_prop[0]) self.properties.append(new_prop) def validate_sig_len(self, new_sig): if self.signature_len is None: self.signature_len = len(new_sig) elif self.signature_len != len(new_sig): raise Exception("Signature lengths do not match") def __str__(self): return f"({self.signatures},{self.properties},{self.signature_len})" def __repr__(self): return "%s" % self def closed_domain_demo(): lexpr = Expression.fromstring p1 = lexpr(r"exists x.walk(x)") p2 = lexpr(r"man(Socrates)") c = lexpr(r"walk(Socrates)") prover = Prover9Command(c, [p1, p2]) print(prover.prove()) cdp = ClosedDomainProver(prover) print("assumptions:") for a in cdp.assumptions(): print(" ", a) print("goal:", cdp.goal()) print(cdp.prove()) p1 = lexpr(r"exists x.walk(x)") p2 = lexpr(r"man(Socrates)") p3 = lexpr(r"-walk(Bill)") c = lexpr(r"walk(Socrates)") prover = Prover9Command(c, [p1, p2, p3]) print(prover.prove()) cdp = ClosedDomainProver(prover) print("assumptions:") for a in cdp.assumptions(): print(" ", a) print("goal:", cdp.goal()) print(cdp.prove()) p1 = lexpr(r"exists x.walk(x)") p2 = lexpr(r"man(Socrates)") p3 = lexpr(r"-walk(Bill)") c = lexpr(r"walk(Socrates)") prover = Prover9Command(c, [p1, p2, p3]) print(prover.prove()) cdp = ClosedDomainProver(prover) print("assumptions:") for a in cdp.assumptions(): print(" ", a) print("goal:", cdp.goal()) print(cdp.prove()) p1 = lexpr(r"walk(Socrates)") p2 = lexpr(r"walk(Bill)") c = lexpr(r"all x.walk(x)") prover = Prover9Command(c, [p1, p2]) print(prover.prove()) cdp = ClosedDomainProver(prover) print("assumptions:") for a in cdp.assumptions(): print(" ", a) print("goal:", cdp.goal()) print(cdp.prove()) p1 = lexpr(r"girl(mary)") p2 = lexpr(r"dog(rover)") p3 = lexpr(r"all x.(girl(x) -> -dog(x))") p4 = lexpr(r"all x.(dog(x) -> -girl(x))") p5 = lexpr(r"chase(mary, rover)") c = lexpr(r"exists y.(dog(y) & all x.(girl(x) -> chase(x,y)))") prover = Prover9Command(c, [p1, p2, p3, p4, p5]) print(prover.prove()) cdp = ClosedDomainProver(prover) print("assumptions:") for a in cdp.assumptions(): print(" ", a) print("goal:", cdp.goal()) print(cdp.prove()) def unique_names_demo(): lexpr = Expression.fromstring p1 = lexpr(r"man(Socrates)") p2 = lexpr(r"man(Bill)") c = lexpr(r"exists x.exists y.(x != y)") prover = Prover9Command(c, [p1, p2]) print(prover.prove()) unp = UniqueNamesProver(prover) print("assumptions:") for a in unp.assumptions(): print(" ", a) print("goal:", unp.goal()) print(unp.prove()) p1 = lexpr(r"all x.(walk(x) -> (x = Socrates))") p2 = lexpr(r"Bill = William") p3 = lexpr(r"Bill = Billy") c = lexpr(r"-walk(William)") prover = Prover9Command(c, [p1, p2, p3]) print(prover.prove()) unp = UniqueNamesProver(prover) print("assumptions:") for a in unp.assumptions(): print(" ", a) print("goal:", unp.goal()) print(unp.prove()) def closed_world_demo(): lexpr = Expression.fromstring p1 = lexpr(r"walk(Socrates)") p2 = lexpr(r"(Socrates != Bill)") c = lexpr(r"-walk(Bill)") prover = Prover9Command(c, [p1, p2]) print(prover.prove()) cwp = ClosedWorldProver(prover) print("assumptions:") for a in cwp.assumptions(): print(" ", a) print("goal:", cwp.goal()) print(cwp.prove()) p1 = lexpr(r"see(Socrates, John)") p2 = lexpr(r"see(John, Mary)") p3 = lexpr(r"(Socrates != John)") p4 = lexpr(r"(John != Mary)") c = lexpr(r"-see(Socrates, Mary)") prover = 
Prover9Command(c, [p1, p2, p3, p4]) print(prover.prove()) cwp = ClosedWorldProver(prover) print("assumptions:") for a in cwp.assumptions(): print(" ", a) print("goal:", cwp.goal()) print(cwp.prove()) p1 = lexpr(r"all x.(ostrich(x) -> bird(x))") p2 = lexpr(r"bird(Tweety)") p3 = lexpr(r"-ostrich(Sam)") p4 = lexpr(r"Sam != Tweety") c = lexpr(r"-bird(Sam)") prover = Prover9Command(c, [p1, p2, p3, p4]) print(prover.prove()) cwp = ClosedWorldProver(prover) print("assumptions:") for a in cwp.assumptions(): print(" ", a) print("goal:", cwp.goal()) print(cwp.prove()) def combination_prover_demo(): lexpr = Expression.fromstring p1 = lexpr(r"see(Socrates, John)") p2 = lexpr(r"see(John, Mary)") c = lexpr(r"-see(Socrates, Mary)") prover = Prover9Command(c, [p1, p2]) print(prover.prove()) command = ClosedDomainProver(UniqueNamesProver(ClosedWorldProver(prover))) for a in command.assumptions(): print(a) print(command.prove()) def default_reasoning_demo(): lexpr = Expression.fromstring premises = [] # define taxonomy premises.append(lexpr(r"all x.(elephant(x) -> animal(x))")) premises.append(lexpr(r"all x.(bird(x) -> animal(x))")) premises.append(lexpr(r"all x.(dove(x) -> bird(x))")) premises.append(lexpr(r"all x.(ostrich(x) -> bird(x))")) premises.append(lexpr(r"all x.(flying_ostrich(x) -> ostrich(x))")) # default properties premises.append( lexpr(r"all x.((animal(x) & -Ab1(x)) -> -fly(x))") ) # normal animals don't fly premises.append( lexpr(r"all x.((bird(x) & -Ab2(x)) -> fly(x))") ) # normal birds fly premises.append( lexpr(r"all x.((ostrich(x) & -Ab3(x)) -> -fly(x))") ) # normal ostriches don't fly # specify abnormal entities premises.append(lexpr(r"all x.(bird(x) -> Ab1(x))")) # flight premises.append(lexpr(r"all x.(ostrich(x) -> Ab2(x))")) # non-flying bird premises.append(lexpr(r"all x.(flying_ostrich(x) -> Ab3(x))")) # flying ostrich # define entities premises.append(lexpr(r"elephant(E)")) premises.append(lexpr(r"dove(D)")) premises.append(lexpr(r"ostrich(O)")) # print the assumptions prover = Prover9Command(None, premises) command = UniqueNamesProver(ClosedWorldProver(prover)) for a in command.assumptions(): print(a) print_proof("-fly(E)", premises) print_proof("fly(D)", premises) print_proof("-fly(O)", premises) def print_proof(goal, premises): lexpr = Expression.fromstring prover = Prover9Command(lexpr(goal), premises) command = UniqueNamesProver(ClosedWorldProver(prover)) print(goal, prover.prove(), command.prove()) def demo(): closed_domain_demo() unique_names_demo() closed_world_demo() combination_prover_demo() default_reasoning_demo() if __name__ == "__main__": demo() nltk-3.7/nltk/inference/prover9.py000066400000000000000000000366161420073152400172330ustar00rootroot00000000000000# Natural Language Toolkit: Interface to the Prover9 Theorem Prover # # Copyright (C) 2001-2022 NLTK Project # Author: Dan Garrette # Ewan Klein # # URL: # For license information, see LICENSE.TXT """ A theorem prover that makes use of the external 'Prover9' package. """ import os import subprocess import nltk from nltk.inference.api import BaseProverCommand, Prover from nltk.sem.logic import ( AllExpression, AndExpression, EqualityExpression, ExistsExpression, Expression, IffExpression, ImpExpression, NegatedExpression, OrExpression, ) # # Following is not yet used. Return code for 2 actually realized as 512. # p9_return_codes = { 0: True, 1: "(FATAL)", # A fatal error occurred (user's syntax error). 2: False, # (SOS_EMPTY) Prover9 ran out of things to do # (sos list exhausted). 
3: "(MAX_MEGS)", # The max_megs (memory limit) parameter was exceeded. 4: "(MAX_SECONDS)", # The max_seconds parameter was exceeded. 5: "(MAX_GIVEN)", # The max_given parameter was exceeded. 6: "(MAX_KEPT)", # The max_kept parameter was exceeded. 7: "(ACTION)", # A Prover9 action terminated the search. 101: "(SIGSEGV)", # Prover9 crashed, most probably due to a bug. } class Prover9CommandParent: """ A common base class used by both ``Prover9Command`` and ``MaceCommand``, which is responsible for maintaining a goal and a set of assumptions, and generating prover9-style input files from them. """ def print_assumptions(self, output_format="nltk"): """ Print the list of the current assumptions. """ if output_format.lower() == "nltk": for a in self.assumptions(): print(a) elif output_format.lower() == "prover9": for a in convert_to_prover9(self.assumptions()): print(a) else: raise NameError( "Unrecognized value for 'output_format': %s" % output_format ) class Prover9Command(Prover9CommandParent, BaseProverCommand): """ A ``ProverCommand`` specific to the ``Prover9`` prover. It contains the a print_assumptions() method that is used to print the list of assumptions in multiple formats. """ def __init__(self, goal=None, assumptions=None, timeout=60, prover=None): """ :param goal: Input expression to prove :type goal: sem.Expression :param assumptions: Input expressions to use as assumptions in the proof. :type assumptions: list(sem.Expression) :param timeout: number of seconds before timeout; set to 0 for no timeout. :type timeout: int :param prover: a prover. If not set, one will be created. :type prover: Prover9 """ if not assumptions: assumptions = [] if prover is not None: assert isinstance(prover, Prover9) else: prover = Prover9(timeout) BaseProverCommand.__init__(self, prover, goal, assumptions) def decorate_proof(self, proof_string, simplify=True): """ :see BaseProverCommand.decorate_proof() """ if simplify: return self._prover._call_prooftrans(proof_string, ["striplabels"])[ 0 ].rstrip() else: return proof_string.rstrip() class Prover9Parent: """ A common class extended by both ``Prover9`` and ``Mace ``. It contains the functionality required to convert NLTK-style expressions into Prover9-style expressions. """ _binary_location = None def config_prover9(self, binary_location, verbose=False): if binary_location is None: self._binary_location = None self._prover9_bin = None else: name = "prover9" self._prover9_bin = nltk.internals.find_binary( name, path_to_bin=binary_location, env_vars=["PROVER9"], url="https://www.cs.unm.edu/~mccune/prover9/", binary_names=[name, name + ".exe"], verbose=verbose, ) self._binary_location = self._prover9_bin.rsplit(os.path.sep, 1) def prover9_input(self, goal, assumptions): """ :return: The input string that should be provided to the prover9 binary. This string is formed based on the goal, assumptions, and timeout value of this object. """ s = "" if assumptions: s += "formulas(assumptions).\n" for p9_assumption in convert_to_prover9(assumptions): s += " %s.\n" % p9_assumption s += "end_of_list.\n\n" if goal: s += "formulas(goals).\n" s += " %s.\n" % convert_to_prover9(goal) s += "end_of_list.\n\n" return s def binary_locations(self): """ A list of directories that should be searched for the prover9 executables. This list is used by ``config_prover9`` when searching for the prover9 executables. 
""" return [ "/usr/local/bin/prover9", "/usr/local/bin/prover9/bin", "/usr/local/bin", "/usr/bin", "/usr/local/prover9", "/usr/local/share/prover9", ] def _find_binary(self, name, verbose=False): binary_locations = self.binary_locations() if self._binary_location is not None: binary_locations += [self._binary_location] return nltk.internals.find_binary( name, searchpath=binary_locations, env_vars=["PROVER9"], url="https://www.cs.unm.edu/~mccune/prover9/", binary_names=[name, name + ".exe"], verbose=verbose, ) def _call(self, input_str, binary, args=[], verbose=False): """ Call the binary with the given input. :param input_str: A string whose contents are used as stdin. :param binary: The location of the binary to call :param args: A list of command-line arguments. :return: A tuple (stdout, returncode) :see: ``config_prover9`` """ if verbose: print("Calling:", binary) print("Args:", args) print("Input:\n", input_str, "\n") # Call prover9 via a subprocess cmd = [binary] + args try: input_str = input_str.encode("utf8") except AttributeError: pass p = subprocess.Popen( cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, stdin=subprocess.PIPE ) (stdout, stderr) = p.communicate(input=input_str) if verbose: print("Return code:", p.returncode) if stdout: print("stdout:\n", stdout, "\n") if stderr: print("stderr:\n", stderr, "\n") return (stdout.decode("utf-8"), p.returncode) def convert_to_prover9(input): """ Convert a ``logic.Expression`` to Prover9 format. """ if isinstance(input, list): result = [] for s in input: try: result.append(_convert_to_prover9(s.simplify())) except: print("input %s cannot be converted to Prover9 input syntax" % input) raise return result else: try: return _convert_to_prover9(input.simplify()) except: print("input %s cannot be converted to Prover9 input syntax" % input) raise def _convert_to_prover9(expression): """ Convert ``logic.Expression`` to Prover9 formatted string. """ if isinstance(expression, ExistsExpression): return ( "exists " + str(expression.variable) + " " + _convert_to_prover9(expression.term) ) elif isinstance(expression, AllExpression): return ( "all " + str(expression.variable) + " " + _convert_to_prover9(expression.term) ) elif isinstance(expression, NegatedExpression): return "-(" + _convert_to_prover9(expression.term) + ")" elif isinstance(expression, AndExpression): return ( "(" + _convert_to_prover9(expression.first) + " & " + _convert_to_prover9(expression.second) + ")" ) elif isinstance(expression, OrExpression): return ( "(" + _convert_to_prover9(expression.first) + " | " + _convert_to_prover9(expression.second) + ")" ) elif isinstance(expression, ImpExpression): return ( "(" + _convert_to_prover9(expression.first) + " -> " + _convert_to_prover9(expression.second) + ")" ) elif isinstance(expression, IffExpression): return ( "(" + _convert_to_prover9(expression.first) + " <-> " + _convert_to_prover9(expression.second) + ")" ) elif isinstance(expression, EqualityExpression): return ( "(" + _convert_to_prover9(expression.first) + " = " + _convert_to_prover9(expression.second) + ")" ) else: return str(expression) class Prover9(Prover9Parent, Prover): _prover9_bin = None _prooftrans_bin = None def __init__(self, timeout=60): self._timeout = timeout """The timeout value for prover9. If a proof can not be found in this amount of time, then prover9 will return false. (Use 0 for no timeout.)""" def _prove(self, goal=None, assumptions=None, verbose=False): """ Use Prover9 to prove a theorem. 
:return: A pair whose first element is a boolean indicating if the proof was successful (i.e. returns value of 0) and whose second element is the output of the prover. """ if not assumptions: assumptions = [] stdout, returncode = self._call_prover9( self.prover9_input(goal, assumptions), verbose=verbose ) return (returncode == 0, stdout) def prover9_input(self, goal, assumptions): """ :see: Prover9Parent.prover9_input """ s = "clear(auto_denials).\n" # only one proof required return s + Prover9Parent.prover9_input(self, goal, assumptions) def _call_prover9(self, input_str, args=[], verbose=False): """ Call the ``prover9`` binary with the given input. :param input_str: A string whose contents are used as stdin. :param args: A list of command-line arguments. :return: A tuple (stdout, returncode) :see: ``config_prover9`` """ if self._prover9_bin is None: self._prover9_bin = self._find_binary("prover9", verbose) updated_input_str = "" if self._timeout > 0: updated_input_str += "assign(max_seconds, %d).\n\n" % self._timeout updated_input_str += input_str stdout, returncode = self._call( updated_input_str, self._prover9_bin, args, verbose ) if returncode not in [0, 2]: errormsgprefix = "%%ERROR:" if errormsgprefix in stdout: msgstart = stdout.index(errormsgprefix) errormsg = stdout[msgstart:].strip() else: errormsg = None if returncode in [3, 4, 5, 6]: raise Prover9LimitExceededException(returncode, errormsg) else: raise Prover9FatalException(returncode, errormsg) return stdout, returncode def _call_prooftrans(self, input_str, args=[], verbose=False): """ Call the ``prooftrans`` binary with the given input. :param input_str: A string whose contents are used as stdin. :param args: A list of command-line arguments. :return: A tuple (stdout, returncode) :see: ``config_prover9`` """ if self._prooftrans_bin is None: self._prooftrans_bin = self._find_binary("prooftrans", verbose) return self._call(input_str, self._prooftrans_bin, args, verbose) class Prover9Exception(Exception): def __init__(self, returncode, message): msg = p9_return_codes[returncode] if message: msg += "\n%s" % message Exception.__init__(self, msg) class Prover9FatalException(Prover9Exception): pass class Prover9LimitExceededException(Prover9Exception): pass ###################################################################### # { Tests and Demos ###################################################################### def test_config(): a = Expression.fromstring("(walk(j) & sing(j))") g = Expression.fromstring("walk(j)") p = Prover9Command(g, assumptions=[a]) p._executable_path = None p.prover9_search = [] p.prove() # config_prover9('/usr/local/bin') print(p.prove()) print(p.proof()) def test_convert_to_prover9(expr): """ Test that parsing works OK. """ for t in expr: e = Expression.fromstring(t) print(convert_to_prover9(e)) def test_prove(arguments): """ Try some proofs and exhibit the results. 
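# --------------------------------------------------------------------------
# Typical end-to-end use (mirrors test_prove() below; requires the prover9
# and prooftrans binaries, e.g. located via the PROVER9 environment
# variable).  proof() comes from BaseProverCommand in nltk.inference.api and
# returns the derivation after decorate_proof() has stripped the labels.
g = Expression.fromstring("mortal(Socrates)")
a = [Expression.fromstring("all x.(man(x) -> mortal(x))"),
     Expression.fromstring("man(Socrates)")]
pc = Prover9Command(g, assumptions=a, timeout=60)
print(pc.prove())    # True
print(pc.proof())    # the Prover9 derivation
# --------------------------------------------------------------------------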
""" for (goal, assumptions) in arguments: g = Expression.fromstring(goal) alist = [Expression.fromstring(a) for a in assumptions] p = Prover9Command(g, assumptions=alist).prove() for a in alist: print(" %s" % a) print(f"|- {g}: {p}\n") arguments = [ ("(man(x) <-> (not (not man(x))))", []), ("(not (man(x) & (not man(x))))", []), ("(man(x) | (not man(x)))", []), ("(man(x) & (not man(x)))", []), ("(man(x) -> man(x))", []), ("(not (man(x) & (not man(x))))", []), ("(man(x) | (not man(x)))", []), ("(man(x) -> man(x))", []), ("(man(x) <-> man(x))", []), ("(not (man(x) <-> (not man(x))))", []), ("mortal(Socrates)", ["all x.(man(x) -> mortal(x))", "man(Socrates)"]), ("((all x.(man(x) -> walks(x)) & man(Socrates)) -> some y.walks(y))", []), ("(all x.man(x) -> all x.man(x))", []), ("some x.all y.sees(x,y)", []), ( "some e3.(walk(e3) & subj(e3, mary))", [ "some e1.(see(e1) & subj(e1, john) & some e2.(pred(e1, e2) & walk(e2) & subj(e2, mary)))" ], ), ( "some x e1.(see(e1) & subj(e1, x) & some e2.(pred(e1, e2) & walk(e2) & subj(e2, mary)))", [ "some e1.(see(e1) & subj(e1, john) & some e2.(pred(e1, e2) & walk(e2) & subj(e2, mary)))" ], ), ] expressions = [ r"some x y.sees(x,y)", r"some x.(man(x) & walks(x))", r"\x.(man(x) & walks(x))", r"\x y.sees(x,y)", r"walks(john)", r"\x.big(x, \y.mouse(y))", r"(walks(x) & (runs(x) & (threes(x) & fours(x))))", r"(walks(x) -> runs(x))", r"some x.(PRO(x) & sees(John, x))", r"some x.(man(x) & (not walks(x)))", r"all x.(man(x) -> walks(x))", ] def spacer(num=45): print("-" * num) def demo(): print("Testing configuration") spacer() test_config() print() print("Testing conversion to Prover9 format") spacer() test_convert_to_prover9(expressions) print() print("Testing proofs") spacer() test_prove(arguments) if __name__ == "__main__": demo() nltk-3.7/nltk/inference/resolution.py000077500000000000000000000626221420073152400200270ustar00rootroot00000000000000# Natural Language Toolkit: First-order Resolution-based Theorem Prover # # Author: Dan Garrette # # Copyright (C) 2001-2022 NLTK Project # URL: # For license information, see LICENSE.TXT """ Module for a resolution-based First Order theorem prover. 
""" import operator from collections import defaultdict from functools import reduce from nltk.inference.api import BaseProverCommand, Prover from nltk.sem import skolemize from nltk.sem.logic import ( AndExpression, ApplicationExpression, EqualityExpression, Expression, IndividualVariableExpression, NegatedExpression, OrExpression, Variable, VariableExpression, is_indvar, unique_variable, ) class ProverParseError(Exception): pass class ResolutionProver(Prover): ANSWER_KEY = "ANSWER" _assume_false = True def _prove(self, goal=None, assumptions=None, verbose=False): """ :param goal: Input expression to prove :type goal: sem.Expression :param assumptions: Input expressions to use as assumptions in the proof :type assumptions: list(sem.Expression) """ if not assumptions: assumptions = [] result = None try: clauses = [] if goal: clauses.extend(clausify(-goal)) for a in assumptions: clauses.extend(clausify(a)) result, clauses = self._attempt_proof(clauses) if verbose: print(ResolutionProverCommand._decorate_clauses(clauses)) except RuntimeError as e: if self._assume_false and str(e).startswith( "maximum recursion depth exceeded" ): result = False clauses = [] else: if verbose: print(e) else: raise e return (result, clauses) def _attempt_proof(self, clauses): # map indices to lists of indices, to store attempted unifications tried = defaultdict(list) i = 0 while i < len(clauses): if not clauses[i].is_tautology(): # since we try clauses in order, we should start after the last # index tried if tried[i]: j = tried[i][-1] + 1 else: j = i + 1 # nothing tried yet for 'i', so start with the next while j < len(clauses): # don't: 1) unify a clause with itself, # 2) use tautologies if i != j and j and not clauses[j].is_tautology(): tried[i].append(j) newclauses = clauses[i].unify(clauses[j]) if newclauses: for newclause in newclauses: newclause._parents = (i + 1, j + 1) clauses.append(newclause) if not len(newclause): # if there's an empty clause return (True, clauses) i = -1 # since we added a new clause, restart from the top break j += 1 i += 1 return (False, clauses) class ResolutionProverCommand(BaseProverCommand): def __init__(self, goal=None, assumptions=None, prover=None): """ :param goal: Input expression to prove :type goal: sem.Expression :param assumptions: Input expressions to use as assumptions in the proof. :type assumptions: list(sem.Expression) """ if prover is not None: assert isinstance(prover, ResolutionProver) else: prover = ResolutionProver() BaseProverCommand.__init__(self, prover, goal, assumptions) self._clauses = None def prove(self, verbose=False): """ Perform the actual proof. Store the result to prevent unnecessary re-proving. """ if self._result is None: self._result, clauses = self._prover._prove( self.goal(), self.assumptions(), verbose ) self._clauses = clauses self._proof = ResolutionProverCommand._decorate_clauses(clauses) return self._result def find_answers(self, verbose=False): self.prove(verbose) answers = set() answer_ex = VariableExpression(Variable(ResolutionProver.ANSWER_KEY)) for clause in self._clauses: for term in clause: if ( isinstance(term, ApplicationExpression) and term.function == answer_ex and not isinstance(term.argument, IndividualVariableExpression) ): answers.add(term.argument) return answers @staticmethod def _decorate_clauses(clauses): """ Decorate the proof output. 
""" out = "" max_clause_len = max(len(str(clause)) for clause in clauses) max_seq_len = len(str(len(clauses))) for i in range(len(clauses)): parents = "A" taut = "" if clauses[i].is_tautology(): taut = "Tautology" if clauses[i]._parents: parents = str(clauses[i]._parents) parents = " " * (max_clause_len - len(str(clauses[i])) + 1) + parents seq = " " * (max_seq_len - len(str(i + 1))) + str(i + 1) out += f"[{seq}] {clauses[i]} {parents} {taut}\n" return out class Clause(list): def __init__(self, data): list.__init__(self, data) self._is_tautology = None self._parents = None def unify(self, other, bindings=None, used=None, skipped=None, debug=False): """ Attempt to unify this Clause with the other, returning a list of resulting, unified, Clauses. :param other: ``Clause`` with which to unify :param bindings: ``BindingDict`` containing bindings that should be used during the unification :param used: tuple of two lists of atoms. The first lists the atoms from 'self' that were successfully unified with atoms from 'other'. The second lists the atoms from 'other' that were successfully unified with atoms from 'self'. :param skipped: tuple of two ``Clause`` objects. The first is a list of all the atoms from the 'self' Clause that have not been unified with anything on the path. The second is same thing for the 'other' Clause. :param debug: bool indicating whether debug statements should print :return: list containing all the resulting ``Clause`` objects that could be obtained by unification """ if bindings is None: bindings = BindingDict() if used is None: used = ([], []) if skipped is None: skipped = ([], []) if isinstance(debug, bool): debug = DebugObject(debug) newclauses = _iterate_first( self, other, bindings, used, skipped, _complete_unify_path, debug ) # remove subsumed clauses. make a list of all indices of subsumed # clauses, and then remove them from the list subsumed = [] for i, c1 in enumerate(newclauses): if i not in subsumed: for j, c2 in enumerate(newclauses): if i != j and j not in subsumed and c1.subsumes(c2): subsumed.append(j) result = [] for i in range(len(newclauses)): if i not in subsumed: result.append(newclauses[i]) return result def isSubsetOf(self, other): """ Return True iff every term in 'self' is a term in 'other'. :param other: ``Clause`` :return: bool """ for a in self: if a not in other: return False return True def subsumes(self, other): """ Return True iff 'self' subsumes 'other', this is, if there is a substitution such that every term in 'self' can be unified with a term in 'other'. :param other: ``Clause`` :return: bool """ negatedother = [] for atom in other: if isinstance(atom, NegatedExpression): negatedother.append(atom.term) else: negatedother.append(-atom) negatedotherClause = Clause(negatedother) bindings = BindingDict() used = ([], []) skipped = ([], []) debug = DebugObject(False) return ( len( _iterate_first( self, negatedotherClause, bindings, used, skipped, _subsumes_finalize, debug, ) ) > 0 ) def __getslice__(self, start, end): return Clause(list.__getslice__(self, start, end)) def __sub__(self, other): return Clause([a for a in self if a not in other]) def __add__(self, other): return Clause(list.__add__(self, other)) def is_tautology(self): """ Self is a tautology if it contains ground terms P and -P. The ground term, P, must be an exact match, ie, not using unification. 
""" if self._is_tautology is not None: return self._is_tautology for i, a in enumerate(self): if not isinstance(a, EqualityExpression): j = len(self) - 1 while j > i: b = self[j] if isinstance(a, NegatedExpression): if a.term == b: self._is_tautology = True return True elif isinstance(b, NegatedExpression): if a == b.term: self._is_tautology = True return True j -= 1 self._is_tautology = False return False def free(self): return reduce(operator.or_, ((atom.free() | atom.constants()) for atom in self)) def replace(self, variable, expression): """ Replace every instance of variable with expression across every atom in the clause :param variable: ``Variable`` :param expression: ``Expression`` """ return Clause([atom.replace(variable, expression) for atom in self]) def substitute_bindings(self, bindings): """ Replace every binding :param bindings: A list of tuples mapping Variable Expressions to the Expressions to which they are bound. :return: ``Clause`` """ return Clause([atom.substitute_bindings(bindings) for atom in self]) def __str__(self): return "{" + ", ".join("%s" % item for item in self) + "}" def __repr__(self): return "%s" % self def _iterate_first(first, second, bindings, used, skipped, finalize_method, debug): """ This method facilitates movement through the terms of 'self' """ debug.line(f"unify({first},{second}) {bindings}") if not len(first) or not len(second): # if no more recursions can be performed return finalize_method(first, second, bindings, used, skipped, debug) else: # explore this 'self' atom result = _iterate_second( first, second, bindings, used, skipped, finalize_method, debug + 1 ) # skip this possible 'self' atom newskipped = (skipped[0] + [first[0]], skipped[1]) result += _iterate_first( first[1:], second, bindings, used, newskipped, finalize_method, debug + 1 ) try: newbindings, newused, unused = _unify_terms( first[0], second[0], bindings, used ) # Unification found, so progress with this line of unification # put skipped and unused terms back into play for later unification. newfirst = first[1:] + skipped[0] + unused[0] newsecond = second[1:] + skipped[1] + unused[1] result += _iterate_first( newfirst, newsecond, newbindings, newused, ([], []), finalize_method, debug + 1, ) except BindingException: # the atoms could not be unified, pass return result def _iterate_second(first, second, bindings, used, skipped, finalize_method, debug): """ This method facilitates movement through the terms of 'other' """ debug.line(f"unify({first},{second}) {bindings}") if not len(first) or not len(second): # if no more recursions can be performed return finalize_method(first, second, bindings, used, skipped, debug) else: # skip this possible pairing and move to the next newskipped = (skipped[0], skipped[1] + [second[0]]) result = _iterate_second( first, second[1:], bindings, used, newskipped, finalize_method, debug + 1 ) try: newbindings, newused, unused = _unify_terms( first[0], second[0], bindings, used ) # Unification found, so progress with this line of unification # put skipped and unused terms back into play for later unification. newfirst = first[1:] + skipped[0] + unused[0] newsecond = second[1:] + skipped[1] + unused[1] result += _iterate_second( newfirst, newsecond, newbindings, newused, ([], []), finalize_method, debug + 1, ) except BindingException: # the atoms could not be unified, pass return result def _unify_terms(a, b, bindings=None, used=None): """ This method attempts to unify two terms. 
Two expressions are unifiable if there exists a substitution function S such that S(a) == S(-b). :param a: ``Expression`` :param b: ``Expression`` :param bindings: ``BindingDict`` a starting set of bindings with which the unification must be consistent :return: ``BindingDict`` A dictionary of the bindings required to unify :raise ``BindingException``: If the terms cannot be unified """ assert isinstance(a, Expression) assert isinstance(b, Expression) if bindings is None: bindings = BindingDict() if used is None: used = ([], []) # Use resolution if isinstance(a, NegatedExpression) and isinstance(b, ApplicationExpression): newbindings = most_general_unification(a.term, b, bindings) newused = (used[0] + [a], used[1] + [b]) unused = ([], []) elif isinstance(a, ApplicationExpression) and isinstance(b, NegatedExpression): newbindings = most_general_unification(a, b.term, bindings) newused = (used[0] + [a], used[1] + [b]) unused = ([], []) # Use demodulation elif isinstance(a, EqualityExpression): newbindings = BindingDict([(a.first.variable, a.second)]) newused = (used[0] + [a], used[1]) unused = ([], [b]) elif isinstance(b, EqualityExpression): newbindings = BindingDict([(b.first.variable, b.second)]) newused = (used[0], used[1] + [b]) unused = ([a], []) else: raise BindingException((a, b)) return newbindings, newused, unused def _complete_unify_path(first, second, bindings, used, skipped, debug): if used[0] or used[1]: # if bindings were made along the path newclause = Clause(skipped[0] + skipped[1] + first + second) debug.line(" -> New Clause: %s" % newclause) return [newclause.substitute_bindings(bindings)] else: # no bindings made means no unification occurred. so no result debug.line(" -> End") return [] def _subsumes_finalize(first, second, bindings, used, skipped, debug): if not len(skipped[0]) and not len(first): # If there are no skipped terms and no terms left in 'first', then # all of the terms in the original 'self' were unified with terms # in 'other'. Therefore, there exists a binding (this one) such that # every term in self can be unified with a term in other, which # is the definition of subsumption. return [True] else: return [] def clausify(expression): """ Skolemize, clausify, and standardize the variables apart. 
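# --------------------------------------------------------------------------
# clausify is pure (see also test_clausify() at the bottom of the module):
# each formula is skolemized and split into CNF clauses, with the free
# individual variables standardized apart, so the variable names in the
# output are freshly generated.
lexpr = Expression.fromstring
print(clausify(lexpr("(P(x) & Q(x)) | R(x)")))   # two clauses: {P(..), R(..)} and {Q(..), R(..)}
print(clausify(lexpr("P(x) <-> Q(x)")))          # two clauses: {-P(..), Q(..)} and {P(..), -Q(..)}
# --------------------------------------------------------------------------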
""" clause_list = [] for clause in _clausify(skolemize(expression)): for free in clause.free(): if is_indvar(free.name): newvar = VariableExpression(unique_variable()) clause = clause.replace(free, newvar) clause_list.append(clause) return clause_list def _clausify(expression): """ :param expression: a skolemized expression in CNF """ if isinstance(expression, AndExpression): return _clausify(expression.first) + _clausify(expression.second) elif isinstance(expression, OrExpression): first = _clausify(expression.first) second = _clausify(expression.second) assert len(first) == 1 assert len(second) == 1 return [first[0] + second[0]] elif isinstance(expression, EqualityExpression): return [Clause([expression])] elif isinstance(expression, ApplicationExpression): return [Clause([expression])] elif isinstance(expression, NegatedExpression): if isinstance(expression.term, ApplicationExpression): return [Clause([expression])] elif isinstance(expression.term, EqualityExpression): return [Clause([expression])] raise ProverParseError() class BindingDict: def __init__(self, binding_list=None): """ :param binding_list: list of (``AbstractVariableExpression``, ``AtomicExpression``) to initialize the dictionary """ self.d = {} if binding_list: for (v, b) in binding_list: self[v] = b def __setitem__(self, variable, binding): """ A binding is consistent with the dict if its variable is not already bound, OR if its variable is already bound to its argument. :param variable: ``Variable`` The variable to bind :param binding: ``Expression`` The atomic to which 'variable' should be bound :raise BindingException: If the variable cannot be bound in this dictionary """ assert isinstance(variable, Variable) assert isinstance(binding, Expression) try: existing = self[variable] except KeyError: existing = None if not existing or binding == existing: self.d[variable] = binding elif isinstance(binding, IndividualVariableExpression): # Since variable is already bound, try to bind binding to variable try: existing = self[binding.variable] except KeyError: existing = None binding2 = VariableExpression(variable) if not existing or binding2 == existing: self.d[binding.variable] = binding2 else: raise BindingException( "Variable %s already bound to another " "value" % (variable) ) else: raise BindingException( "Variable %s already bound to another " "value" % (variable) ) def __getitem__(self, variable): """ Return the expression to which 'variable' is bound """ assert isinstance(variable, Variable) intermediate = self.d[variable] while intermediate: try: intermediate = self.d[intermediate] except KeyError: return intermediate def __contains__(self, item): return item in self.d def __add__(self, other): """ :param other: ``BindingDict`` The dict with which to combine self :return: ``BindingDict`` A new dict containing all the elements of both parameters :raise BindingException: If the parameter dictionaries are not consistent with each other """ try: combined = BindingDict() for v in self.d: combined[v] = self.d[v] for v in other.d: combined[v] = other.d[v] return combined except BindingException as e: raise BindingException( "Attempting to add two contradicting " "BindingDicts: '%s' and '%s'" % (self, other) ) from e def __len__(self): return len(self.d) def __str__(self): data_str = ", ".join(f"{v}: {self.d[v]}" for v in sorted(self.d.keys())) return "{" + data_str + "}" def __repr__(self): return "%s" % self def most_general_unification(a, b, bindings=None): """ Find the most general unification of the two given 
expressions :param a: ``Expression`` :param b: ``Expression`` :param bindings: ``BindingDict`` a starting set of bindings with which the unification must be consistent :return: a list of bindings :raise BindingException: if the Expressions cannot be unified """ if bindings is None: bindings = BindingDict() if a == b: return bindings elif isinstance(a, IndividualVariableExpression): return _mgu_var(a, b, bindings) elif isinstance(b, IndividualVariableExpression): return _mgu_var(b, a, bindings) elif isinstance(a, ApplicationExpression) and isinstance(b, ApplicationExpression): return most_general_unification( a.function, b.function, bindings ) + most_general_unification(a.argument, b.argument, bindings) raise BindingException((a, b)) def _mgu_var(var, expression, bindings): if var.variable in expression.free() | expression.constants(): raise BindingException((var, expression)) else: return BindingDict([(var.variable, expression)]) + bindings class BindingException(Exception): def __init__(self, arg): if isinstance(arg, tuple): Exception.__init__(self, "'%s' cannot be bound to '%s'" % arg) else: Exception.__init__(self, arg) class UnificationException(Exception): def __init__(self, a, b): Exception.__init__(self, f"'{a}' cannot unify with '{b}'") class DebugObject: def __init__(self, enabled=True, indent=0): self.enabled = enabled self.indent = indent def __add__(self, i): return DebugObject(self.enabled, self.indent + i) def line(self, line): if self.enabled: print(" " * self.indent + line) def testResolutionProver(): resolution_test(r"man(x)") resolution_test(r"(man(x) -> man(x))") resolution_test(r"(man(x) -> --man(x))") resolution_test(r"-(man(x) and -man(x))") resolution_test(r"(man(x) or -man(x))") resolution_test(r"(man(x) -> man(x))") resolution_test(r"-(man(x) and -man(x))") resolution_test(r"(man(x) or -man(x))") resolution_test(r"(man(x) -> man(x))") resolution_test(r"(man(x) iff man(x))") resolution_test(r"-(man(x) iff -man(x))") resolution_test("all x.man(x)") resolution_test("-all x.some y.F(x,y) & some x.all y.(-F(x,y))") resolution_test("some x.all y.sees(x,y)") p1 = Expression.fromstring(r"all x.(man(x) -> mortal(x))") p2 = Expression.fromstring(r"man(Socrates)") c = Expression.fromstring(r"mortal(Socrates)") print(f"{p1}, {p2} |- {c}: {ResolutionProver().prove(c, [p1, p2])}") p1 = Expression.fromstring(r"all x.(man(x) -> walks(x))") p2 = Expression.fromstring(r"man(John)") c = Expression.fromstring(r"some y.walks(y)") print(f"{p1}, {p2} |- {c}: {ResolutionProver().prove(c, [p1, p2])}") p = Expression.fromstring(r"some e1.some e2.(believe(e1,john,e2) & walk(e2,mary))") c = Expression.fromstring(r"some e0.walk(e0,mary)") print(f"{p} |- {c}: {ResolutionProver().prove(c, [p])}") def resolution_test(e): f = Expression.fromstring(e) t = ResolutionProver().prove(f) print(f"|- {f}: {t}") def test_clausify(): lexpr = Expression.fromstring print(clausify(lexpr("P(x) | Q(x)"))) print(clausify(lexpr("(P(x) & Q(x)) | R(x)"))) print(clausify(lexpr("P(x) | (Q(x) & R(x))"))) print(clausify(lexpr("(P(x) & Q(x)) | (R(x) & S(x))"))) print(clausify(lexpr("P(x) | Q(x) | R(x)"))) print(clausify(lexpr("P(x) | (Q(x) & R(x)) | S(x)"))) print(clausify(lexpr("exists x.P(x) | Q(x)"))) print(clausify(lexpr("-(-P(x) & Q(x))"))) print(clausify(lexpr("P(x) <-> Q(x)"))) print(clausify(lexpr("-(P(x) <-> Q(x))"))) print(clausify(lexpr("-(all x.P(x))"))) print(clausify(lexpr("-(some x.P(x))"))) print(clausify(lexpr("some x.P(x)"))) print(clausify(lexpr("some x.all y.P(x,y)"))) print(clausify(lexpr("all y.some 
x.P(x,y)"))) print(clausify(lexpr("all z.all y.some x.P(x,y,z)"))) print(clausify(lexpr("all x.(all y.P(x,y) -> -all y.(Q(x,y) -> R(x,y)))"))) def demo(): test_clausify() print() testResolutionProver() print() p = Expression.fromstring("man(x)") print(ResolutionProverCommand(p, [p]).prove()) if __name__ == "__main__": demo() nltk-3.7/nltk/inference/tableau.py000066400000000000000000000620101420073152400172250ustar00rootroot00000000000000# Natural Language Toolkit: First-Order Tableau Theorem Prover # # Copyright (C) 2001-2022 NLTK Project # Author: Dan Garrette # # URL: # For license information, see LICENSE.TXT """ Module for a tableau-based First Order theorem prover. """ from nltk.inference.api import BaseProverCommand, Prover from nltk.internals import Counter from nltk.sem.logic import ( AbstractVariableExpression, AllExpression, AndExpression, ApplicationExpression, EqualityExpression, ExistsExpression, Expression, FunctionVariableExpression, IffExpression, ImpExpression, LambdaExpression, NegatedExpression, OrExpression, Variable, VariableExpression, unique_variable, ) _counter = Counter() class ProverParseError(Exception): pass class TableauProver(Prover): _assume_false = False def _prove(self, goal=None, assumptions=None, verbose=False): if not assumptions: assumptions = [] result = None try: agenda = Agenda() if goal: agenda.put(-goal) agenda.put_all(assumptions) debugger = Debug(verbose) result = self._attempt_proof(agenda, set(), set(), debugger) except RuntimeError as e: if self._assume_false and str(e).startswith( "maximum recursion depth exceeded" ): result = False else: if verbose: print(e) else: raise e return (result, "\n".join(debugger.lines)) def _attempt_proof(self, agenda, accessible_vars, atoms, debug): (current, context), category = agenda.pop_first() # if there's nothing left in the agenda, and we haven't closed the path if not current: debug.line("AGENDA EMPTY") return False proof_method = { Categories.ATOM: self._attempt_proof_atom, Categories.PROP: self._attempt_proof_prop, Categories.N_ATOM: self._attempt_proof_n_atom, Categories.N_PROP: self._attempt_proof_n_prop, Categories.APP: self._attempt_proof_app, Categories.N_APP: self._attempt_proof_n_app, Categories.N_EQ: self._attempt_proof_n_eq, Categories.D_NEG: self._attempt_proof_d_neg, Categories.N_ALL: self._attempt_proof_n_all, Categories.N_EXISTS: self._attempt_proof_n_some, Categories.AND: self._attempt_proof_and, Categories.N_OR: self._attempt_proof_n_or, Categories.N_IMP: self._attempt_proof_n_imp, Categories.OR: self._attempt_proof_or, Categories.IMP: self._attempt_proof_imp, Categories.N_AND: self._attempt_proof_n_and, Categories.IFF: self._attempt_proof_iff, Categories.N_IFF: self._attempt_proof_n_iff, Categories.EQ: self._attempt_proof_eq, Categories.EXISTS: self._attempt_proof_some, Categories.ALL: self._attempt_proof_all, }[category] debug.line((current, context)) return proof_method(current, context, agenda, accessible_vars, atoms, debug) def _attempt_proof_atom( self, current, context, agenda, accessible_vars, atoms, debug ): # Check if the branch is closed. 
Return 'True' if it is if (current, True) in atoms: debug.line("CLOSED", 1) return True if context: if isinstance(context.term, NegatedExpression): current = current.negate() agenda.put(context(current).simplify()) return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1) else: # mark all AllExpressions as 'not exhausted' into the agenda since we are (potentially) adding new accessible vars agenda.mark_alls_fresh() return self._attempt_proof( agenda, accessible_vars | set(current.args), atoms | {(current, False)}, debug + 1, ) def _attempt_proof_n_atom( self, current, context, agenda, accessible_vars, atoms, debug ): # Check if the branch is closed. Return 'True' if it is if (current.term, False) in atoms: debug.line("CLOSED", 1) return True if context: if isinstance(context.term, NegatedExpression): current = current.negate() agenda.put(context(current).simplify()) return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1) else: # mark all AllExpressions as 'not exhausted' into the agenda since we are (potentially) adding new accessible vars agenda.mark_alls_fresh() return self._attempt_proof( agenda, accessible_vars | set(current.term.args), atoms | {(current.term, True)}, debug + 1, ) def _attempt_proof_prop( self, current, context, agenda, accessible_vars, atoms, debug ): # Check if the branch is closed. Return 'True' if it is if (current, True) in atoms: debug.line("CLOSED", 1) return True # mark all AllExpressions as 'not exhausted' into the agenda since we are (potentially) adding new accessible vars agenda.mark_alls_fresh() return self._attempt_proof( agenda, accessible_vars, atoms | {(current, False)}, debug + 1 ) def _attempt_proof_n_prop( self, current, context, agenda, accessible_vars, atoms, debug ): # Check if the branch is closed. 
Return 'True' if it is if (current.term, False) in atoms: debug.line("CLOSED", 1) return True # mark all AllExpressions as 'not exhausted' into the agenda since we are (potentially) adding new accessible vars agenda.mark_alls_fresh() return self._attempt_proof( agenda, accessible_vars, atoms | {(current.term, True)}, debug + 1 ) def _attempt_proof_app( self, current, context, agenda, accessible_vars, atoms, debug ): f, args = current.uncurry() for i, arg in enumerate(args): if not TableauProver.is_atom(arg): ctx = f nv = Variable("X%s" % _counter.get()) for j, a in enumerate(args): ctx = ctx(VariableExpression(nv)) if i == j else ctx(a) if context: ctx = context(ctx).simplify() ctx = LambdaExpression(nv, ctx) agenda.put(arg, ctx) return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1) raise Exception("If this method is called, there must be a non-atomic argument") def _attempt_proof_n_app( self, current, context, agenda, accessible_vars, atoms, debug ): f, args = current.term.uncurry() for i, arg in enumerate(args): if not TableauProver.is_atom(arg): ctx = f nv = Variable("X%s" % _counter.get()) for j, a in enumerate(args): ctx = ctx(VariableExpression(nv)) if i == j else ctx(a) if context: # combine new context with existing ctx = context(ctx).simplify() ctx = LambdaExpression(nv, -ctx) agenda.put(-arg, ctx) return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1) raise Exception("If this method is called, there must be a non-atomic argument") def _attempt_proof_n_eq( self, current, context, agenda, accessible_vars, atoms, debug ): ########################################################################### # Since 'current' is of type '~(a=b)', the path is closed if 'a' == 'b' ########################################################################### if current.term.first == current.term.second: debug.line("CLOSED", 1) return True agenda[Categories.N_EQ].add((current, context)) current._exhausted = True return self._attempt_proof( agenda, accessible_vars | {current.term.first, current.term.second}, atoms, debug + 1, ) def _attempt_proof_d_neg( self, current, context, agenda, accessible_vars, atoms, debug ): agenda.put(current.term.term, context) return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1) def _attempt_proof_n_all( self, current, context, agenda, accessible_vars, atoms, debug ): agenda[Categories.EXISTS].add( (ExistsExpression(current.term.variable, -current.term.term), context) ) return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1) def _attempt_proof_n_some( self, current, context, agenda, accessible_vars, atoms, debug ): agenda[Categories.ALL].add( (AllExpression(current.term.variable, -current.term.term), context) ) return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1) def _attempt_proof_and( self, current, context, agenda, accessible_vars, atoms, debug ): agenda.put(current.first, context) agenda.put(current.second, context) return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1) def _attempt_proof_n_or( self, current, context, agenda, accessible_vars, atoms, debug ): agenda.put(-current.term.first, context) agenda.put(-current.term.second, context) return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1) def _attempt_proof_n_imp( self, current, context, agenda, accessible_vars, atoms, debug ): agenda.put(current.term.first, context) agenda.put(-current.term.second, context) return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1) def _attempt_proof_or( self, 
current, context, agenda, accessible_vars, atoms, debug ): new_agenda = agenda.clone() agenda.put(current.first, context) new_agenda.put(current.second, context) return self._attempt_proof( agenda, accessible_vars, atoms, debug + 1 ) and self._attempt_proof(new_agenda, accessible_vars, atoms, debug + 1) def _attempt_proof_imp( self, current, context, agenda, accessible_vars, atoms, debug ): new_agenda = agenda.clone() agenda.put(-current.first, context) new_agenda.put(current.second, context) return self._attempt_proof( agenda, accessible_vars, atoms, debug + 1 ) and self._attempt_proof(new_agenda, accessible_vars, atoms, debug + 1) def _attempt_proof_n_and( self, current, context, agenda, accessible_vars, atoms, debug ): new_agenda = agenda.clone() agenda.put(-current.term.first, context) new_agenda.put(-current.term.second, context) return self._attempt_proof( agenda, accessible_vars, atoms, debug + 1 ) and self._attempt_proof(new_agenda, accessible_vars, atoms, debug + 1) def _attempt_proof_iff( self, current, context, agenda, accessible_vars, atoms, debug ): new_agenda = agenda.clone() agenda.put(current.first, context) agenda.put(current.second, context) new_agenda.put(-current.first, context) new_agenda.put(-current.second, context) return self._attempt_proof( agenda, accessible_vars, atoms, debug + 1 ) and self._attempt_proof(new_agenda, accessible_vars, atoms, debug + 1) def _attempt_proof_n_iff( self, current, context, agenda, accessible_vars, atoms, debug ): new_agenda = agenda.clone() agenda.put(current.term.first, context) agenda.put(-current.term.second, context) new_agenda.put(-current.term.first, context) new_agenda.put(current.term.second, context) return self._attempt_proof( agenda, accessible_vars, atoms, debug + 1 ) and self._attempt_proof(new_agenda, accessible_vars, atoms, debug + 1) def _attempt_proof_eq( self, current, context, agenda, accessible_vars, atoms, debug ): ######################################################################### # Since 'current' is of the form '(a = b)', replace ALL free instances # of 'a' with 'b' ######################################################################### agenda.put_atoms(atoms) agenda.replace_all(current.first, current.second) accessible_vars.discard(current.first) agenda.mark_neqs_fresh() return self._attempt_proof(agenda, accessible_vars, set(), debug + 1) def _attempt_proof_some( self, current, context, agenda, accessible_vars, atoms, debug ): new_unique_variable = VariableExpression(unique_variable()) agenda.put(current.term.replace(current.variable, new_unique_variable), context) agenda.mark_alls_fresh() return self._attempt_proof( agenda, accessible_vars | {new_unique_variable}, atoms, debug + 1 ) def _attempt_proof_all( self, current, context, agenda, accessible_vars, atoms, debug ): try: current._used_vars except AttributeError: current._used_vars = set() # if there are accessible_vars on the path if accessible_vars: # get the set of bound variables that have not be used by this AllExpression bv_available = accessible_vars - current._used_vars if bv_available: variable_to_use = list(bv_available)[0] debug.line("--> Using '%s'" % variable_to_use, 2) current._used_vars |= {variable_to_use} agenda.put( current.term.replace(current.variable, variable_to_use), context ) agenda[Categories.ALL].add((current, context)) return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1) else: # no more available variables to substitute debug.line("--> Variables Exhausted", 2) current._exhausted = True 
agenda[Categories.ALL].add((current, context)) return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1) else: new_unique_variable = VariableExpression(unique_variable()) debug.line("--> Using '%s'" % new_unique_variable, 2) current._used_vars |= {new_unique_variable} agenda.put( current.term.replace(current.variable, new_unique_variable), context ) agenda[Categories.ALL].add((current, context)) agenda.mark_alls_fresh() return self._attempt_proof( agenda, accessible_vars | {new_unique_variable}, atoms, debug + 1 ) @staticmethod def is_atom(e): if isinstance(e, NegatedExpression): e = e.term if isinstance(e, ApplicationExpression): for arg in e.args: if not TableauProver.is_atom(arg): return False return True elif isinstance(e, AbstractVariableExpression) or isinstance( e, LambdaExpression ): return True else: return False class TableauProverCommand(BaseProverCommand): def __init__(self, goal=None, assumptions=None, prover=None): """ :param goal: Input expression to prove :type goal: sem.Expression :param assumptions: Input expressions to use as assumptions in the proof. :type assumptions: list(sem.Expression) """ if prover is not None: assert isinstance(prover, TableauProver) else: prover = TableauProver() BaseProverCommand.__init__(self, prover, goal, assumptions) class Agenda: def __init__(self): self.sets = tuple(set() for i in range(21)) def clone(self): new_agenda = Agenda() set_list = [s.copy() for s in self.sets] new_allExs = set() for allEx, _ in set_list[Categories.ALL]: new_allEx = AllExpression(allEx.variable, allEx.term) try: new_allEx._used_vars = {used for used in allEx._used_vars} except AttributeError: new_allEx._used_vars = set() new_allExs.add((new_allEx, None)) set_list[Categories.ALL] = new_allExs set_list[Categories.N_EQ] = { (NegatedExpression(n_eq.term), ctx) for (n_eq, ctx) in set_list[Categories.N_EQ] } new_agenda.sets = tuple(set_list) return new_agenda def __getitem__(self, index): return self.sets[index] def put(self, expression, context=None): if isinstance(expression, AllExpression): ex_to_add = AllExpression(expression.variable, expression.term) try: ex_to_add._used_vars = {used for used in expression._used_vars} except AttributeError: ex_to_add._used_vars = set() else: ex_to_add = expression self.sets[self._categorize_expression(ex_to_add)].add((ex_to_add, context)) def put_all(self, expressions): for expression in expressions: self.put(expression) def put_atoms(self, atoms): for atom, neg in atoms: if neg: self[Categories.N_ATOM].add((-atom, None)) else: self[Categories.ATOM].add((atom, None)) def pop_first(self): """Pop the first expression that appears in the agenda""" for i, s in enumerate(self.sets): if s: if i in [Categories.N_EQ, Categories.ALL]: for ex in s: try: if not ex[0]._exhausted: s.remove(ex) return (ex, i) except AttributeError: s.remove(ex) return (ex, i) else: return (s.pop(), i) return ((None, None), None) def replace_all(self, old, new): for s in self.sets: for ex, ctx in s: ex.replace(old.variable, new) if ctx is not None: ctx.replace(old.variable, new) def mark_alls_fresh(self): for u, _ in self.sets[Categories.ALL]: u._exhausted = False def mark_neqs_fresh(self): for neq, _ in self.sets[Categories.N_EQ]: neq._exhausted = False def _categorize_expression(self, current): if isinstance(current, NegatedExpression): return self._categorize_NegatedExpression(current) elif isinstance(current, FunctionVariableExpression): return Categories.PROP elif TableauProver.is_atom(current): return Categories.ATOM elif isinstance(current, 
AllExpression): return Categories.ALL elif isinstance(current, AndExpression): return Categories.AND elif isinstance(current, OrExpression): return Categories.OR elif isinstance(current, ImpExpression): return Categories.IMP elif isinstance(current, IffExpression): return Categories.IFF elif isinstance(current, EqualityExpression): return Categories.EQ elif isinstance(current, ExistsExpression): return Categories.EXISTS elif isinstance(current, ApplicationExpression): return Categories.APP else: raise ProverParseError("cannot categorize %s" % current.__class__.__name__) def _categorize_NegatedExpression(self, current): negated = current.term if isinstance(negated, NegatedExpression): return Categories.D_NEG elif isinstance(negated, FunctionVariableExpression): return Categories.N_PROP elif TableauProver.is_atom(negated): return Categories.N_ATOM elif isinstance(negated, AllExpression): return Categories.N_ALL elif isinstance(negated, AndExpression): return Categories.N_AND elif isinstance(negated, OrExpression): return Categories.N_OR elif isinstance(negated, ImpExpression): return Categories.N_IMP elif isinstance(negated, IffExpression): return Categories.N_IFF elif isinstance(negated, EqualityExpression): return Categories.N_EQ elif isinstance(negated, ExistsExpression): return Categories.N_EXISTS elif isinstance(negated, ApplicationExpression): return Categories.N_APP else: raise ProverParseError("cannot categorize %s" % negated.__class__.__name__) class Debug: def __init__(self, verbose, indent=0, lines=None): self.verbose = verbose self.indent = indent if not lines: lines = [] self.lines = lines def __add__(self, increment): return Debug(self.verbose, self.indent + 1, self.lines) def line(self, data, indent=0): if isinstance(data, tuple): ex, ctx = data if ctx: data = f"{ex}, {ctx}" else: data = "%s" % ex if isinstance(ex, AllExpression): try: used_vars = "[%s]" % ( ",".join("%s" % ve.variable.name for ve in ex._used_vars) ) data += ": %s" % used_vars except AttributeError: data += ": []" newline = "{}{}".format(" " * (self.indent + indent), data) self.lines.append(newline) if self.verbose: print(newline) class Categories: ATOM = 0 PROP = 1 N_ATOM = 2 N_PROP = 3 APP = 4 N_APP = 5 N_EQ = 6 D_NEG = 7 N_ALL = 8 N_EXISTS = 9 AND = 10 N_OR = 11 N_IMP = 12 OR = 13 IMP = 14 N_AND = 15 IFF = 16 N_IFF = 17 EQ = 18 EXISTS = 19 ALL = 20 def testTableauProver(): tableau_test("P | -P") tableau_test("P & -P") tableau_test("Q", ["P", "(P -> Q)"]) tableau_test("man(x)") tableau_test("(man(x) -> man(x))") tableau_test("(man(x) -> --man(x))") tableau_test("-(man(x) and -man(x))") tableau_test("(man(x) or -man(x))") tableau_test("(man(x) -> man(x))") tableau_test("-(man(x) and -man(x))") tableau_test("(man(x) or -man(x))") tableau_test("(man(x) -> man(x))") tableau_test("(man(x) iff man(x))") tableau_test("-(man(x) iff -man(x))") tableau_test("all x.man(x)") tableau_test("all x.all y.((x = y) -> (y = x))") tableau_test("all x.all y.all z.(((x = y) & (y = z)) -> (x = z))") # tableau_test('-all x.some y.F(x,y) & some x.all y.(-F(x,y))') # tableau_test('some x.all y.sees(x,y)') p1 = "all x.(man(x) -> mortal(x))" p2 = "man(Socrates)" c = "mortal(Socrates)" tableau_test(c, [p1, p2]) p1 = "all x.(man(x) -> walks(x))" p2 = "man(John)" c = "some y.walks(y)" tableau_test(c, [p1, p2]) p = "((x = y) & walks(y))" c = "walks(x)" tableau_test(c, [p]) p = "((x = y) & ((y = z) & (z = w)))" c = "(x = w)" tableau_test(c, [p]) p = "some e1.some e2.(believe(e1,john,e2) & walk(e2,mary))" c = "some e0.walk(e0,mary)" 
tableau_test(c, [p]) c = "(exists x.exists z3.((x = Mary) & ((z3 = John) & sees(z3,x))) <-> exists x.exists z4.((x = John) & ((z4 = Mary) & sees(x,z4))))" tableau_test(c) # p = 'some e1.some e2.((believe e1 john e2) and (walk e2 mary))' # c = 'some x.some e3.some e4.((believe e3 x e4) and (walk e4 mary))' # tableau_test(c, [p]) def testHigherOrderTableauProver(): tableau_test("believe(j, -lie(b))", ["believe(j, -lie(b) & -cheat(b))"]) tableau_test("believe(j, lie(b) & cheat(b))", ["believe(j, lie(b))"]) tableau_test( "believe(j, lie(b))", ["lie(b)"] ) # how do we capture that John believes all things that are true tableau_test( "believe(j, know(b, cheat(b)))", ["believe(j, know(b, lie(b)) & know(b, steals(b) & cheat(b)))"], ) tableau_test("P(Q(y), R(y) & R(z))", ["P(Q(x) & Q(y), R(y) & R(z))"]) tableau_test("believe(j, cheat(b) & lie(b))", ["believe(j, lie(b) & cheat(b))"]) tableau_test("believe(j, -cheat(b) & -lie(b))", ["believe(j, -lie(b) & -cheat(b))"]) def tableau_test(c, ps=None, verbose=False): pc = Expression.fromstring(c) pps = [Expression.fromstring(p) for p in ps] if ps else [] if not ps: ps = [] print( "%s |- %s: %s" % (", ".join(ps), pc, TableauProver().prove(pc, pps, verbose=verbose)) ) def demo(): testTableauProver() testHigherOrderTableauProver() if __name__ == "__main__": demo() nltk-3.7/nltk/internals.py000066400000000000000000001134101420073152400156520ustar00rootroot00000000000000# Natural Language Toolkit: Internal utility functions # # Copyright (C) 2001-2022 NLTK Project # Author: Steven Bird # Edward Loper # Nitin Madnani # URL: # For license information, see LICENSE.TXT import fnmatch import locale import os import re import stat import subprocess import sys import textwrap import types import warnings from xml.etree import ElementTree ########################################################################## # Java Via Command-Line ########################################################################## _java_bin = None _java_options = [] # [xx] add classpath option to config_java? def config_java(bin=None, options=None, verbose=False): """ Configure nltk's java interface, by letting nltk know where it can find the Java binary, and what extra options (if any) should be passed to Java when it is run. :param bin: The full path to the Java binary. If not specified, then nltk will search the system for a Java binary; and if one is not found, it will raise a ``LookupError`` exception. :type bin: str :param options: A list of options that should be passed to the Java binary when it is called. A common value is ``'-Xmx512m'``, which tells Java binary to increase the maximum heap size to 512 megabytes. If no options are specified, then do not modify the options list. :type options: list(str) """ global _java_bin, _java_options _java_bin = find_binary( "java", bin, env_vars=["JAVAHOME", "JAVA_HOME"], verbose=verbose, binary_names=["java.exe"], ) if options is not None: if isinstance(options, str): options = options.split() _java_options = list(options) def java(cmd, classpath=None, stdin=None, stdout=None, stderr=None, blocking=True): """ Execute the given java command, by opening a subprocess that calls Java. If java has not yet been configured, it will be configured by calling ``config_java()`` with no arguments. :param cmd: The java command that should be called, formatted as a list of strings. Typically, the first string will be the name of the java class; and the remaining strings will be arguments for that java class. 
:type cmd: list(str) :param classpath: A ``':'`` separated list of directories, JAR archives, and ZIP archives to search for class files. :type classpath: str :param stdin, stdout, stderr: Specify the executed programs' standard input, standard output and standard error file handles, respectively. Valid values are ``subprocess.PIPE``, an existing file descriptor (a positive integer), an existing file object, 'pipe', 'stdout', 'devnull' and None. ``subprocess.PIPE`` indicates that a new pipe to the child should be created. With None, no redirection will occur; the child's file handles will be inherited from the parent. Additionally, stderr can be ``subprocess.STDOUT``, which indicates that the stderr data from the applications should be captured into the same file handle as for stdout. :param blocking: If ``false``, then return immediately after spawning the subprocess. In this case, the return value is the ``Popen`` object, and not a ``(stdout, stderr)`` tuple. :return: If ``blocking=True``, then return a tuple ``(stdout, stderr)``, containing the stdout and stderr outputs generated by the java command if the ``stdout`` and ``stderr`` parameters were set to ``subprocess.PIPE``; or None otherwise. If ``blocking=False``, then return a ``subprocess.Popen`` object. :raise OSError: If the java command returns a nonzero return code. """ subprocess_output_dict = { "pipe": subprocess.PIPE, "stdout": subprocess.STDOUT, "devnull": subprocess.DEVNULL, } stdin = subprocess_output_dict.get(stdin, stdin) stdout = subprocess_output_dict.get(stdout, stdout) stderr = subprocess_output_dict.get(stderr, stderr) if isinstance(cmd, str): raise TypeError("cmd should be a list of strings") # Make sure we know where a java binary is. if _java_bin is None: config_java() # Set up the classpath. if isinstance(classpath, str): classpaths = [classpath] else: classpaths = list(classpath) classpath = os.path.pathsep.join(classpaths) # Construct the full command string. cmd = list(cmd) cmd = ["-cp", classpath] + cmd cmd = [_java_bin] + _java_options + cmd # Call java via a subprocess p = subprocess.Popen(cmd, stdin=stdin, stdout=stdout, stderr=stderr) if not blocking: return p (stdout, stderr) = p.communicate() # Check the return code. if p.returncode != 0: print(_decode_stdoutdata(stderr)) raise OSError("Java command failed : " + str(cmd)) return (stdout, stderr) if 0: # config_java(options='-Xmx512m') # Write: # java('weka.classifiers.bayes.NaiveBayes', # ['-d', '/tmp/names.model', '-t', '/tmp/train.arff'], # classpath='/Users/edloper/Desktop/weka/weka.jar') # Read: (a, b) = java( [ "weka.classifiers.bayes.NaiveBayes", "-l", "/tmp/names.model", "-T", "/tmp/test.arff", "-p", "0", ], # , '-distribution'], classpath="/Users/edloper/Desktop/weka/weka.jar", ) ###################################################################### # Parsing ###################################################################### class ReadError(ValueError): """ Exception raised by read_* functions when they fail. :param position: The index in the input string where an error occurred. :param expected: What was expected when an error occurred. 
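    A minimal example of the resulting error message:

    >>> from nltk.internals import ReadError
    >>> str(ReadError('integer', 12))
    'Expected integer at 12'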
""" def __init__(self, expected, position): ValueError.__init__(self, expected, position) self.expected = expected self.position = position def __str__(self): return f"Expected {self.expected} at {self.position}" _STRING_START_RE = re.compile(r"[uU]?[rR]?(\"\"\"|\'\'\'|\"|\')") def read_str(s, start_position): """ If a Python string literal begins at the specified position in the given string, then return a tuple ``(val, end_position)`` containing the value of the string literal and the position where it ends. Otherwise, raise a ``ReadError``. :param s: A string that will be checked to see if within which a Python string literal exists. :type s: str :param start_position: The specified beginning position of the string ``s`` to begin regex matching. :type start_position: int :return: A tuple containing the matched string literal evaluated as a string and the end position of the string literal. :rtype: tuple(str, int) :raise ReadError: If the ``_STRING_START_RE`` regex doesn't return a match in ``s`` at ``start_position``, i.e., open quote. If the ``_STRING_END_RE`` regex doesn't return a match in ``s`` at the end of the first match, i.e., close quote. :raise ValueError: If an invalid string (i.e., contains an invalid escape sequence) is passed into the ``eval``. :Example: >>> from nltk.internals import read_str >>> read_str('"Hello", World!', 0) ('Hello', 7) """ # Read the open quote, and any modifiers. m = _STRING_START_RE.match(s, start_position) if not m: raise ReadError("open quote", start_position) quotemark = m.group(1) # Find the close quote. _STRING_END_RE = re.compile(r"\\|%s" % quotemark) position = m.end() while True: match = _STRING_END_RE.search(s, position) if not match: raise ReadError("close quote", position) if match.group(0) == "\\": position = match.end() + 1 else: break # Process it, using eval. Strings with invalid escape sequences # might raise ValueError. try: return eval(s[start_position : match.end()]), match.end() except ValueError as e: raise ReadError("valid escape sequence", start_position) from e _READ_INT_RE = re.compile(r"-?\d+") def read_int(s, start_position): """ If an integer begins at the specified position in the given string, then return a tuple ``(val, end_position)`` containing the value of the integer and the position where it ends. Otherwise, raise a ``ReadError``. :param s: A string that will be checked to see if within which a Python integer exists. :type s: str :param start_position: The specified beginning position of the string ``s`` to begin regex matching. :type start_position: int :return: A tuple containing the matched integer casted to an int, and the end position of the int in ``s``. :rtype: tuple(int, int) :raise ReadError: If the ``_READ_INT_RE`` regex doesn't return a match in ``s`` at ``start_position``. :Example: >>> from nltk.internals import read_int >>> read_int('42 is the answer', 0) (42, 2) """ m = _READ_INT_RE.match(s, start_position) if not m: raise ReadError("integer", start_position) return int(m.group()), m.end() _READ_NUMBER_VALUE = re.compile(r"-?(\d*)([.]?\d*)?") def read_number(s, start_position): """ If an integer or float begins at the specified position in the given string, then return a tuple ``(val, end_position)`` containing the value of the number and the position where it ends. Otherwise, raise a ``ReadError``. :param s: A string that will be checked to see if within which a Python number exists. :type s: str :param start_position: The specified beginning position of the string ``s`` to begin regex matching. 
:type start_position: int :return: A tuple containing the matched number casted to a ``float``, and the end position of the number in ``s``. :rtype: tuple(float, int) :raise ReadError: If the ``_READ_NUMBER_VALUE`` regex doesn't return a match in ``s`` at ``start_position``. :Example: >>> from nltk.internals import read_number >>> read_number('Pi is 3.14159', 6) (3.14159, 13) """ m = _READ_NUMBER_VALUE.match(s, start_position) if not m or not (m.group(1) or m.group(2)): raise ReadError("number", start_position) if m.group(2): return float(m.group()), m.end() else: return int(m.group()), m.end() ###################################################################### # Check if a method has been overridden ###################################################################### def overridden(method): """ :return: True if ``method`` overrides some method with the same name in a base class. This is typically used when defining abstract base classes or interfaces, to allow subclasses to define either of two related methods: >>> class EaterI: ... '''Subclass must define eat() or batch_eat().''' ... def eat(self, food): ... if overridden(self.batch_eat): ... return self.batch_eat([food])[0] ... else: ... raise NotImplementedError() ... def batch_eat(self, foods): ... return [self.eat(food) for food in foods] :type method: instance method """ if isinstance(method, types.MethodType) and method.__self__.__class__ is not None: name = method.__name__ funcs = [ cls.__dict__[name] for cls in _mro(method.__self__.__class__) if name in cls.__dict__ ] return len(funcs) > 1 else: raise TypeError("Expected an instance method.") def _mro(cls): """ Return the method resolution order for ``cls`` -- i.e., a list containing ``cls`` and all its base classes, in the order in which they would be checked by ``getattr``. For new-style classes, this is just cls.__mro__. For classic classes, this can be obtained by a depth-first left-to-right traversal of ``__bases__``. """ if isinstance(cls, type): return cls.__mro__ else: mro = [cls] for base in cls.__bases__: mro.extend(_mro(base)) return mro ###################################################################### # Deprecation decorator & base class ###################################################################### # [xx] dedent msg first if it comes from a docstring. def _add_epytext_field(obj, field, message): """Add an epytext @field to a given object's docstring.""" indent = "" # If we already have a docstring, then add a blank line to separate # it from the new field, and check its indentation. if obj.__doc__: obj.__doc__ = obj.__doc__.rstrip() + "\n\n" indents = re.findall(r"(?<=\n)[ ]+(?!\s)", obj.__doc__.expandtabs()) if indents: indent = min(indents) # If we don't have a docstring, add an empty one. else: obj.__doc__ = "" obj.__doc__ += textwrap.fill( f"@{field}: {message}", initial_indent=indent, subsequent_indent=indent + " ", ) def deprecated(message): """ A decorator used to mark functions as deprecated. This will cause a warning to be printed the when the function is used. Usage: >>> from nltk.internals import deprecated >>> @deprecated('Use foo() instead') ... def bar(x): ... print(x/10) """ def decorator(func): msg = f"Function {func.__name__}() has been deprecated. 
{message}" msg = "\n" + textwrap.fill(msg, initial_indent=" ", subsequent_indent=" ") def newFunc(*args, **kwargs): warnings.warn(msg, category=DeprecationWarning, stacklevel=2) return func(*args, **kwargs) # Copy the old function's name, docstring, & dict newFunc.__dict__.update(func.__dict__) newFunc.__name__ = func.__name__ newFunc.__doc__ = func.__doc__ newFunc.__deprecated__ = True # Add a @deprecated field to the docstring. _add_epytext_field(newFunc, "deprecated", message) return newFunc return decorator class Deprecated: """ A base class used to mark deprecated classes. A typical usage is to alert users that the name of a class has changed: >>> from nltk.internals import Deprecated >>> class NewClassName: ... pass # All logic goes here. ... >>> class OldClassName(Deprecated, NewClassName): ... "Use NewClassName instead." The docstring of the deprecated class will be used in the deprecation warning message. """ def __new__(cls, *args, **kwargs): # Figure out which class is the deprecated one. dep_cls = None for base in _mro(cls): if Deprecated in base.__bases__: dep_cls = base break assert dep_cls, "Unable to determine which base is deprecated." # Construct an appropriate warning. doc = dep_cls.__doc__ or "".strip() # If there's a @deprecated field, strip off the field marker. doc = re.sub(r"\A\s*@deprecated:", r"", doc) # Strip off any indentation. doc = re.sub(r"(?m)^\s*", "", doc) # Construct a 'name' string. name = "Class %s" % dep_cls.__name__ if cls != dep_cls: name += " (base class for %s)" % cls.__name__ # Put it all together. msg = f"{name} has been deprecated. {doc}" # Wrap it. msg = "\n" + textwrap.fill(msg, initial_indent=" ", subsequent_indent=" ") warnings.warn(msg, category=DeprecationWarning, stacklevel=2) # Do the actual work of __new__. return object.__new__(cls) ########################################################################## # COUNTER, FOR UNIQUE NAMING ########################################################################## class Counter: """ A counter that auto-increments each time its value is read. """ def __init__(self, initial_value=0): self._value = initial_value def get(self): self._value += 1 return self._value ########################################################################## # Search for files/binaries ########################################################################## def find_file_iter( filename, env_vars=(), searchpath=(), file_names=None, url=None, verbose=False, finding_dir=False, ): """ Search for a file to be used by nltk. :param filename: The name or path of the file. :param env_vars: A list of environment variable names to check. :param file_names: A list of alternative file names to check. :param searchpath: List of directories to search. :param url: URL presented to user for download help. :param verbose: Whether or not to print path when a file is found. 
""" file_names = [filename] + (file_names or []) assert isinstance(filename, str) assert not isinstance(file_names, str) assert not isinstance(searchpath, str) if isinstance(env_vars, str): env_vars = env_vars.split() yielded = False # File exists, no magic for alternative in file_names: path_to_file = os.path.join(filename, alternative) if os.path.isfile(path_to_file): if verbose: print(f"[Found {filename}: {path_to_file}]") yielded = True yield path_to_file # Check the bare alternatives if os.path.isfile(alternative): if verbose: print(f"[Found {filename}: {alternative}]") yielded = True yield alternative # Check if the alternative is inside a 'file' directory path_to_file = os.path.join(filename, "file", alternative) if os.path.isfile(path_to_file): if verbose: print(f"[Found {filename}: {path_to_file}]") yielded = True yield path_to_file # Check environment variables for env_var in env_vars: if env_var in os.environ: if finding_dir: # This is to file a directory instead of file yielded = True yield os.environ[env_var] for env_dir in os.environ[env_var].split(os.pathsep): # Check if the environment variable contains a direct path to the bin if os.path.isfile(env_dir): if verbose: print(f"[Found {filename}: {env_dir}]") yielded = True yield env_dir # Check if the possible bin names exist inside the environment variable directories for alternative in file_names: path_to_file = os.path.join(env_dir, alternative) if os.path.isfile(path_to_file): if verbose: print(f"[Found {filename}: {path_to_file}]") yielded = True yield path_to_file # Check if the alternative is inside a 'file' directory # path_to_file = os.path.join(env_dir, 'file', alternative) # Check if the alternative is inside a 'bin' directory path_to_file = os.path.join(env_dir, "bin", alternative) if os.path.isfile(path_to_file): if verbose: print(f"[Found {filename}: {path_to_file}]") yielded = True yield path_to_file # Check the path list. for directory in searchpath: for alternative in file_names: path_to_file = os.path.join(directory, alternative) if os.path.isfile(path_to_file): yielded = True yield path_to_file # If we're on a POSIX system, then try using the 'which' command # to find the file. if os.name == "posix": for alternative in file_names: try: p = subprocess.Popen( ["which", alternative], stdout=subprocess.PIPE, stderr=subprocess.PIPE, ) stdout, stderr = p.communicate() path = _decode_stdoutdata(stdout).strip() if path.endswith(alternative) and os.path.exists(path): if verbose: print(f"[Found {filename}: {path}]") yielded = True yield path except (KeyboardInterrupt, SystemExit, OSError): raise finally: pass if not yielded: msg = ( "NLTK was unable to find the %s file!" "\nUse software specific " "configuration parameters" % filename ) if env_vars: msg += " or set the %s environment variable" % env_vars[0] msg += "." 
if searchpath: msg += "\n\n Searched in:" msg += "".join("\n - %s" % d for d in searchpath) if url: msg += f"\n\n For more information on {filename}, see:\n <{url}>" div = "=" * 75 raise LookupError(f"\n\n{div}\n{msg}\n{div}") def find_file( filename, env_vars=(), searchpath=(), file_names=None, url=None, verbose=False ): return next( find_file_iter(filename, env_vars, searchpath, file_names, url, verbose) ) def find_dir( filename, env_vars=(), searchpath=(), file_names=None, url=None, verbose=False ): return next( find_file_iter( filename, env_vars, searchpath, file_names, url, verbose, finding_dir=True ) ) def find_binary_iter( name, path_to_bin=None, env_vars=(), searchpath=(), binary_names=None, url=None, verbose=False, ): """ Search for a file to be used by nltk. :param name: The name or path of the file. :param path_to_bin: The user-supplied binary location (deprecated) :param env_vars: A list of environment variable names to check. :param file_names: A list of alternative file names to check. :param searchpath: List of directories to search. :param url: URL presented to user for download help. :param verbose: Whether or not to print path when a file is found. """ yield from find_file_iter( path_to_bin or name, env_vars, searchpath, binary_names, url, verbose ) def find_binary( name, path_to_bin=None, env_vars=(), searchpath=(), binary_names=None, url=None, verbose=False, ): return next( find_binary_iter( name, path_to_bin, env_vars, searchpath, binary_names, url, verbose ) ) def find_jar_iter( name_pattern, path_to_jar=None, env_vars=(), searchpath=(), url=None, verbose=False, is_regex=False, ): """ Search for a jar that is used by nltk. :param name_pattern: The name of the jar file :param path_to_jar: The user-supplied jar location, or None. :param env_vars: A list of environment variable names to check in addition to the CLASSPATH variable which is checked by default. :param searchpath: List of directories to search. :param is_regex: Whether name is a regular expression. """ assert isinstance(name_pattern, str) assert not isinstance(searchpath, str) if isinstance(env_vars, str): env_vars = env_vars.split() yielded = False # Make sure we check the CLASSPATH first env_vars = ["CLASSPATH"] + list(env_vars) # If an explicit location was given, then check it, and yield it if # it's present; otherwise, complain. 
if path_to_jar is not None: if os.path.isfile(path_to_jar): yielded = True yield path_to_jar else: raise LookupError( f"Could not find {name_pattern} jar file at {path_to_jar}" ) # Check environment variables for env_var in env_vars: if env_var in os.environ: if env_var == "CLASSPATH": classpath = os.environ["CLASSPATH"] for cp in classpath.split(os.path.pathsep): cp = os.path.expanduser(cp) if os.path.isfile(cp): filename = os.path.basename(cp) if ( is_regex and re.match(name_pattern, filename) or (not is_regex and filename == name_pattern) ): if verbose: print(f"[Found {name_pattern}: {cp}]") yielded = True yield cp # The case where user put directory containing the jar file in the classpath if os.path.isdir(cp): if not is_regex: if os.path.isfile(os.path.join(cp, name_pattern)): if verbose: print(f"[Found {name_pattern}: {cp}]") yielded = True yield os.path.join(cp, name_pattern) else: # Look for file using regular expression for file_name in os.listdir(cp): if re.match(name_pattern, file_name): if verbose: print( "[Found %s: %s]" % ( name_pattern, os.path.join(cp, file_name), ) ) yielded = True yield os.path.join(cp, file_name) else: jar_env = os.path.expanduser(os.environ[env_var]) jar_iter = ( ( os.path.join(jar_env, path_to_jar) for path_to_jar in os.listdir(jar_env) ) if os.path.isdir(jar_env) else (jar_env,) ) for path_to_jar in jar_iter: if os.path.isfile(path_to_jar): filename = os.path.basename(path_to_jar) if ( is_regex and re.match(name_pattern, filename) or (not is_regex and filename == name_pattern) ): if verbose: print(f"[Found {name_pattern}: {path_to_jar}]") yielded = True yield path_to_jar # Check the path list. for directory in searchpath: if is_regex: for filename in os.listdir(directory): path_to_jar = os.path.join(directory, filename) if os.path.isfile(path_to_jar): if re.match(name_pattern, filename): if verbose: print(f"[Found {filename}: {path_to_jar}]") yielded = True yield path_to_jar else: path_to_jar = os.path.join(directory, name_pattern) if os.path.isfile(path_to_jar): if verbose: print(f"[Found {name_pattern}: {path_to_jar}]") yielded = True yield path_to_jar if not yielded: # If nothing was found, raise an error msg = "NLTK was unable to find %s!" 
% name_pattern if env_vars: msg += " Set the %s environment variable" % env_vars[0] msg = textwrap.fill(msg + ".", initial_indent=" ", subsequent_indent=" ") if searchpath: msg += "\n\n Searched in:" msg += "".join("\n - %s" % d for d in searchpath) if url: msg += "\n\n For more information, on {}, see:\n <{}>".format( name_pattern, url, ) div = "=" * 75 raise LookupError(f"\n\n{div}\n{msg}\n{div}") def find_jar( name_pattern, path_to_jar=None, env_vars=(), searchpath=(), url=None, verbose=False, is_regex=False, ): return next( find_jar_iter( name_pattern, path_to_jar, env_vars, searchpath, url, verbose, is_regex ) ) def find_jars_within_path(path_to_jars): return [ os.path.join(root, filename) for root, dirnames, filenames in os.walk(path_to_jars) for filename in fnmatch.filter(filenames, "*.jar") ] def _decode_stdoutdata(stdoutdata): """Convert data read from stdout/stderr to unicode""" if not isinstance(stdoutdata, bytes): return stdoutdata encoding = getattr(sys.__stdout__, "encoding", locale.getpreferredencoding()) if encoding is None: return stdoutdata.decode() return stdoutdata.decode(encoding) ########################################################################## # Import Stdlib Module ########################################################################## def import_from_stdlib(module): """ When python is run from within the nltk/ directory tree, the current directory is included at the beginning of the search path. Unfortunately, that means that modules within nltk can sometimes shadow standard library modules. As an example, the stdlib 'inspect' module will attempt to import the stdlib 'tokenize' module, but will instead end up importing NLTK's 'tokenize' module instead (causing the import to fail). """ old_path = sys.path sys.path = [d for d in sys.path if d not in ("", ".")] m = __import__(module) sys.path = old_path return m ########################################################################## # Wrapper for ElementTree Elements ########################################################################## class ElementWrapper: """ A wrapper around ElementTree Element objects whose main purpose is to provide nicer __repr__ and __str__ methods. In addition, any of the wrapped Element's methods that return other Element objects are overridden to wrap those values before returning them. This makes Elements more convenient to work with in interactive sessions and doctests, at the expense of some efficiency. """ # Prevent double-wrapping: def __new__(cls, etree): """ Create and return a wrapper around a given Element object. If ``etree`` is an ``ElementWrapper``, then ``etree`` is returned as-is. """ if isinstance(etree, ElementWrapper): return etree else: return object.__new__(ElementWrapper) def __init__(self, etree): r""" Initialize a new Element wrapper for ``etree``. If ``etree`` is a string, then it will be converted to an Element object using ``ElementTree.fromstring()`` first: >>> ElementWrapper("") \n"> """ if isinstance(etree, str): etree = ElementTree.fromstring(etree) self.__dict__["_etree"] = etree def unwrap(self): """ Return the Element object wrapped by this wrapper. 
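        A minimal sketch:

        >>> from nltk.internals import ElementWrapper
        >>> ElementWrapper("<test></test>").unwrap().tag
        'test'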
""" return self._etree ##//////////////////////////////////////////////////////////// # { String Representation ##//////////////////////////////////////////////////////////// def __repr__(self): s = ElementTree.tostring(self._etree, encoding="utf8").decode("utf8") if len(s) > 60: e = s.rfind("<") if (len(s) - e) > 30: e = -20 s = f"{s[:30]}...{s[e:]}" return "" % s def __str__(self): """ :return: the result of applying ``ElementTree.tostring()`` to the wrapped Element object. """ return ( ElementTree.tostring(self._etree, encoding="utf8").decode("utf8").rstrip() ) ##//////////////////////////////////////////////////////////// # { Element interface Delegation (pass-through) ##//////////////////////////////////////////////////////////// def __getattr__(self, attrib): return getattr(self._etree, attrib) def __setattr__(self, attr, value): return setattr(self._etree, attr, value) def __delattr__(self, attr): return delattr(self._etree, attr) def __setitem__(self, index, element): self._etree[index] = element def __delitem__(self, index): del self._etree[index] def __setslice__(self, start, stop, elements): self._etree[start:stop] = elements def __delslice__(self, start, stop): del self._etree[start:stop] def __len__(self): return len(self._etree) ##//////////////////////////////////////////////////////////// # { Element interface Delegation (wrap result) ##//////////////////////////////////////////////////////////// def __getitem__(self, index): return ElementWrapper(self._etree[index]) def __getslice__(self, start, stop): return [ElementWrapper(elt) for elt in self._etree[start:stop]] def getchildren(self): return [ElementWrapper(elt) for elt in self._etree] def getiterator(self, tag=None): return (ElementWrapper(elt) for elt in self._etree.getiterator(tag)) def makeelement(self, tag, attrib): return ElementWrapper(self._etree.makeelement(tag, attrib)) def find(self, path): elt = self._etree.find(path) if elt is None: return elt else: return ElementWrapper(elt) def findall(self, path): return [ElementWrapper(elt) for elt in self._etree.findall(path)] ###################################################################### # Helper for Handling Slicing ###################################################################### def slice_bounds(sequence, slice_obj, allow_step=False): """ Given a slice, return the corresponding (start, stop) bounds, taking into account None indices and negative indices. The following guarantees are made for the returned start and stop values: - 0 <= start <= len(sequence) - 0 <= stop <= len(sequence) - start <= stop :raise ValueError: If ``slice_obj.step`` is not None. :param allow_step: If true, then the slice object may have a non-None step. If it does, then return a tuple (start, stop, step). """ start, stop = (slice_obj.start, slice_obj.stop) # If allow_step is true, then include the step in our return # value tuple. if allow_step: step = slice_obj.step if step is None: step = 1 # Use a recursive call without allow_step to find the slice # bounds. If step is negative, then the roles of start and # stop (in terms of default values, etc), are swapped. if step < 0: start, stop = slice_bounds(sequence, slice(stop, start)) else: start, stop = slice_bounds(sequence, slice(start, stop)) return start, stop, step # Otherwise, make sure that no non-default step value is used. elif slice_obj.step not in (None, 1): raise ValueError( "slices with steps are not supported by %s" % sequence.__class__.__name__ ) # Supply default offsets. 
if start is None: start = 0 if stop is None: stop = len(sequence) # Handle negative indices. if start < 0: start = max(0, len(sequence) + start) if stop < 0: stop = max(0, len(sequence) + stop) # Make sure stop doesn't go past the end of the list. Note that # we avoid calculating len(sequence) if possible, because for lazy # sequences, calculating the length of a sequence can be expensive. if stop > 0: try: sequence[stop - 1] except IndexError: stop = len(sequence) # Make sure start isn't past stop. start = min(start, stop) # That's all folks! return start, stop ###################################################################### # Permission Checking ###################################################################### def is_writable(path): # Ensure that it exists. if not os.path.exists(path): return False # If we're on a posix system, check its permissions. if hasattr(os, "getuid"): statdata = os.stat(path) perm = stat.S_IMODE(statdata.st_mode) # is it world-writable? if perm & 0o002: return True # do we own it? elif statdata.st_uid == os.getuid() and (perm & 0o200): return True # are we in a group that can write to it? elif (statdata.st_gid in [os.getgid()] + os.getgroups()) and (perm & 0o020): return True # otherwise, we can't write to it. else: return False # Otherwise, we'll assume it's writable. # [xx] should we do other checks on other platforms? return True ###################################################################### # NLTK Error reporting ###################################################################### def raise_unorderable_types(ordering, a, b): raise TypeError( "unorderable types: %s() %s %s()" % (type(a).__name__, ordering, type(b).__name__) ) nltk-3.7/nltk/jsontags.py000066400000000000000000000035331420073152400155070ustar00rootroot00000000000000# Natural Language Toolkit: JSON Encoder/Decoder Helpers # # Copyright (C) 2001-2022 NLTK Project # Author: Steven Xu # # URL: # For license information, see LICENSE.TXT """ Register JSON tags, so the nltk data loader knows what module and class to look for. NLTK uses simple '!' tags to mark the types of objects, but the fully-qualified "tag:nltk.org,2011:" prefix is also accepted in case anyone ends up using it. """ import json json_tags = {} TAG_PREFIX = "!" def register_tag(cls): """ Decorates a class to register it's json tag. """ json_tags[TAG_PREFIX + getattr(cls, "json_tag")] = cls return cls class JSONTaggedEncoder(json.JSONEncoder): def default(self, obj): obj_tag = getattr(obj, "json_tag", None) if obj_tag is None: return super().default(obj) obj_tag = TAG_PREFIX + obj_tag obj = obj.encode_json_obj() return {obj_tag: obj} class JSONTaggedDecoder(json.JSONDecoder): def decode(self, s): return self.decode_obj(super().decode(s)) @classmethod def decode_obj(cls, obj): # Decode nested objects first. if isinstance(obj, dict): obj = {key: cls.decode_obj(val) for (key, val) in obj.items()} elif isinstance(obj, list): obj = list(cls.decode_obj(val) for val in obj) # Check if we have a tagged object. 
if not isinstance(obj, dict) or len(obj) != 1: return obj obj_tag = next(iter(obj.keys())) if not obj_tag.startswith("!"): return obj if obj_tag not in json_tags: raise ValueError("Unknown tag", obj_tag) obj_cls = json_tags[obj_tag] return obj_cls.decode_json_obj(obj[obj_tag]) __all__ = ["register_tag", "json_tags", "JSONTaggedEncoder", "JSONTaggedDecoder"] nltk-3.7/nltk/lazyimport.py000066400000000000000000000105651420073152400160740ustar00rootroot00000000000000# This module is from mx/DateTime/LazyModule.py and is # distributed under the terms of the eGenix.com Public License Agreement # https://www.egenix.com/products/eGenix.com-Public-License-1.1.0.pdf """ Helper to enable simple lazy module import. 'Lazy' means the actual import is deferred until an attribute is requested from the module's namespace. This has the advantage of allowing all imports to be done at the top of a script (in a prominent and visible place) without having a great impact on startup time. Copyright (c) 1999-2005, Marc-Andre Lemburg; mailto:mal@lemburg.com See the documentation for further information on copyrights, or contact the author. All Rights Reserved. """ ### Constants _debug = 0 ### class LazyModule: """Lazy module class. Lazy modules are imported into the given namespaces whenever a non-special attribute (there are some attributes like __doc__ that class instances handle without calling __getattr__) is requested. The module is then registered under the given name in locals usually replacing the import wrapper instance. The import itself is done using globals as global namespace. Example of creating a lazy load module: ISO = LazyModule('ISO',locals(),globals()) Later, requesting an attribute from ISO will load the module automatically into the locals() namespace, overriding the LazyModule instance: t = ISO.Week(1998,1,1) """ # Flag which indicates whether the LazyModule is initialized or not __lazymodule_init = 0 # Name of the module to load __lazymodule_name = "" # Flag which indicates whether the module was loaded or not __lazymodule_loaded = 0 # Locals dictionary where to register the module __lazymodule_locals = None # Globals dictionary to use for the module import __lazymodule_globals = None def __init__(self, name, locals, globals=None): """Create a LazyModule instance wrapping module name. The module will later on be registered in locals under the given module name. globals is optional and defaults to locals. """ self.__lazymodule_locals = locals if globals is None: globals = locals self.__lazymodule_globals = globals mainname = globals.get("__name__", "") if mainname: self.__name__ = mainname + "." + name self.__lazymodule_name = name else: self.__name__ = self.__lazymodule_name = name self.__lazymodule_init = 1 def __lazymodule_import(self): """Import the module now.""" # Load and register module name = self.__lazymodule_name if self.__lazymodule_loaded: return self.__lazymodule_locals[name] if _debug: print("LazyModule: Loading module %r" % name) self.__lazymodule_locals[name] = module = __import__( name, self.__lazymodule_locals, self.__lazymodule_globals, "*" ) # Fill namespace with all symbols from original module to # provide faster access. 
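    # (Added note, reusing the example from the class docstring above: after
    # `ISO = LazyModule('ISO', locals(), globals())`, the first attribute access
    # such as `ISO.Week(...)` triggers this import and the namespace copy below.)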
self.__dict__.update(module.__dict__) # Set import flag self.__dict__["__lazymodule_loaded"] = 1 if _debug: print("LazyModule: Module %r loaded" % name) return module def __getattr__(self, name): """Import the module on demand and get the attribute.""" if self.__lazymodule_loaded: raise AttributeError(name) if _debug: print( "LazyModule: " "Module load triggered by attribute %r read access" % name ) module = self.__lazymodule_import() return getattr(module, name) def __setattr__(self, name, value): """Import the module on demand and set the attribute.""" if not self.__lazymodule_init: self.__dict__[name] = value return if self.__lazymodule_loaded: self.__lazymodule_locals[self.__lazymodule_name] = value self.__dict__[name] = value return if _debug: print( "LazyModule: " "Module load triggered by attribute %r write access" % name ) module = self.__lazymodule_import() setattr(module, name, value) def __repr__(self): return "" % self.__name__ nltk-3.7/nltk/lm/000077500000000000000000000000001420073152400137115ustar00rootroot00000000000000nltk-3.7/nltk/lm/__init__.py000066400000000000000000000172101420073152400160230ustar00rootroot00000000000000# Natural Language Toolkit: Language Models # # Copyright (C) 2001-2022 NLTK Project # Authors: Ilia Kurenkov # URL: >> text = [['a', 'b', 'c'], ['a', 'c', 'd', 'c', 'e', 'f']] If we want to train a bigram model, we need to turn this text into bigrams. Here's what the first sentence of our text would look like if we use a function from NLTK for this. >>> from nltk.util import bigrams >>> list(bigrams(text[0])) [('a', 'b'), ('b', 'c')] Notice how "b" occurs both as the first and second member of different bigrams but "a" and "c" don't? Wouldn't it be nice to somehow indicate how often sentences start with "a" and end with "c"? A standard way to deal with this is to add special "padding" symbols to the sentence before splitting it into ngrams. Fortunately, NLTK also has a function for that, let's see what it does to the first sentence. >>> from nltk.util import pad_sequence >>> list(pad_sequence(text[0], ... pad_left=True, ... left_pad_symbol="", ... pad_right=True, ... right_pad_symbol="", ... n=2)) ['', 'a', 'b', 'c', ''] Note the `n` argument, that tells the function we need padding for bigrams. Now, passing all these parameters every time is tedious and in most cases they can be safely assumed as defaults anyway. Thus our module provides a convenience function that has all these arguments already set while the other arguments remain the same as for `pad_sequence`. >>> from nltk.lm.preprocessing import pad_both_ends >>> list(pad_both_ends(text[0], n=2)) ['', 'a', 'b', 'c', ''] Combining the two parts discussed so far we get the following preparation steps for one sentence. >>> list(bigrams(pad_both_ends(text[0], n=2))) [('', 'a'), ('a', 'b'), ('b', 'c'), ('c', '')] To make our model more robust we could also train it on unigrams (single words) as well as bigrams, its main source of information. NLTK once again helpfully provides a function called `everygrams`. While not the most efficient, it is conceptually simple. >>> from nltk.util import everygrams >>> padded_bigrams = list(pad_both_ends(text[0], n=2)) >>> list(everygrams(padded_bigrams, max_len=2)) [('',), ('', 'a'), ('a',), ('a', 'b'), ('b',), ('b', 'c'), ('c',), ('c', ''), ('',)] We are almost ready to start counting ngrams, just one more step left. During training and evaluation our model will rely on a vocabulary that defines which words are "known" to the model. 
To create this vocabulary we need to pad our sentences (just like for counting ngrams) and then combine the sentences into one flat stream of words. >>> from nltk.lm.preprocessing import flatten >>> list(flatten(pad_both_ends(sent, n=2) for sent in text)) ['', 'a', 'b', 'c', '', '', 'a', 'c', 'd', 'c', 'e', 'f', ''] In most cases we want to use the same text as the source for both vocabulary and ngram counts. Now that we understand what this means for our preprocessing, we can simply import a function that does everything for us. >>> from nltk.lm.preprocessing import padded_everygram_pipeline >>> train, vocab = padded_everygram_pipeline(2, text) So as to avoid re-creating the text in memory, both `train` and `vocab` are lazy iterators. They are evaluated on demand at training time. Training ======== Having prepared our data we are ready to start training a model. As a simple example, let us train a Maximum Likelihood Estimator (MLE). We only need to specify the highest ngram order to instantiate it. >>> from nltk.lm import MLE >>> lm = MLE(2) This automatically creates an empty vocabulary... >>> len(lm.vocab) 0 ... which gets filled as we fit the model. >>> lm.fit(train, vocab) >>> print(lm.vocab) >>> len(lm.vocab) 9 The vocabulary helps us handle words that have not occurred during training. >>> lm.vocab.lookup(text[0]) ('a', 'b', 'c') >>> lm.vocab.lookup(["aliens", "from", "Mars"]) ('', '', '') Moreover, in some cases we want to ignore words that we did see during training but that didn't occur frequently enough, to provide us useful information. You can tell the vocabulary to ignore such words. To find out how that works, check out the docs for the `Vocabulary` class. Using a Trained Model ===================== When it comes to ngram models the training boils down to counting up the ngrams from the training corpus. >>> print(lm.counts) This provides a convenient interface to access counts for unigrams... >>> lm.counts['a'] 2 ...and bigrams (in this case "a b") >>> lm.counts[['a']]['b'] 1 And so on. However, the real purpose of training a language model is to have it score how probable words are in certain contexts. This being MLE, the model returns the item's relative frequency as its score. >>> lm.score("a") 0.15384615384615385 Items that are not seen during training are mapped to the vocabulary's "unknown label" token. This is "" by default. >>> lm.score("") == lm.score("aliens") True Here's how you get the score for a word given some preceding context. For example we want to know what is the chance that "b" is preceded by "a". >>> lm.score("b", ["a"]) 0.5 To avoid underflow when working with many small score values it makes sense to take their logarithm. For convenience this can be done with the `logscore` method. >>> lm.logscore("a") -2.700439718141092 Building on this method, we can also evaluate our model's cross-entropy and perplexity with respect to sequences of ngrams. >>> test = [('a', 'b'), ('c', 'd')] >>> lm.entropy(test) 1.292481250360578 >>> lm.perplexity(test) 2.449489742783178 It is advisable to preprocess your test text exactly the same way as you did the training text. One cool feature of ngram models is that they can be used to generate text. >>> lm.generate(1, random_seed=3) '' >>> lm.generate(5, random_seed=3) ['', 'a', 'b', 'c', 'd'] Provide `random_seed` if you want to consistently reproduce the same text all other things being equal. Here we are using it to test the examples. 
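As one extra sanity check, added here as a hedged illustration rather than one of the module's original examples, the same seed yields the same generated text:

>>> lm.generate(5, random_seed=3) == lm.generate(5, random_seed=3)
True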
You can also condition your generation on some preceding text with the `context` argument. >>> lm.generate(5, text_seed=['c'], random_seed=3) ['', 'c', 'd', 'c', 'd'] Note that an ngram model is restricted in how much preceding context it can take into account. For example, a trigram model can only condition its output on 2 preceding words. If you pass in a 4-word context, the first two words will be ignored. """ from nltk.lm.counter import NgramCounter from nltk.lm.models import ( MLE, AbsoluteDiscountingInterpolated, KneserNeyInterpolated, Laplace, Lidstone, StupidBackoff, WittenBellInterpolated, ) from nltk.lm.vocabulary import Vocabulary __all__ = [ "Vocabulary", "NgramCounter", "MLE", "Lidstone", "Laplace", "WittenBellInterpolated", "KneserNeyInterpolated", "AbsoluteDiscountingInterpolated", "StupidBackoff", ] nltk-3.7/nltk/lm/api.py000066400000000000000000000174321420073152400150430ustar00rootroot00000000000000# Natural Language Toolkit: Language Models # # Copyright (C) 2001-2022 NLTK Project # Authors: Ilia Kurenkov # URL: # For license information, see LICENSE.TXT """Language Model Interface.""" import random from abc import ABCMeta, abstractmethod from bisect import bisect from itertools import accumulate from nltk.lm.counter import NgramCounter from nltk.lm.util import log_base2 from nltk.lm.vocabulary import Vocabulary class Smoothing(metaclass=ABCMeta): """Ngram Smoothing Interface Implements Chen & Goodman 1995's idea that all smoothing algorithms have certain features in common. This should ideally allow smoothing algorithms to work both with Backoff and Interpolation. """ def __init__(self, vocabulary, counter): """ :param vocabulary: The Ngram vocabulary object. :type vocabulary: nltk.lm.vocab.Vocabulary :param counter: The counts of the vocabulary items. :type counter: nltk.lm.counter.NgramCounter """ self.vocab = vocabulary self.counts = counter @abstractmethod def unigram_score(self, word): raise NotImplementedError() @abstractmethod def alpha_gamma(self, word, context): raise NotImplementedError() def _mean(items): """Return average (aka mean) for sequence of items.""" return sum(items) / len(items) def _random_generator(seed_or_generator): if isinstance(seed_or_generator, random.Random): return seed_or_generator return random.Random(seed_or_generator) def _weighted_choice(population, weights, random_generator=None): """Like random.choice, but with weights. Heavily inspired by python 3.6 `random.choices`. """ if not population: raise ValueError("Can't choose from empty population") if len(population) != len(weights): raise ValueError("The number of weights does not match the population") cum_weights = list(accumulate(weights)) total = cum_weights[-1] threshold = random_generator.random() return population[bisect(cum_weights, total * threshold)] class LanguageModel(metaclass=ABCMeta): """ABC for Language Models. Cannot be directly instantiated itself. """ def __init__(self, order, vocabulary=None, counter=None): """Creates new LanguageModel. :param vocabulary: If provided, this vocabulary will be used instead of creating a new one when training. :type vocabulary: `nltk.lm.Vocabulary` or None :param counter: If provided, use this object to count ngrams. :type vocabulary: `nltk.lm.NgramCounter` or None :param ngrams_fn: If given, defines how sentences in training text are turned to ngram sequences. :type ngrams_fn: function or None :param pad_fn: If given, defines how sentences in training text are padded. 
:type pad_fn: function or None """ self.order = order self.vocab = Vocabulary() if vocabulary is None else vocabulary self.counts = NgramCounter() if counter is None else counter def fit(self, text, vocabulary_text=None): """Trains the model on a text. :param text: Training text as a sequence of sentences. """ if not self.vocab: if vocabulary_text is None: raise ValueError( "Cannot fit without a vocabulary or text to create it from." ) self.vocab.update(vocabulary_text) self.counts.update(self.vocab.lookup(sent) for sent in text) def score(self, word, context=None): """Masks out of vocab (OOV) words and computes their model score. For model-specific logic of calculating scores, see the `unmasked_score` method. """ return self.unmasked_score( self.vocab.lookup(word), self.vocab.lookup(context) if context else None ) @abstractmethod def unmasked_score(self, word, context=None): """Score a word given some optional context. Concrete models are expected to provide an implementation. Note that this method does not mask its arguments with the OOV label. Use the `score` method for that. :param str word: Word for which we want the score :param tuple(str) context: Context the word is in. If `None`, compute unigram score. :param context: tuple(str) or None :rtype: float """ raise NotImplementedError() def logscore(self, word, context=None): """Evaluate the log score of this word in this context. The arguments are the same as for `score` and `unmasked_score`. """ return log_base2(self.score(word, context)) def context_counts(self, context): """Helper method for retrieving counts for a given context. Assumes context has been checked and oov words in it masked. :type context: tuple(str) or None """ return ( self.counts[len(context) + 1][context] if context else self.counts.unigrams ) def entropy(self, text_ngrams): """Calculate cross-entropy of model for given evaluation text. :param Iterable(tuple(str)) text_ngrams: A sequence of ngram tuples. :rtype: float """ return -1 * _mean( [self.logscore(ngram[-1], ngram[:-1]) for ngram in text_ngrams] ) def perplexity(self, text_ngrams): """Calculates the perplexity of the given text. This is simply 2 ** cross-entropy for the text, so the arguments are the same. """ return pow(2.0, self.entropy(text_ngrams)) def generate(self, num_words=1, text_seed=None, random_seed=None): """Generate words from the model. :param int num_words: How many words to generate. By default 1. :param text_seed: Generation can be conditioned on preceding context. :param random_seed: A random seed or an instance of `random.Random`. If provided, makes the random sampling part of generation reproducible. :return: One (str) word or a list of words generated from model. Examples: >>> from nltk.lm import MLE >>> lm = MLE(2) >>> lm.fit([[("a", "b"), ("b", "c")]], vocabulary_text=['a', 'b', 'c']) >>> lm.fit([[("a",), ("b",), ("c",)]]) >>> lm.generate(random_seed=3) 'a' >>> lm.generate(text_seed=['a']) 'b' """ text_seed = [] if text_seed is None else list(text_seed) random_generator = _random_generator(random_seed) # This is the base recursion case. 
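        # (Descriptive comment added for clarity: when a single word is requested
        # we sample it from the counts of the longest usable context, at most
        # order - 1 preceding words, and back off to shorter contexts whenever
        # the current context has no observed continuations.)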
if num_words == 1: context = ( text_seed[-self.order + 1 :] if len(text_seed) >= self.order else text_seed ) samples = self.context_counts(self.vocab.lookup(context)) while context and not samples: context = context[1:] if len(context) > 1 else [] samples = self.context_counts(self.vocab.lookup(context)) # Sorting samples achieves two things: # - reproducible randomness when sampling # - turns Mapping into Sequence which `_weighted_choice` expects samples = sorted(samples) return _weighted_choice( samples, tuple(self.score(w, context) for w in samples), random_generator, ) # We build up text one word at a time using the preceding context. generated = [] for _ in range(num_words): generated.append( self.generate( num_words=1, text_seed=text_seed + generated, random_seed=random_generator, ) ) return generated nltk-3.7/nltk/lm/counter.py000066400000000000000000000117361420073152400157520ustar00rootroot00000000000000# Natural Language Toolkit # # Copyright (C) 2001-2022 NLTK Project # Author: Ilia Kurenkov # URL: # For license information, see LICENSE.TXT """ Language Model Counter ---------------------- """ from collections import defaultdict from collections.abc import Sequence from nltk.probability import ConditionalFreqDist, FreqDist class NgramCounter: """Class for counting ngrams. Will count any ngram sequence you give it ;) First we need to make sure we are feeding the counter sentences of ngrams. >>> text = [["a", "b", "c", "d"], ["a", "c", "d", "c"]] >>> from nltk.util import ngrams >>> text_bigrams = [ngrams(sent, 2) for sent in text] >>> text_unigrams = [ngrams(sent, 1) for sent in text] The counting itself is very simple. >>> from nltk.lm import NgramCounter >>> ngram_counts = NgramCounter(text_bigrams + text_unigrams) You can conveniently access ngram counts using standard python dictionary notation. String keys will give you unigram counts. >>> ngram_counts['a'] 2 >>> ngram_counts['aliens'] 0 If you want to access counts for higher order ngrams, use a list or a tuple. These are treated as "context" keys, so what you get is a frequency distribution over all continuations after the given context. >>> sorted(ngram_counts[['a']].items()) [('b', 1), ('c', 1)] >>> sorted(ngram_counts[('a',)].items()) [('b', 1), ('c', 1)] This is equivalent to specifying explicitly the order of the ngram (in this case 2 for bigram) and indexing on the context. >>> ngram_counts[2][('a',)] is ngram_counts[['a']] True Note that the keys in `ConditionalFreqDist` cannot be lists, only tuples! It is generally advisable to use the less verbose and more flexible square bracket notation. To get the count of the full ngram "a b", do this: >>> ngram_counts[['a']]['b'] 1 Specifying the ngram order as a number can be useful for accessing all ngrams in that order. >>> ngram_counts[2] The keys of this `ConditionalFreqDist` are the contexts we discussed earlier. Unigrams can also be accessed with a human-friendly alias. >>> ngram_counts.unigrams is ngram_counts[1] True Similarly to `collections.Counter`, you can update counts after initialization. >>> ngram_counts['e'] 0 >>> ngram_counts.update([ngrams(["d", "e", "f"], 1)]) >>> ngram_counts['e'] 1 """ def __init__(self, ngram_text=None): """Creates a new NgramCounter. If `ngram_text` is specified, counts ngrams from it, otherwise waits for `update` method to be called explicitly. :param ngram_text: Optional text containing sentences of ngrams, as for `update` method. 
:type ngram_text: Iterable(Iterable(tuple(str))) or None """ self._counts = defaultdict(ConditionalFreqDist) self._counts[1] = self.unigrams = FreqDist() if ngram_text: self.update(ngram_text) def update(self, ngram_text): """Updates ngram counts from `ngram_text`. Expects `ngram_text` to be a sequence of sentences (sequences). Each sentence consists of ngrams as tuples of strings. :param Iterable(Iterable(tuple(str))) ngram_text: Text containing sentences of ngrams. :raises TypeError: if the ngrams are not tuples. """ for sent in ngram_text: for ngram in sent: if not isinstance(ngram, tuple): raise TypeError( "Ngram <{}> isn't a tuple, " "but {}".format(ngram, type(ngram)) ) ngram_order = len(ngram) if ngram_order == 1: self.unigrams[ngram[0]] += 1 continue context, word = ngram[:-1], ngram[-1] self[ngram_order][context][word] += 1 def N(self): """Returns grand total number of ngrams stored. This includes ngrams from all orders, so some duplication is expected. :rtype: int >>> from nltk.lm import NgramCounter >>> counts = NgramCounter([[("a", "b"), ("c",), ("d", "e")]]) >>> counts.N() 3 """ return sum(val.N() for val in self._counts.values()) def __getitem__(self, item): """User-friendly access to ngram counts.""" if isinstance(item, int): return self._counts[item] elif isinstance(item, str): return self._counts.__getitem__(1)[item] elif isinstance(item, Sequence): return self._counts.__getitem__(len(item) + 1)[tuple(item)] def __str__(self): return "<{} with {} ngram orders and {} ngrams>".format( self.__class__.__name__, len(self._counts), self.N() ) def __len__(self): return self._counts.__len__() def __contains__(self, item): return item in self._counts nltk-3.7/nltk/lm/models.py000066400000000000000000000112321420073152400155450ustar00rootroot00000000000000# Natural Language Toolkit: Language Models # # Copyright (C) 2001-2022 NLTK Project # Author: Ilia Kurenkov # Manu Joseph # URL: # For license information, see LICENSE.TXT """Language Models""" from nltk.lm.api import LanguageModel, Smoothing from nltk.lm.smoothing import AbsoluteDiscounting, KneserNey, WittenBell class MLE(LanguageModel): """Class for providing MLE ngram model scores. Inherits initialization from BaseNgramModel. """ def unmasked_score(self, word, context=None): """Returns the MLE score for a word given a context. Args: - word is expected to be a string - context is expected to be something reasonably convertible to a tuple """ return self.context_counts(context).freq(word) class Lidstone(LanguageModel): """Provides Lidstone-smoothed scores. In addition to initialization arguments from BaseNgramModel also requires a number by which to increase the counts, gamma. """ def __init__(self, gamma, *args, **kwargs): super().__init__(*args, **kwargs) self.gamma = gamma def unmasked_score(self, word, context=None): """Add-one smoothing: Lidstone or Laplace. To see what kind, look at `gamma` attribute on the class. """ counts = self.context_counts(context) word_count = counts[word] norm_count = counts.N() return (word_count + self.gamma) / (norm_count + len(self.vocab) * self.gamma) class Laplace(Lidstone): """Implements Laplace (add one) smoothing. Initialization identical to BaseNgramModel because gamma is always 1. """ def __init__(self, *args, **kwargs): super().__init__(1, *args, **kwargs) class StupidBackoff(LanguageModel): """Provides StupidBackoff scores. In addition to initialization arguments from BaseNgramModel also requires a parameter alpha with which we scale the lower order probabilities. 
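    (A small worked example, added as an illustration: with the default alpha of
    0.4, a word whose count in the given bigram context is zero but whose unigram
    relative frequency is 0.1 receives a bigram score of 0.4 * 0.1 = 0.04.)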
Note that this is not a true probability distribution as scores for ngrams of the same order do not sum up to unity. """ def __init__(self, alpha=0.4, *args, **kwargs): super().__init__(*args, **kwargs) self.alpha = alpha def unmasked_score(self, word, context=None): if not context: # Base recursion return self.counts.unigrams.freq(word) counts = self.context_counts(context) word_count = counts[word] norm_count = counts.N() if word_count > 0: return word_count / norm_count else: return self.alpha * self.unmasked_score(word, context[1:]) class InterpolatedLanguageModel(LanguageModel): """Logic common to all interpolated language models. The idea to abstract this comes from Chen & Goodman 1995. Do not instantiate this class directly! """ def __init__(self, smoothing_cls, order, **kwargs): params = kwargs.pop("params", {}) super().__init__(order, **kwargs) self.estimator = smoothing_cls(self.vocab, self.counts, **params) def unmasked_score(self, word, context=None): if not context: # The base recursion case: no context, we only have a unigram. return self.estimator.unigram_score(word) if not self.counts[context]: # It can also happen that we have no data for this context. # In that case we defer to the lower-order ngram. # This is the same as setting alpha to 0 and gamma to 1. alpha, gamma = 0, 1 else: alpha, gamma = self.estimator.alpha_gamma(word, context) return alpha + gamma * self.unmasked_score(word, context[1:]) class WittenBellInterpolated(InterpolatedLanguageModel): """Interpolated version of Witten-Bell smoothing.""" def __init__(self, order, **kwargs): super().__init__(WittenBell, order, **kwargs) class AbsoluteDiscountingInterpolated(InterpolatedLanguageModel): """Interpolated version of smoothing with absolute discount.""" def __init__(self, order, discount=0.75, **kwargs): super().__init__( AbsoluteDiscounting, order, params={"discount": discount}, **kwargs ) class KneserNeyInterpolated(InterpolatedLanguageModel): """Interpolated version of Kneser-Ney smoothing.""" def __init__(self, order, discount=0.1, **kwargs): if not (0 <= discount <= 1): raise ValueError( "Discount must be between 0 and 1 for probabilities to sum to unity." ) super().__init__( KneserNey, order, params={"discount": discount, "order": order}, **kwargs ) nltk-3.7/nltk/lm/preprocessing.py000066400000000000000000000031771420073152400171560ustar00rootroot00000000000000# Natural Language Toolkit: Language Model Unit Tests # # Copyright (C) 2001-2022 NLTK Project # Author: Ilia Kurenkov # URL: # For license information, see LICENSE.TXT from functools import partial from itertools import chain from nltk.util import everygrams, pad_sequence flatten = chain.from_iterable pad_both_ends = partial( pad_sequence, pad_left=True, left_pad_symbol="", pad_right=True, right_pad_symbol="", ) pad_both_ends.__doc__ = """Pads both ends of a sentence to length specified by ngram order. Following convention pads the start of sentence pads its end. """ def padded_everygrams(order, sentence): """Helper with some useful defaults. Applies pad_both_ends to sentence and follows it up with everygrams. """ return everygrams(list(pad_both_ends(sentence, n=order)), max_len=order) def padded_everygram_pipeline(order, text): """Default preprocessing for a sequence of sentences. Creates two iterators: - sentences padded and turned into sequences of `nltk.util.everygrams` - sentences padded as above and chained together for a flat stream of words :param order: Largest ngram length produced by `everygrams`. :param text: Text to iterate over. 
Expected to be an iterable of sentences. :type text: Iterable[Iterable[str]] :return: iterator over text as ngrams, iterator over text as vocabulary data """ padding_fn = partial(pad_both_ends, n=order) return ( (everygrams(list(padding_fn(sent)), max_len=order) for sent in text), flatten(map(padding_fn, text)), ) nltk-3.7/nltk/lm/smoothing.py000066400000000000000000000107761420073152400163050ustar00rootroot00000000000000# Natural Language Toolkit: Language Model Unit Tests # # Copyright (C) 2001-2022 NLTK Project # Author: Ilia Kurenkov # Manu Joseph # URL: # For license information, see LICENSE.TXT """Smoothing algorithms for language modeling. According to Chen & Goodman 1995 these should work with both Backoff and Interpolation. """ from operator import methodcaller from nltk import ConditionalFreqDist from nltk.lm.api import Smoothing def _count_values_gt_zero(distribution): """Count values that are greater than zero in a distribution. Assumes distribution is either a mapping with counts as values or an instance of `nltk.ConditionalFreqDist`. """ as_count = ( methodcaller("N") if isinstance(distribution, ConditionalFreqDist) else lambda count: count ) # We explicitly check that values are > 0 to guard against negative counts. return sum( 1 for dist_or_count in distribution.values() if as_count(dist_or_count) > 0 ) class WittenBell(Smoothing): """Witten-Bell smoothing.""" def __init__(self, vocabulary, counter, **kwargs): super().__init__(vocabulary, counter, **kwargs) def alpha_gamma(self, word, context): alpha = self.counts[context].freq(word) gamma = self._gamma(context) return (1.0 - gamma) * alpha, gamma def _gamma(self, context): n_plus = _count_values_gt_zero(self.counts[context]) return n_plus / (n_plus + self.counts[context].N()) def unigram_score(self, word): return self.counts.unigrams.freq(word) class AbsoluteDiscounting(Smoothing): """Smoothing with absolute discount.""" def __init__(self, vocabulary, counter, discount=0.75, **kwargs): super().__init__(vocabulary, counter, **kwargs) self.discount = discount def alpha_gamma(self, word, context): alpha = ( max(self.counts[context][word] - self.discount, 0) / self.counts[context].N() ) gamma = self._gamma(context) return alpha, gamma def _gamma(self, context): n_plus = _count_values_gt_zero(self.counts[context]) return (self.discount * n_plus) / self.counts[context].N() def unigram_score(self, word): return self.counts.unigrams.freq(word) class KneserNey(Smoothing): """Kneser-Ney Smoothing. This is an extension of smoothing with a discount. 
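    (Added gloss, paraphrasing the implementation below: the unigram score is a
    continuation probability, the number of distinct contexts a word completes
    divided by the total number of distinct observed higher-order ngram types,
    while `alpha_gamma` subtracts `discount` from each observed count and passes
    the freed-up probability mass to the lower-order estimate via the gamma term.)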
Resources: - https://pages.ucsd.edu/~rlevy/lign256/winter2008/kneser_ney_mini_example.pdf - https://www.youtube.com/watch?v=ody1ysUTD7o - https://medium.com/@dennyc/a-simple-numerical-example-for-kneser-ney-smoothing-nlp-4600addf38b8 - https://www.cl.uni-heidelberg.de/courses/ss15/smt/scribe6.pdf - https://www-i6.informatik.rwth-aachen.de/publications/download/951/Kneser-ICASSP-1995.pdf """ def __init__(self, vocabulary, counter, order, discount=0.1, **kwargs): super().__init__(vocabulary, counter, **kwargs) self.discount = discount self._order = order def unigram_score(self, word): word_continuation_count, total_count = self._continuation_counts(word) return word_continuation_count / total_count def alpha_gamma(self, word, context): prefix_counts = self.counts[context] word_continuation_count, total_count = ( (prefix_counts[word], prefix_counts.N()) if len(context) + 1 == self._order else self._continuation_counts(word, context) ) alpha = max(word_continuation_count - self.discount, 0.0) / total_count gamma = self.discount * _count_values_gt_zero(prefix_counts) / total_count return alpha, gamma def _continuation_counts(self, word, context=tuple()): """Count continuations that end with context and word. Continuations track unique ngram "types", regardless of how many instances were observed for each "type". This is different than raw ngram counts which track number of instances. """ higher_order_ngrams_with_context = ( counts for prefix_ngram, counts in self.counts[len(context) + 2].items() if prefix_ngram[1:] == context ) higher_order_ngrams_with_word_count, total = 0, 0 for counts in higher_order_ngrams_with_context: higher_order_ngrams_with_word_count += int(counts[word] > 0) total += _count_values_gt_zero(counts) return higher_order_ngrams_with_word_count, total nltk-3.7/nltk/lm/util.py000066400000000000000000000007071420073152400152440ustar00rootroot00000000000000# Natural Language Toolkit # # Copyright (C) 2001-2022 NLTK Project # Author: Ilia Kurenkov # URL: # For license information, see LICENSE.TXT """Language Model Utilities""" from math import log NEG_INF = float("-inf") POS_INF = float("inf") def log_base2(score): """Convenience function for computing logarithms with base 2.""" if score == 0.0: return NEG_INF return log(score, 2) nltk-3.7/nltk/lm/vocabulary.py000066400000000000000000000153411420073152400164360ustar00rootroot00000000000000# Natural Language Toolkit # # Copyright (C) 2001-2022 NLTK Project # Author: Ilia Kurenkov # URL: # For license information, see LICENSE.TXT """Language Model Vocabulary""" import sys from collections import Counter from collections.abc import Iterable from functools import singledispatch from itertools import chain @singledispatch def _dispatched_lookup(words, vocab): raise TypeError(f"Unsupported type for looking up in vocabulary: {type(words)}") @_dispatched_lookup.register(Iterable) def _(words, vocab): """Look up a sequence of words in the vocabulary. Returns an iterator over looked up words. """ return tuple(_dispatched_lookup(w, vocab) for w in words) @_dispatched_lookup.register(str) def _string_lookup(word, vocab): """Looks up one word in the vocabulary.""" return word if word in vocab else vocab.unk_label class Vocabulary: """Stores language model vocabulary. Satisfies two common language modeling requirements for a vocabulary: - When checking membership and calculating its size, filters items by comparing their counts to a cutoff value. - Adds a special "unknown" token which unseen words are mapped to. 
>>> words = ['a', 'c', '-', 'd', 'c', 'a', 'b', 'r', 'a', 'c', 'd'] >>> from nltk.lm import Vocabulary >>> vocab = Vocabulary(words, unk_cutoff=2) Tokens with counts greater than or equal to the cutoff value will be considered part of the vocabulary. >>> vocab['c'] 3 >>> 'c' in vocab True >>> vocab['d'] 2 >>> 'd' in vocab True Tokens with frequency counts less than the cutoff value will be considered not part of the vocabulary even though their entries in the count dictionary are preserved. >>> vocab['b'] 1 >>> 'b' in vocab False >>> vocab['aliens'] 0 >>> 'aliens' in vocab False Keeping the count entries for seen words allows us to change the cutoff value without having to recalculate the counts. >>> vocab2 = Vocabulary(vocab.counts, unk_cutoff=1) >>> "b" in vocab2 True The cutoff value influences not only membership checking but also the result of getting the size of the vocabulary using the built-in `len`. Note that while the number of keys in the vocabulary's counter stays the same, the items in the vocabulary differ depending on the cutoff. We use `sorted` to demonstrate because it keeps the order consistent. >>> sorted(vocab2.counts) ['-', 'a', 'b', 'c', 'd', 'r'] >>> sorted(vocab2) ['-', '', 'a', 'b', 'c', 'd', 'r'] >>> sorted(vocab.counts) ['-', 'a', 'b', 'c', 'd', 'r'] >>> sorted(vocab) ['', 'a', 'c', 'd'] In addition to items it gets populated with, the vocabulary stores a special token that stands in for so-called "unknown" items. By default it's "". >>> "" in vocab True We can look up words in a vocabulary using its `lookup` method. "Unseen" words (with counts less than cutoff) are looked up as the unknown label. If given one word (a string) as an input, this method will return a string. >>> vocab.lookup("a") 'a' >>> vocab.lookup("aliens") '' If given a sequence, it will return an tuple of the looked up words. >>> vocab.lookup(["p", 'a', 'r', 'd', 'b', 'c']) ('', 'a', '', 'd', '', 'c') It's possible to update the counts after the vocabulary has been created. In general, the interface is the same as that of `collections.Counter`. >>> vocab['b'] 1 >>> vocab.update(["b", "b", "c"]) >>> vocab['b'] 3 """ def __init__(self, counts=None, unk_cutoff=1, unk_label=""): """Create a new Vocabulary. :param counts: Optional iterable or `collections.Counter` instance to pre-seed the Vocabulary. In case it is iterable, counts are calculated. :param int unk_cutoff: Words that occur less frequently than this value are not considered part of the vocabulary. :param unk_label: Label for marking words not part of vocabulary. """ self.unk_label = unk_label if unk_cutoff < 1: raise ValueError(f"Cutoff value cannot be less than 1. Got: {unk_cutoff}") self._cutoff = unk_cutoff self.counts = Counter() self.update(counts if counts is not None else "") @property def cutoff(self): """Cutoff value. Items with count below this value are not considered part of vocabulary. """ return self._cutoff def update(self, *counter_args, **counter_kwargs): """Update vocabulary counts. Wraps `collections.Counter.update` method. """ self.counts.update(*counter_args, **counter_kwargs) self._len = sum(1 for _ in self) def lookup(self, words): """Look up one or more words in the vocabulary. If passed one word as a string will return that word or `self.unk_label`. Otherwise will assume it was passed a sequence of words, will try to look each of them up and return an iterator over the looked up words. :param words: Word(s) to look up. 
:type words: Iterable(str) or str :rtype: generator(str) or str :raises: TypeError for types other than strings or iterables >>> from nltk.lm import Vocabulary >>> vocab = Vocabulary(["a", "b", "c", "a", "b"], unk_cutoff=2) >>> vocab.lookup("a") 'a' >>> vocab.lookup("aliens") '' >>> vocab.lookup(["a", "b", "c", ["x", "b"]]) ('a', 'b', '', ('', 'b')) """ return _dispatched_lookup(words, self) def __getitem__(self, item): return self._cutoff if item == self.unk_label else self.counts[item] def __contains__(self, item): """Only consider items with counts GE to cutoff as being in the vocabulary.""" return self[item] >= self.cutoff def __iter__(self): """Building on membership check define how to iterate over vocabulary.""" return chain( (item for item in self.counts if item in self), [self.unk_label] if self.counts else [], ) def __len__(self): """Computing size of vocabulary reflects the cutoff.""" return self._len def __eq__(self, other): return ( self.unk_label == other.unk_label and self.cutoff == other.cutoff and self.counts == other.counts ) def __str__(self): return "<{} with cutoff={} unk_label='{}' and {} items>".format( self.__class__.__name__, self.cutoff, self.unk_label, len(self) ) nltk-3.7/nltk/metrics/000077500000000000000000000000001420073152400147475ustar00rootroot00000000000000nltk-3.7/nltk/metrics/__init__.py000066400000000000000000000022501420073152400170570ustar00rootroot00000000000000# Natural Language Toolkit: Metrics # # Copyright (C) 2001-2022 NLTK Project # Author: Steven Bird # Edward Loper # URL: # For license information, see LICENSE.TXT # """ NLTK Metrics Classes and methods for scoring processing modules. """ from nltk.metrics.agreement import AnnotationTask from nltk.metrics.aline import align from nltk.metrics.association import ( BigramAssocMeasures, ContingencyMeasures, NgramAssocMeasures, QuadgramAssocMeasures, TrigramAssocMeasures, ) from nltk.metrics.confusionmatrix import ConfusionMatrix from nltk.metrics.distance import ( binary_distance, custom_distance, edit_distance, edit_distance_align, fractional_presence, interval_distance, jaccard_distance, masi_distance, presence, ) from nltk.metrics.paice import Paice from nltk.metrics.scores import ( accuracy, approxrand, f_measure, log_likelihood, precision, recall, ) from nltk.metrics.segmentation import ghd, pk, windowdiff from nltk.metrics.spearman import ( ranks_from_scores, ranks_from_sequence, spearman_correlation, ) nltk-3.7/nltk/metrics/agreement.py000066400000000000000000000371411420073152400172760ustar00rootroot00000000000000# Natural Language Toolkit: Agreement Metrics # # Copyright (C) 2001-2022 NLTK Project # Author: Tom Lippincott # URL: # For license information, see LICENSE.TXT # """ Implementations of inter-annotator agreement coefficients surveyed by Artstein and Poesio (2007), Inter-Coder Agreement for Computational Linguistics. An agreement coefficient calculates the amount that annotators agreed on label assignments beyond what is expected by chance. In defining the AnnotationTask class, we use naming conventions similar to the paper's terminology. There are three types of objects in an annotation task: the coders (variables "c" and "C") the items to be annotated (variables "i" and "I") the potential categories to be assigned (variables "k" and "K") Additionally, it is often the case that we don't want to treat two different labels as complete disagreement, and so the AnnotationTask constructor can also take a distance metric as a final argument. 
Distance metrics are simply functions that take two arguments, and return a value between 0.0 and 1.0 indicating the distance between them. If not supplied, the default is binary comparison between the arguments. The simplest way to initialize an AnnotationTask is with a list of triples, each containing a coder's assignment for one object in the task: task = AnnotationTask(data=[('c1', '1', 'v1'),('c2', '1', 'v1'),...]) Note that the data list needs to contain the same number of triples for each individual coder, containing category values for the same set of items. Alpha (Krippendorff 1980) Kappa (Cohen 1960) S (Bennet, Albert and Goldstein 1954) Pi (Scott 1955) TODO: Describe handling of multiple coders and missing data Expected results from the Artstein and Poesio survey paper: >>> from nltk.metrics.agreement import AnnotationTask >>> import os.path >>> t = AnnotationTask(data=[x.split() for x in open(os.path.join(os.path.dirname(__file__), "artstein_poesio_example.txt"))]) >>> t.avg_Ao() 0.88 >>> t.pi() 0.7995322418977615... >>> t.S() 0.8199999999999998... This would have returned a wrong value (0.0) in @785fb79 as coders are in the wrong order. Subsequently, all values for pi(), S(), and kappa() would have been wrong as they are computed with avg_Ao(). >>> t2 = AnnotationTask(data=[('b','1','stat'),('a','1','stat')]) >>> t2.avg_Ao() 1.0 The following, of course, also works. >>> t3 = AnnotationTask(data=[('a','1','othr'),('b','1','othr')]) >>> t3.avg_Ao() 1.0 """ import logging from itertools import groupby from operator import itemgetter from nltk.internals import deprecated from nltk.metrics.distance import binary_distance from nltk.probability import ConditionalFreqDist, FreqDist log = logging.getLogger(__name__) class AnnotationTask: """Represents an annotation task, i.e. people assign labels to items. Notation tries to match notation in Artstein and Poesio (2007). In general, coders and items can be represented as any hashable object. Integers, for example, are fine, though strings are more readable. Labels must support the distance functions applied to them, so e.g. a string-edit-distance makes no sense if your labels are integers, whereas interval distance needs numeric values. A notable case of this is the MASI metric, which requires Python sets. """ def __init__(self, data=None, distance=binary_distance): """Initialize an annotation task. The data argument can be None (to create an empty annotation task) or a sequence of 3-tuples, each representing a coder's labeling of an item: ``(coder,item,label)`` The distance argument is a function taking two arguments (labels) and producing a numerical distance. The distance from a label to itself should be zero: ``distance(l,l) = 0`` """ self.distance = distance self.I = set() self.K = set() self.C = set() self.data = [] if data is not None: self.load_array(data) def __str__(self): return "\r\n".join( map( lambda x: "%s\t%s\t%s" % (x["coder"], x["item"].replace("_", "\t"), ",".join(x["labels"])), self.data, ) ) def load_array(self, array): """Load an sequence of annotation results, appending to any data already loaded. 
The argument is a sequence of 3-tuples, each representing a coder's labeling of an item: (coder,item,label) """ for coder, item, labels in array: self.C.add(coder) self.K.add(labels) self.I.add(item) self.data.append({"coder": coder, "labels": labels, "item": item}) def agr(self, cA, cB, i, data=None): """Agreement between two coders on a given item""" data = data or self.data # cfedermann: we don't know what combination of coder/item will come # first in x; to avoid StopIteration problems due to assuming an order # cA,cB, we allow either for k1 and then look up the missing as k2. k1 = next(x for x in data if x["coder"] in (cA, cB) and x["item"] == i) if k1["coder"] == cA: k2 = next(x for x in data if x["coder"] == cB and x["item"] == i) else: k2 = next(x for x in data if x["coder"] == cA and x["item"] == i) ret = 1.0 - float(self.distance(k1["labels"], k2["labels"])) log.debug("Observed agreement between %s and %s on %s: %f", cA, cB, i, ret) log.debug( 'Distance between "%r" and "%r": %f', k1["labels"], k2["labels"], 1.0 - ret ) return ret def Nk(self, k): return float(sum(1 for x in self.data if x["labels"] == k)) def Nik(self, i, k): return float(sum(1 for x in self.data if x["item"] == i and x["labels"] == k)) def Nck(self, c, k): return float(sum(1 for x in self.data if x["coder"] == c and x["labels"] == k)) @deprecated("Use Nk, Nik or Nck instead") def N(self, k=None, i=None, c=None): """Implements the "n-notation" used in Artstein and Poesio (2007)""" if k is not None and i is None and c is None: ret = self.Nk(k) elif k is not None and i is not None and c is None: ret = self.Nik(i, k) elif k is not None and c is not None and i is None: ret = self.Nck(c, k) else: raise ValueError( f"You must pass either i or c, not both! (k={k!r},i={i!r},c={c!r})" ) log.debug("Count on N[%s,%s,%s]: %d", k, i, c, ret) return ret def _grouped_data(self, field, data=None): data = data or self.data return groupby(sorted(data, key=itemgetter(field)), itemgetter(field)) def Ao(self, cA, cB): """Observed agreement between two coders on all items.""" data = self._grouped_data( "item", (x for x in self.data if x["coder"] in (cA, cB)) ) ret = sum(self.agr(cA, cB, item, item_data) for item, item_data in data) / len( self.I ) log.debug("Observed agreement between %s and %s: %f", cA, cB, ret) return ret def _pairwise_average(self, function): """ Calculates the average of function results for each coder pair """ total = 0 n = 0 s = self.C.copy() for cA in self.C: s.remove(cA) for cB in s: total += function(cA, cB) n += 1 ret = total / n return ret def avg_Ao(self): """Average observed agreement across all coders and items.""" ret = self._pairwise_average(self.Ao) log.debug("Average observed agreement: %f", ret) return ret def Do_Kw_pairwise(self, cA, cB, max_distance=1.0): """The observed disagreement for the weighted kappa coefficient.""" total = 0.0 data = (x for x in self.data if x["coder"] in (cA, cB)) for i, itemdata in self._grouped_data("item", data): # we should have two items; distance doesn't care which comes first total += self.distance(next(itemdata)["labels"], next(itemdata)["labels"]) ret = total / (len(self.I) * max_distance) log.debug("Observed disagreement between %s and %s: %f", cA, cB, ret) return ret def Do_Kw(self, max_distance=1.0): """Averaged over all labelers""" ret = self._pairwise_average( lambda cA, cB: self.Do_Kw_pairwise(cA, cB, max_distance) ) log.debug("Observed disagreement: %f", ret) return ret # Agreement Coefficients def S(self): """Bennett, Albert and Goldstein 1954""" Ae = 1.0 
/ len(self.K) ret = (self.avg_Ao() - Ae) / (1.0 - Ae) return ret def pi(self): """Scott 1955; here, multi-pi. Equivalent to K from Siegel and Castellan (1988). """ total = 0.0 label_freqs = FreqDist(x["labels"] for x in self.data) for k, f in label_freqs.items(): total += f ** 2 Ae = total / ((len(self.I) * len(self.C)) ** 2) return (self.avg_Ao() - Ae) / (1 - Ae) def Ae_kappa(self, cA, cB): Ae = 0.0 nitems = float(len(self.I)) label_freqs = ConditionalFreqDist((x["labels"], x["coder"]) for x in self.data) for k in label_freqs.conditions(): Ae += (label_freqs[k][cA] / nitems) * (label_freqs[k][cB] / nitems) return Ae def kappa_pairwise(self, cA, cB): """ """ Ae = self.Ae_kappa(cA, cB) ret = (self.Ao(cA, cB) - Ae) / (1.0 - Ae) log.debug("Expected agreement between %s and %s: %f", cA, cB, Ae) return ret def kappa(self): """Cohen 1960 Averages naively over kappas for each coder pair. """ return self._pairwise_average(self.kappa_pairwise) def multi_kappa(self): """Davies and Fleiss 1982 Averages over observed and expected agreements for each coder pair. """ Ae = self._pairwise_average(self.Ae_kappa) return (self.avg_Ao() - Ae) / (1.0 - Ae) def Disagreement(self, label_freqs): total_labels = sum(label_freqs.values()) pairs = 0.0 for j, nj in label_freqs.items(): for l, nl in label_freqs.items(): pairs += float(nj * nl) * self.distance(l, j) return 1.0 * pairs / (total_labels * (total_labels - 1)) def alpha(self): """Krippendorff 1980""" # check for degenerate cases if len(self.K) == 0: raise ValueError("Cannot calculate alpha, no data present!") if len(self.K) == 1: log.debug("Only one annotation value, alpha returning 1.") return 1 if len(self.C) == 1 and len(self.I) == 1: raise ValueError("Cannot calculate alpha, only one coder and item present!") total_disagreement = 0.0 total_ratings = 0 all_valid_labels_freq = FreqDist([]) total_do = 0.0 # Total observed disagreement for all items. for i, itemdata in self._grouped_data("item"): label_freqs = FreqDist(x["labels"] for x in itemdata) labels_count = sum(label_freqs.values()) if labels_count < 2: # Ignore the item. continue all_valid_labels_freq += label_freqs total_do += self.Disagreement(label_freqs) * labels_count do = total_do / sum(all_valid_labels_freq.values()) de = self.Disagreement(all_valid_labels_freq) # Expected disagreement. 
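        # (Descriptive comment added for clarity: Krippendorff's alpha below is
        # 1 - D_o / D_e, observed disagreement over the disagreement expected by
        # chance; perfect agreement gives D_o = 0 and hence alpha = 1, while
        # chance-level labelling makes D_o approach D_e and alpha approach 0.)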
k_alpha = 1.0 - do / de return k_alpha def weighted_kappa_pairwise(self, cA, cB, max_distance=1.0): """Cohen 1968""" total = 0.0 label_freqs = ConditionalFreqDist( (x["coder"], x["labels"]) for x in self.data if x["coder"] in (cA, cB) ) for j in self.K: for l in self.K: total += label_freqs[cA][j] * label_freqs[cB][l] * self.distance(j, l) De = total / (max_distance * pow(len(self.I), 2)) log.debug("Expected disagreement between %s and %s: %f", cA, cB, De) Do = self.Do_Kw_pairwise(cA, cB) ret = 1.0 - (Do / De) return ret def weighted_kappa(self, max_distance=1.0): """Cohen 1968""" return self._pairwise_average( lambda cA, cB: self.weighted_kappa_pairwise(cA, cB, max_distance) ) if __name__ == "__main__": import optparse import re from nltk.metrics import distance # process command-line arguments parser = optparse.OptionParser() parser.add_option( "-d", "--distance", dest="distance", default="binary_distance", help="distance metric to use", ) parser.add_option( "-a", "--agreement", dest="agreement", default="kappa", help="agreement coefficient to calculate", ) parser.add_option( "-e", "--exclude", dest="exclude", action="append", default=[], help="coder names to exclude (may be specified multiple times)", ) parser.add_option( "-i", "--include", dest="include", action="append", default=[], help="coder names to include, same format as exclude", ) parser.add_option( "-f", "--file", dest="file", help="file to read labelings from, each line with three columns: 'labeler item labels'", ) parser.add_option( "-v", "--verbose", dest="verbose", default="0", help="how much debugging to print on stderr (0-4)", ) parser.add_option( "-c", "--columnsep", dest="columnsep", default="\t", help="char/string that separates the three columns in the file, defaults to tab", ) parser.add_option( "-l", "--labelsep", dest="labelsep", default=",", help="char/string that separates labels (if labelers can assign more than one), defaults to comma", ) parser.add_option( "-p", "--presence", dest="presence", default=None, help="convert each labeling into 1 or 0, based on presence of LABEL", ) parser.add_option( "-T", "--thorough", dest="thorough", default=False, action="store_true", help="calculate agreement for every subset of the annotators", ) (options, remainder) = parser.parse_args() if not options.file: parser.print_help() exit() logging.basicConfig(level=50 - 10 * int(options.verbose)) # read in data from the specified file data = [] with open(options.file) as infile: for l in infile: toks = l.split(options.columnsep) coder, object_, labels = ( toks[0], str(toks[1:-1]), frozenset(toks[-1].strip().split(options.labelsep)), ) if ( (options.include == options.exclude) or (len(options.include) > 0 and coder in options.include) or (len(options.exclude) > 0 and coder not in options.exclude) ): data.append((coder, object_, labels)) if options.presence: task = AnnotationTask( data, getattr(distance, options.distance)(options.presence) ) else: task = AnnotationTask(data, getattr(distance, options.distance)) if options.thorough: pass else: print(getattr(task, options.agreement)()) logging.shutdown() nltk-3.7/nltk/metrics/aline.py000066400000000000000000000753541420073152400164270ustar00rootroot00000000000000# Natural Language Toolkit: ALINE # # Copyright (C) 2001-2022 NLTK Project # Author: Greg Kondrak # Geoff Bacon (Python port) # URL: # For license information, see LICENSE.TXT """ ALINE https://webdocs.cs.ualberta.ca/~kondrak/ Copyright 2002 by Grzegorz Kondrak. 
ALINE is an algorithm for aligning phonetic sequences, described in [1]. This module is a port of Kondrak's (2002) ALINE. It provides functions for phonetic sequence alignment and similarity analysis. These are useful in historical linguistics, sociolinguistics and synchronic phonology. ALINE has parameters that can be tuned for desired output. These parameters are: - C_skip, C_sub, C_exp, C_vwl - Salience weights - Segmental features In this implementation, some parameters have been changed from their default values as described in [1], in order to replicate published results. All changes are noted in comments. Example usage ------------- # Get optimal alignment of two phonetic sequences >>> align('θin', 'tenwis') # doctest: +SKIP [[('θ', 't'), ('i', 'e'), ('n', 'n'), ('-', 'w'), ('-', 'i'), ('-', 's')]] [1] G. Kondrak. Algorithms for Language Reconstruction. PhD dissertation, University of Toronto. """ try: import numpy as np except ImportError: np = None # === Constants === inf = float("inf") # Default values for maximum similarity scores (Kondrak 2002: 54) C_skip = -10 # Indels C_sub = 35 # Substitutions C_exp = 45 # Expansions/compressions C_vwl = 5 # Vowel/consonant relative weight (decreased from 10) consonants = [ "B", "N", "R", "b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "x", "z", "ç", "ð", "ħ", "ŋ", "ɖ", "ɟ", "ɢ", "ɣ", "ɦ", "ɬ", "ɮ", "ɰ", "ɱ", "ɲ", "ɳ", "ɴ", "ɸ", "ɹ", "ɻ", "ɽ", "ɾ", "ʀ", "ʁ", "ʂ", "ʃ", "ʈ", "ʋ", "ʐ ", "ʒ", "ʔ", "ʕ", "ʙ", "ʝ", "β", "θ", "χ", "ʐ", "w", ] # Relevant features for comparing consonants and vowels R_c = [ "aspirated", "lateral", "manner", "nasal", "place", "retroflex", "syllabic", "voice", ] # 'high' taken out of R_v because same as manner R_v = [ "back", "lateral", "long", "manner", "nasal", "place", "retroflex", "round", "syllabic", "voice", ] # Flattened feature matrix (Kondrak 2002: 56) similarity_matrix = { # place "bilabial": 1.0, "labiodental": 0.95, "dental": 0.9, "alveolar": 0.85, "retroflex": 0.8, "palato-alveolar": 0.75, "palatal": 0.7, "velar": 0.6, "uvular": 0.5, "pharyngeal": 0.3, "glottal": 0.1, "labiovelar": 1.0, "vowel": -1.0, # added 'vowel' # manner "stop": 1.0, "affricate": 0.9, "fricative": 0.85, # increased fricative from 0.8 "trill": 0.7, "tap": 0.65, "approximant": 0.6, "high vowel": 0.4, "mid vowel": 0.2, "low vowel": 0.0, "vowel2": 0.5, # added vowel # high "high": 1.0, "mid": 0.5, "low": 0.0, # back "front": 1.0, "central": 0.5, "back": 0.0, # binary features "plus": 1.0, "minus": 0.0, } # Relative weights of phonetic features (Kondrak 2002: 55) salience = { "syllabic": 5, "place": 40, "manner": 50, "voice": 5, # decreased from 10 "nasal": 20, # increased from 10 "retroflex": 10, "lateral": 10, "aspirated": 5, "long": 0, # decreased from 1 "high": 3, # decreased from 5 "back": 2, # decreased from 5 "round": 2, # decreased from 5 } # (Kondrak 2002: 59-60) feature_matrix = { # Consonants "p": { "place": "bilabial", "manner": "stop", "syllabic": "minus", "voice": "minus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "aspirated": "minus", }, "b": { "place": "bilabial", "manner": "stop", "syllabic": "minus", "voice": "plus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "aspirated": "minus", }, "t": { "place": "alveolar", "manner": "stop", "syllabic": "minus", "voice": "minus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "aspirated": "minus", }, "d": { "place": "alveolar", "manner": "stop", "syllabic": "minus", "voice": "plus", "nasal": "minus", 
"retroflex": "minus", "lateral": "minus", "aspirated": "minus", }, "ʈ": { "place": "retroflex", "manner": "stop", "syllabic": "minus", "voice": "minus", "nasal": "minus", "retroflex": "plus", "lateral": "minus", "aspirated": "minus", }, "ɖ": { "place": "retroflex", "manner": "stop", "syllabic": "minus", "voice": "plus", "nasal": "minus", "retroflex": "plus", "lateral": "minus", "aspirated": "minus", }, "c": { "place": "palatal", "manner": "stop", "syllabic": "minus", "voice": "minus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "aspirated": "minus", }, "ɟ": { "place": "palatal", "manner": "stop", "syllabic": "minus", "voice": "plus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "aspirated": "minus", }, "k": { "place": "velar", "manner": "stop", "syllabic": "minus", "voice": "minus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "aspirated": "minus", }, "g": { "place": "velar", "manner": "stop", "syllabic": "minus", "voice": "plus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "aspirated": "minus", }, "q": { "place": "uvular", "manner": "stop", "syllabic": "minus", "voice": "minus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "aspirated": "minus", }, "ɢ": { "place": "uvular", "manner": "stop", "syllabic": "minus", "voice": "plus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "aspirated": "minus", }, "ʔ": { "place": "glottal", "manner": "stop", "syllabic": "minus", "voice": "minus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "aspirated": "minus", }, "m": { "place": "bilabial", "manner": "stop", "syllabic": "minus", "voice": "plus", "nasal": "plus", "retroflex": "minus", "lateral": "minus", "aspirated": "minus", }, "ɱ": { "place": "labiodental", "manner": "stop", "syllabic": "minus", "voice": "plus", "nasal": "plus", "retroflex": "minus", "lateral": "minus", "aspirated": "minus", }, "n": { "place": "alveolar", "manner": "stop", "syllabic": "minus", "voice": "plus", "nasal": "plus", "retroflex": "minus", "lateral": "minus", "aspirated": "minus", }, "ɳ": { "place": "retroflex", "manner": "stop", "syllabic": "minus", "voice": "plus", "nasal": "plus", "retroflex": "plus", "lateral": "minus", "aspirated": "minus", }, "ɲ": { "place": "palatal", "manner": "stop", "syllabic": "minus", "voice": "plus", "nasal": "plus", "retroflex": "minus", "lateral": "minus", "aspirated": "minus", }, "ŋ": { "place": "velar", "manner": "stop", "syllabic": "minus", "voice": "plus", "nasal": "plus", "retroflex": "minus", "lateral": "minus", "aspirated": "minus", }, "ɴ": { "place": "uvular", "manner": "stop", "syllabic": "minus", "voice": "plus", "nasal": "plus", "retroflex": "minus", "lateral": "minus", "aspirated": "minus", }, "N": { "place": "uvular", "manner": "stop", "syllabic": "minus", "voice": "plus", "nasal": "plus", "retroflex": "minus", "lateral": "minus", "aspirated": "minus", }, "ʙ": { "place": "bilabial", "manner": "trill", "syllabic": "minus", "voice": "plus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "aspirated": "minus", }, "B": { "place": "bilabial", "manner": "trill", "syllabic": "minus", "voice": "plus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "aspirated": "minus", }, "r": { "place": "alveolar", "manner": "trill", "syllabic": "minus", "voice": "plus", "nasal": "minus", "retroflex": "plus", "lateral": "minus", "aspirated": "minus", }, "ʀ": { "place": "uvular", "manner": "trill", "syllabic": "minus", "voice": "plus", "nasal": "minus", "retroflex": "minus", 
"lateral": "minus", "aspirated": "minus", }, "R": { "place": "uvular", "manner": "trill", "syllabic": "minus", "voice": "plus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "aspirated": "minus", }, "ɾ": { "place": "alveolar", "manner": "tap", "syllabic": "minus", "voice": "plus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "aspirated": "minus", }, "ɽ": { "place": "retroflex", "manner": "tap", "syllabic": "minus", "voice": "plus", "nasal": "minus", "retroflex": "plus", "lateral": "minus", "aspirated": "minus", }, "ɸ": { "place": "bilabial", "manner": "fricative", "syllabic": "minus", "voice": "minus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "aspirated": "minus", }, "β": { "place": "bilabial", "manner": "fricative", "syllabic": "minus", "voice": "plus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "aspirated": "minus", }, "f": { "place": "labiodental", "manner": "fricative", "syllabic": "minus", "voice": "minus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "aspirated": "minus", }, "v": { "place": "labiodental", "manner": "fricative", "syllabic": "minus", "voice": "plus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "aspirated": "minus", }, "θ": { "place": "dental", "manner": "fricative", "syllabic": "minus", "voice": "minus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "aspirated": "minus", }, "ð": { "place": "dental", "manner": "fricative", "syllabic": "minus", "voice": "plus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "aspirated": "minus", }, "s": { "place": "alveolar", "manner": "fricative", "syllabic": "minus", "voice": "minus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "aspirated": "minus", }, "z": { "place": "alveolar", "manner": "fricative", "syllabic": "minus", "voice": "plus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "aspirated": "minus", }, "ʃ": { "place": "palato-alveolar", "manner": "fricative", "syllabic": "minus", "voice": "minus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "aspirated": "minus", }, "ʒ": { "place": "palato-alveolar", "manner": "fricative", "syllabic": "minus", "voice": "plus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "aspirated": "minus", }, "ʂ": { "place": "retroflex", "manner": "fricative", "syllabic": "minus", "voice": "minus", "nasal": "minus", "retroflex": "plus", "lateral": "minus", "aspirated": "minus", }, "ʐ": { "place": "retroflex", "manner": "fricative", "syllabic": "minus", "voice": "plus", "nasal": "minus", "retroflex": "plus", "lateral": "minus", "aspirated": "minus", }, "ç": { "place": "palatal", "manner": "fricative", "syllabic": "minus", "voice": "minus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "aspirated": "minus", }, "ʝ": { "place": "palatal", "manner": "fricative", "syllabic": "minus", "voice": "plus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "aspirated": "minus", }, "x": { "place": "velar", "manner": "fricative", "syllabic": "minus", "voice": "minus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "aspirated": "minus", }, "ɣ": { "place": "velar", "manner": "fricative", "syllabic": "minus", "voice": "plus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "aspirated": "minus", }, "χ": { "place": "uvular", "manner": "fricative", "syllabic": "minus", "voice": "minus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "aspirated": "minus", }, "ʁ": { "place": "uvular", 
"manner": "fricative", "syllabic": "minus", "voice": "plus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "aspirated": "minus", }, "ħ": { "place": "pharyngeal", "manner": "fricative", "syllabic": "minus", "voice": "minus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "aspirated": "minus", }, "ʕ": { "place": "pharyngeal", "manner": "fricative", "syllabic": "minus", "voice": "plus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "aspirated": "minus", }, "h": { "place": "glottal", "manner": "fricative", "syllabic": "minus", "voice": "minus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "aspirated": "minus", }, "ɦ": { "place": "glottal", "manner": "fricative", "syllabic": "minus", "voice": "plus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "aspirated": "minus", }, "ɬ": { "place": "alveolar", "manner": "fricative", "syllabic": "minus", "voice": "minus", "nasal": "minus", "retroflex": "minus", "lateral": "plus", "aspirated": "minus", }, "ɮ": { "place": "alveolar", "manner": "fricative", "syllabic": "minus", "voice": "plus", "nasal": "minus", "retroflex": "minus", "lateral": "plus", "aspirated": "minus", }, "ʋ": { "place": "labiodental", "manner": "approximant", "syllabic": "minus", "voice": "plus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "aspirated": "minus", }, "ɹ": { "place": "alveolar", "manner": "approximant", "syllabic": "minus", "voice": "plus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "aspirated": "minus", }, "ɻ": { "place": "retroflex", "manner": "approximant", "syllabic": "minus", "voice": "plus", "nasal": "minus", "retroflex": "plus", "lateral": "minus", "aspirated": "minus", }, "j": { "place": "palatal", "manner": "approximant", "syllabic": "minus", "voice": "plus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "aspirated": "minus", }, "ɰ": { "place": "velar", "manner": "approximant", "syllabic": "minus", "voice": "plus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "aspirated": "minus", }, "l": { "place": "alveolar", "manner": "approximant", "syllabic": "minus", "voice": "plus", "nasal": "minus", "retroflex": "minus", "lateral": "plus", "aspirated": "minus", }, "w": { "place": "labiovelar", "manner": "approximant", "syllabic": "minus", "voice": "plus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "aspirated": "minus", }, # Vowels "i": { "place": "vowel", "manner": "vowel2", "syllabic": "plus", "voice": "plus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "high": "high", "back": "front", "round": "minus", "long": "minus", "aspirated": "minus", }, "y": { "place": "vowel", "manner": "vowel2", "syllabic": "plus", "voice": "plus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "high": "high", "back": "front", "round": "plus", "long": "minus", "aspirated": "minus", }, "e": { "place": "vowel", "manner": "vowel2", "syllabic": "plus", "voice": "plus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "high": "mid", "back": "front", "round": "minus", "long": "minus", "aspirated": "minus", }, "E": { "place": "vowel", "manner": "vowel2", "syllabic": "plus", "voice": "plus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "high": "mid", "back": "front", "round": "minus", "long": "plus", "aspirated": "minus", }, "ø": { "place": "vowel", "manner": "vowel2", "syllabic": "plus", "voice": "plus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "high": "mid", "back": "front", "round": 
"plus", "long": "minus", "aspirated": "minus", }, "ɛ": { "place": "vowel", "manner": "vowel2", "syllabic": "plus", "voice": "plus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "high": "mid", "back": "front", "round": "minus", "long": "minus", "aspirated": "minus", }, "œ": { "place": "vowel", "manner": "vowel2", "syllabic": "plus", "voice": "plus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "high": "mid", "back": "front", "round": "plus", "long": "minus", "aspirated": "minus", }, "æ": { "place": "vowel", "manner": "vowel2", "syllabic": "plus", "voice": "plus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "high": "low", "back": "front", "round": "minus", "long": "minus", "aspirated": "minus", }, "a": { "place": "vowel", "manner": "vowel2", "syllabic": "plus", "voice": "plus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "high": "low", "back": "front", "round": "minus", "long": "minus", "aspirated": "minus", }, "A": { "place": "vowel", "manner": "vowel2", "syllabic": "plus", "voice": "plus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "high": "low", "back": "front", "round": "minus", "long": "plus", "aspirated": "minus", }, "ɨ": { "place": "vowel", "manner": "vowel2", "syllabic": "plus", "voice": "plus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "high": "high", "back": "central", "round": "minus", "long": "minus", "aspirated": "minus", }, "ʉ": { "place": "vowel", "manner": "vowel2", "syllabic": "plus", "voice": "plus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "high": "high", "back": "central", "round": "plus", "long": "minus", "aspirated": "minus", }, "ə": { "place": "vowel", "manner": "vowel2", "syllabic": "plus", "voice": "plus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "high": "mid", "back": "central", "round": "minus", "long": "minus", "aspirated": "minus", }, "u": { "place": "vowel", "manner": "vowel2", "syllabic": "plus", "voice": "plus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "high": "high", "back": "back", "round": "plus", "long": "minus", "aspirated": "minus", }, "U": { "place": "vowel", "manner": "vowel2", "syllabic": "plus", "voice": "plus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "high": "high", "back": "back", "round": "plus", "long": "plus", "aspirated": "minus", }, "o": { "place": "vowel", "manner": "vowel2", "syllabic": "plus", "voice": "plus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "high": "mid", "back": "back", "round": "plus", "long": "minus", "aspirated": "minus", }, "O": { "place": "vowel", "manner": "vowel2", "syllabic": "plus", "voice": "plus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "high": "mid", "back": "back", "round": "plus", "long": "plus", "aspirated": "minus", }, "ɔ": { "place": "vowel", "manner": "vowel2", "syllabic": "plus", "voice": "plus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "high": "mid", "back": "back", "round": "plus", "long": "minus", "aspirated": "minus", }, "ɒ": { "place": "vowel", "manner": "vowel2", "syllabic": "plus", "voice": "plus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "high": "low", "back": "back", "round": "minus", "long": "minus", "aspirated": "minus", }, "I": { "place": "vowel", "manner": "vowel2", "syllabic": "plus", "voice": "plus", "nasal": "minus", "retroflex": "minus", "lateral": "minus", "high": "high", "back": "front", "round": "minus", "long": "plus", "aspirated": "minus", }, 
} # === Algorithm === def align(str1, str2, epsilon=0): """ Compute the alignment of two phonetic strings. :type str1, str2: str :param str1, str2: Two strings to be aligned :type epsilon: float (0.0 to 1.0) :param epsilon: Adjusts threshold similarity score for near-optimal alignments :rtype: list(list(tuple(str, str))) :return: Alignment(s) of str1 and str2 (Kondrak 2002: 51) """ if np is None: raise ImportError("You need numpy in order to use the align function") assert 0.0 <= epsilon <= 1.0, "Epsilon must be between 0.0 and 1.0." m = len(str1) n = len(str2) # This includes Kondrak's initialization of row 0 and column 0 to all 0s. S = np.zeros((m + 1, n + 1), dtype=float) # If i <= 1 or j <= 1, don't allow expansions as it doesn't make sense, # and breaks array and string indices. Make sure they never get chosen # by setting them to -inf. for i in range(1, m + 1): for j in range(1, n + 1): edit1 = S[i - 1, j] + sigma_skip(str1[i - 1]) edit2 = S[i, j - 1] + sigma_skip(str2[j - 1]) edit3 = S[i - 1, j - 1] + sigma_sub(str1[i - 1], str2[j - 1]) if i > 1: edit4 = S[i - 2, j - 1] + sigma_exp(str2[j - 1], str1[i - 2 : i]) else: edit4 = -inf if j > 1: edit5 = S[i - 1, j - 2] + sigma_exp(str1[i - 1], str2[j - 2 : j]) else: edit5 = -inf S[i, j] = max(edit1, edit2, edit3, edit4, edit5, 0) T = (1 - epsilon) * np.amax(S) # Threshold score for near-optimal alignments alignments = [] for i in range(1, m + 1): for j in range(1, n + 1): if S[i, j] >= T: alignments.append(_retrieve(i, j, 0, S, T, str1, str2, [])) return alignments def _retrieve(i, j, s, S, T, str1, str2, out): """ Retrieve the path through the similarity matrix S starting at (i, j). :rtype: list(tuple(str, str)) :return: Alignment of str1 and str2 """ if S[i, j] == 0: return out else: if j > 1 and S[i - 1, j - 2] + sigma_exp(str1[i - 1], str2[j - 2 : j]) + s >= T: out.insert(0, (str1[i - 1], str2[j - 2 : j])) _retrieve( i - 1, j - 2, s + sigma_exp(str1[i - 1], str2[j - 2 : j]), S, T, str1, str2, out, ) elif ( i > 1 and S[i - 2, j - 1] + sigma_exp(str2[j - 1], str1[i - 2 : i]) + s >= T ): out.insert(0, (str1[i - 2 : i], str2[j - 1])) _retrieve( i - 2, j - 1, s + sigma_exp(str2[j - 1], str1[i - 2 : i]), S, T, str1, str2, out, ) elif S[i, j - 1] + sigma_skip(str2[j - 1]) + s >= T: out.insert(0, ("-", str2[j - 1])) _retrieve(i, j - 1, s + sigma_skip(str2[j - 1]), S, T, str1, str2, out) elif S[i - 1, j] + sigma_skip(str1[i - 1]) + s >= T: out.insert(0, (str1[i - 1], "-")) _retrieve(i - 1, j, s + sigma_skip(str1[i - 1]), S, T, str1, str2, out) elif S[i - 1, j - 1] + sigma_sub(str1[i - 1], str2[j - 1]) + s >= T: out.insert(0, (str1[i - 1], str2[j - 1])) _retrieve( i - 1, j - 1, s + sigma_sub(str1[i - 1], str2[j - 1]), S, T, str1, str2, out, ) return out def sigma_skip(p): """ Returns score of an indel of P. (Kondrak 2002: 54) """ return C_skip def sigma_sub(p, q): """ Returns score of a substitution of P with Q. (Kondrak 2002: 54) """ return C_sub - delta(p, q) - V(p) - V(q) def sigma_exp(p, q): """ Returns score of an expansion/compression. (Kondrak 2002: 54) """ q1 = q[0] q2 = q[1] return C_exp - delta(p, q1) - delta(p, q2) - V(p) - max(V(q1), V(q2)) def delta(p, q): """ Return weighted sum of difference between P and Q. (Kondrak 2002: 54) """ features = R(p, q) total = 0 for f in features: total += diff(p, q, f) * salience[f] return total def diff(p, q, f): """ Returns difference between phonetic segments P and Q for feature F. 
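    For example, 't' and 'd' differ only in the binary 'voice' feature,
    which gives the maximal per-feature difference:

    >>> diff('t', 'd', 'voice') # doctest: +SKIP
    1.0
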
(Kondrak 2002: 52, 54) """ p_features, q_features = feature_matrix[p], feature_matrix[q] return abs(similarity_matrix[p_features[f]] - similarity_matrix[q_features[f]]) def R(p, q): """ Return relevant features for segment comparison. (Kondrak 2002: 54) """ if p in consonants or q in consonants: return R_c return R_v def V(p): """ Return vowel weight if P is vowel. (Kondrak 2002: 54) """ if p in consonants: return 0 return C_vwl # === Test === def demo(): """ A demonstration of the result of aligning phonetic sequences used in Kondrak's (2002) dissertation. """ data = [pair.split(",") for pair in cognate_data.split("\n")] for pair in data: alignment = align(pair[0], pair[1])[0] alignment = [f"({a[0]}, {a[1]})" for a in alignment] alignment = " ".join(alignment) print(f"{pair[0]} ~ {pair[1]} : {alignment}") cognate_data = """jo,ʒə tu,ty nosotros,nu kjen,ki ke,kwa todos,tu una,ən dos,dø tres,trwa ombre,om arbol,arbrə pluma,plym kabeθa,kap boka,buʃ pje,pje koraθon,kœr ber,vwar benir,vənir deθir,dir pobre,povrə ðis,dIzes ðæt,das wat,vas nat,nixt loŋ,laŋ mæn,man fleʃ,flajʃ bləd,blyt feðər,fEdər hær,hAr ir,Or aj,awgə nowz,nAzə mawθ,munt təŋ,tsuŋə fut,fys nij,knI hænd,hant hart,herts livər,lEbər ænd,ante æt,ad blow,flAre ir,awris ijt,edere fiʃ,piʃkis flow,fluere staɾ,stella ful,plenus græs,gramen hart,kordis horn,korny aj,ego nij,genU məðər,mAter mawntən,mons nejm,nomen njuw,nowus wən,unus rawnd,rotundus sow,suere sit,sedere θrij,tres tuwθ,dentis θin,tenwis kinwawa,kenuaʔ nina,nenah napewa,napɛw wapimini,wapemen namesa,namɛʔs okimawa,okemaw ʃiʃipa,seʔsep ahkohkwa,ahkɛh pematesiweni,pematesewen asenja,aʔsɛn""" if __name__ == "__main__": demo() nltk-3.7/nltk/metrics/artstein_poesio_example.txt000066400000000000000000000037001420073152400224320ustar00rootroot00000000000000a 1 stat b 1 stat a 2 stat b 2 stat a 3 stat b 3 stat a 4 stat b 4 stat a 5 stat b 5 stat a 6 stat b 6 stat a 7 stat b 7 stat a 8 stat b 8 stat a 9 stat b 9 stat a 10 stat b 10 stat a 11 stat b 11 stat a 12 stat b 12 stat a 13 stat b 13 stat a 14 stat b 14 stat a 15 stat b 15 stat a 16 stat b 16 stat a 17 stat b 17 stat a 18 stat b 18 stat a 19 stat b 19 stat a 20 stat b 20 stat a 21 stat b 21 stat a 22 stat b 22 stat a 23 stat b 23 stat a 24 stat b 24 stat a 25 stat b 25 stat a 26 stat b 26 stat a 27 stat b 27 stat a 28 stat b 28 stat a 29 stat b 29 stat a 30 stat b 30 stat a 31 stat b 31 stat a 32 stat b 32 stat a 33 stat b 33 stat a 34 stat b 34 stat a 35 stat b 35 stat a 36 stat b 36 stat a 37 stat b 37 stat a 38 stat b 38 stat a 39 stat b 39 stat a 40 stat b 40 stat a 41 stat b 41 stat a 42 stat b 42 stat a 43 stat b 43 stat a 44 stat b 44 stat a 45 stat b 45 stat a 46 stat b 46 stat a 47 ireq b 47 stat a 48 ireq b 48 stat a 49 ireq b 49 stat a 50 ireq b 50 stat a 51 ireq b 51 stat a 52 ireq b 52 stat a 53 ireq b 53 ireq a 54 ireq b 54 ireq a 55 ireq b 55 ireq a 56 ireq b 56 ireq a 57 ireq b 57 ireq a 58 ireq b 58 ireq a 59 ireq b 59 ireq a 60 ireq b 60 ireq a 61 ireq b 61 ireq a 62 ireq b 62 ireq a 63 ireq b 63 ireq a 64 ireq b 64 ireq a 65 ireq b 65 ireq a 66 ireq b 66 ireq a 67 ireq b 67 ireq a 68 ireq b 68 ireq a 69 ireq b 69 ireq a 70 ireq b 70 ireq a 71 ireq b 71 ireq a 72 ireq b 72 ireq a 73 ireq b 73 ireq a 74 ireq b 74 ireq a 75 ireq b 75 ireq a 76 ireq b 76 ireq a 77 ireq b 77 ireq a 78 ireq b 78 ireq a 79 ireq b 79 ireq a 80 ireq b 80 ireq a 81 ireq b 81 ireq a 82 ireq b 82 ireq a 83 ireq b 83 ireq a 84 ireq b 84 ireq a 85 ireq b 85 chck a 86 ireq b 86 chck a 87 ireq b 87 chck a 88 ireq b 88 chck a 89 ireq b 89 chck 
a 90 ireq b 90 chck a 91 chck b 91 chck a 92 chck b 92 chck a 93 chck b 93 chck a 94 chck b 94 chck a 95 chck b 95 chck a 96 chck b 96 chck a 97 chck b 97 chck a 98 chck b 98 chck a 99 chck b 99 chck a 100 chck b 100 chck nltk-3.7/nltk/metrics/association.py000066400000000000000000000373371420073152400176520ustar00rootroot00000000000000# Natural Language Toolkit: Ngram Association Measures # # Copyright (C) 2001-2022 NLTK Project # Author: Joel Nothman # URL: # For license information, see LICENSE.TXT """ Provides scoring functions for a number of association measures through a generic, abstract implementation in ``NgramAssocMeasures``, and n-specific ``BigramAssocMeasures`` and ``TrigramAssocMeasures``. """ import math as _math from abc import ABCMeta, abstractmethod from functools import reduce _log2 = lambda x: _math.log2(x) _ln = _math.log _product = lambda s: reduce(lambda x, y: x * y, s) _SMALL = 1e-20 try: from scipy.stats import fisher_exact except ImportError: def fisher_exact(*_args, **_kwargs): raise NotImplementedError ### Indices to marginals arguments: NGRAM = 0 """Marginals index for the ngram count""" UNIGRAMS = -2 """Marginals index for a tuple of each unigram count""" TOTAL = -1 """Marginals index for the number of words in the data""" class NgramAssocMeasures(metaclass=ABCMeta): """ An abstract class defining a collection of generic association measures. Each public method returns a score, taking the following arguments:: score_fn(count_of_ngram, (count_of_n-1gram_1, ..., count_of_n-1gram_j), (count_of_n-2gram_1, ..., count_of_n-2gram_k), ..., (count_of_1gram_1, ..., count_of_1gram_n), count_of_total_words) See ``BigramAssocMeasures`` and ``TrigramAssocMeasures`` Inheriting classes should define a property _n, and a method _contingency which calculates contingency values from marginals in order for all association measures defined here to be usable. """ _n = 0 @staticmethod @abstractmethod def _contingency(*marginals): """Calculates values of a contingency table from marginal values.""" raise NotImplementedError( "The contingency table is not available" "in the general ngram case" ) @staticmethod @abstractmethod def _marginals(*contingency): """Calculates values of contingency table marginals from its values.""" raise NotImplementedError( "The contingency table is not available" "in the general ngram case" ) @classmethod def _expected_values(cls, cont): """Calculates expected values for a contingency table.""" n_all = sum(cont) bits = [1 << i for i in range(cls._n)] # For each contingency table cell for i in range(len(cont)): # Yield the expected value yield ( _product( sum(cont[x] for x in range(2 ** cls._n) if (x & j) == (i & j)) for j in bits ) / (n_all ** (cls._n - 1)) ) @staticmethod def raw_freq(*marginals): """Scores ngrams by their frequency""" return marginals[NGRAM] / marginals[TOTAL] @classmethod def student_t(cls, *marginals): """Scores ngrams using Student's t test with independence hypothesis for unigrams, as in Manning and Schutze 5.3.1. """ return ( marginals[NGRAM] - _product(marginals[UNIGRAMS]) / (marginals[TOTAL] ** (cls._n - 1)) ) / (marginals[NGRAM] + _SMALL) ** 0.5 @classmethod def chi_sq(cls, *marginals): """Scores ngrams using Pearson's chi-square as in Manning and Schutze 5.3.3. """ cont = cls._contingency(*marginals) exps = cls._expected_values(cont) return sum((obs - exp) ** 2 / (exp + _SMALL) for obs, exp in zip(cont, exps)) @staticmethod def mi_like(*marginals, **kwargs): """Scores ngrams using a variant of mutual information. 
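        The score is the n-gram count raised to ``power``, divided by the
        product of the unigram counts; e.g. for bigrams with the default
        power, n(w1 w2)**3 / (n(w1) * n(w2)).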
The keyword argument power sets an exponent (default 3) for the numerator. No logarithm of the result is calculated. """ return marginals[NGRAM] ** kwargs.get("power", 3) / _product( marginals[UNIGRAMS] ) @classmethod def pmi(cls, *marginals): """Scores ngrams by pointwise mutual information, as in Manning and Schutze 5.4. """ return _log2(marginals[NGRAM] * marginals[TOTAL] ** (cls._n - 1)) - _log2( _product(marginals[UNIGRAMS]) ) @classmethod def likelihood_ratio(cls, *marginals): """Scores ngrams using likelihood ratios as in Manning and Schutze 5.3.4.""" cont = cls._contingency(*marginals) return 2 * sum( obs * _ln(obs / (exp + _SMALL) + _SMALL) for obs, exp in zip(cont, cls._expected_values(cont)) ) @classmethod def poisson_stirling(cls, *marginals): """Scores ngrams using the Poisson-Stirling measure.""" exp = _product(marginals[UNIGRAMS]) / (marginals[TOTAL] ** (cls._n - 1)) return marginals[NGRAM] * (_log2(marginals[NGRAM] / exp) - 1) @classmethod def jaccard(cls, *marginals): """Scores ngrams using the Jaccard index.""" cont = cls._contingency(*marginals) return cont[0] / sum(cont[:-1]) class BigramAssocMeasures(NgramAssocMeasures): """ A collection of bigram association measures. Each association measure is provided as a function with three arguments:: bigram_score_fn(n_ii, (n_ix, n_xi), n_xx) The arguments constitute the marginals of a contingency table, counting the occurrences of particular events in a corpus. The letter i in the suffix refers to the appearance of the word in question, while x indicates the appearance of any word. Thus, for example: - n_ii counts ``(w1, w2)``, i.e. the bigram being scored - n_ix counts ``(w1, *)`` - n_xi counts ``(*, w2)`` - n_xx counts ``(*, *)``, i.e. any bigram This may be shown with respect to a contingency table:: w1 ~w1 ------ ------ w2 | n_ii | n_oi | = n_xi ------ ------ ~w2 | n_io | n_oo | ------ ------ = n_ix TOTAL = n_xx """ _n = 2 @staticmethod def _contingency(n_ii, n_ix_xi_tuple, n_xx): """Calculates values of a bigram contingency table from marginal values.""" (n_ix, n_xi) = n_ix_xi_tuple n_oi = n_xi - n_ii n_io = n_ix - n_ii return (n_ii, n_oi, n_io, n_xx - n_ii - n_oi - n_io) @staticmethod def _marginals(n_ii, n_oi, n_io, n_oo): """Calculates values of contingency table marginals from its values.""" return (n_ii, (n_oi + n_ii, n_io + n_ii), n_oo + n_oi + n_io + n_ii) @staticmethod def _expected_values(cont): """Calculates expected values for a contingency table.""" n_xx = sum(cont) # For each contingency table cell for i in range(4): yield (cont[i] + cont[i ^ 1]) * (cont[i] + cont[i ^ 2]) / n_xx @classmethod def phi_sq(cls, *marginals): """Scores bigrams using phi-square, the square of the Pearson correlation coefficient. """ n_ii, n_io, n_oi, n_oo = cls._contingency(*marginals) return (n_ii * n_oo - n_io * n_oi) ** 2 / ( (n_ii + n_io) * (n_ii + n_oi) * (n_io + n_oo) * (n_oi + n_oo) ) @classmethod def chi_sq(cls, n_ii, n_ix_xi_tuple, n_xx): """Scores bigrams using chi-square, i.e. phi-sq multiplied by the number of bigrams, as in Manning and Schutze 5.3.3. """ (n_ix, n_xi) = n_ix_xi_tuple return n_xx * cls.phi_sq(n_ii, (n_ix, n_xi), n_xx) @classmethod def fisher(cls, *marginals): """Scores bigrams using Fisher's Exact Test (Pedersen 1996). Less sensitive to small counts than PMI or Chi Sq, but also more expensive to compute. Requires scipy. 
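        For example, with 8 joint occurrences, marginal counts of 15 and 13,
        and 500 bigrams in total (illustrative counts only), the score is the
        one-sided p-value of the underlying 2x2 contingency table:

        >>> BigramAssocMeasures.fisher(8, (15, 13), 500) # doctest: +SKIP
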
""" n_ii, n_io, n_oi, n_oo = cls._contingency(*marginals) (odds, pvalue) = fisher_exact([[n_ii, n_io], [n_oi, n_oo]], alternative="less") return pvalue @staticmethod def dice(n_ii, n_ix_xi_tuple, n_xx): """Scores bigrams using Dice's coefficient.""" (n_ix, n_xi) = n_ix_xi_tuple return 2 * n_ii / (n_ix + n_xi) class TrigramAssocMeasures(NgramAssocMeasures): """ A collection of trigram association measures. Each association measure is provided as a function with four arguments:: trigram_score_fn(n_iii, (n_iix, n_ixi, n_xii), (n_ixx, n_xix, n_xxi), n_xxx) The arguments constitute the marginals of a contingency table, counting the occurrences of particular events in a corpus. The letter i in the suffix refers to the appearance of the word in question, while x indicates the appearance of any word. Thus, for example: - n_iii counts ``(w1, w2, w3)``, i.e. the trigram being scored - n_ixx counts ``(w1, *, *)`` - n_xxx counts ``(*, *, *)``, i.e. any trigram """ _n = 3 @staticmethod def _contingency(n_iii, n_iix_tuple, n_ixx_tuple, n_xxx): """Calculates values of a trigram contingency table (or cube) from marginal values. >>> TrigramAssocMeasures._contingency(1, (1, 1, 1), (1, 73, 1), 2000) (1, 0, 0, 0, 0, 72, 0, 1927) """ (n_iix, n_ixi, n_xii) = n_iix_tuple (n_ixx, n_xix, n_xxi) = n_ixx_tuple n_oii = n_xii - n_iii n_ioi = n_ixi - n_iii n_iio = n_iix - n_iii n_ooi = n_xxi - n_iii - n_oii - n_ioi n_oio = n_xix - n_iii - n_oii - n_iio n_ioo = n_ixx - n_iii - n_ioi - n_iio n_ooo = n_xxx - n_iii - n_oii - n_ioi - n_iio - n_ooi - n_oio - n_ioo return (n_iii, n_oii, n_ioi, n_ooi, n_iio, n_oio, n_ioo, n_ooo) @staticmethod def _marginals(*contingency): """Calculates values of contingency table marginals from its values. >>> TrigramAssocMeasures._marginals(1, 0, 0, 0, 0, 72, 0, 1927) (1, (1, 1, 1), (1, 73, 1), 2000) """ n_iii, n_oii, n_ioi, n_ooi, n_iio, n_oio, n_ioo, n_ooo = contingency return ( n_iii, (n_iii + n_iio, n_iii + n_ioi, n_iii + n_oii), ( n_iii + n_ioi + n_iio + n_ioo, n_iii + n_oii + n_iio + n_oio, n_iii + n_oii + n_ioi + n_ooi, ), sum(contingency), ) class QuadgramAssocMeasures(NgramAssocMeasures): """ A collection of quadgram association measures. Each association measure is provided as a function with five arguments:: trigram_score_fn(n_iiii, (n_iiix, n_iixi, n_ixii, n_xiii), (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix), (n_ixxx, n_xixx, n_xxix, n_xxxi), n_all) The arguments constitute the marginals of a contingency table, counting the occurrences of particular events in a corpus. The letter i in the suffix refers to the appearance of the word in question, while x indicates the appearance of any word. Thus, for example: - n_iiii counts ``(w1, w2, w3, w4)``, i.e. the quadgram being scored - n_ixxi counts ``(w1, *, *, w4)`` - n_xxxx counts ``(*, *, *, *)``, i.e. any quadgram """ _n = 4 @staticmethod def _contingency(n_iiii, n_iiix_tuple, n_iixx_tuple, n_ixxx_tuple, n_xxxx): """Calculates values of a quadgram contingency table from marginal values. 
""" (n_iiix, n_iixi, n_ixii, n_xiii) = n_iiix_tuple (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix) = n_iixx_tuple (n_ixxx, n_xixx, n_xxix, n_xxxi) = n_ixxx_tuple n_oiii = n_xiii - n_iiii n_ioii = n_ixii - n_iiii n_iioi = n_iixi - n_iiii n_ooii = n_xxii - n_iiii - n_oiii - n_ioii n_oioi = n_xixi - n_iiii - n_oiii - n_iioi n_iooi = n_ixxi - n_iiii - n_ioii - n_iioi n_oooi = n_xxxi - n_iiii - n_oiii - n_ioii - n_iioi - n_ooii - n_iooi - n_oioi n_iiio = n_iiix - n_iiii n_oiio = n_xiix - n_iiii - n_oiii - n_iiio n_ioio = n_ixix - n_iiii - n_ioii - n_iiio n_ooio = n_xxix - n_iiii - n_oiii - n_ioii - n_iiio - n_ooii - n_ioio - n_oiio n_iioo = n_iixx - n_iiii - n_iioi - n_iiio n_oioo = n_xixx - n_iiii - n_oiii - n_iioi - n_iiio - n_oioi - n_oiio - n_iioo n_iooo = n_ixxx - n_iiii - n_ioii - n_iioi - n_iiio - n_iooi - n_iioo - n_ioio n_oooo = ( n_xxxx - n_iiii - n_oiii - n_ioii - n_iioi - n_ooii - n_oioi - n_iooi - n_oooi - n_iiio - n_oiio - n_ioio - n_ooio - n_iioo - n_oioo - n_iooo ) return ( n_iiii, n_oiii, n_ioii, n_ooii, n_iioi, n_oioi, n_iooi, n_oooi, n_iiio, n_oiio, n_ioio, n_ooio, n_iioo, n_oioo, n_iooo, n_oooo, ) @staticmethod def _marginals(*contingency): """Calculates values of contingency table marginals from its values. QuadgramAssocMeasures._marginals(1, 0, 2, 46, 552, 825, 2577, 34967, 1, 0, 2, 48, 7250, 9031, 28585, 356653) (1, (2, 553, 3, 1), (7804, 6, 3132, 1378, 49, 2), (38970, 17660, 100, 38970), 440540) """ ( n_iiii, n_oiii, n_ioii, n_ooii, n_iioi, n_oioi, n_iooi, n_oooi, n_iiio, n_oiio, n_ioio, n_ooio, n_iioo, n_oioo, n_iooo, n_oooo, ) = contingency n_iiix = n_iiii + n_iiio n_iixi = n_iiii + n_iioi n_ixii = n_iiii + n_ioii n_xiii = n_iiii + n_oiii n_iixx = n_iiii + n_iioi + n_iiio + n_iioo n_ixix = n_iiii + n_ioii + n_iiio + n_ioio n_ixxi = n_iiii + n_ioii + n_iioi + n_iooi n_xixi = n_iiii + n_oiii + n_iioi + n_oioi n_xxii = n_iiii + n_oiii + n_ioii + n_ooii n_xiix = n_iiii + n_oiii + n_iiio + n_oiio n_ixxx = n_iiii + n_ioii + n_iioi + n_iiio + n_iooi + n_iioo + n_ioio + n_iooo n_xixx = n_iiii + n_oiii + n_iioi + n_iiio + n_oioi + n_oiio + n_iioo + n_oioo n_xxix = n_iiii + n_oiii + n_ioii + n_iiio + n_ooii + n_ioio + n_oiio + n_ooio n_xxxi = n_iiii + n_oiii + n_ioii + n_iioi + n_ooii + n_iooi + n_oioi + n_oooi n_all = sum(contingency) return ( n_iiii, (n_iiix, n_iixi, n_ixii, n_xiii), (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix), (n_ixxx, n_xixx, n_xxix, n_xxxi), n_all, ) class ContingencyMeasures: """Wraps NgramAssocMeasures classes such that the arguments of association measures are contingency table values rather than marginals. """ def __init__(self, measures): """Constructs a ContingencyMeasures given a NgramAssocMeasures class""" self.__class__.__name__ = "Contingency" + measures.__class__.__name__ for k in dir(measures): if k.startswith("__"): continue v = getattr(measures, k) if not k.startswith("_"): v = self._make_contingency_fn(measures, v) setattr(self, k, v) @staticmethod def _make_contingency_fn(measures, old_fn): """From an association measure function, produces a new function which accepts contingency table values as its arguments. 
""" def res(*contingency): return old_fn(*measures._marginals(*contingency)) res.__doc__ = old_fn.__doc__ res.__name__ = old_fn.__name__ return res nltk-3.7/nltk/metrics/confusionmatrix.py000066400000000000000000000306161420073152400205570ustar00rootroot00000000000000# Natural Language Toolkit: Confusion Matrices # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # Steven Bird # Tom Aarsen <> # URL: # For license information, see LICENSE.TXT from nltk.probability import FreqDist class ConfusionMatrix: """ The confusion matrix between a list of reference values and a corresponding list of test values. Entry *[r,t]* of this matrix is a count of the number of times that the reference value *r* corresponds to the test value *t*. E.g.: >>> from nltk.metrics import ConfusionMatrix >>> ref = 'DET NN VB DET JJ NN NN IN DET NN'.split() >>> test = 'DET VB VB DET NN NN NN IN DET NN'.split() >>> cm = ConfusionMatrix(ref, test) >>> print(cm['NN', 'NN']) 3 Note that the diagonal entries *Ri=Tj* of this matrix corresponds to correct values; and the off-diagonal entries correspond to incorrect values. """ def __init__(self, reference, test, sort_by_count=False): """ Construct a new confusion matrix from a list of reference values and a corresponding list of test values. :type reference: list :param reference: An ordered list of reference values. :type test: list :param test: A list of values to compare against the corresponding reference values. :raise ValueError: If ``reference`` and ``length`` do not have the same length. """ if len(reference) != len(test): raise ValueError("Lists must have the same length.") # Get a list of all values. if sort_by_count: ref_fdist = FreqDist(reference) test_fdist = FreqDist(test) def key(v): return -(ref_fdist[v] + test_fdist[v]) values = sorted(set(reference + test), key=key) else: values = sorted(set(reference + test)) # Construct a value->index dictionary indices = {val: i for (i, val) in enumerate(values)} # Make a confusion matrix table. confusion = [[0 for val in values] for val in values] max_conf = 0 # Maximum confusion for w, g in zip(reference, test): confusion[indices[w]][indices[g]] += 1 max_conf = max(max_conf, confusion[indices[w]][indices[g]]) #: A list of all values in ``reference`` or ``test``. self._values = values #: A dictionary mapping values in ``self._values`` to their indices. self._indices = indices #: The confusion matrix itself (as a list of lists of counts). self._confusion = confusion #: The greatest count in ``self._confusion`` (used for printing). self._max_conf = max_conf #: The total number of values in the confusion matrix. self._total = len(reference) #: The number of correct (on-diagonal) values in the matrix. self._correct = sum(confusion[i][i] for i in range(len(values))) def __getitem__(self, li_lj_tuple): """ :return: The number of times that value ``li`` was expected and value ``lj`` was given. :rtype: int """ (li, lj) = li_lj_tuple i = self._indices[li] j = self._indices[lj] return self._confusion[i][j] def __repr__(self): return f"" def __str__(self): return self.pretty_format() def pretty_format( self, show_percents=False, values_in_chart=True, truncate=None, sort_by_count=False, ): """ :return: A multi-line string representation of this confusion matrix. :type truncate: int :param truncate: If specified, then only show the specified number of values. Any sorting (e.g., sort_by_count) will be performed before truncation. :param sort_by_count: If true, then sort by the count of each label in the reference data. 
I.e., labels that occur more frequently in the reference label will be towards the left edge of the matrix, and labels that occur less frequently will be towards the right edge. @todo: add marginals? """ confusion = self._confusion values = self._values if sort_by_count: values = sorted( values, key=lambda v: -sum(self._confusion[self._indices[v]]) ) if truncate: values = values[:truncate] if values_in_chart: value_strings = ["%s" % val for val in values] else: value_strings = [str(n + 1) for n in range(len(values))] # Construct a format string for row values valuelen = max(len(val) for val in value_strings) value_format = "%" + repr(valuelen) + "s | " # Construct a format string for matrix entries if show_percents: entrylen = 6 entry_format = "%5.1f%%" zerostr = " ." else: entrylen = len(repr(self._max_conf)) entry_format = "%" + repr(entrylen) + "d" zerostr = " " * (entrylen - 1) + "." # Write the column values. s = "" for i in range(valuelen): s += (" " * valuelen) + " |" for val in value_strings: if i >= valuelen - len(val): s += val[i - valuelen + len(val)].rjust(entrylen + 1) else: s += " " * (entrylen + 1) s += " |\n" # Write a dividing line s += "{}-+-{}+\n".format("-" * valuelen, "-" * ((entrylen + 1) * len(values))) # Write the entries. for val, li in zip(value_strings, values): i = self._indices[li] s += value_format % val for lj in values: j = self._indices[lj] if confusion[i][j] == 0: s += zerostr elif show_percents: s += entry_format % (100.0 * confusion[i][j] / self._total) else: s += entry_format % confusion[i][j] if i == j: prevspace = s.rfind(" ") s = s[:prevspace] + "<" + s[prevspace + 1 :] + ">" else: s += " " s += "|\n" # Write a dividing line s += "{}-+-{}+\n".format("-" * valuelen, "-" * ((entrylen + 1) * len(values))) # Write a key s += "(row = reference; col = test)\n" if not values_in_chart: s += "Value key:\n" for i, value in enumerate(values): s += "%6d: %s\n" % (i + 1, value) return s def key(self): values = self._values str = "Value key:\n" indexlen = len(repr(len(values) - 1)) key_format = " %" + repr(indexlen) + "d: %s\n" for i in range(len(values)): str += key_format % (i, values[i]) return str def recall(self, value): """Given a value in the confusion matrix, return the recall that corresponds to this value. The recall is defined as: - *r* = true positive / (true positive + false positive) and can loosely be considered the ratio of how often ``value`` was predicted correctly relative to how often ``value`` was the true result. :param value: value used in the ConfusionMatrix :return: the recall corresponding to ``value``. :rtype: float """ # Number of times `value` was correct, and also predicted TP = self[value, value] # Number of times `value` was correct TP_FN = sum(self[value, pred_value] for pred_value in self._values) if TP_FN == 0: return 0.0 return TP / TP_FN def precision(self, value): """Given a value in the confusion matrix, return the precision that corresponds to this value. The precision is defined as: - *p* = true positive / (true positive + false negative) and can loosely be considered the ratio of how often ``value`` was predicted correctly relative to the number of predictions for ``value``. :param value: value used in the ConfusionMatrix :return: the precision corresponding to ``value``. 
:rtype: float """ # Number of times `value` was correct, and also predicted TP = self[value, value] # Number of times `value` was predicted TP_FP = sum(self[real_value, value] for real_value in self._values) if TP_FP == 0: return 0.0 return TP / TP_FP def f_measure(self, value, alpha=0.5): """ Given a value used in the confusion matrix, return the f-measure that corresponds to this value. The f-measure is the harmonic mean of the ``precision`` and ``recall``, weighted by ``alpha``. In particular, given the precision *p* and recall *r* defined by: - *p* = true positive / (true positive + false negative) - *r* = true positive / (true positive + false positive) The f-measure is: - *1/(alpha/p + (1-alpha)/r)* With ``alpha = 0.5``, this reduces to: - *2pr / (p + r)* :param value: value used in the ConfusionMatrix :param alpha: Ratio of the cost of false negative compared to false positives. Defaults to 0.5, where the costs are equal. :type alpha: float :return: the F-measure corresponding to ``value``. :rtype: float """ p = self.precision(value) r = self.recall(value) if p == 0.0 or r == 0.0: return 0.0 return 1.0 / (alpha / p + (1 - alpha) / r) def evaluate(self, alpha=0.5, truncate=None, sort_by_count=False): """ Tabulate the **recall**, **precision** and **f-measure** for each value in this confusion matrix. >>> reference = "DET NN VB DET JJ NN NN IN DET NN".split() >>> test = "DET VB VB DET NN NN NN IN DET NN".split() >>> cm = ConfusionMatrix(reference, test) >>> print(cm.evaluate()) Tag | Prec. | Recall | F-measure ----+--------+--------+----------- DET | 1.0000 | 1.0000 | 1.0000 IN | 1.0000 | 1.0000 | 1.0000 JJ | 0.0000 | 0.0000 | 0.0000 NN | 0.7500 | 0.7500 | 0.7500 VB | 0.5000 | 1.0000 | 0.6667 :param alpha: Ratio of the cost of false negative compared to false positives, as used in the f-measure computation. Defaults to 0.5, where the costs are equal. :type alpha: float :param truncate: If specified, then only show the specified number of values. Any sorting (e.g., sort_by_count) will be performed before truncation. Defaults to None :type truncate: int, optional :param sort_by_count: Whether to sort the outputs on frequency in the reference label. Defaults to False. :type sort_by_count: bool, optional :return: A tabulated recall, precision and f-measure string :rtype: str """ tags = self._values # Apply keyword parameters if sort_by_count: tags = sorted(tags, key=lambda v: -sum(self._confusion[self._indices[v]])) if truncate: tags = tags[:truncate] tag_column_len = max(max(len(tag) for tag in tags), 3) # Construct the header s = ( f"{' ' * (tag_column_len - 3)}Tag | Prec. 
| Recall | F-measure\n" f"{'-' * tag_column_len}-+--------+--------+-----------\n" ) # Construct the body for tag in tags: s += ( f"{tag:>{tag_column_len}} | " f"{self.precision(tag):<6.4f} | " f"{self.recall(tag):<6.4f} | " f"{self.f_measure(tag, alpha=alpha):.4f}\n" ) return s def demo(): reference = "DET NN VB DET JJ NN NN IN DET NN".split() test = "DET VB VB DET NN NN NN IN DET NN".split() print("Reference =", reference) print("Test =", test) print("Confusion matrix:") print(ConfusionMatrix(reference, test)) print(ConfusionMatrix(reference, test).pretty_format(sort_by_count=True)) print(ConfusionMatrix(reference, test).recall("VB")) if __name__ == "__main__": demo() nltk-3.7/nltk/metrics/distance.py000066400000000000000000000414011420073152400171130ustar00rootroot00000000000000# Natural Language Toolkit: Distance Metrics # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # Steven Bird # Tom Lippincott # URL: # For license information, see LICENSE.TXT # """ Distance Metrics. Compute the distance between two items (usually strings). As metrics, they must satisfy the following three requirements: 1. d(a, a) = 0 2. d(a, b) >= 0 3. d(a, c) <= d(a, b) + d(b, c) """ import operator import warnings def _edit_dist_init(len1, len2): lev = [] for i in range(len1): lev.append([0] * len2) # initialize 2D array to zero for i in range(len1): lev[i][0] = i # column 0: 0,1,2,3,4,... for j in range(len2): lev[0][j] = j # row 0: 0,1,2,3,4,... return lev def _last_left_t_init(sigma): return {c: 0 for c in sigma} def _edit_dist_step( lev, i, j, s1, s2, last_left, last_right, substitution_cost=1, transpositions=False ): c1 = s1[i - 1] c2 = s2[j - 1] # skipping a character in s1 a = lev[i - 1][j] + 1 # skipping a character in s2 b = lev[i][j - 1] + 1 # substitution c = lev[i - 1][j - 1] + (substitution_cost if c1 != c2 else 0) # transposition d = c + 1 # never picked by default if transpositions and last_left > 0 and last_right > 0: d = lev[last_left - 1][last_right - 1] + i - last_left + j - last_right - 1 # pick the cheapest lev[i][j] = min(a, b, c, d) def edit_distance(s1, s2, substitution_cost=1, transpositions=False): """ Calculate the Levenshtein edit-distance between two strings. The edit distance is the number of characters that need to be substituted, inserted, or deleted, to transform s1 into s2. For example, transforming "rain" to "shine" requires three steps, consisting of two substitutions and one insertion: "rain" -> "sain" -> "shin" -> "shine". These operations could have been done in other orders, but at least three steps are needed. Allows specifying the cost of substitution edits (e.g., "a" -> "b"), because sometimes it makes sense to assign greater penalties to substitutions. This also optionally allows transposition edits (e.g., "ab" -> "ba"), though this is disabled by default. 
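    For example (note that the "b"/"c" transposition counts as a single edit
    only when transpositions are enabled):

    >>> from nltk.metrics import edit_distance
    >>> edit_distance("rain", "shine")
    3
    >>> edit_distance("abcdef", "acbdef")
    2
    >>> edit_distance("abcdef", "acbdef", transpositions=True)
    1
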
:param s1, s2: The strings to be analysed :param transpositions: Whether to allow transposition edits :type s1: str :type s2: str :type substitution_cost: int :type transpositions: bool :rtype: int """ # set up a 2-D array len1 = len(s1) len2 = len(s2) lev = _edit_dist_init(len1 + 1, len2 + 1) # retrieve alphabet sigma = set() sigma.update(s1) sigma.update(s2) # set up table to remember positions of last seen occurrence in s1 last_left_t = _last_left_t_init(sigma) # iterate over the array # i and j start from 1 and not 0 to stay close to the wikipedia pseudo-code # see https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance for i in range(1, len1 + 1): last_right_buf = 0 for j in range(1, len2 + 1): last_left = last_left_t[s2[j - 1]] last_right = last_right_buf if s1[i - 1] == s2[j - 1]: last_right_buf = j _edit_dist_step( lev, i, j, s1, s2, last_left, last_right, substitution_cost=substitution_cost, transpositions=transpositions, ) last_left_t[s1[i - 1]] = i return lev[len1][len2] def _edit_dist_backtrace(lev): i, j = len(lev) - 1, len(lev[0]) - 1 alignment = [(i, j)] while (i, j) != (0, 0): directions = [ (i - 1, j), # skip s1 (i, j - 1), # skip s2 (i - 1, j - 1), # substitution ] direction_costs = ( (lev[i][j] if (i >= 0 and j >= 0) else float("inf"), (i, j)) for i, j in directions ) _, (i, j) = min(direction_costs, key=operator.itemgetter(0)) alignment.append((i, j)) return list(reversed(alignment)) def edit_distance_align(s1, s2, substitution_cost=1): """ Calculate the minimum Levenshtein edit-distance based alignment mapping between two strings. The alignment finds the mapping from string s1 to s2 that minimizes the edit distance cost. For example, mapping "rain" to "shine" would involve 2 substitutions, 2 matches and an insertion resulting in the following mapping: [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (4, 5)] NB: (0, 0) is the start state without any letters associated See more: https://web.stanford.edu/class/cs124/lec/med.pdf In case of multiple valid minimum-distance alignments, the backtrace has the following operation precedence: 1. Skip s1 character 2. Skip s2 character 3. Substitute s1 and s2 characters The backtrace is carried out in reverse string order. This function does not support transposition. :param s1, s2: The strings to be aligned :type s1: str :type s2: str :type substitution_cost: int :rtype: List[Tuple(int, int)] """ # set up a 2-D array len1 = len(s1) len2 = len(s2) lev = _edit_dist_init(len1 + 1, len2 + 1) # iterate over the array for i in range(len1): for j in range(len2): _edit_dist_step( lev, i + 1, j + 1, s1, s2, 0, 0, substitution_cost=substitution_cost, transpositions=False, ) # backtrace to find alignment alignment = _edit_dist_backtrace(lev) return alignment def binary_distance(label1, label2): """Simple equality test. 0.0 if the labels are identical, 1.0 if they are different. >>> from nltk.metrics import binary_distance >>> binary_distance(1,1) 0.0 >>> binary_distance(1,3) 1.0 """ return 0.0 if label1 == label2 else 1.0 def jaccard_distance(label1, label2): """Distance metric comparing set-similarity.""" return (len(label1.union(label2)) - len(label1.intersection(label2))) / len( label1.union(label2) ) def masi_distance(label1, label2): """Distance metric that takes into account partial agreement when multiple labels are assigned. >>> from nltk.metrics import masi_distance >>> masi_distance(set([1, 2]), set([1, 2, 3, 4])) 0.665 Passonneau 2006, Measuring Agreement on Set-Valued Items (MASI) for Semantic and Pragmatic Annotation. 
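    In the example above the intersection has 2 items and the union has 4;
    because the intersection equals the smaller of the two sets, the weight
    m is 0.67, giving a distance of 1 - (2/4) * 0.67 = 0.665.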
""" len_intersection = len(label1.intersection(label2)) len_union = len(label1.union(label2)) len_label1 = len(label1) len_label2 = len(label2) if len_label1 == len_label2 and len_label1 == len_intersection: m = 1 elif len_intersection == min(len_label1, len_label2): m = 0.67 elif len_intersection > 0: m = 0.33 else: m = 0 return 1 - len_intersection / len_union * m def interval_distance(label1, label2): """Krippendorff's interval distance metric >>> from nltk.metrics import interval_distance >>> interval_distance(1,10) 81 Krippendorff 1980, Content Analysis: An Introduction to its Methodology """ try: return pow(label1 - label2, 2) # return pow(list(label1)[0]-list(label2)[0],2) except: print("non-numeric labels not supported with interval distance") def presence(label): """Higher-order function to test presence of a given label""" return lambda x, y: 1.0 * ((label in x) == (label in y)) def fractional_presence(label): return ( lambda x, y: abs((1.0 / len(x)) - (1.0 / len(y))) * (label in x and label in y) or 0.0 * (label not in x and label not in y) or abs(1.0 / len(x)) * (label in x and label not in y) or (1.0 / len(y)) * (label not in x and label in y) ) def custom_distance(file): data = {} with open(file) as infile: for l in infile: labelA, labelB, dist = l.strip().split("\t") labelA = frozenset([labelA]) labelB = frozenset([labelB]) data[frozenset([labelA, labelB])] = float(dist) return lambda x, y: data[frozenset([x, y])] def jaro_similarity(s1, s2): """ Computes the Jaro similarity between 2 sequences from: Matthew A. Jaro (1989). Advances in record linkage methodology as applied to the 1985 census of Tampa Florida. Journal of the American Statistical Association. 84 (406): 414-20. The Jaro distance between is the min no. of single-character transpositions required to change one word into another. The Jaro similarity formula from https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance : ``jaro_sim = 0 if m = 0 else 1/3 * (m/|s_1| + m/s_2 + (m-t)/m)`` where - `|s_i|` is the length of string `s_i` - `m` is the no. of matching characters - `t` is the half no. of possible transpositions. """ # First, store the length of the strings # because they will be re-used several times. len_s1, len_s2 = len(s1), len(s2) # The upper bound of the distance for being a matched character. match_bound = max(len_s1, len_s2) // 2 - 1 # Initialize the counts for matches and transpositions. matches = 0 # no.of matched characters in s1 and s2 transpositions = 0 # no. of transpositions between s1 and s2 flagged_1 = [] # positions in s1 which are matches to some character in s2 flagged_2 = [] # positions in s2 which are matches to some character in s1 # Iterate through sequences, check for matches and compute transpositions. for i in range(len_s1): # Iterate through each character. upperbound = min(i + match_bound, len_s2 - 1) lowerbound = max(0, i - match_bound) for j in range(lowerbound, upperbound + 1): if s1[i] == s2[j] and j not in flagged_2: matches += 1 flagged_1.append(i) flagged_2.append(j) break flagged_2.sort() for i, j in zip(flagged_1, flagged_2): if s1[i] != s2[j]: transpositions += 1 if matches == 0: return 0 else: return ( 1 / 3 * ( matches / len_s1 + matches / len_s2 + (matches - transpositions // 2) / matches ) ) def jaro_winkler_similarity(s1, s2, p=0.1, max_l=4): """ The Jaro Winkler distance is an extension of the Jaro similarity in: William E. Winkler. 1990. String Comparator Metrics and Enhanced Decision Rules in the Fellegi-Sunter Model of Record Linkage. 
Proceedings of the Section on Survey Research Methods. American Statistical Association: 354-359. such that: jaro_winkler_sim = jaro_sim + ( l * p * (1 - jaro_sim) ) where, - jaro_sim is the output from the Jaro Similarity, see jaro_similarity() - l is the length of common prefix at the start of the string - this implementation provides an upperbound for the l value to keep the prefixes.A common value of this upperbound is 4. - p is the constant scaling factor to overweigh common prefixes. The Jaro-Winkler similarity will fall within the [0, 1] bound, given that max(p)<=0.25 , default is p=0.1 in Winkler (1990) Test using outputs from https://www.census.gov/srd/papers/pdf/rr93-8.pdf from "Table 5 Comparison of String Comparators Rescaled between 0 and 1" >>> winkler_examples = [("billy", "billy"), ("billy", "bill"), ("billy", "blily"), ... ("massie", "massey"), ("yvette", "yevett"), ("billy", "bolly"), ("dwayne", "duane"), ... ("dixon", "dickson"), ("billy", "susan")] >>> winkler_scores = [1.000, 0.967, 0.947, 0.944, 0.911, 0.893, 0.858, 0.853, 0.000] >>> jaro_scores = [1.000, 0.933, 0.933, 0.889, 0.889, 0.867, 0.822, 0.790, 0.000] One way to match the values on the Winkler's paper is to provide a different p scaling factor for different pairs of strings, e.g. >>> p_factors = [0.1, 0.125, 0.20, 0.125, 0.20, 0.20, 0.20, 0.15, 0.1] >>> for (s1, s2), jscore, wscore, p in zip(winkler_examples, jaro_scores, winkler_scores, p_factors): ... assert round(jaro_similarity(s1, s2), 3) == jscore ... assert round(jaro_winkler_similarity(s1, s2, p=p), 3) == wscore Test using outputs from https://www.census.gov/srd/papers/pdf/rr94-5.pdf from "Table 2.1. Comparison of String Comparators Using Last Names, First Names, and Street Names" >>> winkler_examples = [('SHACKLEFORD', 'SHACKELFORD'), ('DUNNINGHAM', 'CUNNIGHAM'), ... ('NICHLESON', 'NICHULSON'), ('JONES', 'JOHNSON'), ('MASSEY', 'MASSIE'), ... ('ABROMS', 'ABRAMS'), ('HARDIN', 'MARTINEZ'), ('ITMAN', 'SMITH'), ... ('JERALDINE', 'GERALDINE'), ('MARHTA', 'MARTHA'), ('MICHELLE', 'MICHAEL'), ... ('JULIES', 'JULIUS'), ('TANYA', 'TONYA'), ('DWAYNE', 'DUANE'), ('SEAN', 'SUSAN'), ... ('JON', 'JOHN'), ('JON', 'JAN'), ('BROOKHAVEN', 'BRROKHAVEN'), ... ('BROOK HALLOW', 'BROOK HLLW'), ('DECATUR', 'DECATIR'), ('FITZRUREITER', 'FITZENREITER'), ... ('HIGBEE', 'HIGHEE'), ('HIGBEE', 'HIGVEE'), ('LACURA', 'LOCURA'), ('IOWA', 'IONA'), ('1ST', 'IST')] >>> jaro_scores = [0.970, 0.896, 0.926, 0.790, 0.889, 0.889, 0.722, 0.467, 0.926, ... 0.944, 0.869, 0.889, 0.867, 0.822, 0.783, 0.917, 0.000, 0.933, 0.944, 0.905, ... 0.856, 0.889, 0.889, 0.889, 0.833, 0.000] >>> winkler_scores = [0.982, 0.896, 0.956, 0.832, 0.944, 0.922, 0.722, 0.467, 0.926, ... 0.961, 0.921, 0.933, 0.880, 0.858, 0.805, 0.933, 0.000, 0.947, 0.967, 0.943, ... 0.913, 0.922, 0.922, 0.900, 0.867, 0.000] One way to match the values on the Winkler's paper is to provide a different p scaling factor for different pairs of strings, e.g. >>> p_factors = [0.1, 0.1, 0.1, 0.1, 0.125, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.20, ... 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1] >>> for (s1, s2), jscore, wscore, p in zip(winkler_examples, jaro_scores, winkler_scores, p_factors): ... if (s1, s2) in [('JON', 'JAN'), ('1ST', 'IST')]: ... continue # Skip bad examples from the paper. ... assert round(jaro_similarity(s1, s2), 3) == jscore ... 
assert round(jaro_winkler_similarity(s1, s2, p=p), 3) == wscore This test-case proves that the output of Jaro-Winkler similarity depends on the product l * p and not on the product max_l * p. Here the product max_l * p > 1 however the product l * p <= 1 >>> round(jaro_winkler_similarity('TANYA', 'TONYA', p=0.1, max_l=100), 3) 0.88 """ # To ensure that the output of the Jaro-Winkler's similarity # falls between [0,1], the product of l * p needs to be # also fall between [0,1]. if not 0 <= max_l * p <= 1: warnings.warn( str( "The product `max_l * p` might not fall between [0,1]." "Jaro-Winkler similarity might not be between 0 and 1." ) ) # Compute the Jaro similarity jaro_sim = jaro_similarity(s1, s2) # Initialize the upper bound for the no. of prefixes. # if user did not pre-define the upperbound, # use shorter length between s1 and s2 # Compute the prefix matches. l = 0 # zip() will automatically loop until the end of shorter string. for s1_i, s2_i in zip(s1, s2): if s1_i == s2_i: l += 1 else: break if l == max_l: break # Return the similarity value as described in docstring. return jaro_sim + (l * p * (1 - jaro_sim)) def demo(): string_distance_examples = [ ("rain", "shine"), ("abcdef", "acbdef"), ("language", "lnaguaeg"), ("language", "lnaugage"), ("language", "lngauage"), ] for s1, s2 in string_distance_examples: print(f"Edit distance btwn '{s1}' and '{s2}':", edit_distance(s1, s2)) print( f"Edit dist with transpositions btwn '{s1}' and '{s2}':", edit_distance(s1, s2, transpositions=True), ) print(f"Jaro similarity btwn '{s1}' and '{s2}':", jaro_similarity(s1, s2)) print( f"Jaro-Winkler similarity btwn '{s1}' and '{s2}':", jaro_winkler_similarity(s1, s2), ) print( f"Jaro-Winkler distance btwn '{s1}' and '{s2}':", 1 - jaro_winkler_similarity(s1, s2), ) s1 = {1, 2, 3, 4} s2 = {3, 4, 5} print("s1:", s1) print("s2:", s2) print("Binary distance:", binary_distance(s1, s2)) print("Jaccard distance:", jaccard_distance(s1, s2)) print("MASI distance:", masi_distance(s1, s2)) if __name__ == "__main__": demo() nltk-3.7/nltk/metrics/paice.py000066400000000000000000000340221420073152400164030ustar00rootroot00000000000000# Natural Language Toolkit: Agreement Metrics # # Copyright (C) 2001-2022 NLTK Project # Author: Lauri Hallila # URL: # For license information, see LICENSE.TXT # """Counts Paice's performance statistics for evaluating stemming algorithms. What is required: - A dictionary of words grouped by their real lemmas - A dictionary of words grouped by stems from a stemming algorithm When these are given, Understemming Index (UI), Overstemming Index (OI), Stemming Weight (SW) and Error-rate relative to truncation (ERRT) are counted. References: Chris D. Paice (1994). An evaluation method for stemming algorithms. In Proceedings of SIGIR, 42--50. """ from math import sqrt def get_words_from_dictionary(lemmas): """ Get original set of words used for analysis. :param lemmas: A dictionary where keys are lemmas and values are sets or lists of words corresponding to that lemma. :type lemmas: dict(str): list(str) :return: Set of words that exist as values in the dictionary :rtype: set(str) """ words = set() for lemma in lemmas: words.update(set(lemmas[lemma])) return words def _truncate(words, cutlength): """Group words by stems defined by truncating them at given length. :param words: Set of words used for analysis :param cutlength: Words are stemmed by cutting at this length. 
:type words: set(str) or list(str) :type cutlength: int :return: Dictionary where keys are stems and values are sets of words corresponding to that stem. :rtype: dict(str): set(str) """ stems = {} for word in words: stem = word[:cutlength] try: stems[stem].update([word]) except KeyError: stems[stem] = {word} return stems # Reference: https://en.wikipedia.org/wiki/Line-line_intersection def _count_intersection(l1, l2): """Count intersection between two line segments defined by coordinate pairs. :param l1: Tuple of two coordinate pairs defining the first line segment :param l2: Tuple of two coordinate pairs defining the second line segment :type l1: tuple(float, float) :type l2: tuple(float, float) :return: Coordinates of the intersection :rtype: tuple(float, float) """ x1, y1 = l1[0] x2, y2 = l1[1] x3, y3 = l2[0] x4, y4 = l2[1] denominator = (x1 - x2) * (y3 - y4) - (y1 - y2) * (x3 - x4) if denominator == 0.0: # lines are parallel if x1 == x2 == x3 == x4 == 0.0: # When lines are parallel, they must be on the y-axis. # We can ignore x-axis because we stop counting the # truncation line when we get there. # There are no other options as UI (x-axis) grows and # OI (y-axis) diminishes when we go along the truncation line. return (0.0, y4) x = ( (x1 * y2 - y1 * x2) * (x3 - x4) - (x1 - x2) * (x3 * y4 - y3 * x4) ) / denominator y = ( (x1 * y2 - y1 * x2) * (y3 - y4) - (y1 - y2) * (x3 * y4 - y3 * x4) ) / denominator return (x, y) def _get_derivative(coordinates): """Get derivative of the line from (0,0) to given coordinates. :param coordinates: A coordinate pair :type coordinates: tuple(float, float) :return: Derivative; inf if x is zero :rtype: float """ try: return coordinates[1] / coordinates[0] except ZeroDivisionError: return float("inf") def _calculate_cut(lemmawords, stems): """Count understemmed and overstemmed pairs for (lemma, stem) pair with common words. :param lemmawords: Set or list of words corresponding to certain lemma. :param stems: A dictionary where keys are stems and values are sets or lists of words corresponding to that stem. :type lemmawords: set(str) or list(str) :type stems: dict(str): set(str) :return: Amount of understemmed and overstemmed pairs contributed by words existing in both lemmawords and stems. :rtype: tuple(float, float) """ umt, wmt = 0.0, 0.0 for stem in stems: cut = set(lemmawords) & set(stems[stem]) if cut: cutcount = len(cut) stemcount = len(stems[stem]) # Unachieved merge total umt += cutcount * (len(lemmawords) - cutcount) # Wrongly merged total wmt += cutcount * (stemcount - cutcount) return (umt, wmt) def _calculate(lemmas, stems): """Calculate actual and maximum possible amounts of understemmed and overstemmed word pairs. :param lemmas: A dictionary where keys are lemmas and values are sets or lists of words corresponding to that lemma. :param stems: A dictionary where keys are stems and values are sets or lists of words corresponding to that stem. :type lemmas: dict(str): list(str) :type stems: dict(str): set(str) :return: Global unachieved merge total (gumt), global desired merge total (gdmt), global wrongly merged total (gwmt) and global desired non-merge total (gdnt). 
:rtype: tuple(float, float, float, float) """ n = sum(len(lemmas[word]) for word in lemmas) gdmt, gdnt, gumt, gwmt = (0.0, 0.0, 0.0, 0.0) for lemma in lemmas: lemmacount = len(lemmas[lemma]) # Desired merge total gdmt += lemmacount * (lemmacount - 1) # Desired non-merge total gdnt += lemmacount * (n - lemmacount) # For each (lemma, stem) pair with common words, count how many # pairs are understemmed and overstemmed. umt, wmt = _calculate_cut(lemmas[lemma], stems) # Add to total undesired and wrongly-merged totals gumt += umt gwmt += wmt # Each object is counted twice, so divide by two return (gumt / 2, gdmt / 2, gwmt / 2, gdnt / 2) def _indexes(gumt, gdmt, gwmt, gdnt): """Count Understemming Index (UI), Overstemming Index (OI) and Stemming Weight (SW). :param gumt, gdmt, gwmt, gdnt: Global unachieved merge total (gumt), global desired merge total (gdmt), global wrongly merged total (gwmt) and global desired non-merge total (gdnt). :type gumt, gdmt, gwmt, gdnt: float :return: Understemming Index (UI), Overstemming Index (OI) and Stemming Weight (SW). :rtype: tuple(float, float, float) """ # Calculate Understemming Index (UI), # Overstemming Index (OI) and Stemming Weight (SW) try: ui = gumt / gdmt except ZeroDivisionError: # If GDMT (max merge total) is 0, define UI as 0 ui = 0.0 try: oi = gwmt / gdnt except ZeroDivisionError: # IF GDNT (max non-merge total) is 0, define OI as 0 oi = 0.0 try: sw = oi / ui except ZeroDivisionError: if oi == 0.0: # OI and UI are 0, define SW as 'not a number' sw = float("nan") else: # UI is 0, define SW as infinity sw = float("inf") return (ui, oi, sw) class Paice: """Class for storing lemmas, stems and evaluation metrics.""" def __init__(self, lemmas, stems): """ :param lemmas: A dictionary where keys are lemmas and values are sets or lists of words corresponding to that lemma. :param stems: A dictionary where keys are stems and values are sets or lists of words corresponding to that stem. :type lemmas: dict(str): list(str) :type stems: dict(str): set(str) """ self.lemmas = lemmas self.stems = stems self.coords = [] self.gumt, self.gdmt, self.gwmt, self.gdnt = (None, None, None, None) self.ui, self.oi, self.sw = (None, None, None) self.errt = None self.update() def __str__(self): text = ["Global Unachieved Merge Total (GUMT): %s\n" % self.gumt] text.append("Global Desired Merge Total (GDMT): %s\n" % self.gdmt) text.append("Global Wrongly-Merged Total (GWMT): %s\n" % self.gwmt) text.append("Global Desired Non-merge Total (GDNT): %s\n" % self.gdnt) text.append("Understemming Index (GUMT / GDMT): %s\n" % self.ui) text.append("Overstemming Index (GWMT / GDNT): %s\n" % self.oi) text.append("Stemming Weight (OI / UI): %s\n" % self.sw) text.append("Error-Rate Relative to Truncation (ERRT): %s\r\n" % self.errt) coordinates = " ".join(["(%s, %s)" % item for item in self.coords]) text.append("Truncation line: %s" % coordinates) return "".join(text) def _get_truncation_indexes(self, words, cutlength): """Count (UI, OI) when stemming is done by truncating words at \'cutlength\'. 
:param words: Words used for the analysis :param cutlength: Words are stemmed by cutting them at this length :type words: set(str) or list(str) :type cutlength: int :return: Understemming and overstemming indexes :rtype: tuple(int, int) """ truncated = _truncate(words, cutlength) gumt, gdmt, gwmt, gdnt = _calculate(self.lemmas, truncated) ui, oi = _indexes(gumt, gdmt, gwmt, gdnt)[:2] return (ui, oi) def _get_truncation_coordinates(self, cutlength=0): """Count (UI, OI) pairs for truncation points until we find the segment where (ui, oi) crosses the truncation line. :param cutlength: Optional parameter to start counting from (ui, oi) coordinates gotten by stemming at this length. Useful for speeding up the calculations when you know the approximate location of the intersection. :type cutlength: int :return: List of coordinate pairs that define the truncation line :rtype: list(tuple(float, float)) """ words = get_words_from_dictionary(self.lemmas) maxlength = max(len(word) for word in words) # Truncate words from different points until (0, 0) - (ui, oi) segment crosses the truncation line coords = [] while cutlength <= maxlength: # Get (UI, OI) pair of current truncation point pair = self._get_truncation_indexes(words, cutlength) # Store only new coordinates so we'll have an actual # line segment when counting the intersection point if pair not in coords: coords.append(pair) if pair == (0.0, 0.0): # Stop counting if truncation line goes through origo; # length from origo to truncation line is 0 return coords if len(coords) >= 2 and pair[0] > 0.0: derivative1 = _get_derivative(coords[-2]) derivative2 = _get_derivative(coords[-1]) # Derivative of the truncation line is a decreasing value; # when it passes Stemming Weight, we've found the segment # of truncation line intersecting with (0, 0) - (ui, oi) segment if derivative1 >= self.sw >= derivative2: return coords cutlength += 1 return coords def _errt(self): """Count Error-Rate Relative to Truncation (ERRT). :return: ERRT, length of the line from origo to (UI, OI) divided by the length of the line from origo to the point defined by the same line when extended until the truncation line. 
:rtype: float """ # Count (UI, OI) pairs for truncation points until we find the segment where (ui, oi) crosses the truncation line self.coords = self._get_truncation_coordinates() if (0.0, 0.0) in self.coords: # Truncation line goes through origo, so ERRT cannot be counted if (self.ui, self.oi) != (0.0, 0.0): return float("inf") else: return float("nan") if (self.ui, self.oi) == (0.0, 0.0): # (ui, oi) is origo; define errt as 0.0 return 0.0 # Count the intersection point # Note that (self.ui, self.oi) cannot be (0.0, 0.0) and self.coords has different coordinates # so we have actual line segments instead of a line segment and a point intersection = _count_intersection( ((0, 0), (self.ui, self.oi)), self.coords[-2:] ) # Count OP (length of the line from origo to (ui, oi)) op = sqrt(self.ui ** 2 + self.oi ** 2) # Count OT (length of the line from origo to truncation line that goes through (ui, oi)) ot = sqrt(intersection[0] ** 2 + intersection[1] ** 2) # OP / OT tells how well the stemming algorithm works compared to just truncating words return op / ot def update(self): """Update statistics after lemmas and stems have been set.""" self.gumt, self.gdmt, self.gwmt, self.gdnt = _calculate(self.lemmas, self.stems) self.ui, self.oi, self.sw = _indexes(self.gumt, self.gdmt, self.gwmt, self.gdnt) self.errt = self._errt() def demo(): """Demonstration of the module.""" # Some words with their real lemmas lemmas = { "kneel": ["kneel", "knelt"], "range": ["range", "ranged"], "ring": ["ring", "rang", "rung"], } # Same words with stems from a stemming algorithm stems = { "kneel": ["kneel"], "knelt": ["knelt"], "rang": ["rang", "range", "ranged"], "ring": ["ring"], "rung": ["rung"], } print("Words grouped by their lemmas:") for lemma in sorted(lemmas): print("{} => {}".format(lemma, " ".join(lemmas[lemma]))) print() print("Same words grouped by a stemming algorithm:") for stem in sorted(stems): print("{} => {}".format(stem, " ".join(stems[stem]))) print() p = Paice(lemmas, stems) print(p) print() # Let's "change" results from a stemming algorithm stems = { "kneel": ["kneel"], "knelt": ["knelt"], "rang": ["rang"], "range": ["range", "ranged"], "ring": ["ring"], "rung": ["rung"], } print("Counting stats after changing stemming results:") for stem in sorted(stems): print("{} => {}".format(stem, " ".join(stems[stem]))) print() p.stems = stems p.update() print(p) if __name__ == "__main__": demo() nltk-3.7/nltk/metrics/scores.py000066400000000000000000000170161420073152400166240ustar00rootroot00000000000000# Natural Language Toolkit: Evaluation # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # Steven Bird # URL: # For license information, see LICENSE.TXT import operator from functools import reduce from math import fabs from random import shuffle try: from scipy.stats.stats import betai except ImportError: betai = None from nltk.util import LazyConcatenation, LazyMap def accuracy(reference, test): """ Given a list of reference values and a corresponding list of test values, return the fraction of corresponding values that are equal. 
In particular, return the fraction of indices ``0= actual_stat: c += 1 if verbose and i % 10 == 0: print("pseudo-statistic: %f" % pseudo_stat) print("significance: %f" % ((c + 1) / (i + 1))) print("-" * 60) significance = (c + 1) / (shuffles + 1) if verbose: print("significance: %f" % significance) if betai: for phi in [0.01, 0.05, 0.10, 0.15, 0.25, 0.50]: print(f"prob(phi<={phi:f}): {betai(c, shuffles, phi):f}") return (significance, c, shuffles) def demo(): print("-" * 75) reference = "DET NN VB DET JJ NN NN IN DET NN".split() test = "DET VB VB DET NN NN NN IN DET NN".split() print("Reference =", reference) print("Test =", test) print("Accuracy:", accuracy(reference, test)) print("-" * 75) reference_set = set(reference) test_set = set(test) print("Reference =", reference_set) print("Test = ", test_set) print("Precision:", precision(reference_set, test_set)) print(" Recall:", recall(reference_set, test_set)) print("F-Measure:", f_measure(reference_set, test_set)) print("-" * 75) if __name__ == "__main__": demo() nltk-3.7/nltk/metrics/segmentation.py000066400000000000000000000155271420073152400200300ustar00rootroot00000000000000# Natural Language Toolkit: Text Segmentation Metrics # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # Steven Bird # David Doukhan # URL: # For license information, see LICENSE.TXT """ Text Segmentation Metrics 1. Windowdiff Pevzner, L., and Hearst, M., A Critique and Improvement of an Evaluation Metric for Text Segmentation, Computational Linguistics 28, 19-36 2. Generalized Hamming Distance Bookstein A., Kulyukin V.A., Raita T. Generalized Hamming Distance Information Retrieval 5, 2002, pp 353-375 Baseline implementation in C++ http://digital.cs.usu.edu/~vkulyukin/vkweb/software/ghd/ghd.html Study describing benefits of Generalized Hamming Distance Versus WindowDiff for evaluating text segmentation tasks Begsten, Y. Quel indice pour mesurer l'efficacite en segmentation de textes ? TALN 2009 3. Pk text segmentation metric Beeferman D., Berger A., Lafferty J. (1999) Statistical Models for Text Segmentation Machine Learning, 34, 177-210 """ try: import numpy as np except ImportError: pass def windowdiff(seg1, seg2, k, boundary="1", weighted=False): """ Compute the windowdiff score for a pair of segmentations. A segmentation is any sequence over a vocabulary of two items (e.g. "0", "1"), where the specified boundary value is used to mark the edge of a segmentation. 
>>> s1 = "000100000010" >>> s2 = "000010000100" >>> s3 = "100000010000" >>> '%.2f' % windowdiff(s1, s1, 3) '0.00' >>> '%.2f' % windowdiff(s1, s2, 3) '0.30' >>> '%.2f' % windowdiff(s2, s3, 3) '0.80' :param seg1: a segmentation :type seg1: str or list :param seg2: a segmentation :type seg2: str or list :param k: window width :type k: int :param boundary: boundary value :type boundary: str or int or bool :param weighted: use the weighted variant of windowdiff :type weighted: boolean :rtype: float """ if len(seg1) != len(seg2): raise ValueError("Segmentations have unequal length") if k > len(seg1): raise ValueError( "Window width k should be smaller or equal than segmentation lengths" ) wd = 0 for i in range(len(seg1) - k + 1): ndiff = abs(seg1[i : i + k].count(boundary) - seg2[i : i + k].count(boundary)) if weighted: wd += ndiff else: wd += min(1, ndiff) return wd / (len(seg1) - k + 1.0) # Generalized Hamming Distance def _init_mat(nrows, ncols, ins_cost, del_cost): mat = np.empty((nrows, ncols)) mat[0, :] = ins_cost * np.arange(ncols) mat[:, 0] = del_cost * np.arange(nrows) return mat def _ghd_aux(mat, rowv, colv, ins_cost, del_cost, shift_cost_coeff): for i, rowi in enumerate(rowv): for j, colj in enumerate(colv): shift_cost = shift_cost_coeff * abs(rowi - colj) + mat[i, j] if rowi == colj: # boundaries are at the same location, no transformation required tcost = mat[i, j] elif rowi > colj: # boundary match through a deletion tcost = del_cost + mat[i, j + 1] else: # boundary match through an insertion tcost = ins_cost + mat[i + 1, j] mat[i + 1, j + 1] = min(tcost, shift_cost) def ghd(ref, hyp, ins_cost=2.0, del_cost=2.0, shift_cost_coeff=1.0, boundary="1"): """ Compute the Generalized Hamming Distance for a reference and a hypothetical segmentation, corresponding to the cost related to the transformation of the hypothetical segmentation into the reference segmentation through boundary insertion, deletion and shift operations. A segmentation is any sequence over a vocabulary of two items (e.g. "0", "1"), where the specified boundary value is used to mark the edge of a segmentation. Recommended parameter values are a shift_cost_coeff of 2. Associated with a ins_cost, and del_cost equal to the mean segment length in the reference segmentation. >>> # Same examples as Kulyukin C++ implementation >>> ghd('1100100000', '1100010000', 1.0, 1.0, 0.5) 0.5 >>> ghd('1100100000', '1100000001', 1.0, 1.0, 0.5) 2.0 >>> ghd('011', '110', 1.0, 1.0, 0.5) 1.0 >>> ghd('1', '0', 1.0, 1.0, 0.5) 1.0 >>> ghd('111', '000', 1.0, 1.0, 0.5) 3.0 >>> ghd('000', '111', 1.0, 2.0, 0.5) 6.0 :param ref: the reference segmentation :type ref: str or list :param hyp: the hypothetical segmentation :type hyp: str or list :param ins_cost: insertion cost :type ins_cost: float :param del_cost: deletion cost :type del_cost: float :param shift_cost_coeff: constant used to compute the cost of a shift. 
``shift cost = shift_cost_coeff * |i - j|`` where ``i`` and ``j`` are the positions indicating the shift :type shift_cost_coeff: float :param boundary: boundary value :type boundary: str or int or bool :rtype: float """ ref_idx = [i for (i, val) in enumerate(ref) if val == boundary] hyp_idx = [i for (i, val) in enumerate(hyp) if val == boundary] nref_bound = len(ref_idx) nhyp_bound = len(hyp_idx) if nref_bound == 0 and nhyp_bound == 0: return 0.0 elif nref_bound > 0 and nhyp_bound == 0: return nref_bound * ins_cost elif nref_bound == 0 and nhyp_bound > 0: return nhyp_bound * del_cost mat = _init_mat(nhyp_bound + 1, nref_bound + 1, ins_cost, del_cost) _ghd_aux(mat, hyp_idx, ref_idx, ins_cost, del_cost, shift_cost_coeff) return mat[-1, -1] # Beeferman's Pk text segmentation evaluation metric def pk(ref, hyp, k=None, boundary="1"): """ Compute the Pk metric for a pair of segmentations A segmentation is any sequence over a vocabulary of two items (e.g. "0", "1"), where the specified boundary value is used to mark the edge of a segmentation. >>> '%.2f' % pk('0100'*100, '1'*400, 2) '0.50' >>> '%.2f' % pk('0100'*100, '0'*400, 2) '0.50' >>> '%.2f' % pk('0100'*100, '0100'*100, 2) '0.00' :param ref: the reference segmentation :type ref: str or list :param hyp: the segmentation to evaluate :type hyp: str or list :param k: window size, if None, set to half of the average reference segment length :type boundary: str or int or bool :param boundary: boundary value :type boundary: str or int or bool :rtype: float """ if k is None: k = int(round(len(ref) / (ref.count(boundary) * 2.0))) err = 0 for i in range(len(ref) - k + 1): r = ref[i : i + k].count(boundary) > 0 h = hyp[i : i + k].count(boundary) > 0 if r != h: err += 1 return err / (len(ref) - k + 1.0) nltk-3.7/nltk/metrics/spearman.py000066400000000000000000000041211420073152400171250ustar00rootroot00000000000000# Natural Language Toolkit: Spearman Rank Correlation # # Copyright (C) 2001-2022 NLTK Project # Author: Joel Nothman # URL: # For license information, see LICENSE.TXT """ Tools for comparing ranked lists. """ def _rank_dists(ranks1, ranks2): """Finds the difference between the values in ranks1 and ranks2 for keys present in both dicts. If the arguments are not dicts, they are converted from (key, rank) sequences. """ ranks1 = dict(ranks1) ranks2 = dict(ranks2) for k in ranks1: try: yield k, ranks1[k] - ranks2[k] except KeyError: pass def spearman_correlation(ranks1, ranks2): """Returns the Spearman correlation coefficient for two rankings, which should be dicts or sequences of (key, rank). The coefficient ranges from -1.0 (ranks are opposite) to 1.0 (ranks are identical), and is only calculated for keys in both rankings (for meaningful results, remove keys present in only one list before ranking).""" n = 0 res = 0 for k, d in _rank_dists(ranks1, ranks2): res += d * d n += 1 try: return 1 - (6 * res / (n * (n * n - 1))) except ZeroDivisionError: # Result is undefined if only one item is ranked return 0.0 def ranks_from_sequence(seq): """Given a sequence, yields each element with an increasing rank, suitable for use as an argument to ``spearman_correlation``. """ return ((k, i) for i, k in enumerate(seq)) def ranks_from_scores(scores, rank_gap=1e-15): """Given a sequence of (key, score) tuples, yields each key with an increasing rank, tying with previous key's rank if the difference between their scores is less than rank_gap. Suitable for use as an argument to ``spearman_correlation``. 
""" prev_score = None rank = 0 for i, (key, score) in enumerate(scores): try: if abs(score - prev_score) > rank_gap: rank = i except TypeError: pass yield key, rank prev_score = score nltk-3.7/nltk/misc/000077500000000000000000000000001420073152400142345ustar00rootroot00000000000000nltk-3.7/nltk/misc/__init__.py000066400000000000000000000006131420073152400163450ustar00rootroot00000000000000# Natural Language Toolkit: Miscellaneous modules # # Copyright (C) 2001-2022 NLTK Project # Author: Steven Bird # URL: # For license information, see LICENSE.TXT from nltk.misc.babelfish import babelize_shell from nltk.misc.chomsky import generate_chomsky from nltk.misc.minimalset import MinimalSet from nltk.misc.wordfinder import word_finder nltk-3.7/nltk/misc/babelfish.py000066400000000000000000000005371420073152400165320ustar00rootroot00000000000000""" This module previously provided an interface to Babelfish online translation service; this service is no longer available; this module is kept in NLTK source code in order to provide better error messages for people following the NLTK Book 2.0. """ def babelize_shell(): print("Babelfish online translation service is no longer available.") nltk-3.7/nltk/misc/chomsky.py000066400000000000000000000121011420073152400162560ustar00rootroot00000000000000# Chomsky random text generator, version 1.1, Raymond Hettinger, 2005/09/13 # https://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/440546 """ CHOMSKY is an aid to writing linguistic papers in the style of the great master. It is based on selected phrases taken from actual books and articles written by Noam Chomsky. Upon request, it assembles the phrases in the elegant stylistic patterns that Chomsky is noted for. To generate n sentences of linguistic wisdom, type (CHOMSKY n) -- for example (CHOMSKY 5) generates half a screen of linguistic truth. """ leadins = """To characterize a linguistic level L, On the other hand, This suggests that It appears that Furthermore, We will bring evidence in favor of the following thesis: To provide a constituent structure for T(Z,K), From C1, it follows that For any transformation which is sufficiently diversified in \ application to be of any interest, Analogously, Clearly, Note that Of course, Suppose, for instance, that Thus With this clarification, Conversely, We have already seen that By combining adjunctions and certain deformations, I suggested that these results would follow from the assumption that If the position of the trace in (99c) were only relatively \ inaccessible to movement, However, this assumption is not correct, since Comparing these examples with their parasitic gap counterparts in \ (96) and (97), we see that In the discussion of resumptive pronouns following (81), So far, Nevertheless, For one thing, Summarizing, then, we assume that A consequence of the approach just outlined is that Presumably, On our assumptions, It may be, then, that It must be emphasized, once again, that Let us continue to suppose that Notice, incidentally, that """ # List of LEADINs to buy time. 
subjects = """ the notion of level of grammaticalness a case of semigrammaticalness of a different sort most of the methodological work in modern linguistics a subset of English sentences interesting on quite independent grounds the natural general principle that will subsume this case an important property of these three types of EC any associated supporting element the appearance of parasitic gaps in domains relatively inaccessible \ to ordinary extraction the speaker-hearer's linguistic intuition the descriptive power of the base component the earlier discussion of deviance this analysis of a formative as a pair of sets of features this selectionally introduced contextual feature a descriptively adequate grammar the fundamental error of regarding functional notions as categorial relational information the systematic use of complex symbols the theory of syntactic features developed earlier""" # List of SUBJECTs chosen for maximum professorial macho. verbs = """can be defined in such a way as to impose delimits suffices to account for cannot be arbitrary in is not subject to does not readily tolerate raises serious doubts about is not quite equivalent to does not affect the structure of may remedy and, at the same time, eliminate is not to be considered in determining is to be regarded as is unspecified with respect to is, apparently, determined by is necessary to impose an interpretation on appears to correlate rather closely with is rather different from""" # List of VERBs chosen for autorecursive obfuscation. objects = """ problems of phonemic and morphological analysis. a corpus of utterance tokens upon which conformity has been defined \ by the paired utterance test. the traditional practice of grammarians. the levels of acceptability from fairly high (e.g. (99a)) to virtual \ gibberish (e.g. (98d)). a stipulation to place the constructions into these various categories. a descriptive fact. a parasitic gap construction. the extended c-command discussed in connection with (34). the ultimate standard that determines the accuracy of any proposed grammar. the system of base rules exclusive of the lexicon. irrelevant intervening contexts in selectional rules. nondistinctness in the sense of distinctive feature theory. a general convention regarding the forms of the grammar. an abstract underlying order. an important distinction in language use. the requirement that branching is not tolerated within the dominance \ scope of a complex symbol. the strong generative capacity of the theory.""" # List of OBJECTs selected for profound sententiousness. import random import textwrap from itertools import chain, islice def generate_chomsky(times=5, line_length=72): parts = [] for part in (leadins, subjects, verbs, objects): phraselist = list(map(str.strip, part.splitlines())) random.shuffle(phraselist) parts.append(phraselist) output = chain.from_iterable(islice(zip(*parts), 0, times)) print(textwrap.fill(" ".join(output), line_length)) if __name__ == "__main__": generate_chomsky() nltk-3.7/nltk/misc/minimalset.py000066400000000000000000000055161420073152400167570ustar00rootroot00000000000000# Natural Language Toolkit: Minimal Sets # # Copyright (C) 2001-2022 NLTK Project # Author: Steven Bird # URL: # For license information, see LICENSE.TXT from collections import defaultdict class MinimalSet: """ Find contexts where more than one possible target value can appear. E.g. 
if targets are word-initial letters, and contexts are the remainders of words, then we would like to find cases like "fat" vs "cat", and "training" vs "draining". If targets are parts-of-speech and contexts are words, then we would like to find cases like wind (noun) 'air in rapid motion', vs wind (verb) 'coil, wrap'. """ def __init__(self, parameters=None): """ Create a new minimal set. :param parameters: The (context, target, display) tuples for the item :type parameters: list(tuple(str, str, str)) """ self._targets = set() # the contrastive information self._contexts = set() # what we are controlling for self._seen = defaultdict(set) # to record what we have seen self._displays = {} # what we will display if parameters: for context, target, display in parameters: self.add(context, target, display) def add(self, context, target, display): """ Add a new item to the minimal set, having the specified context, target, and display form. :param context: The context in which the item of interest appears :type context: str :param target: The item of interest :type target: str :param display: The information to be reported for each item :type display: str """ # Store the set of targets that occurred in this context self._seen[context].add(target) # Keep track of which contexts and targets we have seen self._contexts.add(context) self._targets.add(target) # For a given context and target, store the display form self._displays[(context, target)] = display def contexts(self, minimum=2): """ Determine which contexts occurred with enough distinct targets. :param minimum: the minimum number of distinct target forms :type minimum: int :rtype: list """ return [c for c in self._contexts if len(self._seen[c]) >= minimum] def display(self, context, target, default=""): if (context, target) in self._displays: return self._displays[(context, target)] else: return default def display_all(self, context): result = [] for target in self._targets: x = self.display(context, target) if x: result.append(x) return result def targets(self): return self._targets nltk-3.7/nltk/misc/sort.py000066400000000000000000000104231420073152400155750ustar00rootroot00000000000000# Natural Language Toolkit: List Sorting # # Copyright (C) 2001-2022 NLTK Project # Author: Steven Bird # URL: # For license information, see LICENSE.TXT """ This module provides a variety of list sorting algorithms, to illustrate the many different algorithms (recipes) for solving a problem, and how to analyze algorithms experimentally. """ # These algorithms are taken from: # Levitin (2004) The Design and Analysis of Algorithms ################################################################## # Selection Sort ################################################################## def selection(a): """ Selection Sort: scan the list to find its smallest element, then swap it with the first element. The remainder of the list is one element smaller; apply the same method to this list, and so on. """ count = 0 for i in range(len(a) - 1): min = i for j in range(i + 1, len(a)): if a[j] < a[min]: min = j count += 1 a[min], a[i] = a[i], a[min] return count ################################################################## # Bubble Sort ################################################################## def bubble(a): """ Bubble Sort: compare adjacent elements of the list left-to-right, and swap them if they are out of order. After one pass through the list swapping adjacent items, the largest item will be in the rightmost position. 
The remainder is one element smaller; apply the same method to this list, and so on. """ count = 0 for i in range(len(a) - 1): for j in range(len(a) - i - 1): if a[j + 1] < a[j]: a[j], a[j + 1] = a[j + 1], a[j] count += 1 return count ################################################################## # Merge Sort ################################################################## def _merge_lists(b, c): count = 0 i = j = 0 a = [] while i < len(b) and j < len(c): count += 1 if b[i] <= c[j]: a.append(b[i]) i += 1 else: a.append(c[j]) j += 1 if i == len(b): a += c[j:] else: a += b[i:] return a, count def merge(a): """ Merge Sort: split the list in half, and sort each half, then combine the sorted halves. """ count = 0 if len(a) > 1: midpoint = len(a) // 2 b = a[:midpoint] c = a[midpoint:] count_b = merge(b) count_c = merge(c) result, count_a = _merge_lists(b, c) a[:] = result # copy the result back into a. count = count_a + count_b + count_c return count ################################################################## # Quick Sort ################################################################## def _partition(a, l, r): p = a[l] i = l j = r + 1 count = 0 while True: while i < r: i += 1 if a[i] >= p: break while j > l: j -= 1 if j < l or a[j] <= p: break a[i], a[j] = a[j], a[i] # swap count += 1 if i >= j: break a[i], a[j] = a[j], a[i] # undo last swap a[l], a[j] = a[j], a[l] return j, count def _quick(a, l, r): count = 0 if l < r: s, count = _partition(a, l, r) count += _quick(a, l, s - 1) count += _quick(a, s + 1, r) return count def quick(a): return _quick(a, 0, len(a) - 1) ################################################################## # Demonstration ################################################################## def demo(): from random import shuffle for size in (10, 20, 50, 100, 200, 500, 1000): a = list(range(size)) # various sort methods shuffle(a) count_selection = selection(a) shuffle(a) count_bubble = bubble(a) shuffle(a) count_merge = merge(a) shuffle(a) count_quick = quick(a) print( ("size=%5d: selection=%8d, bubble=%8d, " "merge=%6d, quick=%6d") % (size, count_selection, count_bubble, count_merge, count_quick) ) if __name__ == "__main__": demo() nltk-3.7/nltk/misc/wordfinder.py000066400000000000000000000101651420073152400167540ustar00rootroot00000000000000# Natural Language Toolkit: Word Finder # # Copyright (C) 2001-2022 NLTK Project # Author: Steven Bird # URL: # For license information, see LICENSE.TXT # Simplified from PHP version by Robert Klein # http://fswordfinder.sourceforge.net/ import random # reverse a word with probability 0.5 def revword(word): if random.randint(1, 2) == 1: return word[::-1] return word # try to insert word at position x,y; direction encoded in xf,yf def step(word, x, xf, y, yf, grid): for i in range(len(word)): if grid[xf(i)][yf(i)] != "" and grid[xf(i)][yf(i)] != word[i]: return False for i in range(len(word)): grid[xf(i)][yf(i)] = word[i] return True # try to insert word at position x,y, in direction dir def check(word, dir, x, y, grid, rows, cols): if dir == 1: if x - len(word) < 0 or y - len(word) < 0: return False return step(word, x, lambda i: x - i, y, lambda i: y - i, grid) elif dir == 2: if x - len(word) < 0: return False return step(word, x, lambda i: x - i, y, lambda i: y, grid) elif dir == 3: if x - len(word) < 0 or y + (len(word) - 1) >= cols: return False return step(word, x, lambda i: x - i, y, lambda i: y + i, grid) elif dir == 4: if y - len(word) < 0: return False return step(word, x, lambda i: x, y, lambda i: y - i, grid) 
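# ---------------------------------------------------------------------------
# Illustrative sketch (added for exposition, not part of the original module):
# shows how check() and step() above encode the four placement directions.
# The grid size and the words are invented for this example, and
# _placement_sketch is a hypothetical helper.
def _placement_sketch():
    rows = cols = 4
    grid = [[""] * cols for _ in range(rows)]
    # Direction 4 keeps the row fixed and walks the column index backwards,
    # so "CAT" placed at (x=1, y=3) lands in grid[1][3], grid[1][2], grid[1][1].
    assert check("CAT", 4, 1, 3, grid, rows, cols)
    assert [grid[1][c] for c in range(cols)] == ["", "T", "A", "C"]
    # Direction 2 keeps the column fixed and walks the row index backwards;
    # "TEA" from (x=3, y=2) ends on grid[1][2], which already holds "A".
    # Matching letters may share a cell, so this placement succeeds.
    assert check("TEA", 2, 3, 2, grid, rows, cols)
    # A clash with a different existing letter makes check() refuse the word
    # and leave the grid untouched: "DOG" would need grid[1][3] == "G",
    # but that cell already holds "C".
    assert not check("DOG", 2, 3, 3, grid, rows, cols)
# ---------------------------------------------------------------------------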
def wordfinder(words, rows=20, cols=20, attempts=50, alph="ABCDEFGHIJKLMNOPQRSTUVWXYZ"): """ Attempt to arrange words into a letter-grid with the specified number of rows and columns. Try each word in several positions and directions, until it can be fitted into the grid, or the maximum number of allowable attempts is exceeded. Returns a tuple consisting of the grid and the words that were successfully placed. :param words: the list of words to be put into the grid :type words: list :param rows: the number of rows in the grid :type rows: int :param cols: the number of columns in the grid :type cols: int :param attempts: the number of times to attempt placing a word :type attempts: int :param alph: the alphabet, to be used for filling blank cells :type alph: list :rtype: tuple """ # place longer words first words = sorted(words, key=len, reverse=True) grid = [] # the letter grid used = [] # the words we used # initialize the grid for i in range(rows): grid.append([""] * cols) # try to place each word for word in words: word = word.strip().upper() # normalize save = word # keep a record of the word word = revword(word) for attempt in range(attempts): r = random.randint(0, len(word)) dir = random.choice([1, 2, 3, 4]) x = random.randint(0, rows) y = random.randint(0, cols) if dir == 1: x += r y += r elif dir == 2: x += r elif dir == 3: x += r y -= r elif dir == 4: y += r if 0 <= x < rows and 0 <= y < cols: if check(word, dir, x, y, grid, rows, cols): # used.append((save, dir, x, y, word)) used.append(save) break # Fill up the remaining spaces for i in range(rows): for j in range(cols): if grid[i][j] == "": grid[i][j] = random.choice(alph) return grid, used def word_finder(): from nltk.corpus import words wordlist = words.words() random.shuffle(wordlist) wordlist = wordlist[:200] wordlist = [w for w in wordlist if 3 <= len(w) <= 12] grid, used = wordfinder(wordlist) print("Word Finder\n") for i in range(len(grid)): for j in range(len(grid[i])): print(grid[i][j], end=" ") print() print() for i in range(len(used)): print("%d:" % (i + 1), used[i]) if __name__ == "__main__": word_finder() nltk-3.7/nltk/parse/000077500000000000000000000000001420073152400144135ustar00rootroot00000000000000nltk-3.7/nltk/parse/__init__.py000066400000000000000000000071571420073152400165360ustar00rootroot00000000000000# Natural Language Toolkit: Parsers # # Copyright (C) 2001-2022 NLTK Project # Author: Steven Bird # Edward Loper # URL: # For license information, see LICENSE.TXT # """ NLTK Parsers Classes and interfaces for producing tree structures that represent the internal organization of a text. This task is known as "parsing" the text, and the resulting tree structures are called the text's "parses". Typically, the text is a single sentence, and the tree structure represents the syntactic structure of the sentence. However, parsers can also be used in other domains. For example, parsers can be used to derive the morphological structure of the morphemes that make up a word, or to derive the discourse structure for a set of utterances. Sometimes, a single piece of text can be represented by more than one tree structure. Texts represented by more than one tree structure are called "ambiguous" texts. Note that there are actually two ways in which a text can be ambiguous: - The text has multiple correct parses. - There is not enough information to decide which of several candidate parses is correct. However, the parser module does *not* distinguish these two types of ambiguity. 
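A minimal usage sketch (the toy grammar below is an invented example,
not one shipped with NLTK)::

    >>> from nltk import CFG
    >>> from nltk.parse import ChartParser
    >>> grammar = CFG.fromstring('''
    ... S -> NP VP
    ... NP -> 'I' | Det N
    ... VP -> V NP
    ... Det -> 'an'
    ... N -> 'apple'
    ... V -> 'ate'
    ... ''')
    >>> parser = ChartParser(grammar)
    >>> for tree in parser.parse("I ate an apple".split()):
    ...     print(tree)
    (S (NP I) (VP (V ate) (NP (Det an) (N apple))))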
The parser module defines ``ParserI``, a standard interface for parsing texts; and two simple implementations of that interface, ``ShiftReduceParser`` and ``RecursiveDescentParser``. It also contains three sub-modules for specialized kinds of parsing: - ``nltk.parser.chart`` defines chart parsing, which uses dynamic programming to efficiently parse texts. - ``nltk.parser.probabilistic`` defines probabilistic parsing, which associates a probability with each parse. """ from nltk.parse.api import ParserI from nltk.parse.bllip import BllipParser from nltk.parse.chart import ( BottomUpChartParser, BottomUpLeftCornerChartParser, ChartParser, LeftCornerChartParser, SteppingChartParser, TopDownChartParser, ) from nltk.parse.corenlp import CoreNLPDependencyParser, CoreNLPParser from nltk.parse.dependencygraph import DependencyGraph from nltk.parse.earleychart import ( EarleyChartParser, FeatureEarleyChartParser, FeatureIncrementalBottomUpChartParser, FeatureIncrementalBottomUpLeftCornerChartParser, FeatureIncrementalChartParser, FeatureIncrementalTopDownChartParser, IncrementalBottomUpChartParser, IncrementalBottomUpLeftCornerChartParser, IncrementalChartParser, IncrementalLeftCornerChartParser, IncrementalTopDownChartParser, ) from nltk.parse.evaluate import DependencyEvaluator from nltk.parse.featurechart import ( FeatureBottomUpChartParser, FeatureBottomUpLeftCornerChartParser, FeatureChartParser, FeatureTopDownChartParser, ) from nltk.parse.malt import MaltParser from nltk.parse.nonprojectivedependencyparser import ( NaiveBayesDependencyScorer, NonprojectiveDependencyParser, ProbabilisticNonprojectiveParser, ) from nltk.parse.pchart import ( BottomUpProbabilisticChartParser, InsideChartParser, LongestChartParser, RandomChartParser, UnsortedChartParser, ) from nltk.parse.projectivedependencyparser import ( ProbabilisticProjectiveDependencyParser, ProjectiveDependencyParser, ) from nltk.parse.recursivedescent import ( RecursiveDescentParser, SteppingRecursiveDescentParser, ) from nltk.parse.shiftreduce import ShiftReduceParser, SteppingShiftReduceParser from nltk.parse.transitionparser import TransitionParser from nltk.parse.util import TestGrammar, extract_test_sentences, load_parser from nltk.parse.viterbi import ViterbiParser nltk-3.7/nltk/parse/api.py000066400000000000000000000043521420073152400155420ustar00rootroot00000000000000# Natural Language Toolkit: Parser API # # Copyright (C) 2001-2022 NLTK Project # Author: Steven Bird # Edward Loper # URL: # For license information, see LICENSE.TXT # import itertools from nltk.internals import overridden class ParserI: """ A processing class for deriving trees that represent possible structures for a sequence of tokens. These tree structures are known as "parses". Typically, parsers are used to derive syntax trees for sentences. But parsers can also be used to derive other kinds of tree structure, such as morphological trees and discourse structures. Subclasses must define: - at least one of: ``parse()``, ``parse_sents()``. Subclasses may define: - ``grammar()`` """ def grammar(self): """ :return: The grammar used by this parser. """ raise NotImplementedError() def parse(self, sent, *args, **kwargs): """ :return: An iterator that generates parse trees for the sentence. When possible this list is sorted from most likely to least likely. 
:param sent: The sentence to be parsed :type sent: list(str) :rtype: iter(Tree) """ if overridden(self.parse_sents): return next(self.parse_sents([sent], *args, **kwargs)) elif overridden(self.parse_one): return ( tree for tree in [self.parse_one(sent, *args, **kwargs)] if tree is not None ) elif overridden(self.parse_all): return iter(self.parse_all(sent, *args, **kwargs)) else: raise NotImplementedError() def parse_sents(self, sents, *args, **kwargs): """ Apply ``self.parse()`` to each element of ``sents``. :rtype: iter(iter(Tree)) """ return (self.parse(sent, *args, **kwargs) for sent in sents) def parse_all(self, sent, *args, **kwargs): """:rtype: list(Tree)""" return list(self.parse(sent, *args, **kwargs)) def parse_one(self, sent, *args, **kwargs): """:rtype: Tree or None""" return next(self.parse(sent, *args, **kwargs), None) nltk-3.7/nltk/parse/bllip.py000066400000000000000000000246661420073152400161050ustar00rootroot00000000000000# Natural Language Toolkit: Interface to BLLIP Parser # # Author: David McClosky # # Copyright (C) 2001-2022 NLTK Project # URL: # For license information, see LICENSE.TXT from nltk.parse.api import ParserI from nltk.tree import Tree """ Interface for parsing with BLLIP Parser. Requires the Python bllipparser module. BllipParser objects can be constructed with the ``BllipParser.from_unified_model_dir`` class method or manually using the ``BllipParser`` constructor. The former is generally easier if you have a BLLIP Parser unified model directory -- a basic model can be obtained from NLTK's downloader. More unified parsing models can be obtained with BLLIP Parser's ModelFetcher (run ``python -m bllipparser.ModelFetcher`` or see docs for ``bllipparser.ModelFetcher.download_and_install_model``). Basic usage:: # download and install a basic unified parsing model (Wall Street Journal) # sudo python -m nltk.downloader bllip_wsj_no_aux >>> from nltk.data import find >>> model_dir = find('models/bllip_wsj_no_aux').path >>> bllip = BllipParser.from_unified_model_dir(model_dir) # 1-best parsing >>> sentence1 = 'British left waffles on Falklands .'.split() >>> top_parse = bllip.parse_one(sentence1) >>> print(top_parse) (S1 (S (NP (JJ British) (NN left)) (VP (VBZ waffles) (PP (IN on) (NP (NNP Falklands)))) (. .))) # n-best parsing >>> sentence2 = 'Time flies'.split() >>> all_parses = bllip.parse_all(sentence2) >>> print(len(all_parses)) 50 >>> print(all_parses[0]) (S1 (S (NP (NNP Time)) (VP (VBZ flies)))) # incorporating external tagging constraints (None means unconstrained tag) >>> constrained1 = bllip.tagged_parse([('Time', 'VB'), ('flies', 'NNS')]) >>> print(next(constrained1)) (S1 (NP (VB Time) (NNS flies))) >>> constrained2 = bllip.tagged_parse([('Time', 'NN'), ('flies', None)]) >>> print(next(constrained2)) (S1 (NP (NN Time) (VBZ flies))) References ---------- - Charniak, Eugene. "A maximum-entropy-inspired parser." Proceedings of the 1st North American chapter of the Association for Computational Linguistics conference. Association for Computational Linguistics, 2000. - Charniak, Eugene, and Mark Johnson. "Coarse-to-fine n-best parsing and MaxEnt discriminative reranking." Proceedings of the 43rd Annual Meeting on Association for Computational Linguistics. Association for Computational Linguistics, 2005. Known issues ------------ Note that BLLIP Parser is not currently threadsafe. Since this module uses a SWIG interface, it is potentially unsafe to create multiple ``BllipParser`` objects in the same process. 
BLLIP Parser currently has issues with non-ASCII text and will raise an error if given any. See https://pypi.python.org/pypi/bllipparser/ for more information on BLLIP Parser's Python interface. """ __all__ = ["BllipParser"] # this block allows this module to be imported even if bllipparser isn't # available try: from bllipparser import RerankingParser from bllipparser.RerankingParser import get_unified_model_parameters def _ensure_bllip_import_or_error(): pass except ImportError as ie: def _ensure_bllip_import_or_error(ie=ie): raise ImportError("Couldn't import bllipparser module: %s" % ie) def _ensure_ascii(words): try: for i, word in enumerate(words): word.encode("ascii") except UnicodeEncodeError as e: raise ValueError( f"Token {i} ({word!r}) is non-ASCII. BLLIP Parser " "currently doesn't support non-ASCII inputs." ) from e def _scored_parse_to_nltk_tree(scored_parse): return Tree.fromstring(str(scored_parse.ptb_parse)) class BllipParser(ParserI): """ Interface for parsing with BLLIP Parser. BllipParser objects can be constructed with the ``BllipParser.from_unified_model_dir`` class method or manually using the ``BllipParser`` constructor. """ def __init__( self, parser_model=None, reranker_features=None, reranker_weights=None, parser_options=None, reranker_options=None, ): """ Load a BLLIP Parser model from scratch. You'll typically want to use the ``from_unified_model_dir()`` class method to construct this object. :param parser_model: Path to parser model directory :type parser_model: str :param reranker_features: Path the reranker model's features file :type reranker_features: str :param reranker_weights: Path the reranker model's weights file :type reranker_weights: str :param parser_options: optional dictionary of parser options, see ``bllipparser.RerankingParser.RerankingParser.load_parser_options()`` for more information. :type parser_options: dict(str) :param reranker_options: optional dictionary of reranker options, see ``bllipparser.RerankingParser.RerankingParser.load_reranker_model()`` for more information. :type reranker_options: dict(str) """ _ensure_bllip_import_or_error() parser_options = parser_options or {} reranker_options = reranker_options or {} self.rrp = RerankingParser() self.rrp.load_parser_model(parser_model, **parser_options) if reranker_features and reranker_weights: self.rrp.load_reranker_model( features_filename=reranker_features, weights_filename=reranker_weights, **reranker_options, ) def parse(self, sentence): """ Use BLLIP Parser to parse a sentence. Takes a sentence as a list of words; it will be automatically tagged with this BLLIP Parser instance's tagger. :return: An iterator that generates parse trees for the sentence from most likely to least likely. :param sentence: The sentence to be parsed :type sentence: list(str) :rtype: iter(Tree) """ _ensure_ascii(sentence) nbest_list = self.rrp.parse(sentence) for scored_parse in nbest_list: yield _scored_parse_to_nltk_tree(scored_parse) def tagged_parse(self, word_and_tag_pairs): """ Use BLLIP to parse a sentence. Takes a sentence as a list of (word, tag) tuples; the sentence must have already been tokenized and tagged. BLLIP will attempt to use the tags provided but may use others if it can't come up with a complete parse subject to those constraints. You may also specify a tag as ``None`` to leave a token's tag unconstrained. :return: An iterator that generates parse trees for the sentence from most likely to least likely. 
:param sentence: Input sentence to parse as (word, tag) pairs :type sentence: list(tuple(str, str)) :rtype: iter(Tree) """ words = [] tag_map = {} for i, (word, tag) in enumerate(word_and_tag_pairs): words.append(word) if tag is not None: tag_map[i] = tag _ensure_ascii(words) nbest_list = self.rrp.parse_tagged(words, tag_map) for scored_parse in nbest_list: yield _scored_parse_to_nltk_tree(scored_parse) @classmethod def from_unified_model_dir( cls, model_dir, parser_options=None, reranker_options=None ): """ Create a ``BllipParser`` object from a unified parsing model directory. Unified parsing model directories are a standardized way of storing BLLIP parser and reranker models together on disk. See ``bllipparser.RerankingParser.get_unified_model_parameters()`` for more information about unified model directories. :return: A ``BllipParser`` object using the parser and reranker models in the model directory. :param model_dir: Path to the unified model directory. :type model_dir: str :param parser_options: optional dictionary of parser options, see ``bllipparser.RerankingParser.RerankingParser.load_parser_options()`` for more information. :type parser_options: dict(str) :param reranker_options: optional dictionary of reranker options, see ``bllipparser.RerankingParser.RerankingParser.load_reranker_model()`` for more information. :type reranker_options: dict(str) :rtype: BllipParser """ ( parser_model_dir, reranker_features_filename, reranker_weights_filename, ) = get_unified_model_parameters(model_dir) return cls( parser_model_dir, reranker_features_filename, reranker_weights_filename, parser_options, reranker_options, ) def demo(): """This assumes the Python module bllipparser is installed.""" # download and install a basic unified parsing model (Wall Street Journal) # sudo python -m nltk.downloader bllip_wsj_no_aux from nltk.data import find model_dir = find("models/bllip_wsj_no_aux").path print("Loading BLLIP Parsing models...") # the easiest way to get started is to use a unified model bllip = BllipParser.from_unified_model_dir(model_dir) print("Done.") sentence1 = "British left waffles on Falklands .".split() sentence2 = "I saw the man with the telescope .".split() # this sentence is known to fail under the WSJ parsing model fail1 = "# ! ? : -".split() for sentence in (sentence1, sentence2, fail1): print("Sentence: %r" % " ".join(sentence)) try: tree = next(bllip.parse(sentence)) print(tree) except StopIteration: print("(parse failed)") # n-best parsing demo for i, parse in enumerate(bllip.parse(sentence1)): print("parse %d:\n%s" % (i, parse)) # using external POS tag constraints print( "forcing 'tree' to be 'NN':", next(bllip.tagged_parse([("A", None), ("tree", "NN")])), ) print( "forcing 'A' to be 'DT' and 'tree' to be 'NNP':", next(bllip.tagged_parse([("A", "DT"), ("tree", "NNP")])), ) # constraints don't have to make sense... 
(though on more complicated # sentences, they may cause the parse to fail) print( "forcing 'A' to be 'NNP':", next(bllip.tagged_parse([("A", "NNP"), ("tree", None)])), ) nltk-3.7/nltk/parse/broker_test.cfg000066400000000000000000000004361420073152400174220ustar00rootroot00000000000000%start S S[sem=] -> NP[sem=?subj] VP[sem=?vp] VP[sem = ] -> V[sem = ?v] NP[sem=?obj] VP[sem = ?v] -> V[sem = ?v] NP[sem = ] -> 'Kim' NP[sem = ] -> 'I' V[sem = <\x y.(like x y)>, tns=pres] -> 'like' V[sem = <\x.(sleeps x)>, tns=pres] -> 'sleeps' nltk-3.7/nltk/parse/chart.py000066400000000000000000001707301420073152400160760ustar00rootroot00000000000000# Natural Language Toolkit: A Chart Parser # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # Steven Bird # Jean Mark Gawron # Peter Ljunglöf # URL: # For license information, see LICENSE.TXT """ Data classes and parser implementations for "chart parsers", which use dynamic programming to efficiently parse a text. A chart parser derives parse trees for a text by iteratively adding "edges" to a "chart." Each edge represents a hypothesis about the tree structure for a subsequence of the text. The chart is a "blackboard" for composing and combining these hypotheses. When a chart parser begins parsing a text, it creates a new (empty) chart, spanning the text. It then incrementally adds new edges to the chart. A set of "chart rules" specifies the conditions under which new edges should be added to the chart. Once the chart reaches a stage where none of the chart rules adds any new edges, parsing is complete. Charts are encoded with the ``Chart`` class, and edges are encoded with the ``TreeEdge`` and ``LeafEdge`` classes. The chart parser module defines three chart parsers: - ``ChartParser`` is a simple and flexible chart parser. Given a set of chart rules, it will apply those rules to the chart until no more edges are added. - ``SteppingChartParser`` is a subclass of ``ChartParser`` that can be used to step through the parsing process. """ import itertools import re import warnings from functools import total_ordering from nltk.grammar import PCFG, is_nonterminal, is_terminal from nltk.internals import raise_unorderable_types from nltk.parse.api import ParserI from nltk.tree import Tree from nltk.util import OrderedDict ######################################################################## ## Edges ######################################################################## @total_ordering class EdgeI: """ A hypothesis about the structure of part of a sentence. Each edge records the fact that a structure is (partially) consistent with the sentence. An edge contains: - A span, indicating what part of the sentence is consistent with the hypothesized structure. - A left-hand side, specifying what kind of structure is hypothesized. - A right-hand side, specifying the contents of the hypothesized structure. - A dot position, indicating how much of the hypothesized structure is consistent with the sentence. Every edge is either complete or incomplete: - An edge is complete if its structure is fully consistent with the sentence. - An edge is incomplete if its structure is partially consistent with the sentence. For every incomplete edge, the span specifies a possible prefix for the edge's structure. There are two kinds of edge: - A ``TreeEdge`` records which trees have been found to be (partially) consistent with the text. - A ``LeafEdge`` records the tokens occurring in the text. 
The ``EdgeI`` interface provides a common interface to both types of edge, allowing chart parsers to treat them in a uniform manner. """ def __init__(self): if self.__class__ == EdgeI: raise TypeError("Edge is an abstract interface") # //////////////////////////////////////////////////////////// # Span # //////////////////////////////////////////////////////////// def span(self): """ Return a tuple ``(s, e)``, where ``tokens[s:e]`` is the portion of the sentence that is consistent with this edge's structure. :rtype: tuple(int, int) """ raise NotImplementedError() def start(self): """ Return the start index of this edge's span. :rtype: int """ raise NotImplementedError() def end(self): """ Return the end index of this edge's span. :rtype: int """ raise NotImplementedError() def length(self): """ Return the length of this edge's span. :rtype: int """ raise NotImplementedError() # //////////////////////////////////////////////////////////// # Left Hand Side # //////////////////////////////////////////////////////////// def lhs(self): """ Return this edge's left-hand side, which specifies what kind of structure is hypothesized by this edge. :see: ``TreeEdge`` and ``LeafEdge`` for a description of the left-hand side values for each edge type. """ raise NotImplementedError() # //////////////////////////////////////////////////////////// # Right Hand Side # //////////////////////////////////////////////////////////// def rhs(self): """ Return this edge's right-hand side, which specifies the content of the structure hypothesized by this edge. :see: ``TreeEdge`` and ``LeafEdge`` for a description of the right-hand side values for each edge type. """ raise NotImplementedError() def dot(self): """ Return this edge's dot position, which indicates how much of the hypothesized structure is consistent with the sentence. In particular, ``self.rhs[:dot]`` is consistent with ``tokens[self.start():self.end()]``. :rtype: int """ raise NotImplementedError() def nextsym(self): """ Return the element of this edge's right-hand side that immediately follows its dot. :rtype: Nonterminal or terminal or None """ raise NotImplementedError() def is_complete(self): """ Return True if this edge's structure is fully consistent with the text. :rtype: bool """ raise NotImplementedError() def is_incomplete(self): """ Return True if this edge's structure is partially consistent with the text. :rtype: bool """ raise NotImplementedError() # //////////////////////////////////////////////////////////// # Comparisons & hashing # //////////////////////////////////////////////////////////// def __eq__(self, other): return ( self.__class__ is other.__class__ and self._comparison_key == other._comparison_key ) def __ne__(self, other): return not self == other def __lt__(self, other): if not isinstance(other, EdgeI): raise_unorderable_types("<", self, other) if self.__class__ is other.__class__: return self._comparison_key < other._comparison_key else: return self.__class__.__name__ < other.__class__.__name__ def __hash__(self): try: return self._hash except AttributeError: self._hash = hash(self._comparison_key) return self._hash class TreeEdge(EdgeI): """ An edge that records the fact that a tree is (partially) consistent with the sentence. A tree edge consists of: - A span, indicating what part of the sentence is consistent with the hypothesized tree. - A left-hand side, specifying the hypothesized tree's node value. - A right-hand side, specifying the hypothesized tree's children. 
Each element of the right-hand side is either a terminal, specifying a token with that terminal as its leaf value; or a nonterminal, specifying a subtree with that nonterminal's symbol as its node value. - A dot position, indicating which children are consistent with part of the sentence. In particular, if ``dot`` is the dot position, ``rhs`` is the right-hand size, ``(start,end)`` is the span, and ``sentence`` is the list of tokens in the sentence, then ``tokens[start:end]`` can be spanned by the children specified by ``rhs[:dot]``. For more information about edges, see the ``EdgeI`` interface. """ def __init__(self, span, lhs, rhs, dot=0): """ Construct a new ``TreeEdge``. :type span: tuple(int, int) :param span: A tuple ``(s, e)``, where ``tokens[s:e]`` is the portion of the sentence that is consistent with the new edge's structure. :type lhs: Nonterminal :param lhs: The new edge's left-hand side, specifying the hypothesized tree's node value. :type rhs: list(Nonterminal and str) :param rhs: The new edge's right-hand side, specifying the hypothesized tree's children. :type dot: int :param dot: The position of the new edge's dot. This position specifies what prefix of the production's right hand side is consistent with the text. In particular, if ``sentence`` is the list of tokens in the sentence, then ``okens[span[0]:span[1]]`` can be spanned by the children specified by ``rhs[:dot]``. """ self._span = span self._lhs = lhs rhs = tuple(rhs) self._rhs = rhs self._dot = dot self._comparison_key = (span, lhs, rhs, dot) @staticmethod def from_production(production, index): """ Return a new ``TreeEdge`` formed from the given production. The new edge's left-hand side and right-hand side will be taken from ``production``; its span will be ``(index,index)``; and its dot position will be ``0``. :rtype: TreeEdge """ return TreeEdge( span=(index, index), lhs=production.lhs(), rhs=production.rhs(), dot=0 ) def move_dot_forward(self, new_end): """ Return a new ``TreeEdge`` formed from this edge. The new edge's dot position is increased by ``1``, and its end index will be replaced by ``new_end``. :param new_end: The new end index. :type new_end: int :rtype: TreeEdge """ return TreeEdge( span=(self._span[0], new_end), lhs=self._lhs, rhs=self._rhs, dot=self._dot + 1, ) # Accessors def lhs(self): return self._lhs def span(self): return self._span def start(self): return self._span[0] def end(self): return self._span[1] def length(self): return self._span[1] - self._span[0] def rhs(self): return self._rhs def dot(self): return self._dot def is_complete(self): return self._dot == len(self._rhs) def is_incomplete(self): return self._dot != len(self._rhs) def nextsym(self): if self._dot >= len(self._rhs): return None else: return self._rhs[self._dot] # String representation def __str__(self): str = f"[{self._span[0]}:{self._span[1]}] " str += "%-2r ->" % (self._lhs,) for i in range(len(self._rhs)): if i == self._dot: str += " *" str += " %s" % repr(self._rhs[i]) if len(self._rhs) == self._dot: str += " *" return str def __repr__(self): return "[Edge: %s]" % self class LeafEdge(EdgeI): """ An edge that records the fact that a leaf value is consistent with a word in the sentence. A leaf edge consists of: - An index, indicating the position of the word. - A leaf, specifying the word's content. A leaf edge's left-hand side is its leaf value, and its right hand side is ``()``. Its span is ``[index, index+1]``, and its dot position is ``0``. """ def __init__(self, leaf, index): """ Construct a new ``LeafEdge``. 
:param leaf: The new edge's leaf value, specifying the word that is recorded by this edge. :param index: The new edge's index, specifying the position of the word that is recorded by this edge. """ self._leaf = leaf self._index = index self._comparison_key = (leaf, index) # Accessors def lhs(self): return self._leaf def span(self): return (self._index, self._index + 1) def start(self): return self._index def end(self): return self._index + 1 def length(self): return 1 def rhs(self): return () def dot(self): return 0 def is_complete(self): return True def is_incomplete(self): return False def nextsym(self): return None # String representations def __str__(self): return f"[{self._index}:{self._index + 1}] {repr(self._leaf)}" def __repr__(self): return "[Edge: %s]" % (self) ######################################################################## ## Chart ######################################################################## class Chart: """ A blackboard for hypotheses about the syntactic constituents of a sentence. A chart contains a set of edges, and each edge encodes a single hypothesis about the structure of some portion of the sentence. The ``select`` method can be used to select a specific collection of edges. For example ``chart.select(is_complete=True, start=0)`` yields all complete edges whose start indices are 0. To ensure the efficiency of these selection operations, ``Chart`` dynamically creates and maintains an index for each set of attributes that have been selected on. In order to reconstruct the trees that are represented by an edge, the chart associates each edge with a set of child pointer lists. A child pointer list is a list of the edges that license an edge's right-hand side. :ivar _tokens: The sentence that the chart covers. :ivar _num_leaves: The number of tokens. :ivar _edges: A list of the edges in the chart :ivar _edge_to_cpls: A dictionary mapping each edge to a set of child pointer lists that are associated with that edge. :ivar _indexes: A dictionary mapping tuples of edge attributes to indices, where each index maps the corresponding edge attribute values to lists of edges. """ def __init__(self, tokens): """ Construct a new chart. The chart is initialized with the leaf edges corresponding to the terminal leaves. :type tokens: list :param tokens: The sentence that this chart will be used to parse. """ # Record the sentence token and the sentence length. self._tokens = tuple(tokens) self._num_leaves = len(self._tokens) # Initialise the chart. self.initialize() def initialize(self): """ Clear the chart. """ # A list of edges contained in this chart. self._edges = [] # The set of child pointer lists associated with each edge. self._edge_to_cpls = {} # Indexes mapping attribute values to lists of edges # (used by select()). self._indexes = {} # //////////////////////////////////////////////////////////// # Sentence Access # //////////////////////////////////////////////////////////// def num_leaves(self): """ Return the number of words in this chart's sentence. :rtype: int """ return self._num_leaves def leaf(self, index): """ Return the leaf value of the word at the given index. :rtype: str """ return self._tokens[index] def leaves(self): """ Return a list of the leaf values of each word in the chart's sentence. :rtype: list(str) """ return self._tokens # //////////////////////////////////////////////////////////// # Edge access # //////////////////////////////////////////////////////////// def edges(self): """ Return a list of all edges in this chart. 
New edges that are added to the chart after the call to edges() will *not* be contained in this list. :rtype: list(EdgeI) :see: ``iteredges``, ``select`` """ return self._edges[:] def iteredges(self): """ Return an iterator over the edges in this chart. It is not guaranteed that new edges which are added to the chart before the iterator is exhausted will also be generated. :rtype: iter(EdgeI) :see: ``edges``, ``select`` """ return iter(self._edges) # Iterating over the chart yields its edges. __iter__ = iteredges def num_edges(self): """ Return the number of edges contained in this chart. :rtype: int """ return len(self._edge_to_cpls) def select(self, **restrictions): """ Return an iterator over the edges in this chart. Any new edges that are added to the chart before the iterator is exahusted will also be generated. ``restrictions`` can be used to restrict the set of edges that will be generated. :param span: Only generate edges ``e`` where ``e.span()==span`` :param start: Only generate edges ``e`` where ``e.start()==start`` :param end: Only generate edges ``e`` where ``e.end()==end`` :param length: Only generate edges ``e`` where ``e.length()==length`` :param lhs: Only generate edges ``e`` where ``e.lhs()==lhs`` :param rhs: Only generate edges ``e`` where ``e.rhs()==rhs`` :param nextsym: Only generate edges ``e`` where ``e.nextsym()==nextsym`` :param dot: Only generate edges ``e`` where ``e.dot()==dot`` :param is_complete: Only generate edges ``e`` where ``e.is_complete()==is_complete`` :param is_incomplete: Only generate edges ``e`` where ``e.is_incomplete()==is_incomplete`` :rtype: iter(EdgeI) """ # If there are no restrictions, then return all edges. if restrictions == {}: return iter(self._edges) # Find the index corresponding to the given restrictions. restr_keys = sorted(restrictions.keys()) restr_keys = tuple(restr_keys) # If it doesn't exist, then create it. if restr_keys not in self._indexes: self._add_index(restr_keys) vals = tuple(restrictions[key] for key in restr_keys) return iter(self._indexes[restr_keys].get(vals, [])) def _add_index(self, restr_keys): """ A helper function for ``select``, which creates a new index for a given set of attributes (aka restriction keys). """ # Make sure it's a valid index. for key in restr_keys: if not hasattr(EdgeI, key): raise ValueError("Bad restriction: %s" % key) # Create the index. index = self._indexes[restr_keys] = {} # Add all existing edges to the index. for edge in self._edges: vals = tuple(getattr(edge, key)() for key in restr_keys) index.setdefault(vals, []).append(edge) def _register_with_indexes(self, edge): """ A helper function for ``insert``, which registers the new edge with all existing indexes. """ for (restr_keys, index) in self._indexes.items(): vals = tuple(getattr(edge, key)() for key in restr_keys) index.setdefault(vals, []).append(edge) # //////////////////////////////////////////////////////////// # Edge Insertion # //////////////////////////////////////////////////////////// def insert_with_backpointer(self, new_edge, previous_edge, child_edge): """ Add a new edge to the chart, using a pointer to the previous edge. """ cpls = self.child_pointer_lists(previous_edge) new_cpls = [cpl + (child_edge,) for cpl in cpls] return self.insert(new_edge, *new_cpls) def insert(self, edge, *child_pointer_lists): """ Add a new edge to the chart, and return True if this operation modified the chart. 
In particular, return true iff the chart did not already contain ``edge``, or if it did not already associate ``child_pointer_lists`` with ``edge``. :type edge: EdgeI :param edge: The new edge :type child_pointer_lists: sequence of tuple(EdgeI) :param child_pointer_lists: A sequence of lists of the edges that were used to form this edge. This list is used to reconstruct the trees (or partial trees) that are associated with ``edge``. :rtype: bool """ # Is it a new edge? if edge not in self._edge_to_cpls: # Add it to the list of edges. self._append_edge(edge) # Register with indexes. self._register_with_indexes(edge) # Get the set of child pointer lists for this edge. cpls = self._edge_to_cpls.setdefault(edge, OrderedDict()) chart_was_modified = False for child_pointer_list in child_pointer_lists: child_pointer_list = tuple(child_pointer_list) if child_pointer_list not in cpls: # It's a new CPL; register it, and return true. cpls[child_pointer_list] = True chart_was_modified = True return chart_was_modified def _append_edge(self, edge): self._edges.append(edge) # //////////////////////////////////////////////////////////// # Tree extraction & child pointer lists # //////////////////////////////////////////////////////////// def parses(self, root, tree_class=Tree): """ Return an iterator of the complete tree structures that span the entire chart, and whose root node is ``root``. """ for edge in self.select(start=0, end=self._num_leaves, lhs=root): yield from self.trees(edge, tree_class=tree_class, complete=True) def trees(self, edge, tree_class=Tree, complete=False): """ Return an iterator of the tree structures that are associated with ``edge``. If ``edge`` is incomplete, then the unexpanded children will be encoded as childless subtrees, whose node value is the corresponding terminal or nonterminal. :rtype: list(Tree) :note: If two trees share a common subtree, then the same Tree may be used to encode that subtree in both trees. If you need to eliminate this subtree sharing, then create a deep copy of each tree. """ return iter(self._trees(edge, complete, memo={}, tree_class=tree_class)) def _trees(self, edge, complete, memo, tree_class): """ A helper function for ``trees``. :param memo: A dictionary used to record the trees that we've generated for each edge, so that when we see an edge more than once, we can reuse the same trees. """ # If we've seen this edge before, then reuse our old answer. if edge in memo: return memo[edge] # when we're reading trees off the chart, don't use incomplete edges if complete and edge.is_incomplete(): return [] # Leaf edges. if isinstance(edge, LeafEdge): leaf = self._tokens[edge.start()] memo[edge] = [leaf] return [leaf] # Until we're done computing the trees for edge, set # memo[edge] to be empty. This has the effect of filtering # out any cyclic trees (i.e., trees that contain themselves as # descendants), because if we reach this edge via a cycle, # then it will appear that the edge doesn't generate any trees. memo[edge] = [] trees = [] lhs = edge.lhs().symbol() # Each child pointer list can be used to form trees. for cpl in self.child_pointer_lists(edge): # Get the set of child choices for each child pointer. # child_choices[i] is the set of choices for the tree's # ith child. child_choices = [self._trees(cp, complete, memo, tree_class) for cp in cpl] # For each combination of children, add a tree. 
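            # itertools.product below enumerates every way of choosing one
            # subtree per child pointer, so ambiguous children multiply the
            # number of trees collected here.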
for children in itertools.product(*child_choices): trees.append(tree_class(lhs, children)) # If the edge is incomplete, then extend it with "partial trees": if edge.is_incomplete(): unexpanded = [tree_class(elt, []) for elt in edge.rhs()[edge.dot() :]] for tree in trees: tree.extend(unexpanded) # Update the memoization dictionary. memo[edge] = trees # Return the list of trees. return trees def child_pointer_lists(self, edge): """ Return the set of child pointer lists for the given edge. Each child pointer list is a list of edges that have been used to form this edge. :rtype: list(list(EdgeI)) """ # Make a copy, in case they modify it. return self._edge_to_cpls.get(edge, {}).keys() # //////////////////////////////////////////////////////////// # Display # //////////////////////////////////////////////////////////// def pretty_format_edge(self, edge, width=None): """ Return a pretty-printed string representation of a given edge in this chart. :rtype: str :param width: The number of characters allotted to each index in the sentence. """ if width is None: width = 50 // (self.num_leaves() + 1) (start, end) = (edge.start(), edge.end()) str = "|" + ("." + " " * (width - 1)) * start # Zero-width edges are "#" if complete, ">" if incomplete if start == end: if edge.is_complete(): str += "#" else: str += ">" # Spanning complete edges are "[===]"; Other edges are # "[---]" if complete, "[--->" if incomplete elif edge.is_complete() and edge.span() == (0, self._num_leaves): str += "[" + ("=" * width) * (end - start - 1) + "=" * (width - 1) + "]" elif edge.is_complete(): str += "[" + ("-" * width) * (end - start - 1) + "-" * (width - 1) + "]" else: str += "[" + ("-" * width) * (end - start - 1) + "-" * (width - 1) + ">" str += (" " * (width - 1) + ".") * (self._num_leaves - end) return str + "| %s" % edge def pretty_format_leaves(self, width=None): """ Return a pretty-printed string representation of this chart's leaves. This string can be used as a header for calls to ``pretty_format_edge``. """ if width is None: width = 50 // (self.num_leaves() + 1) if self._tokens is not None and width > 1: header = "|." for tok in self._tokens: header += tok[: width - 1].center(width - 1) + "." header += "|" else: header = "" return header def pretty_format(self, width=None): """ Return a pretty-printed string representation of this chart. :param width: The number of characters allotted to each index in the sentence. :rtype: str """ if width is None: width = 50 // (self.num_leaves() + 1) # sort edges: primary key=length, secondary key=start index. # (and filter out the token edges) edges = sorted((e.length(), e.start(), e) for e in self) edges = [e for (_, _, e) in edges] return ( self.pretty_format_leaves(width) + "\n" + "\n".join(self.pretty_format_edge(edge, width) for edge in edges) ) # //////////////////////////////////////////////////////////// # Display: Dot (AT&T Graphviz) # //////////////////////////////////////////////////////////// def dot_digraph(self): # Header s = "digraph nltk_chart {\n" # s += ' size="5,5";\n' s += " rankdir=LR;\n" s += " node [height=0.1,width=0.1];\n" s += ' node [style=filled, color="lightgray"];\n' # Set up the nodes for y in range(self.num_edges(), -1, -1): if y == 0: s += ' node [style=filled, color="black"];\n' for x in range(self.num_leaves() + 1): if y == 0 or ( x <= self._edges[y - 1].start() or x >= self._edges[y - 1].end() ): s += ' %04d.%04d [label=""];\n' % (x, y) # Add a spacer s += " x [style=invis]; x->0000.0000 [style=invis];\n" # Declare ranks. 
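        # Each "{rank=same; ...}" group pins the nodes for one string position
        # to the same rank, so the drawn edges line up with the token positions.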
for x in range(self.num_leaves() + 1): s += " {rank=same;" for y in range(self.num_edges() + 1): if y == 0 or ( x <= self._edges[y - 1].start() or x >= self._edges[y - 1].end() ): s += " %04d.%04d" % (x, y) s += "}\n" # Add the leaves s += " edge [style=invis, weight=100];\n" s += " node [shape=plaintext]\n" s += " 0000.0000" for x in range(self.num_leaves()): s += "->%s->%04d.0000" % (self.leaf(x), x + 1) s += ";\n\n" # Add the edges s += " edge [style=solid, weight=1];\n" for y, edge in enumerate(self): for x in range(edge.start()): s += ' %04d.%04d -> %04d.%04d [style="invis"];\n' % ( x, y + 1, x + 1, y + 1, ) s += ' %04d.%04d -> %04d.%04d [label="%s"];\n' % ( edge.start(), y + 1, edge.end(), y + 1, edge, ) for x in range(edge.end(), self.num_leaves()): s += ' %04d.%04d -> %04d.%04d [style="invis"];\n' % ( x, y + 1, x + 1, y + 1, ) s += "}\n" return s ######################################################################## ## Chart Rules ######################################################################## class ChartRuleI: """ A rule that specifies what new edges are licensed by any given set of existing edges. Each chart rule expects a fixed number of edges, as indicated by the class variable ``NUM_EDGES``. In particular: - A chart rule with ``NUM_EDGES=0`` specifies what new edges are licensed, regardless of existing edges. - A chart rule with ``NUM_EDGES=1`` specifies what new edges are licensed by a single existing edge. - A chart rule with ``NUM_EDGES=2`` specifies what new edges are licensed by a pair of existing edges. :type NUM_EDGES: int :cvar NUM_EDGES: The number of existing edges that this rule uses to license new edges. Typically, this number ranges from zero to two. """ def apply(self, chart, grammar, *edges): """ Return a generator that will add edges licensed by this rule and the given edges to the chart, one at a time. Each time the generator is resumed, it will either add a new edge and yield that edge; or return. :type edges: list(EdgeI) :param edges: A set of existing edges. The number of edges that should be passed to ``apply()`` is specified by the ``NUM_EDGES`` class variable. :rtype: iter(EdgeI) """ raise NotImplementedError() def apply_everywhere(self, chart, grammar): """ Return a generator that will add all edges licensed by this rule, given the edges that are currently in the chart, one at a time. Each time the generator is resumed, it will either add a new edge and yield that edge; or return. :rtype: iter(EdgeI) """ raise NotImplementedError() class AbstractChartRule(ChartRuleI): """ An abstract base class for chart rules. ``AbstractChartRule`` provides: - A default implementation for ``apply``. - A default implementation for ``apply_everywhere``, (Currently, this implementation assumes that ``NUM_EDGES <= 3``.) - A default implementation for ``__str__``, which returns a name based on the rule's class name. """ # Subclasses must define apply. def apply(self, chart, grammar, *edges): raise NotImplementedError() # Default: loop through the given number of edges, and call # self.apply() for each set of edges. 
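    # Note: this brute-force default re-tries every combination of chart edges
    # on each call; ChartParser's agenda-based mode avoids it for strategies
    # whose rules all have NUM_EDGES <= 1.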
def apply_everywhere(self, chart, grammar): if self.NUM_EDGES == 0: yield from self.apply(chart, grammar) elif self.NUM_EDGES == 1: for e1 in chart: yield from self.apply(chart, grammar, e1) elif self.NUM_EDGES == 2: for e1 in chart: for e2 in chart: yield from self.apply(chart, grammar, e1, e2) elif self.NUM_EDGES == 3: for e1 in chart: for e2 in chart: for e3 in chart: yield from self.apply(chart, grammar, e1, e2, e3) else: raise AssertionError("NUM_EDGES>3 is not currently supported") # Default: return a name based on the class name. def __str__(self): # Add spaces between InitialCapsWords. return re.sub("([a-z])([A-Z])", r"\1 \2", self.__class__.__name__) # //////////////////////////////////////////////////////////// # Fundamental Rule # //////////////////////////////////////////////////////////// class FundamentalRule(AbstractChartRule): r""" A rule that joins two adjacent edges to form a single combined edge. In particular, this rule specifies that any pair of edges - ``[A -> alpha \* B beta][i:j]`` - ``[B -> gamma \*][j:k]`` licenses the edge: - ``[A -> alpha B * beta][i:j]`` """ NUM_EDGES = 2 def apply(self, chart, grammar, left_edge, right_edge): # Make sure the rule is applicable. if not ( left_edge.is_incomplete() and right_edge.is_complete() and left_edge.end() == right_edge.start() and left_edge.nextsym() == right_edge.lhs() ): return # Construct the new edge. new_edge = left_edge.move_dot_forward(right_edge.end()) # Insert it into the chart. if chart.insert_with_backpointer(new_edge, left_edge, right_edge): yield new_edge class SingleEdgeFundamentalRule(FundamentalRule): r""" A rule that joins a given edge with adjacent edges in the chart, to form combined edges. In particular, this rule specifies that either of the edges: - ``[A -> alpha \* B beta][i:j]`` - ``[B -> gamma \*][j:k]`` licenses the edge: - ``[A -> alpha B * beta][i:j]`` if the other edge is already in the chart. :note: This is basically ``FundamentalRule``, with one edge left unspecified. """ NUM_EDGES = 1 def apply(self, chart, grammar, edge): if edge.is_incomplete(): yield from self._apply_incomplete(chart, grammar, edge) else: yield from self._apply_complete(chart, grammar, edge) def _apply_complete(self, chart, grammar, right_edge): for left_edge in chart.select( end=right_edge.start(), is_complete=False, nextsym=right_edge.lhs() ): new_edge = left_edge.move_dot_forward(right_edge.end()) if chart.insert_with_backpointer(new_edge, left_edge, right_edge): yield new_edge def _apply_incomplete(self, chart, grammar, left_edge): for right_edge in chart.select( start=left_edge.end(), is_complete=True, lhs=left_edge.nextsym() ): new_edge = left_edge.move_dot_forward(right_edge.end()) if chart.insert_with_backpointer(new_edge, left_edge, right_edge): yield new_edge # //////////////////////////////////////////////////////////// # Inserting Terminal Leafs # //////////////////////////////////////////////////////////// class LeafInitRule(AbstractChartRule): NUM_EDGES = 0 def apply(self, chart, grammar): for index in range(chart.num_leaves()): new_edge = LeafEdge(chart.leaf(index), index) if chart.insert(new_edge, ()): yield new_edge # //////////////////////////////////////////////////////////// # Top-Down Prediction # //////////////////////////////////////////////////////////// class TopDownInitRule(AbstractChartRule): r""" A rule licensing edges corresponding to the grammar productions for the grammar's start symbol. 
In particular, this rule specifies that ``[S -> \* alpha][0:i]`` is licensed for each grammar production ``S -> alpha``, where ``S`` is the grammar's start symbol. """ NUM_EDGES = 0 def apply(self, chart, grammar): for prod in grammar.productions(lhs=grammar.start()): new_edge = TreeEdge.from_production(prod, 0) if chart.insert(new_edge, ()): yield new_edge class TopDownPredictRule(AbstractChartRule): r""" A rule licensing edges corresponding to the grammar productions for the nonterminal following an incomplete edge's dot. In particular, this rule specifies that ``[A -> alpha \* B beta][i:j]`` licenses the edge ``[B -> \* gamma][j:j]`` for each grammar production ``B -> gamma``. :note: This rule corresponds to the Predictor Rule in Earley parsing. """ NUM_EDGES = 1 def apply(self, chart, grammar, edge): if edge.is_complete(): return for prod in grammar.productions(lhs=edge.nextsym()): new_edge = TreeEdge.from_production(prod, edge.end()) if chart.insert(new_edge, ()): yield new_edge class CachedTopDownPredictRule(TopDownPredictRule): r""" A cached version of ``TopDownPredictRule``. After the first time this rule is applied to an edge with a given ``end`` and ``next``, it will not generate any more edges for edges with that ``end`` and ``next``. If ``chart`` or ``grammar`` are changed, then the cache is flushed. """ def __init__(self): TopDownPredictRule.__init__(self) self._done = {} def apply(self, chart, grammar, edge): if edge.is_complete(): return nextsym, index = edge.nextsym(), edge.end() if not is_nonterminal(nextsym): return # If we've already applied this rule to an edge with the same # next & end, and the chart & grammar have not changed, then # just return (no new edges to add). done = self._done.get((nextsym, index), (None, None)) if done[0] is chart and done[1] is grammar: return # Add all the edges indicated by the top down expand rule. for prod in grammar.productions(lhs=nextsym): # If the left corner in the predicted production is # leaf, it must match with the input. if prod.rhs(): first = prod.rhs()[0] if is_terminal(first): if index >= chart.num_leaves() or first != chart.leaf(index): continue new_edge = TreeEdge.from_production(prod, index) if chart.insert(new_edge, ()): yield new_edge # Record the fact that we've applied this rule. self._done[nextsym, index] = (chart, grammar) # //////////////////////////////////////////////////////////// # Bottom-Up Prediction # //////////////////////////////////////////////////////////// class BottomUpPredictRule(AbstractChartRule): r""" A rule licensing any edge corresponding to a production whose right-hand side begins with a complete edge's left-hand side. In particular, this rule specifies that ``[A -> alpha \*]`` licenses the edge ``[B -> \* A beta]`` for each grammar production ``B -> A beta``. """ NUM_EDGES = 1 def apply(self, chart, grammar, edge): if edge.is_incomplete(): return for prod in grammar.productions(rhs=edge.lhs()): new_edge = TreeEdge.from_production(prod, edge.start()) if chart.insert(new_edge, ()): yield new_edge class BottomUpPredictCombineRule(BottomUpPredictRule): r""" A rule licensing any edge corresponding to a production whose right-hand side begins with a complete edge's left-hand side. In particular, this rule specifies that ``[A -> alpha \*]`` licenses the edge ``[B -> A \* beta]`` for each grammar production ``B -> A beta``. :note: This is like ``BottomUpPredictRule``, but it also applies the ``FundamentalRule`` to the resulting edge. 
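    A rough usage sketch (an added illustration, not one of the module's
    doctests), using the toy grammar returned by ``demo_grammar()`` later in
    this module::

        grammar = demo_grammar()
        chart = Chart("I saw John".split())
        # Seed the chart with one LeafEdge per token.
        list(LeafInitRule().apply_everywhere(chart, grammar))
        rule = BottomUpPredictCombineRule()
        for edge in chart.edges():
            # e.g. the LeafEdge for 'I' licenses [NP -> 'I' *][0:1],
            # because the grammar contains the production NP -> 'I'.
            list(rule.apply(chart, grammar, edge))
        for edge in chart.edges():
            print(chart.pretty_format_edge(edge))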
""" NUM_EDGES = 1 def apply(self, chart, grammar, edge): if edge.is_incomplete(): return for prod in grammar.productions(rhs=edge.lhs()): new_edge = TreeEdge(edge.span(), prod.lhs(), prod.rhs(), 1) if chart.insert(new_edge, (edge,)): yield new_edge class EmptyPredictRule(AbstractChartRule): """ A rule that inserts all empty productions as passive edges, in every position in the chart. """ NUM_EDGES = 0 def apply(self, chart, grammar): for prod in grammar.productions(empty=True): for index in range(chart.num_leaves() + 1): new_edge = TreeEdge.from_production(prod, index) if chart.insert(new_edge, ()): yield new_edge ######################################################################## ## Filtered Bottom Up ######################################################################## class FilteredSingleEdgeFundamentalRule(SingleEdgeFundamentalRule): def _apply_complete(self, chart, grammar, right_edge): end = right_edge.end() nexttoken = end < chart.num_leaves() and chart.leaf(end) for left_edge in chart.select( end=right_edge.start(), is_complete=False, nextsym=right_edge.lhs() ): if _bottomup_filter(grammar, nexttoken, left_edge.rhs(), left_edge.dot()): new_edge = left_edge.move_dot_forward(right_edge.end()) if chart.insert_with_backpointer(new_edge, left_edge, right_edge): yield new_edge def _apply_incomplete(self, chart, grammar, left_edge): for right_edge in chart.select( start=left_edge.end(), is_complete=True, lhs=left_edge.nextsym() ): end = right_edge.end() nexttoken = end < chart.num_leaves() and chart.leaf(end) if _bottomup_filter(grammar, nexttoken, left_edge.rhs(), left_edge.dot()): new_edge = left_edge.move_dot_forward(right_edge.end()) if chart.insert_with_backpointer(new_edge, left_edge, right_edge): yield new_edge class FilteredBottomUpPredictCombineRule(BottomUpPredictCombineRule): def apply(self, chart, grammar, edge): if edge.is_incomplete(): return end = edge.end() nexttoken = end < chart.num_leaves() and chart.leaf(end) for prod in grammar.productions(rhs=edge.lhs()): if _bottomup_filter(grammar, nexttoken, prod.rhs()): new_edge = TreeEdge(edge.span(), prod.lhs(), prod.rhs(), 1) if chart.insert(new_edge, (edge,)): yield new_edge def _bottomup_filter(grammar, nexttoken, rhs, dot=0): if len(rhs) <= dot + 1: return True _next = rhs[dot + 1] if is_terminal(_next): return nexttoken == _next else: return grammar.is_leftcorner(_next, nexttoken) ######################################################################## ## Generic Chart Parser ######################################################################## TD_STRATEGY = [ LeafInitRule(), TopDownInitRule(), CachedTopDownPredictRule(), SingleEdgeFundamentalRule(), ] BU_STRATEGY = [ LeafInitRule(), EmptyPredictRule(), BottomUpPredictRule(), SingleEdgeFundamentalRule(), ] BU_LC_STRATEGY = [ LeafInitRule(), EmptyPredictRule(), BottomUpPredictCombineRule(), SingleEdgeFundamentalRule(), ] LC_STRATEGY = [ LeafInitRule(), FilteredBottomUpPredictCombineRule(), FilteredSingleEdgeFundamentalRule(), ] class ChartParser(ParserI): """ A generic chart parser. A "strategy", or list of ``ChartRuleI`` instances, is used to decide what edges to add to the chart. In particular, ``ChartParser`` uses the following algorithm to parse texts: | Until no new edges are added: | For each *rule* in *strategy*: | Apply *rule* to any applicable edges in the chart. 
| Return any complete parses in the chart """ def __init__( self, grammar, strategy=BU_LC_STRATEGY, trace=0, trace_chart_width=50, use_agenda=True, chart_class=Chart, ): """ Create a new chart parser, that uses ``grammar`` to parse texts. :type grammar: CFG :param grammar: The grammar used to parse texts. :type strategy: list(ChartRuleI) :param strategy: A list of rules that should be used to decide what edges to add to the chart (top-down strategy by default). :type trace: int :param trace: The level of tracing that should be used when parsing a text. ``0`` will generate no tracing output; and higher numbers will produce more verbose tracing output. :type trace_chart_width: int :param trace_chart_width: The default total width reserved for the chart in trace output. The remainder of each line will be used to display edges. :type use_agenda: bool :param use_agenda: Use an optimized agenda-based algorithm, if possible. :param chart_class: The class that should be used to create the parse charts. """ self._grammar = grammar self._strategy = strategy self._trace = trace self._trace_chart_width = trace_chart_width # If the strategy only consists of axioms (NUM_EDGES==0) and # inference rules (NUM_EDGES==1), we can use an agenda-based algorithm: self._use_agenda = use_agenda self._chart_class = chart_class self._axioms = [] self._inference_rules = [] for rule in strategy: if rule.NUM_EDGES == 0: self._axioms.append(rule) elif rule.NUM_EDGES == 1: self._inference_rules.append(rule) else: self._use_agenda = False def grammar(self): return self._grammar def _trace_new_edges(self, chart, rule, new_edges, trace, edge_width): if not trace: return print_rule_header = trace > 1 for edge in new_edges: if print_rule_header: print("%s:" % rule) print_rule_header = False print(chart.pretty_format_edge(edge, edge_width)) def chart_parse(self, tokens, trace=None): """ Return the final parse ``Chart`` from which all possible parse trees can be extracted. :param tokens: The sentence to be parsed :type tokens: list(str) :rtype: Chart """ if trace is None: trace = self._trace trace_new_edges = self._trace_new_edges tokens = list(tokens) self._grammar.check_coverage(tokens) chart = self._chart_class(tokens) grammar = self._grammar # Width, for printing trace edges. trace_edge_width = self._trace_chart_width // (chart.num_leaves() + 1) if trace: print(chart.pretty_format_leaves(trace_edge_width)) if self._use_agenda: # Use an agenda-based algorithm. for axiom in self._axioms: new_edges = list(axiom.apply(chart, grammar)) trace_new_edges(chart, axiom, new_edges, trace, trace_edge_width) inference_rules = self._inference_rules agenda = chart.edges() # We reverse the initial agenda, since it is a stack # but chart.edges() functions as a queue. agenda.reverse() while agenda: edge = agenda.pop() for rule in inference_rules: new_edges = list(rule.apply(chart, grammar, edge)) if trace: trace_new_edges(chart, rule, new_edges, trace, trace_edge_width) agenda += new_edges else: # Do not use an agenda-based algorithm. edges_added = True while edges_added: edges_added = False for rule in self._strategy: new_edges = list(rule.apply_everywhere(chart, grammar)) edges_added = len(new_edges) trace_new_edges(chart, rule, new_edges, trace, trace_edge_width) # Return the final chart. 
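        # (The parse() method below reads complete trees off this chart via
        # chart.parses(grammar.start()).)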
return chart def parse(self, tokens, tree_class=Tree): chart = self.chart_parse(tokens) return iter(chart.parses(self._grammar.start(), tree_class=tree_class)) class TopDownChartParser(ChartParser): """ A ``ChartParser`` using a top-down parsing strategy. See ``ChartParser`` for more information. """ def __init__(self, grammar, **parser_args): ChartParser.__init__(self, grammar, TD_STRATEGY, **parser_args) class BottomUpChartParser(ChartParser): """ A ``ChartParser`` using a bottom-up parsing strategy. See ``ChartParser`` for more information. """ def __init__(self, grammar, **parser_args): if isinstance(grammar, PCFG): warnings.warn( "BottomUpChartParser only works for CFG, " "use BottomUpProbabilisticChartParser instead", category=DeprecationWarning, ) ChartParser.__init__(self, grammar, BU_STRATEGY, **parser_args) class BottomUpLeftCornerChartParser(ChartParser): """ A ``ChartParser`` using a bottom-up left-corner parsing strategy. This strategy is often more efficient than standard bottom-up. See ``ChartParser`` for more information. """ def __init__(self, grammar, **parser_args): ChartParser.__init__(self, grammar, BU_LC_STRATEGY, **parser_args) class LeftCornerChartParser(ChartParser): def __init__(self, grammar, **parser_args): if not grammar.is_nonempty(): raise ValueError( "LeftCornerParser only works for grammars " "without empty productions." ) ChartParser.__init__(self, grammar, LC_STRATEGY, **parser_args) ######################################################################## ## Stepping Chart Parser ######################################################################## class SteppingChartParser(ChartParser): """ A ``ChartParser`` that allows you to step through the parsing process, adding a single edge at a time. It also allows you to change the parser's strategy or grammar midway through parsing a text. The ``initialize`` method is used to start parsing a text. ``step`` adds a single edge to the chart. ``set_strategy`` changes the strategy used by the chart parser. ``parses`` returns the set of parses that has been found by the chart parser. :ivar _restart: Records whether the parser's strategy, grammar, or chart has been changed. If so, then ``step`` must restart the parsing algorithm. """ def __init__(self, grammar, strategy=[], trace=0): self._chart = None self._current_chartrule = None self._restart = False ChartParser.__init__(self, grammar, strategy, trace) # //////////////////////////////////////////////////////////// # Initialization # //////////////////////////////////////////////////////////// def initialize(self, tokens): "Begin parsing the given tokens." self._chart = Chart(list(tokens)) self._restart = True # //////////////////////////////////////////////////////////// # Stepping # //////////////////////////////////////////////////////////// def step(self): """ Return a generator that adds edges to the chart, one at a time. Each time the generator is resumed, it adds a single edge and yields that edge. If no more edges can be added, then it yields None. If the parser's strategy, grammar, or chart is changed, then the generator will continue adding edges using the new strategy, grammar, or chart. Note that this generator never terminates, since the grammar or strategy might be changed to values that would add new edges. Instead, it yields None when no more edges can be added with the current strategy and grammar. 
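        A small usage sketch (an added illustration, not a doctest; it relies
        on ``demo_grammar`` and the strategy constants defined elsewhere in
        this module)::

            parser = SteppingChartParser(demo_grammar(), strategy=BU_STRATEGY)
            parser.initialize("I saw John".split())
            stepper = parser.step()
            first_edge = next(stepper)   # adds (and returns) one edge, or None
            parser.set_strategy(TD_STRATEGY)
            next_edge = next(stepper)    # same generator, now using the new strategy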
""" if self._chart is None: raise ValueError("Parser must be initialized first") while True: self._restart = False w = 50 // (self._chart.num_leaves() + 1) for e in self._parse(): if self._trace > 1: print(self._current_chartrule) if self._trace > 0: print(self._chart.pretty_format_edge(e, w)) yield e if self._restart: break else: yield None # No more edges. def _parse(self): """ A generator that implements the actual parsing algorithm. ``step`` iterates through this generator, and restarts it whenever the parser's strategy, grammar, or chart is modified. """ chart = self._chart grammar = self._grammar edges_added = 1 while edges_added > 0: edges_added = 0 for rule in self._strategy: self._current_chartrule = rule for e in rule.apply_everywhere(chart, grammar): edges_added += 1 yield e # //////////////////////////////////////////////////////////// # Accessors # //////////////////////////////////////////////////////////// def strategy(self): "Return the strategy used by this parser." return self._strategy def grammar(self): "Return the grammar used by this parser." return self._grammar def chart(self): "Return the chart that is used by this parser." return self._chart def current_chartrule(self): "Return the chart rule used to generate the most recent edge." return self._current_chartrule def parses(self, tree_class=Tree): "Return the parse trees currently contained in the chart." return self._chart.parses(self._grammar.start(), tree_class) # //////////////////////////////////////////////////////////// # Parser modification # //////////////////////////////////////////////////////////// def set_strategy(self, strategy): """ Change the strategy that the parser uses to decide which edges to add to the chart. :type strategy: list(ChartRuleI) :param strategy: A list of rules that should be used to decide what edges to add to the chart. """ if strategy == self._strategy: return self._strategy = strategy[:] # Make a copy. self._restart = True def set_grammar(self, grammar): "Change the grammar used by the parser." if grammar is self._grammar: return self._grammar = grammar self._restart = True def set_chart(self, chart): "Load a given chart into the chart parser." if chart is self._chart: return self._chart = chart self._restart = True # //////////////////////////////////////////////////////////// # Standard parser methods # //////////////////////////////////////////////////////////// def parse(self, tokens, tree_class=Tree): tokens = list(tokens) self._grammar.check_coverage(tokens) # Initialize ourselves. self.initialize(tokens) # Step until no more edges are generated. for e in self.step(): if e is None: break # Return an iterator of complete parses. return self.parses(tree_class=tree_class) ######################################################################## ## Demo Code ######################################################################## def demo_grammar(): from nltk.grammar import CFG return CFG.fromstring( """ S -> NP VP PP -> "with" NP NP -> NP PP VP -> VP PP VP -> Verb NP VP -> Verb NP -> Det Noun NP -> "John" NP -> "I" Det -> "the" Det -> "my" Det -> "a" Noun -> "dog" Noun -> "cookie" Verb -> "ate" Verb -> "saw" Prep -> "with" Prep -> "under" """ ) def demo( choice=None, print_times=True, print_grammar=False, print_trees=True, trace=2, sent="I saw John with a dog with my cookie", numparses=5, ): """ A demonstration of the chart parsers. 
""" import sys import time from nltk import CFG, Production, nonterminals # The grammar for ChartParser and SteppingChartParser: grammar = demo_grammar() if print_grammar: print("* Grammar") print(grammar) # Tokenize the sample sentence. print("* Sentence:") print(sent) tokens = sent.split() print(tokens) print() # Ask the user which parser to test, # if the parser wasn't provided as an argument if choice is None: print(" 1: Top-down chart parser") print(" 2: Bottom-up chart parser") print(" 3: Bottom-up left-corner chart parser") print(" 4: Left-corner chart parser with bottom-up filter") print(" 5: Stepping chart parser (alternating top-down & bottom-up)") print(" 6: All parsers") print("\nWhich parser (1-6)? ", end=" ") choice = sys.stdin.readline().strip() print() choice = str(choice) if choice not in "123456": print("Bad parser number") return # Keep track of how long each parser takes. times = {} strategies = { "1": ("Top-down", TD_STRATEGY), "2": ("Bottom-up", BU_STRATEGY), "3": ("Bottom-up left-corner", BU_LC_STRATEGY), "4": ("Filtered left-corner", LC_STRATEGY), } choices = [] if choice in strategies: choices = [choice] if choice == "6": choices = "1234" # Run the requested chart parser(s), except the stepping parser. for strategy in choices: print("* Strategy: " + strategies[strategy][0]) print() cp = ChartParser(grammar, strategies[strategy][1], trace=trace) t = time.time() chart = cp.chart_parse(tokens) parses = list(chart.parses(grammar.start())) times[strategies[strategy][0]] = time.time() - t print("Nr edges in chart:", len(chart.edges())) if numparses: assert len(parses) == numparses, "Not all parses found" if print_trees: for tree in parses: print(tree) else: print("Nr trees:", len(parses)) print() # Run the stepping parser, if requested. if choice in "56": print("* Strategy: Stepping (top-down vs bottom-up)") print() t = time.time() cp = SteppingChartParser(grammar, trace=trace) cp.initialize(tokens) for i in range(5): print("*** SWITCH TO TOP DOWN") cp.set_strategy(TD_STRATEGY) for j, e in enumerate(cp.step()): if j > 20 or e is None: break print("*** SWITCH TO BOTTOM UP") cp.set_strategy(BU_STRATEGY) for j, e in enumerate(cp.step()): if j > 20 or e is None: break times["Stepping"] = time.time() - t print("Nr edges in chart:", len(cp.chart().edges())) if numparses: assert len(list(cp.parses())) == numparses, "Not all parses found" if print_trees: for tree in cp.parses(): print(tree) else: print("Nr trees:", len(list(cp.parses()))) print() # Print the times of all parsers: if not (print_times and times): return print("* Parsing times") print() maxlen = max(len(key) for key in times) format = "%" + repr(maxlen) + "s parser: %6.3fsec" times_items = times.items() for (parser, t) in sorted(times_items, key=lambda a: a[1]): print(format % (parser, t)) if __name__ == "__main__": demo() nltk-3.7/nltk/parse/corenlp.py000066400000000000000000000566321420073152400164430ustar00rootroot00000000000000# Natural Language Toolkit: Interface to the CoreNLP REST API. 
# # Copyright (C) 2001-2022 NLTK Project # Author: Dmitrijs Milajevs # # URL: # For license information, see LICENSE.TXT import json import re import socket import time from nltk.internals import _java_options, config_java, find_jar_iter, java from nltk.parse.api import ParserI from nltk.parse.dependencygraph import DependencyGraph from nltk.tag.api import TaggerI from nltk.tokenize.api import TokenizerI from nltk.tree import Tree _stanford_url = "https://stanfordnlp.github.io/CoreNLP/" class CoreNLPServerError(EnvironmentError): """Exceptions associated with the Core NLP server.""" def try_port(port=0): sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) sock.bind(("", port)) p = sock.getsockname()[1] sock.close() return p class CoreNLPServer: _MODEL_JAR_PATTERN = r"stanford-corenlp-(\d+)\.(\d+)\.(\d+)-models\.jar" _JAR = r"stanford-corenlp-(\d+)\.(\d+)\.(\d+)\.jar" def __init__( self, path_to_jar=None, path_to_models_jar=None, verbose=False, java_options=None, corenlp_options=None, port=None, ): if corenlp_options is None: corenlp_options = ["-preload", "tokenize,ssplit,pos,lemma,parse,depparse"] jars = list( find_jar_iter( self._JAR, path_to_jar, env_vars=("CORENLP",), searchpath=(), url=_stanford_url, verbose=verbose, is_regex=True, ) ) # find the most recent code and model jar stanford_jar = max(jars, key=lambda model_name: re.match(self._JAR, model_name)) if port is None: try: port = try_port(9000) except OSError: port = try_port() corenlp_options.append(str(port)) else: try_port(port) self.url = f"http://localhost:{port}" model_jar = max( find_jar_iter( self._MODEL_JAR_PATTERN, path_to_models_jar, env_vars=("CORENLP_MODELS",), searchpath=(), url=_stanford_url, verbose=verbose, is_regex=True, ), key=lambda model_name: re.match(self._MODEL_JAR_PATTERN, model_name), ) self.verbose = verbose self._classpath = stanford_jar, model_jar self.corenlp_options = corenlp_options self.java_options = java_options or ["-mx2g"] def start(self, stdout="devnull", stderr="devnull"): """Starts the CoreNLP server :param stdout, stderr: Specifies where CoreNLP output is redirected. Valid values are 'devnull', 'stdout', 'pipe' """ import requests cmd = ["edu.stanford.nlp.pipeline.StanfordCoreNLPServer"] if self.corenlp_options: cmd.extend(self.corenlp_options) # Configure java. default_options = " ".join(_java_options) config_java(options=self.java_options, verbose=self.verbose) try: self.popen = java( cmd, classpath=self._classpath, blocking=False, stdout=stdout, stderr=stderr, ) finally: # Return java configurations to their default values. config_java(options=default_options, verbose=self.verbose) # Check that the server is istill running. returncode = self.popen.poll() if returncode is not None: _, stderrdata = self.popen.communicate() raise CoreNLPServerError( returncode, "Could not start the server. 
" "The error was: {}".format(stderrdata.decode("ascii")), ) for i in range(30): try: response = requests.get(requests.compat.urljoin(self.url, "live")) except requests.exceptions.ConnectionError: time.sleep(1) else: if response.ok: break else: raise CoreNLPServerError("Could not connect to the server.") for i in range(60): try: response = requests.get(requests.compat.urljoin(self.url, "ready")) except requests.exceptions.ConnectionError: time.sleep(1) else: if response.ok: break else: raise CoreNLPServerError("The server is not ready.") def stop(self): self.popen.terminate() self.popen.wait() def __enter__(self): self.start() return self def __exit__(self, exc_type, exc_val, exc_tb): self.stop() return False class GenericCoreNLPParser(ParserI, TokenizerI, TaggerI): """Interface to the CoreNLP Parser.""" def __init__(self, url="http://localhost:9000", encoding="utf8", tagtype=None): import requests self.url = url self.encoding = encoding if tagtype not in ["pos", "ner", None]: raise ValueError("tagtype must be either 'pos', 'ner' or None") self.tagtype = tagtype self.session = requests.Session() def parse_sents(self, sentences, *args, **kwargs): """Parse multiple sentences. Takes multiple sentences as a list where each sentence is a list of words. Each sentence will be automatically tagged with this CoreNLPParser instance's tagger. If a whitespace exists inside a token, then the token will be treated as several tokens. :param sentences: Input sentences to parse :type sentences: list(list(str)) :rtype: iter(iter(Tree)) """ # Converting list(list(str)) -> list(str) sentences = (" ".join(words) for words in sentences) return self.raw_parse_sents(sentences, *args, **kwargs) def raw_parse(self, sentence, properties=None, *args, **kwargs): """Parse a sentence. Takes a sentence as a string; before parsing, it will be automatically tokenized and tagged by the CoreNLP Parser. :param sentence: Input sentence to parse :type sentence: str :rtype: iter(Tree) """ default_properties = {"tokenize.whitespace": "false"} default_properties.update(properties or {}) return next( self.raw_parse_sents( [sentence], properties=default_properties, *args, **kwargs ) ) def api_call(self, data, properties=None, timeout=60): default_properties = { "outputFormat": "json", "annotators": "tokenize,pos,lemma,ssplit,{parser_annotator}".format( parser_annotator=self.parser_annotator ), } default_properties.update(properties or {}) response = self.session.post( self.url, params={"properties": json.dumps(default_properties)}, data=data.encode(self.encoding), headers={"Content-Type": f"text/plain; charset={self.encoding}"}, timeout=timeout, ) response.raise_for_status() return response.json() def raw_parse_sents( self, sentences, verbose=False, properties=None, *args, **kwargs ): """Parse multiple sentences. Takes multiple sentences as a list of strings. Each sentence will be automatically tokenized and tagged. :param sentences: Input sentences to parse. :type sentences: list(str) :rtype: iter(iter(Tree)) """ default_properties = { # Only splits on '\n', never inside the sentence. 
"ssplit.eolonly": "true" } default_properties.update(properties or {}) """ for sentence in sentences: parsed_data = self.api_call(sentence, properties=default_properties) assert len(parsed_data['sentences']) == 1 for parse in parsed_data['sentences']: tree = self.make_tree(parse) yield iter([tree]) """ parsed_data = self.api_call("\n".join(sentences), properties=default_properties) for parsed_sent in parsed_data["sentences"]: tree = self.make_tree(parsed_sent) yield iter([tree]) def parse_text(self, text, *args, **kwargs): """Parse a piece of text. The text might contain several sentences which will be split by CoreNLP. :param str text: text to be split. :returns: an iterable of syntactic structures. # TODO: should it be an iterable of iterables? """ parsed_data = self.api_call(text, *args, **kwargs) for parse in parsed_data["sentences"]: yield self.make_tree(parse) def tokenize(self, text, properties=None): """Tokenize a string of text. >>> parser = CoreNLPParser(url='http://localhost:9000') >>> text = 'Good muffins cost $3.88\\nin New York. Please buy me\\ntwo of them.\\nThanks.' >>> list(parser.tokenize(text)) ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.'] >>> s = "The colour of the wall is blue." >>> list( ... parser.tokenize( ... 'The colour of the wall is blue.', ... properties={'tokenize.options': 'americanize=true'}, ... ) ... ) ['The', 'color', 'of', 'the', 'wall', 'is', 'blue', '.'] """ default_properties = {"annotators": "tokenize,ssplit"} default_properties.update(properties or {}) result = self.api_call(text, properties=default_properties) for sentence in result["sentences"]: for token in sentence["tokens"]: yield token["originalText"] or token["word"] def tag_sents(self, sentences): """ Tag multiple sentences. Takes multiple sentences as a list where each sentence is a list of tokens. :param sentences: Input sentences to tag :type sentences: list(list(str)) :rtype: list(list(tuple(str, str)) """ # Converting list(list(str)) -> list(str) sentences = (" ".join(words) for words in sentences) return [sentences[0] for sentences in self.raw_tag_sents(sentences)] def tag(self, sentence): """ Tag a list of tokens. :rtype: list(tuple(str, str)) >>> parser = CoreNLPParser(url='http://localhost:9000', tagtype='ner') >>> tokens = 'Rami Eid is studying at Stony Brook University in NY'.split() >>> parser.tag(tokens) [('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ('studying', 'O'), ('at', 'O'), ('Stony', 'ORGANIZATION'), ('Brook', 'ORGANIZATION'), ('University', 'ORGANIZATION'), ('in', 'O'), ('NY', 'O')] >>> parser = CoreNLPParser(url='http://localhost:9000', tagtype='pos') >>> tokens = "What is the airspeed of an unladen swallow ?".split() >>> parser.tag(tokens) [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')] """ return self.tag_sents([sentence])[0] def raw_tag_sents(self, sentences): """ Tag multiple sentences. Takes multiple sentences as a list where each sentence is a string. :param sentences: Input sentences to tag :type sentences: list(str) :rtype: list(list(list(tuple(str, str))) """ default_properties = { "ssplit.isOneSentence": "true", "annotators": "tokenize,ssplit,", } # Supports only 'pos' or 'ner' tags. 
assert self.tagtype in ["pos", "ner"] default_properties["annotators"] += self.tagtype for sentence in sentences: tagged_data = self.api_call(sentence, properties=default_properties) yield [ [ (token["word"], token[self.tagtype]) for token in tagged_sentence["tokens"] ] for tagged_sentence in tagged_data["sentences"] ] class CoreNLPParser(GenericCoreNLPParser): """ >>> parser = CoreNLPParser(url='http://localhost:9000') >>> next( ... parser.raw_parse('The quick brown fox jumps over the lazy dog.') ... ).pretty_print() # doctest: +NORMALIZE_WHITESPACE ROOT | S _______________|__________________________ | VP | | _________|___ | | | PP | | | ________|___ | NP | | NP | ____|__________ | | _______|____ | DT JJ JJ NN VBZ IN DT JJ NN . | | | | | | | | | | The quick brown fox jumps over the lazy dog . >>> (parse_fox, ), (parse_wolf, ) = parser.raw_parse_sents( ... [ ... 'The quick brown fox jumps over the lazy dog.', ... 'The quick grey wolf jumps over the lazy fox.', ... ] ... ) >>> parse_fox.pretty_print() # doctest: +NORMALIZE_WHITESPACE ROOT | S _______________|__________________________ | VP | | _________|___ | | | PP | | | ________|___ | NP | | NP | ____|__________ | | _______|____ | DT JJ JJ NN VBZ IN DT JJ NN . | | | | | | | | | | The quick brown fox jumps over the lazy dog . >>> parse_wolf.pretty_print() # doctest: +NORMALIZE_WHITESPACE ROOT | S _______________|__________________________ | VP | | _________|___ | | | PP | | | ________|___ | NP | | NP | ____|_________ | | _______|____ | DT JJ JJ NN VBZ IN DT JJ NN . | | | | | | | | | | The quick grey wolf jumps over the lazy fox . >>> (parse_dog, ), (parse_friends, ) = parser.parse_sents( ... [ ... "I 'm a dog".split(), ... "This is my friends ' cat ( the tabby )".split(), ... ] ... ) >>> parse_dog.pretty_print() # doctest: +NORMALIZE_WHITESPACE ROOT | S _______|____ | VP | ________|___ NP | NP | | ___|___ PRP VBP DT NN | | | | I 'm a dog >>> parse_friends.pretty_print() # doctest: +NORMALIZE_WHITESPACE ROOT | S ____|___________ | VP | ___________|_____________ | | NP | | _______|_________ | | NP PRN | | _____|_______ ____|______________ NP | NP | | NP | | | ______|_________ | | ___|____ | DT VBZ PRP$ NNS POS NN -LRB- DT NN -RRB- | | | | | | | | | | This is my friends ' cat -LRB- the tabby -RRB- >>> parse_john, parse_mary, = parser.parse_text( ... 'John loves Mary. Mary walks.' ... ) >>> parse_john.pretty_print() # doctest: +NORMALIZE_WHITESPACE ROOT | S _____|_____________ | VP | | ____|___ | NP | NP | | | | | NNP VBZ NNP . | | | | John loves Mary . >>> parse_mary.pretty_print() # doctest: +NORMALIZE_WHITESPACE ROOT | S _____|____ NP VP | | | | NNP VBZ . | | | Mary walks . Special cases >>> next( ... parser.raw_parse( ... 'NASIRIYA, Iraq—Iraqi doctors who treated former prisoner of war ' ... 'Jessica Lynch have angrily dismissed claims made in her biography ' ... 'that she was raped by her Iraqi captors.' ... ) ... ).height() 20 >>> next( ... parser.raw_parse( ... "The broader Standard & Poor's 500 Index <.SPX> was 0.46 points lower, or " ... '0.05 percent, at 997.02.' ... ) ... ).height() 9 """ _OUTPUT_FORMAT = "penn" parser_annotator = "parse" def make_tree(self, result): return Tree.fromstring(result["parse"]) class CoreNLPDependencyParser(GenericCoreNLPParser): """Dependency parser. >>> dep_parser = CoreNLPDependencyParser(url='http://localhost:9000') >>> parse, = dep_parser.raw_parse( ... 'The quick brown fox jumps over the lazy dog.' ... 
) >>> print(parse.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE The DT 4 det quick JJ 4 amod brown JJ 4 amod fox NN 5 nsubj jumps VBZ 0 ROOT over IN 9 case the DT 9 det lazy JJ 9 amod dog NN 5 nmod . . 5 punct >>> print(parse.tree()) # doctest: +NORMALIZE_WHITESPACE (jumps (fox The quick brown) (dog over the lazy) .) >>> for governor, dep, dependent in parse.triples(): ... print(governor, dep, dependent) # doctest: +NORMALIZE_WHITESPACE ('jumps', 'VBZ') nsubj ('fox', 'NN') ('fox', 'NN') det ('The', 'DT') ('fox', 'NN') amod ('quick', 'JJ') ('fox', 'NN') amod ('brown', 'JJ') ('jumps', 'VBZ') nmod ('dog', 'NN') ('dog', 'NN') case ('over', 'IN') ('dog', 'NN') det ('the', 'DT') ('dog', 'NN') amod ('lazy', 'JJ') ('jumps', 'VBZ') punct ('.', '.') >>> (parse_fox, ), (parse_dog, ) = dep_parser.raw_parse_sents( ... [ ... 'The quick brown fox jumps over the lazy dog.', ... 'The quick grey wolf jumps over the lazy fox.', ... ] ... ) >>> print(parse_fox.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE The DT 4 det quick JJ 4 amod brown JJ 4 amod fox NN 5 nsubj jumps VBZ 0 ROOT over IN 9 case the DT 9 det lazy JJ 9 amod dog NN 5 nmod . . 5 punct >>> print(parse_dog.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE The DT 4 det quick JJ 4 amod grey JJ 4 amod wolf NN 5 nsubj jumps VBZ 0 ROOT over IN 9 case the DT 9 det lazy JJ 9 amod fox NN 5 nmod . . 5 punct >>> (parse_dog, ), (parse_friends, ) = dep_parser.parse_sents( ... [ ... "I 'm a dog".split(), ... "This is my friends ' cat ( the tabby )".split(), ... ] ... ) >>> print(parse_dog.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE I PRP 4 nsubj 'm VBP 4 cop a DT 4 det dog NN 0 ROOT >>> print(parse_friends.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE This DT 6 nsubj is VBZ 6 cop my PRP$ 4 nmod:poss friends NNS 6 nmod:poss ' POS 4 case cat NN 0 ROOT -LRB- -LRB- 9 punct the DT 9 det tabby NN 6 appos -RRB- -RRB- 9 punct >>> parse_john, parse_mary, = dep_parser.parse_text( ... 'John loves Mary. Mary walks.' ... ) >>> print(parse_john.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE John NNP 2 nsubj loves VBZ 0 ROOT Mary NNP 2 dobj . . 2 punct >>> print(parse_mary.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE Mary NNP 2 nsubj walks VBZ 0 ROOT . . 2 punct Special cases Non-breaking space inside of a token. >>> len( ... next( ... dep_parser.raw_parse( ... 'Anhalt said children typically treat a 20-ounce soda bottle as one ' ... 'serving, while it actually contains 2 1/2 servings.' ... ) ... ).nodes ... ) 21 Phone numbers. >>> len( ... next( ... dep_parser.raw_parse('This is not going to crash: 01 111 555.') ... ).nodes ... ) 10 >>> print( ... next( ... dep_parser.raw_parse('The underscore _ should not simply disappear.') ... ).to_conll(4) ... ) # doctest: +NORMALIZE_WHITESPACE The DT 3 det underscore VBP 3 amod _ NN 7 nsubj should MD 7 aux not RB 7 neg simply RB 7 advmod disappear VB 0 ROOT . . 7 punct >>> print( ... '\\n'.join( ... next( ... dep_parser.raw_parse( ... 'for all of its insights into the dream world of teen life , and its electronic expression through ' ... 'cyber culture , the film gives no quarter to anyone seeking to pull a cohesive story out of its 2 ' ... '1/2-hour running time .' ... ) ... ).to_conll(4).split('\\n')[-8:] ... ) ... ) its PRP$ 40 nmod:poss 2 1/2 CD 40 nummod - : 40 punct hour NN 31 nmod running VBG 42 amod time NN 40 dep . . 24 punct """ _OUTPUT_FORMAT = "conll2007" parser_annotator = "depparse" def make_tree(self, result): return DependencyGraph( ( " ".join(n_items[1:]) # NLTK expects an iterable of strings... 
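                # n_items[0] is the dependent's index; it is only used to sort
                # the rows back into sentence order and is dropped here.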
for n_items in sorted(transform(result)) ), cell_separator=" ", # To make sure that a non-breaking space is kept inside of a token. ) def transform(sentence): for dependency in sentence["basicDependencies"]: dependent_index = dependency["dependent"] token = sentence["tokens"][dependent_index - 1] # Return values that we don't know as '_'. Also, consider tag and ctag # to be equal. yield ( dependent_index, "_", token["word"], token["lemma"], token["pos"], token["pos"], "_", str(dependency["governor"]), dependency["dep"], "_", "_", ) nltk-3.7/nltk/parse/dependencygraph.py000077500000000000000000000747041420073152400201440ustar00rootroot00000000000000# Natural Language Toolkit: Dependency Grammars # # Copyright (C) 2001-2022 NLTK Project # Author: Jason Narad # Steven Bird (modifications) # # URL: # For license information, see LICENSE.TXT # """ Tools for reading and writing dependency trees. The input is assumed to be in Malt-TAB format (https://stp.lingfil.uu.se/~nivre/research/MaltXML.html). """ import subprocess import warnings from collections import defaultdict from itertools import chain from pprint import pformat from nltk.tree import Tree ################################################################# # DependencyGraph Class ################################################################# class DependencyGraph: """ A container for the nodes and labelled edges of a dependency structure. """ def __init__( self, tree_str=None, cell_extractor=None, zero_based=False, cell_separator=None, top_relation_label="ROOT", ): """Dependency graph. We place a dummy `TOP` node with the index 0, since the root node is often assigned 0 as its head. This also means that the indexing of the nodes corresponds directly to the Malt-TAB format, which starts at 1. If zero-based is True, then Malt-TAB-like input with node numbers starting at 0 and the root node assigned -1 (as produced by, e.g., zpar). :param str cell_separator: the cell separator. If not provided, cells are split by whitespace. :param str top_relation_label: the label by which the top relation is identified, for examlple, `ROOT`, `null` or `TOP`. """ self.nodes = defaultdict( lambda: { "address": None, "word": None, "lemma": None, "ctag": None, "tag": None, "feats": None, "head": None, "deps": defaultdict(list), "rel": None, } ) self.nodes[0].update({"ctag": "TOP", "tag": "TOP", "address": 0}) self.root = None if tree_str: self._parse( tree_str, cell_extractor=cell_extractor, zero_based=zero_based, cell_separator=cell_separator, top_relation_label=top_relation_label, ) def remove_by_address(self, address): """ Removes the node with the given address. References to this node in others will still exist. """ del self.nodes[address] def redirect_arcs(self, originals, redirect): """ Redirects arcs to any of the nodes in the originals list to the redirect node address. """ for node in self.nodes.values(): new_deps = [] for dep in node["deps"]: if dep in originals: new_deps.append(redirect) else: new_deps.append(dep) node["deps"] = new_deps def add_arc(self, head_address, mod_address): """ Adds an arc from the node specified by head_address to the node specified by the mod address. """ relation = self.nodes[mod_address]["rel"] self.nodes[head_address]["deps"].setdefault(relation, []) self.nodes[head_address]["deps"][relation].append(mod_address) # self.nodes[head_address]['deps'].append(mod_address) def connect_graph(self): """ Fully connects all non-root nodes. All nodes are set to be dependents of the root node. 
""" for node1 in self.nodes.values(): for node2 in self.nodes.values(): if node1["address"] != node2["address"] and node2["rel"] != "TOP": relation = node2["rel"] node1["deps"].setdefault(relation, []) node1["deps"][relation].append(node2["address"]) # node1['deps'].append(node2['address']) def get_by_address(self, node_address): """Return the node with the given address.""" return self.nodes[node_address] def contains_address(self, node_address): """ Returns true if the graph contains a node with the given node address, false otherwise. """ return node_address in self.nodes def to_dot(self): """Return a dot representation suitable for using with Graphviz. >>> dg = DependencyGraph( ... 'John N 2\\n' ... 'loves V 0\\n' ... 'Mary N 2' ... ) >>> print(dg.to_dot()) digraph G{ edge [dir=forward] node [shape=plaintext] 0 [label="0 (None)"] 0 -> 2 [label="ROOT"] 1 [label="1 (John)"] 2 [label="2 (loves)"] 2 -> 1 [label=""] 2 -> 3 [label=""] 3 [label="3 (Mary)"] } """ # Start the digraph specification s = "digraph G{\n" s += "edge [dir=forward]\n" s += "node [shape=plaintext]\n" # Draw the remaining nodes for node in sorted(self.nodes.values(), key=lambda v: v["address"]): s += '\n{} [label="{} ({})"]'.format( node["address"], node["address"], node["word"], ) for rel, deps in node["deps"].items(): for dep in deps: if rel is not None: s += '\n{} -> {} [label="{}"]'.format(node["address"], dep, rel) else: s += "\n{} -> {} ".format(node["address"], dep) s += "\n}" return s def _repr_svg_(self): """Show SVG representation of the transducer (IPython magic). >>> dg = DependencyGraph( ... 'John N 2\\n' ... 'loves V 0\\n' ... 'Mary N 2' ... ) >>> dg._repr_svg_().split('\\n')[0] '' """ dot_string = self.to_dot() return dot2img(dot_string) def __str__(self): return pformat(self.nodes) def __repr__(self): return f"" @staticmethod def load( filename, zero_based=False, cell_separator=None, top_relation_label="ROOT" ): """ :param filename: a name of a file in Malt-TAB format :param zero_based: nodes in the input file are numbered starting from 0 rather than 1 (as produced by, e.g., zpar) :param str cell_separator: the cell separator. If not provided, cells are split by whitespace. :param str top_relation_label: the label by which the top relation is identified, for examlple, `ROOT`, `null` or `TOP`. :return: a list of DependencyGraphs """ with open(filename) as infile: return [ DependencyGraph( tree_str, zero_based=zero_based, cell_separator=cell_separator, top_relation_label=top_relation_label, ) for tree_str in infile.read().split("\n\n") ] def left_children(self, node_index): """ Returns the number of left children under the node specified by the given address. """ children = chain.from_iterable(self.nodes[node_index]["deps"].values()) index = self.nodes[node_index]["address"] return sum(1 for c in children if c < index) def right_children(self, node_index): """ Returns the number of right children under the node specified by the given address. """ children = chain.from_iterable(self.nodes[node_index]["deps"].values()) index = self.nodes[node_index]["address"] return sum(1 for c in children if c > index) def add_node(self, node): if not self.contains_address(node["address"]): self.nodes[node["address"]].update(node) def _parse( self, input_, cell_extractor=None, zero_based=False, cell_separator=None, top_relation_label="ROOT", ): """Parse a sentence. :param extractor: a function that given a tuple of cells returns a 7-tuple, where the values are ``word, lemma, ctag, tag, feats, head, rel``. 
:param str cell_separator: the cell separator. If not provided, cells are split by whitespace. :param str top_relation_label: the label by which the top relation is identified, for examlple, `ROOT`, `null` or `TOP`. """ def extract_3_cells(cells, index): word, tag, head = cells return index, word, word, tag, tag, "", head, "" def extract_4_cells(cells, index): word, tag, head, rel = cells return index, word, word, tag, tag, "", head, rel def extract_7_cells(cells, index): line_index, word, lemma, tag, _, head, rel = cells try: index = int(line_index) except ValueError: # index can't be parsed as an integer, use default pass return index, word, lemma, tag, tag, "", head, rel def extract_10_cells(cells, index): line_index, word, lemma, ctag, tag, feats, head, rel, _, _ = cells try: index = int(line_index) except ValueError: # index can't be parsed as an integer, use default pass return index, word, lemma, ctag, tag, feats, head, rel extractors = { 3: extract_3_cells, 4: extract_4_cells, 7: extract_7_cells, 10: extract_10_cells, } if isinstance(input_, str): input_ = (line for line in input_.split("\n")) lines = (l.rstrip() for l in input_) lines = (l for l in lines if l) cell_number = None for index, line in enumerate(lines, start=1): cells = line.split(cell_separator) if cell_number is None: cell_number = len(cells) else: assert cell_number == len(cells) if cell_extractor is None: try: cell_extractor = extractors[cell_number] except KeyError as e: raise ValueError( "Number of tab-delimited fields ({}) not supported by " "CoNLL(10) or Malt-Tab(4) format".format(cell_number) ) from e try: index, word, lemma, ctag, tag, feats, head, rel = cell_extractor( cells, index ) except (TypeError, ValueError): # cell_extractor doesn't take 2 arguments or doesn't return 8 # values; assume the cell_extractor is an older external # extractor and doesn't accept or return an index. word, lemma, ctag, tag, feats, head, rel = cell_extractor(cells) if head == "_": continue head = int(head) if zero_based: head += 1 self.nodes[index].update( { "address": index, "word": word, "lemma": lemma, "ctag": ctag, "tag": tag, "feats": feats, "head": head, "rel": rel, } ) # Make sure that the fake root node has labeled dependencies. if (cell_number == 3) and (head == 0): rel = top_relation_label self.nodes[head]["deps"][rel].append(index) if self.nodes[0]["deps"][top_relation_label]: root_address = self.nodes[0]["deps"][top_relation_label][0] self.root = self.nodes[root_address] self.top_relation_label = top_relation_label else: warnings.warn( "The graph doesn't contain a node " "that depends on the root element." ) def _word(self, node, filter=True): w = node["word"] if filter: if w != ",": return w return w def _tree(self, i): """Turn dependency graphs into NLTK trees. :param int i: index of a node :return: either a word (if the indexed node is a leaf) or a ``Tree``. """ node = self.get_by_address(i) word = node["word"] deps = sorted(chain.from_iterable(node["deps"].values())) if deps: return Tree(word, [self._tree(dep) for dep in deps]) else: return word def tree(self): """ Starting with the ``root`` node, build a dependency tree using the NLTK ``Tree`` constructor. Dependency labels are omitted. 
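# A sketch of the cell_extractor hook described in _parse() above. The
# three-column layout used here (HEAD WORD TAG, i.e. head first) is invented
# for illustration; the extractor maps each row onto the expected
# (index, word, lemma, ctag, tag, feats, head, rel) tuple.
from nltk.parse.dependencygraph import DependencyGraph

def extract_head_first(cells, index):
    head, word, tag = cells
    return index, word, word, tag, tag, "", head, ""

dg = DependencyGraph(
    "2 John NNP\n0 loves VBZ\n2 Mary NNP",
    cell_extractor=extract_head_first,
)
print(dg.tree())    # (loves John Mary)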
""" node = self.root word = node["word"] deps = sorted(chain.from_iterable(node["deps"].values())) return Tree(word, [self._tree(dep) for dep in deps]) def triples(self, node=None): """ Extract dependency triples of the form: ((head word, head tag), rel, (dep word, dep tag)) """ if not node: node = self.root head = (node["word"], node["ctag"]) for i in sorted(chain.from_iterable(node["deps"].values())): dep = self.get_by_address(i) yield (head, dep["rel"], (dep["word"], dep["ctag"])) yield from self.triples(node=dep) def _hd(self, i): try: return self.nodes[i]["head"] except IndexError: return None def _rel(self, i): try: return self.nodes[i]["rel"] except IndexError: return None # what's the return type? Boolean or list? def contains_cycle(self): """Check whether there are cycles. >>> dg = DependencyGraph(treebank_data) >>> dg.contains_cycle() False >>> cyclic_dg = DependencyGraph() >>> top = {'word': None, 'deps': [1], 'rel': 'TOP', 'address': 0} >>> child1 = {'word': None, 'deps': [2], 'rel': 'NTOP', 'address': 1} >>> child2 = {'word': None, 'deps': [4], 'rel': 'NTOP', 'address': 2} >>> child3 = {'word': None, 'deps': [1], 'rel': 'NTOP', 'address': 3} >>> child4 = {'word': None, 'deps': [3], 'rel': 'NTOP', 'address': 4} >>> cyclic_dg.nodes = { ... 0: top, ... 1: child1, ... 2: child2, ... 3: child3, ... 4: child4, ... } >>> cyclic_dg.root = top >>> cyclic_dg.contains_cycle() [3, 1, 2, 4] """ distances = {} for node in self.nodes.values(): for dep in node["deps"]: key = tuple([node["address"], dep]) distances[key] = 1 for _ in self.nodes: new_entries = {} for pair1 in distances: for pair2 in distances: if pair1[1] == pair2[0]: key = tuple([pair1[0], pair2[1]]) new_entries[key] = distances[pair1] + distances[pair2] for pair in new_entries: distances[pair] = new_entries[pair] if pair[0] == pair[1]: path = self.get_cycle_path(self.get_by_address(pair[0]), pair[0]) return path return False # return []? def get_cycle_path(self, curr_node, goal_node_index): for dep in curr_node["deps"]: if dep == goal_node_index: return [curr_node["address"]] for dep in curr_node["deps"]: path = self.get_cycle_path(self.get_by_address(dep), goal_node_index) if len(path) > 0: path.insert(0, curr_node["address"]) return path return [] def to_conll(self, style): """ The dependency graph in CoNLL format. :param style: the style to use for the format (3, 4, 10 columns) :type style: int :rtype: str """ if style == 3: template = "{word}\t{tag}\t{head}\n" elif style == 4: template = "{word}\t{tag}\t{head}\t{rel}\n" elif style == 10: template = ( "{i}\t{word}\t{lemma}\t{ctag}\t{tag}\t{feats}\t{head}\t{rel}\t_\t_\n" ) else: raise ValueError( "Number of tab-delimited fields ({}) not supported by " "CoNLL(10) or Malt-Tab(4) format".format(style) ) return "".join( template.format(i=i, **node) for i, node in sorted(self.nodes.items()) if node["tag"] != "TOP" ) def nx_graph(self): """Convert the data in a ``nodelist`` into a networkx labeled directed graph.""" import networkx nx_nodelist = list(range(1, len(self.nodes))) nx_edgelist = [ (n, self._hd(n), self._rel(n)) for n in nx_nodelist if self._hd(n) ] self.nx_labels = {} for n in nx_nodelist: self.nx_labels[n] = self.nodes[n]["word"] g = networkx.MultiDiGraph() g.add_nodes_from(nx_nodelist) g.add_edges_from(nx_edgelist) return g def dot2img(dot_string, t="svg"): """ Create image representation fom dot_string, using the 'dot' program from the Graphviz package. Use the 't' argument to specify the image file format, for ex. 
'png' or 'jpeg' (Running 'dot -T:' lists all available formats). sys.stdout is used instead of subprocess.PIPE, to avoid decoding errors """ from sys import stderr, stdout try: proc = subprocess.run( ["dot", "-T%s" % t], input=dot_string, stdout=stdout, stderr=stderr, text=True, ) except OSError as e: raise Exception("Cannot find the dot binary from Graphviz package") from e out, err = proc.stdout, proc.stderr if err: raise Exception( "Cannot create image representation by running dot from string: {}" "".format(dot_string) ) return out class DependencyGraphError(Exception): """Dependency graph exception.""" def demo(): malt_demo() conll_demo() conll_file_demo() cycle_finding_demo() def malt_demo(nx=False): """ A demonstration of the result of reading a dependency version of the first sentence of the Penn Treebank. """ dg = DependencyGraph( """Pierre NNP 2 NMOD Vinken NNP 8 SUB , , 2 P 61 CD 5 NMOD years NNS 6 AMOD old JJ 2 NMOD , , 2 P will MD 0 ROOT join VB 8 VC the DT 11 NMOD board NN 9 OBJ as IN 9 VMOD a DT 15 NMOD nonexecutive JJ 15 NMOD director NN 12 PMOD Nov. NNP 9 VMOD 29 CD 16 NMOD . . 9 VMOD """ ) tree = dg.tree() tree.pprint() if nx: # currently doesn't work import networkx from matplotlib import pylab g = dg.nx_graph() g.info() pos = networkx.spring_layout(g, dim=1) networkx.draw_networkx_nodes(g, pos, node_size=50) # networkx.draw_networkx_edges(g, pos, edge_color='k', width=8) networkx.draw_networkx_labels(g, pos, dg.nx_labels) pylab.xticks([]) pylab.yticks([]) pylab.savefig("tree.png") pylab.show() def conll_demo(): """ A demonstration of how to read a string representation of a CoNLL format dependency tree. """ dg = DependencyGraph(conll_data1) tree = dg.tree() tree.pprint() print(dg) print(dg.to_conll(4)) def conll_file_demo(): print("Mass conll_read demo...") graphs = [DependencyGraph(entry) for entry in conll_data2.split("\n\n") if entry] for graph in graphs: tree = graph.tree() print("\n") tree.pprint() def cycle_finding_demo(): dg = DependencyGraph(treebank_data) print(dg.contains_cycle()) cyclic_dg = DependencyGraph() cyclic_dg.add_node({"word": None, "deps": [1], "rel": "TOP", "address": 0}) cyclic_dg.add_node({"word": None, "deps": [2], "rel": "NTOP", "address": 1}) cyclic_dg.add_node({"word": None, "deps": [4], "rel": "NTOP", "address": 2}) cyclic_dg.add_node({"word": None, "deps": [1], "rel": "NTOP", "address": 3}) cyclic_dg.add_node({"word": None, "deps": [3], "rel": "NTOP", "address": 4}) print(cyclic_dg.contains_cycle()) treebank_data = """Pierre NNP 2 NMOD Vinken NNP 8 SUB , , 2 P 61 CD 5 NMOD years NNS 6 AMOD old JJ 2 NMOD , , 2 P will MD 0 ROOT join VB 8 VC the DT 11 NMOD board NN 9 OBJ as IN 9 VMOD a DT 15 NMOD nonexecutive JJ 15 NMOD director NN 12 PMOD Nov. NNP 9 VMOD 29 CD 16 NMOD . . 9 VMOD """ conll_data1 = """ 1 Ze ze Pron Pron per|3|evofmv|nom 2 su _ _ 2 had heb V V trans|ovt|1of2of3|ev 0 ROOT _ _ 3 met met Prep Prep voor 8 mod _ _ 4 haar haar Pron Pron bez|3|ev|neut|attr 5 det _ _ 5 moeder moeder N N soort|ev|neut 3 obj1 _ _ 6 kunnen kan V V hulp|ott|1of2of3|mv 2 vc _ _ 7 gaan ga V V hulp|inf 6 vc _ _ 8 winkelen winkel V V intrans|inf 11 cnj _ _ 9 , , Punc Punc komma 8 punct _ _ 10 zwemmen zwem V V intrans|inf 11 cnj _ _ 11 of of Conj Conj neven 7 vc _ _ 12 terrassen terras N N soort|mv|neut 11 cnj _ _ 13 . . 
Punc Punc punt 12 punct _ _ """ conll_data2 = """1 Cathy Cathy N N eigen|ev|neut 2 su _ _ 2 zag zie V V trans|ovt|1of2of3|ev 0 ROOT _ _ 3 hen hen Pron Pron per|3|mv|datofacc 2 obj1 _ _ 4 wild wild Adj Adj attr|stell|onverv 5 mod _ _ 5 zwaaien zwaai N N soort|mv|neut 2 vc _ _ 6 . . Punc Punc punt 5 punct _ _ 1 Ze ze Pron Pron per|3|evofmv|nom 2 su _ _ 2 had heb V V trans|ovt|1of2of3|ev 0 ROOT _ _ 3 met met Prep Prep voor 8 mod _ _ 4 haar haar Pron Pron bez|3|ev|neut|attr 5 det _ _ 5 moeder moeder N N soort|ev|neut 3 obj1 _ _ 6 kunnen kan V V hulp|ott|1of2of3|mv 2 vc _ _ 7 gaan ga V V hulp|inf 6 vc _ _ 8 winkelen winkel V V intrans|inf 11 cnj _ _ 9 , , Punc Punc komma 8 punct _ _ 10 zwemmen zwem V V intrans|inf 11 cnj _ _ 11 of of Conj Conj neven 7 vc _ _ 12 terrassen terras N N soort|mv|neut 11 cnj _ _ 13 . . Punc Punc punt 12 punct _ _ 1 Dat dat Pron Pron aanw|neut|attr 2 det _ _ 2 werkwoord werkwoord N N soort|ev|neut 6 obj1 _ _ 3 had heb V V hulp|ovt|1of2of3|ev 0 ROOT _ _ 4 ze ze Pron Pron per|3|evofmv|nom 6 su _ _ 5 zelf zelf Pron Pron aanw|neut|attr|wzelf 3 predm _ _ 6 uitgevonden vind V V trans|verldw|onverv 3 vc _ _ 7 . . Punc Punc punt 6 punct _ _ 1 Het het Pron Pron onbep|neut|zelfst 2 su _ _ 2 hoorde hoor V V trans|ovt|1of2of3|ev 0 ROOT _ _ 3 bij bij Prep Prep voor 2 ld _ _ 4 de de Art Art bep|zijdofmv|neut 6 det _ _ 5 warme warm Adj Adj attr|stell|vervneut 6 mod _ _ 6 zomerdag zomerdag N N soort|ev|neut 3 obj1 _ _ 7 die die Pron Pron betr|neut|zelfst 6 mod _ _ 8 ze ze Pron Pron per|3|evofmv|nom 12 su _ _ 9 ginds ginds Adv Adv gew|aanw 12 mod _ _ 10 achter achter Adv Adv gew|geenfunc|stell|onverv 12 svp _ _ 11 had heb V V hulp|ovt|1of2of3|ev 7 body _ _ 12 gelaten laat V V trans|verldw|onverv 11 vc _ _ 13 . . Punc Punc punt 12 punct _ _ 1 Ze ze Pron Pron per|3|evofmv|nom 2 su _ _ 2 hadden heb V V trans|ovt|1of2of3|mv 0 ROOT _ _ 3 languit languit Adv Adv gew|geenfunc|stell|onverv 11 mod _ _ 4 naast naast Prep Prep voor 11 mod _ _ 5 elkaar elkaar Pron Pron rec|neut 4 obj1 _ _ 6 op op Prep Prep voor 11 ld _ _ 7 de de Art Art bep|zijdofmv|neut 8 det _ _ 8 strandstoelen strandstoel N N soort|mv|neut 6 obj1 _ _ 9 kunnen kan V V hulp|inf 2 vc _ _ 10 gaan ga V V hulp|inf 9 vc _ _ 11 liggen lig V V intrans|inf 10 vc _ _ 12 . . Punc Punc punt 11 punct _ _ 1 Zij zij Pron Pron per|3|evofmv|nom 2 su _ _ 2 zou zal V V hulp|ovt|1of2of3|ev 7 cnj _ _ 3 mams mams N N soort|ev|neut 4 det _ _ 4 rug rug N N soort|ev|neut 5 obj1 _ _ 5 ingewreven wrijf V V trans|verldw|onverv 6 vc _ _ 6 hebben heb V V hulp|inf 2 vc _ _ 7 en en Conj Conj neven 0 ROOT _ _ 8 mam mam V V trans|ovt|1of2of3|ev 7 cnj _ _ 9 de de Art Art bep|zijdofmv|neut 10 det _ _ 10 hare hare Pron Pron bez|3|ev|neut|attr 8 obj1 _ _ 11 . . Punc Punc punt 10 punct _ _ 1 Of of Conj Conj onder|metfin 0 ROOT _ _ 2 ze ze Pron Pron per|3|evofmv|nom 3 su _ _ 3 had heb V V hulp|ovt|1of2of3|ev 0 ROOT _ _ 4 gewoon gewoon Adj Adj adv|stell|onverv 10 mod _ _ 5 met met Prep Prep voor 10 mod _ _ 6 haar haar Pron Pron bez|3|ev|neut|attr 7 det _ _ 7 vriendinnen vriendin N N soort|mv|neut 5 obj1 _ _ 8 rond rond Adv Adv deelv 10 svp _ _ 9 kunnen kan V V hulp|inf 3 vc _ _ 10 slenteren slenter V V intrans|inf 9 vc _ _ 11 in in Prep Prep voor 10 mod _ _ 12 de de Art Art bep|zijdofmv|neut 13 det _ _ 13 buurt buurt N N soort|ev|neut 11 obj1 _ _ 14 van van Prep Prep voor 13 mod _ _ 15 Trafalgar_Square Trafalgar_Square MWU N_N eigen|ev|neut_eigen|ev|neut 14 obj1 _ _ 16 . . 
Punc Punc punt 15 punct _ _ """ if __name__ == "__main__": demo() nltk-3.7/nltk/parse/earleychart.py000066400000000000000000000424721420073152400173010ustar00rootroot00000000000000# Natural Language Toolkit: An Incremental Earley Chart Parser # # Copyright (C) 2001-2022 NLTK Project # Author: Peter Ljunglöf # Rob Speer # Edward Loper # Steven Bird # Jean Mark Gawron # URL: # For license information, see LICENSE.TXT """ Data classes and parser implementations for *incremental* chart parsers, which use dynamic programming to efficiently parse a text. A "chart parser" derives parse trees for a text by iteratively adding \"edges\" to a \"chart\". Each "edge" represents a hypothesis about the tree structure for a subsequence of the text. The "chart" is a \"blackboard\" for composing and combining these hypotheses. A parser is "incremental", if it guarantees that for all i, j where i < j, all edges ending at i are built before any edges ending at j. This is appealing for, say, speech recognizer hypothesis filtering. The main parser class is ``EarleyChartParser``, which is a top-down algorithm, originally formulated by Jay Earley (1970). """ from time import perf_counter from nltk.parse.chart import ( BottomUpPredictCombineRule, BottomUpPredictRule, CachedTopDownPredictRule, Chart, ChartParser, EdgeI, EmptyPredictRule, FilteredBottomUpPredictCombineRule, FilteredSingleEdgeFundamentalRule, LeafEdge, LeafInitRule, SingleEdgeFundamentalRule, TopDownInitRule, ) from nltk.parse.featurechart import ( FeatureBottomUpPredictCombineRule, FeatureBottomUpPredictRule, FeatureChart, FeatureChartParser, FeatureEmptyPredictRule, FeatureSingleEdgeFundamentalRule, FeatureTopDownInitRule, FeatureTopDownPredictRule, ) # //////////////////////////////////////////////////////////// # Incremental Chart # //////////////////////////////////////////////////////////// class IncrementalChart(Chart): def initialize(self): # A sequence of edge lists contained in this chart. self._edgelists = tuple([] for x in self._positions()) # The set of child pointer lists associated with each edge. self._edge_to_cpls = {} # Indexes mapping attribute values to lists of edges # (used by select()). self._indexes = {} def edges(self): return list(self.iteredges()) def iteredges(self): return (edge for edgelist in self._edgelists for edge in edgelist) def select(self, end, **restrictions): edgelist = self._edgelists[end] # If there are no restrictions, then return all edges. if restrictions == {}: return iter(edgelist) # Find the index corresponding to the given restrictions. restr_keys = sorted(restrictions.keys()) restr_keys = tuple(restr_keys) # If it doesn't exist, then create it. if restr_keys not in self._indexes: self._add_index(restr_keys) vals = tuple(restrictions[key] for key in restr_keys) return iter(self._indexes[restr_keys][end].get(vals, [])) def _add_index(self, restr_keys): # Make sure it's a valid index. for key in restr_keys: if not hasattr(EdgeI, key): raise ValueError("Bad restriction: %s" % key) # Create the index. index = self._indexes[restr_keys] = tuple({} for x in self._positions()) # Add all existing edges to the index. 
for end, edgelist in enumerate(self._edgelists): this_index = index[end] for edge in edgelist: vals = tuple(getattr(edge, key)() for key in restr_keys) this_index.setdefault(vals, []).append(edge) def _register_with_indexes(self, edge): end = edge.end() for (restr_keys, index) in self._indexes.items(): vals = tuple(getattr(edge, key)() for key in restr_keys) index[end].setdefault(vals, []).append(edge) def _append_edge(self, edge): self._edgelists[edge.end()].append(edge) def _positions(self): return range(self.num_leaves() + 1) class FeatureIncrementalChart(IncrementalChart, FeatureChart): def select(self, end, **restrictions): edgelist = self._edgelists[end] # If there are no restrictions, then return all edges. if restrictions == {}: return iter(edgelist) # Find the index corresponding to the given restrictions. restr_keys = sorted(restrictions.keys()) restr_keys = tuple(restr_keys) # If it doesn't exist, then create it. if restr_keys not in self._indexes: self._add_index(restr_keys) vals = tuple( self._get_type_if_possible(restrictions[key]) for key in restr_keys ) return iter(self._indexes[restr_keys][end].get(vals, [])) def _add_index(self, restr_keys): # Make sure it's a valid index. for key in restr_keys: if not hasattr(EdgeI, key): raise ValueError("Bad restriction: %s" % key) # Create the index. index = self._indexes[restr_keys] = tuple({} for x in self._positions()) # Add all existing edges to the index. for end, edgelist in enumerate(self._edgelists): this_index = index[end] for edge in edgelist: vals = tuple( self._get_type_if_possible(getattr(edge, key)()) for key in restr_keys ) this_index.setdefault(vals, []).append(edge) def _register_with_indexes(self, edge): end = edge.end() for (restr_keys, index) in self._indexes.items(): vals = tuple( self._get_type_if_possible(getattr(edge, key)()) for key in restr_keys ) index[end].setdefault(vals, []).append(edge) # //////////////////////////////////////////////////////////// # Incremental CFG Rules # //////////////////////////////////////////////////////////// class CompleteFundamentalRule(SingleEdgeFundamentalRule): def _apply_incomplete(self, chart, grammar, left_edge): end = left_edge.end() # When the chart is incremental, we only have to look for # empty complete edges here. for right_edge in chart.select( start=end, end=end, is_complete=True, lhs=left_edge.nextsym() ): new_edge = left_edge.move_dot_forward(right_edge.end()) if chart.insert_with_backpointer(new_edge, left_edge, right_edge): yield new_edge class CompleterRule(CompleteFundamentalRule): _fundamental_rule = CompleteFundamentalRule() def apply(self, chart, grammar, edge): if not isinstance(edge, LeafEdge): yield from self._fundamental_rule.apply(chart, grammar, edge) class ScannerRule(CompleteFundamentalRule): _fundamental_rule = CompleteFundamentalRule() def apply(self, chart, grammar, edge): if isinstance(edge, LeafEdge): yield from self._fundamental_rule.apply(chart, grammar, edge) class PredictorRule(CachedTopDownPredictRule): pass class FilteredCompleteFundamentalRule(FilteredSingleEdgeFundamentalRule): def apply(self, chart, grammar, edge): # Since the Filtered rule only works for grammars without empty productions, # we only have to bother with complete edges here. 
if edge.is_complete(): yield from self._apply_complete(chart, grammar, edge) # //////////////////////////////////////////////////////////// # Incremental FCFG Rules # //////////////////////////////////////////////////////////// class FeatureCompleteFundamentalRule(FeatureSingleEdgeFundamentalRule): def _apply_incomplete(self, chart, grammar, left_edge): fr = self._fundamental_rule end = left_edge.end() # When the chart is incremental, we only have to look for # empty complete edges here. for right_edge in chart.select( start=end, end=end, is_complete=True, lhs=left_edge.nextsym() ): yield from fr.apply(chart, grammar, left_edge, right_edge) class FeatureCompleterRule(CompleterRule): _fundamental_rule = FeatureCompleteFundamentalRule() class FeatureScannerRule(ScannerRule): _fundamental_rule = FeatureCompleteFundamentalRule() class FeaturePredictorRule(FeatureTopDownPredictRule): pass # //////////////////////////////////////////////////////////// # Incremental CFG Chart Parsers # //////////////////////////////////////////////////////////// EARLEY_STRATEGY = [ LeafInitRule(), TopDownInitRule(), CompleterRule(), ScannerRule(), PredictorRule(), ] TD_INCREMENTAL_STRATEGY = [ LeafInitRule(), TopDownInitRule(), CachedTopDownPredictRule(), CompleteFundamentalRule(), ] BU_INCREMENTAL_STRATEGY = [ LeafInitRule(), EmptyPredictRule(), BottomUpPredictRule(), CompleteFundamentalRule(), ] BU_LC_INCREMENTAL_STRATEGY = [ LeafInitRule(), EmptyPredictRule(), BottomUpPredictCombineRule(), CompleteFundamentalRule(), ] LC_INCREMENTAL_STRATEGY = [ LeafInitRule(), FilteredBottomUpPredictCombineRule(), FilteredCompleteFundamentalRule(), ] class IncrementalChartParser(ChartParser): """ An *incremental* chart parser implementing Jay Earley's parsing algorithm: | For each index end in [0, 1, ..., N]: | For each edge such that edge.end = end: | If edge is incomplete and edge.next is not a part of speech: | Apply PredictorRule to edge | If edge is incomplete and edge.next is a part of speech: | Apply ScannerRule to edge | If edge is complete: | Apply CompleterRule to edge | Return any complete parses in the chart """ def __init__( self, grammar, strategy=BU_LC_INCREMENTAL_STRATEGY, trace=0, trace_chart_width=50, chart_class=IncrementalChart, ): """ Create a new Earley chart parser, that uses ``grammar`` to parse texts. :type grammar: CFG :param grammar: The grammar used to parse texts. :type trace: int :param trace: The level of tracing that should be used when parsing a text. ``0`` will generate no tracing output; and higher numbers will produce more verbose tracing output. :type trace_chart_width: int :param trace_chart_width: The default total width reserved for the chart in trace output. The remainder of each line will be used to display edges. :param chart_class: The class that should be used to create the charts used by this parser. """ self._grammar = grammar self._trace = trace self._trace_chart_width = trace_chart_width self._chart_class = chart_class self._axioms = [] self._inference_rules = [] for rule in strategy: if rule.NUM_EDGES == 0: self._axioms.append(rule) elif rule.NUM_EDGES == 1: self._inference_rules.append(rule) else: raise ValueError( "Incremental inference rules must have " "NUM_EDGES == 0 or 1" ) def chart_parse(self, tokens, trace=None): if trace is None: trace = self._trace trace_new_edges = self._trace_new_edges tokens = list(tokens) self._grammar.check_coverage(tokens) chart = self._chart_class(tokens) grammar = self._grammar # Width, for printing trace edges. 
trace_edge_width = self._trace_chart_width // (chart.num_leaves() + 1) if trace: print(chart.pretty_format_leaves(trace_edge_width)) for axiom in self._axioms: new_edges = list(axiom.apply(chart, grammar)) trace_new_edges(chart, axiom, new_edges, trace, trace_edge_width) inference_rules = self._inference_rules for end in range(chart.num_leaves() + 1): if trace > 1: print("\n* Processing queue:", end, "\n") agenda = list(chart.select(end=end)) while agenda: edge = agenda.pop() for rule in inference_rules: new_edges = list(rule.apply(chart, grammar, edge)) trace_new_edges(chart, rule, new_edges, trace, trace_edge_width) for new_edge in new_edges: if new_edge.end() == end: agenda.append(new_edge) return chart class EarleyChartParser(IncrementalChartParser): def __init__(self, grammar, **parser_args): IncrementalChartParser.__init__(self, grammar, EARLEY_STRATEGY, **parser_args) class IncrementalTopDownChartParser(IncrementalChartParser): def __init__(self, grammar, **parser_args): IncrementalChartParser.__init__( self, grammar, TD_INCREMENTAL_STRATEGY, **parser_args ) class IncrementalBottomUpChartParser(IncrementalChartParser): def __init__(self, grammar, **parser_args): IncrementalChartParser.__init__( self, grammar, BU_INCREMENTAL_STRATEGY, **parser_args ) class IncrementalBottomUpLeftCornerChartParser(IncrementalChartParser): def __init__(self, grammar, **parser_args): IncrementalChartParser.__init__( self, grammar, BU_LC_INCREMENTAL_STRATEGY, **parser_args ) class IncrementalLeftCornerChartParser(IncrementalChartParser): def __init__(self, grammar, **parser_args): if not grammar.is_nonempty(): raise ValueError( "IncrementalLeftCornerParser only works for grammars " "without empty productions." ) IncrementalChartParser.__init__( self, grammar, LC_INCREMENTAL_STRATEGY, **parser_args ) # //////////////////////////////////////////////////////////// # Incremental FCFG Chart Parsers # //////////////////////////////////////////////////////////// EARLEY_FEATURE_STRATEGY = [ LeafInitRule(), FeatureTopDownInitRule(), FeatureCompleterRule(), FeatureScannerRule(), FeaturePredictorRule(), ] TD_INCREMENTAL_FEATURE_STRATEGY = [ LeafInitRule(), FeatureTopDownInitRule(), FeatureTopDownPredictRule(), FeatureCompleteFundamentalRule(), ] BU_INCREMENTAL_FEATURE_STRATEGY = [ LeafInitRule(), FeatureEmptyPredictRule(), FeatureBottomUpPredictRule(), FeatureCompleteFundamentalRule(), ] BU_LC_INCREMENTAL_FEATURE_STRATEGY = [ LeafInitRule(), FeatureEmptyPredictRule(), FeatureBottomUpPredictCombineRule(), FeatureCompleteFundamentalRule(), ] class FeatureIncrementalChartParser(IncrementalChartParser, FeatureChartParser): def __init__( self, grammar, strategy=BU_LC_INCREMENTAL_FEATURE_STRATEGY, trace_chart_width=20, chart_class=FeatureIncrementalChart, **parser_args ): IncrementalChartParser.__init__( self, grammar, strategy=strategy, trace_chart_width=trace_chart_width, chart_class=chart_class, **parser_args ) class FeatureEarleyChartParser(FeatureIncrementalChartParser): def __init__(self, grammar, **parser_args): FeatureIncrementalChartParser.__init__( self, grammar, EARLEY_FEATURE_STRATEGY, **parser_args ) class FeatureIncrementalTopDownChartParser(FeatureIncrementalChartParser): def __init__(self, grammar, **parser_args): FeatureIncrementalChartParser.__init__( self, grammar, TD_INCREMENTAL_FEATURE_STRATEGY, **parser_args ) class FeatureIncrementalBottomUpChartParser(FeatureIncrementalChartParser): def __init__(self, grammar, **parser_args): FeatureIncrementalChartParser.__init__( self, grammar, 
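# A runnable sketch of the incremental parsers defined above, using a small
# hand-written CFG; the grammar and sentence are illustrative only.
from nltk.grammar import CFG
from nltk.parse.earleychart import EarleyChartParser

grammar = CFG.fromstring("""
S -> NP VP
NP -> Det N | NP PP
VP -> V NP | VP PP
PP -> P NP
Det -> 'the' | 'a'
N -> 'dog' | 'man' | 'park'
V -> 'saw'
P -> 'in'
""")

parser = EarleyChartParser(grammar)
tokens = "the man saw a dog in the park".split()
for tree in parser.parse(tokens):
    print(tree)

# The lower-level entry point returns the chart itself, from which complete
# parses can be read off; the PP-attachment ambiguity yields two parses here.
chart = parser.chart_parse(tokens)
print(len(list(chart.parses(grammar.start()))))   # 2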
BU_INCREMENTAL_FEATURE_STRATEGY, **parser_args ) class FeatureIncrementalBottomUpLeftCornerChartParser(FeatureIncrementalChartParser): def __init__(self, grammar, **parser_args): FeatureIncrementalChartParser.__init__( self, grammar, BU_LC_INCREMENTAL_FEATURE_STRATEGY, **parser_args ) # //////////////////////////////////////////////////////////// # Demonstration # //////////////////////////////////////////////////////////// def demo( print_times=True, print_grammar=False, print_trees=True, trace=2, sent="I saw John with a dog with my cookie", numparses=5, ): """ A demonstration of the Earley parsers. """ import sys import time from nltk.parse.chart import demo_grammar # The grammar for ChartParser and SteppingChartParser: grammar = demo_grammar() if print_grammar: print("* Grammar") print(grammar) # Tokenize the sample sentence. print("* Sentence:") print(sent) tokens = sent.split() print(tokens) print() # Do the parsing. earley = EarleyChartParser(grammar, trace=trace) t = perf_counter() chart = earley.chart_parse(tokens) parses = list(chart.parses(grammar.start())) t = perf_counter() - t # Print results. if numparses: assert len(parses) == numparses, "Not all parses found" if print_trees: for tree in parses: print(tree) else: print("Nr trees:", len(parses)) if print_times: print("Time:", t) if __name__ == "__main__": demo() nltk-3.7/nltk/parse/evaluate.py000066400000000000000000000103711420073152400165750ustar00rootroot00000000000000# Natural Language Toolkit: evaluation of dependency parser # # Author: Long Duong # # Copyright (C) 2001-2022 NLTK Project # URL: # For license information, see LICENSE.TXT import unicodedata class DependencyEvaluator: """ Class for measuring labelled and unlabelled attachment score for dependency parsing. Note that the evaluation ignores punctuation. >>> from nltk.parse import DependencyGraph, DependencyEvaluator >>> gold_sent = DependencyGraph(\""" ... Pierre NNP 2 NMOD ... Vinken NNP 8 SUB ... , , 2 P ... 61 CD 5 NMOD ... years NNS 6 AMOD ... old JJ 2 NMOD ... , , 2 P ... will MD 0 ROOT ... join VB 8 VC ... the DT 11 NMOD ... board NN 9 OBJ ... as IN 9 VMOD ... a DT 15 NMOD ... nonexecutive JJ 15 NMOD ... director NN 12 PMOD ... Nov. NNP 9 VMOD ... 29 CD 16 NMOD ... . . 9 VMOD ... \""") >>> parsed_sent = DependencyGraph(\""" ... Pierre NNP 8 NMOD ... Vinken NNP 1 SUB ... , , 3 P ... 61 CD 6 NMOD ... years NNS 6 AMOD ... old JJ 2 NMOD ... , , 3 AMOD ... will MD 0 ROOT ... join VB 8 VC ... the DT 11 AMOD ... board NN 9 OBJECT ... as IN 9 NMOD ... a DT 15 NMOD ... nonexecutive JJ 15 NMOD ... director NN 12 PMOD ... Nov. NNP 9 VMOD ... 29 CD 16 NMOD ... . . 9 VMOD ... \""") >>> de = DependencyEvaluator([parsed_sent],[gold_sent]) >>> las, uas = de.eval() >>> las 0.6... >>> uas 0.8... >>> abs(uas - 0.8) < 0.00001 True """ def __init__(self, parsed_sents, gold_sents): """ :param parsed_sents: the list of parsed_sents as the output of parser :type parsed_sents: list(DependencyGraph) """ self._parsed_sents = parsed_sents self._gold_sents = gold_sents def _remove_punct(self, inStr): """ Function to remove punctuation from Unicode string. 
:param input: the input string :return: Unicode string after remove all punctuation """ punc_cat = {"Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po"} return "".join(x for x in inStr if unicodedata.category(x) not in punc_cat) def eval(self): """ Return the Labeled Attachment Score (LAS) and Unlabeled Attachment Score (UAS) :return : tuple(float,float) """ if len(self._parsed_sents) != len(self._gold_sents): raise ValueError( " Number of parsed sentence is different with number of gold sentence." ) corr = 0 corrL = 0 total = 0 for i in range(len(self._parsed_sents)): parsed_sent_nodes = self._parsed_sents[i].nodes gold_sent_nodes = self._gold_sents[i].nodes if len(parsed_sent_nodes) != len(gold_sent_nodes): raise ValueError("Sentences must have equal length.") for parsed_node_address, parsed_node in parsed_sent_nodes.items(): gold_node = gold_sent_nodes[parsed_node_address] if parsed_node["word"] is None: continue if parsed_node["word"] != gold_node["word"]: raise ValueError("Sentence sequence is not matched.") # Ignore if word is punctuation by default # if (parsed_sent[j]["word"] in string.punctuation): if self._remove_punct(parsed_node["word"]) == "": continue total += 1 if parsed_node["head"] == gold_node["head"]: corr += 1 if parsed_node["rel"] == gold_node["rel"]: corrL += 1 return corrL / total, corr / total nltk-3.7/nltk/parse/featurechart.py000066400000000000000000000525421420073152400174520ustar00rootroot00000000000000# Natural Language Toolkit: Chart Parser for Feature-Based Grammars # # Copyright (C) 2001-2022 NLTK Project # Author: Rob Speer # Peter Ljunglöf # URL: # For license information, see LICENSE.TXT """ Extension of chart parsing implementation to handle grammars with feature structures as nodes. """ from time import perf_counter from nltk.featstruct import TYPE, FeatStruct, find_variables, unify from nltk.grammar import ( CFG, FeatStructNonterminal, Nonterminal, Production, is_nonterminal, is_terminal, ) from nltk.parse.chart import ( BottomUpPredictCombineRule, BottomUpPredictRule, CachedTopDownPredictRule, Chart, ChartParser, EdgeI, EmptyPredictRule, FundamentalRule, LeafInitRule, SingleEdgeFundamentalRule, TopDownInitRule, TreeEdge, ) from nltk.sem import logic from nltk.tree import Tree # //////////////////////////////////////////////////////////// # Tree Edge # //////////////////////////////////////////////////////////// class FeatureTreeEdge(TreeEdge): """ A specialized tree edge that allows shared variable bindings between nonterminals on the left-hand side and right-hand side. Each ``FeatureTreeEdge`` contains a set of ``bindings``, i.e., a dictionary mapping from variables to values. If the edge is not complete, then these bindings are simply stored. However, if the edge is complete, then the constructor applies these bindings to every nonterminal in the edge whose symbol implements the interface ``SubstituteBindingsI``. """ def __init__(self, span, lhs, rhs, dot=0, bindings=None): """ Construct a new edge. If the edge is incomplete (i.e., if ``dot alpha \* B1 beta][i:j]`` - ``[B2 -> gamma \*][j:k]`` licenses the edge: - ``[A -> alpha B3 \* beta][i:j]`` assuming that B1 and B2 can be unified to generate B3. """ def apply(self, chart, grammar, left_edge, right_edge): # Make sure the rule is applicable. 
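# A tiny worked example of the LAS/UAS computation in eval() above. The
# "parsed" graph gets every head right but one relation wrong, so UAS is 1.0
# while LAS drops to 2/3; the sentence and labels are invented for
# illustration.
from nltk.parse import DependencyGraph, DependencyEvaluator

gold = DependencyGraph("John NNP 2 SUB\nloves VBZ 0 ROOT\nMary NNP 2 OBJ")
parsed = DependencyGraph("John NNP 2 OBJ\nloves VBZ 0 ROOT\nMary NNP 2 OBJ")

las, uas = DependencyEvaluator([parsed], [gold]).eval()
print(las, uas)   # 0.666... 1.0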
if not ( left_edge.end() == right_edge.start() and left_edge.is_incomplete() and right_edge.is_complete() and isinstance(left_edge, FeatureTreeEdge) ): return found = right_edge.lhs() nextsym = left_edge.nextsym() if isinstance(right_edge, FeatureTreeEdge): if not is_nonterminal(nextsym): return if left_edge.nextsym()[TYPE] != right_edge.lhs()[TYPE]: return # Create a copy of the bindings. bindings = left_edge.bindings() # We rename vars here, because we don't want variables # from the two different productions to match. found = found.rename_variables(used_vars=left_edge.variables()) # Unify B1 (left_edge.nextsym) with B2 (right_edge.lhs) to # generate B3 (result). result = unify(nextsym, found, bindings, rename_vars=False) if result is None: return else: if nextsym != found: return # Create a copy of the bindings. bindings = left_edge.bindings() # Construct the new edge. new_edge = left_edge.move_dot_forward(right_edge.end(), bindings) # Add it to the chart, with appropriate child pointers. if chart.insert_with_backpointer(new_edge, left_edge, right_edge): yield new_edge class FeatureSingleEdgeFundamentalRule(SingleEdgeFundamentalRule): """ A specialized version of the completer / single edge fundamental rule that operates on nonterminals whose symbols are ``FeatStructNonterminal``. Rather than simply comparing the nonterminals for equality, they are unified. """ _fundamental_rule = FeatureFundamentalRule() def _apply_complete(self, chart, grammar, right_edge): fr = self._fundamental_rule for left_edge in chart.select( end=right_edge.start(), is_complete=False, nextsym=right_edge.lhs() ): yield from fr.apply(chart, grammar, left_edge, right_edge) def _apply_incomplete(self, chart, grammar, left_edge): fr = self._fundamental_rule for right_edge in chart.select( start=left_edge.end(), is_complete=True, lhs=left_edge.nextsym() ): yield from fr.apply(chart, grammar, left_edge, right_edge) # //////////////////////////////////////////////////////////// # Top-Down Prediction # //////////////////////////////////////////////////////////// class FeatureTopDownInitRule(TopDownInitRule): def apply(self, chart, grammar): for prod in grammar.productions(lhs=grammar.start()): new_edge = FeatureTreeEdge.from_production(prod, 0) if chart.insert(new_edge, ()): yield new_edge class FeatureTopDownPredictRule(CachedTopDownPredictRule): r""" A specialized version of the (cached) top down predict rule that operates on nonterminals whose symbols are ``FeatStructNonterminal``. Rather than simply comparing the nonterminals for equality, they are unified. The top down expand rule states that: - ``[A -> alpha \* B1 beta][i:j]`` licenses the edge: - ``[B2 -> \* gamma][j:j]`` for each grammar production ``B2 -> gamma``, assuming that B1 and B2 can be unified. """ def apply(self, chart, grammar, edge): if edge.is_complete(): return nextsym, index = edge.nextsym(), edge.end() if not is_nonterminal(nextsym): return # If we've already applied this rule to an edge with the same # next & end, and the chart & grammar have not changed, then # just return (no new edges to add). nextsym_with_bindings = edge.next_with_bindings() done = self._done.get((nextsym_with_bindings, index), (None, None)) if done[0] is chart and done[1] is grammar: return for prod in grammar.productions(lhs=nextsym): # If the left corner in the predicted production is # leaf, it must match with the input. 
if prod.rhs(): first = prod.rhs()[0] if is_terminal(first): if index >= chart.num_leaves(): continue if first != chart.leaf(index): continue # We rename vars here, because we don't want variables # from the two different productions to match. if unify(prod.lhs(), nextsym_with_bindings, rename_vars=True): new_edge = FeatureTreeEdge.from_production(prod, edge.end()) if chart.insert(new_edge, ()): yield new_edge # Record the fact that we've applied this rule. self._done[nextsym_with_bindings, index] = (chart, grammar) # //////////////////////////////////////////////////////////// # Bottom-Up Prediction # //////////////////////////////////////////////////////////// class FeatureBottomUpPredictRule(BottomUpPredictRule): def apply(self, chart, grammar, edge): if edge.is_incomplete(): return for prod in grammar.productions(rhs=edge.lhs()): if isinstance(edge, FeatureTreeEdge): _next = prod.rhs()[0] if not is_nonterminal(_next): continue new_edge = FeatureTreeEdge.from_production(prod, edge.start()) if chart.insert(new_edge, ()): yield new_edge class FeatureBottomUpPredictCombineRule(BottomUpPredictCombineRule): def apply(self, chart, grammar, edge): if edge.is_incomplete(): return found = edge.lhs() for prod in grammar.productions(rhs=found): bindings = {} if isinstance(edge, FeatureTreeEdge): _next = prod.rhs()[0] if not is_nonterminal(_next): continue # We rename vars here, because we don't want variables # from the two different productions to match. used_vars = find_variables( (prod.lhs(),) + prod.rhs(), fs_class=FeatStruct ) found = found.rename_variables(used_vars=used_vars) result = unify(_next, found, bindings, rename_vars=False) if result is None: continue new_edge = FeatureTreeEdge.from_production( prod, edge.start() ).move_dot_forward(edge.end(), bindings) if chart.insert(new_edge, (edge,)): yield new_edge class FeatureEmptyPredictRule(EmptyPredictRule): def apply(self, chart, grammar): for prod in grammar.productions(empty=True): for index in range(chart.num_leaves() + 1): new_edge = FeatureTreeEdge.from_production(prod, index) if chart.insert(new_edge, ()): yield new_edge # //////////////////////////////////////////////////////////// # Feature Chart Parser # //////////////////////////////////////////////////////////// TD_FEATURE_STRATEGY = [ LeafInitRule(), FeatureTopDownInitRule(), FeatureTopDownPredictRule(), FeatureSingleEdgeFundamentalRule(), ] BU_FEATURE_STRATEGY = [ LeafInitRule(), FeatureEmptyPredictRule(), FeatureBottomUpPredictRule(), FeatureSingleEdgeFundamentalRule(), ] BU_LC_FEATURE_STRATEGY = [ LeafInitRule(), FeatureEmptyPredictRule(), FeatureBottomUpPredictCombineRule(), FeatureSingleEdgeFundamentalRule(), ] class FeatureChartParser(ChartParser): def __init__( self, grammar, strategy=BU_LC_FEATURE_STRATEGY, trace_chart_width=20, chart_class=FeatureChart, **parser_args, ): ChartParser.__init__( self, grammar, strategy=strategy, trace_chart_width=trace_chart_width, chart_class=chart_class, **parser_args, ) class FeatureTopDownChartParser(FeatureChartParser): def __init__(self, grammar, **parser_args): FeatureChartParser.__init__(self, grammar, TD_FEATURE_STRATEGY, **parser_args) class FeatureBottomUpChartParser(FeatureChartParser): def __init__(self, grammar, **parser_args): FeatureChartParser.__init__(self, grammar, BU_FEATURE_STRATEGY, **parser_args) class FeatureBottomUpLeftCornerChartParser(FeatureChartParser): def __init__(self, grammar, **parser_args): FeatureChartParser.__init__( self, grammar, BU_LC_FEATURE_STRATEGY, **parser_args ) # 
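# A short agreement sketch for the feature chart parsers defined above. The
# toy grammar below is illustrative (it is not this module's demo grammar);
# the NUM feature forces determiner/noun/verb agreement, so only the matching
# sentence parses.
from nltk.grammar import FeatureGrammar
from nltk.parse.featurechart import FeatureChartParser

fcfg = FeatureGrammar.fromstring("""
% start S
S -> NP[NUM=?n] VP[NUM=?n]
NP[NUM=?n] -> Det[NUM=?n] N[NUM=?n]
VP[NUM=?n] -> V[NUM=?n]
Det[NUM=sg] -> 'this'
Det[NUM=pl] -> 'these'
N[NUM=sg] -> 'dog'
N[NUM=pl] -> 'dogs'
V[NUM=sg] -> 'barks'
V[NUM=pl] -> 'bark'
""")

parser = FeatureChartParser(fcfg)
print(len(list(parser.parse("these dogs bark".split()))))   # 1
print(len(list(parser.parse("this dogs bark".split()))))    # 0 - agreement fails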
//////////////////////////////////////////////////////////// # Instantiate Variable Chart # //////////////////////////////////////////////////////////// class InstantiateVarsChart(FeatureChart): """ A specialized chart that 'instantiates' variables whose names start with '@', by replacing them with unique new variables. In particular, whenever a complete edge is added to the chart, any variables in the edge's ``lhs`` whose names start with '@' will be replaced by unique new ``Variable``. """ def __init__(self, tokens): FeatureChart.__init__(self, tokens) def initialize(self): self._instantiated = set() FeatureChart.initialize(self) def insert(self, edge, child_pointer_list): if edge in self._instantiated: return False self.instantiate_edge(edge) return FeatureChart.insert(self, edge, child_pointer_list) def instantiate_edge(self, edge): """ If the edge is a ``FeatureTreeEdge``, and it is complete, then instantiate all variables whose names start with '@', by replacing them with unique new variables. Note that instantiation is done in-place, since the parsing algorithms might already hold a reference to the edge for future use. """ # If the edge is a leaf, or is not complete, or is # already in the chart, then just return it as-is. if not isinstance(edge, FeatureTreeEdge): return if not edge.is_complete(): return if edge in self._edge_to_cpls: return # Get a list of variables that need to be instantiated. # If there are none, then return as-is. inst_vars = self.inst_vars(edge) if not inst_vars: return # Instantiate the edge! self._instantiated.add(edge) edge._lhs = edge.lhs().substitute_bindings(inst_vars) def inst_vars(self, edge): return { var: logic.unique_variable() for var in edge.lhs().variables() if var.name.startswith("@") } # //////////////////////////////////////////////////////////// # Demo # //////////////////////////////////////////////////////////// def demo_grammar(): from nltk.grammar import FeatureGrammar return FeatureGrammar.fromstring( """ S -> NP VP PP -> Prep NP NP -> NP PP VP -> VP PP VP -> Verb NP VP -> Verb NP -> Det[pl=?x] Noun[pl=?x] NP -> "John" NP -> "I" Det -> "the" Det -> "my" Det[-pl] -> "a" Noun[-pl] -> "dog" Noun[-pl] -> "cookie" Verb -> "ate" Verb -> "saw" Prep -> "with" Prep -> "under" """ ) def demo( print_times=True, print_grammar=True, print_trees=True, print_sentence=True, trace=1, parser=FeatureChartParser, sent="I saw John with a dog with my cookie", ): import sys import time print() grammar = demo_grammar() if print_grammar: print(grammar) print() print("*", parser.__name__) if print_sentence: print("Sentence:", sent) tokens = sent.split() t = perf_counter() cp = parser(grammar, trace=trace) chart = cp.chart_parse(tokens) trees = list(chart.parses(grammar.start())) if print_times: print("Time: %s" % (perf_counter() - t)) if print_trees: for tree in trees: print(tree) else: print("Nr trees:", len(trees)) def run_profile(): import profile profile.run("for i in range(1): demo()", "/tmp/profile.out") import pstats p = pstats.Stats("/tmp/profile.out") p.strip_dirs().sort_stats("time", "cum").print_stats(60) p.strip_dirs().sort_stats("cum", "time").print_stats(60) if __name__ == "__main__": from nltk.data import load demo() print() grammar = load("grammars/book_grammars/feat0.fcfg") cp = FeatureChartParser(grammar, trace=2) sent = "Kim likes children" tokens = sent.split() trees = cp.parse(tokens) for tree in trees: print(tree) nltk-3.7/nltk/parse/generate.py000066400000000000000000000043701420073152400165630ustar00rootroot00000000000000# Natural 
Language Toolkit: Generating from a CFG # # Copyright (C) 2001-2022 NLTK Project # Author: Steven Bird # Peter Ljunglöf # URL: # For license information, see LICENSE.TXT # import itertools import sys from nltk.grammar import Nonterminal def generate(grammar, start=None, depth=None, n=None): """ Generates an iterator of all sentences from a CFG. :param grammar: The Grammar used to generate sentences. :param start: The Nonterminal from which to start generate sentences. :param depth: The maximal depth of the generated tree. :param n: The maximum number of sentences to return. :return: An iterator of lists of terminal tokens. """ if not start: start = grammar.start() if depth is None: depth = sys.maxsize iter = _generate_all(grammar, [start], depth) if n: iter = itertools.islice(iter, n) return iter def _generate_all(grammar, items, depth): if items: try: for frag1 in _generate_one(grammar, items[0], depth): for frag2 in _generate_all(grammar, items[1:], depth): yield frag1 + frag2 except RecursionError as error: # Helpful error message while still showing the recursion stack. raise RuntimeError( "The grammar has rule(s) that yield infinite recursion!" ) from error else: yield [] def _generate_one(grammar, item, depth): if depth > 0: if isinstance(item, Nonterminal): for prod in grammar.productions(lhs=item): yield from _generate_all(grammar, prod.rhs(), depth - 1) else: yield [item] demo_grammar = """ S -> NP VP NP -> Det N PP -> P NP VP -> 'slept' | 'saw' NP | 'walked' PP Det -> 'the' | 'a' N -> 'man' | 'park' | 'dog' P -> 'in' | 'with' """ def demo(N=23): from nltk.grammar import CFG print("Generating the first %d sentences for demo grammar:" % (N,)) print(demo_grammar) grammar = CFG.fromstring(demo_grammar) for n, sent in enumerate(generate(grammar, n=N), 1): print("%3d. 
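# A minimal use of the generate() helper above with a deliberately tiny,
# non-recursive grammar (illustrative only); n caps the number of sentences
# returned, and depth caps tree depth, which matters once a grammar recurses.
from nltk.grammar import CFG
from nltk.parse.generate import generate

toy = CFG.fromstring("""
S -> NP VP
NP -> Det N
VP -> V NP
Det -> 'the'
N -> 'dog' | 'cat'
V -> 'chased'
""")

for sent in generate(toy, n=4):
    print(" ".join(sent))
# the dog chased the dog
# the dog chased the cat
# the cat chased the dog
# the cat chased the cat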
%s" % (n, " ".join(sent))) if __name__ == "__main__": demo() nltk-3.7/nltk/parse/malt.py000066400000000000000000000374621420073152400157360ustar00rootroot00000000000000# Natural Language Toolkit: Interface to MaltParser # # Author: Dan Garrette # Contributor: Liling Tan, Mustufain, osamamukhtar11 # # Copyright (C) 2001-2022 NLTK Project # URL: # For license information, see LICENSE.TXT import inspect import os import subprocess import sys import tempfile from nltk.data import ZipFilePathPointer from nltk.internals import find_dir, find_file, find_jars_within_path from nltk.parse.api import ParserI from nltk.parse.dependencygraph import DependencyGraph from nltk.parse.util import taggedsents_to_conll def malt_regex_tagger(): from nltk.tag import RegexpTagger _tagger = RegexpTagger( [ (r"\.$", "."), (r"\,$", ","), (r"\?$", "?"), # fullstop, comma, Qmark (r"\($", "("), (r"\)$", ")"), # round brackets (r"\[$", "["), (r"\]$", "]"), # square brackets (r"^-?[0-9]+(\.[0-9]+)?$", "CD"), # cardinal numbers (r"(The|the|A|a|An|an)$", "DT"), # articles (r"(He|he|She|she|It|it|I|me|Me|You|you)$", "PRP"), # pronouns (r"(His|his|Her|her|Its|its)$", "PRP$"), # possessive (r"(my|Your|your|Yours|yours)$", "PRP$"), # possessive (r"(on|On|in|In|at|At|since|Since)$", "IN"), # time prepopsitions (r"(for|For|ago|Ago|before|Before)$", "IN"), # time prepopsitions (r"(till|Till|until|Until)$", "IN"), # time prepopsitions (r"(by|By|beside|Beside)$", "IN"), # space prepopsitions (r"(under|Under|below|Below)$", "IN"), # space prepopsitions (r"(over|Over|above|Above)$", "IN"), # space prepopsitions (r"(across|Across|through|Through)$", "IN"), # space prepopsitions (r"(into|Into|towards|Towards)$", "IN"), # space prepopsitions (r"(onto|Onto|from|From)$", "IN"), # space prepopsitions (r".*able$", "JJ"), # adjectives (r".*ness$", "NN"), # nouns formed from adjectives (r".*ly$", "RB"), # adverbs (r".*s$", "NNS"), # plural nouns (r".*ing$", "VBG"), # gerunds (r".*ed$", "VBD"), # past tense verbs (r".*", "NN"), # nouns (default) ] ) return _tagger.tag def find_maltparser(parser_dirname): """ A module to find MaltParser .jar file and its dependencies. """ if os.path.exists(parser_dirname): # If a full path is given. _malt_dir = parser_dirname else: # Try to find path to maltparser directory in environment variables. _malt_dir = find_dir(parser_dirname, env_vars=("MALT_PARSER",)) # Checks that that the found directory contains all the necessary .jar malt_dependencies = ["", "", ""] _malt_jars = set(find_jars_within_path(_malt_dir)) _jars = {os.path.split(jar)[1] for jar in _malt_jars} malt_dependencies = {"log4j.jar", "libsvm.jar", "liblinear-1.8.jar"} assert malt_dependencies.issubset(_jars) assert any( filter(lambda i: i.startswith("maltparser-") and i.endswith(".jar"), _jars) ) return list(_malt_jars) def find_malt_model(model_filename): """ A module to find pre-trained MaltParser model. """ if model_filename is None: return "malt_temp.mco" elif os.path.exists(model_filename): # If a full path is given. return model_filename else: # Try to find path to malt model in environment variables. return find_file(model_filename, env_vars=("MALT_MODEL",), verbose=False) class MaltParser(ParserI): """ A class for dependency parsing with MaltParser. 
The input is the paths to: - (optionally) a maltparser directory - (optionally) the path to a pre-trained MaltParser .mco model file - (optionally) the tagger to use for POS tagging before parsing - (optionally) additional Java arguments Example: >>> from nltk.parse import malt >>> # With MALT_PARSER and MALT_MODEL environment set. >>> mp = malt.MaltParser(model_filename='engmalt.linear-1.7.mco') # doctest: +SKIP >>> mp.parse_one('I shot an elephant in my pajamas .'.split()).tree() # doctest: +SKIP (shot I (elephant an) (in (pajamas my)) .) >>> # Without MALT_PARSER and MALT_MODEL environment. >>> mp = malt.MaltParser('/home/user/maltparser-1.9.2/', '/home/user/engmalt.linear-1.7.mco') # doctest: +SKIP >>> mp.parse_one('I shot an elephant in my pajamas .'.split()).tree() # doctest: +SKIP (shot I (elephant an) (in (pajamas my)) .) """ def __init__( self, parser_dirname="", model_filename=None, tagger=None, additional_java_args=None, ): """ An interface for parsing with the Malt Parser. :param parser_dirname: The path to the maltparser directory that contains the maltparser-1.x.jar :type parser_dirname: str :param model_filename: The name of the pre-trained model with .mco file extension. If provided, training will not be required. (see http://www.maltparser.org/mco/mco.html and see http://www.patful.com/chalk/node/185) :type model_filename: str :param tagger: The tagger used to POS tag the raw string before formatting to CONLL format. It should behave like `nltk.pos_tag` :type tagger: function :param additional_java_args: This is the additional Java arguments that one can use when calling Maltparser, usually this is the heapsize limits, e.g. `additional_java_args=['-Xmx1024m']` (see https://goo.gl/mpDBvQ) :type additional_java_args: list """ # Find all the necessary jar files for MaltParser. self.malt_jars = find_maltparser(parser_dirname) # Initialize additional java arguments. self.additional_java_args = ( additional_java_args if additional_java_args is not None else [] ) # Initialize model. self.model = find_malt_model(model_filename) self._trained = self.model != "malt_temp.mco" # Set the working_dir parameters i.e. `-w` from MaltParser's option. self.working_dir = tempfile.gettempdir() # Initialize POS tagger. self.tagger = tagger if tagger is not None else malt_regex_tagger() def parse_tagged_sents(self, sentences, verbose=False, top_relation_label="null"): """ Use MaltParser to parse multiple POS tagged sentences. Takes multiple sentences where each sentence is a list of (word, tag) tuples. The sentences must have already been tokenized and tagged. :param sentences: Input sentences to parse :type sentence: list(list(tuple(str, str))) :return: iter(iter(``DependencyGraph``)) the dependency graph representation of each sentence """ if not self._trained: raise Exception("Parser has not been trained. Call train() first.") with tempfile.NamedTemporaryFile( prefix="malt_input.conll.", dir=self.working_dir, mode="w", delete=False ) as input_file: with tempfile.NamedTemporaryFile( prefix="malt_output.conll.", dir=self.working_dir, mode="w", delete=False, ) as output_file: # Convert list of sentences to CONLL format. for line in taggedsents_to_conll(sentences): input_file.write(str(line)) input_file.close() # Generate command to run maltparser. cmd = self.generate_malt_command( input_file.name, output_file.name, mode="parse" ) # This is a maltparser quirk, it needs to be run # where the model file is. otherwise it goes into an awkward # missing .jars or strange -w working_dir problem. 
_current_path = os.getcwd() # Remembers the current path. try: # Change to modelfile path os.chdir(os.path.split(self.model)[0]) except: pass ret = self._execute(cmd, verbose) # Run command. os.chdir(_current_path) # Change back to current path. if ret != 0: raise Exception( "MaltParser parsing (%s) failed with exit " "code %d" % (" ".join(cmd), ret) ) # Must return iter(iter(Tree)) with open(output_file.name) as infile: for tree_str in infile.read().split("\n\n"): yield ( iter( [ DependencyGraph( tree_str, top_relation_label=top_relation_label ) ] ) ) os.remove(input_file.name) os.remove(output_file.name) def parse_sents(self, sentences, verbose=False, top_relation_label="null"): """ Use MaltParser to parse multiple sentences. Takes a list of sentences, where each sentence is a list of words. Each sentence will be automatically tagged with this MaltParser instance's tagger. :param sentences: Input sentences to parse :type sentence: list(list(str)) :return: iter(DependencyGraph) """ tagged_sentences = (self.tagger(sentence) for sentence in sentences) return self.parse_tagged_sents( tagged_sentences, verbose, top_relation_label=top_relation_label ) def generate_malt_command(self, inputfilename, outputfilename=None, mode=None): """ This function generates the maltparser command use at the terminal. :param inputfilename: path to the input file :type inputfilename: str :param outputfilename: path to the output file :type outputfilename: str """ cmd = ["java"] cmd += self.additional_java_args # Adds additional java arguments # Joins classpaths with ";" if on Windows and on Linux/Mac use ":" classpaths_separator = ";" if sys.platform.startswith("win") else ":" cmd += [ "-cp", classpaths_separator.join(self.malt_jars), ] # Adds classpaths for jars cmd += ["org.maltparser.Malt"] # Adds the main function. # Adds the model file. if os.path.exists(self.model): # when parsing cmd += ["-c", os.path.split(self.model)[-1]] else: # when learning cmd += ["-c", self.model] cmd += ["-i", inputfilename] if mode == "parse": cmd += ["-o", outputfilename] cmd += ["-m", mode] # mode use to generate parses. return cmd @staticmethod def _execute(cmd, verbose=False): output = None if verbose else subprocess.PIPE p = subprocess.Popen(cmd, stdout=output, stderr=output) return p.wait() def train(self, depgraphs, verbose=False): """ Train MaltParser from a list of ``DependencyGraph`` objects :param depgraphs: list of ``DependencyGraph`` objects for training input data :type depgraphs: DependencyGraph """ # Write the conll_str to malt_train.conll file in /tmp/ with tempfile.NamedTemporaryFile( prefix="malt_train.conll.", dir=self.working_dir, mode="w", delete=False ) as input_file: input_str = "\n".join(dg.to_conll(10) for dg in depgraphs) input_file.write(str(input_str)) # Trains the model with the malt_train.conll self.train_from_file(input_file.name, verbose=verbose) # Removes the malt_train.conll once training finishes. 
os.remove(input_file.name) def train_from_file(self, conll_file, verbose=False): """ Train MaltParser from a file :param conll_file: str for the filename of the training input data :type conll_file: str """ # If conll_file is a ZipFilePathPointer, # then we need to do some extra massaging if isinstance(conll_file, ZipFilePathPointer): with tempfile.NamedTemporaryFile( prefix="malt_train.conll.", dir=self.working_dir, mode="w", delete=False ) as input_file: with conll_file.open() as conll_input_file: conll_str = conll_input_file.read() input_file.write(str(conll_str)) return self.train_from_file(input_file.name, verbose=verbose) # Generate command to run maltparser. cmd = self.generate_malt_command(conll_file, mode="learn") ret = self._execute(cmd, verbose) if ret != 0: raise Exception( "MaltParser training (%s) failed with exit " "code %d" % (" ".join(cmd), ret) ) self._trained = True if __name__ == "__main__": """ A demonstration function to show how NLTK users can use the malt parser API. >>> from nltk import pos_tag >>> assert 'MALT_PARSER' in os.environ, str( ... "Please set MALT_PARSER in your global environment, e.g.:\n" ... "$ export MALT_PARSER='/home/user/maltparser-1.9.2/'") >>> >>> assert 'MALT_MODEL' in os.environ, str( ... "Please set MALT_MODEL in your global environment, e.g.:\n" ... "$ export MALT_MODEL='/home/user/engmalt.linear-1.7.mco'") >>> >>> _dg1_str = str("1 John _ NNP _ _ 2 SUBJ _ _\n" ... "2 sees _ VB _ _ 0 ROOT _ _\n" ... "3 a _ DT _ _ 4 SPEC _ _\n" ... "4 dog _ NN _ _ 2 OBJ _ _\n" ... "5 . _ . _ _ 2 PUNCT _ _\n") >>> >>> >>> _dg2_str = str("1 John _ NNP _ _ 2 SUBJ _ _\n" ... "2 walks _ VB _ _ 0 ROOT _ _\n" ... "3 . _ . _ _ 2 PUNCT _ _\n") >>> dg1 = DependencyGraph(_dg1_str) >>> dg2 = DependencyGraph(_dg2_str) >>> # Initialize a MaltParser object >>> mp = MaltParser() >>> >>> # Trains a model. >>> mp.train([dg1,dg2], verbose=False) >>> sent1 = ['John','sees','Mary', '.'] >>> sent2 = ['John', 'walks', 'a', 'dog', '.'] >>> >>> # Parse a single sentence. >>> parsed_sent1 = mp.parse_one(sent1) >>> parsed_sent2 = mp.parse_one(sent2) >>> print(parsed_sent1.tree()) (sees John Mary .) >>> print(parsed_sent2.tree()) (walks John (dog a) .) >>> >>> # Parsing multiple sentences. >>> sentences = [sent1,sent2] >>> parsed_sents = mp.parse_sents(sentences) >>> print(next(next(parsed_sents)).tree()) (sees John Mary .) >>> print(next(next(parsed_sents)).tree()) (walks John (dog a) .) >>> >>> # Initialize a MaltParser object with an English pre-trained model. >>> parser_dirname = 'maltparser-1.9.2' >>> model_name = 'engmalt.linear-1.7.mco' >>> mp = MaltParser(parser_dirname=parser_dirname, model_filename=model_name, tagger=pos_tag) >>> sent1 = 'I shot an elephant in my pajamas .'.split() >>> sent2 = 'Time flies like banana .'.split() >>> # Parse a single sentence. >>> print(mp.parse_one(sent1).tree()) (shot I (elephant an) (in (pajamas my)) .) # Parsing multiple sentences >>> sentences = [sent1,sent2] >>> parsed_sents = mp.parse_sents(sentences) >>> print(next(next(parsed_sents)).tree()) (shot I (elephant an) (in (pajamas my)) .) >>> print(next(next(parsed_sents)).tree()) (flies Time (like banana) .) 
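The example below is an added illustration (not part of the original demo):
already-tagged input can be passed straight to ``parse_tagged_sents``. The
POS tags here are supplied by hand rather than by ``pos_tag``, so the
expected output is only indicative.

>>> tagged_sent = [('I', 'PRP'), ('shot', 'VBD'), ('an', 'DT'), ('elephant', 'NN'),
...                ('in', 'IN'), ('my', 'PRP$'), ('pajamas', 'NNS'), ('.', '.')]
>>> print(next(next(mp.parse_tagged_sents([tagged_sent]))).tree())
(shot I (elephant an) (in (pajamas my)) .)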
""" import doctest doctest.testmod() nltk-3.7/nltk/parse/nonprojectivedependencyparser.py000066400000000000000000000700051420073152400231300ustar00rootroot00000000000000# Natural Language Toolkit: Dependency Grammars # # Copyright (C) 2001-2022 NLTK Project # Author: Jason Narad # # URL: # For license information, see LICENSE.TXT # import logging import math from nltk.parse.dependencygraph import DependencyGraph logger = logging.getLogger(__name__) ################################################################# # DependencyScorerI - Interface for Graph-Edge Weight Calculation ################################################################# class DependencyScorerI: """ A scorer for calculated the weights on the edges of a weighted dependency graph. This is used by a ``ProbabilisticNonprojectiveParser`` to initialize the edge weights of a ``DependencyGraph``. While typically this would be done by training a binary classifier, any class that can return a multidimensional list representation of the edge weights can implement this interface. As such, it has no necessary fields. """ def __init__(self): if self.__class__ == DependencyScorerI: raise TypeError("DependencyScorerI is an abstract interface") def train(self, graphs): """ :type graphs: list(DependencyGraph) :param graphs: A list of dependency graphs to train the scorer. Typically the edges present in the graphs can be used as positive training examples, and the edges not present as negative examples. """ raise NotImplementedError() def score(self, graph): """ :type graph: DependencyGraph :param graph: A dependency graph whose set of edges need to be scored. :rtype: A three-dimensional list of numbers. :return: The score is returned in a multidimensional(3) list, such that the outer-dimension refers to the head, and the inner-dimension refers to the dependencies. For instance, scores[0][1] would reference the list of scores corresponding to arcs from node 0 to node 1. The node's 'address' field can be used to determine its number identification. For further illustration, a score list corresponding to Fig.2 of Keith Hall's 'K-best Spanning Tree Parsing' paper:: scores = [[[], [5], [1], [1]], [[], [], [11], [4]], [[], [10], [], [5]], [[], [8], [8], []]] When used in conjunction with a MaxEntClassifier, each score would correspond to the confidence of a particular edge being classified with the positive training examples. """ raise NotImplementedError() ################################################################# # NaiveBayesDependencyScorer ################################################################# class NaiveBayesDependencyScorer(DependencyScorerI): """ A dependency scorer built around a MaxEnt classifier. In this particular class that classifier is a ``NaiveBayesClassifier``. It uses head-word, head-tag, child-word, and child-tag features for classification. >>> from nltk.parse.dependencygraph import DependencyGraph, conll_data2 >>> graphs = [DependencyGraph(entry) for entry in conll_data2.split('\\n\\n') if entry] >>> npp = ProbabilisticNonprojectiveParser() >>> npp.train(graphs, NaiveBayesDependencyScorer()) >>> parses = npp.parse(['Cathy', 'zag', 'hen', 'zwaaien', '.'], ['N', 'V', 'Pron', 'Adj', 'N', 'Punc']) >>> len(list(parses)) 1 """ def __init__(self): pass # Do nothing without throwing error def train(self, graphs): """ Trains a ``NaiveBayesClassifier`` using the edges present in graphs list as positive examples, the edges not present as negative examples. 
Uses a feature vector of head-word, head-tag, child-word, and child-tag. :type graphs: list(DependencyGraph) :param graphs: A list of dependency graphs to train the scorer. """ from nltk.classify import NaiveBayesClassifier # Create training labeled training examples labeled_examples = [] for graph in graphs: for head_node in graph.nodes.values(): for child_index, child_node in graph.nodes.items(): if child_index in head_node["deps"]: label = "T" else: label = "F" labeled_examples.append( ( dict( a=head_node["word"], b=head_node["tag"], c=child_node["word"], d=child_node["tag"], ), label, ) ) self.classifier = NaiveBayesClassifier.train(labeled_examples) def score(self, graph): """ Converts the graph into a feature-based representation of each edge, and then assigns a score to each based on the confidence of the classifier in assigning it to the positive label. Scores are returned in a multidimensional list. :type graph: DependencyGraph :param graph: A dependency graph to score. :rtype: 3 dimensional list :return: Edge scores for the graph parameter. """ # Convert graph to feature representation edges = [] for head_node in graph.nodes.values(): for child_node in graph.nodes.values(): edges.append( dict( a=head_node["word"], b=head_node["tag"], c=child_node["word"], d=child_node["tag"], ) ) # Score edges edge_scores = [] row = [] count = 0 for pdist in self.classifier.prob_classify_many(edges): logger.debug("%.4f %.4f", pdist.prob("T"), pdist.prob("F")) # smoothing in case the probability = 0 row.append([math.log(pdist.prob("T") + 0.00000000001)]) count += 1 if count == len(graph.nodes): edge_scores.append(row) row = [] count = 0 return edge_scores ################################################################# # A Scorer for Demo Purposes ################################################################# # A short class necessary to show parsing example from paper class DemoScorer(DependencyScorerI): def train(self, graphs): print("Training...") def score(self, graph): # scores for Keith Hall 'K-best Spanning Tree Parsing' paper return [ [[], [5], [1], [1]], [[], [], [11], [4]], [[], [10], [], [5]], [[], [8], [8], []], ] ################################################################# # Non-Projective Probabilistic Parsing ################################################################# class ProbabilisticNonprojectiveParser: """A probabilistic non-projective dependency parser. Nonprojective dependencies allows for "crossing branches" in the parse tree which is necessary for representing particular linguistic phenomena, or even typical parses in some languages. This parser follows the MST parsing algorithm, outlined in McDonald(2005), which likens the search for the best non-projective parse to finding the maximum spanning tree in a weighted directed graph. >>> class Scorer(DependencyScorerI): ... def train(self, graphs): ... pass ... ... def score(self, graph): ... return [ ... [[], [5], [1], [1]], ... [[], [], [11], [4]], ... [[], [10], [], [5]], ... [[], [8], [8], []], ... ] >>> npp = ProbabilisticNonprojectiveParser() >>> npp.train([], Scorer()) >>> parses = npp.parse(['v1', 'v2', 'v3'], [None, None, None]) >>> len(list(parses)) 1 Rule based example >>> from nltk.grammar import DependencyGrammar >>> grammar = DependencyGrammar.fromstring(''' ... 'taught' -> 'play' | 'man' ... 'man' -> 'the' | 'in' ... 'in' -> 'corner' ... 'corner' -> 'the' ... 'play' -> 'golf' | 'dachshund' | 'to' ... 'dachshund' -> 'his' ... 
''') >>> ndp = NonprojectiveDependencyParser(grammar) >>> parses = ndp.parse(['the', 'man', 'in', 'the', 'corner', 'taught', 'his', 'dachshund', 'to', 'play', 'golf']) >>> len(list(parses)) 4 """ def __init__(self): """ Creates a new non-projective parser. """ logging.debug("initializing prob. nonprojective...") def train(self, graphs, dependency_scorer): """ Trains a ``DependencyScorerI`` from a set of ``DependencyGraph`` objects, and establishes this as the parser's scorer. This is used to initialize the scores on a ``DependencyGraph`` during the parsing procedure. :type graphs: list(DependencyGraph) :param graphs: A list of dependency graphs to train the scorer. :type dependency_scorer: DependencyScorerI :param dependency_scorer: A scorer which implements the ``DependencyScorerI`` interface. """ self._scorer = dependency_scorer self._scorer.train(graphs) def initialize_edge_scores(self, graph): """ Assigns a score to every edge in the ``DependencyGraph`` graph. These scores are generated via the parser's scorer which was assigned during the training process. :type graph: DependencyGraph :param graph: A dependency graph to assign scores to. """ self.scores = self._scorer.score(graph) def collapse_nodes(self, new_node, cycle_path, g_graph, b_graph, c_graph): """ Takes a list of nodes that have been identified to belong to a cycle, and collapses them into on larger node. The arcs of all nodes in the graph must be updated to account for this. :type new_node: Node. :param new_node: A Node (Dictionary) to collapse the cycle nodes into. :type cycle_path: A list of integers. :param cycle_path: A list of node addresses, each of which is in the cycle. :type g_graph, b_graph, c_graph: DependencyGraph :param g_graph, b_graph, c_graph: Graphs which need to be updated. """ logger.debug("Collapsing nodes...") # Collapse all cycle nodes into v_n+1 in G_Graph for cycle_node_index in cycle_path: g_graph.remove_by_address(cycle_node_index) g_graph.add_node(new_node) g_graph.redirect_arcs(cycle_path, new_node["address"]) def update_edge_scores(self, new_node, cycle_path): """ Updates the edge scores to reflect a collapse operation into new_node. :type new_node: A Node. :param new_node: The node which cycle nodes are collapsed into. :type cycle_path: A list of integers. :param cycle_path: A list of node addresses that belong to the cycle. """ logger.debug("cycle %s", cycle_path) cycle_path = self.compute_original_indexes(cycle_path) logger.debug("old cycle %s", cycle_path) logger.debug("Prior to update: %s", self.scores) for i, row in enumerate(self.scores): for j, column in enumerate(self.scores[i]): logger.debug(self.scores[i][j]) if j in cycle_path and i not in cycle_path and self.scores[i][j]: subtract_val = self.compute_max_subtract_score(j, cycle_path) logger.debug("%s - %s", self.scores[i][j], subtract_val) new_vals = [] for cur_val in self.scores[i][j]: new_vals.append(cur_val - subtract_val) self.scores[i][j] = new_vals for i, row in enumerate(self.scores): for j, cell in enumerate(self.scores[i]): if i in cycle_path and j in cycle_path: self.scores[i][j] = [] logger.debug("After update: %s", self.scores) def compute_original_indexes(self, new_indexes): """ As nodes are collapsed into others, they are replaced by the new node in the graph, but it's still necessary to keep track of what these original nodes were. This takes a list of node addresses and replaces any collapsed node addresses with their original addresses. :type new_indexes: A list of integers. 
:param new_indexes: A list of node addresses to check for subsumed nodes. """ swapped = True while swapped: originals = [] swapped = False for new_index in new_indexes: if new_index in self.inner_nodes: for old_val in self.inner_nodes[new_index]: if old_val not in originals: originals.append(old_val) swapped = True else: originals.append(new_index) new_indexes = originals return new_indexes def compute_max_subtract_score(self, column_index, cycle_indexes): """ When updating scores the score of the highest-weighted incoming arc is subtracted upon collapse. This returns the correct amount to subtract from that edge. :type column_index: integer. :param column_index: A index representing the column of incoming arcs to a particular node being updated :type cycle_indexes: A list of integers. :param cycle_indexes: Only arcs from cycle nodes are considered. This is a list of such nodes addresses. """ max_score = -100000 for row_index in cycle_indexes: for subtract_val in self.scores[row_index][column_index]: if subtract_val > max_score: max_score = subtract_val return max_score def best_incoming_arc(self, node_index): """ Returns the source of the best incoming arc to the node with address: node_index :type node_index: integer. :param node_index: The address of the 'destination' node, the node that is arced to. """ originals = self.compute_original_indexes([node_index]) logger.debug("originals: %s", originals) max_arc = None max_score = None for row_index in range(len(self.scores)): for col_index in range(len(self.scores[row_index])): if col_index in originals and ( max_score is None or self.scores[row_index][col_index] > max_score ): max_score = self.scores[row_index][col_index] max_arc = row_index logger.debug("%s, %s", row_index, col_index) logger.debug(max_score) for key in self.inner_nodes: replaced_nodes = self.inner_nodes[key] if max_arc in replaced_nodes: return key return max_arc def original_best_arc(self, node_index): originals = self.compute_original_indexes([node_index]) max_arc = None max_score = None max_orig = None for row_index in range(len(self.scores)): for col_index in range(len(self.scores[row_index])): if col_index in originals and ( max_score is None or self.scores[row_index][col_index] > max_score ): max_score = self.scores[row_index][col_index] max_arc = row_index max_orig = col_index return [max_arc, max_orig] def parse(self, tokens, tags): """ Parses a list of tokens in accordance to the MST parsing algorithm for non-projective dependency parses. Assumes that the tokens to be parsed have already been tagged and those tags are provided. Various scoring methods can be used by implementing the ``DependencyScorerI`` interface and passing it to the training algorithm. :type tokens: list(str) :param tokens: A list of words or punctuation to be parsed. :type tags: list(str) :param tags: A list of tags corresponding by index to the words in the tokens list. :return: An iterator of non-projective parses. 
:rtype: iter(DependencyGraph) """ self.inner_nodes = {} # Initialize g_graph g_graph = DependencyGraph() for index, token in enumerate(tokens): g_graph.nodes[index + 1].update( {"word": token, "tag": tags[index], "rel": "NTOP", "address": index + 1} ) # Fully connect non-root nodes in g_graph g_graph.connect_graph() original_graph = DependencyGraph() for index, token in enumerate(tokens): original_graph.nodes[index + 1].update( {"word": token, "tag": tags[index], "rel": "NTOP", "address": index + 1} ) b_graph = DependencyGraph() c_graph = DependencyGraph() for index, token in enumerate(tokens): c_graph.nodes[index + 1].update( {"word": token, "tag": tags[index], "rel": "NTOP", "address": index + 1} ) # Assign initial scores to g_graph edges self.initialize_edge_scores(g_graph) logger.debug(self.scores) # Initialize a list of unvisited vertices (by node address) unvisited_vertices = [vertex["address"] for vertex in c_graph.nodes.values()] # Iterate over unvisited vertices nr_vertices = len(tokens) betas = {} while unvisited_vertices: # Mark current node as visited current_vertex = unvisited_vertices.pop(0) logger.debug("current_vertex: %s", current_vertex) # Get corresponding node n_i to vertex v_i current_node = g_graph.get_by_address(current_vertex) logger.debug("current_node: %s", current_node) # Get best in-edge node b for current node best_in_edge = self.best_incoming_arc(current_vertex) betas[current_vertex] = self.original_best_arc(current_vertex) logger.debug("best in arc: %s --> %s", best_in_edge, current_vertex) # b_graph = Union(b_graph, b) for new_vertex in [current_vertex, best_in_edge]: b_graph.nodes[new_vertex].update( {"word": "TEMP", "rel": "NTOP", "address": new_vertex} ) b_graph.add_arc(best_in_edge, current_vertex) # Beta(current node) = b - stored for parse recovery # If b_graph contains a cycle, collapse it cycle_path = b_graph.contains_cycle() if cycle_path: # Create a new node v_n+1 with address = len(nodes) + 1 new_node = {"word": "NONE", "rel": "NTOP", "address": nr_vertices + 1} # c_graph = Union(c_graph, v_n+1) c_graph.add_node(new_node) # Collapse all nodes in cycle C into v_n+1 self.update_edge_scores(new_node, cycle_path) self.collapse_nodes(new_node, cycle_path, g_graph, b_graph, c_graph) for cycle_index in cycle_path: c_graph.add_arc(new_node["address"], cycle_index) # self.replaced_by[cycle_index] = new_node['address'] self.inner_nodes[new_node["address"]] = cycle_path # Add v_n+1 to list of unvisited vertices unvisited_vertices.insert(0, nr_vertices + 1) # increment # of nodes counter nr_vertices += 1 # Remove cycle nodes from b_graph; B = B - cycle c for cycle_node_address in cycle_path: b_graph.remove_by_address(cycle_node_address) logger.debug("g_graph: %s", g_graph) logger.debug("b_graph: %s", b_graph) logger.debug("c_graph: %s", c_graph) logger.debug("Betas: %s", betas) logger.debug("replaced nodes %s", self.inner_nodes) # Recover parse tree logger.debug("Final scores: %s", self.scores) logger.debug("Recovering parse...") for i in range(len(tokens) + 1, nr_vertices + 1): betas[betas[i][1]] = betas[i] logger.debug("Betas: %s", betas) for node in original_graph.nodes.values(): # TODO: It's dangerous to assume that deps it a dictionary # because it's a default dictionary. Ideally, here we should not # be concerned how dependencies are stored inside of a dependency # graph. 
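# Parse recovery, in short: betas[i] records the best incoming arc
# (head, dependent) chosen for original node i, so the loop below clears
# each node's dependency list and then re-adds exactly those arcs to
# rebuild the final parse.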
node["deps"] = {} for i in range(1, len(tokens) + 1): original_graph.add_arc(betas[i][0], betas[i][1]) logger.debug("Done.") yield original_graph ################################################################# # Rule-based Non-Projective Parser ################################################################# class NonprojectiveDependencyParser: """ A non-projective, rule-based, dependency parser. This parser will return the set of all possible non-projective parses based on the word-to-word relations defined in the parser's dependency grammar, and will allow the branches of the parse tree to cross in order to capture a variety of linguistic phenomena that a projective parser will not. """ def __init__(self, dependency_grammar): """ Creates a new ``NonprojectiveDependencyParser``. :param dependency_grammar: a grammar of word-to-word relations. :type dependency_grammar: DependencyGrammar """ self._grammar = dependency_grammar def parse(self, tokens): """ Parses the input tokens with respect to the parser's grammar. Parsing is accomplished by representing the search-space of possible parses as a fully-connected directed graph. Arcs that would lead to ungrammatical parses are removed and a lattice is constructed of length n, where n is the number of input tokens, to represent all possible grammatical traversals. All possible paths through the lattice are then enumerated to produce the set of non-projective parses. param tokens: A list of tokens to parse. type tokens: list(str) return: An iterator of non-projective parses. rtype: iter(DependencyGraph) """ # Create graph representation of tokens self._graph = DependencyGraph() for index, token in enumerate(tokens): self._graph.nodes[index] = { "word": token, "deps": [], "rel": "NTOP", "address": index, } for head_node in self._graph.nodes.values(): deps = [] for dep_node in self._graph.nodes.values(): if ( self._grammar.contains(head_node["word"], dep_node["word"]) and head_node["word"] != dep_node["word"] ): deps.append(dep_node["address"]) head_node["deps"] = deps # Create lattice of possible heads roots = [] possible_heads = [] for i, word in enumerate(tokens): heads = [] for j, head in enumerate(tokens): if (i != j) and self._grammar.contains(head, word): heads.append(j) if len(heads) == 0: roots.append(i) possible_heads.append(heads) # Set roots to attempt if len(roots) < 2: if len(roots) == 0: for i in range(len(tokens)): roots.append(i) # Traverse lattice analyses = [] for root in roots: stack = [] analysis = [[] for i in range(len(possible_heads))] i = 0 forward = True while i >= 0: if forward: if len(possible_heads[i]) == 1: analysis[i] = possible_heads[i][0] elif len(possible_heads[i]) == 0: analysis[i] = -1 else: head = possible_heads[i].pop() analysis[i] = head stack.append([i, head]) if not forward: index_on_stack = False for stack_item in stack: if stack_item[0] == i: index_on_stack = True orig_length = len(possible_heads[i]) if index_on_stack and orig_length == 0: for j in range(len(stack) - 1, -1, -1): stack_item = stack[j] if stack_item[0] == i: possible_heads[i].append(stack.pop(j)[1]) elif index_on_stack and orig_length > 0: head = possible_heads[i].pop() analysis[i] = head stack.append([i, head]) forward = True if i + 1 == len(possible_heads): analyses.append(analysis[:]) forward = False if forward: i += 1 else: i -= 1 # Filter parses # ensure 1 root, every thing has 1 head for analysis in analyses: if analysis.count(-1) > 1: # there are several root elements! 
continue graph = DependencyGraph() graph.root = graph.nodes[analysis.index(-1) + 1] for address, (token, head_index) in enumerate( zip(tokens, analysis), start=1 ): head_address = head_index + 1 node = graph.nodes[address] node.update({"word": token, "address": address}) if head_address == 0: rel = "ROOT" else: rel = "" graph.nodes[head_index + 1]["deps"][rel].append(address) # TODO: check for cycles yield graph ################################################################# # Demos ################################################################# def demo(): # hall_demo() nonprojective_conll_parse_demo() rule_based_demo() def hall_demo(): npp = ProbabilisticNonprojectiveParser() npp.train([], DemoScorer()) for parse_graph in npp.parse(["v1", "v2", "v3"], [None, None, None]): print(parse_graph) def nonprojective_conll_parse_demo(): from nltk.parse.dependencygraph import conll_data2 graphs = [DependencyGraph(entry) for entry in conll_data2.split("\n\n") if entry] npp = ProbabilisticNonprojectiveParser() npp.train(graphs, NaiveBayesDependencyScorer()) for parse_graph in npp.parse( ["Cathy", "zag", "hen", "zwaaien", "."], ["N", "V", "Pron", "Adj", "N", "Punc"] ): print(parse_graph) def rule_based_demo(): from nltk.grammar import DependencyGrammar grammar = DependencyGrammar.fromstring( """ 'taught' -> 'play' | 'man' 'man' -> 'the' | 'in' 'in' -> 'corner' 'corner' -> 'the' 'play' -> 'golf' | 'dachshund' | 'to' 'dachshund' -> 'his' """ ) print(grammar) ndp = NonprojectiveDependencyParser(grammar) graphs = ndp.parse( [ "the", "man", "in", "the", "corner", "taught", "his", "dachshund", "to", "play", "golf", ] ) print("Graphs:") for graph in graphs: print(graph) if __name__ == "__main__": demo() nltk-3.7/nltk/parse/pchart.py000066400000000000000000000466751420073152400162700ustar00rootroot00000000000000# Natural Language Toolkit: Probabilistic Chart Parsers # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # Steven Bird # URL: # For license information, see LICENSE.TXT """ Classes and interfaces for associating probabilities with tree structures that represent the internal organization of a text. The probabilistic parser module defines ``BottomUpProbabilisticChartParser``. ``BottomUpProbabilisticChartParser`` is an abstract class that implements a bottom-up chart parser for ``PCFG`` grammars. It maintains a queue of edges, and adds them to the chart one at a time. The ordering of this queue is based on the probabilities associated with the edges, allowing the parser to expand more likely edges before less likely ones. Each subclass implements a different queue ordering, producing different search strategies. Currently the following subclasses are defined: - ``InsideChartParser`` searches edges in decreasing order of their trees' inside probabilities. - ``RandomChartParser`` searches edges in random order. - ``LongestChartParser`` searches edges in decreasing order of their location's length. The ``BottomUpProbabilisticChartParser`` constructor has an optional argument beam_size. If non-zero, this controls the size of the beam (aka the edge queue). This option is most useful with InsideChartParser. """ ##////////////////////////////////////////////////////// ## Bottom-Up PCFG Chart Parser ##////////////////////////////////////////////////////// # [XX] This might not be implemented quite right -- it would be better # to associate probabilities with child pointer lists. 
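# ---------------------------------------------------------------------
# Illustrative sketch (added for exposition; not part of the original
# module). It shows the intended use of ``InsideChartParser`` together
# with the ``beam_size`` option described above. The toy grammar, the
# sentence and the helper name ``_beam_parse_sketch`` are assumptions
# made up for this example.
def _beam_parse_sketch():
    from nltk.grammar import PCFG
    from nltk.parse.pchart import InsideChartParser

    grammar = PCFG.fromstring(
        """
        S -> NP VP [1.0]
        NP -> 'John' [0.5] | 'Mary' [0.5]
        VP -> V NP [1.0]
        V -> 'sees' [1.0]
        """
    )
    # A non-zero beam keeps at most that many edges on the parser's queue.
    parser = InsideChartParser(grammar, beam_size=20)
    for tree in parser.parse("John sees Mary".split()):
        # Each parse is a ProbabilisticTree; here P = 1.0 * 0.5 * 1.0 * 1.0 * 0.5
        print(tree, tree.prob())
# ---------------------------------------------------------------------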
import random from functools import reduce from nltk.grammar import PCFG, Nonterminal from nltk.parse.api import ParserI from nltk.parse.chart import AbstractChartRule, Chart, LeafEdge, TreeEdge from nltk.tree import ProbabilisticTree, Tree # Probabilistic edges class ProbabilisticLeafEdge(LeafEdge): def prob(self): return 1.0 class ProbabilisticTreeEdge(TreeEdge): def __init__(self, prob, *args, **kwargs): TreeEdge.__init__(self, *args, **kwargs) self._prob = prob # two edges with different probabilities are not equal. self._comparison_key = (self._comparison_key, prob) def prob(self): return self._prob @staticmethod def from_production(production, index, p): return ProbabilisticTreeEdge( p, (index, index), production.lhs(), production.rhs(), 0 ) # Rules using probabilistic edges class ProbabilisticBottomUpInitRule(AbstractChartRule): NUM_EDGES = 0 def apply(self, chart, grammar): for index in range(chart.num_leaves()): new_edge = ProbabilisticLeafEdge(chart.leaf(index), index) if chart.insert(new_edge, ()): yield new_edge class ProbabilisticBottomUpPredictRule(AbstractChartRule): NUM_EDGES = 1 def apply(self, chart, grammar, edge): if edge.is_incomplete(): return for prod in grammar.productions(): if edge.lhs() == prod.rhs()[0]: new_edge = ProbabilisticTreeEdge.from_production( prod, edge.start(), prod.prob() ) if chart.insert(new_edge, ()): yield new_edge class ProbabilisticFundamentalRule(AbstractChartRule): NUM_EDGES = 2 def apply(self, chart, grammar, left_edge, right_edge): # Make sure the rule is applicable. if not ( left_edge.end() == right_edge.start() and left_edge.nextsym() == right_edge.lhs() and left_edge.is_incomplete() and right_edge.is_complete() ): return # Construct the new edge. p = left_edge.prob() * right_edge.prob() new_edge = ProbabilisticTreeEdge( p, span=(left_edge.start(), right_edge.end()), lhs=left_edge.lhs(), rhs=left_edge.rhs(), dot=left_edge.dot() + 1, ) # Add it to the chart, with appropriate child pointers. changed_chart = False for cpl1 in chart.child_pointer_lists(left_edge): if chart.insert(new_edge, cpl1 + (right_edge,)): changed_chart = True # If we changed the chart, then generate the edge. if changed_chart: yield new_edge class SingleEdgeProbabilisticFundamentalRule(AbstractChartRule): NUM_EDGES = 1 _fundamental_rule = ProbabilisticFundamentalRule() def apply(self, chart, grammar, edge1): fr = self._fundamental_rule if edge1.is_incomplete(): # edge1 = left_edge; edge2 = right_edge for edge2 in chart.select( start=edge1.end(), is_complete=True, lhs=edge1.nextsym() ): yield from fr.apply(chart, grammar, edge1, edge2) else: # edge2 = left_edge; edge1 = right_edge for edge2 in chart.select( end=edge1.start(), is_complete=False, nextsym=edge1.lhs() ): yield from fr.apply(chart, grammar, edge2, edge1) def __str__(self): return "Fundamental Rule" class BottomUpProbabilisticChartParser(ParserI): """ An abstract bottom-up parser for ``PCFG`` grammars that uses a ``Chart`` to record partial results. ``BottomUpProbabilisticChartParser`` maintains a queue of edges that can be added to the chart. This queue is initialized with edges for each token in the text that is being parsed. ``BottomUpProbabilisticChartParser`` inserts these edges into the chart one at a time, starting with the most likely edges, and proceeding to less likely edges. For each edge that is added to the chart, it may become possible to insert additional edges into the chart; these are added to the queue. 
This process continues until enough complete parses have been generated, or until the queue is empty. The sorting order for the queue is not specified by ``BottomUpProbabilisticChartParser``. Different sorting orders will result in different search strategies. The sorting order for the queue is defined by the method ``sort_queue``; subclasses are required to provide a definition for this method. :type _grammar: PCFG :ivar _grammar: The grammar used to parse sentences. :type _trace: int :ivar _trace: The level of tracing output that should be generated when parsing a text. """ def __init__(self, grammar, beam_size=0, trace=0): """ Create a new ``BottomUpProbabilisticChartParser``, that uses ``grammar`` to parse texts. :type grammar: PCFG :param grammar: The grammar used to parse texts. :type beam_size: int :param beam_size: The maximum length for the parser's edge queue. :type trace: int :param trace: The level of tracing that should be used when parsing a text. ``0`` will generate no tracing output; and higher numbers will produce more verbose tracing output. """ if not isinstance(grammar, PCFG): raise ValueError("The grammar must be probabilistic PCFG") self._grammar = grammar self.beam_size = beam_size self._trace = trace def grammar(self): return self._grammar def trace(self, trace=2): """ Set the level of tracing output that should be generated when parsing a text. :type trace: int :param trace: The trace level. A trace level of ``0`` will generate no tracing output; and higher trace levels will produce more verbose tracing output. :rtype: None """ self._trace = trace # TODO: change this to conform more with the standard ChartParser def parse(self, tokens): self._grammar.check_coverage(tokens) chart = Chart(list(tokens)) grammar = self._grammar # Chart parser rules. bu_init = ProbabilisticBottomUpInitRule() bu = ProbabilisticBottomUpPredictRule() fr = SingleEdgeProbabilisticFundamentalRule() # Our queue queue = [] # Initialize the chart. for edge in bu_init.apply(chart, grammar): if self._trace > 1: print( " %-50s [%s]" % (chart.pretty_format_edge(edge, width=2), edge.prob()) ) queue.append(edge) while len(queue) > 0: # Re-sort the queue. self.sort_queue(queue, chart) # Prune the queue to the correct size if a beam was defined if self.beam_size: self._prune(queue, chart) # Get the best edge. edge = queue.pop() if self._trace > 0: print( " %-50s [%s]" % (chart.pretty_format_edge(edge, width=2), edge.prob()) ) # Apply BU & FR to it. queue.extend(bu.apply(chart, grammar, edge)) queue.extend(fr.apply(chart, grammar, edge)) # Get a list of complete parses. parses = list(chart.parses(grammar.start(), ProbabilisticTree)) # Assign probabilities to the trees. prod_probs = {} for prod in grammar.productions(): prod_probs[prod.lhs(), prod.rhs()] = prod.prob() for parse in parses: self._setprob(parse, prod_probs) # Sort by probability parses.sort(reverse=True, key=lambda tree: tree.prob()) return iter(parses) def _setprob(self, tree, prod_probs): if tree.prob() is not None: return # Get the prob of the CFG production. lhs = Nonterminal(tree.label()) rhs = [] for child in tree: if isinstance(child, Tree): rhs.append(Nonterminal(child.label())) else: rhs.append(child) prob = prod_probs[lhs, tuple(rhs)] # Get the probs of children. 
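# (The recursion below sets each subtree's probability first, so the
#  probability stored on the root ends up being the product of the
#  probabilities of every production used in the parse tree.)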
for child in tree: if isinstance(child, Tree): self._setprob(child, prod_probs) prob *= child.prob() tree.set_prob(prob) def sort_queue(self, queue, chart): """ Sort the given queue of ``Edge`` objects, placing the edge that should be tried first at the beginning of the queue. This method will be called after each ``Edge`` is added to the queue. :param queue: The queue of ``Edge`` objects to sort. Each edge in this queue is an edge that could be added to the chart by the fundamental rule; but that has not yet been added. :type queue: list(Edge) :param chart: The chart being used to parse the text. This chart can be used to provide extra information for sorting the queue. :type chart: Chart :rtype: None """ raise NotImplementedError() def _prune(self, queue, chart): """Discard items in the queue if the queue is longer than the beam.""" if len(queue) > self.beam_size: split = len(queue) - self.beam_size if self._trace > 2: for edge in queue[:split]: print(" %-50s [DISCARDED]" % chart.pretty_format_edge(edge, 2)) del queue[:split] class InsideChartParser(BottomUpProbabilisticChartParser): """ A bottom-up parser for ``PCFG`` grammars that tries edges in descending order of the inside probabilities of their trees. The "inside probability" of a tree is simply the probability of the entire tree, ignoring its context. In particular, the inside probability of a tree generated by production *p* with children *c[1], c[2], ..., c[n]* is *P(p)P(c[1])P(c[2])...P(c[n])*; and the inside probability of a token is 1 if it is present in the text, and 0 if it is absent. This sorting order results in a type of lowest-cost-first search strategy. """ # Inherit constructor. def sort_queue(self, queue, chart): """ Sort the given queue of edges, in descending order of the inside probabilities of the edges' trees. :param queue: The queue of ``Edge`` objects to sort. Each edge in this queue is an edge that could be added to the chart by the fundamental rule; but that has not yet been added. :type queue: list(Edge) :param chart: The chart being used to parse the text. This chart can be used to provide extra information for sorting the queue. :type chart: Chart :rtype: None """ queue.sort(key=lambda edge: edge.prob()) # Eventually, this will become some sort of inside-outside parser: # class InsideOutsideParser(BottomUpProbabilisticChartParser): # def __init__(self, grammar, trace=0): # # Inherit docs. # BottomUpProbabilisticChartParser.__init__(self, grammar, trace) # # # Find the best path from S to each nonterminal # bestp = {} # for production in grammar.productions(): bestp[production.lhs()]=0 # bestp[grammar.start()] = 1.0 # # for i in range(len(grammar.productions())): # for production in grammar.productions(): # lhs = production.lhs() # for elt in production.rhs(): # bestp[elt] = max(bestp[lhs]*production.prob(), # bestp.get(elt,0)) # # self._bestp = bestp # for (k,v) in self._bestp.items(): print(k,v) # # def _sortkey(self, edge): # return edge.structure()[PROB] * self._bestp[edge.lhs()] # # def sort_queue(self, queue, chart): # queue.sort(key=self._sortkey) class RandomChartParser(BottomUpProbabilisticChartParser): """ A bottom-up parser for ``PCFG`` grammars that tries edges in random order. This sorting order results in a random search strategy. 
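(Concretely, each call to ``sort_queue`` just swaps one randomly chosen edge into the last position of the queue, which is where ``parse`` pops the next edge from.)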
""" # Inherit constructor def sort_queue(self, queue, chart): i = random.randint(0, len(queue) - 1) (queue[-1], queue[i]) = (queue[i], queue[-1]) class UnsortedChartParser(BottomUpProbabilisticChartParser): """ A bottom-up parser for ``PCFG`` grammars that tries edges in whatever order. """ # Inherit constructor def sort_queue(self, queue, chart): return class LongestChartParser(BottomUpProbabilisticChartParser): """ A bottom-up parser for ``PCFG`` grammars that tries longer edges before shorter ones. This sorting order results in a type of best-first search strategy. """ # Inherit constructor def sort_queue(self, queue, chart): queue.sort(key=lambda edge: edge.length()) ##////////////////////////////////////////////////////// ## Test Code ##////////////////////////////////////////////////////// def demo(choice=None, draw_parses=None, print_parses=None): """ A demonstration of the probabilistic parsers. The user is prompted to select which demo to run, and how many parses should be found; and then each parser is run on the same demo, and a summary of the results are displayed. """ import sys import time from nltk import tokenize from nltk.parse import pchart # Define two demos. Each demo has a sentence and a grammar. toy_pcfg1 = PCFG.fromstring( """ S -> NP VP [1.0] NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15] Det -> 'the' [0.8] | 'my' [0.2] N -> 'man' [0.5] | 'telescope' [0.5] VP -> VP PP [0.1] | V NP [0.7] | V [0.2] V -> 'ate' [0.35] | 'saw' [0.65] PP -> P NP [1.0] P -> 'with' [0.61] | 'under' [0.39] """ ) toy_pcfg2 = PCFG.fromstring( """ S -> NP VP [1.0] VP -> V NP [.59] VP -> V [.40] VP -> VP PP [.01] NP -> Det N [.41] NP -> Name [.28] NP -> NP PP [.31] PP -> P NP [1.0] V -> 'saw' [.21] V -> 'ate' [.51] V -> 'ran' [.28] N -> 'boy' [.11] N -> 'cookie' [.12] N -> 'table' [.13] N -> 'telescope' [.14] N -> 'hill' [.5] Name -> 'Jack' [.52] Name -> 'Bob' [.48] P -> 'with' [.61] P -> 'under' [.39] Det -> 'the' [.41] Det -> 'a' [.31] Det -> 'my' [.28] """ ) demos = [ ("I saw John with my telescope", toy_pcfg1), ("the boy saw Jack with Bob under the table with a telescope", toy_pcfg2), ] if choice is None: # Ask the user which demo they want to use. print() for i in range(len(demos)): print(f"{i + 1:>3}: {demos[i][0]}") print(" %r" % demos[i][1]) print() print("Which demo (%d-%d)? " % (1, len(demos)), end=" ") choice = int(sys.stdin.readline().strip()) - 1 try: sent, grammar = demos[choice] except: print("Bad sentence number") return # Tokenize the sentence. tokens = sent.split() # Define a list of parsers. We'll use all parsers. parsers = [ pchart.InsideChartParser(grammar), pchart.RandomChartParser(grammar), pchart.UnsortedChartParser(grammar), pchart.LongestChartParser(grammar), pchart.InsideChartParser(grammar, beam_size=len(tokens) + 1), # was BeamParser ] # Run the parsers on the tokenized sentence. 
times = [] average_p = [] num_parses = [] all_parses = {} for parser in parsers: print(f"\ns: {sent}\nparser: {parser}\ngrammar: {grammar}") parser.trace(3) t = time.time() parses = list(parser.parse(tokens)) times.append(time.time() - t) p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses) if parses else 0 average_p.append(p) num_parses.append(len(parses)) for p in parses: all_parses[p.freeze()] = 1 # Print some summary statistics print() print(" Parser Beam | Time (secs) # Parses Average P(parse)") print("------------------------+------------------------------------------") for i in range(len(parsers)): print( "%18s %4d |%11.4f%11d%19.14f" % ( parsers[i].__class__.__name__, parsers[i].beam_size, times[i], num_parses[i], average_p[i], ) ) parses = all_parses.keys() if parses: p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses) else: p = 0 print("------------------------+------------------------------------------") print("%18s |%11s%11d%19.14f" % ("(All Parses)", "n/a", len(parses), p)) if draw_parses is None: # Ask the user if we should draw the parses. print() print("Draw parses (y/n)? ", end=" ") draw_parses = sys.stdin.readline().strip().lower().startswith("y") if draw_parses: from nltk.draw.tree import draw_trees print(" please wait...") draw_trees(*parses) if print_parses is None: # Ask the user if we should print the parses. print() print("Print parses (y/n)? ", end=" ") print_parses = sys.stdin.readline().strip().lower().startswith("y") if print_parses: for parse in parses: print(parse) if __name__ == "__main__": demo() nltk-3.7/nltk/parse/projectivedependencyparser.py000066400000000000000000000656071420073152400224310ustar00rootroot00000000000000# Natural Language Toolkit: Dependency Grammars # # Copyright (C) 2001-2022 NLTK Project # Author: Jason Narad # # URL: # For license information, see LICENSE.TXT # from collections import defaultdict from functools import total_ordering from itertools import chain from nltk.grammar import ( DependencyGrammar, DependencyProduction, ProbabilisticDependencyGrammar, ) from nltk.internals import raise_unorderable_types from nltk.parse.dependencygraph import DependencyGraph ################################################################# # Dependency Span ################################################################# @total_ordering class DependencySpan: """ A contiguous span over some part of the input string representing dependency (head -> modifier) relationships amongst words. An atomic span corresponds to only one word so it isn't a 'span' in the conventional sense, as its _start_index = _end_index = _head_index for concatenation purposes. All other spans are assumed to have arcs between all nodes within the start and end indexes of the span, and one head index corresponding to the head word for the entire span. This is the same as the root node if the dependency structure were depicted as a graph. """ def __init__(self, start_index, end_index, head_index, arcs, tags): self._start_index = start_index self._end_index = end_index self._head_index = head_index self._arcs = arcs self._tags = tags self._comparison_key = (start_index, end_index, head_index, tuple(arcs)) self._hash = hash(self._comparison_key) def head_index(self): """ :return: An value indexing the head of the entire ``DependencySpan``. :rtype: int """ return self._head_index def __repr__(self): """ :return: A concise string representatino of the ``DependencySpan``. :rtype: str. 
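For example: ``Span 0-5; Head Index: 2`` (use ``str()`` for a verbose form that additionally lists each arc).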
""" return "Span %d-%d; Head Index: %d" % ( self._start_index, self._end_index, self._head_index, ) def __str__(self): """ :return: A verbose string representation of the ``DependencySpan``. :rtype: str """ str = "Span %d-%d; Head Index: %d" % ( self._start_index, self._end_index, self._head_index, ) for i in range(len(self._arcs)): str += "\n%d <- %d, %s" % (i, self._arcs[i], self._tags[i]) return str def __eq__(self, other): return ( type(self) == type(other) and self._comparison_key == other._comparison_key ) def __ne__(self, other): return not self == other def __lt__(self, other): if not isinstance(other, DependencySpan): raise_unorderable_types("<", self, other) return self._comparison_key < other._comparison_key def __hash__(self): """ :return: The hash value of this ``DependencySpan``. """ return self._hash ################################################################# # Chart Cell ################################################################# class ChartCell: """ A cell from the parse chart formed when performing the CYK algorithm. Each cell keeps track of its x and y coordinates (though this will probably be discarded), and a list of spans serving as the cell's entries. """ def __init__(self, x, y): """ :param x: This cell's x coordinate. :type x: int. :param y: This cell's y coordinate. :type y: int. """ self._x = x self._y = y self._entries = set() def add(self, span): """ Appends the given span to the list of spans representing the chart cell's entries. :param span: The span to add. :type span: DependencySpan """ self._entries.add(span) def __str__(self): """ :return: A verbose string representation of this ``ChartCell``. :rtype: str. """ return "CC[%d,%d]: %s" % (self._x, self._y, self._entries) def __repr__(self): """ :return: A concise string representation of this ``ChartCell``. :rtype: str. """ return "%s" % self ################################################################# # Parsing with Dependency Grammars ################################################################# class ProjectiveDependencyParser: """ A projective, rule-based, dependency parser. A ProjectiveDependencyParser is created with a DependencyGrammar, a set of productions specifying word-to-word dependency relations. The parse() method will then return the set of all parses, in tree representation, for a given input sequence of tokens. Each parse must meet the requirements of the both the grammar and the projectivity constraint which specifies that the branches of the dependency tree are not allowed to cross. Alternatively, this can be understood as stating that each parent node and its children in the parse tree form a continuous substring of the input sequence. """ def __init__(self, dependency_grammar): """ Create a new ProjectiveDependencyParser, from a word-to-word dependency grammar ``DependencyGrammar``. :param dependency_grammar: A word-to-word relation dependencygrammar. :type dependency_grammar: DependencyGrammar """ self._grammar = dependency_grammar def parse(self, tokens): """ Performs a projective dependency parse on the list of tokens using a chart-based, span-concatenation algorithm similar to Eisner (1996). :param tokens: The list of input tokens. :type tokens: list(str) :return: An iterator over parse trees. 
:rtype: iter(Tree) """ self._tokens = list(tokens) chart = [] for i in range(0, len(self._tokens) + 1): chart.append([]) for j in range(0, len(self._tokens) + 1): chart[i].append(ChartCell(i, j)) if i == j + 1: chart[i][j].add(DependencySpan(i - 1, i, i - 1, [-1], ["null"])) for i in range(1, len(self._tokens) + 1): for j in range(i - 2, -1, -1): for k in range(i - 1, j, -1): for span1 in chart[k][j]._entries: for span2 in chart[i][k]._entries: for newspan in self.concatenate(span1, span2): chart[i][j].add(newspan) for parse in chart[len(self._tokens)][0]._entries: conll_format = "" # malt_format = "" for i in range(len(tokens)): # malt_format += '%s\t%s\t%d\t%s\n' % (tokens[i], 'null', parse._arcs[i] + 1, 'null') # conll_format += '\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n' % (i+1, tokens[i], tokens[i], 'null', 'null', 'null', parse._arcs[i] + 1, 'null', '-', '-') # Modify to comply with the new Dependency Graph requirement (at least must have an root elements) conll_format += "\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n" % ( i + 1, tokens[i], tokens[i], "null", "null", "null", parse._arcs[i] + 1, "ROOT", "-", "-", ) dg = DependencyGraph(conll_format) # if self.meets_arity(dg): yield dg.tree() def concatenate(self, span1, span2): """ Concatenates the two spans in whichever way possible. This includes rightward concatenation (from the leftmost word of the leftmost span to the rightmost word of the rightmost span) and leftward concatenation (vice-versa) between adjacent spans. Unlike Eisner's presentation of span concatenation, these spans do not share or pivot on a particular word/word-index. :return: A list of new spans formed through concatenation. :rtype: list(DependencySpan) """ spans = [] if span1._start_index == span2._start_index: print("Error: Mismatched spans - replace this with thrown error") if span1._start_index > span2._start_index: temp_span = span1 span1 = span2 span2 = temp_span # adjacent rightward covered concatenation new_arcs = span1._arcs + span2._arcs new_tags = span1._tags + span2._tags if self._grammar.contains( self._tokens[span1._head_index], self._tokens[span2._head_index] ): # print('Performing rightward cover %d to %d' % (span1._head_index, span2._head_index)) new_arcs[span2._head_index - span1._start_index] = span1._head_index spans.append( DependencySpan( span1._start_index, span2._end_index, span1._head_index, new_arcs, new_tags, ) ) # adjacent leftward covered concatenation new_arcs = span1._arcs + span2._arcs if self._grammar.contains( self._tokens[span2._head_index], self._tokens[span1._head_index] ): # print('performing leftward cover %d to %d' % (span2._head_index, span1._head_index)) new_arcs[span1._head_index - span1._start_index] = span2._head_index spans.append( DependencySpan( span1._start_index, span2._end_index, span2._head_index, new_arcs, new_tags, ) ) return spans ################################################################# # Parsing with Probabilistic Dependency Grammars ################################################################# class ProbabilisticProjectiveDependencyParser: """A probabilistic, projective dependency parser. This parser returns the most probable projective parse derived from the probabilistic dependency grammar derived from the train() method. The probabilistic model is an implementation of Eisner's (1996) Model C, which conditions on head-word, head-tag, child-word, and child-tag. 
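Informally, the score assigned to a candidate parse is the product, over the head/modifier events in the tree, of count(head event) / count(modifier event), using the counts gathered by ``train()`` (see ``compute_prob``).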
The decoding uses a bottom-up chart-based span concatenation algorithm that's identical to the one utilized by the rule-based projective parser. Usage example >>> from nltk.parse.dependencygraph import conll_data2 >>> graphs = [ ... DependencyGraph(entry) for entry in conll_data2.split('\\n\\n') if entry ... ] >>> ppdp = ProbabilisticProjectiveDependencyParser() >>> ppdp.train(graphs) >>> sent = ['Cathy', 'zag', 'hen', 'wild', 'zwaaien', '.'] >>> list(ppdp.parse(sent)) [Tree('zag', ['Cathy', 'hen', Tree('zwaaien', ['wild', '.'])])] """ def __init__(self): """ Create a new probabilistic dependency parser. No additional operations are necessary. """ def parse(self, tokens): """ Parses the list of tokens subject to the projectivity constraint and the productions in the parser's grammar. This uses a method similar to the span-concatenation algorithm defined in Eisner (1996). It returns the most probable parse derived from the parser's probabilistic dependency grammar. """ self._tokens = list(tokens) chart = [] for i in range(0, len(self._tokens) + 1): chart.append([]) for j in range(0, len(self._tokens) + 1): chart[i].append(ChartCell(i, j)) if i == j + 1: if tokens[i - 1] in self._grammar._tags: for tag in self._grammar._tags[tokens[i - 1]]: chart[i][j].add( DependencySpan(i - 1, i, i - 1, [-1], [tag]) ) else: print( "No tag found for input token '%s', parse is impossible." % tokens[i - 1] ) return [] for i in range(1, len(self._tokens) + 1): for j in range(i - 2, -1, -1): for k in range(i - 1, j, -1): for span1 in chart[k][j]._entries: for span2 in chart[i][k]._entries: for newspan in self.concatenate(span1, span2): chart[i][j].add(newspan) trees = [] max_parse = None max_score = 0 for parse in chart[len(self._tokens)][0]._entries: conll_format = "" malt_format = "" for i in range(len(tokens)): malt_format += "%s\t%s\t%d\t%s\n" % ( tokens[i], "null", parse._arcs[i] + 1, "null", ) # conll_format += '\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n' % (i+1, tokens[i], tokens[i], parse._tags[i], parse._tags[i], 'null', parse._arcs[i] + 1, 'null', '-', '-') # Modify to comply with recent change in dependency graph such that there must be a ROOT element. conll_format += "\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n" % ( i + 1, tokens[i], tokens[i], parse._tags[i], parse._tags[i], "null", parse._arcs[i] + 1, "ROOT", "-", "-", ) dg = DependencyGraph(conll_format) score = self.compute_prob(dg) trees.append((score, dg.tree())) trees.sort() return (tree for (score, tree) in trees) def concatenate(self, span1, span2): """ Concatenates the two spans in whichever way possible. This includes rightward concatenation (from the leftmost word of the leftmost span to the rightmost word of the rightmost span) and leftward concatenation (vice-versa) between adjacent spans. Unlike Eisner's presentation of span concatenation, these spans do not share or pivot on a particular word/word-index. :return: A list of new spans formed through concatenation. 
:rtype: list(DependencySpan) """ spans = [] if span1._start_index == span2._start_index: print("Error: Mismatched spans - replace this with thrown error") if span1._start_index > span2._start_index: temp_span = span1 span1 = span2 span2 = temp_span # adjacent rightward covered concatenation new_arcs = span1._arcs + span2._arcs new_tags = span1._tags + span2._tags if self._grammar.contains( self._tokens[span1._head_index], self._tokens[span2._head_index] ): new_arcs[span2._head_index - span1._start_index] = span1._head_index spans.append( DependencySpan( span1._start_index, span2._end_index, span1._head_index, new_arcs, new_tags, ) ) # adjacent leftward covered concatenation new_arcs = span1._arcs + span2._arcs new_tags = span1._tags + span2._tags if self._grammar.contains( self._tokens[span2._head_index], self._tokens[span1._head_index] ): new_arcs[span1._head_index - span1._start_index] = span2._head_index spans.append( DependencySpan( span1._start_index, span2._end_index, span2._head_index, new_arcs, new_tags, ) ) return spans def train(self, graphs): """ Trains a ProbabilisticDependencyGrammar based on the list of input DependencyGraphs. This model is an implementation of Eisner's (1996) Model C, which derives its statistics from head-word, head-tag, child-word, and child-tag relationships. :param graphs: A list of dependency graphs to train from. :type: list(DependencyGraph) """ productions = [] events = defaultdict(int) tags = {} for dg in graphs: for node_index in range(1, len(dg.nodes)): # children = dg.nodes[node_index]['deps'] children = list( chain.from_iterable(dg.nodes[node_index]["deps"].values()) ) nr_left_children = dg.left_children(node_index) nr_right_children = dg.right_children(node_index) nr_children = nr_left_children + nr_right_children for child_index in range( 0 - (nr_left_children + 1), nr_right_children + 2 ): head_word = dg.nodes[node_index]["word"] head_tag = dg.nodes[node_index]["tag"] if head_word in tags: tags[head_word].add(head_tag) else: tags[head_word] = {head_tag} child = "STOP" child_tag = "STOP" prev_word = "START" prev_tag = "START" if child_index < 0: array_index = child_index + nr_left_children if array_index >= 0: child = dg.nodes[children[array_index]]["word"] child_tag = dg.nodes[children[array_index]]["tag"] if child_index != -1: prev_word = dg.nodes[children[array_index + 1]]["word"] prev_tag = dg.nodes[children[array_index + 1]]["tag"] if child != "STOP": productions.append(DependencyProduction(head_word, [child])) head_event = "(head ({} {}) (mods ({}, {}, {}) left))".format( child, child_tag, prev_tag, head_word, head_tag, ) mod_event = "(mods ({}, {}, {}) left))".format( prev_tag, head_word, head_tag, ) events[head_event] += 1 events[mod_event] += 1 elif child_index > 0: array_index = child_index + nr_left_children - 1 if array_index < nr_children: child = dg.nodes[children[array_index]]["word"] child_tag = dg.nodes[children[array_index]]["tag"] if child_index != 1: prev_word = dg.nodes[children[array_index - 1]]["word"] prev_tag = dg.nodes[children[array_index - 1]]["tag"] if child != "STOP": productions.append(DependencyProduction(head_word, [child])) head_event = "(head ({} {}) (mods ({}, {}, {}) right))".format( child, child_tag, prev_tag, head_word, head_tag, ) mod_event = "(mods ({}, {}, {}) right))".format( prev_tag, head_word, head_tag, ) events[head_event] += 1 events[mod_event] += 1 self._grammar = ProbabilisticDependencyGrammar(productions, events, tags) def compute_prob(self, dg): """ Computes the probability of a dependency 
graph based on the parser's probability model (defined by the parser's statistical dependency grammar). :param dg: A dependency graph to score. :type dg: DependencyGraph :return: The probability of the dependency graph. :rtype: int """ prob = 1.0 for node_index in range(1, len(dg.nodes)): # children = dg.nodes[node_index]['deps'] children = list(chain.from_iterable(dg.nodes[node_index]["deps"].values())) nr_left_children = dg.left_children(node_index) nr_right_children = dg.right_children(node_index) nr_children = nr_left_children + nr_right_children for child_index in range(0 - (nr_left_children + 1), nr_right_children + 2): head_word = dg.nodes[node_index]["word"] head_tag = dg.nodes[node_index]["tag"] child = "STOP" child_tag = "STOP" prev_word = "START" prev_tag = "START" if child_index < 0: array_index = child_index + nr_left_children if array_index >= 0: child = dg.nodes[children[array_index]]["word"] child_tag = dg.nodes[children[array_index]]["tag"] if child_index != -1: prev_word = dg.nodes[children[array_index + 1]]["word"] prev_tag = dg.nodes[children[array_index + 1]]["tag"] head_event = "(head ({} {}) (mods ({}, {}, {}) left))".format( child, child_tag, prev_tag, head_word, head_tag, ) mod_event = "(mods ({}, {}, {}) left))".format( prev_tag, head_word, head_tag, ) h_count = self._grammar._events[head_event] m_count = self._grammar._events[mod_event] # If the grammar is not covered if m_count != 0: prob *= h_count / m_count else: prob = 0.00000001 # Very small number elif child_index > 0: array_index = child_index + nr_left_children - 1 if array_index < nr_children: child = dg.nodes[children[array_index]]["word"] child_tag = dg.nodes[children[array_index]]["tag"] if child_index != 1: prev_word = dg.nodes[children[array_index - 1]]["word"] prev_tag = dg.nodes[children[array_index - 1]]["tag"] head_event = "(head ({} {}) (mods ({}, {}, {}) right))".format( child, child_tag, prev_tag, head_word, head_tag, ) mod_event = "(mods ({}, {}, {}) right))".format( prev_tag, head_word, head_tag, ) h_count = self._grammar._events[head_event] m_count = self._grammar._events[mod_event] if m_count != 0: prob *= h_count / m_count else: prob = 0.00000001 # Very small number return prob ################################################################# # Demos ################################################################# def demo(): projective_rule_parse_demo() # arity_parse_demo() projective_prob_parse_demo() def projective_rule_parse_demo(): """ A demonstration showing the creation and use of a ``DependencyGrammar`` to perform a projective dependency parse. """ grammar = DependencyGrammar.fromstring( """ 'scratch' -> 'cats' | 'walls' 'walls' -> 'the' 'cats' -> 'the' """ ) print(grammar) pdp = ProjectiveDependencyParser(grammar) trees = pdp.parse(["the", "cats", "scratch", "the", "walls"]) for tree in trees: print(tree) def arity_parse_demo(): """ A demonstration showing the creation of a ``DependencyGrammar`` in which a specific number of modifiers is listed for a given head. This can further constrain the number of possible parses created by a ``ProjectiveDependencyParser``. """ print() print("A grammar with no arity constraints. 
Each DependencyProduction") print("specifies a relationship between one head word and only one") print("modifier word.") grammar = DependencyGrammar.fromstring( """ 'fell' -> 'price' | 'stock' 'price' -> 'of' | 'the' 'of' -> 'stock' 'stock' -> 'the' """ ) print(grammar) print() print("For the sentence 'The price of the stock fell', this grammar") print("will produce the following three parses:") pdp = ProjectiveDependencyParser(grammar) trees = pdp.parse(["the", "price", "of", "the", "stock", "fell"]) for tree in trees: print(tree) print() print("By contrast, the following grammar contains a ") print("DependencyProduction that specifies a relationship") print("between a single head word, 'price', and two modifier") print("words, 'of' and 'the'.") grammar = DependencyGrammar.fromstring( """ 'fell' -> 'price' | 'stock' 'price' -> 'of' 'the' 'of' -> 'stock' 'stock' -> 'the' """ ) print(grammar) print() print( "This constrains the number of possible parses to just one:" ) # unimplemented, soon to replace pdp = ProjectiveDependencyParser(grammar) trees = pdp.parse(["the", "price", "of", "the", "stock", "fell"]) for tree in trees: print(tree) def projective_prob_parse_demo(): """ A demo showing the training and use of a projective dependency parser. """ from nltk.parse.dependencygraph import conll_data2 graphs = [DependencyGraph(entry) for entry in conll_data2.split("\n\n") if entry] ppdp = ProbabilisticProjectiveDependencyParser() print("Training Probabilistic Projective Dependency Parser...") ppdp.train(graphs) sent = ["Cathy", "zag", "hen", "wild", "zwaaien", "."] print("Parsing '", " ".join(sent), "'...") print("Parse:") for tree in ppdp.parse(sent): print(tree) if __name__ == "__main__": demo() nltk-3.7/nltk/parse/recursivedescent.py000066400000000000000000000614041420073152400203470ustar00rootroot00000000000000# Natural Language Toolkit: Recursive Descent Parser # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # Steven Bird # URL: # For license information, see LICENSE.TXT from nltk.grammar import Nonterminal from nltk.parse.api import ParserI from nltk.tree import ImmutableTree, Tree ##////////////////////////////////////////////////////// ## Recursive Descent Parser ##////////////////////////////////////////////////////// class RecursiveDescentParser(ParserI): """ A simple top-down CFG parser that parses texts by recursively expanding the fringe of a Tree, and matching it against a text. ``RecursiveDescentParser`` uses a list of tree locations called a "frontier" to remember which subtrees have not yet been expanded and which leaves have not yet been matched against the text. Each tree location consists of a list of child indices specifying the path from the root of the tree to a subtree or a leaf; see the reference documentation for Tree for more information about tree locations. When the parser begins parsing a text, it constructs a tree containing only the start symbol, and a frontier containing the location of the tree's root node. It then extends the tree to cover the text, using the following recursive procedure: - If the frontier is empty, and the text is covered by the tree, then return the tree as a possible parse. - If the frontier is empty, and the text is not covered by the tree, then return no parses. - If the first element of the frontier is a subtree, then use CFG productions to "expand" it. For each applicable production, add the expanded subtree's children to the frontier, and recursively find all parses that can be generated by the new tree and frontier. 
- If the first element of the frontier is a token, then "match" it against the next token from the text. Remove the token from the frontier, and recursively find all parses that can be generated by the new tree and frontier. :see: ``nltk.grammar`` """ def __init__(self, grammar, trace=0): """ Create a new ``RecursiveDescentParser``, that uses ``grammar`` to parse texts. :type grammar: CFG :param grammar: The grammar used to parse texts. :type trace: int :param trace: The level of tracing that should be used when parsing a text. ``0`` will generate no tracing output; and higher numbers will produce more verbose tracing output. """ self._grammar = grammar self._trace = trace def grammar(self): return self._grammar def parse(self, tokens): # Inherit docs from ParserI tokens = list(tokens) self._grammar.check_coverage(tokens) # Start a recursive descent parse, with an initial tree # containing just the start symbol. start = self._grammar.start().symbol() initial_tree = Tree(start, []) frontier = [()] if self._trace: self._trace_start(initial_tree, frontier, tokens) return self._parse(tokens, initial_tree, frontier) def _parse(self, remaining_text, tree, frontier): """ Recursively expand and match each elements of ``tree`` specified by ``frontier``, to cover ``remaining_text``. Return a list of all parses found. :return: An iterator of all parses that can be generated by matching and expanding the elements of ``tree`` specified by ``frontier``. :rtype: iter(Tree) :type tree: Tree :param tree: A partial structure for the text that is currently being parsed. The elements of ``tree`` that are specified by ``frontier`` have not yet been expanded or matched. :type remaining_text: list(str) :param remaining_text: The portion of the text that is not yet covered by ``tree``. :type frontier: list(tuple(int)) :param frontier: A list of the locations within ``tree`` of all subtrees that have not yet been expanded, and all leaves that have not yet been matched. This list sorted in left-to-right order of location within the tree. """ # If the tree covers the text, and there's nothing left to # expand, then we've found a complete parse; return it. if len(remaining_text) == 0 and len(frontier) == 0: if self._trace: self._trace_succeed(tree, frontier) yield tree # If there's still text, but nothing left to expand, we failed. elif len(frontier) == 0: if self._trace: self._trace_backtrack(tree, frontier) # If the next element on the frontier is a tree, expand it. elif isinstance(tree[frontier[0]], Tree): yield from self._expand(remaining_text, tree, frontier) # If the next element on the frontier is a token, match it. else: yield from self._match(remaining_text, tree, frontier) def _match(self, rtext, tree, frontier): """ :rtype: iter(Tree) :return: an iterator of all parses that can be generated by matching the first element of ``frontier`` against the first token in ``rtext``. In particular, if the first element of ``frontier`` has the same type as the first token in ``rtext``, then substitute the token into ``tree``; and return all parses that can be generated by matching and expanding the remaining elements of ``frontier``. If the first element of ``frontier`` does not have the same type as the first token in ``rtext``, then return empty list. :type tree: Tree :param tree: A partial structure for the text that is currently being parsed. The elements of ``tree`` that are specified by ``frontier`` have not yet been expanded or matched. 
:type rtext: list(str) :param rtext: The portion of the text that is not yet covered by ``tree``. :type frontier: list of tuple of int :param frontier: A list of the locations within ``tree`` of all subtrees that have not yet been expanded, and all leaves that have not yet been matched. """ tree_leaf = tree[frontier[0]] if len(rtext) > 0 and tree_leaf == rtext[0]: # If it's a terminal that matches rtext[0], then substitute # in the token, and continue parsing. newtree = tree.copy(deep=True) newtree[frontier[0]] = rtext[0] if self._trace: self._trace_match(newtree, frontier[1:], rtext[0]) yield from self._parse(rtext[1:], newtree, frontier[1:]) else: # If it's a non-matching terminal, fail. if self._trace: self._trace_backtrack(tree, frontier, rtext[:1]) def _expand(self, remaining_text, tree, frontier, production=None): """ :rtype: iter(Tree) :return: An iterator of all parses that can be generated by expanding the first element of ``frontier`` with ``production``. In particular, if the first element of ``frontier`` is a subtree whose node type is equal to ``production``'s left hand side, then add a child to that subtree for each element of ``production``'s right hand side; and return all parses that can be generated by matching and expanding the remaining elements of ``frontier``. If the first element of ``frontier`` is not a subtree whose node type is equal to ``production``'s left hand side, then return an empty list. If ``production`` is not specified, then return a list of all parses that can be generated by expanding the first element of ``frontier`` with *any* CFG production. :type tree: Tree :param tree: A partial structure for the text that is currently being parsed. The elements of ``tree`` that are specified by ``frontier`` have not yet been expanded or matched. :type remaining_text: list(str) :param remaining_text: The portion of the text that is not yet covered by ``tree``. :type frontier: list(tuple(int)) :param frontier: A list of the locations within ``tree`` of all subtrees that have not yet been expanded, and all leaves that have not yet been matched. """ if production is None: productions = self._grammar.productions() else: productions = [production] for production in productions: lhs = production.lhs().symbol() if lhs == tree[frontier[0]].label(): subtree = self._production_to_tree(production) if frontier[0] == (): newtree = subtree else: newtree = tree.copy(deep=True) newtree[frontier[0]] = subtree new_frontier = [ frontier[0] + (i,) for i in range(len(production.rhs())) ] if self._trace: self._trace_expand(newtree, new_frontier, production) yield from self._parse( remaining_text, newtree, new_frontier + frontier[1:] ) def _production_to_tree(self, production): """ :rtype: Tree :return: The Tree that is licensed by ``production``. In particular, given the production ``[lhs -> elt[1] ... elt[n]]`` return a tree that has a node ``lhs.symbol``, and ``n`` children. For each nonterminal element ``elt[i]`` in the production, the tree token has a childless subtree with node value ``elt[i].symbol``; and for each terminal element ``elt[j]``, the tree token has a leaf token with type ``elt[j]``. :param production: The CFG production that licenses the tree token that should be returned. :type production: Production """ children = [] for elt in production.rhs(): if isinstance(elt, Nonterminal): children.append(Tree(elt.symbol(), [])) else: # This will be matched. 
children.append(elt) return Tree(production.lhs().symbol(), children) def trace(self, trace=2): """ Set the level of tracing output that should be generated when parsing a text. :type trace: int :param trace: The trace level. A trace level of ``0`` will generate no tracing output; and higher trace levels will produce more verbose tracing output. :rtype: None """ self._trace = trace def _trace_fringe(self, tree, treeloc=None): """ Print trace output displaying the fringe of ``tree``. The fringe of ``tree`` consists of all of its leaves and all of its childless subtrees. :rtype: None """ if treeloc == (): print("*", end=" ") if isinstance(tree, Tree): if len(tree) == 0: print(repr(Nonterminal(tree.label())), end=" ") for i in range(len(tree)): if treeloc is not None and i == treeloc[0]: self._trace_fringe(tree[i], treeloc[1:]) else: self._trace_fringe(tree[i]) else: print(repr(tree), end=" ") def _trace_tree(self, tree, frontier, operation): """ Print trace output displaying the parser's current state. :param operation: A character identifying the operation that generated the current state. :rtype: None """ if self._trace == 2: print(" %c [" % operation, end=" ") else: print(" [", end=" ") if len(frontier) > 0: self._trace_fringe(tree, frontier[0]) else: self._trace_fringe(tree) print("]") def _trace_start(self, tree, frontier, text): print("Parsing %r" % " ".join(text)) if self._trace > 2: print("Start:") if self._trace > 1: self._trace_tree(tree, frontier, " ") def _trace_expand(self, tree, frontier, production): if self._trace > 2: print("Expand: %s" % production) if self._trace > 1: self._trace_tree(tree, frontier, "E") def _trace_match(self, tree, frontier, tok): if self._trace > 2: print("Match: %r" % tok) if self._trace > 1: self._trace_tree(tree, frontier, "M") def _trace_succeed(self, tree, frontier): if self._trace > 2: print("GOOD PARSE:") if self._trace == 1: print("Found a parse:\n%s" % tree) if self._trace > 1: self._trace_tree(tree, frontier, "+") def _trace_backtrack(self, tree, frontier, toks=None): if self._trace > 2: if toks: print("Backtrack: %r match failed" % toks[0]) else: print("Backtrack") ##////////////////////////////////////////////////////// ## Stepping Recursive Descent Parser ##////////////////////////////////////////////////////// class SteppingRecursiveDescentParser(RecursiveDescentParser): """ A ``RecursiveDescentParser`` that allows you to step through the parsing process, performing a single operation at a time. The ``initialize`` method is used to start parsing a text. ``expand`` expands the first element on the frontier using a single CFG production, and ``match`` matches the first element on the frontier against the next text token. ``backtrack`` undoes the most recent expand or match operation. ``step`` performs a single expand, match, or backtrack operation. ``parses`` returns the set of parses that have been found by the parser. :ivar _history: A list of ``(rtext, tree, frontier)`` tripples, containing the previous states of the parser. This history is used to implement the ``backtrack`` operation. :ivar _tried_e: A record of all productions that have been tried for a given tree. This record is used by ``expand`` to perform the next untried production. :ivar _tried_m: A record of what tokens have been matched for a given tree. This record is used by ``step`` to decide whether or not to match a token. 
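# --- Usage sketch (illustrative; the toy grammar, sentence and helper name
# below are invented for the example) ---------------------------------------
# Drives the stepping parser one operation at a time instead of calling
# parse() directly; step() returns a token after a match, a production after
# an expansion, True after a backtrack, and None when nothing is left to try.
from nltk import CFG

_rd_demo_grammar = CFG.fromstring(
    """
    S -> NP VP
    NP -> 'the' N
    VP -> V NP
    N -> 'dog' | 'man'
    V -> 'saw'
    """
)

def _stepping_rd_sketch():
    parser = SteppingRecursiveDescentParser(_rd_demo_grammar)
    parser.initialize("the dog saw the man".split())
    while parser.step() is not None:  # expand, match or backtrack
        pass
    return list(parser.parses())  # one Tree rooted at S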
:see: ``nltk.grammar`` """ def __init__(self, grammar, trace=0): super().__init__(grammar, trace) self._rtext = None self._tree = None self._frontier = [()] self._tried_e = {} self._tried_m = {} self._history = [] self._parses = [] # [XX] TEMPORARY HACK WARNING! This should be replaced with # something nicer when we get the chance. def _freeze(self, tree): c = tree.copy() # for pos in c.treepositions('leaves'): # c[pos] = c[pos].freeze() return ImmutableTree.convert(c) def parse(self, tokens): tokens = list(tokens) self.initialize(tokens) while self.step() is not None: pass return self.parses() def initialize(self, tokens): """ Start parsing a given text. This sets the parser's tree to the start symbol, its frontier to the root node, and its remaining text to ``token['SUBTOKENS']``. """ self._rtext = tokens start = self._grammar.start().symbol() self._tree = Tree(start, []) self._frontier = [()] self._tried_e = {} self._tried_m = {} self._history = [] self._parses = [] if self._trace: self._trace_start(self._tree, self._frontier, self._rtext) def remaining_text(self): """ :return: The portion of the text that is not yet covered by the tree. :rtype: list(str) """ return self._rtext def frontier(self): """ :return: A list of the tree locations of all subtrees that have not yet been expanded, and all leaves that have not yet been matched. :rtype: list(tuple(int)) """ return self._frontier def tree(self): """ :return: A partial structure for the text that is currently being parsed. The elements specified by the frontier have not yet been expanded or matched. :rtype: Tree """ return self._tree def step(self): """ Perform a single parsing operation. If an untried match is possible, then perform the match, and return the matched token. If an untried expansion is possible, then perform the expansion, and return the production that it is based on. If backtracking is possible, then backtrack, and return True. Otherwise, return None. :return: None if no operation was performed; a token if a match was performed; a production if an expansion was performed; and True if a backtrack operation was performed. :rtype: Production or String or bool """ # Try matching (if we haven't already) if self.untried_match(): token = self.match() if token is not None: return token # Try expanding. production = self.expand() if production is not None: return production # Try backtracking if self.backtrack(): self._trace_backtrack(self._tree, self._frontier) return True # Nothing left to do. return None def expand(self, production=None): """ Expand the first element of the frontier. In particular, if the first element of the frontier is a subtree whose node type is equal to ``production``'s left hand side, then add a child to that subtree for each element of ``production``'s right hand side. If ``production`` is not specified, then use the first untried expandable production. If all expandable productions have been tried, do nothing. :return: The production used to expand the frontier, if an expansion was performed. If no expansion was performed, return None. :rtype: Production or None """ # Make sure we *can* expand. if len(self._frontier) == 0: return None if not isinstance(self._tree[self._frontier[0]], Tree): return None # If they didn't specify a production, check all untried ones. if production is None: productions = self.untried_expandable_productions() else: productions = [production] parses = [] for prod in productions: # Record that we've tried this production now. 
self._tried_e.setdefault(self._freeze(self._tree), []).append(prod) # Try expanding. for _result in self._expand(self._rtext, self._tree, self._frontier, prod): return prod # We didn't expand anything. return None def match(self): """ Match the first element of the frontier. In particular, if the first element of the frontier has the same type as the next text token, then substitute the text token into the tree. :return: The token matched, if a match operation was performed. If no match was performed, return None :rtype: str or None """ # Record that we've tried matching this token. tok = self._rtext[0] self._tried_m.setdefault(self._freeze(self._tree), []).append(tok) # Make sure we *can* match. if len(self._frontier) == 0: return None if isinstance(self._tree[self._frontier[0]], Tree): return None for _result in self._match(self._rtext, self._tree, self._frontier): # Return the token we just matched. return self._history[-1][0][0] return None def backtrack(self): """ Return the parser to its state before the most recent match or expand operation. Calling ``undo`` repeatedly return the parser to successively earlier states. If no match or expand operations have been performed, ``undo`` will make no changes. :return: true if an operation was successfully undone. :rtype: bool """ if len(self._history) == 0: return False (self._rtext, self._tree, self._frontier) = self._history.pop() return True def expandable_productions(self): """ :return: A list of all the productions for which expansions are available for the current parser state. :rtype: list(Production) """ # Make sure we *can* expand. if len(self._frontier) == 0: return [] frontier_child = self._tree[self._frontier[0]] if len(self._frontier) == 0 or not isinstance(frontier_child, Tree): return [] return [ p for p in self._grammar.productions() if p.lhs().symbol() == frontier_child.label() ] def untried_expandable_productions(self): """ :return: A list of all the untried productions for which expansions are available for the current parser state. :rtype: list(Production) """ tried_expansions = self._tried_e.get(self._freeze(self._tree), []) return [p for p in self.expandable_productions() if p not in tried_expansions] def untried_match(self): """ :return: Whether the first element of the frontier is a token that has not yet been matched. :rtype: bool """ if len(self._rtext) == 0: return False tried_matches = self._tried_m.get(self._freeze(self._tree), []) return self._rtext[0] not in tried_matches def currently_complete(self): """ :return: Whether the parser's current state represents a complete parse. :rtype: bool """ return len(self._frontier) == 0 and len(self._rtext) == 0 def _parse(self, remaining_text, tree, frontier): """ A stub version of ``_parse`` that sets the parsers current state to the given arguments. In ``RecursiveDescentParser``, the ``_parse`` method is used to recursively continue parsing a text. ``SteppingRecursiveDescentParser`` overrides it to capture these recursive calls. It records the parser's old state in the history (to allow for backtracking), and updates the parser's new state using the given arguments. Finally, it returns ``[1]``, which is used by ``match`` and ``expand`` to detect whether their operations were successful. :return: ``[1]`` :rtype: list of int """ self._history.append((self._rtext, self._tree, self._frontier)) self._rtext = remaining_text self._tree = tree self._frontier = frontier # Is it a good parse? If so, record it. 
if len(frontier) == 0 and len(remaining_text) == 0: self._parses.append(tree) self._trace_succeed(self._tree, self._frontier) return [1] def parses(self): """ :return: An iterator of the parses that have been found by this parser so far. :rtype: list of Tree """ return iter(self._parses) def set_grammar(self, grammar): """ Change the grammar used to parse texts. :param grammar: The new grammar. :type grammar: CFG """ self._grammar = grammar ##////////////////////////////////////////////////////// ## Demonstration Code ##////////////////////////////////////////////////////// def demo(): """ A demonstration of the recursive descent parser. """ from nltk import CFG, parse grammar = CFG.fromstring( """ S -> NP VP NP -> Det N | Det N PP VP -> V NP | V NP PP PP -> P NP NP -> 'I' N -> 'man' | 'park' | 'telescope' | 'dog' Det -> 'the' | 'a' P -> 'in' | 'with' V -> 'saw' """ ) for prod in grammar.productions(): print(prod) sent = "I saw a man in the park".split() parser = parse.RecursiveDescentParser(grammar, trace=2) for p in parser.parse(sent): print(p) if __name__ == "__main__": demo() nltk-3.7/nltk/parse/shiftreduce.py000066400000000000000000000403201420073152400172710ustar00rootroot00000000000000# Natural Language Toolkit: Shift-Reduce Parser # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # Steven Bird # URL: # For license information, see LICENSE.TXT from nltk.grammar import Nonterminal from nltk.parse.api import ParserI from nltk.tree import Tree ##////////////////////////////////////////////////////// ## Shift/Reduce Parser ##////////////////////////////////////////////////////// class ShiftReduceParser(ParserI): """ A simple bottom-up CFG parser that uses two operations, "shift" and "reduce", to find a single parse for a text. ``ShiftReduceParser`` maintains a stack, which records the structure of a portion of the text. This stack is a list of strings and Trees that collectively cover a portion of the text. For example, while parsing the sentence "the dog saw the man" with a typical grammar, ``ShiftReduceParser`` will produce the following stack, which covers "the dog saw":: [(NP: (Det: 'the') (N: 'dog')), (V: 'saw')] ``ShiftReduceParser`` attempts to extend the stack to cover the entire text, and to combine the stack elements into a single tree, producing a complete parse for the sentence. Initially, the stack is empty. It is extended to cover the text, from left to right, by repeatedly applying two operations: - "shift" moves a token from the beginning of the text to the end of the stack. - "reduce" uses a CFG production to combine the rightmost stack elements into a single Tree. Often, more than one operation can be performed on a given stack. In this case, ``ShiftReduceParser`` uses the following heuristics to decide which operation to perform: - Only shift if no reductions are available. - If multiple reductions are available, then apply the reduction whose CFG production is listed earliest in the grammar. Note that these heuristics are not guaranteed to choose an operation that leads to a parse of the text. Also, if multiple parses exists, ``ShiftReduceParser`` will return at most one of them. :see: ``nltk.grammar`` """ def __init__(self, grammar, trace=0): """ Create a new ``ShiftReduceParser``, that uses ``grammar`` to parse texts. :type grammar: Grammar :param grammar: The grammar used to parse texts. :type trace: int :param trace: The level of tracing that should be used when parsing a text. 
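# --- Usage sketch (illustrative; the toy grammar and helper name are invented
# for the example) -----------------------------------------------------------
# Under this grammar the stack after consuming "the dog saw" is reduced to
# [NP, V], as in the class docstring above; trace=2 prints an 'S' line for
# every shift and an 'R' line for every reduce.
from nltk import CFG

_sr_demo_grammar = CFG.fromstring(
    """
    S -> NP VP
    NP -> Det N
    VP -> V NP
    Det -> 'the'
    N -> 'dog' | 'man'
    V -> 'saw'
    """
)

def _shift_reduce_sketch():
    parser = ShiftReduceParser(_sr_demo_grammar, trace=2)
    return list(parser.parse("the dog saw the man".split()))  # one S tree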
``0`` will generate no tracing output; and higher numbers will produce more verbose tracing output. """ self._grammar = grammar self._trace = trace self._check_grammar() def grammar(self): return self._grammar def parse(self, tokens): tokens = list(tokens) self._grammar.check_coverage(tokens) # initialize the stack. stack = [] remaining_text = tokens # Trace output. if self._trace: print("Parsing %r" % " ".join(tokens)) self._trace_stack(stack, remaining_text) # iterate through the text, pushing the token onto # the stack, then reducing the stack. while len(remaining_text) > 0: self._shift(stack, remaining_text) while self._reduce(stack, remaining_text): pass # Did we reduce everything? if len(stack) == 1: # Did we end up with the right category? if stack[0].label() == self._grammar.start().symbol(): yield stack[0] def _shift(self, stack, remaining_text): """ Move a token from the beginning of ``remaining_text`` to the end of ``stack``. :type stack: list(str and Tree) :param stack: A list of strings and Trees, encoding the structure of the text that has been parsed so far. :type remaining_text: list(str) :param remaining_text: The portion of the text that is not yet covered by ``stack``. :rtype: None """ stack.append(remaining_text[0]) remaining_text.remove(remaining_text[0]) if self._trace: self._trace_shift(stack, remaining_text) def _match_rhs(self, rhs, rightmost_stack): """ :rtype: bool :return: true if the right hand side of a CFG production matches the rightmost elements of the stack. ``rhs`` matches ``rightmost_stack`` if they are the same length, and each element of ``rhs`` matches the corresponding element of ``rightmost_stack``. A nonterminal element of ``rhs`` matches any Tree whose node value is equal to the nonterminal's symbol. A terminal element of ``rhs`` matches any string whose type is equal to the terminal. :type rhs: list(terminal and Nonterminal) :param rhs: The right hand side of a CFG production. :type rightmost_stack: list(string and Tree) :param rightmost_stack: The rightmost elements of the parser's stack. """ if len(rightmost_stack) != len(rhs): return False for i in range(len(rightmost_stack)): if isinstance(rightmost_stack[i], Tree): if not isinstance(rhs[i], Nonterminal): return False if rightmost_stack[i].label() != rhs[i].symbol(): return False else: if isinstance(rhs[i], Nonterminal): return False if rightmost_stack[i] != rhs[i]: return False return True def _reduce(self, stack, remaining_text, production=None): """ Find a CFG production whose right hand side matches the rightmost stack elements; and combine those stack elements into a single Tree, with the node specified by the production's left-hand side. If more than one CFG production matches the stack, then use the production that is listed earliest in the grammar. The new Tree replaces the elements in the stack. :rtype: Production or None :return: If a reduction is performed, then return the CFG production that the reduction is based on; otherwise, return false. :type stack: list(string and Tree) :param stack: A list of strings and Trees, encoding the structure of the text that has been parsed so far. :type remaining_text: list(str) :param remaining_text: The portion of the text that is not yet covered by ``stack``. """ if production is None: productions = self._grammar.productions() else: productions = [production] # Try each production, in order. 
for production in productions: rhslen = len(production.rhs()) # check if the RHS of a production matches the top of the stack if self._match_rhs(production.rhs(), stack[-rhslen:]): # combine the tree to reflect the reduction tree = Tree(production.lhs().symbol(), stack[-rhslen:]) stack[-rhslen:] = [tree] # We reduced something if self._trace: self._trace_reduce(stack, production, remaining_text) return production # We didn't reduce anything return None def trace(self, trace=2): """ Set the level of tracing output that should be generated when parsing a text. :type trace: int :param trace: The trace level. A trace level of ``0`` will generate no tracing output; and higher trace levels will produce more verbose tracing output. :rtype: None """ # 1: just show shifts. # 2: show shifts & reduces # 3: display which tokens & productions are shifed/reduced self._trace = trace def _trace_stack(self, stack, remaining_text, marker=" "): """ Print trace output displaying the given stack and text. :rtype: None :param marker: A character that is printed to the left of the stack. This is used with trace level 2 to print 'S' before shifted stacks and 'R' before reduced stacks. """ s = " " + marker + " [ " for elt in stack: if isinstance(elt, Tree): s += repr(Nonterminal(elt.label())) + " " else: s += repr(elt) + " " s += "* " + " ".join(remaining_text) + "]" print(s) def _trace_shift(self, stack, remaining_text): """ Print trace output displaying that a token has been shifted. :rtype: None """ if self._trace > 2: print("Shift %r:" % stack[-1]) if self._trace == 2: self._trace_stack(stack, remaining_text, "S") elif self._trace > 0: self._trace_stack(stack, remaining_text) def _trace_reduce(self, stack, production, remaining_text): """ Print trace output displaying that ``production`` was used to reduce ``stack``. :rtype: None """ if self._trace > 2: rhs = " ".join(production.rhs()) print(f"Reduce {production.lhs()!r} <- {rhs}") if self._trace == 2: self._trace_stack(stack, remaining_text, "R") elif self._trace > 1: self._trace_stack(stack, remaining_text) def _check_grammar(self): """ Check to make sure that all of the CFG productions are potentially useful. If any productions can never be used, then print a warning. :rtype: None """ productions = self._grammar.productions() # Any production whose RHS is an extension of another production's RHS # will never be used. for i in range(len(productions)): for j in range(i + 1, len(productions)): rhs1 = productions[i].rhs() rhs2 = productions[j].rhs() if rhs1[: len(rhs2)] == rhs2: print("Warning: %r will never be used" % productions[i]) ##////////////////////////////////////////////////////// ## Stepping Shift/Reduce Parser ##////////////////////////////////////////////////////// class SteppingShiftReduceParser(ShiftReduceParser): """ A ``ShiftReduceParser`` that allows you to setp through the parsing process, performing a single operation at a time. It also allows you to change the parser's grammar midway through parsing a text. The ``initialize`` method is used to start parsing a text. ``shift`` performs a single shift operation, and ``reduce`` performs a single reduce operation. ``step`` will perform a single reduce operation if possible; otherwise, it will perform a single shift operation. ``parses`` returns the set of parses that have been found by the parser. :ivar _history: A list of ``(stack, remaining_text)`` pairs, containing all of the previous states of the parser. This history is used to implement the ``undo`` operation. 
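# --- Stepping usage sketch (illustrative; the helper name is invented) -------
# step() performs a reduction whenever one is available and otherwise shifts,
# mirroring the heuristic of the parent class; undo() rolls the parser back
# one operation at a time.
def _stepping_sr_sketch(grammar, tokens):
    parser = SteppingShiftReduceParser(grammar)
    parser.initialize(list(tokens))
    while parser.step():  # False once neither a reduce nor a shift applies
        print(parser.stack(), parser.remaining_text())
    return list(parser.parses())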
:see: ``nltk.grammar`` """ def __init__(self, grammar, trace=0): super().__init__(grammar, trace) self._stack = None self._remaining_text = None self._history = [] def parse(self, tokens): tokens = list(tokens) self.initialize(tokens) while self.step(): pass return self.parses() def stack(self): """ :return: The parser's stack. :rtype: list(str and Tree) """ return self._stack def remaining_text(self): """ :return: The portion of the text that is not yet covered by the stack. :rtype: list(str) """ return self._remaining_text def initialize(self, tokens): """ Start parsing a given text. This sets the parser's stack to ``[]`` and sets its remaining text to ``tokens``. """ self._stack = [] self._remaining_text = tokens self._history = [] def step(self): """ Perform a single parsing operation. If a reduction is possible, then perform that reduction, and return the production that it is based on. Otherwise, if a shift is possible, then perform it, and return True. Otherwise, return False. :return: False if no operation was performed; True if a shift was performed; and the CFG production used to reduce if a reduction was performed. :rtype: Production or bool """ return self.reduce() or self.shift() def shift(self): """ Move a token from the beginning of the remaining text to the end of the stack. If there are no more tokens in the remaining text, then do nothing. :return: True if the shift operation was successful. :rtype: bool """ if len(self._remaining_text) == 0: return False self._history.append((self._stack[:], self._remaining_text[:])) self._shift(self._stack, self._remaining_text) return True def reduce(self, production=None): """ Use ``production`` to combine the rightmost stack elements into a single Tree. If ``production`` does not match the rightmost stack elements, then do nothing. :return: The production used to reduce the stack, if a reduction was performed. If no reduction was performed, return None. :rtype: Production or None """ self._history.append((self._stack[:], self._remaining_text[:])) return_val = self._reduce(self._stack, self._remaining_text, production) if not return_val: self._history.pop() return return_val def undo(self): """ Return the parser to its state before the most recent shift or reduce operation. Calling ``undo`` repeatedly return the parser to successively earlier states. If no shift or reduce operations have been performed, ``undo`` will make no changes. :return: true if an operation was successfully undone. :rtype: bool """ if len(self._history) == 0: return False (self._stack, self._remaining_text) = self._history.pop() return True def reducible_productions(self): """ :return: A list of the productions for which reductions are available for the current parser state. :rtype: list(Production) """ productions = [] for production in self._grammar.productions(): rhslen = len(production.rhs()) if self._match_rhs(production.rhs(), self._stack[-rhslen:]): productions.append(production) return productions def parses(self): """ :return: An iterator of the parses that have been found by this parser so far. :rtype: iter(Tree) """ if ( len(self._remaining_text) == 0 and len(self._stack) == 1 and self._stack[0].label() == self._grammar.start().symbol() ): yield self._stack[0] # copied from nltk.parser def set_grammar(self, grammar): """ Change the grammar used to parse texts. :param grammar: The new grammar. 
:type grammar: CFG """ self._grammar = grammar ##////////////////////////////////////////////////////// ## Demonstration Code ##////////////////////////////////////////////////////// def demo(): """ A demonstration of the shift-reduce parser. """ from nltk import CFG, parse grammar = CFG.fromstring( """ S -> NP VP NP -> Det N | Det N PP VP -> V NP | V NP PP PP -> P NP NP -> 'I' N -> 'man' | 'park' | 'telescope' | 'dog' Det -> 'the' | 'a' P -> 'in' | 'with' V -> 'saw' """ ) sent = "I saw a man in the park".split() parser = parse.ShiftReduceParser(grammar, trace=2) for p in parser.parse(sent): print(p) if __name__ == "__main__": demo() nltk-3.7/nltk/parse/stanford.py000066400000000000000000000444111420073152400166110ustar00rootroot00000000000000# Natural Language Toolkit: Interface to the Stanford Parser # # Copyright (C) 2001-2022 NLTK Project # Author: Steven Xu # # URL: # For license information, see LICENSE.TXT import os import tempfile import warnings from subprocess import PIPE from nltk.internals import ( _java_options, config_java, find_jar_iter, find_jars_within_path, java, ) from nltk.parse.api import ParserI from nltk.parse.dependencygraph import DependencyGraph from nltk.tree import Tree _stanford_url = "https://nlp.stanford.edu/software/lex-parser.shtml" class GenericStanfordParser(ParserI): """Interface to the Stanford Parser""" _MODEL_JAR_PATTERN = r"stanford-parser-(\d+)(\.(\d+))+-models\.jar" _JAR = r"stanford-parser\.jar" _MAIN_CLASS = "edu.stanford.nlp.parser.lexparser.LexicalizedParser" _USE_STDIN = False _DOUBLE_SPACED_OUTPUT = False def __init__( self, path_to_jar=None, path_to_models_jar=None, model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz", encoding="utf8", verbose=False, java_options="-mx4g", corenlp_options="", ): # find the most recent code and model jar stanford_jar = max( find_jar_iter( self._JAR, path_to_jar, env_vars=("STANFORD_PARSER", "STANFORD_CORENLP"), searchpath=(), url=_stanford_url, verbose=verbose, is_regex=True, ), key=lambda model_path: os.path.dirname(model_path), ) model_jar = max( find_jar_iter( self._MODEL_JAR_PATTERN, path_to_models_jar, env_vars=("STANFORD_MODELS", "STANFORD_CORENLP"), searchpath=(), url=_stanford_url, verbose=verbose, is_regex=True, ), key=lambda model_path: os.path.dirname(model_path), ) # self._classpath = (stanford_jar, model_jar) # Adding logging jar files to classpath stanford_dir = os.path.split(stanford_jar)[0] self._classpath = tuple([model_jar] + find_jars_within_path(stanford_dir)) self.model_path = model_path self._encoding = encoding self.corenlp_options = corenlp_options self.java_options = java_options def _parse_trees_output(self, output_): res = [] cur_lines = [] cur_trees = [] blank = False for line in output_.splitlines(False): if line == "": if blank: res.append(iter(cur_trees)) cur_trees = [] blank = False elif self._DOUBLE_SPACED_OUTPUT: cur_trees.append(self._make_tree("\n".join(cur_lines))) cur_lines = [] blank = True else: res.append(iter([self._make_tree("\n".join(cur_lines))])) cur_lines = [] else: cur_lines.append(line) blank = False return iter(res) def parse_sents(self, sentences, verbose=False): """ Use StanfordParser to parse multiple sentences. Takes multiple sentences as a list where each sentence is a list of words. Each sentence will be automatically tagged with this StanfordParser instance's tagger. If whitespaces exists inside a token, then the token will be treated as separate tokens. 
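# --- Usage sketch (illustrative; the helper name is invented) ----------------
# parse_sents() expects pre-tokenized sentences.  Running this needs a local
# Stanford Parser installation, found either through explicit jar paths or
# through the STANFORD_PARSER / STANFORD_MODELS (or STANFORD_CORENLP)
# environment variables; note that the concrete StanfordParser subclass
# defined later in this module is deprecated in favor of
# nltk.parse.corenlp.CoreNLPParser.
def _stanford_parse_sents_sketch():
    parser = StanfordParser(
        model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
    )
    sentences = [
        "the quick brown fox jumps over the lazy dog".split(),
        "the quick grey wolf jumps over the lazy fox".split(),
    ]
    return [list(trees) for trees in parser.parse_sents(sentences)]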
:param sentences: Input sentences to parse :type sentences: list(list(str)) :rtype: iter(iter(Tree)) """ cmd = [ self._MAIN_CLASS, "-model", self.model_path, "-sentences", "newline", "-outputFormat", self._OUTPUT_FORMAT, "-tokenized", "-escaper", "edu.stanford.nlp.process.PTBEscapingProcessor", ] return self._parse_trees_output( self._execute( cmd, "\n".join(" ".join(sentence) for sentence in sentences), verbose ) ) def raw_parse(self, sentence, verbose=False): """ Use StanfordParser to parse a sentence. Takes a sentence as a string; before parsing, it will be automatically tokenized and tagged by the Stanford Parser. :param sentence: Input sentence to parse :type sentence: str :rtype: iter(Tree) """ return next(self.raw_parse_sents([sentence], verbose)) def raw_parse_sents(self, sentences, verbose=False): """ Use StanfordParser to parse multiple sentences. Takes multiple sentences as a list of strings. Each sentence will be automatically tokenized and tagged by the Stanford Parser. :param sentences: Input sentences to parse :type sentences: list(str) :rtype: iter(iter(Tree)) """ cmd = [ self._MAIN_CLASS, "-model", self.model_path, "-sentences", "newline", "-outputFormat", self._OUTPUT_FORMAT, ] return self._parse_trees_output( self._execute(cmd, "\n".join(sentences), verbose) ) def tagged_parse(self, sentence, verbose=False): """ Use StanfordParser to parse a sentence. Takes a sentence as a list of (word, tag) tuples; the sentence must have already been tokenized and tagged. :param sentence: Input sentence to parse :type sentence: list(tuple(str, str)) :rtype: iter(Tree) """ return next(self.tagged_parse_sents([sentence], verbose)) def tagged_parse_sents(self, sentences, verbose=False): """ Use StanfordParser to parse multiple sentences. Takes multiple sentences where each sentence is a list of (word, tag) tuples. The sentences must have already been tokenized and tagged. :param sentences: Input sentences to parse :type sentences: list(list(tuple(str, str))) :rtype: iter(iter(Tree)) """ tag_separator = "/" cmd = [ self._MAIN_CLASS, "-model", self.model_path, "-sentences", "newline", "-outputFormat", self._OUTPUT_FORMAT, "-tokenized", "-tagSeparator", tag_separator, "-tokenizerFactory", "edu.stanford.nlp.process.WhitespaceTokenizer", "-tokenizerMethod", "newCoreLabelTokenizerFactory", ] # We don't need to escape slashes as "splitting is done on the last instance of the character in the token" return self._parse_trees_output( self._execute( cmd, "\n".join( " ".join(tag_separator.join(tagged) for tagged in sentence) for sentence in sentences ), verbose, ) ) def _execute(self, cmd, input_, verbose=False): encoding = self._encoding cmd.extend(["-encoding", encoding]) if self.corenlp_options: cmd.extend(self.corenlp_options.split()) default_options = " ".join(_java_options) # Configure java. config_java(options=self.java_options, verbose=verbose) # Windows is incompatible with NamedTemporaryFile() without passing in delete=False. with tempfile.NamedTemporaryFile(mode="wb", delete=False) as input_file: # Write the actual sentences to the temporary input file if isinstance(input_, str) and encoding: input_ = input_.encode(encoding) input_file.write(input_) input_file.flush() # Run the tagger and get the output. 
if self._USE_STDIN: input_file.seek(0) stdout, stderr = java( cmd, classpath=self._classpath, stdin=input_file, stdout=PIPE, stderr=PIPE, ) else: cmd.append(input_file.name) stdout, stderr = java( cmd, classpath=self._classpath, stdout=PIPE, stderr=PIPE ) stdout = stdout.replace(b"\xc2\xa0", b" ") stdout = stdout.replace(b"\x00\xa0", b" ") stdout = stdout.decode(encoding) os.unlink(input_file.name) # Return java configurations to their default values. config_java(options=default_options, verbose=False) return stdout class StanfordParser(GenericStanfordParser): """ >>> parser=StanfordParser( ... model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz" ... ) >>> list(parser.raw_parse("the quick brown fox jumps over the lazy dog")) # doctest: +NORMALIZE_WHITESPACE [Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']), Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']), Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])])] >>> sum([list(dep_graphs) for dep_graphs in parser.raw_parse_sents(( ... "the quick brown fox jumps over the lazy dog", ... "the quick grey wolf jumps over the lazy fox" ... ))], []) # doctest: +NORMALIZE_WHITESPACE [Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']), Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']), Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])]), Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['grey']), Tree('NN', ['wolf'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']), Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['fox'])])])])])])] >>> sum([list(dep_graphs) for dep_graphs in parser.parse_sents(( ... "I 'm a dog".split(), ... "This is my friends ' cat ( the tabby )".split(), ... ))], []) # doctest: +NORMALIZE_WHITESPACE [Tree('ROOT', [Tree('S', [Tree('NP', [Tree('PRP', ['I'])]), Tree('VP', [Tree('VBP', ["'m"]), Tree('NP', [Tree('DT', ['a']), Tree('NN', ['dog'])])])])]), Tree('ROOT', [Tree('S', [Tree('NP', [Tree('DT', ['This'])]), Tree('VP', [Tree('VBZ', ['is']), Tree('NP', [Tree('NP', [Tree('NP', [Tree('PRP$', ['my']), Tree('NNS', ['friends']), Tree('POS', ["'"])]), Tree('NN', ['cat'])]), Tree('PRN', [Tree('-LRB-', [Tree('', []), Tree('NP', [Tree('DT', ['the']), Tree('NN', ['tabby'])]), Tree('-RRB-', [])])])])])])])] >>> sum([list(dep_graphs) for dep_graphs in parser.tagged_parse_sents(( ... ( ... ("The", "DT"), ... ("quick", "JJ"), ... ("brown", "JJ"), ... ("fox", "NN"), ... ("jumped", "VBD"), ... ("over", "IN"), ... ("the", "DT"), ... ("lazy", "JJ"), ... ("dog", "NN"), ... (".", "."), ... ), ... 
))],[]) # doctest: +NORMALIZE_WHITESPACE [Tree('ROOT', [Tree('S', [Tree('NP', [Tree('DT', ['The']), Tree('JJ', ['quick']), Tree('JJ', ['brown']), Tree('NN', ['fox'])]), Tree('VP', [Tree('VBD', ['jumped']), Tree('PP', [Tree('IN', ['over']), Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])]), Tree('.', ['.'])])])] """ _OUTPUT_FORMAT = "penn" def __init__(self, *args, **kwargs): warnings.warn( "The StanfordParser will be deprecated\n" "Please use \033[91mnltk.parse.corenlp.CoreNLPParser\033[0m instead.", DeprecationWarning, stacklevel=2, ) super().__init__(*args, **kwargs) def _make_tree(self, result): return Tree.fromstring(result) class StanfordDependencyParser(GenericStanfordParser): """ >>> dep_parser=StanfordDependencyParser( ... model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz" ... ) >>> [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])])] >>> [list(parse.triples()) for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE [[((u'jumps', u'VBZ'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', (u'The', u'DT')), ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), u'amod', (u'brown', u'JJ')), ((u'jumps', u'VBZ'), u'nmod', (u'dog', u'NN')), ((u'dog', u'NN'), u'case', (u'over', u'IN')), ((u'dog', u'NN'), u'det', (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ'))]] >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.raw_parse_sents(( ... "The quick brown fox jumps over the lazy dog.", ... "The quick grey wolf jumps over the lazy fox." ... ))], []) # doctest: +NORMALIZE_WHITESPACE [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])]), Tree('jumps', [Tree('wolf', ['The', 'quick', 'grey']), Tree('fox', ['over', 'the', 'lazy'])])] >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.parse_sents(( ... "I 'm a dog".split(), ... "This is my friends ' cat ( the tabby )".split(), ... ))], []) # doctest: +NORMALIZE_WHITESPACE [Tree('dog', ['I', "'m", 'a']), Tree('cat', ['This', 'is', Tree('friends', ['my', "'"]), Tree('tabby', ['the'])])] >>> sum([[list(parse.triples()) for parse in dep_graphs] for dep_graphs in dep_parser.tagged_parse_sents(( ... ( ... ("The", "DT"), ... ("quick", "JJ"), ... ("brown", "JJ"), ... ("fox", "NN"), ... ("jumped", "VBD"), ... ("over", "IN"), ... ("the", "DT"), ... ("lazy", "JJ"), ... ("dog", "NN"), ... (".", "."), ... ), ... 
))],[]) # doctest: +NORMALIZE_WHITESPACE [[((u'jumped', u'VBD'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', (u'The', u'DT')), ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), u'amod', (u'brown', u'JJ')), ((u'jumped', u'VBD'), u'nmod', (u'dog', u'NN')), ((u'dog', u'NN'), u'case', (u'over', u'IN')), ((u'dog', u'NN'), u'det', (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ'))]] """ _OUTPUT_FORMAT = "conll2007" def __init__(self, *args, **kwargs): warnings.warn( "The StanfordDependencyParser will be deprecated\n" "Please use \033[91mnltk.parse.corenlp.CoreNLPDependencyParser\033[0m instead.", DeprecationWarning, stacklevel=2, ) super().__init__(*args, **kwargs) def _make_tree(self, result): return DependencyGraph(result, top_relation_label="root") class StanfordNeuralDependencyParser(GenericStanfordParser): """ >>> from nltk.parse.stanford import StanfordNeuralDependencyParser >>> dep_parser=StanfordNeuralDependencyParser(java_options='-mx4g') >>> [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy']), '.'])] >>> [list(parse.triples()) for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE [[((u'jumps', u'VBZ'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', (u'The', u'DT')), ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), u'amod', (u'brown', u'JJ')), ((u'jumps', u'VBZ'), u'nmod', (u'dog', u'NN')), ((u'dog', u'NN'), u'case', (u'over', u'IN')), ((u'dog', u'NN'), u'det', (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ')), ((u'jumps', u'VBZ'), u'punct', (u'.', u'.'))]] >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.raw_parse_sents(( ... "The quick brown fox jumps over the lazy dog.", ... "The quick grey wolf jumps over the lazy fox." ... ))], []) # doctest: +NORMALIZE_WHITESPACE [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy']), '.']), Tree('jumps', [Tree('wolf', ['The', 'quick', 'grey']), Tree('fox', ['over', 'the', 'lazy']), '.'])] >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.parse_sents(( ... "I 'm a dog".split(), ... "This is my friends ' cat ( the tabby )".split(), ... ))], []) # doctest: +NORMALIZE_WHITESPACE [Tree('dog', ['I', "'m", 'a']), Tree('cat', ['This', 'is', Tree('friends', ['my', "'"]), Tree('tabby', ['-LRB-', 'the', '-RRB-'])])] """ _OUTPUT_FORMAT = "conll" _MAIN_CLASS = "edu.stanford.nlp.pipeline.StanfordCoreNLP" _JAR = r"stanford-corenlp-(\d+)(\.(\d+))+\.jar" _MODEL_JAR_PATTERN = r"stanford-corenlp-(\d+)(\.(\d+))+-models\.jar" _USE_STDIN = True _DOUBLE_SPACED_OUTPUT = True def __init__(self, *args, **kwargs): warnings.warn( "The StanfordNeuralDependencyParser will be deprecated\n" "Please use \033[91mnltk.parse.corenlp.CoreNLPDependencyParser\033[0m instead.", DeprecationWarning, stacklevel=2, ) super().__init__(*args, **kwargs) self.corenlp_options += "-annotators tokenize,ssplit,pos,depparse" def tagged_parse_sents(self, sentences, verbose=False): """ Currently unimplemented because the neural dependency parser (and the StanfordCoreNLP pipeline class) doesn't support passing in pre- tagged tokens. """ raise NotImplementedError( "tagged_parse[_sents] is not supported by " "StanfordNeuralDependencyParser; use " "parse[_sents] or raw_parse[_sents] instead." 
) def _make_tree(self, result): return DependencyGraph(result, top_relation_label="ROOT") nltk-3.7/nltk/parse/test.cfg000066400000000000000000000004361420073152400160560ustar00rootroot00000000000000%start S S[sem=] -> NP[sem=?subj] VP[sem=?vp] VP[sem = ] -> V[sem = ?v] NP[sem=?obj] VP[sem = ?v] -> V[sem = ?v] NP[sem = ] -> 'Kim' NP[sem = ] -> 'I' V[sem = <\x y.(like x y)>, tns=pres] -> 'like' V[sem = <\x.(sleeps x)>, tns=pres] -> 'sleeps' nltk-3.7/nltk/parse/transitionparser.py000066400000000000000000000753001420073152400204010ustar00rootroot00000000000000# Natural Language Toolkit: Arc-Standard and Arc-eager Transition Based Parsers # # Author: Long Duong # # Copyright (C) 2001-2022 NLTK Project # URL: # For license information, see LICENSE.TXT import pickle import tempfile from copy import deepcopy from operator import itemgetter from os import remove try: from numpy import array from scipy import sparse from sklearn import svm from sklearn.datasets import load_svmlight_file except ImportError: pass from nltk.parse import DependencyEvaluator, DependencyGraph, ParserI class Configuration: """ Class for holding configuration which is the partial analysis of the input sentence. The transition based parser aims at finding set of operators that transfer the initial configuration to the terminal configuration. The configuration includes: - Stack: for storing partially proceeded words - Buffer: for storing remaining input words - Set of arcs: for storing partially built dependency tree This class also provides a method to represent a configuration as list of features. """ def __init__(self, dep_graph): """ :param dep_graph: the representation of an input in the form of dependency graph. :type dep_graph: DependencyGraph where the dependencies are not specified. """ # dep_graph.nodes contain list of token for a sentence self.stack = [0] # The root element self.buffer = list(range(1, len(dep_graph.nodes))) # The rest is in the buffer self.arcs = [] # empty set of arc self._tokens = dep_graph.nodes self._max_address = len(self.buffer) def __str__(self): return ( "Stack : " + str(self.stack) + " Buffer : " + str(self.buffer) + " Arcs : " + str(self.arcs) ) def _check_informative(self, feat, flag=False): """ Check whether a feature is informative The flag control whether "_" is informative or not """ if feat is None: return False if feat == "": return False if flag is False: if feat == "_": return False return True def extract_features(self): """ Extract the set of features for the current configuration. Implement standard features as describe in Table 3.2 (page 31) in Dependency Parsing book by Sandra Kubler, Ryan McDonal, Joakim Nivre. Please note that these features are very basic. :return: list(str) """ result = [] # Todo : can come up with more complicated features set for better # performance. 
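# For example, if the top of the stack holds the word "news" tagged NN and
# the first buffer word is "suffered" tagged VBD, the list assembled below
# would contain strings such as "STK_0_FORM_news", "STK_0_POS_NN",
# "BUF_0_FORM_suffered" and "BUF_0_POS_VBD", plus STK_0_LDEP_/STK_0_RDEP_
# and BUF_0_LDEP_/BUF_0_RDEP_ features for any arcs already attached to those
# two words.  Each string is later mapped to a sparse binary feature
# ("featureID:1.0") by _convert_to_binary_features().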
if len(self.stack) > 0: # Stack 0 stack_idx0 = self.stack[len(self.stack) - 1] token = self._tokens[stack_idx0] if self._check_informative(token["word"], True): result.append("STK_0_FORM_" + token["word"]) if "lemma" in token and self._check_informative(token["lemma"]): result.append("STK_0_LEMMA_" + token["lemma"]) if self._check_informative(token["tag"]): result.append("STK_0_POS_" + token["tag"]) if "feats" in token and self._check_informative(token["feats"]): feats = token["feats"].split("|") for feat in feats: result.append("STK_0_FEATS_" + feat) # Stack 1 if len(self.stack) > 1: stack_idx1 = self.stack[len(self.stack) - 2] token = self._tokens[stack_idx1] if self._check_informative(token["tag"]): result.append("STK_1_POS_" + token["tag"]) # Left most, right most dependency of stack[0] left_most = 1000000 right_most = -1 dep_left_most = "" dep_right_most = "" for (wi, r, wj) in self.arcs: if wi == stack_idx0: if (wj > wi) and (wj > right_most): right_most = wj dep_right_most = r if (wj < wi) and (wj < left_most): left_most = wj dep_left_most = r if self._check_informative(dep_left_most): result.append("STK_0_LDEP_" + dep_left_most) if self._check_informative(dep_right_most): result.append("STK_0_RDEP_" + dep_right_most) # Check Buffered 0 if len(self.buffer) > 0: # Buffer 0 buffer_idx0 = self.buffer[0] token = self._tokens[buffer_idx0] if self._check_informative(token["word"], True): result.append("BUF_0_FORM_" + token["word"]) if "lemma" in token and self._check_informative(token["lemma"]): result.append("BUF_0_LEMMA_" + token["lemma"]) if self._check_informative(token["tag"]): result.append("BUF_0_POS_" + token["tag"]) if "feats" in token and self._check_informative(token["feats"]): feats = token["feats"].split("|") for feat in feats: result.append("BUF_0_FEATS_" + feat) # Buffer 1 if len(self.buffer) > 1: buffer_idx1 = self.buffer[1] token = self._tokens[buffer_idx1] if self._check_informative(token["word"], True): result.append("BUF_1_FORM_" + token["word"]) if self._check_informative(token["tag"]): result.append("BUF_1_POS_" + token["tag"]) if len(self.buffer) > 2: buffer_idx2 = self.buffer[2] token = self._tokens[buffer_idx2] if self._check_informative(token["tag"]): result.append("BUF_2_POS_" + token["tag"]) if len(self.buffer) > 3: buffer_idx3 = self.buffer[3] token = self._tokens[buffer_idx3] if self._check_informative(token["tag"]): result.append("BUF_3_POS_" + token["tag"]) # Left most, right most dependency of stack[0] left_most = 1000000 right_most = -1 dep_left_most = "" dep_right_most = "" for (wi, r, wj) in self.arcs: if wi == buffer_idx0: if (wj > wi) and (wj > right_most): right_most = wj dep_right_most = r if (wj < wi) and (wj < left_most): left_most = wj dep_left_most = r if self._check_informative(dep_left_most): result.append("BUF_0_LDEP_" + dep_left_most) if self._check_informative(dep_right_most): result.append("BUF_0_RDEP_" + dep_right_most) return result class Transition: """ This class defines a set of transition which is applied to a configuration to get another configuration Note that for different parsing algorithm, the transition is different. """ # Define set of transitions LEFT_ARC = "LEFTARC" RIGHT_ARC = "RIGHTARC" SHIFT = "SHIFT" REDUCE = "REDUCE" def __init__(self, alg_option): """ :param alg_option: the algorithm option of this parser. 
Currently support `arc-standard` and `arc-eager` algorithm :type alg_option: str """ self._algo = alg_option if alg_option not in [ TransitionParser.ARC_STANDARD, TransitionParser.ARC_EAGER, ]: raise ValueError( " Currently we only support %s and %s " % (TransitionParser.ARC_STANDARD, TransitionParser.ARC_EAGER) ) def left_arc(self, conf, relation): """ Note that the algorithm for left-arc is quite similar except for precondition for both arc-standard and arc-eager :param configuration: is the current configuration :return: A new configuration or -1 if the pre-condition is not satisfied """ if (len(conf.buffer) <= 0) or (len(conf.stack) <= 0): return -1 if conf.buffer[0] == 0: # here is the Root element return -1 idx_wi = conf.stack[len(conf.stack) - 1] flag = True if self._algo == TransitionParser.ARC_EAGER: for (idx_parent, r, idx_child) in conf.arcs: if idx_child == idx_wi: flag = False if flag: conf.stack.pop() idx_wj = conf.buffer[0] conf.arcs.append((idx_wj, relation, idx_wi)) else: return -1 def right_arc(self, conf, relation): """ Note that the algorithm for right-arc is DIFFERENT for arc-standard and arc-eager :param configuration: is the current configuration :return: A new configuration or -1 if the pre-condition is not satisfied """ if (len(conf.buffer) <= 0) or (len(conf.stack) <= 0): return -1 if self._algo == TransitionParser.ARC_STANDARD: idx_wi = conf.stack.pop() idx_wj = conf.buffer[0] conf.buffer[0] = idx_wi conf.arcs.append((idx_wi, relation, idx_wj)) else: # arc-eager idx_wi = conf.stack[len(conf.stack) - 1] idx_wj = conf.buffer.pop(0) conf.stack.append(idx_wj) conf.arcs.append((idx_wi, relation, idx_wj)) def reduce(self, conf): """ Note that the algorithm for reduce is only available for arc-eager :param configuration: is the current configuration :return: A new configuration or -1 if the pre-condition is not satisfied """ if self._algo != TransitionParser.ARC_EAGER: return -1 if len(conf.stack) <= 0: return -1 idx_wi = conf.stack[len(conf.stack) - 1] flag = False for (idx_parent, r, idx_child) in conf.arcs: if idx_child == idx_wi: flag = True if flag: conf.stack.pop() # reduce it else: return -1 def shift(self, conf): """ Note that the algorithm for shift is the SAME for arc-standard and arc-eager :param configuration: is the current configuration :return: A new configuration or -1 if the pre-condition is not satisfied """ if len(conf.buffer) <= 0: return -1 idx_wi = conf.buffer.pop(0) conf.stack.append(idx_wi) class TransitionParser(ParserI): """ Class for transition based parser. Implement 2 algorithms which are "arc-standard" and "arc-eager" """ ARC_STANDARD = "arc-standard" ARC_EAGER = "arc-eager" def __init__(self, algorithm): """ :param algorithm: the algorithm option of this parser. 
Currently support `arc-standard` and `arc-eager` algorithm :type algorithm: str """ if not (algorithm in [self.ARC_STANDARD, self.ARC_EAGER]): raise ValueError( " Currently we only support %s and %s " % (self.ARC_STANDARD, self.ARC_EAGER) ) self._algorithm = algorithm self._dictionary = {} self._transition = {} self._match_transition = {} def _get_dep_relation(self, idx_parent, idx_child, depgraph): p_node = depgraph.nodes[idx_parent] c_node = depgraph.nodes[idx_child] if c_node["word"] is None: return None # Root word if c_node["head"] == p_node["address"]: return c_node["rel"] else: return None def _convert_to_binary_features(self, features): """ :param features: list of feature string which is needed to convert to binary features :type features: list(str) :return : string of binary features in libsvm format which is 'featureID:value' pairs """ unsorted_result = [] for feature in features: self._dictionary.setdefault(feature, len(self._dictionary)) unsorted_result.append(self._dictionary[feature]) # Default value of each feature is 1.0 return " ".join( str(featureID) + ":1.0" for featureID in sorted(unsorted_result) ) def _is_projective(self, depgraph): arc_list = [] for key in depgraph.nodes: node = depgraph.nodes[key] if "head" in node: childIdx = node["address"] parentIdx = node["head"] if parentIdx is not None: arc_list.append((parentIdx, childIdx)) for (parentIdx, childIdx) in arc_list: # Ensure that childIdx < parentIdx if childIdx > parentIdx: temp = childIdx childIdx = parentIdx parentIdx = temp for k in range(childIdx + 1, parentIdx): for m in range(len(depgraph.nodes)): if (m < childIdx) or (m > parentIdx): if (k, m) in arc_list: return False if (m, k) in arc_list: return False return True def _write_to_file(self, key, binary_features, input_file): """ write the binary features to input file and update the transition dictionary """ self._transition.setdefault(key, len(self._transition) + 1) self._match_transition[self._transition[key]] = key input_str = str(self._transition[key]) + " " + binary_features + "\n" input_file.write(input_str.encode("utf-8")) def _create_training_examples_arc_std(self, depgraphs, input_file): """ Create the training example in the libsvm format and write it to the input_file. Reference : Page 32, Chapter 3. 
Dependency Parsing by Sandra Kubler, Ryan McDonal and Joakim Nivre (2009) """ operation = Transition(self.ARC_STANDARD) count_proj = 0 training_seq = [] for depgraph in depgraphs: if not self._is_projective(depgraph): continue count_proj += 1 conf = Configuration(depgraph) while len(conf.buffer) > 0: b0 = conf.buffer[0] features = conf.extract_features() binary_features = self._convert_to_binary_features(features) if len(conf.stack) > 0: s0 = conf.stack[len(conf.stack) - 1] # Left-arc operation rel = self._get_dep_relation(b0, s0, depgraph) if rel is not None: key = Transition.LEFT_ARC + ":" + rel self._write_to_file(key, binary_features, input_file) operation.left_arc(conf, rel) training_seq.append(key) continue # Right-arc operation rel = self._get_dep_relation(s0, b0, depgraph) if rel is not None: precondition = True # Get the max-index of buffer maxID = conf._max_address for w in range(maxID + 1): if w != b0: relw = self._get_dep_relation(b0, w, depgraph) if relw is not None: if (b0, relw, w) not in conf.arcs: precondition = False if precondition: key = Transition.RIGHT_ARC + ":" + rel self._write_to_file(key, binary_features, input_file) operation.right_arc(conf, rel) training_seq.append(key) continue # Shift operation as the default key = Transition.SHIFT self._write_to_file(key, binary_features, input_file) operation.shift(conf) training_seq.append(key) print(" Number of training examples : " + str(len(depgraphs))) print(" Number of valid (projective) examples : " + str(count_proj)) return training_seq def _create_training_examples_arc_eager(self, depgraphs, input_file): """ Create the training example in the libsvm format and write it to the input_file. Reference : 'A Dynamic Oracle for Arc-Eager Dependency Parsing' by Joav Goldberg and Joakim Nivre """ operation = Transition(self.ARC_EAGER) countProj = 0 training_seq = [] for depgraph in depgraphs: if not self._is_projective(depgraph): continue countProj += 1 conf = Configuration(depgraph) while len(conf.buffer) > 0: b0 = conf.buffer[0] features = conf.extract_features() binary_features = self._convert_to_binary_features(features) if len(conf.stack) > 0: s0 = conf.stack[len(conf.stack) - 1] # Left-arc operation rel = self._get_dep_relation(b0, s0, depgraph) if rel is not None: key = Transition.LEFT_ARC + ":" + rel self._write_to_file(key, binary_features, input_file) operation.left_arc(conf, rel) training_seq.append(key) continue # Right-arc operation rel = self._get_dep_relation(s0, b0, depgraph) if rel is not None: key = Transition.RIGHT_ARC + ":" + rel self._write_to_file(key, binary_features, input_file) operation.right_arc(conf, rel) training_seq.append(key) continue # reduce operation flag = False for k in range(s0): if self._get_dep_relation(k, b0, depgraph) is not None: flag = True if self._get_dep_relation(b0, k, depgraph) is not None: flag = True if flag: key = Transition.REDUCE self._write_to_file(key, binary_features, input_file) operation.reduce(conf) training_seq.append(key) continue # Shift operation as the default key = Transition.SHIFT self._write_to_file(key, binary_features, input_file) operation.shift(conf) training_seq.append(key) print(" Number of training examples : " + str(len(depgraphs))) print(" Number of valid (projective) examples : " + str(countProj)) return training_seq def train(self, depgraphs, modelfile, verbose=True): """ :param depgraphs : list of DependencyGraph as the training data :type depgraphs : DependencyGraph :param modelfile : file name to save the trained model :type modelfile : str """ 
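# Illustrative sketch of a typical call, mirroring the doctest in demo()
# (file name chosen arbitrarily):
#
#     parser = TransitionParser('arc-standard')
#     parser.train([gold_sent], 'temp.arcstd.model', verbose=False)
#
# The temporary file written below holds one libsvm-format line per oracle
# transition, e.g. "2 1:1.0 5:1.0 9:1.0" (ids and feature numbers are
# arbitrary examples), where the leading integer is the transition id stored
# in self._transition and each "featureID:1.0" pair comes from
# _convert_to_binary_features().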
try: input_file = tempfile.NamedTemporaryFile( prefix="transition_parse.train", dir=tempfile.gettempdir(), delete=False ) if self._algorithm == self.ARC_STANDARD: self._create_training_examples_arc_std(depgraphs, input_file) else: self._create_training_examples_arc_eager(depgraphs, input_file) input_file.close() # Using the temporary file to train the libsvm classifier x_train, y_train = load_svmlight_file(input_file.name) # The parameter is set according to the paper: # Algorithms for Deterministic Incremental Dependency Parsing by Joakim Nivre # Todo : because of probability = True => very slow due to # cross-validation. Need to improve the speed here model = svm.SVC( kernel="poly", degree=2, coef0=0, gamma=0.2, C=0.5, verbose=verbose, probability=True, ) model.fit(x_train, y_train) # Save the model to file name (as pickle) pickle.dump(model, open(modelfile, "wb")) finally: remove(input_file.name) def parse(self, depgraphs, modelFile): """ :param depgraphs: the list of test sentence, each sentence is represented as a dependency graph where the 'head' information is dummy :type depgraphs: list(DependencyGraph) :param modelfile: the model file :type modelfile: str :return: list (DependencyGraph) with the 'head' and 'rel' information """ result = [] # First load the model model = pickle.load(open(modelFile, "rb")) operation = Transition(self._algorithm) for depgraph in depgraphs: conf = Configuration(depgraph) while len(conf.buffer) > 0: features = conf.extract_features() col = [] row = [] data = [] for feature in features: if feature in self._dictionary: col.append(self._dictionary[feature]) row.append(0) data.append(1.0) np_col = array(sorted(col)) # NB : index must be sorted np_row = array(row) np_data = array(data) x_test = sparse.csr_matrix( (np_data, (np_row, np_col)), shape=(1, len(self._dictionary)) ) # It's best to use decision function as follow BUT it's not supported yet for sparse SVM # Using decision function to build the votes array # dec_func = model.decision_function(x_test)[0] # votes = {} # k = 0 # for i in range(len(model.classes_)): # for j in range(i+1, len(model.classes_)): # #if dec_func[k] > 0: # votes.setdefault(i,0) # votes[i] +=1 # else: # votes.setdefault(j,0) # votes[j] +=1 # k +=1 # Sort votes according to the values # sorted_votes = sorted(votes.items(), key=itemgetter(1), reverse=True) # We will use predict_proba instead of decision_function prob_dict = {} pred_prob = model.predict_proba(x_test)[0] for i in range(len(pred_prob)): prob_dict[i] = pred_prob[i] sorted_Prob = sorted(prob_dict.items(), key=itemgetter(1), reverse=True) # Note that SHIFT is always a valid operation for (y_pred_idx, confidence) in sorted_Prob: # y_pred = model.predict(x_test)[0] # From the prediction match to the operation y_pred = model.classes_[y_pred_idx] if y_pred in self._match_transition: strTransition = self._match_transition[y_pred] baseTransition = strTransition.split(":")[0] if baseTransition == Transition.LEFT_ARC: if ( operation.left_arc(conf, strTransition.split(":")[1]) != -1 ): break elif baseTransition == Transition.RIGHT_ARC: if ( operation.right_arc(conf, strTransition.split(":")[1]) != -1 ): break elif baseTransition == Transition.REDUCE: if operation.reduce(conf) != -1: break elif baseTransition == Transition.SHIFT: if operation.shift(conf) != -1: break else: raise ValueError( "The predicted transition is not recognized, expected errors" ) # Finish with operations build the dependency graph from Conf.arcs new_depgraph = deepcopy(depgraph) for key in 
new_depgraph.nodes: node = new_depgraph.nodes[key] node["rel"] = "" # With the default, all the token depend on the Root node["head"] = 0 for (head, rel, child) in conf.arcs: c_node = new_depgraph.nodes[child] c_node["head"] = head c_node["rel"] = rel result.append(new_depgraph) return result def demo(): """ >>> from nltk.parse import DependencyGraph, DependencyEvaluator >>> from nltk.parse.transitionparser import TransitionParser, Configuration, Transition >>> gold_sent = DependencyGraph(\""" ... Economic JJ 2 ATT ... news NN 3 SBJ ... has VBD 0 ROOT ... little JJ 5 ATT ... effect NN 3 OBJ ... on IN 5 ATT ... financial JJ 8 ATT ... markets NNS 6 PC ... . . 3 PU ... \""") >>> conf = Configuration(gold_sent) ###################### Check the Initial Feature ######################## >>> print(', '.join(conf.extract_features())) STK_0_POS_TOP, BUF_0_FORM_Economic, BUF_0_LEMMA_Economic, BUF_0_POS_JJ, BUF_1_FORM_news, BUF_1_POS_NN, BUF_2_POS_VBD, BUF_3_POS_JJ ###################### Check The Transition ####################### Check the Initialized Configuration >>> print(conf) Stack : [0] Buffer : [1, 2, 3, 4, 5, 6, 7, 8, 9] Arcs : [] A. Do some transition checks for ARC-STANDARD >>> operation = Transition('arc-standard') >>> operation.shift(conf) >>> operation.left_arc(conf, "ATT") >>> operation.shift(conf) >>> operation.left_arc(conf,"SBJ") >>> operation.shift(conf) >>> operation.shift(conf) >>> operation.left_arc(conf, "ATT") >>> operation.shift(conf) >>> operation.shift(conf) >>> operation.shift(conf) >>> operation.left_arc(conf, "ATT") Middle Configuration and Features Check >>> print(conf) Stack : [0, 3, 5, 6] Buffer : [8, 9] Arcs : [(2, 'ATT', 1), (3, 'SBJ', 2), (5, 'ATT', 4), (8, 'ATT', 7)] >>> print(', '.join(conf.extract_features())) STK_0_FORM_on, STK_0_LEMMA_on, STK_0_POS_IN, STK_1_POS_NN, BUF_0_FORM_markets, BUF_0_LEMMA_markets, BUF_0_POS_NNS, BUF_1_FORM_., BUF_1_POS_., BUF_0_LDEP_ATT >>> operation.right_arc(conf, "PC") >>> operation.right_arc(conf, "ATT") >>> operation.right_arc(conf, "OBJ") >>> operation.shift(conf) >>> operation.right_arc(conf, "PU") >>> operation.right_arc(conf, "ROOT") >>> operation.shift(conf) Terminated Configuration Check >>> print(conf) Stack : [0] Buffer : [] Arcs : [(2, 'ATT', 1), (3, 'SBJ', 2), (5, 'ATT', 4), (8, 'ATT', 7), (6, 'PC', 8), (5, 'ATT', 6), (3, 'OBJ', 5), (3, 'PU', 9), (0, 'ROOT', 3)] B. Do some transition checks for ARC-EAGER >>> conf = Configuration(gold_sent) >>> operation = Transition('arc-eager') >>> operation.shift(conf) >>> operation.left_arc(conf,'ATT') >>> operation.shift(conf) >>> operation.left_arc(conf,'SBJ') >>> operation.right_arc(conf,'ROOT') >>> operation.shift(conf) >>> operation.left_arc(conf,'ATT') >>> operation.right_arc(conf,'OBJ') >>> operation.right_arc(conf,'ATT') >>> operation.shift(conf) >>> operation.left_arc(conf,'ATT') >>> operation.right_arc(conf,'PC') >>> operation.reduce(conf) >>> operation.reduce(conf) >>> operation.reduce(conf) >>> operation.right_arc(conf,'PU') >>> print(conf) Stack : [0, 3, 9] Buffer : [] Arcs : [(2, 'ATT', 1), (3, 'SBJ', 2), (0, 'ROOT', 3), (5, 'ATT', 4), (3, 'OBJ', 5), (5, 'ATT', 6), (8, 'ATT', 7), (6, 'PC', 8), (3, 'PU', 9)] ###################### Check The Training Function ####################### A. 
Check the ARC-STANDARD training >>> import tempfile >>> import os >>> input_file = tempfile.NamedTemporaryFile(prefix='transition_parse.train', dir=tempfile.gettempdir(), delete=False) >>> parser_std = TransitionParser('arc-standard') >>> print(', '.join(parser_std._create_training_examples_arc_std([gold_sent], input_file))) Number of training examples : 1 Number of valid (projective) examples : 1 SHIFT, LEFTARC:ATT, SHIFT, LEFTARC:SBJ, SHIFT, SHIFT, LEFTARC:ATT, SHIFT, SHIFT, SHIFT, LEFTARC:ATT, RIGHTARC:PC, RIGHTARC:ATT, RIGHTARC:OBJ, SHIFT, RIGHTARC:PU, RIGHTARC:ROOT, SHIFT >>> parser_std.train([gold_sent],'temp.arcstd.model', verbose=False) Number of training examples : 1 Number of valid (projective) examples : 1 >>> remove(input_file.name) B. Check the ARC-EAGER training >>> input_file = tempfile.NamedTemporaryFile(prefix='transition_parse.train', dir=tempfile.gettempdir(),delete=False) >>> parser_eager = TransitionParser('arc-eager') >>> print(', '.join(parser_eager._create_training_examples_arc_eager([gold_sent], input_file))) Number of training examples : 1 Number of valid (projective) examples : 1 SHIFT, LEFTARC:ATT, SHIFT, LEFTARC:SBJ, RIGHTARC:ROOT, SHIFT, LEFTARC:ATT, RIGHTARC:OBJ, RIGHTARC:ATT, SHIFT, LEFTARC:ATT, RIGHTARC:PC, REDUCE, REDUCE, REDUCE, RIGHTARC:PU >>> parser_eager.train([gold_sent],'temp.arceager.model', verbose=False) Number of training examples : 1 Number of valid (projective) examples : 1 >>> remove(input_file.name) ###################### Check The Parsing Function ######################## A. Check the ARC-STANDARD parser >>> result = parser_std.parse([gold_sent], 'temp.arcstd.model') >>> de = DependencyEvaluator(result, [gold_sent]) >>> de.eval() >= (0, 0) True B. Check the ARC-EAGER parser >>> result = parser_eager.parse([gold_sent], 'temp.arceager.model') >>> de = DependencyEvaluator(result, [gold_sent]) >>> de.eval() >= (0, 0) True Remove test temporary files >>> remove('temp.arceager.model') >>> remove('temp.arcstd.model') Note that result is very poor because of only one training example. """ nltk-3.7/nltk/parse/util.py000066400000000000000000000202571420073152400157500ustar00rootroot00000000000000# Natural Language Toolkit: Parser Utility Functions # # Author: Ewan Klein # Tom Aarsen <> # # Copyright (C) 2001-2022 NLTK Project # URL: # For license information, see LICENSE.TXT """ Utility functions for parsers. """ from nltk.data import load from nltk.grammar import CFG, PCFG, FeatureGrammar from nltk.parse.chart import Chart, ChartParser from nltk.parse.featurechart import FeatureChart, FeatureChartParser from nltk.parse.pchart import InsideChartParser def load_parser( grammar_url, trace=0, parser=None, chart_class=None, beam_size=0, **load_args ): """ Load a grammar from a file, and build a parser based on that grammar. The parser depends on the grammar format, and might also depend on properties of the grammar itself. The following grammar formats are currently supported: - ``'cfg'`` (CFGs: ``CFG``) - ``'pcfg'`` (probabilistic CFGs: ``PCFG``) - ``'fcfg'`` (feature-based CFGs: ``FeatureGrammar``) :type grammar_url: str :param grammar_url: A URL specifying where the grammar is located. The default protocol is ``"nltk:"``, which searches for the file in the the NLTK data package. :type trace: int :param trace: The level of tracing that should be used when parsing a text. ``0`` will generate no tracing output; and higher numbers will produce more verbose tracing output. :param parser: The class used for parsing; should be ``ChartParser`` or a subclass. 
If None, the class depends on the grammar format. :param chart_class: The class used for storing the chart; should be ``Chart`` or a subclass. Only used for CFGs and feature CFGs. If None, the chart class depends on the grammar format. :type beam_size: int :param beam_size: The maximum length for the parser's edge queue. Only used for probabilistic CFGs. :param load_args: Keyword parameters used when loading the grammar. See ``data.load`` for more information. """ grammar = load(grammar_url, **load_args) if not isinstance(grammar, CFG): raise ValueError("The grammar must be a CFG, " "or a subclass thereof.") if isinstance(grammar, PCFG): if parser is None: parser = InsideChartParser return parser(grammar, trace=trace, beam_size=beam_size) elif isinstance(grammar, FeatureGrammar): if parser is None: parser = FeatureChartParser if chart_class is None: chart_class = FeatureChart return parser(grammar, trace=trace, chart_class=chart_class) else: # Plain CFG. if parser is None: parser = ChartParser if chart_class is None: chart_class = Chart return parser(grammar, trace=trace, chart_class=chart_class) def taggedsent_to_conll(sentence): """ A module to convert a single POS tagged sentence into CONLL format. >>> from nltk import word_tokenize, pos_tag >>> text = "This is a foobar sentence." >>> for line in taggedsent_to_conll(pos_tag(word_tokenize(text))): ... print(line, end="") 1 This _ DT DT _ 0 a _ _ 2 is _ VBZ VBZ _ 0 a _ _ 3 a _ DT DT _ 0 a _ _ 4 foobar _ JJ JJ _ 0 a _ _ 5 sentence _ NN NN _ 0 a _ _ 6 . _ . . _ 0 a _ _ :param sentence: A single input sentence to parse :type sentence: list(tuple(str, str)) :rtype: iter(str) :return: a generator yielding a single sentence in CONLL format. """ for (i, (word, tag)) in enumerate(sentence, start=1): input_str = [str(i), word, "_", tag, tag, "_", "0", "a", "_", "_"] input_str = "\t".join(input_str) + "\n" yield input_str def taggedsents_to_conll(sentences): """ A module to convert the a POS tagged document stream (i.e. list of list of tuples, a list of sentences) and yield lines in CONLL format. This module yields one line per word and two newlines for end of sentence. >>> from nltk import word_tokenize, sent_tokenize, pos_tag >>> text = "This is a foobar sentence. Is that right?" >>> sentences = [pos_tag(word_tokenize(sent)) for sent in sent_tokenize(text)] >>> for line in taggedsents_to_conll(sentences): ... if line: ... print(line, end="") 1 This _ DT DT _ 0 a _ _ 2 is _ VBZ VBZ _ 0 a _ _ 3 a _ DT DT _ 0 a _ _ 4 foobar _ JJ JJ _ 0 a _ _ 5 sentence _ NN NN _ 0 a _ _ 6 . _ . . _ 0 a _ _ 1 Is _ VBZ VBZ _ 0 a _ _ 2 that _ IN IN _ 0 a _ _ 3 right _ NN NN _ 0 a _ _ 4 ? _ . . _ 0 a _ _ :param sentences: Input sentences to parse :type sentence: list(list(tuple(str, str))) :rtype: iter(str) :return: a generator yielding sentences in CONLL format. """ for sentence in sentences: yield from taggedsent_to_conll(sentence) yield "\n\n" ###################################################################### # { Test Suites ###################################################################### class TestGrammar: """ Unit tests for CFG. """ def __init__(self, grammar, suite, accept=None, reject=None): self.test_grammar = grammar self.cp = load_parser(grammar, trace=0) self.suite = suite self._accept = accept self._reject = reject def run(self, show_trees=False): """ Sentences in the test suite are divided into two classes: - grammatical (``accept``) and - ungrammatical (``reject``). 
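An illustrative sketch of the expected suite structure (the sentences here are
hypothetical placeholders, not taken from the grammars shipped with NLTK):

>>> suite = [{'doc': 'transitive verbs',
...           'accept': ['Kim likes Sandy'],
...           'reject': ['Kim likes']}]

Each test provides a short description under ``'doc'`` and lists of
space-separated sentences under ``'accept'`` and ``'reject'``.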
If a sentence should parse according to the grammar, the value of ``trees`` will be a non-empty list. If a sentence should be rejected according to the grammar, then the value of ``trees`` will be None. """ for test in self.suite: print(test["doc"] + ":", end=" ") for key in ["accept", "reject"]: for sent in test[key]: tokens = sent.split() trees = list(self.cp.parse(tokens)) if show_trees and trees: print() print(sent) for tree in trees: print(tree) if key == "accept": if trees == []: raise ValueError("Sentence '%s' failed to parse'" % sent) else: accepted = True else: if trees: raise ValueError("Sentence '%s' received a parse'" % sent) else: rejected = True if accepted and rejected: print("All tests passed!") def extract_test_sentences(string, comment_chars="#%;", encoding=None): """ Parses a string with one test sentence per line. Lines can optionally begin with: - a bool, saying if the sentence is grammatical or not, or - an int, giving the number of parse trees is should have, The result information is followed by a colon, and then the sentence. Empty lines and lines beginning with a comment char are ignored. :return: a list of tuple of sentences and expected results, where a sentence is a list of str, and a result is None, or bool, or int :param comment_chars: ``str`` of possible comment characters. :param encoding: the encoding of the string, if it is binary """ if encoding is not None: string = string.decode(encoding) sentences = [] for sentence in string.split("\n"): if sentence == "" or sentence[0] in comment_chars: continue split_info = sentence.split(":", 1) result = None if len(split_info) == 2: if split_info[0] in ["True", "true", "False", "false"]: result = split_info[0] in ["True", "true"] sentence = split_info[1] else: result = int(split_info[0]) sentence = split_info[1] tokens = sentence.split() if tokens == []: continue sentences += [(tokens, result)] return sentences nltk-3.7/nltk/parse/viterbi.py000066400000000000000000000427521420073152400164430ustar00rootroot00000000000000# Natural Language Toolkit: Viterbi Probabilistic Parser # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # Steven Bird # URL: # For license information, see LICENSE.TXT from functools import reduce from nltk.parse.api import ParserI from nltk.tree import ProbabilisticTree, Tree ##////////////////////////////////////////////////////// ## Viterbi PCFG Parser ##////////////////////////////////////////////////////// class ViterbiParser(ParserI): """ A bottom-up ``PCFG`` parser that uses dynamic programming to find the single most likely parse for a text. The ``ViterbiParser`` parser parses texts by filling in a "most likely constituent table". This table records the most probable tree representation for any given span and node value. In particular, it has an entry for every start index, end index, and node value, recording the most likely subtree that spans from the start index to the end index, and has the given node value. The ``ViterbiParser`` parser fills in this table incrementally. It starts by filling in all entries for constituents that span one element of text (i.e., entries where the end index is one greater than the start index). After it has filled in all table entries for constituents that span one element of text, it fills in the entries for constitutants that span two elements of text. It continues filling in the entries for constituents spanning larger and larger portions of the text, until the entire table has been filled. 
Finally, it returns the table entry for a constituent spanning the entire text, whose node value is the grammar's start symbol. In order to find the most likely constituent with a given span and node value, the ``ViterbiParser`` parser considers all productions that could produce that node value. For each production, it finds all children that collectively cover the span and have the node values specified by the production's right hand side. If the probability of the tree formed by applying the production to the children is greater than the probability of the current entry in the table, then the table is updated with this new tree. A pseudo-code description of the algorithm used by ``ViterbiParser`` is: | Create an empty most likely constituent table, *MLC*. | For width in 1...len(text): | For start in 1...len(text)-width: | For prod in grammar.productions: | For each sequence of subtrees [t[1], t[2], ..., t[n]] in MLC, | where t[i].label()==prod.rhs[i], | and the sequence covers [start:start+width]: | old_p = MLC[start, start+width, prod.lhs] | new_p = P(t[1])P(t[1])...P(t[n])P(prod) | if new_p > old_p: | new_tree = Tree(prod.lhs, t[1], t[2], ..., t[n]) | MLC[start, start+width, prod.lhs] = new_tree | Return MLC[0, len(text), start_symbol] :type _grammar: PCFG :ivar _grammar: The grammar used to parse sentences. :type _trace: int :ivar _trace: The level of tracing output that should be generated when parsing a text. """ def __init__(self, grammar, trace=0): """ Create a new ``ViterbiParser`` parser, that uses ``grammar`` to parse texts. :type grammar: PCFG :param grammar: The grammar used to parse texts. :type trace: int :param trace: The level of tracing that should be used when parsing a text. ``0`` will generate no tracing output; and higher numbers will produce more verbose tracing output. """ self._grammar = grammar self._trace = trace def grammar(self): return self._grammar def trace(self, trace=2): """ Set the level of tracing output that should be generated when parsing a text. :type trace: int :param trace: The trace level. A trace level of ``0`` will generate no tracing output; and higher trace levels will produce more verbose tracing output. :rtype: None """ self._trace = trace def parse(self, tokens): # Inherit docs from ParserI tokens = list(tokens) self._grammar.check_coverage(tokens) # The most likely constituent table. This table specifies the # most likely constituent for a given span and type. # Constituents can be either Trees or tokens. For Trees, # the "type" is the Nonterminal for the tree's root node # value. For Tokens, the "type" is the token's type. # The table is stored as a dictionary, since it is sparse. constituents = {} # Initialize the constituents dictionary with the words from # the text. if self._trace: print("Inserting tokens into the most likely" + " constituents table...") for index in range(len(tokens)): token = tokens[index] constituents[index, index + 1, token] = token if self._trace > 1: self._trace_lexical_insertion(token, index, len(tokens)) # Consider each span of length 1, 2, ..., n; and add any trees # that might cover that span to the constituents dictionary. for length in range(1, len(tokens) + 1): if self._trace: print( "Finding the most likely constituents" + " spanning %d text elements..." 
% length ) for start in range(len(tokens) - length + 1): span = (start, start + length) self._add_constituents_spanning(span, constituents, tokens) # Return the tree that spans the entire text & have the right cat tree = constituents.get((0, len(tokens), self._grammar.start())) if tree is not None: yield tree def _add_constituents_spanning(self, span, constituents, tokens): """ Find any constituents that might cover ``span``, and add them to the most likely constituents table. :rtype: None :type span: tuple(int, int) :param span: The section of the text for which we are trying to find possible constituents. The span is specified as a pair of integers, where the first integer is the index of the first token that should be included in the constituent; and the second integer is the index of the first token that should not be included in the constituent. I.e., the constituent should cover ``text[span[0]:span[1]]``, where ``text`` is the text that we are parsing. :type constituents: dict(tuple(int,int,Nonterminal) -> ProbabilisticToken or ProbabilisticTree) :param constituents: The most likely constituents table. This table records the most probable tree representation for any given span and node value. In particular, ``constituents(s,e,nv)`` is the most likely ``ProbabilisticTree`` that covers ``text[s:e]`` and has a node value ``nv.symbol()``, where ``text`` is the text that we are parsing. When ``_add_constituents_spanning`` is called, ``constituents`` should contain all possible constituents that are shorter than ``span``. :type tokens: list of tokens :param tokens: The text we are parsing. This is only used for trace output. """ # Since some of the grammar productions may be unary, we need to # repeatedly try all of the productions until none of them add any # new constituents. changed = True while changed: changed = False # Find all ways instantiations of the grammar productions that # cover the span. instantiations = self._find_instantiations(span, constituents) # For each production instantiation, add a new # ProbabilisticTree whose probability is the product # of the childrens' probabilities and the production's # probability. for (production, children) in instantiations: subtrees = [c for c in children if isinstance(c, Tree)] p = reduce(lambda pr, t: pr * t.prob(), subtrees, production.prob()) node = production.lhs().symbol() tree = ProbabilisticTree(node, children, prob=p) # If it's new a constituent, then add it to the # constituents dictionary. c = constituents.get((span[0], span[1], production.lhs())) if self._trace > 1: if c is None or c != tree: if c is None or c.prob() < tree.prob(): print(" Insert:", end=" ") else: print(" Discard:", end=" ") self._trace_production(production, p, span, len(tokens)) if c is None or c.prob() < tree.prob(): constituents[span[0], span[1], production.lhs()] = tree changed = True def _find_instantiations(self, span, constituents): """ :return: a list of the production instantiations that cover a given span of the text. A "production instantiation" is a tuple containing a production and a list of children, where the production's right hand side matches the list of children; and the children cover ``span``. :rtype: list of ``pair`` of ``Production``, (list of (``ProbabilisticTree`` or token. :type span: tuple(int, int) :param span: The section of the text for which we are trying to find production instantiations. 
The span is specified as a pair of integers, where the first integer is the index of the first token that should be covered by the production instantiation; and the second integer is the index of the first token that should not be covered by the production instantiation. :type constituents: dict(tuple(int,int,Nonterminal) -> ProbabilisticToken or ProbabilisticTree) :param constituents: The most likely constituents table. This table records the most probable tree representation for any given span and node value. See the module documentation for more information. """ rv = [] for production in self._grammar.productions(): childlists = self._match_rhs(production.rhs(), span, constituents) for childlist in childlists: rv.append((production, childlist)) return rv def _match_rhs(self, rhs, span, constituents): """ :return: a set of all the lists of children that cover ``span`` and that match ``rhs``. :rtype: list(list(ProbabilisticTree or token) :type rhs: list(Nonterminal or any) :param rhs: The list specifying what kinds of children need to cover ``span``. Each nonterminal in ``rhs`` specifies that the corresponding child should be a tree whose node value is that nonterminal's symbol. Each terminal in ``rhs`` specifies that the corresponding child should be a token whose type is that terminal. :type span: tuple(int, int) :param span: The section of the text for which we are trying to find child lists. The span is specified as a pair of integers, where the first integer is the index of the first token that should be covered by the child list; and the second integer is the index of the first token that should not be covered by the child list. :type constituents: dict(tuple(int,int,Nonterminal) -> ProbabilisticToken or ProbabilisticTree) :param constituents: The most likely constituents table. This table records the most probable tree representation for any given span and node value. See the module documentation for more information. """ (start, end) = span # Base case if start >= end and rhs == (): return [[]] if start >= end or rhs == (): return [] # Find everything that matches the 1st symbol of the RHS childlists = [] for split in range(start, end + 1): l = constituents.get((start, split, rhs[0])) if l is not None: rights = self._match_rhs(rhs[1:], (split, end), constituents) childlists += [[l] + r for r in rights] return childlists def _trace_production(self, production, p, span, width): """ Print trace output indicating that a given production has been applied at a given location. :param production: The production that has been applied :type production: Production :param p: The probability of the tree produced by the production. :type p: float :param span: The span of the production :type span: tuple :rtype: None """ str = "|" + "." * span[0] str += "=" * (span[1] - span[0]) str += "." * (width - span[1]) + "| " str += "%s" % production if self._trace > 2: str = f"{str:<40} {p:12.10f} " print(str) def _trace_lexical_insertion(self, token, index, width): str = " Insert: |" + "." * index + "=" + "." * (width - index - 1) + "| " str += f"{token}" print(str) def __repr__(self): return "" % self._grammar ##////////////////////////////////////////////////////// ## Test Code ##////////////////////////////////////////////////////// def demo(): """ A demonstration of the probabilistic parsers. The user is prompted to select which demo to run, and how many parses should be found; and then each parser is run on the same demo, and a summary of the results are displayed. 
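A minimal non-interactive sketch of the same parser (the toy grammar below is
invented purely for illustration):

>>> from nltk.grammar import PCFG
>>> from nltk.parse import ViterbiParser
>>> grammar = PCFG.fromstring('''
... S -> NP VP [1.0]
... NP -> 'I' [1.0]
... VP -> V [1.0]
... V -> 'saw' [1.0]
... ''')
>>> trees = list(ViterbiParser(grammar).parse(['I', 'saw']))
>>> len(trees)
1
>>> trees[0].prob()
1.0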
""" import sys import time from nltk import tokenize from nltk.grammar import PCFG from nltk.parse import ViterbiParser toy_pcfg1 = PCFG.fromstring( """ S -> NP VP [1.0] NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15] Det -> 'the' [0.8] | 'my' [0.2] N -> 'man' [0.5] | 'telescope' [0.5] VP -> VP PP [0.1] | V NP [0.7] | V [0.2] V -> 'ate' [0.35] | 'saw' [0.65] PP -> P NP [1.0] P -> 'with' [0.61] | 'under' [0.39] """ ) toy_pcfg2 = PCFG.fromstring( """ S -> NP VP [1.0] VP -> V NP [.59] VP -> V [.40] VP -> VP PP [.01] NP -> Det N [.41] NP -> Name [.28] NP -> NP PP [.31] PP -> P NP [1.0] V -> 'saw' [.21] V -> 'ate' [.51] V -> 'ran' [.28] N -> 'boy' [.11] N -> 'cookie' [.12] N -> 'table' [.13] N -> 'telescope' [.14] N -> 'hill' [.5] Name -> 'Jack' [.52] Name -> 'Bob' [.48] P -> 'with' [.61] P -> 'under' [.39] Det -> 'the' [.41] Det -> 'a' [.31] Det -> 'my' [.28] """ ) # Define two demos. Each demo has a sentence and a grammar. demos = [ ("I saw the man with my telescope", toy_pcfg1), ("the boy saw Jack with Bob under the table with a telescope", toy_pcfg2), ] # Ask the user which demo they want to use. print() for i in range(len(demos)): print(f"{i + 1:>3}: {demos[i][0]}") print(" %r" % demos[i][1]) print() print("Which demo (%d-%d)? " % (1, len(demos)), end=" ") try: snum = int(sys.stdin.readline().strip()) - 1 sent, grammar = demos[snum] except: print("Bad sentence number") return # Tokenize the sentence. tokens = sent.split() parser = ViterbiParser(grammar) all_parses = {} print(f"\nsent: {sent}\nparser: {parser}\ngrammar: {grammar}") parser.trace(3) t = time.time() parses = parser.parse_all(tokens) time = time.time() - t average = ( reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses) if parses else 0 ) num_parses = len(parses) for p in parses: all_parses[p.freeze()] = 1 # Print some summary statistics print() print("Time (secs) # Parses Average P(parse)") print("-----------------------------------------") print("%11.4f%11d%19.14f" % (time, num_parses, average)) parses = all_parses.keys() if parses: p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses) else: p = 0 print("------------------------------------------") print("%11s%11d%19.14f" % ("n/a", len(parses), p)) # Ask the user if we should draw the parses. print() print("Draw parses (y/n)? ", end=" ") if sys.stdin.readline().strip().lower().startswith("y"): from nltk.draw.tree import draw_trees print(" please wait...") draw_trees(*parses) # Ask the user if we should print the parses. print() print("Print parses (y/n)? ", end=" ") if sys.stdin.readline().strip().lower().startswith("y"): for parse in parses: print(parse) if __name__ == "__main__": demo() nltk-3.7/nltk/probability.py000077500000000000000000002617241420073152400162120ustar00rootroot00000000000000# Natural Language Toolkit: Probability and Statistics # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # Steven Bird (additions) # Trevor Cohn (additions) # Peter Ljunglöf (additions) # Liang Dong (additions) # Geoffrey Sampson (additions) # Ilia Kurenkov (additions) # # URL: # For license information, see LICENSE.TXT """ Classes for representing and processing probabilistic information. The ``FreqDist`` class is used to encode "frequency distributions", which count the number of times that each outcome of an experiment occurs. The ``ProbDistI`` class defines a standard interface for "probability distributions", which encode the probability of each outcome for an experiment. 
There are two types of probability distribution: - "derived probability distributions" are created from frequency distributions. They attempt to model the probability distribution that generated the frequency distribution. - "analytic probability distributions" are created directly from parameters (such as variance). The ``ConditionalFreqDist`` class and ``ConditionalProbDistI`` interface are used to encode conditional distributions. Conditional probability distributions can be derived or analytic; but currently the only implementation of the ``ConditionalProbDistI`` interface is ``ConditionalProbDist``, a derived distribution. """ import array import math import random import warnings from abc import ABCMeta, abstractmethod from collections import Counter, defaultdict from functools import reduce from nltk.internals import raise_unorderable_types _NINF = float("-1e300") ##////////////////////////////////////////////////////// ## Frequency Distributions ##////////////////////////////////////////////////////// class FreqDist(Counter): """ A frequency distribution for the outcomes of an experiment. A frequency distribution records the number of times each outcome of an experiment has occurred. For example, a frequency distribution could be used to record the frequency of each word type in a document. Formally, a frequency distribution can be defined as a function mapping from each sample to the number of times that sample occurred as an outcome. Frequency distributions are generally constructed by running a number of experiments, and incrementing the count for a sample every time it is an outcome of an experiment. For example, the following code will produce a frequency distribution that encodes how often each word occurs in a text: >>> from nltk.tokenize import word_tokenize >>> from nltk.probability import FreqDist >>> sent = 'This is an example sentence' >>> fdist = FreqDist() >>> for word in word_tokenize(sent): ... fdist[word.lower()] += 1 An equivalent way to do this is with the initializer: >>> fdist = FreqDist(word.lower() for word in word_tokenize(sent)) """ def __init__(self, samples=None): """ Construct a new frequency distribution. If ``samples`` is given, then the frequency distribution will be initialized with the count of each object in ``samples``; otherwise, it will be initialized to be empty. In particular, ``FreqDist()`` returns an empty frequency distribution; and ``FreqDist(samples)`` first creates an empty frequency distribution, and then calls ``update`` with the list ``samples``. :param samples: The samples to initialize the frequency distribution with. :type samples: Sequence """ Counter.__init__(self, samples) # Cached number of samples in this FreqDist self._N = None def N(self): """ Return the total number of sample outcomes that have been recorded by this FreqDist. For the number of unique sample values (or bins) with counts greater than zero, use ``FreqDist.B()``. 
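For example (illustrative):

>>> FreqDist('abbb').N()
4
>>> FreqDist('abbb').B()
2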
:rtype: int """ if self._N is None: # Not already cached, or cache has been invalidated self._N = sum(self.values()) return self._N def __setitem__(self, key, val): """ Override ``Counter.__setitem__()`` to invalidate the cached N """ self._N = None super().__setitem__(key, val) def __delitem__(self, key): """ Override ``Counter.__delitem__()`` to invalidate the cached N """ self._N = None super().__delitem__(key) def update(self, *args, **kwargs): """ Override ``Counter.update()`` to invalidate the cached N """ self._N = None super().update(*args, **kwargs) def setdefault(self, key, val): """ Override ``Counter.setdefault()`` to invalidate the cached N """ self._N = None super().setdefault(key, val) def B(self): """ Return the total number of sample values (or "bins") that have counts greater than zero. For the total number of sample outcomes recorded, use ``FreqDist.N()``. (FreqDist.B() is the same as len(FreqDist).) :rtype: int """ return len(self) def hapaxes(self): """ Return a list of all samples that occur once (hapax legomena) :rtype: list """ return [item for item in self if self[item] == 1] def Nr(self, r, bins=None): return self.r_Nr(bins)[r] def r_Nr(self, bins=None): """ Return the dictionary mapping r to Nr, the number of samples with frequency r, where Nr > 0. :type bins: int :param bins: The number of possible sample outcomes. ``bins`` is used to calculate Nr(0). In particular, Nr(0) is ``bins-self.B()``. If ``bins`` is not specified, it defaults to ``self.B()`` (so Nr(0) will be 0). :rtype: int """ _r_Nr = defaultdict(int) for count in self.values(): _r_Nr[count] += 1 # Special case for Nr[0]: _r_Nr[0] = bins - self.B() if bins is not None else 0 return _r_Nr def _cumulative_frequencies(self, samples): """ Return the cumulative frequencies of the specified samples. If no samples are specified, all counts are returned, starting with the largest. :param samples: the samples whose frequencies should be returned. :type samples: any :rtype: list(float) """ cf = 0.0 for sample in samples: cf += self[sample] yield cf # slightly odd nomenclature freq() if FreqDist does counts and ProbDist does probs, # here, freq() does probs def freq(self, sample): """ Return the frequency of a given sample. The frequency of a sample is defined as the count of that sample divided by the total number of sample outcomes that have been recorded by this FreqDist. The count of a sample is defined as the number of times that sample outcome was recorded by this FreqDist. Frequencies are always real numbers in the range [0, 1]. :param sample: the sample whose frequency should be returned. :type sample: any :rtype: float """ n = self.N() if n == 0: return 0 return self[sample] / n def max(self): """ Return the sample with the greatest number of outcomes in this frequency distribution. If two or more samples have the same number of outcomes, return one of them; which sample is returned is undefined. If no outcomes have occurred in this frequency distribution, return None. :return: The sample with the maximum number of outcomes in this frequency distribution. :rtype: any or None """ if len(self) == 0: raise ValueError( "A FreqDist must have at least one sample before max is defined." ) return self.most_common(1)[0][0] def plot( self, *args, title="", cumulative=False, percents=False, show=True, **kwargs ): """ Plot samples from the frequency distribution displaying the most frequent sample first. If an integer parameter is supplied, stop after this many samples have been plotted. 
For a cumulative plot, specify cumulative=True. Additional ``**kwargs`` are passed to matplotlib's plot function. (Requires Matplotlib to be installed.) :param title: The title for the graph. :type title: str :param cumulative: Whether the plot is cumulative. (default = False) :type cumulative: bool :param percents: Whether the plot uses percents instead of counts. (default = False) :type percents: bool :param show: Whether to show the plot, or only return the ax. :type show: bool """ try: import matplotlib.pyplot as plt except ImportError as e: raise ValueError( "The plot function requires matplotlib to be installed." "See https://matplotlib.org/" ) from e if len(args) == 0: args = [len(self)] samples = [item for item, _ in self.most_common(*args)] if cumulative: freqs = list(self._cumulative_frequencies(samples)) ylabel = "Cumulative " else: freqs = [self[sample] for sample in samples] ylabel = "" if percents: freqs = [f / self.N() * 100 for f in freqs] ylabel += "Percents" else: ylabel += "Counts" ax = plt.gca() ax.grid(True, color="silver") if "linewidth" not in kwargs: kwargs["linewidth"] = 2 if title: ax.set_title(title) ax.plot(freqs, **kwargs) ax.set_xticks(range(len(samples))) ax.set_xticklabels([str(s) for s in samples], rotation=90) ax.set_xlabel("Samples") ax.set_ylabel(ylabel) if show: plt.show() return ax def tabulate(self, *args, **kwargs): """ Tabulate the given samples from the frequency distribution (cumulative), displaying the most frequent sample first. If an integer parameter is supplied, stop after this many samples have been plotted. :param samples: The samples to plot (default is all samples) :type samples: list :param cumulative: A flag to specify whether the freqs are cumulative (default = False) :type title: bool """ if len(args) == 0: args = [len(self)] samples = _get_kwarg( kwargs, "samples", [item for item, _ in self.most_common(*args)] ) cumulative = _get_kwarg(kwargs, "cumulative", False) if cumulative: freqs = list(self._cumulative_frequencies(samples)) else: freqs = [self[sample] for sample in samples] # percents = [f * 100 for f in freqs] only in ProbDist? width = max(len(f"{s}") for s in samples) width = max(width, max(len("%d" % f) for f in freqs)) for i in range(len(samples)): print("%*s" % (width, samples[i]), end=" ") print() for i in range(len(samples)): print("%*d" % (width, freqs[i]), end=" ") print() def copy(self): """ Create a copy of this frequency distribution. :rtype: FreqDist """ return self.__class__(self) # Mathematical operatiors def __add__(self, other): """ Add counts from two counters. >>> FreqDist('abbb') + FreqDist('bcc') FreqDist({'b': 4, 'c': 2, 'a': 1}) """ return self.__class__(super().__add__(other)) def __sub__(self, other): """ Subtract count, but keep only results with positive counts. >>> FreqDist('abbbc') - FreqDist('bccd') FreqDist({'b': 2, 'a': 1}) """ return self.__class__(super().__sub__(other)) def __or__(self, other): """ Union is the maximum of value in either of the input counters. >>> FreqDist('abbb') | FreqDist('bcc') FreqDist({'b': 3, 'c': 2, 'a': 1}) """ return self.__class__(super().__or__(other)) def __and__(self, other): """ Intersection is the minimum of corresponding counts. >>> FreqDist('abbb') & FreqDist('bcc') FreqDist({'b': 1}) """ return self.__class__(super().__and__(other)) def __le__(self, other): """ Returns True if this frequency distribution is a subset of the other and for no key the value exceeds the value of the same key from the other frequency distribution. 
The <= operator forms partial order and satisfying the axioms reflexivity, antisymmetry and transitivity. >>> FreqDist('a') <= FreqDist('a') True >>> a = FreqDist('abc') >>> b = FreqDist('aabc') >>> (a <= b, b <= a) (True, False) >>> FreqDist('a') <= FreqDist('abcd') True >>> FreqDist('abc') <= FreqDist('xyz') False >>> FreqDist('xyz') <= FreqDist('abc') False >>> c = FreqDist('a') >>> d = FreqDist('aa') >>> e = FreqDist('aaa') >>> c <= d and d <= e and c <= e True """ if not isinstance(other, FreqDist): raise_unorderable_types("<=", self, other) return set(self).issubset(other) and all( self[key] <= other[key] for key in self ) def __ge__(self, other): if not isinstance(other, FreqDist): raise_unorderable_types(">=", self, other) return set(self).issuperset(other) and all( self[key] >= other[key] for key in other ) __lt__ = lambda self, other: self <= other and not self == other __gt__ = lambda self, other: self >= other and not self == other def __repr__(self): """ Return a string representation of this FreqDist. :rtype: string """ return self.pformat() def pprint(self, maxlen=10, stream=None): """ Print a string representation of this FreqDist to 'stream' :param maxlen: The maximum number of items to print :type maxlen: int :param stream: The stream to print to. stdout by default """ print(self.pformat(maxlen=maxlen), file=stream) def pformat(self, maxlen=10): """ Return a string representation of this FreqDist. :param maxlen: The maximum number of items to display :type maxlen: int :rtype: string """ items = ["{!r}: {!r}".format(*item) for item in self.most_common(maxlen)] if len(self) > maxlen: items.append("...") return "FreqDist({{{0}}})".format(", ".join(items)) def __str__(self): """ Return a string representation of this FreqDist. :rtype: string """ return "" % (len(self), self.N()) def __iter__(self): """ Return an iterator which yields tokens ordered by frequency. :rtype: iterator """ for token, _ in self.most_common(self.B()): yield token ##////////////////////////////////////////////////////// ## Probability Distributions ##////////////////////////////////////////////////////// class ProbDistI(metaclass=ABCMeta): """ A probability distribution for the outcomes of an experiment. A probability distribution specifies how likely it is that an experiment will have any given outcome. For example, a probability distribution could be used to predict the probability that a token in a document will have a given type. Formally, a probability distribution can be defined as a function mapping from samples to nonnegative real numbers, such that the sum of every number in the function's range is 1.0. A ``ProbDist`` is often used to model the probability distribution of the experiment used to generate a frequency distribution. """ SUM_TO_ONE = True """True if the probabilities of the samples in this probability distribution will always sum to one.""" @abstractmethod def __init__(self): """ Classes inheriting from ProbDistI should implement __init__. """ @abstractmethod def prob(self, sample): """ Return the probability for a given sample. Probabilities are always real numbers in the range [0, 1]. :param sample: The sample whose probability should be returned. :type sample: any :rtype: float """ def logprob(self, sample): """ Return the base 2 logarithm of the probability for a given sample. :param sample: The sample whose probability should be returned. 
:type sample: any :rtype: float """ # Default definition, in terms of prob() p = self.prob(sample) return math.log(p, 2) if p != 0 else _NINF @abstractmethod def max(self): """ Return the sample with the greatest probability. If two or more samples have the same probability, return one of them; which sample is returned is undefined. :rtype: any """ @abstractmethod def samples(self): """ Return a list of all samples that have nonzero probabilities. Use ``prob`` to find the probability of each sample. :rtype: list """ # cf self.SUM_TO_ONE def discount(self): """ Return the ratio by which counts are discounted on average: c*/c :rtype: float """ return 0.0 # Subclasses should define more efficient implementations of this, # where possible. def generate(self): """ Return a randomly selected sample from this probability distribution. The probability of returning each sample ``samp`` is equal to ``self.prob(samp)``. """ p = random.random() p_init = p for sample in self.samples(): p -= self.prob(sample) if p <= 0: return sample # allow for some rounding error: if p < 0.0001: return sample # we *should* never get here if self.SUM_TO_ONE: warnings.warn( "Probability distribution %r sums to %r; generate()" " is returning an arbitrary sample." % (self, p_init - p) ) return random.choice(list(self.samples())) class UniformProbDist(ProbDistI): """ A probability distribution that assigns equal probability to each sample in a given set; and a zero probability to all other samples. """ def __init__(self, samples): """ Construct a new uniform probability distribution, that assigns equal probability to each sample in ``samples``. :param samples: The samples that should be given uniform probability. :type samples: list :raise ValueError: If ``samples`` is empty. """ if len(samples) == 0: raise ValueError( "A Uniform probability distribution must " + "have at least one sample." ) self._sampleset = set(samples) self._prob = 1.0 / len(self._sampleset) self._samples = list(self._sampleset) def prob(self, sample): return self._prob if sample in self._sampleset else 0 def max(self): return self._samples[0] def samples(self): return self._samples def __repr__(self): return "" % len(self._sampleset) class RandomProbDist(ProbDistI): """ Generates a random probability distribution whereby each sample will be between 0 and 1 with equal probability (uniform random distribution. Also called a continuous uniform distribution). """ def __init__(self, samples): if len(samples) == 0: raise ValueError( "A probability distribution must " + "have at least one sample." ) self._probs = self.unirand(samples) self._samples = list(self._probs.keys()) @classmethod def unirand(cls, samples): """ The key function that creates a randomized initial distribution that still sums to 1. 
Set as a dictionary of prob values so that it can still be passed to MutableProbDist and called with identical syntax to UniformProbDist """ samples = set(samples) randrow = [random.random() for i in range(len(samples))] total = sum(randrow) for i, x in enumerate(randrow): randrow[i] = x / total total = sum(randrow) if total != 1: # this difference, if present, is so small (near NINF) that it # can be subtracted from any element without risking probs not (0 1) randrow[-1] -= total - 1 return {s: randrow[i] for i, s in enumerate(samples)} def max(self): if not hasattr(self, "_max"): self._max = max((p, v) for (v, p) in self._probs.items())[1] return self._max def prob(self, sample): return self._probs.get(sample, 0) def samples(self): return self._samples def __repr__(self): return "" % len(self._probs) class DictionaryProbDist(ProbDistI): """ A probability distribution whose probabilities are directly specified by a given dictionary. The given dictionary maps samples to probabilities. """ def __init__(self, prob_dict=None, log=False, normalize=False): """ Construct a new probability distribution from the given dictionary, which maps values to probabilities (or to log probabilities, if ``log`` is true). If ``normalize`` is true, then the probability values are scaled by a constant factor such that they sum to 1. If called without arguments, the resulting probability distribution assigns zero probability to all values. """ self._prob_dict = prob_dict.copy() if prob_dict is not None else {} self._log = log # Normalize the distribution, if requested. if normalize: if len(prob_dict) == 0: raise ValueError( "A DictionaryProbDist must have at least one sample " + "before it can be normalized." ) if log: value_sum = sum_logs(list(self._prob_dict.values())) if value_sum <= _NINF: logp = math.log(1.0 / len(prob_dict), 2) for x in prob_dict: self._prob_dict[x] = logp else: for (x, p) in self._prob_dict.items(): self._prob_dict[x] -= value_sum else: value_sum = sum(self._prob_dict.values()) if value_sum == 0: p = 1.0 / len(prob_dict) for x in prob_dict: self._prob_dict[x] = p else: norm_factor = 1.0 / value_sum for (x, p) in self._prob_dict.items(): self._prob_dict[x] *= norm_factor def prob(self, sample): if self._log: return 2 ** (self._prob_dict[sample]) if sample in self._prob_dict else 0 else: return self._prob_dict.get(sample, 0) def logprob(self, sample): if self._log: return self._prob_dict.get(sample, _NINF) else: if sample not in self._prob_dict: return _NINF elif self._prob_dict[sample] == 0: return _NINF else: return math.log(self._prob_dict[sample], 2) def max(self): if not hasattr(self, "_max"): self._max = max((p, v) for (v, p) in self._prob_dict.items())[1] return self._max def samples(self): return self._prob_dict.keys() def __repr__(self): return "" % len(self._prob_dict) class MLEProbDist(ProbDistI): """ The maximum likelihood estimate for the probability distribution of the experiment used to generate a frequency distribution. The "maximum likelihood estimate" approximates the probability of each sample as the frequency of that sample in the frequency distribution. """ def __init__(self, freqdist, bins=None): """ Use the maximum likelihood estimate to create a probability distribution for the experiment used to generate ``freqdist``. :type freqdist: FreqDist :param freqdist: The frequency distribution that the probability estimates should be based on. 
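A small illustrative example (the sample string is arbitrary):

>>> from nltk.probability import FreqDist, MLEProbDist
>>> fd = FreqDist('aabbbc')
>>> mle = MLEProbDist(fd)
>>> mle.prob('b')
0.5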
""" self._freqdist = freqdist def freqdist(self): """ Return the frequency distribution that this probability distribution is based on. :rtype: FreqDist """ return self._freqdist def prob(self, sample): return self._freqdist.freq(sample) def max(self): return self._freqdist.max() def samples(self): return self._freqdist.keys() def __repr__(self): """ :rtype: str :return: A string representation of this ``ProbDist``. """ return "" % self._freqdist.N() class LidstoneProbDist(ProbDistI): """ The Lidstone estimate for the probability distribution of the experiment used to generate a frequency distribution. The "Lidstone estimate" is parameterized by a real number *gamma*, which typically ranges from 0 to 1. The Lidstone estimate approximates the probability of a sample with count *c* from an experiment with *N* outcomes and *B* bins as ``c+gamma)/(N+B*gamma)``. This is equivalent to adding *gamma* to the count for each bin, and taking the maximum likelihood estimate of the resulting frequency distribution. """ SUM_TO_ONE = False def __init__(self, freqdist, gamma, bins=None): """ Use the Lidstone estimate to create a probability distribution for the experiment used to generate ``freqdist``. :type freqdist: FreqDist :param freqdist: The frequency distribution that the probability estimates should be based on. :type gamma: float :param gamma: A real number used to parameterize the estimate. The Lidstone estimate is equivalent to adding *gamma* to the count for each bin, and taking the maximum likelihood estimate of the resulting frequency distribution. :type bins: int :param bins: The number of sample values that can be generated by the experiment that is described by the probability distribution. This value must be correctly set for the probabilities of the sample values to sum to one. If ``bins`` is not specified, it defaults to ``freqdist.B()``. """ if (bins == 0) or (bins is None and freqdist.N() == 0): name = self.__class__.__name__[:-8] raise ValueError( "A %s probability distribution " % name + "must have at least one bin." ) if (bins is not None) and (bins < freqdist.B()): name = self.__class__.__name__[:-8] raise ValueError( "\nThe number of bins in a %s distribution " % name + "(%d) must be greater than or equal to\n" % bins + "the number of bins in the FreqDist used " + "to create it (%d)." % freqdist.B() ) self._freqdist = freqdist self._gamma = float(gamma) self._N = self._freqdist.N() if bins is None: bins = freqdist.B() self._bins = bins self._divisor = self._N + bins * gamma if self._divisor == 0.0: # In extreme cases we force the probability to be 0, # which it will be, since the count will be 0: self._gamma = 0 self._divisor = 1 def freqdist(self): """ Return the frequency distribution that this probability distribution is based on. :rtype: FreqDist """ return self._freqdist def prob(self, sample): c = self._freqdist[sample] return (c + self._gamma) / self._divisor def max(self): # For Lidstone distributions, probability is monotonic with # frequency, so the most probable sample is the one that # occurs most frequently. return self._freqdist.max() def samples(self): return self._freqdist.keys() def discount(self): gb = self._gamma * self._bins return gb / (self._N + gb) def __repr__(self): """ Return a string representation of this ``ProbDist``. :rtype: str """ return "" % self._freqdist.N() class LaplaceProbDist(LidstoneProbDist): """ The Laplace estimate for the probability distribution of the experiment used to generate a frequency distribution. 
The "Laplace estimate" approximates the probability of a sample with count *c* from an experiment with *N* outcomes and *B* bins as *(c+1)/(N+B)*. This is equivalent to adding one to the count for each bin, and taking the maximum likelihood estimate of the resulting frequency distribution. """ def __init__(self, freqdist, bins=None): """ Use the Laplace estimate to create a probability distribution for the experiment used to generate ``freqdist``. :type freqdist: FreqDist :param freqdist: The frequency distribution that the probability estimates should be based on. :type bins: int :param bins: The number of sample values that can be generated by the experiment that is described by the probability distribution. This value must be correctly set for the probabilities of the sample values to sum to one. If ``bins`` is not specified, it defaults to ``freqdist.B()``. """ LidstoneProbDist.__init__(self, freqdist, 1, bins) def __repr__(self): """ :rtype: str :return: A string representation of this ``ProbDist``. """ return "" % self._freqdist.N() class ELEProbDist(LidstoneProbDist): """ The expected likelihood estimate for the probability distribution of the experiment used to generate a frequency distribution. The "expected likelihood estimate" approximates the probability of a sample with count *c* from an experiment with *N* outcomes and *B* bins as *(c+0.5)/(N+B/2)*. This is equivalent to adding 0.5 to the count for each bin, and taking the maximum likelihood estimate of the resulting frequency distribution. """ def __init__(self, freqdist, bins=None): """ Use the expected likelihood estimate to create a probability distribution for the experiment used to generate ``freqdist``. :type freqdist: FreqDist :param freqdist: The frequency distribution that the probability estimates should be based on. :type bins: int :param bins: The number of sample values that can be generated by the experiment that is described by the probability distribution. This value must be correctly set for the probabilities of the sample values to sum to one. If ``bins`` is not specified, it defaults to ``freqdist.B()``. """ LidstoneProbDist.__init__(self, freqdist, 0.5, bins) def __repr__(self): """ Return a string representation of this ``ProbDist``. :rtype: str """ return "" % self._freqdist.N() class HeldoutProbDist(ProbDistI): """ The heldout estimate for the probability distribution of the experiment used to generate two frequency distributions. These two frequency distributions are called the "heldout frequency distribution" and the "base frequency distribution." The "heldout estimate" uses uses the "heldout frequency distribution" to predict the probability of each sample, given its frequency in the "base frequency distribution". In particular, the heldout estimate approximates the probability for a sample that occurs *r* times in the base distribution as the average frequency in the heldout distribution of all samples that occur *r* times in the base distribution. This average frequency is *Tr[r]/(Nr[r].N)*, where: - *Tr[r]* is the total count in the heldout distribution for all samples that occur *r* times in the base distribution. - *Nr[r]* is the number of samples that occur *r* times in the base distribution. - *N* is the number of outcomes recorded by the heldout frequency distribution. In order to increase the efficiency of the ``prob`` member function, *Tr[r]/(Nr[r].N)* is precomputed for each value of *r* when the ``HeldoutProbDist`` is created. 
:type _estimate: list(float) :ivar _estimate: A list mapping from *r*, the number of times that a sample occurs in the base distribution, to the probability estimate for that sample. ``_estimate[r]`` is calculated by finding the average frequency in the heldout distribution of all samples that occur *r* times in the base distribution. In particular, ``_estimate[r]`` = *Tr[r]/(Nr[r].N)*. :type _max_r: int :ivar _max_r: The maximum number of times that any sample occurs in the base distribution. ``_max_r`` is used to decide how large ``_estimate`` must be. """ SUM_TO_ONE = False def __init__(self, base_fdist, heldout_fdist, bins=None): """ Use the heldout estimate to create a probability distribution for the experiment used to generate ``base_fdist`` and ``heldout_fdist``. :type base_fdist: FreqDist :param base_fdist: The base frequency distribution. :type heldout_fdist: FreqDist :param heldout_fdist: The heldout frequency distribution. :type bins: int :param bins: The number of sample values that can be generated by the experiment that is described by the probability distribution. This value must be correctly set for the probabilities of the sample values to sum to one. If ``bins`` is not specified, it defaults to ``freqdist.B()``. """ self._base_fdist = base_fdist self._heldout_fdist = heldout_fdist # The max number of times any sample occurs in base_fdist. self._max_r = base_fdist[base_fdist.max()] # Calculate Tr, Nr, and N. Tr = self._calculate_Tr() r_Nr = base_fdist.r_Nr(bins) Nr = [r_Nr[r] for r in range(self._max_r + 1)] N = heldout_fdist.N() # Use Tr, Nr, and N to compute the probability estimate for # each value of r. self._estimate = self._calculate_estimate(Tr, Nr, N) def _calculate_Tr(self): """ Return the list *Tr*, where *Tr[r]* is the total count in ``heldout_fdist`` for all samples that occur *r* times in ``base_fdist``. :rtype: list(float) """ Tr = [0.0] * (self._max_r + 1) for sample in self._heldout_fdist: r = self._base_fdist[sample] Tr[r] += self._heldout_fdist[sample] return Tr def _calculate_estimate(self, Tr, Nr, N): """ Return the list *estimate*, where *estimate[r]* is the probability estimate for any sample that occurs *r* times in the base frequency distribution. In particular, *estimate[r]* is *Tr[r]/(N[r].N)*. In the special case that *N[r]=0*, *estimate[r]* will never be used; so we define *estimate[r]=None* for those cases. :rtype: list(float) :type Tr: list(float) :param Tr: the list *Tr*, where *Tr[r]* is the total count in the heldout distribution for all samples that occur *r* times in base distribution. :type Nr: list(float) :param Nr: The list *Nr*, where *Nr[r]* is the number of samples that occur *r* times in the base distribution. :type N: int :param N: The total number of outcomes recorded by the heldout frequency distribution. """ estimate = [] for r in range(self._max_r + 1): if Nr[r] == 0: estimate.append(None) else: estimate.append(Tr[r] / (Nr[r] * N)) return estimate def base_fdist(self): """ Return the base frequency distribution that this probability distribution is based on. :rtype: FreqDist """ return self._base_fdist def heldout_fdist(self): """ Return the heldout frequency distribution that this probability distribution is based on. :rtype: FreqDist """ return self._heldout_fdist def samples(self): return self._base_fdist.keys() def prob(self, sample): # Use our precomputed probability estimate. 
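# _estimate was filled in once by __init__ (_estimate[r] = Tr[r] / (Nr[r] * N)),
# so prob() reduces to a count lookup followed by a single list index.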
r = self._base_fdist[sample] return self._estimate[r] def max(self): # Note: the Heldout estimation is *not* necessarily monotonic; # so this implementation is currently broken. However, it # should give the right answer *most* of the time. :) return self._base_fdist.max() def discount(self): raise NotImplementedError() def __repr__(self): """ :rtype: str :return: A string representation of this ``ProbDist``. """ s = "" return s % (self._base_fdist.N(), self._heldout_fdist.N()) class CrossValidationProbDist(ProbDistI): """ The cross-validation estimate for the probability distribution of the experiment used to generate a set of frequency distribution. The "cross-validation estimate" for the probability of a sample is found by averaging the held-out estimates for the sample in each pair of frequency distributions. """ SUM_TO_ONE = False def __init__(self, freqdists, bins): """ Use the cross-validation estimate to create a probability distribution for the experiment used to generate ``freqdists``. :type freqdists: list(FreqDist) :param freqdists: A list of the frequency distributions generated by the experiment. :type bins: int :param bins: The number of sample values that can be generated by the experiment that is described by the probability distribution. This value must be correctly set for the probabilities of the sample values to sum to one. If ``bins`` is not specified, it defaults to ``freqdist.B()``. """ self._freqdists = freqdists # Create a heldout probability distribution for each pair of # frequency distributions in freqdists. self._heldout_probdists = [] for fdist1 in freqdists: for fdist2 in freqdists: if fdist1 is not fdist2: probdist = HeldoutProbDist(fdist1, fdist2, bins) self._heldout_probdists.append(probdist) def freqdists(self): """ Return the list of frequency distributions that this ``ProbDist`` is based on. :rtype: list(FreqDist) """ return self._freqdists def samples(self): # [xx] nb: this is not too efficient return set(sum((list(fd) for fd in self._freqdists), [])) def prob(self, sample): # Find the average probability estimate returned by each # heldout distribution. prob = 0.0 for heldout_probdist in self._heldout_probdists: prob += heldout_probdist.prob(sample) return prob / len(self._heldout_probdists) def discount(self): raise NotImplementedError() def __repr__(self): """ Return a string representation of this ``ProbDist``. :rtype: str """ return "" % len(self._freqdists) class WittenBellProbDist(ProbDistI): """ The Witten-Bell estimate of a probability distribution. This distribution allocates uniform probability mass to as yet unseen events by using the number of events that have only been seen once. The probability mass reserved for unseen events is equal to *T / (N + T)* where *T* is the number of observed event types and *N* is the total number of observed events. This equates to the maximum likelihood estimate of a new type event occurring. The remaining probability mass is discounted such that all probability estimates sum to one, yielding: - *p = T / Z (N + T)*, if count = 0 - *p = c / (N + T)*, otherwise """ def __init__(self, freqdist, bins=None): """ Creates a distribution of Witten-Bell probability estimates. This distribution allocates uniform probability mass to as yet unseen events by using the number of events that have only been seen once. The probability mass reserved for unseen events is equal to *T / (N + T)* where *T* is the number of observed event types and *N* is the total number of observed events. 
This equates to the maximum likelihood estimate of a new type event occurring. The remaining probability mass is discounted such that all probability estimates sum to one, yielding: - *p = T / Z (N + T)*, if count = 0 - *p = c / (N + T)*, otherwise The parameters *T* and *N* are taken from the ``freqdist`` parameter (the ``B()`` and ``N()`` values). The normalizing factor *Z* is calculated using these values along with the ``bins`` parameter. :param freqdist: The frequency counts upon which to base the estimation. :type freqdist: FreqDist :param bins: The number of possible event types. This must be at least as large as the number of bins in the ``freqdist``. If None, then it's assumed to be equal to that of the ``freqdist`` :type bins: int """ assert bins is None or bins >= freqdist.B(), ( "bins parameter must not be less than %d=freqdist.B()" % freqdist.B() ) if bins is None: bins = freqdist.B() self._freqdist = freqdist self._T = self._freqdist.B() self._Z = bins - self._freqdist.B() self._N = self._freqdist.N() # self._P0 is P(0), precalculated for efficiency: if self._N == 0: # if freqdist is empty, we approximate P(0) by a UniformProbDist: self._P0 = 1.0 / self._Z else: self._P0 = self._T / (self._Z * (self._N + self._T)) def prob(self, sample): # inherit docs from ProbDistI c = self._freqdist[sample] return c / (self._N + self._T) if c != 0 else self._P0 def max(self): return self._freqdist.max() def samples(self): return self._freqdist.keys() def freqdist(self): return self._freqdist def discount(self): raise NotImplementedError() def __repr__(self): """ Return a string representation of this ``ProbDist``. :rtype: str """ return "" % self._freqdist.N() ##////////////////////////////////////////////////////// ## Good-Turing Probability Distributions ##////////////////////////////////////////////////////// # Good-Turing frequency estimation was contributed by Alan Turing and # his statistical assistant I.J. Good, during their collaboration in # the WWII. It is a statistical technique for predicting the # probability of occurrence of objects belonging to an unknown number # of species, given past observations of such objects and their # species. (In drawing balls from an urn, the 'objects' would be balls # and the 'species' would be the distinct colors of the balls (finite # but unknown in number). # # Good-Turing method calculates the probability mass to assign to # events with zero or low counts based on the number of events with # higher counts. It does so by using the adjusted count *c\**: # # - *c\* = (c + 1) N(c + 1) / N(c)* for c >= 1 # - *things with frequency zero in training* = N(1) for c == 0 # # where *c* is the original count, *N(i)* is the number of event types # observed with count *i*. We can think the count of unseen as the count # of frequency one (see Jurafsky & Martin 2nd Edition, p101). # # This method is problematic because the situation ``N(c+1) == 0`` # is quite common in the original Good-Turing estimation; smoothing or # interpolation of *N(i)* values is essential in practice. # # Bill Gale and Geoffrey Sampson present a simple and effective approach, # Simple Good-Turing. As a smoothing curve they simply use a power curve: # # Nr = a*r^b (with b < -1 to give the appropriate hyperbolic # relationship) # # They estimate a and b by simple linear regression technique on the # logarithmic form of the equation: # # log Nr = a + b*log(r) # # However, they suggest that such a simple curve is probably only # appropriate for high values of r. 
For low values of r, they use the # measured Nr directly. (see M&S, p.213) # # Gale and Sampson propose to use r while the difference between r and # r* is 1.96 greater than the standard deviation, and switch to r* if # it is less or equal: # # |r - r*| > 1.96 * sqrt((r + 1)^2 (Nr+1 / Nr^2) (1 + Nr+1 / Nr)) # # The 1.96 coefficient correspond to a 0.05 significance criterion, # some implementations can use a coefficient of 1.65 for a 0.1 # significance criterion. # ##////////////////////////////////////////////////////// ## Simple Good-Turing Probablity Distributions ##////////////////////////////////////////////////////// class SimpleGoodTuringProbDist(ProbDistI): """ SimpleGoodTuring ProbDist approximates from frequency to frequency of frequency into a linear line under log space by linear regression. Details of Simple Good-Turing algorithm can be found in: - Good Turing smoothing without tears" (Gale & Sampson 1995), Journal of Quantitative Linguistics, vol. 2 pp. 217-237. - "Speech and Language Processing (Jurafsky & Martin), 2nd Edition, Chapter 4.5 p103 (log(Nc) = a + b*log(c)) - https://www.grsampson.net/RGoodTur.html Given a set of pair (xi, yi), where the xi denotes the frequency and yi denotes the frequency of frequency, we want to minimize their square variation. E(x) and E(y) represent the mean of xi and yi. - slope: b = sigma ((xi-E(x)(yi-E(y))) / sigma ((xi-E(x))(xi-E(x))) - intercept: a = E(y) - b.E(x) """ SUM_TO_ONE = False def __init__(self, freqdist, bins=None): """ :param freqdist: The frequency counts upon which to base the estimation. :type freqdist: FreqDist :param bins: The number of possible event types. This must be larger than the number of bins in the ``freqdist``. If None, then it's assumed to be equal to ``freqdist``.B() + 1 :type bins: int """ assert ( bins is None or bins > freqdist.B() ), "bins parameter must not be less than %d=freqdist.B()+1" % (freqdist.B() + 1) if bins is None: bins = freqdist.B() + 1 self._freqdist = freqdist self._bins = bins r, nr = self._r_Nr() self.find_best_fit(r, nr) self._switch(r, nr) self._renormalize(r, nr) def _r_Nr_non_zero(self): r_Nr = self._freqdist.r_Nr() del r_Nr[0] return r_Nr def _r_Nr(self): """ Split the frequency distribution in two list (r, Nr), where Nr(r) > 0 """ nonzero = self._r_Nr_non_zero() if not nonzero: return [], [] return zip(*sorted(nonzero.items())) def find_best_fit(self, r, nr): """ Use simple linear regression to tune parameters self._slope and self._intercept in the log-log space based on count and Nr(count) (Work in log space to avoid floating point underflow.) """ # For higher sample frequencies the data points becomes horizontal # along line Nr=1. To create a more evident linear model in log-log # space, we average positive Nr values with the surrounding zero # values. (Church and Gale, 1991) if not r or not nr: # Empty r or nr? return zr = [] for j in range(len(r)): i = r[j - 1] if j > 0 else 0 k = 2 * r[j] - i if j == len(r) - 1 else r[j + 1] zr_ = 2.0 * nr[j] / (k - i) zr.append(zr_) log_r = [math.log(i) for i in r] log_zr = [math.log(i) for i in zr] xy_cov = x_var = 0.0 x_mean = sum(log_r) / len(log_r) y_mean = sum(log_zr) / len(log_zr) for (x, y) in zip(log_r, log_zr): xy_cov += (x - x_mean) * (y - y_mean) x_var += (x - x_mean) ** 2 self._slope = xy_cov / x_var if x_var != 0 else 0.0 if self._slope >= -1: warnings.warn( "SimpleGoodTuring did not find a proper best fit " "line for smoothing probabilities of occurrences. " "The probability estimates are likely to be " "unreliable." 
) self._intercept = y_mean - self._slope * x_mean def _switch(self, r, nr): """ Calculate the r frontier where we must switch from Nr to Sr when estimating E[Nr]. """ for i, r_ in enumerate(r): if len(r) == i + 1 or r[i + 1] != r_ + 1: # We are at the end of r, or there is a gap in r self._switch_at = r_ break Sr = self.smoothedNr smooth_r_star = (r_ + 1) * Sr(r_ + 1) / Sr(r_) unsmooth_r_star = (r_ + 1) * nr[i + 1] / nr[i] std = math.sqrt(self._variance(r_, nr[i], nr[i + 1])) if abs(unsmooth_r_star - smooth_r_star) <= 1.96 * std: self._switch_at = r_ break def _variance(self, r, nr, nr_1): r = float(r) nr = float(nr) nr_1 = float(nr_1) return (r + 1.0) ** 2 * (nr_1 / nr ** 2) * (1.0 + nr_1 / nr) def _renormalize(self, r, nr): """ It is necessary to renormalize all the probability estimates to ensure a proper probability distribution results. This can be done by keeping the estimate of the probability mass for unseen items as N(1)/N and renormalizing all the estimates for previously seen items (as Gale and Sampson (1995) propose). (See M&S P.213, 1999) """ prob_cov = 0.0 for r_, nr_ in zip(r, nr): prob_cov += nr_ * self._prob_measure(r_) if prob_cov: self._renormal = (1 - self._prob_measure(0)) / prob_cov def smoothedNr(self, r): """ Return the number of samples with count r. :param r: The amount of frequency. :type r: int :rtype: float """ # Nr = a*r^b (with b < -1 to give the appropriate hyperbolic # relationship) # Estimate a and b by simple linear regression technique on # the logarithmic form of the equation: log Nr = a + b*log(r) return math.exp(self._intercept + self._slope * math.log(r)) def prob(self, sample): """ Return the sample's probability. :param sample: sample of the event :type sample: str :rtype: float """ count = self._freqdist[sample] p = self._prob_measure(count) if count == 0: if self._bins == self._freqdist.B(): p = 0.0 else: p = p / (self._bins - self._freqdist.B()) else: p = p * self._renormal return p def _prob_measure(self, count): if count == 0 and self._freqdist.N() == 0: return 1.0 elif count == 0 and self._freqdist.N() != 0: return self._freqdist.Nr(1) / self._freqdist.N() if self._switch_at > count: Er_1 = self._freqdist.Nr(count + 1) Er = self._freqdist.Nr(count) else: Er_1 = self.smoothedNr(count + 1) Er = self.smoothedNr(count) r_star = (count + 1) * Er_1 / Er return r_star / self._freqdist.N() def check(self): prob_sum = 0.0 for i in range(0, len(self._Nr)): prob_sum += self._Nr[i] * self._prob_measure(i) / self._renormal print("Probability Sum:", prob_sum) # assert prob_sum != 1.0, "probability sum should be one!" def discount(self): """ This function returns the total mass of probability transfers from the seen samples to the unseen samples. """ return self.smoothedNr(1) / self._freqdist.N() def max(self): return self._freqdist.max() def samples(self): return self._freqdist.keys() def freqdist(self): return self._freqdist def __repr__(self): """ Return a string representation of this ``ProbDist``. :rtype: str """ return "" % self._freqdist.N() class MutableProbDist(ProbDistI): """ An mutable probdist where the probabilities may be easily modified. This simply copies an existing probdist, storing the probability values in a mutable dictionary and providing an update method. """ def __init__(self, prob_dist, samples, store_logs=True): """ Creates the mutable probdist based on the given prob_dist and using the list of samples given. These values are stored as log probabilities if the store_logs flag is set. 
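An illustrative doctest (the samples and the new probability are arbitrary): copy an MLE estimate, then override one value with ``update``.

>>> from nltk.probability import FreqDist, MLEProbDist, MutableProbDist
>>> mp = MutableProbDist(MLEProbDist(FreqDist('aab')), ['a', 'b'])
>>> round(mp.prob('a'), 6)
0.666667
>>> mp.update('a', 0.5, log=False)
>>> round(mp.prob('a'), 6)
0.5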
:param prob_dist: the distribution from which to garner the probabilities :type prob_dist: ProbDist :param samples: the complete set of samples :type samples: sequence of any :param store_logs: whether to store the probabilities as logarithms :type store_logs: bool """ self._samples = samples self._sample_dict = {samples[i]: i for i in range(len(samples))} self._data = array.array("d", [0.0]) * len(samples) for i in range(len(samples)): if store_logs: self._data[i] = prob_dist.logprob(samples[i]) else: self._data[i] = prob_dist.prob(samples[i]) self._logs = store_logs def max(self): # inherit documentation return max((p, v) for (v, p) in self._sample_dict.items())[1] def samples(self): # inherit documentation return self._samples def prob(self, sample): # inherit documentation i = self._sample_dict.get(sample) if i is None: return 0.0 return 2 ** (self._data[i]) if self._logs else self._data[i] def logprob(self, sample): # inherit documentation i = self._sample_dict.get(sample) if i is None: return float("-inf") return self._data[i] if self._logs else math.log(self._data[i], 2) def update(self, sample, prob, log=True): """ Update the probability for the given sample. This may cause the object to stop being the valid probability distribution - the user must ensure that they update the sample probabilities such that all samples have probabilities between 0 and 1 and that all probabilities sum to one. :param sample: the sample for which to update the probability :type sample: any :param prob: the new probability :type prob: float :param log: is the probability already logged :type log: bool """ i = self._sample_dict.get(sample) assert i is not None if self._logs: self._data[i] = prob if log else math.log(prob, 2) else: self._data[i] = 2 ** (prob) if log else prob ##///////////////////////////////////////////////////// ## Kneser-Ney Probability Distribution ##////////////////////////////////////////////////////// # This method for calculating probabilities was introduced in 1995 by Reinhard # Kneser and Hermann Ney. It was meant to improve the accuracy of language # models that use backing-off to deal with sparse data. The authors propose two # ways of doing so: a marginal distribution constraint on the back-off # distribution and a leave-one-out distribution. For a start, the first one is # implemented as a class below. # # The idea behind a back-off n-gram model is that we have a series of # frequency distributions for our n-grams so that in case we have not seen a # given n-gram during training (and as a result have a 0 probability for it) we # can 'back off' (hence the name!) and try testing whether we've seen the # n-1-gram part of the n-gram in training. # # The novelty of Kneser and Ney's approach was that they decided to fiddle # around with the way this latter, backed off probability was being calculated # whereas their peers seemed to focus on the primary probability. # # The implementation below uses one of the techniques described in their paper # titled "Improved backing-off for n-gram language modeling." In the same paper # another technique is introduced to attempt to smooth the back-off # distribution as well as the primary one. There is also a much-cited # modification of this method proposed by Chen and Goodman. # # In order for the implementation of Kneser-Ney to be more efficient, some # changes have been made to the original algorithm. 
Namely, the calculation of # the normalizing function gamma has been significantly simplified and # combined slightly differently with beta. None of these changes affect the # nature of the algorithm, but instead aim to cut out unnecessary calculations # and take advantage of storing and retrieving information in dictionaries # where possible. class KneserNeyProbDist(ProbDistI): """ Kneser-Ney estimate of a probability distribution. This is a version of back-off that counts how likely an n-gram is provided the n-1-gram had been seen in training. Extends the ProbDistI interface, requires a trigram FreqDist instance to train on. Optionally, a different from default discount value can be specified. The default discount is set to 0.75. """ def __init__(self, freqdist, bins=None, discount=0.75): """ :param freqdist: The trigram frequency distribution upon which to base the estimation :type freqdist: FreqDist :param bins: Included for compatibility with nltk.tag.hmm :type bins: int or float :param discount: The discount applied when retrieving counts of trigrams :type discount: float (preferred, but can be set to int) """ if not bins: self._bins = freqdist.B() else: self._bins = bins self._D = discount # cache for probability calculation self._cache = {} # internal bigram and trigram frequency distributions self._bigrams = defaultdict(int) self._trigrams = freqdist # helper dictionaries used to calculate probabilities self._wordtypes_after = defaultdict(float) self._trigrams_contain = defaultdict(float) self._wordtypes_before = defaultdict(float) for w0, w1, w2 in freqdist: self._bigrams[(w0, w1)] += freqdist[(w0, w1, w2)] self._wordtypes_after[(w0, w1)] += 1 self._trigrams_contain[w1] += 1 self._wordtypes_before[(w1, w2)] += 1 def prob(self, trigram): # sample must be a triple if len(trigram) != 3: raise ValueError("Expected an iterable with 3 members.") trigram = tuple(trigram) w0, w1, w2 = trigram if trigram in self._cache: return self._cache[trigram] else: # if the sample trigram was seen during training if trigram in self._trigrams: prob = (self._trigrams[trigram] - self.discount()) / self._bigrams[ (w0, w1) ] # else if the 'rougher' environment was seen during training elif (w0, w1) in self._bigrams and (w1, w2) in self._wordtypes_before: aftr = self._wordtypes_after[(w0, w1)] bfr = self._wordtypes_before[(w1, w2)] # the probability left over from alphas leftover_prob = (aftr * self.discount()) / self._bigrams[(w0, w1)] # the beta (including normalization) beta = bfr / (self._trigrams_contain[w1] - aftr) prob = leftover_prob * beta # else the sample was completely unseen during training else: prob = 0.0 self._cache[trigram] = prob return prob def discount(self): """ Return the value by which counts are discounted. By default set to 0.75. :rtype: float """ return self._D def set_discount(self, discount): """ Set the value by which counts are discounted to the value of discount. :param discount: the new value to discount counts by :type discount: float (preferred, but int possible) :rtype: None """ self._D = discount def samples(self): return self._trigrams.keys() def max(self): return self._trigrams.max() def __repr__(self): """ Return a string representation of this ProbDist :rtype: str """ return f">> from nltk.probability import ConditionalFreqDist >>> from nltk.tokenize import word_tokenize >>> sent = "the the the dog dog some other words that we do not care about" >>> cfdist = ConditionalFreqDist() >>> for word in word_tokenize(sent): ... condition = len(word) ... 
cfdist[condition][word] += 1 An equivalent way to do this is with the initializer: >>> cfdist = ConditionalFreqDist((len(word), word) for word in word_tokenize(sent)) The frequency distribution for each condition is accessed using the indexing operator: >>> cfdist[3] FreqDist({'the': 3, 'dog': 2, 'not': 1}) >>> cfdist[3].freq('the') 0.5 >>> cfdist[3]['dog'] 2 When the indexing operator is used to access the frequency distribution for a condition that has not been accessed before, ``ConditionalFreqDist`` creates a new empty FreqDist for that condition. """ def __init__(self, cond_samples=None): """ Construct a new empty conditional frequency distribution. In particular, the count for every sample, under every condition, is zero. :param cond_samples: The samples to initialize the conditional frequency distribution with :type cond_samples: Sequence of (condition, sample) tuples """ defaultdict.__init__(self, FreqDist) if cond_samples: for (cond, sample) in cond_samples: self[cond][sample] += 1 def __reduce__(self): kv_pairs = ((cond, self[cond]) for cond in self.conditions()) return (self.__class__, (), None, None, kv_pairs) def conditions(self): """ Return a list of the conditions that have been accessed for this ``ConditionalFreqDist``. Use the indexing operator to access the frequency distribution for a given condition. Note that the frequency distributions for some conditions may contain zero sample outcomes. :rtype: list """ return list(self.keys()) def N(self): """ Return the total number of sample outcomes that have been recorded by this ``ConditionalFreqDist``. :rtype: int """ return sum(fdist.N() for fdist in self.values()) def plot( self, *args, samples=None, title="", cumulative=False, percents=False, conditions=None, show=True, **kwargs, ): """ Plot the given samples from the conditional frequency distribution. For a cumulative plot, specify cumulative=True. Additional ``*args`` and ``**kwargs`` are passed to matplotlib's plot function. (Requires Matplotlib to be installed.) :param samples: The samples to plot :type samples: list :param title: The title for the graph :type title: str :param cumulative: Whether the plot is cumulative. (default = False) :type cumulative: bool :param percents: Whether the plot uses percents instead of counts. (default = False) :type percents: bool :param conditions: The conditions to plot (default is all) :type conditions: list :param show: Whether to show the plot, or only return the ax. :type show: bool """ try: import matplotlib.pyplot as plt # import statement fix except ImportError as e: raise ValueError( "The plot function requires matplotlib to be installed." 
"See https://matplotlib.org/" ) from e if not conditions: conditions = self.conditions() else: conditions = [c for c in conditions if c in self] if not samples: samples = sorted({v for c in conditions for v in self[c]}) if "linewidth" not in kwargs: kwargs["linewidth"] = 2 ax = plt.gca() if conditions: freqs = [] for condition in conditions: if cumulative: # freqs should be a list of list where each sub list will be a frequency of a condition freq = list(self[condition]._cumulative_frequencies(samples)) else: freq = [self[condition][sample] for sample in samples] if percents: freq = [f / self[condition].N() * 100 for f in freq] freqs.append(freq) if cumulative: ylabel = "Cumulative " legend_loc = "lower right" else: ylabel = "" legend_loc = "upper right" if percents: ylabel += "Percents" else: ylabel += "Counts" i = 0 for freq in freqs: kwargs["label"] = conditions[i] # label for each condition i += 1 ax.plot(freq, *args, **kwargs) ax.legend(loc=legend_loc) ax.grid(True, color="silver") ax.set_xticks(range(len(samples))) ax.set_xticklabels([str(s) for s in samples], rotation=90) if title: ax.set_title(title) ax.set_xlabel("Samples") ax.set_ylabel(ylabel) if show: plt.show() return ax def tabulate(self, *args, **kwargs): """ Tabulate the given samples from the conditional frequency distribution. :param samples: The samples to plot :type samples: list :param conditions: The conditions to plot (default is all) :type conditions: list :param cumulative: A flag to specify whether the freqs are cumulative (default = False) :type title: bool """ cumulative = _get_kwarg(kwargs, "cumulative", False) conditions = _get_kwarg(kwargs, "conditions", sorted(self.conditions())) samples = _get_kwarg( kwargs, "samples", sorted({v for c in conditions if c in self for v in self[c]}), ) # this computation could be wasted width = max(len("%s" % s) for s in samples) freqs = dict() for c in conditions: if cumulative: freqs[c] = list(self[c]._cumulative_frequencies(samples)) else: freqs[c] = [self[c][sample] for sample in samples] width = max(width, max(len("%d" % f) for f in freqs[c])) condition_size = max(len("%s" % c) for c in conditions) print(" " * condition_size, end=" ") for s in samples: print("%*s" % (width, s), end=" ") print() for c in conditions: print("%*s" % (condition_size, c), end=" ") for f in freqs[c]: print("%*d" % (width, f), end=" ") print() # Mathematical operators def __add__(self, other): """ Add counts from two ConditionalFreqDists. """ if not isinstance(other, ConditionalFreqDist): return NotImplemented result = ConditionalFreqDist() for cond in self.conditions(): newfreqdist = self[cond] + other[cond] if newfreqdist: result[cond] = newfreqdist for cond in other.conditions(): if cond not in self.conditions(): for elem, count in other[cond].items(): if count > 0: result[cond][elem] = count return result def __sub__(self, other): """ Subtract count, but keep only results with positive counts. """ if not isinstance(other, ConditionalFreqDist): return NotImplemented result = ConditionalFreqDist() for cond in self.conditions(): newfreqdist = self[cond] - other[cond] if newfreqdist: result[cond] = newfreqdist for cond in other.conditions(): if cond not in self.conditions(): for elem, count in other[cond].items(): if count < 0: result[cond][elem] = 0 - count return result def __or__(self, other): """ Union is the maximum of value in either of the input counters. 
""" if not isinstance(other, ConditionalFreqDist): return NotImplemented result = ConditionalFreqDist() for cond in self.conditions(): newfreqdist = self[cond] | other[cond] if newfreqdist: result[cond] = newfreqdist for cond in other.conditions(): if cond not in self.conditions(): for elem, count in other[cond].items(): if count > 0: result[cond][elem] = count return result def __and__(self, other): """ Intersection is the minimum of corresponding counts. """ if not isinstance(other, ConditionalFreqDist): return NotImplemented result = ConditionalFreqDist() for cond in self.conditions(): newfreqdist = self[cond] & other[cond] if newfreqdist: result[cond] = newfreqdist return result # @total_ordering doesn't work here, since the class inherits from a builtin class def __le__(self, other): if not isinstance(other, ConditionalFreqDist): raise_unorderable_types("<=", self, other) return set(self.conditions()).issubset(other.conditions()) and all( self[c] <= other[c] for c in self.conditions() ) def __lt__(self, other): if not isinstance(other, ConditionalFreqDist): raise_unorderable_types("<", self, other) return self <= other and self != other def __ge__(self, other): if not isinstance(other, ConditionalFreqDist): raise_unorderable_types(">=", self, other) return other <= self def __gt__(self, other): if not isinstance(other, ConditionalFreqDist): raise_unorderable_types(">", self, other) return other < self def __repr__(self): """ Return a string representation of this ``ConditionalFreqDist``. :rtype: str """ return "" % len(self) class ConditionalProbDistI(dict, metaclass=ABCMeta): """ A collection of probability distributions for a single experiment run under different conditions. Conditional probability distributions are used to estimate the likelihood of each sample, given the condition under which the experiment was run. For example, a conditional probability distribution could be used to estimate the probability of each word type in a document, given the length of the word type. Formally, a conditional probability distribution can be defined as a function that maps from each condition to the ``ProbDist`` for the experiment under that condition. """ @abstractmethod def __init__(self): """ Classes inheriting from ConditionalProbDistI should implement __init__. """ def conditions(self): """ Return a list of the conditions that are represented by this ``ConditionalProbDist``. Use the indexing operator to access the probability distribution for a given condition. :rtype: list """ return list(self.keys()) def __repr__(self): """ Return a string representation of this ``ConditionalProbDist``. :rtype: str """ return "<%s with %d conditions>" % (type(self).__name__, len(self)) class ConditionalProbDist(ConditionalProbDistI): """ A conditional probability distribution modeling the experiments that were used to generate a conditional frequency distribution. A ConditionalProbDist is constructed from a ``ConditionalFreqDist`` and a ``ProbDist`` factory: - The ``ConditionalFreqDist`` specifies the frequency distribution for each condition. - The ``ProbDist`` factory is a function that takes a condition's frequency distribution, and returns its probability distribution. A ``ProbDist`` class's name (such as ``MLEProbDist`` or ``HeldoutProbDist``) can be used to specify that class's constructor. 
The first argument to the ``ProbDist`` factory is the frequency distribution that it should model; and the remaining arguments are specified by the ``factory_args`` parameter to the ``ConditionalProbDist`` constructor. For example, the following code constructs a ``ConditionalProbDist``, where the probability distribution for each condition is an ``ELEProbDist`` with 10 bins: >>> from nltk.corpus import brown >>> from nltk.probability import ConditionalFreqDist >>> from nltk.probability import ConditionalProbDist, ELEProbDist >>> cfdist = ConditionalFreqDist(brown.tagged_words()[:5000]) >>> cpdist = ConditionalProbDist(cfdist, ELEProbDist, 10) >>> cpdist['passed'].max() 'VBD' >>> cpdist['passed'].prob('VBD') 0.423... """ def __init__(self, cfdist, probdist_factory, *factory_args, **factory_kw_args): """ Construct a new conditional probability distribution, based on the given conditional frequency distribution and ``ProbDist`` factory. :type cfdist: ConditionalFreqDist :param cfdist: The ``ConditionalFreqDist`` specifying the frequency distribution for each condition. :type probdist_factory: class or function :param probdist_factory: The function or class that maps a condition's frequency distribution to its probability distribution. The function is called with the frequency distribution as its first argument, ``factory_args`` as its remaining arguments, and ``factory_kw_args`` as keyword arguments. :type factory_args: (any) :param factory_args: Extra arguments for ``probdist_factory``. These arguments are usually used to specify extra properties for the probability distributions of individual conditions, such as the number of bins they contain. :type factory_kw_args: (any) :param factory_kw_args: Extra keyword arguments for ``probdist_factory``. """ self._probdist_factory = probdist_factory self._factory_args = factory_args self._factory_kw_args = factory_kw_args for condition in cfdist: self[condition] = probdist_factory( cfdist[condition], *factory_args, **factory_kw_args ) def __missing__(self, key): self[key] = self._probdist_factory( FreqDist(), *self._factory_args, **self._factory_kw_args ) return self[key] class DictionaryConditionalProbDist(ConditionalProbDistI): """ An alternative ConditionalProbDist that simply wraps a dictionary of ProbDists rather than creating these from FreqDists. """ def __init__(self, probdist_dict): """ :param probdist_dict: a dictionary containing the probdists indexed by the conditions :type probdist_dict: dict any -> probdist """ self.update(probdist_dict) def __missing__(self, key): self[key] = DictionaryProbDist() return self[key] ##////////////////////////////////////////////////////// ## Adding in log-space. ##////////////////////////////////////////////////////// # If the difference is bigger than this, then just take the bigger one: _ADD_LOGS_MAX_DIFF = math.log(1e-30, 2) def add_logs(logx, logy): """ Given two numbers ``logx`` = *log(x)* and ``logy`` = *log(y)*, return *log(x+y)*. Conceptually, this is the same as returning ``log(2**(logx)+2**(logy))``, but the actual implementation avoids overflow errors that could result from direct computation. 
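An illustrative doctest: with ``logx = logy = 2`` (so *x* = *y* = 4), the result is *log2(8)*:

>>> from nltk.probability import add_logs
>>> add_logs(2, 2)
3.0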
""" if logx < logy + _ADD_LOGS_MAX_DIFF: return logy if logy < logx + _ADD_LOGS_MAX_DIFF: return logx base = min(logx, logy) return base + math.log(2 ** (logx - base) + 2 ** (logy - base), 2) def sum_logs(logs): return reduce(add_logs, logs[1:], logs[0]) if len(logs) != 0 else _NINF ##////////////////////////////////////////////////////// ## Probabilistic Mix-in ##////////////////////////////////////////////////////// class ProbabilisticMixIn: """ A mix-in class to associate probabilities with other classes (trees, rules, etc.). To use the ``ProbabilisticMixIn`` class, define a new class that derives from an existing class and from ProbabilisticMixIn. You will need to define a new constructor for the new class, which explicitly calls the constructors of both its parent classes. For example: >>> from nltk.probability import ProbabilisticMixIn >>> class A: ... def __init__(self, x, y): self.data = (x,y) ... >>> class ProbabilisticA(A, ProbabilisticMixIn): ... def __init__(self, x, y, **prob_kwarg): ... A.__init__(self, x, y) ... ProbabilisticMixIn.__init__(self, **prob_kwarg) See the documentation for the ProbabilisticMixIn ``constructor<__init__>`` for information about the arguments it expects. You should generally also redefine the string representation methods, the comparison methods, and the hashing method. """ def __init__(self, **kwargs): """ Initialize this object's probability. This initializer should be called by subclass constructors. ``prob`` should generally be the first argument for those constructors. :param prob: The probability associated with the object. :type prob: float :param logprob: The log of the probability associated with the object. :type logprob: float """ if "prob" in kwargs: if "logprob" in kwargs: raise TypeError("Must specify either prob or logprob " "(not both)") else: ProbabilisticMixIn.set_prob(self, kwargs["prob"]) elif "logprob" in kwargs: ProbabilisticMixIn.set_logprob(self, kwargs["logprob"]) else: self.__prob = self.__logprob = None def set_prob(self, prob): """ Set the probability associated with this object to ``prob``. :param prob: The new probability :type prob: float """ self.__prob = prob self.__logprob = None def set_logprob(self, logprob): """ Set the log probability associated with this object to ``logprob``. I.e., set the probability associated with this object to ``2**(logprob)``. :param logprob: The new log probability :type logprob: float """ self.__logprob = logprob self.__prob = None def prob(self): """ Return the probability associated with this object. :rtype: float """ if self.__prob is None: if self.__logprob is None: return None self.__prob = 2 ** (self.__logprob) return self.__prob def logprob(self): """ Return ``log(p)``, where ``p`` is the probability associated with this object. 
:rtype: float """ if self.__logprob is None: if self.__prob is None: return None self.__logprob = math.log(self.__prob, 2) return self.__logprob class ImmutableProbabilisticMixIn(ProbabilisticMixIn): def set_prob(self, prob): raise ValueError("%s is immutable" % self.__class__.__name__) def set_logprob(self, prob): raise ValueError("%s is immutable" % self.__class__.__name__) ## Helper function for processing keyword arguments def _get_kwarg(kwargs, key, default): if key in kwargs: arg = kwargs[key] del kwargs[key] else: arg = default return arg ##////////////////////////////////////////////////////// ## Demonstration ##////////////////////////////////////////////////////// def _create_rand_fdist(numsamples, numoutcomes): """ Create a new frequency distribution, with random samples. The samples are numbers from 1 to ``numsamples``, and are generated by summing two numbers, each of which has a uniform distribution. """ fdist = FreqDist() for x in range(numoutcomes): y = random.randint(1, (1 + numsamples) // 2) + random.randint( 0, numsamples // 2 ) fdist[y] += 1 return fdist def _create_sum_pdist(numsamples): """ Return the true probability distribution for the experiment ``_create_rand_fdist(numsamples, x)``. """ fdist = FreqDist() for x in range(1, (1 + numsamples) // 2 + 1): for y in range(0, numsamples // 2 + 1): fdist[x + y] += 1 return MLEProbDist(fdist) def demo(numsamples=6, numoutcomes=500): """ A demonstration of frequency distributions and probability distributions. This demonstration creates three frequency distributions with, and uses them to sample a random process with ``numsamples`` samples. Each frequency distribution is sampled ``numoutcomes`` times. These three frequency distributions are then used to build six probability distributions. Finally, the probability estimates of these distributions are compared to the actual probability of each sample. :type numsamples: int :param numsamples: The number of samples to use in each demo frequency distributions. :type numoutcomes: int :param numoutcomes: The total number of outcomes for each demo frequency distribution. These outcomes are divided into ``numsamples`` bins. :rtype: None """ # Randomly sample a stochastic process three times. fdist1 = _create_rand_fdist(numsamples, numoutcomes) fdist2 = _create_rand_fdist(numsamples, numoutcomes) fdist3 = _create_rand_fdist(numsamples, numoutcomes) # Use our samples to create probability distributions. pdists = [ MLEProbDist(fdist1), LidstoneProbDist(fdist1, 0.5, numsamples), HeldoutProbDist(fdist1, fdist2, numsamples), HeldoutProbDist(fdist2, fdist1, numsamples), CrossValidationProbDist([fdist1, fdist2, fdist3], numsamples), SimpleGoodTuringProbDist(fdist1), SimpleGoodTuringProbDist(fdist1, 7), _create_sum_pdist(numsamples), ] # Find the probability of each sample. vals = [] for n in range(1, numsamples + 1): vals.append(tuple([n, fdist1.freq(n)] + [pdist.prob(n) for pdist in pdists])) # Print the results in a formatted table. 
print( "%d samples (1-%d); %d outcomes were sampled for each FreqDist" % (numsamples, numsamples, numoutcomes) ) print("=" * 9 * (len(pdists) + 2)) FORMATSTR = " FreqDist " + "%8s " * (len(pdists) - 1) + "| Actual" print(FORMATSTR % tuple(repr(pdist)[1:9] for pdist in pdists[:-1])) print("-" * 9 * (len(pdists) + 2)) FORMATSTR = "%3d %8.6f " + "%8.6f " * (len(pdists) - 1) + "| %8.6f" for val in vals: print(FORMATSTR % val) # Print the totals for each column (should all be 1.0) zvals = list(zip(*vals)) sums = [sum(val) for val in zvals[1:]] print("-" * 9 * (len(pdists) + 2)) FORMATSTR = "Total " + "%8.6f " * (len(pdists)) + "| %8.6f" print(FORMATSTR % tuple(sums)) print("=" * 9 * (len(pdists) + 2)) # Display the distributions themselves, if they're short enough. if len("%s" % fdist1) < 70: print(" fdist1: %s" % fdist1) print(" fdist2: %s" % fdist2) print(" fdist3: %s" % fdist3) print() print("Generating:") for pdist in pdists: fdist = FreqDist(pdist.generate() for i in range(5000)) print("{:>20} {}".format(pdist.__class__.__name__[:20], ("%s" % fdist)[:55])) print() def gt_demo(): from nltk import corpus emma_words = corpus.gutenberg.words("austen-emma.txt") fd = FreqDist(emma_words) sgt = SimpleGoodTuringProbDist(fd) print("{:>18} {:>8} {:>14}".format("word", "frequency", "SimpleGoodTuring")) fd_keys_sorted = ( key for key, value in sorted(fd.items(), key=lambda item: item[1], reverse=True) ) for key in fd_keys_sorted: print("%18s %8d %14e" % (key, fd[key], sgt.prob(key))) if __name__ == "__main__": demo(6, 10) demo(5, 5000) gt_demo() __all__ = [ "ConditionalFreqDist", "ConditionalProbDist", "ConditionalProbDistI", "CrossValidationProbDist", "DictionaryConditionalProbDist", "DictionaryProbDist", "ELEProbDist", "FreqDist", "SimpleGoodTuringProbDist", "HeldoutProbDist", "ImmutableProbabilisticMixIn", "LaplaceProbDist", "LidstoneProbDist", "MLEProbDist", "MutableProbDist", "KneserNeyProbDist", "ProbDistI", "ProbabilisticMixIn", "UniformProbDist", "WittenBellProbDist", "add_logs", "log_likelihood", "sum_logs", "entropy", ] nltk-3.7/nltk/sem/000077500000000000000000000000001420073152400140655ustar00rootroot00000000000000nltk-3.7/nltk/sem/__init__.py000066400000000000000000000045001420073152400161750ustar00rootroot00000000000000# Natural Language Toolkit: Semantic Interpretation # # Copyright (C) 2001-2022 NLTK Project # Author: Ewan Klein # URL: # For license information, see LICENSE.TXT """ NLTK Semantic Interpretation Package This package contains classes for representing semantic structure in formulas of first-order logic and for evaluating such formulas in set-theoretic models. >>> from nltk.sem import logic >>> logic._counter._value = 0 The package has two main components: - ``logic`` provides support for analyzing expressions of First Order Logic (FOL). - ``evaluate`` allows users to recursively determine truth in a model for formulas of FOL. A model consists of a domain of discourse and a valuation function, which assigns values to non-logical constants. We assume that entities in the domain are represented as strings such as ``'b1'``, ``'g1'``, etc. A ``Valuation`` is initialized with a list of (symbol, value) pairs, where values are entities, sets of entities or sets of tuples of entities. The domain of discourse can be inferred from the valuation, and model is then created with domain and valuation as parameters. >>> from nltk.sem import Valuation, Model >>> v = [('adam', 'b1'), ('betty', 'g1'), ('fido', 'd1'), ... ('girl', set(['g1', 'g2'])), ('boy', set(['b1', 'b2'])), ... 
('dog', set(['d1'])), ... ('love', set([('b1', 'g1'), ('b2', 'g2'), ('g1', 'b1'), ('g2', 'b1')]))] >>> val = Valuation(v) >>> dom = val.domain >>> m = Model(dom, val) """ from nltk.sem.boxer import Boxer from nltk.sem.drt import DRS, DrtExpression from nltk.sem.evaluate import ( Assignment, Model, Undefined, Valuation, arity, is_rel, read_valuation, set2rel, ) from nltk.sem.lfg import FStructure from nltk.sem.logic import ( ApplicationExpression, Expression, LogicalExpressionException, Variable, binding_ops, boolean_ops, equality_preds, read_logic, ) from nltk.sem.relextract import clause, extract_rels, rtuple from nltk.sem.skolemize import skolemize from nltk.sem.util import evaluate_sents, interpret_sents, parse_sents, root_semrep # from nltk.sem.glue import Glue # from nltk.sem.hole import HoleSemantics # from nltk.sem.cooper_storage import CooperStore # don't import chat80 as its names are too generic nltk-3.7/nltk/sem/boxer.py000066400000000000000000001506221420073152400155640ustar00rootroot00000000000000# Natural Language Toolkit: Interface to Boxer # # # Author: Dan Garrette # # Copyright (C) 2001-2022 NLTK Project # URL: # For license information, see LICENSE.TXT """ An interface to Boxer. This interface relies on the latest version of the development (subversion) version of C&C and Boxer. Usage ===== Set the environment variable CANDC to the bin directory of your CandC installation. The models directory should be in the CandC root directory. For example:: /path/to/candc/ bin/ candc boxer models/ boxer/ """ import operator import os import re import subprocess import tempfile from functools import reduce from optparse import OptionParser from nltk.internals import find_binary from nltk.sem.drt import ( DRS, DrtApplicationExpression, DrtEqualityExpression, DrtNegatedExpression, DrtOrExpression, DrtParser, DrtProposition, DrtTokens, DrtVariableExpression, ) from nltk.sem.logic import ( ExpectedMoreTokensException, LogicalExpressionException, UnexpectedTokenException, Variable, ) class Boxer: """ This class is an interface to Johan Bos's program Boxer, a wide-coverage semantic parser that produces Discourse Representation Structures (DRSs). """ def __init__( self, boxer_drs_interpreter=None, elimeq=False, bin_dir=None, verbose=False, resolve=True, ): """ :param boxer_drs_interpreter: A class that converts from the ``AbstractBoxerDrs`` object hierarchy to a different object. The default is ``NltkDrtBoxerDrsInterpreter``, which converts to the NLTK DRT hierarchy. :param elimeq: When set to true, Boxer removes all equalities from the DRSs and discourse referents standing in the equality relation are unified, but only if this can be done in a meaning-preserving manner. :param resolve: When set to true, Boxer will resolve all anaphoric DRSs and perform merge-reduction. Resolution follows Van der Sandt's theory of binding and accommodation. """ if boxer_drs_interpreter is None: boxer_drs_interpreter = NltkDrtBoxerDrsInterpreter() self._boxer_drs_interpreter = boxer_drs_interpreter self._resolve = resolve self._elimeq = elimeq self.set_bin_dir(bin_dir, verbose) def set_bin_dir(self, bin_dir, verbose=False): self._candc_bin = self._find_binary("candc", bin_dir, verbose) self._candc_models_path = os.path.normpath( os.path.join(self._candc_bin[:-5], "../models") ) self._boxer_bin = self._find_binary("boxer", bin_dir, verbose) def interpret(self, input, discourse_id=None, question=False, verbose=False): """ Use Boxer to give a first order representation. 
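An illustrative usage sketch, not a doctest: it assumes a working local C&C/Boxer installation (with the ``CANDC`` environment variable pointing at its ``bin`` directory) and an arbitrary example sentence::

    from nltk.sem.boxer import Boxer
    drs = Boxer().interpret("John loves Mary.")
    print(drs)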
:param input: str Input sentence to parse :param occur_index: bool Should predicates be occurrence indexed? :param discourse_id: str An identifier to be inserted to each occurrence-indexed predicate. :return: ``drt.DrtExpression`` """ discourse_ids = [discourse_id] if discourse_id is not None else None (d,) = self.interpret_multi_sents([[input]], discourse_ids, question, verbose) if not d: raise Exception(f'Unable to interpret: "{input}"') return d def interpret_multi(self, input, discourse_id=None, question=False, verbose=False): """ Use Boxer to give a first order representation. :param input: list of str Input sentences to parse as a single discourse :param occur_index: bool Should predicates be occurrence indexed? :param discourse_id: str An identifier to be inserted to each occurrence-indexed predicate. :return: ``drt.DrtExpression`` """ discourse_ids = [discourse_id] if discourse_id is not None else None (d,) = self.interpret_multi_sents([input], discourse_ids, question, verbose) if not d: raise Exception(f'Unable to interpret: "{input}"') return d def interpret_sents( self, inputs, discourse_ids=None, question=False, verbose=False ): """ Use Boxer to give a first order representation. :param inputs: list of str Input sentences to parse as individual discourses :param occur_index: bool Should predicates be occurrence indexed? :param discourse_ids: list of str Identifiers to be inserted to each occurrence-indexed predicate. :return: list of ``drt.DrtExpression`` """ return self.interpret_multi_sents( [[input] for input in inputs], discourse_ids, question, verbose ) def interpret_multi_sents( self, inputs, discourse_ids=None, question=False, verbose=False ): """ Use Boxer to give a first order representation. :param inputs: list of list of str Input discourses to parse :param occur_index: bool Should predicates be occurrence indexed? :param discourse_ids: list of str Identifiers to be inserted to each occurrence-indexed predicate. :return: ``drt.DrtExpression`` """ if discourse_ids is not None: assert len(inputs) == len(discourse_ids) assert reduce(operator.and_, (id is not None for id in discourse_ids)) use_disc_id = True else: discourse_ids = list(map(str, range(len(inputs)))) use_disc_id = False candc_out = self._call_candc(inputs, discourse_ids, question, verbose=verbose) boxer_out = self._call_boxer(candc_out, verbose=verbose) # if 'ERROR: input file contains no ccg/2 terms.' in boxer_out: # raise UnparseableInputException('Could not parse with candc: "%s"' % input_str) drs_dict = self._parse_to_drs_dict(boxer_out, use_disc_id) return [drs_dict.get(id, None) for id in discourse_ids] def _call_candc(self, inputs, discourse_ids, question, verbose=False): """ Call the ``candc`` binary with the given input. :param inputs: list of list of str Input discourses to parse :param discourse_ids: list of str Identifiers to be inserted to each occurrence-indexed predicate. :param filename: str A filename for the output file :return: stdout """ args = [ "--models", os.path.join(self._candc_models_path, ["boxer", "questions"][question]), "--candc-printer", "boxer", ] return self._call( "\n".join( sum( ([f"'{id}'"] + d for d, id in zip(inputs, discourse_ids)), [], ) ), self._candc_bin, args, verbose, ) def _call_boxer(self, candc_out, verbose=False): """ Call the ``boxer`` binary with the given input. 
:param candc_out: str output from C&C parser :return: stdout """ f = None try: fd, temp_filename = tempfile.mkstemp( prefix="boxer-", suffix=".in", text=True ) f = os.fdopen(fd, "w") f.write(candc_out) finally: if f: f.close() args = [ "--box", "false", "--semantics", "drs", #'--flat', 'false', # removed from boxer "--resolve", ["false", "true"][self._resolve], "--elimeq", ["false", "true"][self._elimeq], "--format", "prolog", "--instantiate", "true", "--input", temp_filename, ] stdout = self._call(None, self._boxer_bin, args, verbose) os.remove(temp_filename) return stdout def _find_binary(self, name, bin_dir, verbose=False): return find_binary( name, path_to_bin=bin_dir, env_vars=["CANDC"], url="http://svn.ask.it.usyd.edu.au/trac/candc/", binary_names=[name, name + ".exe"], verbose=verbose, ) def _call(self, input_str, binary, args=[], verbose=False): """ Call the binary with the given input. :param input_str: A string whose contents are used as stdin. :param binary: The location of the binary to call :param args: A list of command-line arguments. :return: stdout """ if verbose: print("Calling:", binary) print("Args:", args) print("Input:", input_str) print("Command:", binary + " " + " ".join(args)) # Call via a subprocess if input_str is None: cmd = [binary] + args p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) else: cmd = 'echo "{}" | {} {}'.format(input_str, binary, " ".join(args)) p = subprocess.Popen( cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True ) stdout, stderr = p.communicate() if verbose: print("Return code:", p.returncode) if stdout: print("stdout:\n", stdout, "\n") if stderr: print("stderr:\n", stderr, "\n") if p.returncode != 0: raise Exception( "ERROR CALLING: {} {}\nReturncode: {}\n{}".format( binary, " ".join(args), p.returncode, stderr ) ) return stdout def _parse_to_drs_dict(self, boxer_out, use_disc_id): lines = boxer_out.split("\n") drs_dict = {} i = 0 while i < len(lines): line = lines[i] if line.startswith("id("): comma_idx = line.index(",") discourse_id = line[3:comma_idx] if discourse_id[0] == "'" and discourse_id[-1] == "'": discourse_id = discourse_id[1:-1] drs_id = line[comma_idx + 1 : line.index(")")] i += 1 line = lines[i] assert line.startswith(f"sem({drs_id},") if line[-4:] == "').'": line = line[:-4] + ")." assert line.endswith(")."), f"can't parse line: {line}" search_start = len(f"sem({drs_id},[") brace_count = 1 drs_start = -1 for j, c in enumerate(line[search_start:]): if c == "[": brace_count += 1 if c == "]": brace_count -= 1 if brace_count == 0: drs_start = search_start + j + 1 if line[drs_start : drs_start + 3] == "','": drs_start = drs_start + 3 else: drs_start = drs_start + 1 break assert drs_start > -1 drs_input = line[drs_start:-2].strip() parsed = self._parse_drs(drs_input, discourse_id, use_disc_id) drs_dict[discourse_id] = self._boxer_drs_interpreter.interpret(parsed) i += 1 return drs_dict def _parse_drs(self, drs_string, discourse_id, use_disc_id): return BoxerOutputDrsParser([None, discourse_id][use_disc_id]).parse(drs_string) class BoxerOutputDrsParser(DrtParser): def __init__(self, discourse_id=None): """ This class is used to parse the Prolog DRS output from Boxer into a hierarchy of python objects. 
""" DrtParser.__init__(self) self.discourse_id = discourse_id self.sentence_id_offset = None self.quote_chars = [("'", "'", "\\", False)] def parse(self, data, signature=None): return DrtParser.parse(self, data, signature) def get_all_symbols(self): return ["(", ")", ",", "[", "]", ":"] def handle(self, tok, context): return self.handle_drs(tok) def attempt_adjuncts(self, expression, context): return expression def parse_condition(self, indices): """ Parse a DRS condition :return: list of ``DrtExpression`` """ tok = self.token() accum = self.handle_condition(tok, indices) if accum is None: raise UnexpectedTokenException(tok) return accum def handle_drs(self, tok): if tok == "drs": return self.parse_drs() elif tok in ["merge", "smerge"]: return self._handle_binary_expression(self._make_merge_expression)(None, []) elif tok in ["alfa"]: return self._handle_alfa(self._make_merge_expression)(None, []) def handle_condition(self, tok, indices): """ Handle a DRS condition :param indices: list of int :return: list of ``DrtExpression`` """ if tok == "not": return [self._handle_not()] if tok == "or": conds = [self._handle_binary_expression(self._make_or_expression)] elif tok == "imp": conds = [self._handle_binary_expression(self._make_imp_expression)] elif tok == "eq": conds = [self._handle_eq()] elif tok == "prop": conds = [self._handle_prop()] elif tok == "pred": conds = [self._handle_pred()] elif tok == "named": conds = [self._handle_named()] elif tok == "rel": conds = [self._handle_rel()] elif tok == "timex": conds = self._handle_timex() elif tok == "card": conds = [self._handle_card()] elif tok == "whq": conds = [self._handle_whq()] elif tok == "duplex": conds = [self._handle_duplex()] else: conds = [] return sum( ( [cond(sent_index, word_indices) for cond in conds] for sent_index, word_indices in self._sent_and_word_indices(indices) ), [], ) def _handle_not(self): self.assertToken(self.token(), "(") drs = self.process_next_expression(None) self.assertToken(self.token(), ")") return BoxerNot(drs) def _handle_pred(self): # pred(_G3943, dog, n, 0) self.assertToken(self.token(), "(") variable = self.parse_variable() self.assertToken(self.token(), ",") name = self.token() self.assertToken(self.token(), ",") pos = self.token() self.assertToken(self.token(), ",") sense = int(self.token()) self.assertToken(self.token(), ")") def _handle_pred_f(sent_index, word_indices): return BoxerPred( self.discourse_id, sent_index, word_indices, variable, name, pos, sense ) return _handle_pred_f def _handle_duplex(self): # duplex(whq, drs(...), var, drs(...)) self.assertToken(self.token(), "(") # self.assertToken(self.token(), '[') ans_types = [] # while self.token(0) != ']': # cat = self.token() # self.assertToken(self.token(), ':') # if cat == 'des': # ans_types.append(self.token()) # elif cat == 'num': # ans_types.append('number') # typ = self.token() # if typ == 'cou': # ans_types.append('count') # else: # ans_types.append(typ) # else: # ans_types.append(self.token()) # self.token() #swallow the ']' self.assertToken(self.token(), "whq") self.assertToken(self.token(), ",") d1 = self.process_next_expression(None) self.assertToken(self.token(), ",") ref = self.parse_variable() self.assertToken(self.token(), ",") d2 = self.process_next_expression(None) self.assertToken(self.token(), ")") return lambda sent_index, word_indices: BoxerWhq( self.discourse_id, sent_index, word_indices, ans_types, d1, ref, d2 ) def _handle_named(self): # named(x0, john, per, 0) self.assertToken(self.token(), "(") variable = 
self.parse_variable() self.assertToken(self.token(), ",") name = self.token() self.assertToken(self.token(), ",") type = self.token() self.assertToken(self.token(), ",") sense = self.token() # as per boxer rev 2554 self.assertToken(self.token(), ")") return lambda sent_index, word_indices: BoxerNamed( self.discourse_id, sent_index, word_indices, variable, name, type, sense ) def _handle_rel(self): # rel(_G3993, _G3943, agent, 0) self.assertToken(self.token(), "(") var1 = self.parse_variable() self.assertToken(self.token(), ",") var2 = self.parse_variable() self.assertToken(self.token(), ",") rel = self.token() self.assertToken(self.token(), ",") sense = int(self.token()) self.assertToken(self.token(), ")") return lambda sent_index, word_indices: BoxerRel( self.discourse_id, sent_index, word_indices, var1, var2, rel, sense ) def _handle_timex(self): # timex(_G18322, date([]: (+), []:'XXXX', [1004]:'04', []:'XX')) self.assertToken(self.token(), "(") arg = self.parse_variable() self.assertToken(self.token(), ",") new_conds = self._handle_time_expression(arg) self.assertToken(self.token(), ")") return new_conds def _handle_time_expression(self, arg): # date([]: (+), []:'XXXX', [1004]:'04', []:'XX') tok = self.token() self.assertToken(self.token(), "(") if tok == "date": conds = self._handle_date(arg) elif tok == "time": conds = self._handle_time(arg) else: return None self.assertToken(self.token(), ")") return [ lambda sent_index, word_indices: BoxerPred( self.discourse_id, sent_index, word_indices, arg, tok, "n", 0 ) ] + [lambda sent_index, word_indices: cond for cond in conds] def _handle_date(self, arg): # []: (+), []:'XXXX', [1004]:'04', []:'XX' conds = [] ((sent_index, word_indices),) = self._sent_and_word_indices( self._parse_index_list() ) self.assertToken(self.token(), "(") pol = self.token() self.assertToken(self.token(), ")") conds.append( BoxerPred( self.discourse_id, sent_index, word_indices, arg, f"date_pol_{pol}", "a", 0, ) ) self.assertToken(self.token(), ",") ((sent_index, word_indices),) = self._sent_and_word_indices( self._parse_index_list() ) year = self.token() if year != "XXXX": year = year.replace(":", "_") conds.append( BoxerPred( self.discourse_id, sent_index, word_indices, arg, f"date_year_{year}", "a", 0, ) ) self.assertToken(self.token(), ",") ((sent_index, word_indices),) = self._sent_and_word_indices( self._parse_index_list() ) month = self.token() if month != "XX": conds.append( BoxerPred( self.discourse_id, sent_index, word_indices, arg, f"date_month_{month}", "a", 0, ) ) self.assertToken(self.token(), ",") ((sent_index, word_indices),) = self._sent_and_word_indices( self._parse_index_list() ) day = self.token() if day != "XX": conds.append( BoxerPred( self.discourse_id, sent_index, word_indices, arg, f"date_day_{day}", "a", 0, ) ) return conds def _handle_time(self, arg): # time([1018]:'18', []:'XX', []:'XX') conds = [] self._parse_index_list() hour = self.token() if hour != "XX": conds.append(self._make_atom("r_hour_2", arg, hour)) self.assertToken(self.token(), ",") self._parse_index_list() min = self.token() if min != "XX": conds.append(self._make_atom("r_min_2", arg, min)) self.assertToken(self.token(), ",") self._parse_index_list() sec = self.token() if sec != "XX": conds.append(self._make_atom("r_sec_2", arg, sec)) return conds def _handle_card(self): # card(_G18535, 28, ge) self.assertToken(self.token(), "(") variable = self.parse_variable() self.assertToken(self.token(), ",") value = self.token() self.assertToken(self.token(), ",") type = self.token() 
self.assertToken(self.token(), ")") return lambda sent_index, word_indices: BoxerCard( self.discourse_id, sent_index, word_indices, variable, value, type ) def _handle_prop(self): # prop(_G15949, drs(...)) self.assertToken(self.token(), "(") variable = self.parse_variable() self.assertToken(self.token(), ",") drs = self.process_next_expression(None) self.assertToken(self.token(), ")") return lambda sent_index, word_indices: BoxerProp( self.discourse_id, sent_index, word_indices, variable, drs ) def _parse_index_list(self): # [1001,1002]: indices = [] self.assertToken(self.token(), "[") while self.token(0) != "]": indices.append(self.parse_index()) if self.token(0) == ",": self.token() # swallow ',' self.token() # swallow ']' self.assertToken(self.token(), ":") return indices def parse_drs(self): # drs([[1001]:_G3943], # [[1002]:pred(_G3943, dog, n, 0)] # ) self.assertToken(self.token(), "(") self.assertToken(self.token(), "[") refs = set() while self.token(0) != "]": indices = self._parse_index_list() refs.add(self.parse_variable()) if self.token(0) == ",": self.token() # swallow ',' self.token() # swallow ']' self.assertToken(self.token(), ",") self.assertToken(self.token(), "[") conds = [] while self.token(0) != "]": indices = self._parse_index_list() conds.extend(self.parse_condition(indices)) if self.token(0) == ",": self.token() # swallow ',' self.token() # swallow ']' self.assertToken(self.token(), ")") return BoxerDrs(list(refs), conds) def _handle_binary_expression(self, make_callback): self.assertToken(self.token(), "(") drs1 = self.process_next_expression(None) self.assertToken(self.token(), ",") drs2 = self.process_next_expression(None) self.assertToken(self.token(), ")") return lambda sent_index, word_indices: make_callback( sent_index, word_indices, drs1, drs2 ) def _handle_alfa(self, make_callback): self.assertToken(self.token(), "(") type = self.token() self.assertToken(self.token(), ",") drs1 = self.process_next_expression(None) self.assertToken(self.token(), ",") drs2 = self.process_next_expression(None) self.assertToken(self.token(), ")") return lambda sent_index, word_indices: make_callback( sent_index, word_indices, drs1, drs2 ) def _handle_eq(self): self.assertToken(self.token(), "(") var1 = self.parse_variable() self.assertToken(self.token(), ",") var2 = self.parse_variable() self.assertToken(self.token(), ")") return lambda sent_index, word_indices: BoxerEq( self.discourse_id, sent_index, word_indices, var1, var2 ) def _handle_whq(self): self.assertToken(self.token(), "(") self.assertToken(self.token(), "[") ans_types = [] while self.token(0) != "]": cat = self.token() self.assertToken(self.token(), ":") if cat == "des": ans_types.append(self.token()) elif cat == "num": ans_types.append("number") typ = self.token() if typ == "cou": ans_types.append("count") else: ans_types.append(typ) else: ans_types.append(self.token()) self.token() # swallow the ']' self.assertToken(self.token(), ",") d1 = self.process_next_expression(None) self.assertToken(self.token(), ",") ref = self.parse_variable() self.assertToken(self.token(), ",") d2 = self.process_next_expression(None) self.assertToken(self.token(), ")") return lambda sent_index, word_indices: BoxerWhq( self.discourse_id, sent_index, word_indices, ans_types, d1, ref, d2 ) def _make_merge_expression(self, sent_index, word_indices, drs1, drs2): return BoxerDrs(drs1.refs + drs2.refs, drs1.conds + drs2.conds) def _make_or_expression(self, sent_index, word_indices, drs1, drs2): return BoxerOr(self.discourse_id, sent_index, 
word_indices, drs1, drs2) def _make_imp_expression(self, sent_index, word_indices, drs1, drs2): return BoxerDrs(drs1.refs, drs1.conds, drs2) def parse_variable(self): var = self.token() assert re.match(r"^[exps]\d+$", var), var return var def parse_index(self): return int(self.token()) def _sent_and_word_indices(self, indices): """ :return: list of (sent_index, word_indices) tuples """ sent_indices = {(i / 1000) - 1 for i in indices if i >= 0} if sent_indices: pairs = [] for sent_index in sent_indices: word_indices = [ (i % 1000) - 1 for i in indices if sent_index == (i / 1000) - 1 ] pairs.append((sent_index, word_indices)) return pairs else: word_indices = [(i % 1000) - 1 for i in indices] return [(None, word_indices)] class BoxerDrsParser(DrtParser): """ Reparse the str form of subclasses of ``AbstractBoxerDrs`` """ def __init__(self, discourse_id=None): DrtParser.__init__(self) self.discourse_id = discourse_id def get_all_symbols(self): return [ DrtTokens.OPEN, DrtTokens.CLOSE, DrtTokens.COMMA, DrtTokens.OPEN_BRACKET, DrtTokens.CLOSE_BRACKET, ] def attempt_adjuncts(self, expression, context): return expression def handle(self, tok, context): try: # if tok == 'drs': # self.assertNextToken(DrtTokens.OPEN) # label = int(self.token()) # self.assertNextToken(DrtTokens.COMMA) # refs = list(map(int, self.handle_refs())) # self.assertNextToken(DrtTokens.COMMA) # conds = self.handle_conds(None) # self.assertNextToken(DrtTokens.CLOSE) # return BoxerDrs(label, refs, conds) if tok == "pred": self.assertNextToken(DrtTokens.OPEN) disc_id = ( self.discourse_id if self.discourse_id is not None else self.token() ) self.assertNextToken(DrtTokens.COMMA) sent_id = self.nullableIntToken() self.assertNextToken(DrtTokens.COMMA) word_ids = list(map(int, self.handle_refs())) self.assertNextToken(DrtTokens.COMMA) variable = int(self.token()) self.assertNextToken(DrtTokens.COMMA) name = self.token() self.assertNextToken(DrtTokens.COMMA) pos = self.token() self.assertNextToken(DrtTokens.COMMA) sense = int(self.token()) self.assertNextToken(DrtTokens.CLOSE) return BoxerPred(disc_id, sent_id, word_ids, variable, name, pos, sense) elif tok == "named": self.assertNextToken(DrtTokens.OPEN) disc_id = ( self.discourse_id if self.discourse_id is not None else self.token() ) self.assertNextToken(DrtTokens.COMMA) sent_id = int(self.token()) self.assertNextToken(DrtTokens.COMMA) word_ids = map(int, self.handle_refs()) self.assertNextToken(DrtTokens.COMMA) variable = int(self.token()) self.assertNextToken(DrtTokens.COMMA) name = self.token() self.assertNextToken(DrtTokens.COMMA) type = self.token() self.assertNextToken(DrtTokens.COMMA) sense = int(self.token()) self.assertNextToken(DrtTokens.CLOSE) return BoxerNamed( disc_id, sent_id, word_ids, variable, name, type, sense ) elif tok == "rel": self.assertNextToken(DrtTokens.OPEN) disc_id = ( self.discourse_id if self.discourse_id is not None else self.token() ) self.assertNextToken(DrtTokens.COMMA) sent_id = self.nullableIntToken() self.assertNextToken(DrtTokens.COMMA) word_ids = list(map(int, self.handle_refs())) self.assertNextToken(DrtTokens.COMMA) var1 = int(self.token()) self.assertNextToken(DrtTokens.COMMA) var2 = int(self.token()) self.assertNextToken(DrtTokens.COMMA) rel = self.token() self.assertNextToken(DrtTokens.COMMA) sense = int(self.token()) self.assertNextToken(DrtTokens.CLOSE) return BoxerRel(disc_id, sent_id, word_ids, var1, var2, rel, sense) elif tok == "prop": self.assertNextToken(DrtTokens.OPEN) disc_id = ( self.discourse_id if self.discourse_id is not 
None else self.token() ) self.assertNextToken(DrtTokens.COMMA) sent_id = int(self.token()) self.assertNextToken(DrtTokens.COMMA) word_ids = list(map(int, self.handle_refs())) self.assertNextToken(DrtTokens.COMMA) variable = int(self.token()) self.assertNextToken(DrtTokens.COMMA) drs = self.process_next_expression(None) self.assertNextToken(DrtTokens.CLOSE) return BoxerProp(disc_id, sent_id, word_ids, variable, drs) elif tok == "not": self.assertNextToken(DrtTokens.OPEN) drs = self.process_next_expression(None) self.assertNextToken(DrtTokens.CLOSE) return BoxerNot(drs) elif tok == "imp": self.assertNextToken(DrtTokens.OPEN) drs1 = self.process_next_expression(None) self.assertNextToken(DrtTokens.COMMA) drs2 = self.process_next_expression(None) self.assertNextToken(DrtTokens.CLOSE) return BoxerDrs(drs1.refs, drs1.conds, drs2) elif tok == "or": self.assertNextToken(DrtTokens.OPEN) disc_id = ( self.discourse_id if self.discourse_id is not None else self.token() ) self.assertNextToken(DrtTokens.COMMA) sent_id = self.nullableIntToken() self.assertNextToken(DrtTokens.COMMA) word_ids = map(int, self.handle_refs()) self.assertNextToken(DrtTokens.COMMA) drs1 = self.process_next_expression(None) self.assertNextToken(DrtTokens.COMMA) drs2 = self.process_next_expression(None) self.assertNextToken(DrtTokens.CLOSE) return BoxerOr(disc_id, sent_id, word_ids, drs1, drs2) elif tok == "eq": self.assertNextToken(DrtTokens.OPEN) disc_id = ( self.discourse_id if self.discourse_id is not None else self.token() ) self.assertNextToken(DrtTokens.COMMA) sent_id = self.nullableIntToken() self.assertNextToken(DrtTokens.COMMA) word_ids = list(map(int, self.handle_refs())) self.assertNextToken(DrtTokens.COMMA) var1 = int(self.token()) self.assertNextToken(DrtTokens.COMMA) var2 = int(self.token()) self.assertNextToken(DrtTokens.CLOSE) return BoxerEq(disc_id, sent_id, word_ids, var1, var2) elif tok == "card": self.assertNextToken(DrtTokens.OPEN) disc_id = ( self.discourse_id if self.discourse_id is not None else self.token() ) self.assertNextToken(DrtTokens.COMMA) sent_id = self.nullableIntToken() self.assertNextToken(DrtTokens.COMMA) word_ids = map(int, self.handle_refs()) self.assertNextToken(DrtTokens.COMMA) var = int(self.token()) self.assertNextToken(DrtTokens.COMMA) value = self.token() self.assertNextToken(DrtTokens.COMMA) type = self.token() self.assertNextToken(DrtTokens.CLOSE) return BoxerCard(disc_id, sent_id, word_ids, var, value, type) elif tok == "whq": self.assertNextToken(DrtTokens.OPEN) disc_id = ( self.discourse_id if self.discourse_id is not None else self.token() ) self.assertNextToken(DrtTokens.COMMA) sent_id = self.nullableIntToken() self.assertNextToken(DrtTokens.COMMA) word_ids = list(map(int, self.handle_refs())) self.assertNextToken(DrtTokens.COMMA) ans_types = self.handle_refs() self.assertNextToken(DrtTokens.COMMA) drs1 = self.process_next_expression(None) self.assertNextToken(DrtTokens.COMMA) var = int(self.token()) self.assertNextToken(DrtTokens.COMMA) drs2 = self.process_next_expression(None) self.assertNextToken(DrtTokens.CLOSE) return BoxerWhq(disc_id, sent_id, word_ids, ans_types, drs1, var, drs2) except Exception as e: raise LogicalExpressionException(self._currentIndex, str(e)) from e assert False, repr(tok) def nullableIntToken(self): t = self.token() return int(t) if t != "None" else None def get_next_token_variable(self, description): try: return self.token() except ExpectedMoreTokensException as e: raise ExpectedMoreTokensException(e.index, "Variable expected.") from e class 
AbstractBoxerDrs: def variables(self): """ :return: (set, set, set) """ variables, events, propositions = self._variables() return (variables - (events | propositions), events, propositions - events) def variable_types(self): vartypes = {} for t, vars in zip(("z", "e", "p"), self.variables()): for v in vars: vartypes[v] = t return vartypes def _variables(self): """ :return: (set, set, set) """ return (set(), set(), set()) def atoms(self): return set() def clean(self): return self def _clean_name(self, name): return name.replace("-", "_").replace("'", "_") def renumber_sentences(self, f): return self def __hash__(self): return hash(f"{self}") class BoxerDrs(AbstractBoxerDrs): def __init__(self, refs, conds, consequent=None): AbstractBoxerDrs.__init__(self) self.refs = refs self.conds = conds self.consequent = consequent def _variables(self): variables = (set(), set(), set()) for cond in self.conds: for s, v in zip(variables, cond._variables()): s.update(v) if self.consequent is not None: for s, v in zip(variables, self.consequent._variables()): s.update(v) return variables def atoms(self): atoms = reduce(operator.or_, (cond.atoms() for cond in self.conds), set()) if self.consequent is not None: atoms.update(self.consequent.atoms()) return atoms def clean(self): consequent = self.consequent.clean() if self.consequent else None return BoxerDrs(self.refs, [c.clean() for c in self.conds], consequent) def renumber_sentences(self, f): consequent = self.consequent.renumber_sentences(f) if self.consequent else None return BoxerDrs( self.refs, [c.renumber_sentences(f) for c in self.conds], consequent ) def __repr__(self): s = "drs([{}], [{}])".format( ", ".join("%s" % r for r in self.refs), ", ".join("%s" % c for c in self.conds), ) if self.consequent is not None: s = f"imp({s}, {self.consequent})" return s def __eq__(self, other): return ( self.__class__ == other.__class__ and self.refs == other.refs and len(self.conds) == len(other.conds) and reduce( operator.and_, (c1 == c2 for c1, c2 in zip(self.conds, other.conds)) ) and self.consequent == other.consequent ) def __ne__(self, other): return not self == other __hash__ = AbstractBoxerDrs.__hash__ class BoxerNot(AbstractBoxerDrs): def __init__(self, drs): AbstractBoxerDrs.__init__(self) self.drs = drs def _variables(self): return self.drs._variables() def atoms(self): return self.drs.atoms() def clean(self): return BoxerNot(self.drs.clean()) def renumber_sentences(self, f): return BoxerNot(self.drs.renumber_sentences(f)) def __repr__(self): return "not(%s)" % (self.drs) def __eq__(self, other): return self.__class__ == other.__class__ and self.drs == other.drs def __ne__(self, other): return not self == other __hash__ = AbstractBoxerDrs.__hash__ class BoxerIndexed(AbstractBoxerDrs): def __init__(self, discourse_id, sent_index, word_indices): AbstractBoxerDrs.__init__(self) self.discourse_id = discourse_id self.sent_index = sent_index self.word_indices = word_indices def atoms(self): return {self} def __eq__(self, other): return ( self.__class__ == other.__class__ and self.discourse_id == other.discourse_id and self.sent_index == other.sent_index and self.word_indices == other.word_indices and reduce(operator.and_, (s == o for s, o in zip(self, other))) ) def __ne__(self, other): return not self == other __hash__ = AbstractBoxerDrs.__hash__ def __repr__(self): s = "{}({}, {}, [{}]".format( self._pred(), self.discourse_id, self.sent_index, ", ".join("%s" % wi for wi in self.word_indices), ) for v in self: s += ", %s" % v return s + ")" class 
BoxerPred(BoxerIndexed): def __init__(self, discourse_id, sent_index, word_indices, var, name, pos, sense): BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices) self.var = var self.name = name self.pos = pos self.sense = sense def _variables(self): return ({self.var}, set(), set()) def change_var(self, var): return BoxerPred( self.discourse_id, self.sent_index, self.word_indices, var, self.name, self.pos, self.sense, ) def clean(self): return BoxerPred( self.discourse_id, self.sent_index, self.word_indices, self.var, self._clean_name(self.name), self.pos, self.sense, ) def renumber_sentences(self, f): new_sent_index = f(self.sent_index) return BoxerPred( self.discourse_id, new_sent_index, self.word_indices, self.var, self.name, self.pos, self.sense, ) def __iter__(self): return iter((self.var, self.name, self.pos, self.sense)) def _pred(self): return "pred" class BoxerNamed(BoxerIndexed): def __init__(self, discourse_id, sent_index, word_indices, var, name, type, sense): BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices) self.var = var self.name = name self.type = type self.sense = sense def _variables(self): return ({self.var}, set(), set()) def change_var(self, var): return BoxerNamed( self.discourse_id, self.sent_index, self.word_indices, var, self.name, self.type, self.sense, ) def clean(self): return BoxerNamed( self.discourse_id, self.sent_index, self.word_indices, self.var, self._clean_name(self.name), self.type, self.sense, ) def renumber_sentences(self, f): return BoxerNamed( self.discourse_id, f(self.sent_index), self.word_indices, self.var, self.name, self.type, self.sense, ) def __iter__(self): return iter((self.var, self.name, self.type, self.sense)) def _pred(self): return "named" class BoxerRel(BoxerIndexed): def __init__(self, discourse_id, sent_index, word_indices, var1, var2, rel, sense): BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices) self.var1 = var1 self.var2 = var2 self.rel = rel self.sense = sense def _variables(self): return ({self.var1, self.var2}, set(), set()) def clean(self): return BoxerRel( self.discourse_id, self.sent_index, self.word_indices, self.var1, self.var2, self._clean_name(self.rel), self.sense, ) def renumber_sentences(self, f): return BoxerRel( self.discourse_id, f(self.sent_index), self.word_indices, self.var1, self.var2, self.rel, self.sense, ) def __iter__(self): return iter((self.var1, self.var2, self.rel, self.sense)) def _pred(self): return "rel" class BoxerProp(BoxerIndexed): def __init__(self, discourse_id, sent_index, word_indices, var, drs): BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices) self.var = var self.drs = drs def _variables(self): return tuple( map(operator.or_, (set(), set(), {self.var}), self.drs._variables()) ) def referenced_labels(self): return {self.drs} def atoms(self): return self.drs.atoms() def clean(self): return BoxerProp( self.discourse_id, self.sent_index, self.word_indices, self.var, self.drs.clean(), ) def renumber_sentences(self, f): return BoxerProp( self.discourse_id, f(self.sent_index), self.word_indices, self.var, self.drs.renumber_sentences(f), ) def __iter__(self): return iter((self.var, self.drs)) def _pred(self): return "prop" class BoxerEq(BoxerIndexed): def __init__(self, discourse_id, sent_index, word_indices, var1, var2): BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices) self.var1 = var1 self.var2 = var2 def _variables(self): return ({self.var1, self.var2}, set(), set()) def atoms(self): return set() def 
renumber_sentences(self, f): return BoxerEq( self.discourse_id, f(self.sent_index), self.word_indices, self.var1, self.var2, ) def __iter__(self): return iter((self.var1, self.var2)) def _pred(self): return "eq" class BoxerCard(BoxerIndexed): def __init__(self, discourse_id, sent_index, word_indices, var, value, type): BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices) self.var = var self.value = value self.type = type def _variables(self): return ({self.var}, set(), set()) def renumber_sentences(self, f): return BoxerCard( self.discourse_id, f(self.sent_index), self.word_indices, self.var, self.value, self.type, ) def __iter__(self): return iter((self.var, self.value, self.type)) def _pred(self): return "card" class BoxerOr(BoxerIndexed): def __init__(self, discourse_id, sent_index, word_indices, drs1, drs2): BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices) self.drs1 = drs1 self.drs2 = drs2 def _variables(self): return tuple(map(operator.or_, self.drs1._variables(), self.drs2._variables())) def atoms(self): return self.drs1.atoms() | self.drs2.atoms() def clean(self): return BoxerOr( self.discourse_id, self.sent_index, self.word_indices, self.drs1.clean(), self.drs2.clean(), ) def renumber_sentences(self, f): return BoxerOr( self.discourse_id, f(self.sent_index), self.word_indices, self.drs1, self.drs2, ) def __iter__(self): return iter((self.drs1, self.drs2)) def _pred(self): return "or" class BoxerWhq(BoxerIndexed): def __init__( self, discourse_id, sent_index, word_indices, ans_types, drs1, variable, drs2 ): BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices) self.ans_types = ans_types self.drs1 = drs1 self.variable = variable self.drs2 = drs2 def _variables(self): return tuple( map( operator.or_, ({self.variable}, set(), set()), self.drs1._variables(), self.drs2._variables(), ) ) def atoms(self): return self.drs1.atoms() | self.drs2.atoms() def clean(self): return BoxerWhq( self.discourse_id, self.sent_index, self.word_indices, self.ans_types, self.drs1.clean(), self.variable, self.drs2.clean(), ) def renumber_sentences(self, f): return BoxerWhq( self.discourse_id, f(self.sent_index), self.word_indices, self.ans_types, self.drs1, self.variable, self.drs2, ) def __iter__(self): return iter( ("[" + ",".join(self.ans_types) + "]", self.drs1, self.variable, self.drs2) ) def _pred(self): return "whq" class PassthroughBoxerDrsInterpreter: def interpret(self, ex): return ex class NltkDrtBoxerDrsInterpreter: def __init__(self, occur_index=False): self._occur_index = occur_index def interpret(self, ex): """ :param ex: ``AbstractBoxerDrs`` :return: ``DrtExpression`` """ if isinstance(ex, BoxerDrs): drs = DRS( [Variable(r) for r in ex.refs], list(map(self.interpret, ex.conds)) ) if ex.consequent is not None: drs.consequent = self.interpret(ex.consequent) return drs elif isinstance(ex, BoxerNot): return DrtNegatedExpression(self.interpret(ex.drs)) elif isinstance(ex, BoxerPred): pred = self._add_occur_indexing(f"{ex.pos}_{ex.name}", ex) return self._make_atom(pred, ex.var) elif isinstance(ex, BoxerNamed): pred = self._add_occur_indexing(f"ne_{ex.type}_{ex.name}", ex) return self._make_atom(pred, ex.var) elif isinstance(ex, BoxerRel): pred = self._add_occur_indexing("%s" % (ex.rel), ex) return self._make_atom(pred, ex.var1, ex.var2) elif isinstance(ex, BoxerProp): return DrtProposition(Variable(ex.var), self.interpret(ex.drs)) elif isinstance(ex, BoxerEq): return DrtEqualityExpression( DrtVariableExpression(Variable(ex.var1)), 
DrtVariableExpression(Variable(ex.var2)), ) elif isinstance(ex, BoxerCard): pred = self._add_occur_indexing(f"card_{ex.type}_{ex.value}", ex) return self._make_atom(pred, ex.var) elif isinstance(ex, BoxerOr): return DrtOrExpression(self.interpret(ex.drs1), self.interpret(ex.drs2)) elif isinstance(ex, BoxerWhq): drs1 = self.interpret(ex.drs1) drs2 = self.interpret(ex.drs2) return DRS(drs1.refs + drs2.refs, drs1.conds + drs2.conds) assert False, f"{ex.__class__.__name__}: {ex}" def _make_atom(self, pred, *args): accum = DrtVariableExpression(Variable(pred)) for arg in args: accum = DrtApplicationExpression( accum, DrtVariableExpression(Variable(arg)) ) return accum def _add_occur_indexing(self, base, ex): if self._occur_index and ex.sent_index is not None: if ex.discourse_id: base += "_%s" % ex.discourse_id base += "_s%s" % ex.sent_index base += "_w%s" % sorted(ex.word_indices)[0] return base class UnparseableInputException(Exception): pass if __name__ == "__main__": opts = OptionParser("usage: %prog TEXT [options]") opts.add_option( "--verbose", "-v", help="display verbose logs", action="store_true", default=False, dest="verbose", ) opts.add_option( "--fol", "-f", help="output FOL", action="store_true", default=False, dest="fol" ) opts.add_option( "--question", "-q", help="input is a question", action="store_true", default=False, dest="question", ) opts.add_option( "--occur", "-o", help="occurrence index", action="store_true", default=False, dest="occur_index", ) (options, args) = opts.parse_args() if len(args) != 1: opts.error("incorrect number of arguments") interpreter = NltkDrtBoxerDrsInterpreter(occur_index=options.occur_index) drs = Boxer(interpreter).interpret_multi( args[0].split(r"\n"), question=options.question, verbose=options.verbose ) if drs is None: print(None) else: drs = drs.simplify().eliminate_equality() if options.fol: print(drs.fol().normalize()) else: drs.pretty_print() nltk-3.7/nltk/sem/chat80.py000066400000000000000000000620761420073152400155410ustar00rootroot00000000000000# Natural Language Toolkit: Chat-80 KB Reader # See https://www.w3.org/TR/swbp-skos-core-guide/ # # Copyright (C) 2001-2022 NLTK Project # Author: Ewan Klein , # URL: # For license information, see LICENSE.TXT r""" Overview ======== Chat-80 was a natural language system which allowed the user to interrogate a Prolog knowledge base in the domain of world geography. It was developed in the early '80s by Warren and Pereira; see ``https://www.aclweb.org/anthology/J82-3002.pdf`` for a description and ``http://www.cis.upenn.edu/~pereira/oldies.html`` for the source files. This module contains functions to extract data from the Chat-80 relation files ('the world database'), and convert then into a format that can be incorporated in the FOL models of ``nltk.sem.evaluate``. The code assumes that the Prolog input files are available in the NLTK corpora directory. The Chat-80 World Database consists of the following files:: world0.pl rivers.pl cities.pl countries.pl contain.pl borders.pl This module uses a slightly modified version of ``world0.pl``, in which a set of Prolog rules have been omitted. The modified file is named ``world1.pl``. Currently, the file ``rivers.pl`` is not read in, since it uses a list rather than a string in the second field. Reading Chat-80 Files ===================== Chat-80 relations are like tables in a relational database. The relation acts as the name of the table; the first argument acts as the 'primary key'; and subsequent arguments are further fields in the table. 
In general, the name of the table provides a label for a unary predicate whose extension is all the primary keys. For example, relations in ``cities.pl`` are of the following form:: 'city(athens,greece,1368).' Here, ``'athens'`` is the key, and will be mapped to a member of the unary predicate *city*. The fields in the table are mapped to binary predicates. The first argument of the predicate is the primary key, while the second argument is the data in the relevant field. Thus, in the above example, the third field is mapped to the binary predicate *population_of*, whose extension is a set of pairs such as ``'(athens, 1368)'``. An exception to this general framework is required by the relations in the files ``borders.pl`` and ``contains.pl``. These contain facts of the following form:: 'borders(albania,greece).' 'contains0(africa,central_africa).' We do not want to form a unary concept out the element in the first field of these records, and we want the label of the binary relation just to be ``'border'``/``'contain'`` respectively. In order to drive the extraction process, we use 'relation metadata bundles' which are Python dictionaries such as the following:: city = {'label': 'city', 'closures': [], 'schema': ['city', 'country', 'population'], 'filename': 'cities.pl'} According to this, the file ``city['filename']`` contains a list of relational tuples (or more accurately, the corresponding strings in Prolog form) whose predicate symbol is ``city['label']`` and whose relational schema is ``city['schema']``. The notion of a ``closure`` is discussed in the next section. Concepts ======== In order to encapsulate the results of the extraction, a class of ``Concept`` objects is introduced. A ``Concept`` object has a number of attributes, in particular a ``prefLabel`` and ``extension``, which make it easier to inspect the output of the extraction. In addition, the ``extension`` can be further processed: in the case of the ``'border'`` relation, we check that the relation is symmetric, and in the case of the ``'contain'`` relation, we carry out the transitive closure. The closure properties associated with a concept is indicated in the relation metadata, as indicated earlier. The ``extension`` of a ``Concept`` object is then incorporated into a ``Valuation`` object. Persistence =========== The functions ``val_dump`` and ``val_load`` are provided to allow a valuation to be stored in a persistent database and re-loaded, rather than having to be re-computed each time. Individuals and Lexical Items ============================= As well as deriving relations from the Chat-80 data, we also create a set of individual constants, one for each entity in the domain. The individual constants are string-identical to the entities. For example, given a data item such as ``'zloty'``, we add to the valuation a pair ``('zloty', 'zloty')``. In order to parse English sentences that refer to these entities, we also create a lexical item such as the following for each individual constant:: PropN[num=sg, sem=<\P.(P zloty)>] -> 'Zloty' The set of rules is written to the file ``chat_pnames.cfg`` in the current directory. 
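Example
=======

A minimal usage sketch of the reader-style interface defined below
(illustrative only; it assumes the Chat-80 corpus files are available
under ``corpora/chat80`` in your NLTK data path)::

    from nltk.sem import chat80
    for c in chat80.concepts('city'):
        print(c.prefLabel, c.arity)
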
""" import os import re import shelve import sys import nltk.data ########################################################################### # Chat-80 relation metadata bundles needed to build the valuation ########################################################################### borders = { "rel_name": "borders", "closures": ["symmetric"], "schema": ["region", "border"], "filename": "borders.pl", } contains = { "rel_name": "contains0", "closures": ["transitive"], "schema": ["region", "contain"], "filename": "contain.pl", } city = { "rel_name": "city", "closures": [], "schema": ["city", "country", "population"], "filename": "cities.pl", } country = { "rel_name": "country", "closures": [], "schema": [ "country", "region", "latitude", "longitude", "area", "population", "capital", "currency", ], "filename": "countries.pl", } circle_of_lat = { "rel_name": "circle_of_latitude", "closures": [], "schema": ["circle_of_latitude", "degrees"], "filename": "world1.pl", } circle_of_long = { "rel_name": "circle_of_longitude", "closures": [], "schema": ["circle_of_longitude", "degrees"], "filename": "world1.pl", } continent = { "rel_name": "continent", "closures": [], "schema": ["continent"], "filename": "world1.pl", } region = { "rel_name": "in_continent", "closures": [], "schema": ["region", "continent"], "filename": "world1.pl", } ocean = { "rel_name": "ocean", "closures": [], "schema": ["ocean"], "filename": "world1.pl", } sea = {"rel_name": "sea", "closures": [], "schema": ["sea"], "filename": "world1.pl"} items = [ "borders", "contains", "city", "country", "circle_of_lat", "circle_of_long", "continent", "region", "ocean", "sea", ] items = tuple(sorted(items)) item_metadata = { "borders": borders, "contains": contains, "city": city, "country": country, "circle_of_lat": circle_of_lat, "circle_of_long": circle_of_long, "continent": continent, "region": region, "ocean": ocean, "sea": sea, } rels = item_metadata.values() not_unary = ["borders.pl", "contain.pl"] ########################################################################### class Concept: """ A Concept class, loosely based on SKOS (https://www.w3.org/TR/swbp-skos-core-guide/). """ def __init__(self, prefLabel, arity, altLabels=[], closures=[], extension=set()): """ :param prefLabel: the preferred label for the concept :type prefLabel: str :param arity: the arity of the concept :type arity: int :param altLabels: other (related) labels :type altLabels: list :param closures: closure properties of the extension (list items can be ``symmetric``, ``reflexive``, ``transitive``) :type closures: list :param extension: the extensional value of the concept :type extension: set """ self.prefLabel = prefLabel self.arity = arity self.altLabels = altLabels self.closures = closures # keep _extension internally as a set self._extension = extension # public access is via a list (for slicing) self.extension = sorted(list(extension)) def __str__(self): # _extension = '' # for element in sorted(self.extension): # if isinstance(element, tuple): # element = '(%s, %s)' % (element) # _extension += element + ', ' # _extension = _extension[:-1] return "Label = '{}'\nArity = {}\nExtension = {}".format( self.prefLabel, self.arity, self.extension, ) def __repr__(self): return "Concept('%s')" % self.prefLabel def augment(self, data): """ Add more data to the ``Concept``'s extension set. 
:param data: a new semantic value :type data: string or pair of strings :rtype: set """ self._extension.add(data) self.extension = sorted(list(self._extension)) return self._extension def _make_graph(self, s): """ Convert a set of pairs into an adjacency linked list encoding of a graph. """ g = {} for (x, y) in s: if x in g: g[x].append(y) else: g[x] = [y] return g def _transclose(self, g): """ Compute the transitive closure of a graph represented as a linked list. """ for x in g: for adjacent in g[x]: # check that adjacent is a key if adjacent in g: for y in g[adjacent]: if y not in g[x]: g[x].append(y) return g def _make_pairs(self, g): """ Convert an adjacency linked list back into a set of pairs. """ pairs = [] for node in g: for adjacent in g[node]: pairs.append((node, adjacent)) return set(pairs) def close(self): """ Close a binary relation in the ``Concept``'s extension set. :return: a new extension for the ``Concept`` in which the relation is closed under a given property """ from nltk.sem import is_rel assert is_rel(self._extension) if "symmetric" in self.closures: pairs = [] for (x, y) in self._extension: pairs.append((y, x)) sym = set(pairs) self._extension = self._extension.union(sym) if "transitive" in self.closures: all = self._make_graph(self._extension) closed = self._transclose(all) trans = self._make_pairs(closed) self._extension = self._extension.union(trans) self.extension = sorted(list(self._extension)) def clause2concepts(filename, rel_name, schema, closures=[]): """ Convert a file of Prolog clauses into a list of ``Concept`` objects. :param filename: filename containing the relations :type filename: str :param rel_name: name of the relation :type rel_name: str :param schema: the schema used in a set of relational tuples :type schema: list :param closures: closure properties for the extension of the concept :type closures: list :return: a list of ``Concept`` objects :rtype: list """ concepts = [] # position of the subject of a binary relation subj = 0 # label of the 'primary key' pkey = schema[0] # fields other than the primary key fields = schema[1:] # convert a file into a list of lists records = _str2records(filename, rel_name) # add a unary concept corresponding to the set of entities # in the primary key position # relations in 'not_unary' are more like ordinary binary relations if not filename in not_unary: concepts.append(unary_concept(pkey, subj, records)) # add a binary concept for each non-key field for field in fields: obj = schema.index(field) concepts.append(binary_concept(field, closures, subj, obj, records)) return concepts def cities2table(filename, rel_name, dbname, verbose=False, setup=False): """ Convert a file of Prolog clauses into a database table. This is not generic, since it doesn't allow arbitrary schemas to be set as a parameter. 
Intended usage:: cities2table('cities.pl', 'city', 'city.db', verbose=True, setup=True) :param filename: filename containing the relations :type filename: str :param rel_name: name of the relation :type rel_name: str :param dbname: filename of persistent store :type schema: str """ import sqlite3 records = _str2records(filename, rel_name) connection = sqlite3.connect(dbname) cur = connection.cursor() if setup: cur.execute( """CREATE TABLE city_table (City text, Country text, Population int)""" ) table_name = "city_table" for t in records: cur.execute("insert into %s values (?,?,?)" % table_name, t) if verbose: print("inserting values into %s: " % table_name, t) connection.commit() if verbose: print("Committing update to %s" % dbname) cur.close() def sql_query(dbname, query): """ Execute an SQL query over a database. :param dbname: filename of persistent store :type schema: str :param query: SQL query :type rel_name: str """ import sqlite3 try: path = nltk.data.find(dbname) connection = sqlite3.connect(str(path)) cur = connection.cursor() return cur.execute(query) except (ValueError, sqlite3.OperationalError): import warnings warnings.warn( "Make sure the database file %s is installed and uncompressed." % dbname ) raise def _str2records(filename, rel): """ Read a file into memory and convert each relation clause into a list. """ recs = [] contents = nltk.data.load("corpora/chat80/%s" % filename, format="text") for line in contents.splitlines(): if line.startswith(rel): line = re.sub(rel + r"\(", "", line) line = re.sub(r"\)\.$", "", line) record = line.split(",") recs.append(record) return recs def unary_concept(label, subj, records): """ Make a unary concept out of the primary key in a record. A record is a list of entities in some relation, such as ``['france', 'paris']``, where ``'france'`` is acting as the primary key. :param label: the preferred label for the concept :type label: string :param subj: position in the record of the subject of the predicate :type subj: int :param records: a list of records :type records: list of lists :return: ``Concept`` of arity 1 :rtype: Concept """ c = Concept(label, arity=1, extension=set()) for record in records: c.augment(record[subj]) return c def binary_concept(label, closures, subj, obj, records): """ Make a binary concept out of the primary key and another field in a record. A record is a list of entities in some relation, such as ``['france', 'paris']``, where ``'france'`` is acting as the primary key, and ``'paris'`` stands in the ``'capital_of'`` relation to ``'france'``. More generally, given a record such as ``['a', 'b', 'c']``, where label is bound to ``'B'``, and ``obj`` bound to 1, the derived binary concept will have label ``'B_of'``, and its extension will be a set of pairs such as ``('a', 'b')``. 
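    For example (a sketch; it assumes ``records`` was read with
    ``_str2records('countries.pl', 'country')``, whose schema places the
    capital in field 6)::

        capital = binary_concept('capital', [], 0, 6, records)
        # capital.prefLabel == 'capital_of'
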
:param label: the base part of the preferred label for the concept :type label: str :param closures: closure properties for the extension of the concept :type closures: list :param subj: position in the record of the subject of the predicate :type subj: int :param obj: position in the record of the object of the predicate :type obj: int :param records: a list of records :type records: list of lists :return: ``Concept`` of arity 2 :rtype: Concept """ if not label == "border" and not label == "contain": label = label + "_of" c = Concept(label, arity=2, closures=closures, extension=set()) for record in records: c.augment((record[subj], record[obj])) # close the concept's extension according to the properties in closures c.close() return c def process_bundle(rels): """ Given a list of relation metadata bundles, make a corresponding dictionary of concepts, indexed by the relation name. :param rels: bundle of metadata needed for constructing a concept :type rels: list(dict) :return: a dictionary of concepts, indexed by the relation name. :rtype: dict(str): Concept """ concepts = {} for rel in rels: rel_name = rel["rel_name"] closures = rel["closures"] schema = rel["schema"] filename = rel["filename"] concept_list = clause2concepts(filename, rel_name, schema, closures) for c in concept_list: label = c.prefLabel if label in concepts: for data in c.extension: concepts[label].augment(data) concepts[label].close() else: concepts[label] = c return concepts def make_valuation(concepts, read=False, lexicon=False): """ Convert a list of ``Concept`` objects into a list of (label, extension) pairs; optionally create a ``Valuation`` object. :param concepts: concepts :type concepts: list(Concept) :param read: if ``True``, ``(symbol, set)`` pairs are read into a ``Valuation`` :type read: bool :rtype: list or Valuation """ vals = [] for c in concepts: vals.append((c.prefLabel, c.extension)) if lexicon: read = True if read: from nltk.sem import Valuation val = Valuation({}) val.update(vals) # add labels for individuals val = label_indivs(val, lexicon=lexicon) return val else: return vals def val_dump(rels, db): """ Make a ``Valuation`` from a list of relation metadata bundles and dump to persistent database. :param rels: bundle of metadata needed for constructing a concept :type rels: list of dict :param db: name of file to which data is written. The suffix '.db' will be automatically appended. :type db: str """ concepts = process_bundle(rels).values() valuation = make_valuation(concepts, read=True) db_out = shelve.open(db, "n") db_out.update(valuation) db_out.close() def val_load(db): """ Load a ``Valuation`` from a persistent database. :param db: name of file from which data is read. The suffix '.db' should be omitted from the name. :type db: str """ dbname = db + ".db" if not os.access(dbname, os.R_OK): sys.exit("Cannot read file: %s" % dbname) else: db_in = shelve.open(db) from nltk.sem import Valuation val = Valuation(db_in) # val.read(db_in.items()) return val # def alpha(str): # """ # Utility to filter out non-alphabetic constants. #:param str: candidate constant #:type str: string #:rtype: bool # """ # try: # int(str) # return False # except ValueError: ## some unknown values in records are labeled '?' # if not str == '?': # return True def label_indivs(valuation, lexicon=False): """ Assign individual constants to the individuals in the domain of a ``Valuation``. Given a valuation with an entry of the form ``{'rel': {'a': True}}``, add a new entry ``{'a': 'a'}``. 
:type valuation: Valuation :rtype: Valuation """ # collect all the individuals into a domain domain = valuation.domain # convert the domain into a sorted list of alphabetic terms # use the same string as a label pairs = [(e, e) for e in domain] if lexicon: lex = make_lex(domain) with open("chat_pnames.cfg", "w") as outfile: outfile.writelines(lex) # read the pairs into the valuation valuation.update(pairs) return valuation def make_lex(symbols): """ Create lexical CFG rules for each individual symbol. Given a valuation with an entry of the form ``{'zloty': 'zloty'}``, create a lexical rule for the proper name 'Zloty'. :param symbols: a list of individual constants in the semantic representation :type symbols: sequence -- set(str) :rtype: list(str) """ lex = [] header = """ ################################################################## # Lexical rules automatically generated by running 'chat80.py -x'. ################################################################## """ lex.append(header) template = r"PropN[num=sg, sem=<\P.(P %s)>] -> '%s'\n" for s in symbols: parts = s.split("_") caps = [p.capitalize() for p in parts] pname = "_".join(caps) rule = template % (s, pname) lex.append(rule) return lex ########################################################################### # Interface function to emulate other corpus readers ########################################################################### def concepts(items=items): """ Build a list of concepts corresponding to the relation names in ``items``. :param items: names of the Chat-80 relations to extract :type items: list(str) :return: the ``Concept`` objects which are extracted from the relations :rtype: list(Concept) """ if isinstance(items, str): items = (items,) rels = [item_metadata[r] for r in items] concept_map = process_bundle(rels) return concept_map.values() ########################################################################### def main(): import sys from optparse import OptionParser description = """ Extract data from the Chat-80 Prolog files and convert them into a Valuation object for use in the NLTK semantics package. 
""" opts = OptionParser(description=description) opts.set_defaults(verbose=True, lex=False, vocab=False) opts.add_option( "-s", "--store", dest="outdb", help="store a valuation in DB", metavar="DB" ) opts.add_option( "-l", "--load", dest="indb", help="load a stored valuation from DB", metavar="DB", ) opts.add_option( "-c", "--concepts", action="store_true", help="print concepts instead of a valuation", ) opts.add_option( "-r", "--relation", dest="label", help="print concept with label REL (check possible labels with '-v' option)", metavar="REL", ) opts.add_option( "-q", "--quiet", action="store_false", dest="verbose", help="don't print out progress info", ) opts.add_option( "-x", "--lex", action="store_true", dest="lex", help="write a file of lexical entries for country names, then exit", ) opts.add_option( "-v", "--vocab", action="store_true", dest="vocab", help="print out the vocabulary of concept labels and their arity, then exit", ) (options, args) = opts.parse_args() if options.outdb and options.indb: opts.error("Options --store and --load are mutually exclusive") if options.outdb: # write the valuation to a persistent database if options.verbose: outdb = options.outdb + ".db" print("Dumping a valuation to %s" % outdb) val_dump(rels, options.outdb) sys.exit(0) else: # try to read in a valuation from a database if options.indb is not None: dbname = options.indb + ".db" if not os.access(dbname, os.R_OK): sys.exit("Cannot read file: %s" % dbname) else: valuation = val_load(options.indb) # we need to create the valuation from scratch else: # build some concepts concept_map = process_bundle(rels) concepts = concept_map.values() # just print out the vocabulary if options.vocab: items = sorted((c.arity, c.prefLabel) for c in concepts) for (arity, label) in items: print(label, arity) sys.exit(0) # show all the concepts if options.concepts: for c in concepts: print(c) print() if options.label: print(concept_map[options.label]) sys.exit(0) else: # turn the concepts into a Valuation if options.lex: if options.verbose: print("Writing out lexical rules") make_valuation(concepts, lexicon=True) else: valuation = make_valuation(concepts, read=True) print(valuation) def sql_demo(): """ Print out every row from the 'city.db' database. """ print() print("Using SQL to extract rows from 'city.db' RDB.") for row in sql_query("corpora/city_database/city.db", "SELECT * FROM city_table"): print(row) if __name__ == "__main__": main() sql_demo() nltk-3.7/nltk/sem/cooper_storage.py000066400000000000000000000077661420073152400174720ustar00rootroot00000000000000# Natural Language Toolkit: Cooper storage for Quantifier Ambiguity # # Copyright (C) 2001-2022 NLTK Project # Author: Ewan Klein # URL: # For license information, see LICENSE.TXT from nltk.parse import load_parser from nltk.parse.featurechart import InstantiateVarsChart from nltk.sem.logic import ApplicationExpression, LambdaExpression, Variable class CooperStore: """ A container for handling quantifier ambiguity via Cooper storage. 
""" def __init__(self, featstruct): """ :param featstruct: The value of the ``sem`` node in a tree from ``parse_with_bindops()`` :type featstruct: FeatStruct (with features ``core`` and ``store``) """ self.featstruct = featstruct self.readings = [] try: self.core = featstruct["CORE"] self.store = featstruct["STORE"] except KeyError: print("%s is not a Cooper storage structure" % featstruct) def _permute(self, lst): """ :return: An iterator over the permutations of the input list :type lst: list :rtype: iter """ remove = lambda lst0, index: lst0[:index] + lst0[index + 1 :] if lst: for index, x in enumerate(lst): for y in self._permute(remove(lst, index)): yield (x,) + y else: yield () def s_retrieve(self, trace=False): r""" Carry out S-Retrieval of binding operators in store. If hack=True, serialize the bindop and core as strings and reparse. Ugh. Each permutation of the store (i.e. list of binding operators) is taken to be a possible scoping of quantifiers. We iterate through the binding operators in each permutation, and successively apply them to the current term, starting with the core semantic representation, working from the inside out. Binding operators are of the form:: bo(\P.all x.(man(x) -> P(x)),z1) """ for perm, store_perm in enumerate(self._permute(self.store)): if trace: print("Permutation %s" % (perm + 1)) term = self.core for bindop in store_perm: # we just want the arguments that are wrapped by the 'bo' predicate quant, varex = tuple(bindop.args) # use var to make an abstraction over the current term and then # apply the quantifier to it term = ApplicationExpression( quant, LambdaExpression(varex.variable, term) ) if trace: print(" ", term) term = term.simplify() self.readings.append(term) def parse_with_bindops(sentence, grammar=None, trace=0): """ Use a grammar with Binding Operators to parse a sentence. """ if not grammar: grammar = "grammars/book_grammars/storage.fcfg" parser = load_parser(grammar, trace=trace, chart_class=InstantiateVarsChart) # Parse the sentence. 
tokens = sentence.split() return list(parser.parse(tokens)) def demo(): from nltk.sem import cooper_storage as cs sentence = "every girl chases a dog" # sentence = "a man gives a bone to every dog" print() print("Analysis of sentence '%s'" % sentence) print("=" * 50) trees = cs.parse_with_bindops(sentence, trace=0) for tree in trees: semrep = cs.CooperStore(tree.label()["SEM"]) print() print("Binding operators:") print("-" * 15) for s in semrep.store: print(s) print() print("Core:") print("-" * 15) print(semrep.core) print() print("S-Retrieval:") print("-" * 15) semrep.s_retrieve(trace=True) print("Readings:") print("-" * 15) for i, reading in enumerate(semrep.readings): print(f"{i + 1}: {reading}") if __name__ == "__main__": demo() nltk-3.7/nltk/sem/drt.py000066400000000000000000001447571420073152400152520ustar00rootroot00000000000000# Natural Language Toolkit: Discourse Representation Theory (DRT) # # Author: Dan Garrette # # Copyright (C) 2001-2022 NLTK Project # URL: # For license information, see LICENSE.TXT import operator from functools import reduce from itertools import chain from nltk.sem.logic import ( APP, AbstractVariableExpression, AllExpression, AndExpression, ApplicationExpression, BinaryExpression, BooleanExpression, ConstantExpression, EqualityExpression, EventVariableExpression, ExistsExpression, Expression, FunctionVariableExpression, ImpExpression, IndividualVariableExpression, LambdaExpression, LogicParser, NegatedExpression, OrExpression, Tokens, Variable, is_eventvar, is_funcvar, is_indvar, unique_variable, ) # Import Tkinter-based modules if they are available try: from tkinter import Canvas, Tk from tkinter.font import Font from nltk.util import in_idle except ImportError: # No need to print a warning here, nltk.draw has already printed one. 
pass class DrtTokens(Tokens): DRS = "DRS" DRS_CONC = "+" PRONOUN = "PRO" OPEN_BRACKET = "[" CLOSE_BRACKET = "]" COLON = ":" PUNCT = [DRS_CONC, OPEN_BRACKET, CLOSE_BRACKET, COLON] SYMBOLS = Tokens.SYMBOLS + PUNCT TOKENS = Tokens.TOKENS + [DRS] + PUNCT class DrtParser(LogicParser): """A lambda calculus expression parser.""" def __init__(self): LogicParser.__init__(self) self.operator_precedence = dict( [(x, 1) for x in DrtTokens.LAMBDA_LIST] + [(x, 2) for x in DrtTokens.NOT_LIST] + [(APP, 3)] + [(x, 4) for x in DrtTokens.EQ_LIST + Tokens.NEQ_LIST] + [(DrtTokens.COLON, 5)] + [(DrtTokens.DRS_CONC, 6)] + [(x, 7) for x in DrtTokens.OR_LIST] + [(x, 8) for x in DrtTokens.IMP_LIST] + [(None, 9)] ) def get_all_symbols(self): """This method exists to be overridden""" return DrtTokens.SYMBOLS def isvariable(self, tok): return tok not in DrtTokens.TOKENS def handle(self, tok, context): """This method is intended to be overridden for logics that use different operators or expressions""" if tok in DrtTokens.NOT_LIST: return self.handle_negation(tok, context) elif tok in DrtTokens.LAMBDA_LIST: return self.handle_lambda(tok, context) elif tok == DrtTokens.OPEN: if self.inRange(0) and self.token(0) == DrtTokens.OPEN_BRACKET: return self.handle_DRS(tok, context) else: return self.handle_open(tok, context) elif tok.upper() == DrtTokens.DRS: self.assertNextToken(DrtTokens.OPEN) return self.handle_DRS(tok, context) elif self.isvariable(tok): if self.inRange(0) and self.token(0) == DrtTokens.COLON: return self.handle_prop(tok, context) else: return self.handle_variable(tok, context) def make_NegatedExpression(self, expression): return DrtNegatedExpression(expression) def handle_DRS(self, tok, context): # a DRS refs = self.handle_refs() if ( self.inRange(0) and self.token(0) == DrtTokens.COMMA ): # if there is a comma (it's optional) self.token() # swallow the comma conds = self.handle_conds(context) self.assertNextToken(DrtTokens.CLOSE) return DRS(refs, conds, None) def handle_refs(self): self.assertNextToken(DrtTokens.OPEN_BRACKET) refs = [] while self.inRange(0) and self.token(0) != DrtTokens.CLOSE_BRACKET: # Support expressions like: DRS([x y],C) == DRS([x,y],C) if refs and self.token(0) == DrtTokens.COMMA: self.token() # swallow the comma refs.append(self.get_next_token_variable("quantified")) self.assertNextToken(DrtTokens.CLOSE_BRACKET) return refs def handle_conds(self, context): self.assertNextToken(DrtTokens.OPEN_BRACKET) conds = [] while self.inRange(0) and self.token(0) != DrtTokens.CLOSE_BRACKET: # Support expressions like: DRS([x y],C) == DRS([x, y],C) if conds and self.token(0) == DrtTokens.COMMA: self.token() # swallow the comma conds.append(self.process_next_expression(context)) self.assertNextToken(DrtTokens.CLOSE_BRACKET) return conds def handle_prop(self, tok, context): variable = self.make_VariableExpression(tok) self.assertNextToken(":") drs = self.process_next_expression(DrtTokens.COLON) return DrtProposition(variable, drs) def make_EqualityExpression(self, first, second): """This method serves as a hook for other logic parsers that have different equality expression classes""" return DrtEqualityExpression(first, second) def get_BooleanExpression_factory(self, tok): """This method serves as a hook for other logic parsers that have different boolean operators""" if tok == DrtTokens.DRS_CONC: return lambda first, second: DrtConcatenation(first, second, None) elif tok in DrtTokens.OR_LIST: return DrtOrExpression elif tok in DrtTokens.IMP_LIST: def make_imp_expression(first, second): if 
isinstance(first, DRS): return DRS(first.refs, first.conds, second) if isinstance(first, DrtConcatenation): return DrtConcatenation(first.first, first.second, second) raise Exception("Antecedent of implication must be a DRS") return make_imp_expression else: return None def make_BooleanExpression(self, factory, first, second): return factory(first, second) def make_ApplicationExpression(self, function, argument): return DrtApplicationExpression(function, argument) def make_VariableExpression(self, name): return DrtVariableExpression(Variable(name)) def make_LambdaExpression(self, variables, term): return DrtLambdaExpression(variables, term) class DrtExpression: """ This is the base abstract DRT Expression from which every DRT Expression extends. """ _drt_parser = DrtParser() @classmethod def fromstring(cls, s): return cls._drt_parser.parse(s) def applyto(self, other): return DrtApplicationExpression(self, other) def __neg__(self): return DrtNegatedExpression(self) def __and__(self, other): raise NotImplementedError() def __or__(self, other): assert isinstance(other, DrtExpression) return DrtOrExpression(self, other) def __gt__(self, other): assert isinstance(other, DrtExpression) if isinstance(self, DRS): return DRS(self.refs, self.conds, other) if isinstance(self, DrtConcatenation): return DrtConcatenation(self.first, self.second, other) raise Exception("Antecedent of implication must be a DRS") def equiv(self, other, prover=None): """ Check for logical equivalence. Pass the expression (self <-> other) to the theorem prover. If the prover says it is valid, then the self and other are equal. :param other: an ``DrtExpression`` to check equality against :param prover: a ``nltk.inference.api.Prover`` """ assert isinstance(other, DrtExpression) f1 = self.simplify().fol() f2 = other.simplify().fol() return f1.equiv(f2, prover) @property def type(self): raise AttributeError( "'%s' object has no attribute 'type'" % self.__class__.__name__ ) def typecheck(self, signature=None): raise NotImplementedError() def __add__(self, other): return DrtConcatenation(self, other, None) def get_refs(self, recursive=False): """ Return the set of discourse referents in this DRS. :param recursive: bool Also find discourse referents in subterms? 
:return: list of ``Variable`` objects """ raise NotImplementedError() def is_pronoun_function(self): """Is self of the form "PRO(x)"?""" return ( isinstance(self, DrtApplicationExpression) and isinstance(self.function, DrtAbstractVariableExpression) and self.function.variable.name == DrtTokens.PRONOUN and isinstance(self.argument, DrtIndividualVariableExpression) ) def make_EqualityExpression(self, first, second): return DrtEqualityExpression(first, second) def make_VariableExpression(self, variable): return DrtVariableExpression(variable) def resolve_anaphora(self): return resolve_anaphora(self) def eliminate_equality(self): return self.visit_structured(lambda e: e.eliminate_equality(), self.__class__) def pretty_format(self): """ Draw the DRS :return: the pretty print string """ return "\n".join(self._pretty()) def pretty_print(self): print(self.pretty_format()) def draw(self): DrsDrawer(self).draw() class DRS(DrtExpression, Expression): """A Discourse Representation Structure.""" def __init__(self, refs, conds, consequent=None): """ :param refs: list of ``DrtIndividualVariableExpression`` for the discourse referents :param conds: list of ``Expression`` for the conditions """ self.refs = refs self.conds = conds self.consequent = consequent def replace(self, variable, expression, replace_bound=False, alpha_convert=True): """Replace all instances of variable v with expression E in self, where v is free in self.""" if variable in self.refs: # if a bound variable is the thing being replaced if not replace_bound: return self else: i = self.refs.index(variable) if self.consequent: consequent = self.consequent.replace( variable, expression, True, alpha_convert ) else: consequent = None return DRS( self.refs[:i] + [expression.variable] + self.refs[i + 1 :], [ cond.replace(variable, expression, True, alpha_convert) for cond in self.conds ], consequent, ) else: if alpha_convert: # any bound variable that appears in the expression must # be alpha converted to avoid a conflict for ref in set(self.refs) & expression.free(): newvar = unique_variable(ref) newvarex = DrtVariableExpression(newvar) i = self.refs.index(ref) if self.consequent: consequent = self.consequent.replace( ref, newvarex, True, alpha_convert ) else: consequent = None self = DRS( self.refs[:i] + [newvar] + self.refs[i + 1 :], [ cond.replace(ref, newvarex, True, alpha_convert) for cond in self.conds ], consequent, ) # replace in the conditions if self.consequent: consequent = self.consequent.replace( variable, expression, replace_bound, alpha_convert ) else: consequent = None return DRS( self.refs, [ cond.replace(variable, expression, replace_bound, alpha_convert) for cond in self.conds ], consequent, ) def free(self): """:see: Expression.free()""" conds_free = reduce(operator.or_, [c.free() for c in self.conds], set()) if self.consequent: conds_free.update(self.consequent.free()) return conds_free - set(self.refs) def get_refs(self, recursive=False): """:see: AbstractExpression.get_refs()""" if recursive: conds_refs = self.refs + list( chain.from_iterable(c.get_refs(True) for c in self.conds) ) if self.consequent: conds_refs.extend(self.consequent.get_refs(True)) return conds_refs else: return self.refs def visit(self, function, combinator): """:see: Expression.visit()""" parts = list(map(function, self.conds)) if self.consequent: parts.append(function(self.consequent)) return combinator(parts) def visit_structured(self, function, combinator): """:see: Expression.visit_structured()""" consequent = function(self.consequent) if 
self.consequent else None return combinator(self.refs, list(map(function, self.conds)), consequent) def eliminate_equality(self): drs = self i = 0 while i < len(drs.conds): cond = drs.conds[i] if ( isinstance(cond, EqualityExpression) and isinstance(cond.first, AbstractVariableExpression) and isinstance(cond.second, AbstractVariableExpression) ): drs = DRS( list(set(drs.refs) - {cond.second.variable}), drs.conds[:i] + drs.conds[i + 1 :], drs.consequent, ) if cond.second.variable != cond.first.variable: drs = drs.replace(cond.second.variable, cond.first, False, False) i = 0 i -= 1 i += 1 conds = [] for cond in drs.conds: new_cond = cond.eliminate_equality() new_cond_simp = new_cond.simplify() if ( not isinstance(new_cond_simp, DRS) or new_cond_simp.refs or new_cond_simp.conds or new_cond_simp.consequent ): conds.append(new_cond) consequent = drs.consequent.eliminate_equality() if drs.consequent else None return DRS(drs.refs, conds, consequent) def fol(self): if self.consequent: accum = None if self.conds: accum = reduce(AndExpression, [c.fol() for c in self.conds]) if accum: accum = ImpExpression(accum, self.consequent.fol()) else: accum = self.consequent.fol() for ref in self.refs[::-1]: accum = AllExpression(ref, accum) return accum else: if not self.conds: raise Exception("Cannot convert DRS with no conditions to FOL.") accum = reduce(AndExpression, [c.fol() for c in self.conds]) for ref in map(Variable, self._order_ref_strings(self.refs)[::-1]): accum = ExistsExpression(ref, accum) return accum def _pretty(self): refs_line = " ".join(self._order_ref_strings(self.refs)) cond_lines = [ cond for cond_line in [ filter(lambda s: s.strip(), cond._pretty()) for cond in self.conds ] for cond in cond_line ] length = max([len(refs_line)] + list(map(len, cond_lines))) drs = ( [ " _" + "_" * length + "_ ", "| " + refs_line.ljust(length) + " |", "|-" + "-" * length + "-|", ] + ["| " + line.ljust(length) + " |" for line in cond_lines] + ["|_" + "_" * length + "_|"] ) if self.consequent: return DrtBinaryExpression._assemble_pretty( drs, DrtTokens.IMP, self.consequent._pretty() ) return drs def _order_ref_strings(self, refs): strings = ["%s" % ref for ref in refs] ind_vars = [] func_vars = [] event_vars = [] other_vars = [] for s in strings: if is_indvar(s): ind_vars.append(s) elif is_funcvar(s): func_vars.append(s) elif is_eventvar(s): event_vars.append(s) else: other_vars.append(s) return ( sorted(other_vars) + sorted(event_vars, key=lambda v: int([v[2:], -1][len(v[2:]) == 0])) + sorted(func_vars, key=lambda v: (v[0], int([v[1:], -1][len(v[1:]) == 0]))) + sorted(ind_vars, key=lambda v: (v[0], int([v[1:], -1][len(v[1:]) == 0]))) ) def __eq__(self, other): r"""Defines equality modulo alphabetic variance. 
If we are comparing \x.M and \y.N, then check equality of M and N[x/y].""" if isinstance(other, DRS): if len(self.refs) == len(other.refs): converted_other = other for (r1, r2) in zip(self.refs, converted_other.refs): varex = self.make_VariableExpression(r1) converted_other = converted_other.replace(r2, varex, True) if self.consequent == converted_other.consequent and len( self.conds ) == len(converted_other.conds): for c1, c2 in zip(self.conds, converted_other.conds): if not (c1 == c2): return False return True return False def __ne__(self, other): return not self == other __hash__ = Expression.__hash__ def __str__(self): drs = "([{}],[{}])".format( ",".join(self._order_ref_strings(self.refs)), ", ".join("%s" % cond for cond in self.conds), ) # map(str, self.conds))) if self.consequent: return ( DrtTokens.OPEN + drs + " " + DrtTokens.IMP + " " + "%s" % self.consequent + DrtTokens.CLOSE ) return drs def DrtVariableExpression(variable): """ This is a factory method that instantiates and returns a subtype of ``DrtAbstractVariableExpression`` appropriate for the given variable. """ if is_indvar(variable.name): return DrtIndividualVariableExpression(variable) elif is_funcvar(variable.name): return DrtFunctionVariableExpression(variable) elif is_eventvar(variable.name): return DrtEventVariableExpression(variable) else: return DrtConstantExpression(variable) class DrtAbstractVariableExpression(DrtExpression, AbstractVariableExpression): def fol(self): return self def get_refs(self, recursive=False): """:see: AbstractExpression.get_refs()""" return [] def _pretty(self): s = "%s" % self blank = " " * len(s) return [blank, blank, s, blank] def eliminate_equality(self): return self class DrtIndividualVariableExpression( DrtAbstractVariableExpression, IndividualVariableExpression ): pass class DrtFunctionVariableExpression( DrtAbstractVariableExpression, FunctionVariableExpression ): pass class DrtEventVariableExpression( DrtIndividualVariableExpression, EventVariableExpression ): pass class DrtConstantExpression(DrtAbstractVariableExpression, ConstantExpression): pass class DrtProposition(DrtExpression, Expression): def __init__(self, variable, drs): self.variable = variable self.drs = drs def replace(self, variable, expression, replace_bound=False, alpha_convert=True): if self.variable == variable: assert isinstance( expression, DrtAbstractVariableExpression ), "Can only replace a proposition label with a variable" return DrtProposition( expression.variable, self.drs.replace(variable, expression, replace_bound, alpha_convert), ) else: return DrtProposition( self.variable, self.drs.replace(variable, expression, replace_bound, alpha_convert), ) def eliminate_equality(self): return DrtProposition(self.variable, self.drs.eliminate_equality()) def get_refs(self, recursive=False): return self.drs.get_refs(True) if recursive else [] def __eq__(self, other): return ( self.__class__ == other.__class__ and self.variable == other.variable and self.drs == other.drs ) def __ne__(self, other): return not self == other __hash__ = Expression.__hash__ def fol(self): return self.drs.fol() def _pretty(self): drs_s = self.drs._pretty() blank = " " * len("%s" % self.variable) return ( [blank + " " + line for line in drs_s[:1]] + ["%s" % self.variable + ":" + line for line in drs_s[1:2]] + [blank + " " + line for line in drs_s[2:]] ) def visit(self, function, combinator): """:see: Expression.visit()""" return combinator([function(self.drs)]) def visit_structured(self, function, combinator): """:see: 
Expression.visit_structured()""" return combinator(self.variable, function(self.drs)) def __str__(self): return f"prop({self.variable}, {self.drs})" class DrtNegatedExpression(DrtExpression, NegatedExpression): def fol(self): return NegatedExpression(self.term.fol()) def get_refs(self, recursive=False): """:see: AbstractExpression.get_refs()""" return self.term.get_refs(recursive) def _pretty(self): term_lines = self.term._pretty() return ( [" " + line for line in term_lines[:2]] + ["__ " + line for line in term_lines[2:3]] + [" | " + line for line in term_lines[3:4]] + [" " + line for line in term_lines[4:]] ) class DrtLambdaExpression(DrtExpression, LambdaExpression): def alpha_convert(self, newvar): """Rename all occurrences of the variable introduced by this variable binder in the expression to ``newvar``. :param newvar: ``Variable``, for the new variable """ return self.__class__( newvar, self.term.replace(self.variable, DrtVariableExpression(newvar), True), ) def fol(self): return LambdaExpression(self.variable, self.term.fol()) def _pretty(self): variables = [self.variable] term = self.term while term.__class__ == self.__class__: variables.append(term.variable) term = term.term var_string = " ".join("%s" % v for v in variables) + DrtTokens.DOT term_lines = term._pretty() blank = " " * len(var_string) return ( [" " + blank + line for line in term_lines[:1]] + [r" \ " + blank + line for line in term_lines[1:2]] + [r" /\ " + var_string + line for line in term_lines[2:3]] + [" " + blank + line for line in term_lines[3:]] ) def get_refs(self, recursive=False): """:see: AbstractExpression.get_refs()""" return ( [self.variable] + self.term.get_refs(True) if recursive else [self.variable] ) class DrtBinaryExpression(DrtExpression, BinaryExpression): def get_refs(self, recursive=False): """:see: AbstractExpression.get_refs()""" return ( self.first.get_refs(True) + self.second.get_refs(True) if recursive else [] ) def _pretty(self): return DrtBinaryExpression._assemble_pretty( self._pretty_subex(self.first), self.getOp(), self._pretty_subex(self.second), ) @staticmethod def _assemble_pretty(first_lines, op, second_lines): max_lines = max(len(first_lines), len(second_lines)) first_lines = _pad_vertically(first_lines, max_lines) second_lines = _pad_vertically(second_lines, max_lines) blank = " " * len(op) first_second_lines = list(zip(first_lines, second_lines)) return ( [ " " + first_line + " " + blank + " " + second_line + " " for first_line, second_line in first_second_lines[:2] ] + [ "(" + first_line + " " + op + " " + second_line + ")" for first_line, second_line in first_second_lines[2:3] ] + [ " " + first_line + " " + blank + " " + second_line + " " for first_line, second_line in first_second_lines[3:] ] ) def _pretty_subex(self, subex): return subex._pretty() class DrtBooleanExpression(DrtBinaryExpression, BooleanExpression): pass class DrtOrExpression(DrtBooleanExpression, OrExpression): def fol(self): return OrExpression(self.first.fol(), self.second.fol()) def _pretty_subex(self, subex): if isinstance(subex, DrtOrExpression): return [line[1:-1] for line in subex._pretty()] return DrtBooleanExpression._pretty_subex(self, subex) class DrtEqualityExpression(DrtBinaryExpression, EqualityExpression): def fol(self): return EqualityExpression(self.first.fol(), self.second.fol()) class DrtConcatenation(DrtBooleanExpression): """DRS of the form '(DRS + DRS)'""" def __init__(self, first, second, consequent=None): DrtBooleanExpression.__init__(self, first, second) self.consequent = consequent 
def replace(self, variable, expression, replace_bound=False, alpha_convert=True): """Replace all instances of variable v with expression E in self, where v is free in self.""" first = self.first second = self.second consequent = self.consequent # If variable is bound if variable in self.get_refs(): if replace_bound: first = first.replace( variable, expression, replace_bound, alpha_convert ) second = second.replace( variable, expression, replace_bound, alpha_convert ) if consequent: consequent = consequent.replace( variable, expression, replace_bound, alpha_convert ) else: if alpha_convert: # alpha convert every ref that is free in 'expression' for ref in set(self.get_refs(True)) & expression.free(): v = DrtVariableExpression(unique_variable(ref)) first = first.replace(ref, v, True, alpha_convert) second = second.replace(ref, v, True, alpha_convert) if consequent: consequent = consequent.replace(ref, v, True, alpha_convert) first = first.replace(variable, expression, replace_bound, alpha_convert) second = second.replace(variable, expression, replace_bound, alpha_convert) if consequent: consequent = consequent.replace( variable, expression, replace_bound, alpha_convert ) return self.__class__(first, second, consequent) def eliminate_equality(self): # TODO: at some point. for now, simplify. drs = self.simplify() assert not isinstance(drs, DrtConcatenation) return drs.eliminate_equality() def simplify(self): first = self.first.simplify() second = self.second.simplify() consequent = self.consequent.simplify() if self.consequent else None if isinstance(first, DRS) and isinstance(second, DRS): # For any ref that is in both 'first' and 'second' for ref in set(first.get_refs(True)) & set(second.get_refs(True)): # alpha convert the ref in 'second' to prevent collision newvar = DrtVariableExpression(unique_variable(ref)) second = second.replace(ref, newvar, True) return DRS(first.refs + second.refs, first.conds + second.conds, consequent) else: return self.__class__(first, second, consequent) def get_refs(self, recursive=False): """:see: AbstractExpression.get_refs()""" refs = self.first.get_refs(recursive) + self.second.get_refs(recursive) if self.consequent and recursive: refs.extend(self.consequent.get_refs(True)) return refs def getOp(self): return DrtTokens.DRS_CONC def __eq__(self, other): r"""Defines equality modulo alphabetic variance. 
If we are comparing \x.M and \y.N, then check equality of M and N[x/y].""" if isinstance(other, DrtConcatenation): self_refs = self.get_refs() other_refs = other.get_refs() if len(self_refs) == len(other_refs): converted_other = other for (r1, r2) in zip(self_refs, other_refs): varex = self.make_VariableExpression(r1) converted_other = converted_other.replace(r2, varex, True) return ( self.first == converted_other.first and self.second == converted_other.second and self.consequent == converted_other.consequent ) return False def __ne__(self, other): return not self == other __hash__ = DrtBooleanExpression.__hash__ def fol(self): e = AndExpression(self.first.fol(), self.second.fol()) if self.consequent: e = ImpExpression(e, self.consequent.fol()) return e def _pretty(self): drs = DrtBinaryExpression._assemble_pretty( self._pretty_subex(self.first), self.getOp(), self._pretty_subex(self.second), ) if self.consequent: drs = DrtBinaryExpression._assemble_pretty( drs, DrtTokens.IMP, self.consequent._pretty() ) return drs def _pretty_subex(self, subex): if isinstance(subex, DrtConcatenation): return [line[1:-1] for line in subex._pretty()] return DrtBooleanExpression._pretty_subex(self, subex) def visit(self, function, combinator): """:see: Expression.visit()""" if self.consequent: return combinator( [function(self.first), function(self.second), function(self.consequent)] ) else: return combinator([function(self.first), function(self.second)]) def __str__(self): first = self._str_subex(self.first) second = self._str_subex(self.second) drs = Tokens.OPEN + first + " " + self.getOp() + " " + second + Tokens.CLOSE if self.consequent: return ( DrtTokens.OPEN + drs + " " + DrtTokens.IMP + " " + "%s" % self.consequent + DrtTokens.CLOSE ) return drs def _str_subex(self, subex): s = "%s" % subex if isinstance(subex, DrtConcatenation) and subex.consequent is None: return s[1:-1] return s class DrtApplicationExpression(DrtExpression, ApplicationExpression): def fol(self): return ApplicationExpression(self.function.fol(), self.argument.fol()) def get_refs(self, recursive=False): """:see: AbstractExpression.get_refs()""" return ( self.function.get_refs(True) + self.argument.get_refs(True) if recursive else [] ) def _pretty(self): function, args = self.uncurry() function_lines = function._pretty() args_lines = [arg._pretty() for arg in args] max_lines = max(map(len, [function_lines] + args_lines)) function_lines = _pad_vertically(function_lines, max_lines) args_lines = [_pad_vertically(arg_lines, max_lines) for arg_lines in args_lines] func_args_lines = list(zip(function_lines, list(zip(*args_lines)))) return ( [ func_line + " " + " ".join(args_line) + " " for func_line, args_line in func_args_lines[:2] ] + [ func_line + "(" + ",".join(args_line) + ")" for func_line, args_line in func_args_lines[2:3] ] + [ func_line + " " + " ".join(args_line) + " " for func_line, args_line in func_args_lines[3:] ] ) def _pad_vertically(lines, max_lines): pad_line = [" " * len(lines[0])] return lines + pad_line * (max_lines - len(lines)) class PossibleAntecedents(list, DrtExpression, Expression): def free(self): """Set of free variables.""" return set(self) def replace(self, variable, expression, replace_bound=False, alpha_convert=True): """Replace all instances of variable v with expression E in self, where v is free in self.""" result = PossibleAntecedents() for item in self: if item == variable: self.append(expression) else: self.append(item) return result def _pretty(self): s = "%s" % self blank = " " * len(s) return 
[blank, blank, s] def __str__(self): return "[" + ",".join("%s" % it for it in self) + "]" class AnaphoraResolutionException(Exception): pass def resolve_anaphora(expression, trail=[]): if isinstance(expression, ApplicationExpression): if expression.is_pronoun_function(): possible_antecedents = PossibleAntecedents() for ancestor in trail: for ref in ancestor.get_refs(): refex = expression.make_VariableExpression(ref) # ========================================================== # Don't allow resolution to itself or other types # ========================================================== if refex.__class__ == expression.argument.__class__ and not ( refex == expression.argument ): possible_antecedents.append(refex) if len(possible_antecedents) == 1: resolution = possible_antecedents[0] else: resolution = possible_antecedents return expression.make_EqualityExpression(expression.argument, resolution) else: r_function = resolve_anaphora(expression.function, trail + [expression]) r_argument = resolve_anaphora(expression.argument, trail + [expression]) return expression.__class__(r_function, r_argument) elif isinstance(expression, DRS): r_conds = [] for cond in expression.conds: r_cond = resolve_anaphora(cond, trail + [expression]) # if the condition is of the form '(x = [])' then raise exception if isinstance(r_cond, EqualityExpression): if isinstance(r_cond.first, PossibleAntecedents): # Reverse the order so that the variable is on the left temp = r_cond.first r_cond.first = r_cond.second r_cond.second = temp if isinstance(r_cond.second, PossibleAntecedents): if not r_cond.second: raise AnaphoraResolutionException( "Variable '%s' does not " "resolve to anything." % r_cond.first ) r_conds.append(r_cond) if expression.consequent: consequent = resolve_anaphora(expression.consequent, trail + [expression]) else: consequent = None return expression.__class__(expression.refs, r_conds, consequent) elif isinstance(expression, AbstractVariableExpression): return expression elif isinstance(expression, NegatedExpression): return expression.__class__( resolve_anaphora(expression.term, trail + [expression]) ) elif isinstance(expression, DrtConcatenation): if expression.consequent: consequent = resolve_anaphora(expression.consequent, trail + [expression]) else: consequent = None return expression.__class__( resolve_anaphora(expression.first, trail + [expression]), resolve_anaphora(expression.second, trail + [expression]), consequent, ) elif isinstance(expression, BinaryExpression): return expression.__class__( resolve_anaphora(expression.first, trail + [expression]), resolve_anaphora(expression.second, trail + [expression]), ) elif isinstance(expression, LambdaExpression): return expression.__class__( expression.variable, resolve_anaphora(expression.term, trail + [expression]) ) class DrsDrawer: BUFFER = 3 # Space between elements TOPSPACE = 10 # Space above whole DRS OUTERSPACE = 6 # Space to the left, right, and bottom of the while DRS def __init__(self, drs, size_canvas=True, canvas=None): """ :param drs: ``DrtExpression``, The DRS to be drawn :param size_canvas: bool, True if the canvas size should be the exact size of the DRS :param canvas: ``Canvas`` The canvas on which to draw the DRS. If none is given, create a new canvas. 
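        (Typically ``DrsDrawer`` is not constructed directly;
        ``DrtExpression.draw()`` simply builds ``DrsDrawer(self)`` and calls
        its ``draw()`` method.)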
""" master = None if not canvas: master = Tk() master.title("DRT") font = Font(family="helvetica", size=12) if size_canvas: canvas = Canvas(master, width=0, height=0) canvas.font = font self.canvas = canvas (right, bottom) = self._visit(drs, self.OUTERSPACE, self.TOPSPACE) width = max(right + self.OUTERSPACE, 100) height = bottom + self.OUTERSPACE canvas = Canvas(master, width=width, height=height) # , bg='white') else: canvas = Canvas(master, width=300, height=300) canvas.pack() canvas.font = font self.canvas = canvas self.drs = drs self.master = master def _get_text_height(self): """Get the height of a line of text""" return self.canvas.font.metrics("linespace") def draw(self, x=OUTERSPACE, y=TOPSPACE): """Draw the DRS""" self._handle(self.drs, self._draw_command, x, y) if self.master and not in_idle(): self.master.mainloop() else: return self._visit(self.drs, x, y) def _visit(self, expression, x, y): """ Return the bottom-rightmost point without actually drawing the item :param expression: the item to visit :param x: the top of the current drawing area :param y: the left side of the current drawing area :return: the bottom-rightmost point """ return self._handle(expression, self._visit_command, x, y) def _draw_command(self, item, x, y): """ Draw the given item at the given location :param item: the item to draw :param x: the top of the current drawing area :param y: the left side of the current drawing area :return: the bottom-rightmost point """ if isinstance(item, str): self.canvas.create_text(x, y, anchor="nw", font=self.canvas.font, text=item) elif isinstance(item, tuple): # item is the lower-right of a box (right, bottom) = item self.canvas.create_rectangle(x, y, right, bottom) horiz_line_y = ( y + self._get_text_height() + (self.BUFFER * 2) ) # the line separating refs from conds self.canvas.create_line(x, horiz_line_y, right, horiz_line_y) return self._visit_command(item, x, y) def _visit_command(self, item, x, y): """ Return the bottom-rightmost point without actually drawing the item :param item: the item to visit :param x: the top of the current drawing area :param y: the left side of the current drawing area :return: the bottom-rightmost point """ if isinstance(item, str): return (x + self.canvas.font.measure(item), y + self._get_text_height()) elif isinstance(item, tuple): return item def _handle(self, expression, command, x=0, y=0): """ :param expression: the expression to handle :param command: the function to apply, either _draw_command or _visit_command :param x: the top of the current drawing area :param y: the left side of the current drawing area :return: the bottom-rightmost point """ if command == self._visit_command: # if we don't need to draw the item, then we can use the cached values try: # attempt to retrieve cached values right = expression._drawing_width + x bottom = expression._drawing_height + y return (right, bottom) except AttributeError: # the values have not been cached yet, so compute them pass if isinstance(expression, DrtAbstractVariableExpression): factory = self._handle_VariableExpression elif isinstance(expression, DRS): factory = self._handle_DRS elif isinstance(expression, DrtNegatedExpression): factory = self._handle_NegatedExpression elif isinstance(expression, DrtLambdaExpression): factory = self._handle_LambdaExpression elif isinstance(expression, BinaryExpression): factory = self._handle_BinaryExpression elif isinstance(expression, DrtApplicationExpression): factory = self._handle_ApplicationExpression elif isinstance(expression, 
PossibleAntecedents): factory = self._handle_VariableExpression elif isinstance(expression, DrtProposition): factory = self._handle_DrtProposition else: raise Exception(expression.__class__.__name__) (right, bottom) = factory(expression, command, x, y) # cache the values expression._drawing_width = right - x expression._drawing_height = bottom - y return (right, bottom) def _handle_VariableExpression(self, expression, command, x, y): return command("%s" % expression, x, y) def _handle_NegatedExpression(self, expression, command, x, y): # Find the width of the negation symbol right = self._visit_command(DrtTokens.NOT, x, y)[0] # Handle term (right, bottom) = self._handle(expression.term, command, right, y) # Handle variables now that we know the y-coordinate command( DrtTokens.NOT, x, self._get_centered_top(y, bottom - y, self._get_text_height()), ) return (right, bottom) def _handle_DRS(self, expression, command, x, y): left = x + self.BUFFER # indent the left side bottom = y + self.BUFFER # indent the top # Handle Discourse Referents if expression.refs: refs = " ".join("%s" % r for r in expression.refs) else: refs = " " (max_right, bottom) = command(refs, left, bottom) bottom += self.BUFFER * 2 # Handle Conditions if expression.conds: for cond in expression.conds: (right, bottom) = self._handle(cond, command, left, bottom) max_right = max(max_right, right) bottom += self.BUFFER else: bottom += self._get_text_height() + self.BUFFER # Handle Box max_right += self.BUFFER return command((max_right, bottom), x, y) def _handle_ApplicationExpression(self, expression, command, x, y): function, args = expression.uncurry() if not isinstance(function, DrtAbstractVariableExpression): # It's not a predicate expression ("P(x,y)"), so leave arguments curried function = expression.function args = [expression.argument] # Get the max bottom of any element on the line function_bottom = self._visit(function, x, y)[1] max_bottom = max( [function_bottom] + [self._visit(arg, x, y)[1] for arg in args] ) line_height = max_bottom - y # Handle 'function' function_drawing_top = self._get_centered_top( y, line_height, function._drawing_height ) right = self._handle(function, command, x, function_drawing_top)[0] # Handle open paren centred_string_top = self._get_centered_top( y, line_height, self._get_text_height() ) right = command(DrtTokens.OPEN, right, centred_string_top)[0] # Handle each arg for (i, arg) in enumerate(args): arg_drawing_top = self._get_centered_top( y, line_height, arg._drawing_height ) right = self._handle(arg, command, right, arg_drawing_top)[0] if i + 1 < len(args): # since it's not the last arg, add a comma right = command(DrtTokens.COMMA + " ", right, centred_string_top)[0] # Handle close paren right = command(DrtTokens.CLOSE, right, centred_string_top)[0] return (right, max_bottom) def _handle_LambdaExpression(self, expression, command, x, y): # Find the width of the lambda symbol and abstracted variables variables = DrtTokens.LAMBDA + "%s" % expression.variable + DrtTokens.DOT right = self._visit_command(variables, x, y)[0] # Handle term (right, bottom) = self._handle(expression.term, command, right, y) # Handle variables now that we know the y-coordinate command( variables, x, self._get_centered_top(y, bottom - y, self._get_text_height()) ) return (right, bottom) def _handle_BinaryExpression(self, expression, command, x, y): # Get the full height of the line, based on the operands first_height = self._visit(expression.first, 0, 0)[1] second_height = self._visit(expression.second, 0, 0)[1] 
line_height = max(first_height, second_height) # Handle open paren centred_string_top = self._get_centered_top( y, line_height, self._get_text_height() ) right = command(DrtTokens.OPEN, x, centred_string_top)[0] # Handle the first operand first_height = expression.first._drawing_height (right, first_bottom) = self._handle( expression.first, command, right, self._get_centered_top(y, line_height, first_height), ) # Handle the operator right = command(" %s " % expression.getOp(), right, centred_string_top)[0] # Handle the second operand second_height = expression.second._drawing_height (right, second_bottom) = self._handle( expression.second, command, right, self._get_centered_top(y, line_height, second_height), ) # Handle close paren right = command(DrtTokens.CLOSE, right, centred_string_top)[0] return (right, max(first_bottom, second_bottom)) def _handle_DrtProposition(self, expression, command, x, y): # Find the width of the negation symbol right = command(expression.variable, x, y)[0] # Handle term (right, bottom) = self._handle(expression.term, command, right, y) return (right, bottom) def _get_centered_top(self, top, full_height, item_height): """Get the y-coordinate of the point that a figure should start at if its height is 'item_height' and it needs to be centered in an area that starts at 'top' and is 'full_height' tall.""" return top + (full_height - item_height) / 2 def demo(): print("=" * 20 + "TEST PARSE" + "=" * 20) dexpr = DrtExpression.fromstring print(dexpr(r"([x,y],[sees(x,y)])")) print(dexpr(r"([x],[man(x), walks(x)])")) print(dexpr(r"\x.\y.([],[sees(x,y)])")) print(dexpr(r"\x.([],[walks(x)])(john)")) print(dexpr(r"(([x],[walks(x)]) + ([y],[runs(y)]))")) print(dexpr(r"(([],[walks(x)]) -> ([],[runs(x)]))")) print(dexpr(r"([x],[PRO(x), sees(John,x)])")) print(dexpr(r"([x],[man(x), -([],[walks(x)])])")) print(dexpr(r"([],[(([x],[man(x)]) -> ([],[walks(x)]))])")) print("=" * 20 + "Test fol()" + "=" * 20) print(dexpr(r"([x,y],[sees(x,y)])").fol()) print("=" * 20 + "Test alpha conversion and lambda expression equality" + "=" * 20) e1 = dexpr(r"\x.([],[P(x)])") print(e1) e2 = e1.alpha_convert(Variable("z")) print(e2) print(e1 == e2) print("=" * 20 + "Test resolve_anaphora()" + "=" * 20) print(resolve_anaphora(dexpr(r"([x,y,z],[dog(x), cat(y), walks(z), PRO(z)])"))) print( resolve_anaphora(dexpr(r"([],[(([x],[dog(x)]) -> ([y],[walks(y), PRO(y)]))])")) ) print(resolve_anaphora(dexpr(r"(([x,y],[]) + ([],[PRO(x)]))"))) print("=" * 20 + "Test pretty_print()" + "=" * 20) dexpr(r"([],[])").pretty_print() dexpr( r"([],[([x],[big(x), dog(x)]) -> ([],[bark(x)]) -([x],[walk(x)])])" ).pretty_print() dexpr(r"([x,y],[x=y]) + ([z],[dog(z), walk(z)])").pretty_print() dexpr(r"([],[([x],[]) | ([y],[]) | ([z],[dog(z), walk(z)])])").pretty_print() dexpr(r"\P.\Q.(([x],[]) + P(x) + Q(x))(\x.([],[dog(x)]))").pretty_print() def test_draw(): try: from tkinter import Tk except ImportError as e: raise ValueError("tkinter is required, but it's not available.") expressions = [ r"x", r"([],[])", r"([x],[])", r"([x],[man(x)])", r"([x,y],[sees(x,y)])", r"([x],[man(x), walks(x)])", r"\x.([],[man(x), walks(x)])", r"\x y.([],[sees(x,y)])", r"([],[(([],[walks(x)]) + ([],[runs(x)]))])", r"([x],[man(x), -([],[walks(x)])])", r"([],[(([x],[man(x)]) -> ([],[walks(x)]))])", ] for e in expressions: d = DrtExpression.fromstring(e) d.draw() if __name__ == "__main__": demo() nltk-3.7/nltk/sem/drt_glue_demo.py000066400000000000000000000442721420073152400172610ustar00rootroot00000000000000# Natural Language Toolkit: GUI Demo 
for Glue Semantics with Discourse # Representation Theory (DRT) as meaning language # # Author: Dan Garrette # # Copyright (C) 2001-2022 NLTK Project # URL: # For license information, see LICENSE.TXT try: from tkinter import Button, Frame, IntVar, Label, Listbox, Menu, Scrollbar, Tk from tkinter.font import Font from nltk.draw.util import CanvasFrame, ShowText except ImportError: """Ignore ImportError because tkinter might not be available.""" from nltk.parse import MaltParser from nltk.sem.drt import DrsDrawer, DrtVariableExpression from nltk.sem.glue import DrtGlue from nltk.sem.logic import Variable from nltk.tag import RegexpTagger from nltk.util import in_idle class DrtGlueDemo: def __init__(self, examples): # Set up the main window. self._top = Tk() self._top.title("DRT Glue Demo") # Set up key bindings. self._init_bindings() # Initialize the fonts.self._error = None self._init_fonts(self._top) self._examples = examples self._readingCache = [None for example in examples] # The user can hide the grammar. self._show_grammar = IntVar(self._top) self._show_grammar.set(1) # Set the data to None self._curExample = -1 self._readings = [] self._drs = None self._drsWidget = None self._error = None self._init_glue() # Create the basic frames. self._init_menubar(self._top) self._init_buttons(self._top) self._init_exampleListbox(self._top) self._init_readingListbox(self._top) self._init_canvas(self._top) # Resize callback self._canvas.bind("", self._configure) ######################################### ## Initialization Helpers ######################################### def _init_glue(self): tagger = RegexpTagger( [ ("^(David|Mary|John)$", "NNP"), ( "^(walks|sees|eats|chases|believes|gives|sleeps|chases|persuades|tries|seems|leaves)$", "VB", ), ("^(go|order|vanish|find|approach)$", "VB"), ("^(a)$", "ex_quant"), ("^(every)$", "univ_quant"), ("^(sandwich|man|dog|pizza|unicorn|cat|senator)$", "NN"), ("^(big|gray|former)$", "JJ"), ("^(him|himself)$", "PRP"), ] ) depparser = MaltParser(tagger=tagger) self._glue = DrtGlue(depparser=depparser, remove_duplicates=False) def _init_fonts(self, root): # See: self._sysfont = Font(font=Button()["font"]) root.option_add("*Font", self._sysfont) # TWhat's our font size (default=same as sysfont) self._size = IntVar(root) self._size.set(self._sysfont.cget("size")) self._boldfont = Font(family="helvetica", weight="bold", size=self._size.get()) self._font = Font(family="helvetica", size=self._size.get()) if self._size.get() < 0: big = self._size.get() - 2 else: big = self._size.get() + 2 self._bigfont = Font(family="helvetica", weight="bold", size=big) def _init_exampleListbox(self, parent): self._exampleFrame = listframe = Frame(parent) self._exampleFrame.pack(fill="both", side="left", padx=2) self._exampleList_label = Label( self._exampleFrame, font=self._boldfont, text="Examples" ) self._exampleList_label.pack() self._exampleList = Listbox( self._exampleFrame, selectmode="single", relief="groove", background="white", foreground="#909090", font=self._font, selectforeground="#004040", selectbackground="#c0f0c0", ) self._exampleList.pack(side="right", fill="both", expand=1) for example in self._examples: self._exampleList.insert("end", (" %s" % example)) self._exampleList.config(height=min(len(self._examples), 25), width=40) # Add a scrollbar if there are more than 25 examples. 
if len(self._examples) > 25: listscroll = Scrollbar(self._exampleFrame, orient="vertical") self._exampleList.config(yscrollcommand=listscroll.set) listscroll.config(command=self._exampleList.yview) listscroll.pack(side="left", fill="y") # If they select a example, apply it. self._exampleList.bind("<>", self._exampleList_select) def _init_readingListbox(self, parent): self._readingFrame = listframe = Frame(parent) self._readingFrame.pack(fill="both", side="left", padx=2) self._readingList_label = Label( self._readingFrame, font=self._boldfont, text="Readings" ) self._readingList_label.pack() self._readingList = Listbox( self._readingFrame, selectmode="single", relief="groove", background="white", foreground="#909090", font=self._font, selectforeground="#004040", selectbackground="#c0f0c0", ) self._readingList.pack(side="right", fill="both", expand=1) # Add a scrollbar if there are more than 25 examples. listscroll = Scrollbar(self._readingFrame, orient="vertical") self._readingList.config(yscrollcommand=listscroll.set) listscroll.config(command=self._readingList.yview) listscroll.pack(side="right", fill="y") self._populate_readingListbox() def _populate_readingListbox(self): # Populate the listbox with integers self._readingList.delete(0, "end") for i in range(len(self._readings)): self._readingList.insert("end", (" %s" % (i + 1))) self._readingList.config(height=min(len(self._readings), 25), width=5) # If they select a example, apply it. self._readingList.bind("<>", self._readingList_select) def _init_bindings(self): # Key bindings are a good thing. self._top.bind("", self.destroy) self._top.bind("", self.destroy) self._top.bind("", self.destroy) self._top.bind("n", self.next) self._top.bind("", self.next) self._top.bind("p", self.prev) self._top.bind("", self.prev) def _init_buttons(self, parent): # Set up the frames. 
self._buttonframe = buttonframe = Frame(parent) buttonframe.pack(fill="none", side="bottom", padx=3, pady=2) Button( buttonframe, text="Prev", background="#90c0d0", foreground="black", command=self.prev, ).pack(side="left") Button( buttonframe, text="Next", background="#90c0d0", foreground="black", command=self.next, ).pack(side="left") def _configure(self, event): self._autostep = 0 (x1, y1, x2, y2) = self._cframe.scrollregion() y2 = event.height - 6 self._canvas["scrollregion"] = "%d %d %d %d" % (x1, y1, x2, y2) self._redraw() def _init_canvas(self, parent): self._cframe = CanvasFrame( parent, background="white", # width=525, height=250, closeenough=10, border=2, relief="sunken", ) self._cframe.pack(expand=1, fill="both", side="top", pady=2) canvas = self._canvas = self._cframe.canvas() # Initially, there's no tree or text self._tree = None self._textwidgets = [] self._textline = None def _init_menubar(self, parent): menubar = Menu(parent) filemenu = Menu(menubar, tearoff=0) filemenu.add_command( label="Exit", underline=1, command=self.destroy, accelerator="q" ) menubar.add_cascade(label="File", underline=0, menu=filemenu) actionmenu = Menu(menubar, tearoff=0) actionmenu.add_command( label="Next", underline=0, command=self.next, accelerator="n, Space" ) actionmenu.add_command( label="Previous", underline=0, command=self.prev, accelerator="p, Backspace" ) menubar.add_cascade(label="Action", underline=0, menu=actionmenu) optionmenu = Menu(menubar, tearoff=0) optionmenu.add_checkbutton( label="Remove Duplicates", underline=0, variable=self._glue.remove_duplicates, command=self._toggle_remove_duplicates, accelerator="r", ) menubar.add_cascade(label="Options", underline=0, menu=optionmenu) viewmenu = Menu(menubar, tearoff=0) viewmenu.add_radiobutton( label="Tiny", variable=self._size, underline=0, value=10, command=self.resize, ) viewmenu.add_radiobutton( label="Small", variable=self._size, underline=0, value=12, command=self.resize, ) viewmenu.add_radiobutton( label="Medium", variable=self._size, underline=0, value=14, command=self.resize, ) viewmenu.add_radiobutton( label="Large", variable=self._size, underline=0, value=18, command=self.resize, ) viewmenu.add_radiobutton( label="Huge", variable=self._size, underline=0, value=24, command=self.resize, ) menubar.add_cascade(label="View", underline=0, menu=viewmenu) helpmenu = Menu(menubar, tearoff=0) helpmenu.add_command(label="About", underline=0, command=self.about) menubar.add_cascade(label="Help", underline=0, menu=helpmenu) parent.config(menu=menubar) ######################################### ## Main draw procedure ######################################### def _redraw(self): canvas = self._canvas # Delete the old DRS, widgets, etc. 
if self._drsWidget is not None: self._drsWidget.clear() if self._drs: self._drsWidget = DrsWidget(self._canvas, self._drs) self._drsWidget.draw() if self._error: self._drsWidget = DrsWidget(self._canvas, self._error) self._drsWidget.draw() ######################################### ## Button Callbacks ######################################### def destroy(self, *e): self._autostep = 0 if self._top is None: return self._top.destroy() self._top = None def prev(self, *e): selection = self._readingList.curselection() readingListSize = self._readingList.size() # there are readings if readingListSize > 0: # if one reading is currently selected if len(selection) == 1: index = int(selection[0]) # if it's on (or before) the first item if index <= 0: self._select_previous_example() else: self._readingList_store_selection(index - 1) else: # select its first reading self._readingList_store_selection(readingListSize - 1) else: self._select_previous_example() def _select_previous_example(self): # if the current example is not the first example if self._curExample > 0: self._exampleList_store_selection(self._curExample - 1) else: # go to the last example self._exampleList_store_selection(len(self._examples) - 1) def next(self, *e): selection = self._readingList.curselection() readingListSize = self._readingList.size() # if there are readings if readingListSize > 0: # if one reading is currently selected if len(selection) == 1: index = int(selection[0]) # if it's on (or past) the last item if index >= (readingListSize - 1): self._select_next_example() else: self._readingList_store_selection(index + 1) else: # select its first reading self._readingList_store_selection(0) else: self._select_next_example() def _select_next_example(self): # if the current example is not the last example if self._curExample < len(self._examples) - 1: self._exampleList_store_selection(self._curExample + 1) else: # go to the first example self._exampleList_store_selection(0) def about(self, *e): ABOUT = ( "NLTK Discourse Representation Theory (DRT) Glue Semantics Demo\n" + "Written by Daniel H. Garrette" ) TITLE = "About: NLTK DRT Glue Demo" try: from tkinter.messagebox import Message Message(message=ABOUT, title=TITLE).show() except: ShowText(self._top, TITLE, ABOUT) def postscript(self, *e): self._autostep = 0 self._cframe.print_to_file() def mainloop(self, *args, **kwargs): """ Enter the Tkinter mainloop. This function must be called if this demo is created from a non-interactive program (e.g. from a secript); otherwise, the demo will close as soon as the script completes. 
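        (If run from inside IDLE, ``mainloop`` is a no-op, since IDLE already
        runs a Tkinter main loop of its own.)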
""" if in_idle(): return self._top.mainloop(*args, **kwargs) def resize(self, size=None): if size is not None: self._size.set(size) size = self._size.get() self._font.configure(size=-(abs(size))) self._boldfont.configure(size=-(abs(size))) self._sysfont.configure(size=-(abs(size))) self._bigfont.configure(size=-(abs(size + 2))) self._redraw() def _toggle_remove_duplicates(self): self._glue.remove_duplicates = not self._glue.remove_duplicates self._exampleList.selection_clear(0, "end") self._readings = [] self._populate_readingListbox() self._readingCache = [None for ex in self._examples] self._curExample = -1 self._error = None self._drs = None self._redraw() def _exampleList_select(self, event): selection = self._exampleList.curselection() if len(selection) != 1: return self._exampleList_store_selection(int(selection[0])) def _exampleList_store_selection(self, index): self._curExample = index example = self._examples[index] self._exampleList.selection_clear(0, "end") if example: cache = self._readingCache[index] if cache: if isinstance(cache, list): self._readings = cache self._error = None else: self._readings = [] self._error = cache else: try: self._readings = self._glue.parse_to_meaning(example) self._error = None self._readingCache[index] = self._readings except Exception as e: self._readings = [] self._error = DrtVariableExpression(Variable("Error: " + str(e))) self._readingCache[index] = self._error # add a star to the end of the example self._exampleList.delete(index) self._exampleList.insert(index, (" %s *" % example)) self._exampleList.config( height=min(len(self._examples), 25), width=40 ) self._populate_readingListbox() self._exampleList.selection_set(index) self._drs = None self._redraw() def _readingList_select(self, event): selection = self._readingList.curselection() if len(selection) != 1: return self._readingList_store_selection(int(selection[0])) def _readingList_store_selection(self, index): reading = self._readings[index] self._readingList.selection_clear(0, "end") if reading: self._readingList.selection_set(index) self._drs = reading.simplify().normalize().resolve_anaphora() self._redraw() class DrsWidget: def __init__(self, canvas, drs, **attribs): self._drs = drs self._canvas = canvas canvas.font = Font( font=canvas.itemcget(canvas.create_text(0, 0, text=""), "font") ) canvas._BUFFER = 3 self.bbox = (0, 0, 0, 0) def draw(self): (right, bottom) = DrsDrawer(self._drs, canvas=self._canvas).draw() self.bbox = (0, 0, right + 1, bottom + 1) def clear(self): self._canvas.create_rectangle(self.bbox, fill="white", width="0") def demo(): examples = [ "John walks", "David sees Mary", "David eats a sandwich", "every man chases a dog", # 'every man believes a dog yawns', # 'John gives David a sandwich', "John chases himself", # 'John persuades David to order a pizza', # 'John tries to go', # 'John tries to find a unicorn', # 'John seems to vanish', # 'a unicorn seems to approach', # 'every big cat leaves', # 'every gray cat leaves', # 'every big gray cat leaves', # 'a former senator leaves', # 'John likes a cat', # 'John likes every cat', # 'he walks', # 'John walks and he leaves' ] DrtGlueDemo(examples).mainloop() if __name__ == "__main__": demo() nltk-3.7/nltk/sem/evaluate.py000066400000000000000000000611101420073152400162440ustar00rootroot00000000000000# Natural Language Toolkit: Models for first-order languages with lambda # # Copyright (C) 2001-2022 NLTK Project # Author: Ewan Klein , # URL: # For license information, see LICENSE.TXT # TODO: # - fix tracing # - fix 
iterator-based approach to existentials """ This module provides data structures for representing first-order models. """ import inspect import re import sys import textwrap from pprint import pformat from nltk.decorators import decorator # this used in code that is commented out from nltk.sem.logic import ( AbstractVariableExpression, AllExpression, AndExpression, ApplicationExpression, EqualityExpression, ExistsExpression, Expression, IffExpression, ImpExpression, IndividualVariableExpression, LambdaExpression, NegatedExpression, OrExpression, Variable, is_indvar, ) class Error(Exception): pass class Undefined(Error): pass def trace(f, *args, **kw): argspec = inspect.getfullargspec(f) d = dict(zip(argspec[0], args)) if d.pop("trace", None): print() for item in d.items(): print("%s => %s" % item) return f(*args, **kw) def is_rel(s): """ Check whether a set represents a relation (of any arity). :param s: a set containing tuples of str elements :type s: set :rtype: bool """ # we have the empty relation, i.e. set() if len(s) == 0: return True # all the elements are tuples of the same length elif all(isinstance(el, tuple) for el in s) and len(max(s)) == len(min(s)): return True else: raise ValueError("Set %r contains sequences of different lengths" % s) def set2rel(s): """ Convert a set containing individuals (strings or numbers) into a set of unary tuples. Any tuples of strings already in the set are passed through unchanged. For example: - set(['a', 'b']) => set([('a',), ('b',)]) - set([3, 27]) => set([('3',), ('27',)]) :type s: set :rtype: set of tuple of str """ new = set() for elem in s: if isinstance(elem, str): new.add((elem,)) elif isinstance(elem, int): new.add(str(elem)) else: new.add(elem) return new def arity(rel): """ Check the arity of a relation. :type rel: set of tuples :rtype: int of tuple of str """ if len(rel) == 0: return 0 return len(list(rel)[0]) class Valuation(dict): """ A dictionary which represents a model-theoretic Valuation of non-logical constants. Keys are strings representing the constants to be interpreted, and values correspond to individuals (represented as strings) and n-ary relations (represented as sets of tuples of strings). An instance of ``Valuation`` will raise a KeyError exception (i.e., just behave like a standard dictionary) if indexed with an expression that is not in its list of symbols. """ def __init__(self, xs): """ :param xs: a list of (symbol, value) pairs. """ super().__init__() for (sym, val) in xs: if isinstance(val, str) or isinstance(val, bool): self[sym] = val elif isinstance(val, set): self[sym] = set2rel(val) else: msg = textwrap.fill( "Error in initializing Valuation. 
" "Unrecognized value for symbol '%s':\n%s" % (sym, val), width=66, ) raise ValueError(msg) def __getitem__(self, key): if key in self: return dict.__getitem__(self, key) else: raise Undefined("Unknown expression: '%s'" % key) def __str__(self): return pformat(self) @property def domain(self): """Set-theoretic domain of the value-space of a Valuation.""" dom = [] for val in self.values(): if isinstance(val, str): dom.append(val) elif not isinstance(val, bool): dom.extend( [elem for tuple_ in val for elem in tuple_ if elem is not None] ) return set(dom) @property def symbols(self): """The non-logical constants which the Valuation recognizes.""" return sorted(self.keys()) @classmethod def fromstring(cls, s): return read_valuation(s) ########################################## # REs used by the _read_valuation function ########################################## _VAL_SPLIT_RE = re.compile(r"\s*=+>\s*") _ELEMENT_SPLIT_RE = re.compile(r"\s*,\s*") _TUPLES_RE = re.compile( r"""\s* (\([^)]+\)) # tuple-expression \s*""", re.VERBOSE, ) def _read_valuation_line(s): """ Read a line in a valuation file. Lines are expected to be of the form:: noosa => n girl => {g1, g2} chase => {(b1, g1), (b2, g1), (g1, d1), (g2, d2)} :param s: input line :type s: str :return: a pair (symbol, value) :rtype: tuple """ pieces = _VAL_SPLIT_RE.split(s) symbol = pieces[0] value = pieces[1] # check whether the value is meant to be a set if value.startswith("{"): value = value[1:-1] tuple_strings = _TUPLES_RE.findall(value) # are the set elements tuples? if tuple_strings: set_elements = [] for ts in tuple_strings: ts = ts[1:-1] element = tuple(_ELEMENT_SPLIT_RE.split(ts)) set_elements.append(element) else: set_elements = _ELEMENT_SPLIT_RE.split(value) value = set(set_elements) return symbol, value def read_valuation(s, encoding=None): """ Convert a valuation string into a valuation. :param s: a valuation string :type s: str :param encoding: the encoding of the input string, if it is binary :type encoding: str :return: a ``nltk.sem`` valuation :rtype: Valuation """ if encoding is not None: s = s.decode(encoding) statements = [] for linenum, line in enumerate(s.splitlines()): line = line.strip() if line.startswith("#") or line == "": continue try: statements.append(_read_valuation_line(line)) except ValueError as e: raise ValueError(f"Unable to parse line {linenum}: {line}") from e return Valuation(statements) class Assignment(dict): r""" A dictionary which represents an assignment of values to variables. An assignment can only assign values from its domain. If an unknown expression *a* is passed to a model *M*\ 's interpretation function *i*, *i* will first check whether *M*\ 's valuation assigns an interpretation to *a* as a constant, and if this fails, *i* will delegate the interpretation of *a* to *g*. *g* only assigns values to individual variables (i.e., members of the class ``IndividualVariableExpression`` in the ``logic`` module. If a variable is not assigned a value by *g*, it will raise an ``Undefined`` exception. A variable *Assignment* is a mapping from individual variables to entities in the domain. Individual variables are usually indicated with the letters ``'x'``, ``'y'``, ``'w'`` and ``'z'``, optionally followed by an integer (e.g., ``'x0'``, ``'y332'``). Assignments are created using the ``Assignment`` constructor, which also takes the domain as a parameter. 
>>> from nltk.sem.evaluate import Assignment >>> dom = set(['u1', 'u2', 'u3', 'u4']) >>> g3 = Assignment(dom, [('x', 'u1'), ('y', 'u2')]) >>> g3 == {'x': 'u1', 'y': 'u2'} True There is also a ``print`` format for assignments which uses a notation closer to that in logic textbooks: >>> print(g3) g[u1/x][u2/y] It is also possible to update an assignment using the ``add`` method: >>> dom = set(['u1', 'u2', 'u3', 'u4']) >>> g4 = Assignment(dom) >>> g4.add('x', 'u1') {'x': 'u1'} With no arguments, ``purge()`` is equivalent to ``clear()`` on a dictionary: >>> g4.purge() >>> g4 {} :param domain: the domain of discourse :type domain: set :param assign: a list of (varname, value) associations :type assign: list """ def __init__(self, domain, assign=None): super().__init__() self.domain = domain if assign: for (var, val) in assign: assert val in self.domain, "'{}' is not in the domain: {}".format( val, self.domain, ) assert is_indvar(var), ( "Wrong format for an Individual Variable: '%s'" % var ) self[var] = val self.variant = None self._addvariant() def __getitem__(self, key): if key in self: return dict.__getitem__(self, key) else: raise Undefined("Not recognized as a variable: '%s'" % key) def copy(self): new = Assignment(self.domain) new.update(self) return new def purge(self, var=None): """ Remove one or all keys (i.e. logic variables) from an assignment, and update ``self.variant``. :param var: a Variable acting as a key for the assignment. """ if var: del self[var] else: self.clear() self._addvariant() return None def __str__(self): """ Pretty printing for assignments. {'x', 'u'} appears as 'g[u/x]' """ gstring = "g" # Deterministic output for unit testing. variant = sorted(self.variant) for (val, var) in variant: gstring += f"[{val}/{var}]" return gstring def _addvariant(self): """ Create a more pretty-printable version of the assignment. """ list_ = [] for item in self.items(): pair = (item[1], item[0]) list_.append(pair) self.variant = list_ return None def add(self, var, val): """ Add a new variable-value pair to the assignment, and update ``self.variant``. """ assert val in self.domain, f"{val} is not in the domain {self.domain}" assert is_indvar(var), "Wrong format for an Individual Variable: '%s'" % var self[var] = val self._addvariant() return self class Model: """ A first order model is a domain *D* of discourse and a valuation *V*. A domain *D* is a set, and a valuation *V* is a map that associates expressions with values in the model. The domain of *V* should be a subset of *D*. Construct a new ``Model``. :type domain: set :param domain: A set of entities representing the domain of discourse of the model. :type valuation: Valuation :param valuation: the valuation of the model. :param prop: If this is set, then we are building a propositional\ model and don't require the domain of *V* to be subset of *D*. """ def __init__(self, domain, valuation): assert isinstance(domain, set) self.domain = domain self.valuation = valuation if not domain.issuperset(valuation.domain): raise Error( "The valuation domain, %s, must be a subset of the model's domain, %s" % (valuation.domain, domain) ) def __repr__(self): return f"({self.domain!r}, {self.valuation!r})" def __str__(self): return f"Domain = {self.domain},\nValuation = \n{self.valuation}" def evaluate(self, expr, g, trace=None): """ Read input expressions, and provide a handler for ``satisfy`` that blocks further propagation of the ``Undefined`` error. :param expr: An ``Expression`` of ``logic``. 
:type g: Assignment :param g: an assignment to individual variables. :rtype: bool or 'Undefined' """ try: parsed = Expression.fromstring(expr) value = self.satisfy(parsed, g, trace=trace) if trace: print() print(f"'{expr}' evaluates to {value} under M, {g}") return value except Undefined: if trace: print() print(f"'{expr}' is undefined under M, {g}") return "Undefined" def satisfy(self, parsed, g, trace=None): """ Recursive interpretation function for a formula of first-order logic. Raises an ``Undefined`` error when ``parsed`` is an atomic string but is not a symbol or an individual variable. :return: Returns a truth value or ``Undefined`` if ``parsed`` is\ complex, and calls the interpretation function ``i`` if ``parsed``\ is atomic. :param parsed: An expression of ``logic``. :type g: Assignment :param g: an assignment to individual variables. """ if isinstance(parsed, ApplicationExpression): function, arguments = parsed.uncurry() if isinstance(function, AbstractVariableExpression): # It's a predicate expression ("P(x,y)"), so used uncurried arguments funval = self.satisfy(function, g) argvals = tuple(self.satisfy(arg, g) for arg in arguments) return argvals in funval else: # It must be a lambda expression, so use curried form funval = self.satisfy(parsed.function, g) argval = self.satisfy(parsed.argument, g) return funval[argval] elif isinstance(parsed, NegatedExpression): return not self.satisfy(parsed.term, g) elif isinstance(parsed, AndExpression): return self.satisfy(parsed.first, g) and self.satisfy(parsed.second, g) elif isinstance(parsed, OrExpression): return self.satisfy(parsed.first, g) or self.satisfy(parsed.second, g) elif isinstance(parsed, ImpExpression): return (not self.satisfy(parsed.first, g)) or self.satisfy(parsed.second, g) elif isinstance(parsed, IffExpression): return self.satisfy(parsed.first, g) == self.satisfy(parsed.second, g) elif isinstance(parsed, EqualityExpression): return self.satisfy(parsed.first, g) == self.satisfy(parsed.second, g) elif isinstance(parsed, AllExpression): new_g = g.copy() for u in self.domain: new_g.add(parsed.variable.name, u) if not self.satisfy(parsed.term, new_g): return False return True elif isinstance(parsed, ExistsExpression): new_g = g.copy() for u in self.domain: new_g.add(parsed.variable.name, u) if self.satisfy(parsed.term, new_g): return True return False elif isinstance(parsed, LambdaExpression): cf = {} var = parsed.variable.name for u in self.domain: val = self.satisfy(parsed.term, g.add(var, u)) # NB the dict would be a lot smaller if we do this: # if val: cf[u] = val # But then need to deal with cases where f(a) should yield # a function rather than just False. cf[u] = val return cf else: return self.i(parsed, g, trace) # @decorator(trace_eval) def i(self, parsed, g, trace=False): """ An interpretation function. Assuming that ``parsed`` is atomic: - if ``parsed`` is a non-logical constant, calls the valuation *V* - else if ``parsed`` is an individual variable, calls assignment *g* - else returns ``Undefined``. :param parsed: an ``Expression`` of ``logic``. :type g: Assignment :param g: an assignment to individual variables. :return: a semantic value """ # If parsed is a propositional letter 'p', 'q', etc, it could be in valuation.symbols # and also be an IndividualVariableExpression. We want to catch this first case. 
# So there is a procedural consequence to the ordering of clauses here: if parsed.variable.name in self.valuation.symbols: return self.valuation[parsed.variable.name] elif isinstance(parsed, IndividualVariableExpression): return g[parsed.variable.name] else: raise Undefined("Can't find a value for %s" % parsed) def satisfiers(self, parsed, varex, g, trace=None, nesting=0): """ Generate the entities from the model's domain that satisfy an open formula. :param parsed: an open formula :type parsed: Expression :param varex: the relevant free individual variable in ``parsed``. :type varex: VariableExpression or str :param g: a variable assignment :type g: Assignment :return: a set of the entities that satisfy ``parsed``. """ spacer = " " indent = spacer + (spacer * nesting) candidates = [] if isinstance(varex, str): var = Variable(varex) else: var = varex if var in parsed.free(): if trace: print() print( (spacer * nesting) + f"Open formula is '{parsed}' with assignment {g}" ) for u in self.domain: new_g = g.copy() new_g.add(var.name, u) if trace and trace > 1: lowtrace = trace - 1 else: lowtrace = 0 value = self.satisfy(parsed, new_g, lowtrace) if trace: print(indent + "(trying assignment %s)" % new_g) # parsed == False under g[u/var]? if value == False: if trace: print(indent + f"value of '{parsed}' under {new_g} is False") # so g[u/var] is a satisfying assignment else: candidates.append(u) if trace: print(indent + f"value of '{parsed}' under {new_g} is {value}") result = {c for c in candidates} # var isn't free in parsed else: raise Undefined(f"{var.name} is not free in {parsed}") return result # ////////////////////////////////////////////////////////////////////// # Demo.. # ////////////////////////////////////////////////////////////////////// # number of spacer chars mult = 30 # Demo 1: Propositional Logic ################# def propdemo(trace=None): """Example of a propositional model.""" global val1, dom1, m1, g1 val1 = Valuation([("P", True), ("Q", True), ("R", False)]) dom1 = set() m1 = Model(dom1, val1) g1 = Assignment(dom1) print() print("*" * mult) print("Propositional Formulas Demo") print("*" * mult) print("(Propositional constants treated as nullary predicates)") print() print("Model m1:\n", m1) print("*" * mult) sentences = [ "(P & Q)", "(P & R)", "- P", "- R", "- - P", "- (P & R)", "(P | R)", "(R | P)", "(R | R)", "(- P | R)", "(P | - P)", "(P -> Q)", "(P -> R)", "(R -> P)", "(P <-> P)", "(R <-> R)", "(P <-> R)", ] for sent in sentences: if trace: print() m1.evaluate(sent, g1, trace) else: print(f"The value of '{sent}' is: {m1.evaluate(sent, g1)}") # Demo 2: FOL Model ############# def folmodel(quiet=False, trace=None): """Example of a first-order model.""" global val2, v2, dom2, m2, g2 v2 = [ ("adam", "b1"), ("betty", "g1"), ("fido", "d1"), ("girl", {"g1", "g2"}), ("boy", {"b1", "b2"}), ("dog", {"d1"}), ("love", {("b1", "g1"), ("b2", "g2"), ("g1", "b1"), ("g2", "b1")}), ] val2 = Valuation(v2) dom2 = val2.domain m2 = Model(dom2, val2) g2 = Assignment(dom2, [("x", "b1"), ("y", "g2")]) if not quiet: print() print("*" * mult) print("Models Demo") print("*" * mult) print("Model m2:\n", "-" * 14, "\n", m2) print("Variable assignment = ", g2) exprs = ["adam", "boy", "love", "walks", "x", "y", "z"] parsed_exprs = [Expression.fromstring(e) for e in exprs] print() for parsed in parsed_exprs: try: print( "The interpretation of '%s' in m2 is %s" % (parsed, m2.i(parsed, g2)) ) except Undefined: print("The interpretation of '%s' in m2 is Undefined" % parsed) applications = [ ("boy", 
("adam")), ("walks", ("adam",)), ("love", ("adam", "y")), ("love", ("y", "adam")), ] for (fun, args) in applications: try: funval = m2.i(Expression.fromstring(fun), g2) argsval = tuple(m2.i(Expression.fromstring(arg), g2) for arg in args) print(f"{fun}({args}) evaluates to {argsval in funval}") except Undefined: print(f"{fun}({args}) evaluates to Undefined") # Demo 3: FOL ######### def foldemo(trace=None): """ Interpretation of closed expressions in a first-order model. """ folmodel(quiet=True) print() print("*" * mult) print("FOL Formulas Demo") print("*" * mult) formulas = [ "love (adam, betty)", "(adam = mia)", "\\x. (boy(x) | girl(x))", "\\x. boy(x)(adam)", "\\x y. love(x, y)", "\\x y. love(x, y)(adam)(betty)", "\\x y. love(x, y)(adam, betty)", "\\x y. (boy(x) & love(x, y))", "\\x. exists y. (boy(x) & love(x, y))", "exists z1. boy(z1)", "exists x. (boy(x) & -(x = adam))", "exists x. (boy(x) & all y. love(y, x))", "all x. (boy(x) | girl(x))", "all x. (girl(x) -> exists y. boy(y) & love(x, y))", # Every girl loves exists boy. "exists x. (boy(x) & all y. (girl(y) -> love(y, x)))", # There is exists boy that every girl loves. "exists x. (boy(x) & all y. (girl(y) -> love(x, y)))", # exists boy loves every girl. "all x. (dog(x) -> - girl(x))", "exists x. exists y. (love(x, y) & love(x, y))", ] for fmla in formulas: g2.purge() if trace: m2.evaluate(fmla, g2, trace) else: print(f"The value of '{fmla}' is: {m2.evaluate(fmla, g2)}") # Demo 3: Satisfaction ############# def satdemo(trace=None): """Satisfiers of an open formula in a first order model.""" print() print("*" * mult) print("Satisfiers Demo") print("*" * mult) folmodel(quiet=True) formulas = [ "boy(x)", "(x = x)", "(boy(x) | girl(x))", "(boy(x) & girl(x))", "love(adam, x)", "love(x, adam)", "-(x = adam)", "exists z22. love(x, z22)", "exists y. love(y, x)", "all y. (girl(y) -> love(x, y))", "all y. (girl(y) -> love(y, x))", "all y. (girl(y) -> (boy(x) & love(y, x)))", "(boy(x) & all y. (girl(y) -> love(x, y)))", "(boy(x) & all y. (girl(y) -> love(y, x)))", "(boy(x) & exists y. (girl(y) & love(y, x)))", "(girl(x) -> dog(x))", "all y. (dog(y) -> (x = y))", "exists y. love(y, x)", "exists y. (love(adam, y) & love(y, x))", ] if trace: print(m2) for fmla in formulas: print(fmla) Expression.fromstring(fmla) parsed = [Expression.fromstring(fmla) for fmla in formulas] for p in parsed: g2.purge() print( "The satisfiers of '{}' are: {}".format(p, m2.satisfiers(p, "x", g2, trace)) ) def demo(num=0, trace=None): """ Run exists demos. 
- num = 1: propositional logic demo - num = 2: first order model demo (only if trace is set) - num = 3: first order sentences demo - num = 4: satisfaction of open formulas demo - any other value: run all the demos :param trace: trace = 1, or trace = 2 for more verbose tracing """ demos = {1: propdemo, 2: folmodel, 3: foldemo, 4: satdemo} try: demos[num](trace=trace) except KeyError: for num in demos: demos[num](trace=trace) if __name__ == "__main__": demo(2, trace=0) nltk-3.7/nltk/sem/glue.py000066400000000000000000000713531420073152400154040ustar00rootroot00000000000000# Natural Language Toolkit: Glue Semantics # # Author: Dan Garrette # # Copyright (C) 2001-2022 NLTK Project # URL: # For license information, see LICENSE.TXT import os from itertools import chain import nltk from nltk.internals import Counter from nltk.sem import drt, linearlogic from nltk.sem.logic import ( AbstractVariableExpression, Expression, LambdaExpression, Variable, VariableExpression, ) from nltk.tag import BigramTagger, RegexpTagger, TrigramTagger, UnigramTagger SPEC_SEMTYPES = { "a": "ex_quant", "an": "ex_quant", "every": "univ_quant", "the": "def_art", "no": "no_quant", "default": "ex_quant", } OPTIONAL_RELATIONSHIPS = ["nmod", "vmod", "punct"] class GlueFormula: def __init__(self, meaning, glue, indices=None): if not indices: indices = set() if isinstance(meaning, str): self.meaning = Expression.fromstring(meaning) elif isinstance(meaning, Expression): self.meaning = meaning else: raise RuntimeError( "Meaning term neither string or expression: %s, %s" % (meaning, meaning.__class__) ) if isinstance(glue, str): self.glue = linearlogic.LinearLogicParser().parse(glue) elif isinstance(glue, linearlogic.Expression): self.glue = glue else: raise RuntimeError( "Glue term neither string or expression: %s, %s" % (glue, glue.__class__) ) self.indices = indices def applyto(self, arg): """self = (\\x.(walk x), (subj -o f)) arg = (john , subj) returns ((walk john), f) """ if self.indices & arg.indices: # if the sets are NOT disjoint raise linearlogic.LinearLogicApplicationException( f"'{self}' applied to '{arg}'. Indices are not disjoint." 
) else: # if the sets ARE disjoint return_indices = self.indices | arg.indices try: return_glue = linearlogic.ApplicationExpression( self.glue, arg.glue, arg.indices ) except linearlogic.LinearLogicApplicationException as e: raise linearlogic.LinearLogicApplicationException( f"'{self.simplify()}' applied to '{arg.simplify()}'" ) from e arg_meaning_abstracted = arg.meaning if return_indices: for dep in self.glue.simplify().antecedent.dependencies[ ::-1 ]: # if self.glue is (A -o B), dep is in A.dependencies arg_meaning_abstracted = self.make_LambdaExpression( Variable("v%s" % dep), arg_meaning_abstracted ) return_meaning = self.meaning.applyto(arg_meaning_abstracted) return self.__class__(return_meaning, return_glue, return_indices) def make_VariableExpression(self, name): return VariableExpression(name) def make_LambdaExpression(self, variable, term): return LambdaExpression(variable, term) def lambda_abstract(self, other): assert isinstance(other, GlueFormula) assert isinstance(other.meaning, AbstractVariableExpression) return self.__class__( self.make_LambdaExpression(other.meaning.variable, self.meaning), linearlogic.ImpExpression(other.glue, self.glue), ) def compile(self, counter=None): """From Iddo Lev's PhD Dissertation p108-109""" if not counter: counter = Counter() (compiled_glue, new_forms) = self.glue.simplify().compile_pos( counter, self.__class__ ) return new_forms + [ self.__class__(self.meaning, compiled_glue, {counter.get()}) ] def simplify(self): return self.__class__( self.meaning.simplify(), self.glue.simplify(), self.indices ) def __eq__(self, other): return ( self.__class__ == other.__class__ and self.meaning == other.meaning and self.glue == other.glue ) def __ne__(self, other): return not self == other # sorting for use in doctests which must be deterministic def __lt__(self, other): return str(self) < str(other) def __str__(self): assert isinstance(self.indices, set) accum = f"{self.meaning} : {self.glue}" if self.indices: accum += ( " : {" + ", ".join(str(index) for index in sorted(self.indices)) + "}" ) return accum def __repr__(self): return "%s" % self class GlueDict(dict): def __init__(self, filename, encoding=None): self.filename = filename self.file_encoding = encoding self.read_file() def read_file(self, empty_first=True): if empty_first: self.clear() try: contents = nltk.data.load( self.filename, format="text", encoding=self.file_encoding ) # TODO: the above can't handle zip files, but this should anyway be fixed in nltk.data.load() except LookupError as e: try: contents = nltk.data.load( "file:" + self.filename, format="text", encoding=self.file_encoding ) except LookupError: raise e lines = contents.splitlines() for line in lines: # example: 'n : (\\x.( x), (v-or))' # lambdacalc -^ linear logic -^ line = line.strip() # remove trailing newline if not len(line): continue # skip empty lines if line[0] == "#": continue # skip commented out lines parts = line.split( " : ", 2 ) # ['verb', '(\\x.( x), ( subj -o f ))', '[subj]'] glue_formulas = [] paren_count = 0 tuple_start = 0 tuple_comma = 0 relationships = None if len(parts) > 1: for (i, c) in enumerate(parts[1]): if c == "(": if paren_count == 0: # if it's the first '(' of a tuple tuple_start = i + 1 # then save the index paren_count += 1 elif c == ")": paren_count -= 1 if paren_count == 0: # if it's the last ')' of a tuple meaning_term = parts[1][ tuple_start:tuple_comma ] # '\\x.( x)' glue_term = parts[1][tuple_comma + 1 : i] # '(v-r)' glue_formulas.append( [meaning_term, glue_term] ) # add the GlueFormula 
to the list elif c == ",": if ( paren_count == 1 ): # if it's a comma separating the parts of the tuple tuple_comma = i # then save the index elif c == "#": # skip comments at the ends of lines if ( paren_count != 0 ): # if the line hasn't parsed correctly so far raise RuntimeError( "Formula syntax is incorrect for entry " + line ) break # break to the next line if len(parts) > 2: # if there is a relationship entry at the end rel_start = parts[2].index("[") + 1 rel_end = parts[2].index("]") if rel_start == rel_end: relationships = frozenset() else: relationships = frozenset( r.strip() for r in parts[2][rel_start:rel_end].split(",") ) try: start_inheritance = parts[0].index("(") end_inheritance = parts[0].index(")") sem = parts[0][:start_inheritance].strip() supertype = parts[0][start_inheritance + 1 : end_inheritance] except: sem = parts[0].strip() supertype = None if sem not in self: self[sem] = {} if ( relationships is None ): # if not specified for a specific relationship set # add all relationship entries for parents if supertype: for rels in self[supertype]: if rels not in self[sem]: self[sem][rels] = [] glue = self[supertype][rels] self[sem][rels].extend(glue) self[sem][rels].extend( glue_formulas ) # add the glue formulas to every rel entry else: if None not in self[sem]: self[sem][None] = [] self[sem][None].extend( glue_formulas ) # add the glue formulas to every rel entry else: if relationships not in self[sem]: self[sem][relationships] = [] if supertype: self[sem][relationships].extend(self[supertype][relationships]) self[sem][relationships].extend( glue_formulas ) # add the glue entry to the dictionary def __str__(self): accum = "" for pos in self: str_pos = "%s" % pos for relset in self[pos]: i = 1 for gf in self[pos][relset]: if i == 1: accum += str_pos + ": " else: accum += " " * (len(str_pos) + 2) accum += "%s" % gf if relset and i == len(self[pos][relset]): accum += " : %s" % relset accum += "\n" i += 1 return accum def to_glueformula_list(self, depgraph, node=None, counter=None, verbose=False): if node is None: # TODO: should it be depgraph.root? Is this code tested? 
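            # No start node was given: begin at the artificial TOP node (address 0),
            # take its first dependent as the root word of the dependency graph,
            # and recurse from there with a fresh index counter.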
top = depgraph.nodes[0] depList = list(chain.from_iterable(top["deps"].values())) root = depgraph.nodes[depList[0]] return self.to_glueformula_list(depgraph, root, Counter(), verbose) glueformulas = self.lookup(node, depgraph, counter) for dep_idx in chain.from_iterable(node["deps"].values()): dep = depgraph.nodes[dep_idx] glueformulas.extend( self.to_glueformula_list(depgraph, dep, counter, verbose) ) return glueformulas def lookup(self, node, depgraph, counter): semtype_names = self.get_semtypes(node) semtype = None for name in semtype_names: if name in self: semtype = self[name] break if semtype is None: # raise KeyError, "There is no GlueDict entry for sem type '%s' (for '%s')" % (sem, word) return [] self.add_missing_dependencies(node, depgraph) lookup = self._lookup_semtype_option(semtype, node, depgraph) if not len(lookup): raise KeyError( "There is no GlueDict entry for sem type of '%s' " "with tag '%s', and rel '%s'" % (node["word"], node["tag"], node["rel"]) ) return self.get_glueformulas_from_semtype_entry( lookup, node["word"], node, depgraph, counter ) def add_missing_dependencies(self, node, depgraph): rel = node["rel"].lower() if rel == "main": headnode = depgraph.nodes[node["head"]] subj = self.lookup_unique("subj", headnode, depgraph) relation = subj["rel"] node["deps"].setdefault(relation, []) node["deps"][relation].append(subj["address"]) # node['deps'].append(subj['address']) def _lookup_semtype_option(self, semtype, node, depgraph): relationships = frozenset( depgraph.nodes[dep]["rel"].lower() for dep in chain.from_iterable(node["deps"].values()) if depgraph.nodes[dep]["rel"].lower() not in OPTIONAL_RELATIONSHIPS ) try: lookup = semtype[relationships] except KeyError: # An exact match is not found, so find the best match where # 'best' is defined as the glue entry whose relationship set has the # most relations of any possible relationship set that is a subset # of the actual depgraph best_match = frozenset() for relset_option in set(semtype) - {None}: if ( len(relset_option) > len(best_match) and relset_option < relationships ): best_match = relset_option if not best_match: if None in semtype: best_match = None else: return None lookup = semtype[best_match] return lookup def get_semtypes(self, node): """ Based on the node, return a list of plausible semtypes in order of plausibility. 
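        A rough illustration with hypothetical node dicts (assuming ``gd`` is a
        ``GlueDict`` loaded from a semtype file):

            >>> gd.get_semtypes({'rel': 'spec', 'word': 'every', 'tag': 'DT'})  # doctest: +SKIP
            ['univ_quant']
            >>> gd.get_semtypes({'rel': 'subj', 'word': 'dog', 'tag': 'NN'})    # doctest: +SKIP
            ['NN']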
""" rel = node["rel"].lower() word = node["word"].lower() if rel == "spec": if word in SPEC_SEMTYPES: return [SPEC_SEMTYPES[word]] else: return [SPEC_SEMTYPES["default"]] elif rel in ["nmod", "vmod"]: return [node["tag"], rel] else: return [node["tag"]] def get_glueformulas_from_semtype_entry( self, lookup, word, node, depgraph, counter ): glueformulas = [] glueFormulaFactory = self.get_GlueFormula_factory() for meaning, glue in lookup: gf = glueFormulaFactory(self.get_meaning_formula(meaning, word), glue) if not len(glueformulas): gf.word = word else: gf.word = f"{word}{len(glueformulas) + 1}" gf.glue = self.initialize_labels(gf.glue, node, depgraph, counter.get()) glueformulas.append(gf) return glueformulas def get_meaning_formula(self, generic, word): """ :param generic: A meaning formula string containing the parameter "" :param word: The actual word to be replace "" """ word = word.replace(".", "") return generic.replace("", word) def initialize_labels(self, expr, node, depgraph, unique_index): if isinstance(expr, linearlogic.AtomicExpression): name = self.find_label_name(expr.name.lower(), node, depgraph, unique_index) if name[0].isupper(): return linearlogic.VariableExpression(name) else: return linearlogic.ConstantExpression(name) else: return linearlogic.ImpExpression( self.initialize_labels(expr.antecedent, node, depgraph, unique_index), self.initialize_labels(expr.consequent, node, depgraph, unique_index), ) def find_label_name(self, name, node, depgraph, unique_index): try: dot = name.index(".") before_dot = name[:dot] after_dot = name[dot + 1 :] if before_dot == "super": return self.find_label_name( after_dot, depgraph.nodes[node["head"]], depgraph, unique_index ) else: return self.find_label_name( after_dot, self.lookup_unique(before_dot, node, depgraph), depgraph, unique_index, ) except ValueError: lbl = self.get_label(node) if name == "f": return lbl elif name == "v": return "%sv" % lbl elif name == "r": return "%sr" % lbl elif name == "super": return self.get_label(depgraph.nodes[node["head"]]) elif name == "var": return f"{lbl.upper()}{unique_index}" elif name == "a": return self.get_label(self.lookup_unique("conja", node, depgraph)) elif name == "b": return self.get_label(self.lookup_unique("conjb", node, depgraph)) else: return self.get_label(self.lookup_unique(name, node, depgraph)) def get_label(self, node): """ Pick an alphabetic character as identifier for an entity in the model. :param value: where to index into the list of characters :type value: int """ value = node["address"] letter = [ "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "a", "b", "c", "d", "e", ][value - 1] num = int(value) // 26 if num > 0: return letter + str(num) else: return letter def lookup_unique(self, rel, node, depgraph): """ Lookup 'key'. There should be exactly one item in the associated relation. 
""" deps = [ depgraph.nodes[dep] for dep in chain.from_iterable(node["deps"].values()) if depgraph.nodes[dep]["rel"].lower() == rel.lower() ] if len(deps) == 0: raise KeyError( "'{}' doesn't contain a feature '{}'".format(node["word"], rel) ) elif len(deps) > 1: raise KeyError( "'{}' should only have one feature '{}'".format(node["word"], rel) ) else: return deps[0] def get_GlueFormula_factory(self): return GlueFormula class Glue: def __init__( self, semtype_file=None, remove_duplicates=False, depparser=None, verbose=False ): self.verbose = verbose self.remove_duplicates = remove_duplicates self.depparser = depparser from nltk import Prover9 self.prover = Prover9() if semtype_file: self.semtype_file = semtype_file else: self.semtype_file = os.path.join( "grammars", "sample_grammars", "glue.semtype" ) def train_depparser(self, depgraphs=None): if depgraphs: self.depparser.train(depgraphs) else: self.depparser.train_from_file( nltk.data.find( os.path.join("grammars", "sample_grammars", "glue_train.conll") ) ) def parse_to_meaning(self, sentence): readings = [] for agenda in self.parse_to_compiled(sentence): readings.extend(self.get_readings(agenda)) return readings def get_readings(self, agenda): readings = [] agenda_length = len(agenda) atomics = dict() nonatomics = dict() while agenda: # is not empty cur = agenda.pop() glue_simp = cur.glue.simplify() if isinstance( glue_simp, linearlogic.ImpExpression ): # if cur.glue is non-atomic for key in atomics: try: if isinstance(cur.glue, linearlogic.ApplicationExpression): bindings = cur.glue.bindings else: bindings = linearlogic.BindingDict() glue_simp.antecedent.unify(key, bindings) for atomic in atomics[key]: if not ( cur.indices & atomic.indices ): # if the sets of indices are disjoint try: agenda.append(cur.applyto(atomic)) except linearlogic.LinearLogicApplicationException: pass except linearlogic.UnificationException: pass try: nonatomics[glue_simp.antecedent].append(cur) except KeyError: nonatomics[glue_simp.antecedent] = [cur] else: # else cur.glue is atomic for key in nonatomics: for nonatomic in nonatomics[key]: try: if isinstance( nonatomic.glue, linearlogic.ApplicationExpression ): bindings = nonatomic.glue.bindings else: bindings = linearlogic.BindingDict() glue_simp.unify(key, bindings) if not ( cur.indices & nonatomic.indices ): # if the sets of indices are disjoint try: agenda.append(nonatomic.applyto(cur)) except linearlogic.LinearLogicApplicationException: pass except linearlogic.UnificationException: pass try: atomics[glue_simp].append(cur) except KeyError: atomics[glue_simp] = [cur] for entry in atomics: for gf in atomics[entry]: if len(gf.indices) == agenda_length: self._add_to_reading_list(gf, readings) for entry in nonatomics: for gf in nonatomics[entry]: if len(gf.indices) == agenda_length: self._add_to_reading_list(gf, readings) return readings def _add_to_reading_list(self, glueformula, reading_list): add_reading = True if self.remove_duplicates: for reading in reading_list: try: if reading.equiv(glueformula.meaning, self.prover): add_reading = False break except Exception as e: # if there is an exception, the syntax of the formula # may not be understandable by the prover, so don't # throw out the reading. 
print("Error when checking logical equality of statements", e) if add_reading: reading_list.append(glueformula.meaning) def parse_to_compiled(self, sentence): gfls = [self.depgraph_to_glue(dg) for dg in self.dep_parse(sentence)] return [self.gfl_to_compiled(gfl) for gfl in gfls] def dep_parse(self, sentence): """ Return a dependency graph for the sentence. :param sentence: the sentence to be parsed :type sentence: list(str) :rtype: DependencyGraph """ # Lazy-initialize the depparser if self.depparser is None: from nltk.parse import MaltParser self.depparser = MaltParser(tagger=self.get_pos_tagger()) if not self.depparser._trained: self.train_depparser() return self.depparser.parse(sentence, verbose=self.verbose) def depgraph_to_glue(self, depgraph): return self.get_glue_dict().to_glueformula_list(depgraph) def get_glue_dict(self): return GlueDict(self.semtype_file) def gfl_to_compiled(self, gfl): index_counter = Counter() return_list = [] for gf in gfl: return_list.extend(gf.compile(index_counter)) if self.verbose: print("Compiled Glue Premises:") for cgf in return_list: print(cgf) return return_list def get_pos_tagger(self): from nltk.corpus import brown regexp_tagger = RegexpTagger( [ (r"^-?[0-9]+(\.[0-9]+)?$", "CD"), # cardinal numbers (r"(The|the|A|a|An|an)$", "AT"), # articles (r".*able$", "JJ"), # adjectives (r".*ness$", "NN"), # nouns formed from adjectives (r".*ly$", "RB"), # adverbs (r".*s$", "NNS"), # plural nouns (r".*ing$", "VBG"), # gerunds (r".*ed$", "VBD"), # past tense verbs (r".*", "NN"), # nouns (default) ] ) brown_train = brown.tagged_sents(categories="news") unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger) bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger) trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger) # Override particular words main_tagger = RegexpTagger( [(r"(A|a|An|an)$", "ex_quant"), (r"(Every|every|All|all)$", "univ_quant")], backoff=trigram_tagger, ) return main_tagger class DrtGlueFormula(GlueFormula): def __init__(self, meaning, glue, indices=None): if not indices: indices = set() if isinstance(meaning, str): self.meaning = drt.DrtExpression.fromstring(meaning) elif isinstance(meaning, drt.DrtExpression): self.meaning = meaning else: raise RuntimeError( "Meaning term neither string or expression: %s, %s" % (meaning, meaning.__class__) ) if isinstance(glue, str): self.glue = linearlogic.LinearLogicParser().parse(glue) elif isinstance(glue, linearlogic.Expression): self.glue = glue else: raise RuntimeError( "Glue term neither string or expression: %s, %s" % (glue, glue.__class__) ) self.indices = indices def make_VariableExpression(self, name): return drt.DrtVariableExpression(name) def make_LambdaExpression(self, variable, term): return drt.DrtLambdaExpression(variable, term) class DrtGlueDict(GlueDict): def get_GlueFormula_factory(self): return DrtGlueFormula class DrtGlue(Glue): def __init__( self, semtype_file=None, remove_duplicates=False, depparser=None, verbose=False ): if not semtype_file: semtype_file = os.path.join( "grammars", "sample_grammars", "drt_glue.semtype" ) Glue.__init__(self, semtype_file, remove_duplicates, depparser, verbose) def get_glue_dict(self): return DrtGlueDict(self.semtype_file) def demo(show_example=-1): from nltk.parse import MaltParser examples = [ "David sees Mary", "David eats a sandwich", "every man chases a dog", "every man believes a dog sleeps", "John gives David a sandwich", "John chases himself", ] # 'John persuades David to order a pizza', # 'John tries to go', # 
'John tries to find a unicorn', # 'John seems to vanish', # 'a unicorn seems to approach', # 'every big cat leaves', # 'every gray cat leaves', # 'every big gray cat leaves', # 'a former senator leaves', print("============== DEMO ==============") tagger = RegexpTagger( [ ("^(David|Mary|John)$", "NNP"), ( "^(sees|eats|chases|believes|gives|sleeps|chases|persuades|tries|seems|leaves)$", "VB", ), ("^(go|order|vanish|find|approach)$", "VB"), ("^(a)$", "ex_quant"), ("^(every)$", "univ_quant"), ("^(sandwich|man|dog|pizza|unicorn|cat|senator)$", "NN"), ("^(big|gray|former)$", "JJ"), ("^(him|himself)$", "PRP"), ] ) depparser = MaltParser(tagger=tagger) glue = Glue(depparser=depparser, verbose=False) for (i, sentence) in enumerate(examples): if i == show_example or show_example == -1: print(f"[[[Example {i}]]] {sentence}") for reading in glue.parse_to_meaning(sentence.split()): print(reading.simplify()) print("") if __name__ == "__main__": demo() nltk-3.7/nltk/sem/hole.py000066400000000000000000000327751420073152400154040ustar00rootroot00000000000000# Natural Language Toolkit: Logic # # Author: Peter Wang # Updated by: Dan Garrette # # Copyright (C) 2001-2022 NLTK Project # URL: # For license information, see LICENSE.TXT """ An implementation of the Hole Semantics model, following Blackburn and Bos, Representation and Inference for Natural Language (CSLI, 2005). The semantic representations are built by the grammar hole.fcfg. This module contains driver code to read in sentences and parse them according to a hole semantics grammar. After parsing, the semantic representation is in the form of an underspecified representation that is not easy to read. We use a "plugging" algorithm to convert that representation into first-order logic formulas. """ from functools import reduce from nltk.parse import load_parser from nltk.sem.logic import ( AllExpression, AndExpression, ApplicationExpression, ExistsExpression, IffExpression, ImpExpression, LambdaExpression, NegatedExpression, OrExpression, ) from nltk.sem.skolemize import skolemize # Note that in this code there may be multiple types of trees being referred to: # # 1. parse trees # 2. the underspecified representation # 3. first-order logic formula trees # 4. the search space when plugging (search tree) # class Constants: ALL = "ALL" EXISTS = "EXISTS" NOT = "NOT" AND = "AND" OR = "OR" IMP = "IMP" IFF = "IFF" PRED = "PRED" LEQ = "LEQ" HOLE = "HOLE" LABEL = "LABEL" MAP = { ALL: lambda v, e: AllExpression(v.variable, e), EXISTS: lambda v, e: ExistsExpression(v.variable, e), NOT: NegatedExpression, AND: AndExpression, OR: OrExpression, IMP: ImpExpression, IFF: IffExpression, PRED: ApplicationExpression, } class HoleSemantics: """ This class holds the broken-down components of a hole semantics, i.e. it extracts the holes, labels, logic formula fragments and constraints out of a big conjunction of such as produced by the hole semantics grammar. It then provides some operations on the semantics dealing with holes, labels and finding legal ways to plug holes with labels. """ def __init__(self, usr): """ Constructor. `usr' is a ``sem.Expression`` representing an Underspecified Representation Structure (USR). A USR has the following special predicates: ALL(l,v,n), EXISTS(l,v,n), AND(l,n,n), OR(l,n,n), IMP(l,n,n), IFF(l,n,n), PRED(l,v,n,v[,v]*) where the brackets and star indicate zero or more repetitions, LEQ(n,n), HOLE(n), LABEL(n) where l is the label of the node described by the predicate, n is either a label or a hole, and v is a variable. 
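        A condensed sketch of how this class is typically driven (mirroring the
        ``hole_readings`` helper later in this module; ``skolemized_usr`` stands
        in for a skolemized USR expression):

            >>> hs = HoleSemantics(skolemized_usr)                      # doctest: +SKIP
            >>> pluggings = hs.pluggings()                              # doctest: +SKIP
            >>> readings = [hs.formula_tree(p) for p in pluggings]     # doctest: +SKIP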
""" self.holes = set() self.labels = set() self.fragments = {} # mapping of label -> formula fragment self.constraints = set() # set of Constraints self._break_down(usr) self.top_most_labels = self._find_top_most_labels() self.top_hole = self._find_top_hole() def is_node(self, x): """ Return true if x is a node (label or hole) in this semantic representation. """ return x in (self.labels | self.holes) def _break_down(self, usr): """ Extract holes, labels, formula fragments and constraints from the hole semantics underspecified representation (USR). """ if isinstance(usr, AndExpression): self._break_down(usr.first) self._break_down(usr.second) elif isinstance(usr, ApplicationExpression): func, args = usr.uncurry() if func.variable.name == Constants.LEQ: self.constraints.add(Constraint(args[0], args[1])) elif func.variable.name == Constants.HOLE: self.holes.add(args[0]) elif func.variable.name == Constants.LABEL: self.labels.add(args[0]) else: label = args[0] assert label not in self.fragments self.fragments[label] = (func, args[1:]) else: raise ValueError(usr.label()) def _find_top_nodes(self, node_list): top_nodes = node_list.copy() for f in self.fragments.values(): # the label is the first argument of the predicate args = f[1] for arg in args: if arg in node_list: top_nodes.discard(arg) return top_nodes def _find_top_most_labels(self): """ Return the set of labels which are not referenced directly as part of another formula fragment. These will be the top-most labels for the subtree that they are part of. """ return self._find_top_nodes(self.labels) def _find_top_hole(self): """ Return the hole that will be the top of the formula tree. """ top_holes = self._find_top_nodes(self.holes) assert len(top_holes) == 1 # it must be unique return top_holes.pop() def pluggings(self): """ Calculate and return all the legal pluggings (mappings of labels to holes) of this semantics given the constraints. """ record = [] self._plug_nodes([(self.top_hole, [])], self.top_most_labels, {}, record) return record def _plug_nodes(self, queue, potential_labels, plug_acc, record): """ Plug the nodes in `queue' with the labels in `potential_labels'. Each element of `queue' is a tuple of the node to plug and the list of ancestor holes from the root of the graph to that node. `potential_labels' is a set of the labels which are still available for plugging. `plug_acc' is the incomplete mapping of holes to labels made on the current branch of the search tree so far. `record' is a list of all the complete pluggings that we have found in total so far. It is the only parameter that is destructively updated. """ if queue != []: (node, ancestors) = queue[0] if node in self.holes: # The node is a hole, try to plug it. self._plug_hole( node, ancestors, queue[1:], potential_labels, plug_acc, record ) else: assert node in self.labels # The node is a label. Replace it in the queue by the holes and # labels in the formula fragment named by that label. args = self.fragments[node][1] head = [(a, ancestors) for a in args if self.is_node(a)] self._plug_nodes(head + queue[1:], potential_labels, plug_acc, record) else: raise Exception("queue empty") def _plug_hole(self, hole, ancestors0, queue, potential_labels0, plug_acc0, record): """ Try all possible ways of plugging a single hole. See _plug_nodes for the meanings of the parameters. """ # Add the current hole we're trying to plug into the list of ancestors. assert hole not in ancestors0 ancestors = [hole] + ancestors0 # Try each potential label in this hole in turn. 
for l in potential_labels0: # Is the label valid in this hole? if self._violates_constraints(l, ancestors): continue plug_acc = plug_acc0.copy() plug_acc[hole] = l potential_labels = potential_labels0.copy() potential_labels.remove(l) if len(potential_labels) == 0: # No more potential labels. That must mean all the holes have # been filled so we have found a legal plugging so remember it. # # Note that the queue might not be empty because there might # be labels on there that point to formula fragments with # no holes in them. _sanity_check_plugging will make sure # all holes are filled. self._sanity_check_plugging(plug_acc, self.top_hole, []) record.append(plug_acc) else: # Recursively try to fill in the rest of the holes in the # queue. The label we just plugged into the hole could have # holes of its own so at the end of the queue. Putting it on # the end of the queue gives us a breadth-first search, so that # all the holes at level i of the formula tree are filled # before filling level i+1. # A depth-first search would work as well since the trees must # be finite but the bookkeeping would be harder. self._plug_nodes( queue + [(l, ancestors)], potential_labels, plug_acc, record ) def _violates_constraints(self, label, ancestors): """ Return True if the `label' cannot be placed underneath the holes given by the set `ancestors' because it would violate the constraints imposed on it. """ for c in self.constraints: if c.lhs == label: if c.rhs not in ancestors: return True return False def _sanity_check_plugging(self, plugging, node, ancestors): """ Make sure that a given plugging is legal. We recursively go through each node and make sure that no constraints are violated. We also check that all holes have been filled. """ if node in self.holes: ancestors = [node] + ancestors label = plugging[node] else: label = node assert label in self.labels for c in self.constraints: if c.lhs == label: assert c.rhs in ancestors args = self.fragments[label][1] for arg in args: if self.is_node(arg): self._sanity_check_plugging(plugging, arg, [label] + ancestors) def formula_tree(self, plugging): """ Return the first-order logic formula tree for this underspecified representation using the plugging given. """ return self._formula_tree(plugging, self.top_hole) def _formula_tree(self, plugging, node): if node in plugging: return self._formula_tree(plugging, plugging[node]) elif node in self.fragments: pred, args = self.fragments[node] children = [self._formula_tree(plugging, arg) for arg in args] return reduce(Constants.MAP[pred.variable.name], children) else: return node class Constraint: """ This class represents a constraint of the form (L =< N), where L is a label and N is a node (a label or a hole). """ def __init__(self, lhs, rhs): self.lhs = lhs self.rhs = rhs def __eq__(self, other): if self.__class__ == other.__class__: return self.lhs == other.lhs and self.rhs == other.rhs else: return False def __ne__(self, other): return not (self == other) def __hash__(self): return hash(repr(self)) def __repr__(self): return f"({self.lhs} < {self.rhs})" def hole_readings(sentence, grammar_filename=None, verbose=False): if not grammar_filename: grammar_filename = "grammars/sample_grammars/hole.fcfg" if verbose: print("Reading grammar file", grammar_filename) parser = load_parser(grammar_filename) # Parse the sentence. 
tokens = sentence.split() trees = list(parser.parse(tokens)) if verbose: print("Got %d different parses" % len(trees)) all_readings = [] for tree in trees: # Get the semantic feature from the top of the parse tree. sem = tree.label()["SEM"].simplify() # Print the raw semantic representation. if verbose: print("Raw: ", sem) # Skolemize away all quantifiers. All variables become unique. while isinstance(sem, LambdaExpression): sem = sem.term skolemized = skolemize(sem) if verbose: print("Skolemized:", skolemized) # Break the hole semantics representation down into its components # i.e. holes, labels, formula fragments and constraints. hole_sem = HoleSemantics(skolemized) # Maybe show the details of the semantic representation. if verbose: print("Holes: ", hole_sem.holes) print("Labels: ", hole_sem.labels) print("Constraints: ", hole_sem.constraints) print("Top hole: ", hole_sem.top_hole) print("Top labels: ", hole_sem.top_most_labels) print("Fragments:") for l, f in hole_sem.fragments.items(): print(f"\t{l}: {f}") # Find all the possible ways to plug the formulas together. pluggings = hole_sem.pluggings() # Build FOL formula trees using the pluggings. readings = list(map(hole_sem.formula_tree, pluggings)) # Print out the formulas in a textual format. if verbose: for i, r in enumerate(readings): print() print("%d. %s" % (i, r)) print() all_readings.extend(readings) return all_readings if __name__ == "__main__": for r in hole_readings("a dog barks"): print(r) print() for r in hole_readings("every girl chases a dog"): print(r) nltk-3.7/nltk/sem/lfg.py000066400000000000000000000164371420073152400152220ustar00rootroot00000000000000# Natural Language Toolkit: Lexical Functional Grammar # # Author: Dan Garrette # # Copyright (C) 2001-2022 NLTK Project # URL: # For license information, see LICENSE.TXT from itertools import chain from nltk.internals import Counter class FStructure(dict): def safeappend(self, key, item): """ Append 'item' to the list at 'key'. If no list exists for 'key', then construct one. 
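        A small illustration (keys are lower-cased by ``__setitem__``):

            >>> fs = FStructure()
            >>> fs.safeappend('SUBJ', ('John', 'NNP'))
            >>> fs['subj']
            [('John', 'NNP')]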
""" if key not in self: self[key] = [] self[key].append(item) def __setitem__(self, key, value): dict.__setitem__(self, key.lower(), value) def __getitem__(self, key): return dict.__getitem__(self, key.lower()) def __contains__(self, key): return dict.__contains__(self, key.lower()) def to_glueformula_list(self, glue_dict): depgraph = self.to_depgraph() return glue_dict.to_glueformula_list(depgraph) def to_depgraph(self, rel=None): from nltk.parse.dependencygraph import DependencyGraph depgraph = DependencyGraph() nodes = depgraph.nodes self._to_depgraph(nodes, 0, "ROOT") # Add all the dependencies for all the nodes for address, node in nodes.items(): for n2 in (n for n in nodes.values() if n["rel"] != "TOP"): if n2["head"] == address: relation = n2["rel"] node["deps"].setdefault(relation, []) node["deps"][relation].append(n2["address"]) depgraph.root = nodes[1] return depgraph def _to_depgraph(self, nodes, head, rel): index = len(nodes) nodes[index].update( { "address": index, "word": self.pred[0], "tag": self.pred[1], "head": head, "rel": rel, } ) for feature in sorted(self): for item in sorted(self[feature]): if isinstance(item, FStructure): item._to_depgraph(nodes, index, feature) elif isinstance(item, tuple): new_index = len(nodes) nodes[new_index].update( { "address": new_index, "word": item[0], "tag": item[1], "head": index, "rel": feature, } ) elif isinstance(item, list): for n in item: n._to_depgraph(nodes, index, feature) else: raise Exception( "feature %s is not an FStruct, a list, or a tuple" % feature ) @staticmethod def read_depgraph(depgraph): return FStructure._read_depgraph(depgraph.root, depgraph) @staticmethod def _read_depgraph(node, depgraph, label_counter=None, parent=None): if not label_counter: label_counter = Counter() if node["rel"].lower() in ["spec", "punct"]: # the value of a 'spec' entry is a word, not an FStructure return (node["word"], node["tag"]) else: fstruct = FStructure() fstruct.pred = None fstruct.label = FStructure._make_label(label_counter.get()) fstruct.parent = parent word, tag = node["word"], node["tag"] if tag[:2] == "VB": if tag[2:3] == "D": fstruct.safeappend("tense", ("PAST", "tense")) fstruct.pred = (word, tag[:2]) if not fstruct.pred: fstruct.pred = (word, tag) children = [ depgraph.nodes[idx] for idx in chain.from_iterable(node["deps"].values()) ] for child in children: fstruct.safeappend( child["rel"], FStructure._read_depgraph(child, depgraph, label_counter, fstruct), ) return fstruct @staticmethod def _make_label(value): """ Pick an alphabetic character as identifier for an entity in the model. 
:param value: where to index into the list of characters :type value: int """ letter = [ "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "a", "b", "c", "d", "e", ][value - 1] num = int(value) // 26 if num > 0: return letter + str(num) else: return letter def __repr__(self): return self.__str__().replace("\n", "") def __str__(self): return self.pretty_format() def pretty_format(self, indent=3): try: accum = "%s:[" % self.label except NameError: accum = "[" try: accum += "pred '%s'" % (self.pred[0]) except NameError: pass for feature in sorted(self): for item in self[feature]: if isinstance(item, FStructure): next_indent = indent + len(feature) + 3 + len(self.label) accum += "\n{}{} {}".format( " " * (indent), feature, item.pretty_format(next_indent), ) elif isinstance(item, tuple): accum += "\n{}{} '{}'".format(" " * (indent), feature, item[0]) elif isinstance(item, list): accum += "\n{}{} {{{}}}".format( " " * (indent), feature, ("\n%s" % (" " * (indent + len(feature) + 2))).join(item), ) else: # ERROR raise Exception( "feature %s is not an FStruct, a list, or a tuple" % feature ) return accum + "]" def demo_read_depgraph(): from nltk.parse.dependencygraph import DependencyGraph dg1 = DependencyGraph( """\ Esso NNP 2 SUB said VBD 0 ROOT the DT 5 NMOD Whiting NNP 5 NMOD field NN 6 SUB started VBD 2 VMOD production NN 6 OBJ Tuesday NNP 6 VMOD """ ) dg2 = DependencyGraph( """\ John NNP 2 SUB sees VBP 0 ROOT Mary NNP 2 OBJ """ ) dg3 = DependencyGraph( """\ a DT 2 SPEC man NN 3 SUBJ walks VB 0 ROOT """ ) dg4 = DependencyGraph( """\ every DT 2 SPEC girl NN 3 SUBJ chases VB 0 ROOT a DT 5 SPEC dog NN 3 OBJ """ ) depgraphs = [dg1, dg2, dg3, dg4] for dg in depgraphs: print(FStructure.read_depgraph(dg)) if __name__ == "__main__": demo_read_depgraph() nltk-3.7/nltk/sem/linearlogic.py000066400000000000000000000405601420073152400167340ustar00rootroot00000000000000# Natural Language Toolkit: Linear Logic # # Author: Dan Garrette # # Copyright (C) 2001-2022 NLTK Project # URL: # For license information, see LICENSE.TXT from nltk.internals import Counter from nltk.sem.logic import APP, LogicParser _counter = Counter() class Tokens: # Punctuation OPEN = "(" CLOSE = ")" # Operations IMP = "-o" PUNCT = [OPEN, CLOSE] TOKENS = PUNCT + [IMP] class LinearLogicParser(LogicParser): """A linear logic expression parser.""" def __init__(self): LogicParser.__init__(self) self.operator_precedence = {APP: 1, Tokens.IMP: 2, None: 3} self.right_associated_operations += [Tokens.IMP] def get_all_symbols(self): return Tokens.TOKENS def handle(self, tok, context): if tok not in Tokens.TOKENS: return self.handle_variable(tok, context) elif tok == Tokens.OPEN: return self.handle_open(tok, context) def get_BooleanExpression_factory(self, tok): if tok == Tokens.IMP: return ImpExpression else: return None def make_BooleanExpression(self, factory, first, second): return factory(first, second) def attempt_ApplicationExpression(self, expression, context): """Attempt to make an application expression. If the next tokens are an argument in parens, then the argument expression is a function being applied to the arguments. 
Otherwise, return the argument expression.""" if self.has_priority(APP, context): if self.inRange(0) and self.token(0) == Tokens.OPEN: self.token() # swallow then open paren argument = self.process_next_expression(APP) self.assertNextToken(Tokens.CLOSE) expression = ApplicationExpression(expression, argument, None) return expression def make_VariableExpression(self, name): if name[0].isupper(): return VariableExpression(name) else: return ConstantExpression(name) class Expression: _linear_logic_parser = LinearLogicParser() @classmethod def fromstring(cls, s): return cls._linear_logic_parser.parse(s) def applyto(self, other, other_indices=None): return ApplicationExpression(self, other, other_indices) def __call__(self, other): return self.applyto(other) def __repr__(self): return f"<{self.__class__.__name__} {self}>" class AtomicExpression(Expression): def __init__(self, name, dependencies=None): """ :param name: str for the constant name :param dependencies: list of int for the indices on which this atom is dependent """ assert isinstance(name, str) self.name = name if not dependencies: dependencies = [] self.dependencies = dependencies def simplify(self, bindings=None): """ If 'self' is bound by 'bindings', return the atomic to which it is bound. Otherwise, return self. :param bindings: ``BindingDict`` A dictionary of bindings used to simplify :return: ``AtomicExpression`` """ if bindings and self in bindings: return bindings[self] else: return self def compile_pos(self, index_counter, glueFormulaFactory): """ From Iddo Lev's PhD Dissertation p108-109 :param index_counter: ``Counter`` for unique indices :param glueFormulaFactory: ``GlueFormula`` for creating new glue formulas :return: (``Expression``,set) for the compiled linear logic and any newly created glue formulas """ self.dependencies = [] return (self, []) def compile_neg(self, index_counter, glueFormulaFactory): """ From Iddo Lev's PhD Dissertation p108-109 :param index_counter: ``Counter`` for unique indices :param glueFormulaFactory: ``GlueFormula`` for creating new glue formulas :return: (``Expression``,set) for the compiled linear logic and any newly created glue formulas """ self.dependencies = [] return (self, []) def initialize_labels(self, fstruct): self.name = fstruct.initialize_label(self.name.lower()) def __eq__(self, other): return self.__class__ == other.__class__ and self.name == other.name def __ne__(self, other): return not self == other def __str__(self): accum = self.name if self.dependencies: accum += "%s" % self.dependencies return accum def __hash__(self): return hash(self.name) class ConstantExpression(AtomicExpression): def unify(self, other, bindings): """ If 'other' is a constant, then it must be equal to 'self'. If 'other' is a variable, then it must not be bound to anything other than 'self'. :param other: ``Expression`` :param bindings: ``BindingDict`` A dictionary of all current bindings :return: ``BindingDict`` A new combined dictionary of of 'bindings' and any new binding :raise UnificationException: If 'self' and 'other' cannot be unified in the context of 'bindings' """ assert isinstance(other, Expression) if isinstance(other, VariableExpression): try: return bindings + BindingDict([(other, self)]) except VariableBindingException: pass elif self == other: return bindings raise UnificationException(self, other, bindings) class VariableExpression(AtomicExpression): def unify(self, other, bindings): """ 'self' must not be bound to anything other than 'other'. 
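# A hedged sketch of unification between linear-logic atoms, using only the
# classes defined in this module: a variable (uppercase) unifies with a
# constant (lowercase) by extending the binding dictionary, while two
# distinct constants raise ``UnificationException``.
from nltk.sem.linearlogic import (
    BindingDict,
    ConstantExpression,
    UnificationException,
    VariableExpression,
)

bindings = VariableExpression("G").unify(ConstantExpression("f"), BindingDict())
print(bindings)  # expected: {G: f}

try:
    ConstantExpression("g").unify(ConstantExpression("f"), BindingDict())
except UnificationException as e:
    print(e)  # the two distinct constants cannot be unified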
:param other: ``Expression`` :param bindings: ``BindingDict`` A dictionary of all current bindings :return: ``BindingDict`` A new combined dictionary of of 'bindings' and the new binding :raise UnificationException: If 'self' and 'other' cannot be unified in the context of 'bindings' """ assert isinstance(other, Expression) try: if self == other: return bindings else: return bindings + BindingDict([(self, other)]) except VariableBindingException as e: raise UnificationException(self, other, bindings) from e class ImpExpression(Expression): def __init__(self, antecedent, consequent): """ :param antecedent: ``Expression`` for the antecedent :param consequent: ``Expression`` for the consequent """ assert isinstance(antecedent, Expression) assert isinstance(consequent, Expression) self.antecedent = antecedent self.consequent = consequent def simplify(self, bindings=None): return self.__class__( self.antecedent.simplify(bindings), self.consequent.simplify(bindings) ) def unify(self, other, bindings): """ Both the antecedent and consequent of 'self' and 'other' must unify. :param other: ``ImpExpression`` :param bindings: ``BindingDict`` A dictionary of all current bindings :return: ``BindingDict`` A new combined dictionary of of 'bindings' and any new bindings :raise UnificationException: If 'self' and 'other' cannot be unified in the context of 'bindings' """ assert isinstance(other, ImpExpression) try: return ( bindings + self.antecedent.unify(other.antecedent, bindings) + self.consequent.unify(other.consequent, bindings) ) except VariableBindingException as e: raise UnificationException(self, other, bindings) from e def compile_pos(self, index_counter, glueFormulaFactory): """ From Iddo Lev's PhD Dissertation p108-109 :param index_counter: ``Counter`` for unique indices :param glueFormulaFactory: ``GlueFormula`` for creating new glue formulas :return: (``Expression``,set) for the compiled linear logic and any newly created glue formulas """ (a, a_new) = self.antecedent.compile_neg(index_counter, glueFormulaFactory) (c, c_new) = self.consequent.compile_pos(index_counter, glueFormulaFactory) return (ImpExpression(a, c), a_new + c_new) def compile_neg(self, index_counter, glueFormulaFactory): """ From Iddo Lev's PhD Dissertation p108-109 :param index_counter: ``Counter`` for unique indices :param glueFormulaFactory: ``GlueFormula`` for creating new glue formulas :return: (``Expression``,list of ``GlueFormula``) for the compiled linear logic and any newly created glue formulas """ (a, a_new) = self.antecedent.compile_pos(index_counter, glueFormulaFactory) (c, c_new) = self.consequent.compile_neg(index_counter, glueFormulaFactory) fresh_index = index_counter.get() c.dependencies.append(fresh_index) new_v = glueFormulaFactory("v%s" % fresh_index, a, {fresh_index}) return (c, a_new + c_new + [new_v]) def initialize_labels(self, fstruct): self.antecedent.initialize_labels(fstruct) self.consequent.initialize_labels(fstruct) def __eq__(self, other): return ( self.__class__ == other.__class__ and self.antecedent == other.antecedent and self.consequent == other.consequent ) def __ne__(self, other): return not self == other def __str__(self): return "{}{} {} {}{}".format( Tokens.OPEN, self.antecedent, Tokens.IMP, self.consequent, Tokens.CLOSE, ) def __hash__(self): return hash(f"{hash(self.antecedent)}{Tokens.IMP}{hash(self.consequent)}") class ApplicationExpression(Expression): def __init__(self, function, argument, argument_indices=None): """ :param function: ``Expression`` for the function :param 
argument: ``Expression`` for the argument :param argument_indices: set for the indices of the glue formula from which the argument came :raise LinearLogicApplicationException: If 'function' cannot be applied to 'argument' given 'argument_indices'. """ function_simp = function.simplify() argument_simp = argument.simplify() assert isinstance(function_simp, ImpExpression) assert isinstance(argument_simp, Expression) bindings = BindingDict() try: if isinstance(function, ApplicationExpression): bindings += function.bindings if isinstance(argument, ApplicationExpression): bindings += argument.bindings bindings += function_simp.antecedent.unify(argument_simp, bindings) except UnificationException as e: raise LinearLogicApplicationException( f"Cannot apply {function_simp} to {argument_simp}. {e}" ) from e # If you are running it on complied premises, more conditions apply if argument_indices: # A.dependencies of (A -o (B -o C)) must be a proper subset of argument_indices if not set(function_simp.antecedent.dependencies) < argument_indices: raise LinearLogicApplicationException( "Dependencies unfulfilled when attempting to apply Linear Logic formula %s to %s" % (function_simp, argument_simp) ) if set(function_simp.antecedent.dependencies) == argument_indices: raise LinearLogicApplicationException( "Dependencies not a proper subset of indices when attempting to apply Linear Logic formula %s to %s" % (function_simp, argument_simp) ) self.function = function self.argument = argument self.bindings = bindings def simplify(self, bindings=None): """ Since function is an implication, return its consequent. There should be no need to check that the application is valid since the checking is done by the constructor. :param bindings: ``BindingDict`` A dictionary of bindings used to simplify :return: ``Expression`` """ if not bindings: bindings = self.bindings return self.function.simplify(bindings).consequent def __eq__(self, other): return ( self.__class__ == other.__class__ and self.function == other.function and self.argument == other.argument ) def __ne__(self, other): return not self == other def __str__(self): return "%s" % self.function + Tokens.OPEN + "%s" % self.argument + Tokens.CLOSE def __hash__(self): return hash(f"{hash(self.antecedent)}{Tokens.OPEN}{hash(self.consequent)}") class BindingDict: def __init__(self, bindings=None): """ :param bindings: list [(``VariableExpression``, ``AtomicExpression``)] to initialize the dictionary dict {``VariableExpression``: ``AtomicExpression``} to initialize the dictionary """ self.d = {} if isinstance(bindings, dict): bindings = bindings.items() if bindings: for (v, b) in bindings: self[v] = b def __setitem__(self, variable, binding): """ A binding is consistent with the dict if its variable is not already bound, OR if its variable is already bound to its argument. 
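# A short sketch of ``BindingDict`` consistency, using the classes defined in
# this module: re-binding a variable to the same value is allowed, while
# combining dictionaries with clashing bindings raises
# ``VariableBindingException``.
from nltk.sem.linearlogic import (
    BindingDict,
    ConstantExpression,
    VariableBindingException,
    VariableExpression,
)

G, f, g = VariableExpression("G"), ConstantExpression("f"), ConstantExpression("g")

d1 = BindingDict([(G, f)])
d2 = BindingDict([(G, f)])
print(d1 + d2)  # expected: {G: f}  (consistent bindings combine)

try:
    d1 + BindingDict([(G, g)])
except VariableBindingException as e:
    print(e)  # contradictory bindings for G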
:param variable: ``VariableExpression`` The variable bind :param binding: ``Expression`` The expression to which 'variable' should be bound :raise VariableBindingException: If the variable cannot be bound in this dictionary """ assert isinstance(variable, VariableExpression) assert isinstance(binding, Expression) assert variable != binding existing = self.d.get(variable, None) if not existing or binding == existing: self.d[variable] = binding else: raise VariableBindingException( "Variable %s already bound to another value" % (variable) ) def __getitem__(self, variable): """ Return the expression to which 'variable' is bound """ assert isinstance(variable, VariableExpression) intermediate = self.d[variable] while intermediate: try: intermediate = self.d[intermediate] except KeyError: return intermediate def __contains__(self, item): return item in self.d def __add__(self, other): """ :param other: ``BindingDict`` The dict with which to combine self :return: ``BindingDict`` A new dict containing all the elements of both parameters :raise VariableBindingException: If the parameter dictionaries are not consistent with each other """ try: combined = BindingDict() for v in self.d: combined[v] = self.d[v] for v in other.d: combined[v] = other.d[v] return combined except VariableBindingException as e: raise VariableBindingException( "Attempting to add two contradicting" " VariableBindingsLists: %s, %s" % (self, other) ) from e def __ne__(self, other): return not self == other def __eq__(self, other): if not isinstance(other, BindingDict): raise TypeError return self.d == other.d def __str__(self): return "{" + ", ".join(f"{v}: {self.d[v]}" for v in sorted(self.d.keys())) + "}" def __repr__(self): return "BindingDict: %s" % self class VariableBindingException(Exception): pass class UnificationException(Exception): def __init__(self, a, b, bindings): Exception.__init__(self, f"Cannot unify {a} with {b} given {bindings}") class LinearLogicApplicationException(Exception): pass def demo(): lexpr = Expression.fromstring print(lexpr(r"f")) print(lexpr(r"(g -o f)")) print(lexpr(r"((g -o G) -o G)")) print(lexpr(r"g -o h -o f")) print(lexpr(r"(g -o f)(g)").simplify()) print(lexpr(r"(H -o f)(g)").simplify()) print(lexpr(r"((g -o G) -o G)((g -o f))").simplify()) print(lexpr(r"(H -o H)((g -o f))").simplify()) if __name__ == "__main__": demo() nltk-3.7/nltk/sem/logic.py000066400000000000000000002045421420073152400155430ustar00rootroot00000000000000# Natural Language Toolkit: Logic # # Author: Dan Garrette # # Copyright (C) 2001-2022 NLTK Project # URL: # For license information, see LICENSE.TXT """ A version of first order predicate logic, built on top of the typed lambda calculus. """ import operator import re from collections import defaultdict from functools import reduce, total_ordering from nltk.internals import Counter from nltk.util import Trie APP = "APP" _counter = Counter() class Tokens: LAMBDA = "\\" LAMBDA_LIST = ["\\"] # Quantifiers EXISTS = "exists" EXISTS_LIST = ["some", "exists", "exist"] ALL = "all" ALL_LIST = ["all", "forall"] # Punctuation DOT = "." 
OPEN = "(" CLOSE = ")" COMMA = "," # Operations NOT = "-" NOT_LIST = ["not", "-", "!"] AND = "&" AND_LIST = ["and", "&", "^"] OR = "|" OR_LIST = ["or", "|"] IMP = "->" IMP_LIST = ["implies", "->", "=>"] IFF = "<->" IFF_LIST = ["iff", "<->", "<=>"] EQ = "=" EQ_LIST = ["=", "=="] NEQ = "!=" NEQ_LIST = ["!="] # Collections of tokens BINOPS = AND_LIST + OR_LIST + IMP_LIST + IFF_LIST QUANTS = EXISTS_LIST + ALL_LIST PUNCT = [DOT, OPEN, CLOSE, COMMA] TOKENS = BINOPS + EQ_LIST + NEQ_LIST + QUANTS + LAMBDA_LIST + PUNCT + NOT_LIST # Special SYMBOLS = [x for x in TOKENS if re.match(r"^[-\\.(),!&^|>=<]*$", x)] def boolean_ops(): """ Boolean operators """ names = ["negation", "conjunction", "disjunction", "implication", "equivalence"] for pair in zip(names, [Tokens.NOT, Tokens.AND, Tokens.OR, Tokens.IMP, Tokens.IFF]): print("%-15s\t%s" % pair) def equality_preds(): """ Equality predicates """ names = ["equality", "inequality"] for pair in zip(names, [Tokens.EQ, Tokens.NEQ]): print("%-15s\t%s" % pair) def binding_ops(): """ Binding operators """ names = ["existential", "universal", "lambda"] for pair in zip(names, [Tokens.EXISTS, Tokens.ALL, Tokens.LAMBDA]): print("%-15s\t%s" % pair) class LogicParser: """A lambda calculus expression parser.""" def __init__(self, type_check=False): """ :param type_check: should type checking be performed to their types? :type type_check: bool """ assert isinstance(type_check, bool) self._currentIndex = 0 self._buffer = [] self.type_check = type_check """A list of tuples of quote characters. The 4-tuple is comprised of the start character, the end character, the escape character, and a boolean indicating whether the quotes should be included in the result. Quotes are used to signify that a token should be treated as atomic, ignoring any special characters within the token. The escape character allows the quote end character to be used within the quote. If True, the boolean indicates that the final token should contain the quote and escape characters. This method exists to be overridden""" self.quote_chars = [] self.operator_precedence = dict( [(x, 1) for x in Tokens.LAMBDA_LIST] + [(x, 2) for x in Tokens.NOT_LIST] + [(APP, 3)] + [(x, 4) for x in Tokens.EQ_LIST + Tokens.NEQ_LIST] + [(x, 5) for x in Tokens.QUANTS] + [(x, 6) for x in Tokens.AND_LIST] + [(x, 7) for x in Tokens.OR_LIST] + [(x, 8) for x in Tokens.IMP_LIST] + [(x, 9) for x in Tokens.IFF_LIST] + [(None, 10)] ) self.right_associated_operations = [APP] def parse(self, data, signature=None): """ Parse the expression. 
:param data: str for the input to be parsed :param signature: ``dict`` that maps variable names to type strings :returns: a parsed Expression """ data = data.rstrip() self._currentIndex = 0 self._buffer, mapping = self.process(data) try: result = self.process_next_expression(None) if self.inRange(0): raise UnexpectedTokenException(self._currentIndex + 1, self.token(0)) except LogicalExpressionException as e: msg = "{}\n{}\n{}^".format(e, data, " " * mapping[e.index - 1]) raise LogicalExpressionException(None, msg) from e if self.type_check: result.typecheck(signature) return result def process(self, data): """Split the data into tokens""" out = [] mapping = {} tokenTrie = Trie(self.get_all_symbols()) token = "" data_idx = 0 token_start_idx = data_idx while data_idx < len(data): cur_data_idx = data_idx quoted_token, data_idx = self.process_quoted_token(data_idx, data) if quoted_token: if not token: token_start_idx = cur_data_idx token += quoted_token continue st = tokenTrie c = data[data_idx] symbol = "" while c in st: symbol += c st = st[c] if len(data) - data_idx > len(symbol): c = data[data_idx + len(symbol)] else: break if Trie.LEAF in st: # token is a complete symbol if token: mapping[len(out)] = token_start_idx out.append(token) token = "" mapping[len(out)] = data_idx out.append(symbol) data_idx += len(symbol) else: if data[data_idx] in " \t\n": # any whitespace if token: mapping[len(out)] = token_start_idx out.append(token) token = "" else: if not token: token_start_idx = data_idx token += data[data_idx] data_idx += 1 if token: mapping[len(out)] = token_start_idx out.append(token) mapping[len(out)] = len(data) mapping[len(out) + 1] = len(data) + 1 return out, mapping def process_quoted_token(self, data_idx, data): token = "" c = data[data_idx] i = data_idx for start, end, escape, incl_quotes in self.quote_chars: if c == start: if incl_quotes: token += c i += 1 while data[i] != end: if data[i] == escape: if incl_quotes: token += data[i] i += 1 if len(data) == i: # if there are no more chars raise LogicalExpressionException( None, "End of input reached. " "Escape character [%s] found at end." % escape, ) token += data[i] else: token += data[i] i += 1 if len(data) == i: raise LogicalExpressionException( None, "End of input reached. " "Expected: [%s]" % end ) if incl_quotes: token += data[i] i += 1 if not token: raise LogicalExpressionException(None, "Empty quoted token found") break return token, i def get_all_symbols(self): """This method exists to be overridden""" return Tokens.SYMBOLS def inRange(self, location): """Return TRUE if the given location is within the buffer""" return self._currentIndex + location < len(self._buffer) def token(self, location=None): """Get the next waiting token. If a location is given, then return the token at currentIndex+location without advancing currentIndex; setting it gives lookahead/lookback capability.""" try: if location is None: tok = self._buffer[self._currentIndex] self._currentIndex += 1 else: tok = self._buffer[self._currentIndex + location] return tok except IndexError as e: raise ExpectedMoreTokensException(self._currentIndex + 1) from e def isvariable(self, tok): return tok not in Tokens.TOKENS def process_next_expression(self, context): """Parse the next complete expression from the stream and return it.""" try: tok = self.token() except ExpectedMoreTokensException as e: raise ExpectedMoreTokensException( self._currentIndex + 1, message="Expression expected." 
) from e accum = self.handle(tok, context) if not accum: raise UnexpectedTokenException( self._currentIndex, tok, message="Expression expected." ) return self.attempt_adjuncts(accum, context) def handle(self, tok, context): """This method is intended to be overridden for logics that use different operators or expressions""" if self.isvariable(tok): return self.handle_variable(tok, context) elif tok in Tokens.NOT_LIST: return self.handle_negation(tok, context) elif tok in Tokens.LAMBDA_LIST: return self.handle_lambda(tok, context) elif tok in Tokens.QUANTS: return self.handle_quant(tok, context) elif tok == Tokens.OPEN: return self.handle_open(tok, context) def attempt_adjuncts(self, expression, context): cur_idx = None while cur_idx != self._currentIndex: # while adjuncts are added cur_idx = self._currentIndex expression = self.attempt_EqualityExpression(expression, context) expression = self.attempt_ApplicationExpression(expression, context) expression = self.attempt_BooleanExpression(expression, context) return expression def handle_negation(self, tok, context): return self.make_NegatedExpression(self.process_next_expression(Tokens.NOT)) def make_NegatedExpression(self, expression): return NegatedExpression(expression) def handle_variable(self, tok, context): # It's either: 1) a predicate expression: sees(x,y) # 2) an application expression: P(x) # 3) a solo variable: john OR x accum = self.make_VariableExpression(tok) if self.inRange(0) and self.token(0) == Tokens.OPEN: # The predicate has arguments if not isinstance(accum, FunctionVariableExpression) and not isinstance( accum, ConstantExpression ): raise LogicalExpressionException( self._currentIndex, "'%s' is an illegal predicate name. " "Individual variables may not be used as " "predicates." % tok, ) self.token() # swallow the Open Paren # curry the arguments accum = self.make_ApplicationExpression( accum, self.process_next_expression(APP) ) while self.inRange(0) and self.token(0) == Tokens.COMMA: self.token() # swallow the comma accum = self.make_ApplicationExpression( accum, self.process_next_expression(APP) ) self.assertNextToken(Tokens.CLOSE) return accum def get_next_token_variable(self, description): try: tok = self.token() except ExpectedMoreTokensException as e: raise ExpectedMoreTokensException(e.index, "Variable expected.") from e if isinstance(self.make_VariableExpression(tok), ConstantExpression): raise LogicalExpressionException( self._currentIndex, "'%s' is an illegal variable name. " "Constants may not be %s." % (tok, description), ) return Variable(tok) def handle_lambda(self, tok, context): # Expression is a lambda expression if not self.inRange(0): raise ExpectedMoreTokensException( self._currentIndex + 2, message="Variable and Expression expected following lambda operator.", ) vars = [self.get_next_token_variable("abstracted")] while True: if not self.inRange(0) or ( self.token(0) == Tokens.DOT and not self.inRange(1) ): raise ExpectedMoreTokensException( self._currentIndex + 2, message="Expression expected." 
) if not self.isvariable(self.token(0)): break # Support expressions like: \x y.M == \x.\y.M vars.append(self.get_next_token_variable("abstracted")) if self.inRange(0) and self.token(0) == Tokens.DOT: self.token() # swallow the dot accum = self.process_next_expression(tok) while vars: accum = self.make_LambdaExpression(vars.pop(), accum) return accum def handle_quant(self, tok, context): # Expression is a quantified expression: some x.M factory = self.get_QuantifiedExpression_factory(tok) if not self.inRange(0): raise ExpectedMoreTokensException( self._currentIndex + 2, message="Variable and Expression expected following quantifier '%s'." % tok, ) vars = [self.get_next_token_variable("quantified")] while True: if not self.inRange(0) or ( self.token(0) == Tokens.DOT and not self.inRange(1) ): raise ExpectedMoreTokensException( self._currentIndex + 2, message="Expression expected." ) if not self.isvariable(self.token(0)): break # Support expressions like: some x y.M == some x.some y.M vars.append(self.get_next_token_variable("quantified")) if self.inRange(0) and self.token(0) == Tokens.DOT: self.token() # swallow the dot accum = self.process_next_expression(tok) while vars: accum = self.make_QuanifiedExpression(factory, vars.pop(), accum) return accum def get_QuantifiedExpression_factory(self, tok): """This method serves as a hook for other logic parsers that have different quantifiers""" if tok in Tokens.EXISTS_LIST: return ExistsExpression elif tok in Tokens.ALL_LIST: return AllExpression else: self.assertToken(tok, Tokens.QUANTS) def make_QuanifiedExpression(self, factory, variable, term): return factory(variable, term) def handle_open(self, tok, context): # Expression is in parens accum = self.process_next_expression(None) self.assertNextToken(Tokens.CLOSE) return accum def attempt_EqualityExpression(self, expression, context): """Attempt to make an equality expression. If the next token is an equality operator, then an EqualityExpression will be returned. Otherwise, the parameter will be returned.""" if self.inRange(0): tok = self.token(0) if tok in Tokens.EQ_LIST + Tokens.NEQ_LIST and self.has_priority( tok, context ): self.token() # swallow the "=" or "!=" expression = self.make_EqualityExpression( expression, self.process_next_expression(tok) ) if tok in Tokens.NEQ_LIST: expression = self.make_NegatedExpression(expression) return expression def make_EqualityExpression(self, first, second): """This method serves as a hook for other logic parsers that have different equality expression classes""" return EqualityExpression(first, second) def attempt_BooleanExpression(self, expression, context): """Attempt to make a boolean expression. If the next token is a boolean operator, then a BooleanExpression will be returned. 
Otherwise, the parameter will be returned.""" while self.inRange(0): tok = self.token(0) factory = self.get_BooleanExpression_factory(tok) if factory and self.has_priority(tok, context): self.token() # swallow the operator expression = self.make_BooleanExpression( factory, expression, self.process_next_expression(tok) ) else: break return expression def get_BooleanExpression_factory(self, tok): """This method serves as a hook for other logic parsers that have different boolean operators""" if tok in Tokens.AND_LIST: return AndExpression elif tok in Tokens.OR_LIST: return OrExpression elif tok in Tokens.IMP_LIST: return ImpExpression elif tok in Tokens.IFF_LIST: return IffExpression else: return None def make_BooleanExpression(self, factory, first, second): return factory(first, second) def attempt_ApplicationExpression(self, expression, context): """Attempt to make an application expression. The next tokens are a list of arguments in parens, then the argument expression is a function being applied to the arguments. Otherwise, return the argument expression.""" if self.has_priority(APP, context): if self.inRange(0) and self.token(0) == Tokens.OPEN: if ( not isinstance(expression, LambdaExpression) and not isinstance(expression, ApplicationExpression) and not isinstance(expression, FunctionVariableExpression) and not isinstance(expression, ConstantExpression) ): raise LogicalExpressionException( self._currentIndex, ("The function '%s" % expression) + "' is not a Lambda Expression, an " "Application Expression, or a " "functional predicate, so it may " "not take arguments.", ) self.token() # swallow then open paren # curry the arguments accum = self.make_ApplicationExpression( expression, self.process_next_expression(APP) ) while self.inRange(0) and self.token(0) == Tokens.COMMA: self.token() # swallow the comma accum = self.make_ApplicationExpression( accum, self.process_next_expression(APP) ) self.assertNextToken(Tokens.CLOSE) return accum return expression def make_ApplicationExpression(self, function, argument): return ApplicationExpression(function, argument) def make_VariableExpression(self, name): return VariableExpression(Variable(name)) def make_LambdaExpression(self, variable, term): return LambdaExpression(variable, term) def has_priority(self, operation, context): return self.operator_precedence[operation] < self.operator_precedence[ context ] or ( operation in self.right_associated_operations and self.operator_precedence[operation] == self.operator_precedence[context] ) def assertNextToken(self, expected): try: tok = self.token() except ExpectedMoreTokensException as e: raise ExpectedMoreTokensException( e.index, message="Expected token '%s'." % expected ) from e if isinstance(expected, list): if tok not in expected: raise UnexpectedTokenException(self._currentIndex, tok, expected) else: if tok != expected: raise UnexpectedTokenException(self._currentIndex, tok, expected) def assertToken(self, tok, expected): if isinstance(expected, list): if tok not in expected: raise UnexpectedTokenException(self._currentIndex, tok, expected) else: if tok != expected: raise UnexpectedTokenException(self._currentIndex, tok, expected) def __repr__(self): if self.inRange(0): msg = "Next token: " + self.token(0) else: msg = "No more tokens" return "<" + self.__class__.__name__ + ": " + msg + ">" def read_logic(s, logic_parser=None, encoding=None): """ Convert a file of First Order Formulas into a list of {Expression}s. 
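# A small sketch of ``read_logic``: blank lines and lines starting with '#'
# are skipped, and every remaining line is parsed into an ``Expression``.
from nltk.sem.logic import read_logic

fol = """
# background knowledge
all x.(man(x) -> mortal(x))
man(socrates)
"""
for expr in read_logic(fol):
    print(expr)
# expected:
# all x.(man(x) -> mortal(x))
# man(socrates)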
:param s: the contents of the file :type s: str :param logic_parser: The parser to be used to parse the logical expression :type logic_parser: LogicParser :param encoding: the encoding of the input string, if it is binary :type encoding: str :return: a list of parsed formulas. :rtype: list(Expression) """ if encoding is not None: s = s.decode(encoding) if logic_parser is None: logic_parser = LogicParser() statements = [] for linenum, line in enumerate(s.splitlines()): line = line.strip() if line.startswith("#") or line == "": continue try: statements.append(logic_parser.parse(line)) except LogicalExpressionException as e: raise ValueError(f"Unable to parse line {linenum}: {line}") from e return statements @total_ordering class Variable: def __init__(self, name): """ :param name: the name of the variable """ assert isinstance(name, str), "%s is not a string" % name self.name = name def __eq__(self, other): return isinstance(other, Variable) and self.name == other.name def __ne__(self, other): return not self == other def __lt__(self, other): if not isinstance(other, Variable): raise TypeError return self.name < other.name def substitute_bindings(self, bindings): return bindings.get(self, self) def __hash__(self): return hash(self.name) def __str__(self): return self.name def __repr__(self): return "Variable('%s')" % self.name def unique_variable(pattern=None, ignore=None): """ Return a new, unique variable. :param pattern: ``Variable`` that is being replaced. The new variable must be the same type. :param term: a set of ``Variable`` objects that should not be returned from this function. :rtype: Variable """ if pattern is not None: if is_indvar(pattern.name): prefix = "z" elif is_funcvar(pattern.name): prefix = "F" elif is_eventvar(pattern.name): prefix = "e0" else: assert False, "Cannot generate a unique constant" else: prefix = "z" v = Variable(f"{prefix}{_counter.get()}") while ignore is not None and v in ignore: v = Variable(f"{prefix}{_counter.get()}") return v def skolem_function(univ_scope=None): """ Return a skolem function over the variables in univ_scope param univ_scope """ skolem = VariableExpression(Variable("F%s" % _counter.get())) if univ_scope: for v in list(univ_scope): skolem = skolem(VariableExpression(v)) return skolem class Type: def __repr__(self): return "%s" % self def __hash__(self): return hash("%s" % self) @classmethod def fromstring(cls, s): return read_type(s) class ComplexType(Type): def __init__(self, first, second): assert isinstance(first, Type), "%s is not a Type" % first assert isinstance(second, Type), "%s is not a Type" % second self.first = first self.second = second def __eq__(self, other): return ( isinstance(other, ComplexType) and self.first == other.first and self.second == other.second ) def __ne__(self, other): return not self == other __hash__ = Type.__hash__ def matches(self, other): if isinstance(other, ComplexType): return self.first.matches(other.first) and self.second.matches(other.second) else: return self == ANY_TYPE def resolve(self, other): if other == ANY_TYPE: return self elif isinstance(other, ComplexType): f = self.first.resolve(other.first) s = self.second.resolve(other.second) if f and s: return ComplexType(f, s) else: return None elif self == ANY_TYPE: return other else: return None def __str__(self): if self == ANY_TYPE: return "%s" % ANY_TYPE else: return f"<{self.first},{self.second}>" def str(self): if self == ANY_TYPE: return ANY_TYPE.str() else: return f"({self.first.str()} -> {self.second.str()})" class BasicType(Type): 
def __eq__(self, other): return isinstance(other, BasicType) and ("%s" % self) == ("%s" % other) def __ne__(self, other): return not self == other __hash__ = Type.__hash__ def matches(self, other): return other == ANY_TYPE or self == other def resolve(self, other): if self.matches(other): return self else: return None class EntityType(BasicType): def __str__(self): return "e" def str(self): return "IND" class TruthValueType(BasicType): def __str__(self): return "t" def str(self): return "BOOL" class EventType(BasicType): def __str__(self): return "v" def str(self): return "EVENT" class AnyType(BasicType, ComplexType): def __init__(self): pass @property def first(self): return self @property def second(self): return self def __eq__(self, other): return isinstance(other, AnyType) or other.__eq__(self) def __ne__(self, other): return not self == other __hash__ = Type.__hash__ def matches(self, other): return True def resolve(self, other): return other def __str__(self): return "?" def str(self): return "ANY" TRUTH_TYPE = TruthValueType() ENTITY_TYPE = EntityType() EVENT_TYPE = EventType() ANY_TYPE = AnyType() def read_type(type_string): assert isinstance(type_string, str) type_string = type_string.replace(" ", "") # remove spaces if type_string[0] == "<": assert type_string[-1] == ">" paren_count = 0 for i, char in enumerate(type_string): if char == "<": paren_count += 1 elif char == ">": paren_count -= 1 assert paren_count > 0 elif char == ",": if paren_count == 1: break return ComplexType( read_type(type_string[1:i]), read_type(type_string[i + 1 : -1]) ) elif type_string[0] == "%s" % ENTITY_TYPE: return ENTITY_TYPE elif type_string[0] == "%s" % TRUTH_TYPE: return TRUTH_TYPE elif type_string[0] == "%s" % ANY_TYPE: return ANY_TYPE else: raise LogicalExpressionException( None, "Unexpected character: '%s'." % type_string[0] ) class TypeException(Exception): def __init__(self, msg): super().__init__(msg) class InconsistentTypeHierarchyException(TypeException): def __init__(self, variable, expression=None): if expression: msg = ( "The variable '%s' was found in multiple places with different" " types in '%s'." % (variable, expression) ) else: msg = ( "The variable '%s' was found in multiple places with different" " types." % (variable) ) super().__init__(msg) class TypeResolutionException(TypeException): def __init__(self, expression, other_type): super().__init__( "The type of '%s', '%s', cannot be resolved with type '%s'" % (expression, expression.type, other_type) ) class IllegalTypeException(TypeException): def __init__(self, expression, other_type, allowed_type): super().__init__( "Cannot set type of %s '%s' to '%s'; must match type '%s'." % (expression.__class__.__name__, expression, other_type, allowed_type) ) def typecheck(expressions, signature=None): """ Ensure correct typing across a collection of ``Expression`` objects. :param expressions: a collection of expressions :param signature: dict that maps variable names to types (or string representations of types) """ # typecheck and create master signature for expression in expressions: signature = expression.typecheck(signature) # apply master signature to all expressions for expression in expressions[:-1]: expression.typecheck(signature) return signature class SubstituteBindingsI: """ An interface for classes that can perform substitutions for variables. """ def substitute_bindings(self, bindings): """ :return: The object that is obtained by replacing each variable bound by ``bindings`` with its values. Aliases are already resolved. 
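# A hedged sketch of ``substitute_bindings``: free variables in an expression
# are replaced by the expressions they are bound to, and the result is
# beta-reduced by ``simplify``.
from nltk.sem.logic import Expression, Variable

expr = Expression.fromstring("P(x)")
print(expr.substitute_bindings({Variable("x"): Expression.fromstring("john")}))
# expected: P(john)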
(maybe?) :rtype: (any) """ raise NotImplementedError() def variables(self): """ :return: A list of all variables in this object. """ raise NotImplementedError() class Expression(SubstituteBindingsI): """This is the base abstract object for all logical expressions""" _logic_parser = LogicParser() _type_checking_logic_parser = LogicParser(type_check=True) @classmethod def fromstring(cls, s, type_check=False, signature=None): if type_check: return cls._type_checking_logic_parser.parse(s, signature) else: return cls._logic_parser.parse(s, signature) def __call__(self, other, *additional): accum = self.applyto(other) for a in additional: accum = accum(a) return accum def applyto(self, other): assert isinstance(other, Expression), "%s is not an Expression" % other return ApplicationExpression(self, other) def __neg__(self): return NegatedExpression(self) def negate(self): """If this is a negated expression, remove the negation. Otherwise add a negation.""" return -self def __and__(self, other): if not isinstance(other, Expression): raise TypeError("%s is not an Expression" % other) return AndExpression(self, other) def __or__(self, other): if not isinstance(other, Expression): raise TypeError("%s is not an Expression" % other) return OrExpression(self, other) def __gt__(self, other): if not isinstance(other, Expression): raise TypeError("%s is not an Expression" % other) return ImpExpression(self, other) def __lt__(self, other): if not isinstance(other, Expression): raise TypeError("%s is not an Expression" % other) return IffExpression(self, other) def __eq__(self, other): raise NotImplementedError() def __ne__(self, other): return not self == other def equiv(self, other, prover=None): """ Check for logical equivalence. Pass the expression (self <-> other) to the theorem prover. If the prover says it is valid, then the self and other are equal. :param other: an ``Expression`` to check equality against :param prover: a ``nltk.inference.api.Prover`` """ assert isinstance(other, Expression), "%s is not an Expression" % other if prover is None: from nltk.inference import Prover9 prover = Prover9() bicond = IffExpression(self.simplify(), other.simplify()) return prover.prove(bicond) def __hash__(self): return hash(repr(self)) def substitute_bindings(self, bindings): expr = self for var in expr.variables(): if var in bindings: val = bindings[var] if isinstance(val, Variable): val = self.make_VariableExpression(val) elif not isinstance(val, Expression): raise ValueError( "Can not substitute a non-expression " "value into an expression: %r" % (val,) ) # Substitute bindings in the target value. val = val.substitute_bindings(bindings) # Replace var w/ the target value. expr = expr.replace(var, val) return expr.simplify() def typecheck(self, signature=None): """ Infer and check types. Raise exceptions if necessary. :param signature: dict that maps variable names to types (or string representations of types) :return: the signature, plus any additional type mappings """ sig = defaultdict(list) if signature: for key in signature: val = signature[key] varEx = VariableExpression(Variable(key)) if isinstance(val, Type): varEx.type = val else: varEx.type = read_type(val) sig[key].append(varEx) self._set_type(signature=sig) return {key: sig[key][0].type for key in sig} def findtype(self, variable): """ Find the type of the given variable as it is used in this expression. 
For example, finding the type of "P" in "P(x) & Q(x,y)" yields "" :param variable: Variable """ raise NotImplementedError() def _set_type(self, other_type=ANY_TYPE, signature=None): """ Set the type of this expression to be the given type. Raise type exceptions where applicable. :param other_type: Type :param signature: dict(str -> list(AbstractVariableExpression)) """ raise NotImplementedError() def replace(self, variable, expression, replace_bound=False, alpha_convert=True): """ Replace every instance of 'variable' with 'expression' :param variable: ``Variable`` The variable to replace :param expression: ``Expression`` The expression with which to replace it :param replace_bound: bool Should bound variables be replaced? :param alpha_convert: bool Alpha convert automatically to avoid name clashes? """ assert isinstance(variable, Variable), "%s is not a Variable" % variable assert isinstance(expression, Expression), ( "%s is not an Expression" % expression ) return self.visit_structured( lambda e: e.replace(variable, expression, replace_bound, alpha_convert), self.__class__, ) def normalize(self, newvars=None): """Rename auto-generated unique variables""" def get_indiv_vars(e): if isinstance(e, IndividualVariableExpression): return {e} elif isinstance(e, AbstractVariableExpression): return set() else: return e.visit( get_indiv_vars, lambda parts: reduce(operator.or_, parts, set()) ) result = self for i, e in enumerate(sorted(get_indiv_vars(self), key=lambda e: e.variable)): if isinstance(e, EventVariableExpression): newVar = e.__class__(Variable("e0%s" % (i + 1))) elif isinstance(e, IndividualVariableExpression): newVar = e.__class__(Variable("z%s" % (i + 1))) else: newVar = e result = result.replace(e.variable, newVar, True) return result def visit(self, function, combinator): """ Recursively visit subexpressions. Apply 'function' to each subexpression and pass the result of each function application to the 'combinator' for aggregation: return combinator(map(function, self.subexpressions)) Bound variables are neither applied upon by the function nor given to the combinator. :param function: ``Function`` to call on each subexpression :param combinator: ``Function,R>`` to combine the results of the function calls :return: result of combination ``R`` """ raise NotImplementedError() def visit_structured(self, function, combinator): """ Recursively visit subexpressions. Apply 'function' to each subexpression and pass the result of each function application to the 'combinator' for aggregation. The combinator must have the same signature as the constructor. The function is not applied to bound variables, but they are passed to the combinator. :param function: ``Function`` to call on each subexpression :param combinator: ``Function`` with the same signature as the constructor, to combine the results of the function calls :return: result of combination """ return self.visit(function, lambda parts: combinator(*parts)) def __repr__(self): return f"<{self.__class__.__name__} {self}>" def __str__(self): return self.str() def variables(self): """ Return a set of all the variables for binding substitution. The variables returned include all free (non-bound) individual variables and any variable starting with '?' or '@'. :return: set of ``Variable`` objects """ return self.free() | { p for p in self.predicates() | self.constants() if re.match("^[?@]", p.name) } def free(self): """ Return a set of all the free (non-bound) variables. This includes both individual and predicate variables, but not constants. 
:return: set of ``Variable`` objects """ return self.visit( lambda e: e.free(), lambda parts: reduce(operator.or_, parts, set()) ) def constants(self): """ Return a set of individual constants (non-predicates). :return: set of ``Variable`` objects """ return self.visit( lambda e: e.constants(), lambda parts: reduce(operator.or_, parts, set()) ) def predicates(self): """ Return a set of predicates (constants, not variables). :return: set of ``Variable`` objects """ return self.visit( lambda e: e.predicates(), lambda parts: reduce(operator.or_, parts, set()) ) def simplify(self): """ :return: beta-converted version of this expression """ return self.visit_structured(lambda e: e.simplify(), self.__class__) def make_VariableExpression(self, variable): return VariableExpression(variable) class ApplicationExpression(Expression): r""" This class is used to represent two related types of logical expressions. The first is a Predicate Expression, such as "P(x,y)". A predicate expression is comprised of a ``FunctionVariableExpression`` or ``ConstantExpression`` as the predicate and a list of Expressions as the arguments. The second is a an application of one expression to another, such as "(\x.dog(x))(fido)". The reason Predicate Expressions are treated as Application Expressions is that the Variable Expression predicate of the expression may be replaced with another Expression, such as a LambdaExpression, which would mean that the Predicate should be thought of as being applied to the arguments. The logical expression reader will always curry arguments in a application expression. So, "\x y.see(x,y)(john,mary)" will be represented internally as "((\x y.(see(x))(y))(john))(mary)". This simplifies the internals since there will always be exactly one argument in an application. The str() method will usually print the curried forms of application expressions. The one exception is when the the application expression is really a predicate expression (ie, underlying function is an ``AbstractVariableExpression``). This means that the example from above will be returned as "(\x y.see(x,y)(john))(mary)". """ def __init__(self, function, argument): """ :param function: ``Expression``, for the function expression :param argument: ``Expression``, for the argument """ assert isinstance(function, Expression), "%s is not an Expression" % function assert isinstance(argument, Expression), "%s is not an Expression" % argument self.function = function self.argument = argument def simplify(self): function = self.function.simplify() argument = self.argument.simplify() if isinstance(function, LambdaExpression): return function.term.replace(function.variable, argument).simplify() else: return self.__class__(function, argument) @property def type(self): if isinstance(self.function.type, ComplexType): return self.function.type.second else: return ANY_TYPE def _set_type(self, other_type=ANY_TYPE, signature=None): """:see Expression._set_type()""" assert isinstance(other_type, Type) if signature is None: signature = defaultdict(list) self.argument._set_type(ANY_TYPE, signature) try: self.function._set_type( ComplexType(self.argument.type, other_type), signature ) except TypeResolutionException as e: raise TypeException( "The function '%s' is of type '%s' and cannot be applied " "to '%s' of type '%s'. Its argument must match type '%s'." 
% ( self.function, self.function.type, self.argument, self.argument.type, self.function.type.first, ) ) from e def findtype(self, variable): """:see Expression.findtype()""" assert isinstance(variable, Variable), "%s is not a Variable" % variable if self.is_atom(): function, args = self.uncurry() else: # It's not a predicate expression ("P(x,y)"), so leave args curried function = self.function args = [self.argument] found = [arg.findtype(variable) for arg in [function] + args] unique = [] for f in found: if f != ANY_TYPE: if unique: for u in unique: if f.matches(u): break else: unique.append(f) if len(unique) == 1: return list(unique)[0] else: return ANY_TYPE def constants(self): """:see: Expression.constants()""" if isinstance(self.function, AbstractVariableExpression): function_constants = set() else: function_constants = self.function.constants() return function_constants | self.argument.constants() def predicates(self): """:see: Expression.predicates()""" if isinstance(self.function, ConstantExpression): function_preds = {self.function.variable} else: function_preds = self.function.predicates() return function_preds | self.argument.predicates() def visit(self, function, combinator): """:see: Expression.visit()""" return combinator([function(self.function), function(self.argument)]) def __eq__(self, other): return ( isinstance(other, ApplicationExpression) and self.function == other.function and self.argument == other.argument ) def __ne__(self, other): return not self == other __hash__ = Expression.__hash__ def __str__(self): # uncurry the arguments and find the base function if self.is_atom(): function, args = self.uncurry() arg_str = ",".join("%s" % arg for arg in args) else: # Leave arguments curried function = self.function arg_str = "%s" % self.argument function_str = "%s" % function parenthesize_function = False if isinstance(function, LambdaExpression): if isinstance(function.term, ApplicationExpression): if not isinstance(function.term.function, AbstractVariableExpression): parenthesize_function = True elif not isinstance(function.term, BooleanExpression): parenthesize_function = True elif isinstance(function, ApplicationExpression): parenthesize_function = True if parenthesize_function: function_str = Tokens.OPEN + function_str + Tokens.CLOSE return function_str + Tokens.OPEN + arg_str + Tokens.CLOSE def uncurry(self): """ Uncurry this application expression return: A tuple (base-function, arg-list) """ function = self.function args = [self.argument] while isinstance(function, ApplicationExpression): # (\x.\y.sees(x,y)(john))(mary) args.insert(0, function.argument) function = function.function return (function, args) @property def pred(self): """ Return uncurried base-function. If this is an atom, then the result will be a variable expression. Otherwise, it will be a lambda expression. """ return self.uncurry()[0] @property def args(self): """ Return uncurried arg-list """ return self.uncurry()[1] def is_atom(self): """ Is this expression an atom (as opposed to a lambda expression applied to a term)? 
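# A sketch of the curried representation of predicate expressions: internally
# "see(john,mary)" is ((see(john))(mary)), and ``uncurry`` / ``pred`` /
# ``args`` recover the base predicate and the argument list.
from nltk.sem.logic import Expression

e = Expression.fromstring("see(john, mary)")
print(e.pred)       # expected: see
print(e.args)       # expected: [<ConstantExpression john>, <ConstantExpression mary>]
print(e.is_atom())  # expected: True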
""" return isinstance(self.pred, AbstractVariableExpression) @total_ordering class AbstractVariableExpression(Expression): """This class represents a variable to be used as a predicate or entity""" def __init__(self, variable): """ :param variable: ``Variable``, for the variable """ assert isinstance(variable, Variable), "%s is not a Variable" % variable self.variable = variable def simplify(self): return self def replace(self, variable, expression, replace_bound=False, alpha_convert=True): """:see: Expression.replace()""" assert isinstance(variable, Variable), "%s is not an Variable" % variable assert isinstance(expression, Expression), ( "%s is not an Expression" % expression ) if self.variable == variable: return expression else: return self def _set_type(self, other_type=ANY_TYPE, signature=None): """:see Expression._set_type()""" assert isinstance(other_type, Type) if signature is None: signature = defaultdict(list) resolution = other_type for varEx in signature[self.variable.name]: resolution = varEx.type.resolve(resolution) if not resolution: raise InconsistentTypeHierarchyException(self) signature[self.variable.name].append(self) for varEx in signature[self.variable.name]: varEx.type = resolution def findtype(self, variable): """:see Expression.findtype()""" assert isinstance(variable, Variable), "%s is not a Variable" % variable if self.variable == variable: return self.type else: return ANY_TYPE def predicates(self): """:see: Expression.predicates()""" return set() def __eq__(self, other): """Allow equality between instances of ``AbstractVariableExpression`` subtypes.""" return ( isinstance(other, AbstractVariableExpression) and self.variable == other.variable ) def __ne__(self, other): return not self == other def __lt__(self, other): if not isinstance(other, AbstractVariableExpression): raise TypeError return self.variable < other.variable __hash__ = Expression.__hash__ def __str__(self): return "%s" % self.variable class IndividualVariableExpression(AbstractVariableExpression): """This class represents variables that take the form of a single lowercase character (other than 'e') followed by zero or more digits.""" def _set_type(self, other_type=ANY_TYPE, signature=None): """:see Expression._set_type()""" assert isinstance(other_type, Type) if signature is None: signature = defaultdict(list) if not other_type.matches(ENTITY_TYPE): raise IllegalTypeException(self, other_type, ENTITY_TYPE) signature[self.variable.name].append(self) def _get_type(self): return ENTITY_TYPE type = property(_get_type, _set_type) def free(self): """:see: Expression.free()""" return {self.variable} def constants(self): """:see: Expression.constants()""" return set() class FunctionVariableExpression(AbstractVariableExpression): """This class represents variables that take the form of a single uppercase character followed by zero or more digits.""" type = ANY_TYPE def free(self): """:see: Expression.free()""" return {self.variable} def constants(self): """:see: Expression.constants()""" return set() class EventVariableExpression(IndividualVariableExpression): """This class represents variables that take the form of a single lowercase 'e' character followed by zero or more digits.""" type = EVENT_TYPE class ConstantExpression(AbstractVariableExpression): """This class represents variables that do not take the form of a single character followed by zero or more digits.""" type = ENTITY_TYPE def _set_type(self, other_type=ANY_TYPE, signature=None): """:see Expression._set_type()""" assert 
isinstance(other_type, Type) if signature is None: signature = defaultdict(list) if other_type == ANY_TYPE: # entity type by default, for individuals resolution = ENTITY_TYPE else: resolution = other_type if self.type != ENTITY_TYPE: resolution = resolution.resolve(self.type) for varEx in signature[self.variable.name]: resolution = varEx.type.resolve(resolution) if not resolution: raise InconsistentTypeHierarchyException(self) signature[self.variable.name].append(self) for varEx in signature[self.variable.name]: varEx.type = resolution def free(self): """:see: Expression.free()""" return set() def constants(self): """:see: Expression.constants()""" return {self.variable} def VariableExpression(variable): """ This is a factory method that instantiates and returns a subtype of ``AbstractVariableExpression`` appropriate for the given variable. """ assert isinstance(variable, Variable), "%s is not a Variable" % variable if is_indvar(variable.name): return IndividualVariableExpression(variable) elif is_funcvar(variable.name): return FunctionVariableExpression(variable) elif is_eventvar(variable.name): return EventVariableExpression(variable) else: return ConstantExpression(variable) class VariableBinderExpression(Expression): """This an abstract class for any Expression that binds a variable in an Expression. This includes LambdaExpressions and Quantified Expressions""" def __init__(self, variable, term): """ :param variable: ``Variable``, for the variable :param term: ``Expression``, for the term """ assert isinstance(variable, Variable), "%s is not a Variable" % variable assert isinstance(term, Expression), "%s is not an Expression" % term self.variable = variable self.term = term def replace(self, variable, expression, replace_bound=False, alpha_convert=True): """:see: Expression.replace()""" assert isinstance(variable, Variable), "%s is not a Variable" % variable assert isinstance(expression, Expression), ( "%s is not an Expression" % expression ) # if the bound variable is the thing being replaced if self.variable == variable: if replace_bound: assert isinstance(expression, AbstractVariableExpression), ( "%s is not a AbstractVariableExpression" % expression ) return self.__class__( expression.variable, self.term.replace(variable, expression, True, alpha_convert), ) else: return self else: # if the bound variable appears in the expression, then it must # be alpha converted to avoid a conflict if alpha_convert and self.variable in expression.free(): self = self.alpha_convert(unique_variable(pattern=self.variable)) # replace in the term return self.__class__( self.variable, self.term.replace(variable, expression, replace_bound, alpha_convert), ) def alpha_convert(self, newvar): """Rename all occurrences of the variable introduced by this variable binder in the expression to ``newvar``. 
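# A short sketch of alpha conversion and equality modulo alphabetic variance,
# mirroring the demo at the end of this file: renaming the bound variable
# produces a different string but an equal expression.
from nltk.sem.logic import Expression, Variable

e1 = Expression.fromstring("exists x.P(x)")
e2 = e1.alpha_convert(Variable("z"))
print(e2)        # expected: exists z.P(z)
print(e1 == e2)  # expected: True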
:param newvar: ``Variable``, for the new variable """ assert isinstance(newvar, Variable), "%s is not a Variable" % newvar return self.__class__( newvar, self.term.replace(self.variable, VariableExpression(newvar), True) ) def free(self): """:see: Expression.free()""" return self.term.free() - {self.variable} def findtype(self, variable): """:see Expression.findtype()""" assert isinstance(variable, Variable), "%s is not a Variable" % variable if variable == self.variable: return ANY_TYPE else: return self.term.findtype(variable) def visit(self, function, combinator): """:see: Expression.visit()""" return combinator([function(self.term)]) def visit_structured(self, function, combinator): """:see: Expression.visit_structured()""" return combinator(self.variable, function(self.term)) def __eq__(self, other): r"""Defines equality modulo alphabetic variance. If we are comparing \x.M and \y.N, then check equality of M and N[x/y].""" if isinstance(self, other.__class__) or isinstance(other, self.__class__): if self.variable == other.variable: return self.term == other.term else: # Comparing \x.M and \y.N. Relabel y in N with x and continue. varex = VariableExpression(self.variable) return self.term == other.term.replace(other.variable, varex) else: return False def __ne__(self, other): return not self == other __hash__ = Expression.__hash__ class LambdaExpression(VariableBinderExpression): @property def type(self): return ComplexType(self.term.findtype(self.variable), self.term.type) def _set_type(self, other_type=ANY_TYPE, signature=None): """:see Expression._set_type()""" assert isinstance(other_type, Type) if signature is None: signature = defaultdict(list) self.term._set_type(other_type.second, signature) if not self.type.resolve(other_type): raise TypeResolutionException(self, other_type) def __str__(self): variables = [self.variable] term = self.term while term.__class__ == self.__class__: variables.append(term.variable) term = term.term return ( Tokens.LAMBDA + " ".join("%s" % v for v in variables) + Tokens.DOT + "%s" % term ) class QuantifiedExpression(VariableBinderExpression): @property def type(self): return TRUTH_TYPE def _set_type(self, other_type=ANY_TYPE, signature=None): """:see Expression._set_type()""" assert isinstance(other_type, Type) if signature is None: signature = defaultdict(list) if not other_type.matches(TRUTH_TYPE): raise IllegalTypeException(self, other_type, TRUTH_TYPE) self.term._set_type(TRUTH_TYPE, signature) def __str__(self): variables = [self.variable] term = self.term while term.__class__ == self.__class__: variables.append(term.variable) term = term.term return ( self.getQuantifier() + " " + " ".join("%s" % v for v in variables) + Tokens.DOT + "%s" % term ) class ExistsExpression(QuantifiedExpression): def getQuantifier(self): return Tokens.EXISTS class AllExpression(QuantifiedExpression): def getQuantifier(self): return Tokens.ALL class NegatedExpression(Expression): def __init__(self, term): assert isinstance(term, Expression), "%s is not an Expression" % term self.term = term @property def type(self): return TRUTH_TYPE def _set_type(self, other_type=ANY_TYPE, signature=None): """:see Expression._set_type()""" assert isinstance(other_type, Type) if signature is None: signature = defaultdict(list) if not other_type.matches(TRUTH_TYPE): raise IllegalTypeException(self, other_type, TRUTH_TYPE) self.term._set_type(TRUTH_TYPE, signature) def findtype(self, variable): assert isinstance(variable, Variable), "%s is not a Variable" % variable return 
self.term.findtype(variable) def visit(self, function, combinator): """:see: Expression.visit()""" return combinator([function(self.term)]) def negate(self): """:see: Expression.negate()""" return self.term def __eq__(self, other): return isinstance(other, NegatedExpression) and self.term == other.term def __ne__(self, other): return not self == other __hash__ = Expression.__hash__ def __str__(self): return Tokens.NOT + "%s" % self.term class BinaryExpression(Expression): def __init__(self, first, second): assert isinstance(first, Expression), "%s is not an Expression" % first assert isinstance(second, Expression), "%s is not an Expression" % second self.first = first self.second = second @property def type(self): return TRUTH_TYPE def findtype(self, variable): """:see Expression.findtype()""" assert isinstance(variable, Variable), "%s is not a Variable" % variable f = self.first.findtype(variable) s = self.second.findtype(variable) if f == s or s == ANY_TYPE: return f elif f == ANY_TYPE: return s else: return ANY_TYPE def visit(self, function, combinator): """:see: Expression.visit()""" return combinator([function(self.first), function(self.second)]) def __eq__(self, other): return ( (isinstance(self, other.__class__) or isinstance(other, self.__class__)) and self.first == other.first and self.second == other.second ) def __ne__(self, other): return not self == other __hash__ = Expression.__hash__ def __str__(self): first = self._str_subex(self.first) second = self._str_subex(self.second) return Tokens.OPEN + first + " " + self.getOp() + " " + second + Tokens.CLOSE def _str_subex(self, subex): return "%s" % subex class BooleanExpression(BinaryExpression): def _set_type(self, other_type=ANY_TYPE, signature=None): """:see Expression._set_type()""" assert isinstance(other_type, Type) if signature is None: signature = defaultdict(list) if not other_type.matches(TRUTH_TYPE): raise IllegalTypeException(self, other_type, TRUTH_TYPE) self.first._set_type(TRUTH_TYPE, signature) self.second._set_type(TRUTH_TYPE, signature) class AndExpression(BooleanExpression): """This class represents conjunctions""" def getOp(self): return Tokens.AND def _str_subex(self, subex): s = "%s" % subex if isinstance(subex, AndExpression): return s[1:-1] return s class OrExpression(BooleanExpression): """This class represents disjunctions""" def getOp(self): return Tokens.OR def _str_subex(self, subex): s = "%s" % subex if isinstance(subex, OrExpression): return s[1:-1] return s class ImpExpression(BooleanExpression): """This class represents implications""" def getOp(self): return Tokens.IMP class IffExpression(BooleanExpression): """This class represents biconditionals""" def getOp(self): return Tokens.IFF class EqualityExpression(BinaryExpression): """This class represents equality expressions like "(x = y)".""" def _set_type(self, other_type=ANY_TYPE, signature=None): """:see Expression._set_type()""" assert isinstance(other_type, Type) if signature is None: signature = defaultdict(list) if not other_type.matches(TRUTH_TYPE): raise IllegalTypeException(self, other_type, TRUTH_TYPE) self.first._set_type(ENTITY_TYPE, signature) self.second._set_type(ENTITY_TYPE, signature) def getOp(self): return Tokens.EQ ### Utilities class LogicalExpressionException(Exception): def __init__(self, index, message): self.index = index Exception.__init__(self, message) class UnexpectedTokenException(LogicalExpressionException): def __init__(self, index, unexpected=None, expected=None, message=None): if unexpected and expected: msg = 
"Unexpected token: '%s'. " "Expected token '%s'." % ( unexpected, expected, ) elif unexpected: msg = "Unexpected token: '%s'." % unexpected if message: msg += " " + message else: msg = "Expected token '%s'." % expected LogicalExpressionException.__init__(self, index, msg) class ExpectedMoreTokensException(LogicalExpressionException): def __init__(self, index, message=None): if not message: message = "More tokens expected." LogicalExpressionException.__init__( self, index, "End of input found. " + message ) def is_indvar(expr): """ An individual variable must be a single lowercase character other than 'e', followed by zero or more digits. :param expr: str :return: bool True if expr is of the correct form """ assert isinstance(expr, str), "%s is not a string" % expr return re.match(r"^[a-df-z]\d*$", expr) is not None def is_funcvar(expr): """ A function variable must be a single uppercase character followed by zero or more digits. :param expr: str :return: bool True if expr is of the correct form """ assert isinstance(expr, str), "%s is not a string" % expr return re.match(r"^[A-Z]\d*$", expr) is not None def is_eventvar(expr): """ An event variable must be a single lowercase 'e' character followed by zero or more digits. :param expr: str :return: bool True if expr is of the correct form """ assert isinstance(expr, str), "%s is not a string" % expr return re.match(r"^e\d*$", expr) is not None def demo(): lexpr = Expression.fromstring print("=" * 20 + "Test reader" + "=" * 20) print(lexpr(r"john")) print(lexpr(r"man(x)")) print(lexpr(r"-man(x)")) print(lexpr(r"(man(x) & tall(x) & walks(x))")) print(lexpr(r"exists x.(man(x) & tall(x) & walks(x))")) print(lexpr(r"\x.man(x)")) print(lexpr(r"\x.man(x)(john)")) print(lexpr(r"\x y.sees(x,y)")) print(lexpr(r"\x y.sees(x,y)(a,b)")) print(lexpr(r"(\x.exists y.walks(x,y))(x)")) print(lexpr(r"exists x.x = y")) print(lexpr(r"exists x.(x = y)")) print(lexpr("P(x) & x=y & P(y)")) print(lexpr(r"\P Q.exists x.(P(x) & Q(x))")) print(lexpr(r"man(x) <-> tall(x)")) print("=" * 20 + "Test simplify" + "=" * 20) print(lexpr(r"\x.\y.sees(x,y)(john)(mary)").simplify()) print(lexpr(r"\x.\y.sees(x,y)(john, mary)").simplify()) print(lexpr(r"all x.(man(x) & (\x.exists y.walks(x,y))(x))").simplify()) print(lexpr(r"(\P.\Q.exists x.(P(x) & Q(x)))(\x.dog(x))(\x.bark(x))").simplify()) print("=" * 20 + "Test alpha conversion and binder expression equality" + "=" * 20) e1 = lexpr("exists x.P(x)") print(e1) e2 = e1.alpha_convert(Variable("z")) print(e2) print(e1 == e2) def demo_errors(): print("=" * 20 + "Test reader errors" + "=" * 20) demoException("(P(x) & Q(x)") demoException("((P(x) &) & Q(x))") demoException("P(x) -> ") demoException("P(x") demoException("P(x,") demoException("P(x,)") demoException("exists") demoException("exists x.") demoException("\\") demoException("\\ x y.") demoException("P(x)Q(x)") demoException("(P(x)Q(x)") demoException("exists x -> y") def demoException(s): try: Expression.fromstring(s) except LogicalExpressionException as e: print(f"{e.__class__.__name__}: {e}") def printtype(ex): print(f"{ex.str()} : {ex.type}") if __name__ == "__main__": demo() # demo_errors() nltk-3.7/nltk/sem/relextract.py000066400000000000000000000357451420073152400166320ustar00rootroot00000000000000# Natural Language Toolkit: Relation Extraction # # Copyright (C) 2001-2022 NLTK Project # Author: Ewan Klein # URL: # For license information, see LICENSE.TXT """ Code for extracting relational triples from the ieer and conll2002 corpora. 
Relations are stored internally as dictionaries ('reldicts'). The two serialization outputs are "rtuple" and "clause". - An rtuple is a tuple of the form ``(subj, filler, obj)``, where ``subj`` and ``obj`` are pairs of Named Entity mentions, and ``filler`` is the string of words occurring between ``sub`` and ``obj`` (with no intervening NEs). Strings are printed via ``repr()`` to circumvent locale variations in rendering utf-8 encoded strings. - A clause is an atom of the form ``relsym(subjsym, objsym)``, where the relation, subject and object have been canonicalized to single strings. """ # todo: get a more general solution to canonicalized symbols for clauses -- maybe use xmlcharrefs? import html import re from collections import defaultdict # Dictionary that associates corpora with NE classes NE_CLASSES = { "ieer": [ "LOCATION", "ORGANIZATION", "PERSON", "DURATION", "DATE", "CARDINAL", "PERCENT", "MONEY", "MEASURE", ], "conll2002": ["LOC", "PER", "ORG"], "ace": [ "LOCATION", "ORGANIZATION", "PERSON", "DURATION", "DATE", "CARDINAL", "PERCENT", "MONEY", "MEASURE", "FACILITY", "GPE", ], } # Allow abbreviated class labels short2long = dict(LOC="LOCATION", ORG="ORGANIZATION", PER="PERSON") long2short = dict(LOCATION="LOC", ORGANIZATION="ORG", PERSON="PER") def _expand(type): """ Expand an NE class name. :type type: str :rtype: str """ try: return short2long[type] except KeyError: return type def class_abbrev(type): """ Abbreviate an NE class name. :type type: str :rtype: str """ try: return long2short[type] except KeyError: return type def _join(lst, sep=" ", untag=False): """ Join a list into a string, turning tags tuples into tag strings or just words. :param untag: if ``True``, omit the tag from tagged input strings. :type lst: list :rtype: str """ try: return sep.join(lst) except TypeError: if untag: return sep.join(tup[0] for tup in lst) from nltk.tag import tuple2str return sep.join(tuple2str(tup) for tup in lst) def descape_entity(m, defs=html.entities.entitydefs): """ Translate one entity to its ISO Latin value. Inspired by example from effbot.org """ try: return defs[m.group(1)] except KeyError: return m.group(0) # use as is def list2sym(lst): """ Convert a list of strings into a canonical symbol. :type lst: list :return: a Unicode string without whitespace :rtype: unicode """ sym = _join(lst, "_", untag=True) sym = sym.lower() ENT = re.compile(r"&(\w+?);") sym = ENT.sub(descape_entity, sym) sym = sym.replace(".", "") return sym def tree2semi_rel(tree): """ Group a chunk structure into a list of 'semi-relations' of the form (list(str), ``Tree``). In order to facilitate the construction of (``Tree``, string, ``Tree``) triples, this identifies pairs whose first member is a list (possibly empty) of terminal strings, and whose second member is a ``Tree`` of the form (NE_label, terminals). :param tree: a chunk tree :return: a list of pairs (list(str), ``Tree``) :rtype: list of tuple """ from nltk.tree import Tree semi_rels = [] semi_rel = [[], None] for dtr in tree: if not isinstance(dtr, Tree): semi_rel[0].append(dtr) else: # dtr is a Tree semi_rel[1] = dtr semi_rels.append(semi_rel) semi_rel = [[], None] return semi_rels def semi_rel2reldict(pairs, window=5, trace=False): """ Converts the pairs generated by ``tree2semi_rel`` into a 'reldict': a dictionary which stores information about the subject and object NEs plus the filler between them. Additionally, a left and right context of length =< window are captured (within a given input sentence). 
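# A small, hand-built sketch of tree2semi_rel/semi_rel2reldict above; the
# chunk tree and POS tags here are illustrative, not taken from a corpus.
from nltk.tree import Tree
from nltk.sem.relextract import semi_rel2reldict, tree2semi_rel

chunked = Tree("S", [
    Tree("PER", [("John", "NNP")]),
    ("works", "VBZ"), ("for", "IN"),
    Tree("ORG", [("Acme", "NNP")]),
    ("in", "IN"),
    Tree("LOC", [("Boston", "NNP")]),
])
for filler, ne in tree2semi_rel(chunked):
    print(filler, ne.label(), ne.leaves())
for reldict in semi_rel2reldict(tree2semi_rel(chunked)):
    print(reldict["subjclass"], reldict["untagged_filler"], reldict["objclass"])
# expected (roughly): one PER ... ORG pair with untagged filler "works for"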
:param pairs: a pair of list(str) and ``Tree``, as generated by :param window: a threshold for the number of items to include in the left and right context :type window: int :return: 'relation' dictionaries whose keys are 'lcon', 'subjclass', 'subjtext', 'subjsym', 'filler', objclass', objtext', 'objsym' and 'rcon' :rtype: list(defaultdict) """ result = [] while len(pairs) > 2: reldict = defaultdict(str) reldict["lcon"] = _join(pairs[0][0][-window:]) reldict["subjclass"] = pairs[0][1].label() reldict["subjtext"] = _join(pairs[0][1].leaves()) reldict["subjsym"] = list2sym(pairs[0][1].leaves()) reldict["filler"] = _join(pairs[1][0]) reldict["untagged_filler"] = _join(pairs[1][0], untag=True) reldict["objclass"] = pairs[1][1].label() reldict["objtext"] = _join(pairs[1][1].leaves()) reldict["objsym"] = list2sym(pairs[1][1].leaves()) reldict["rcon"] = _join(pairs[2][0][:window]) if trace: print( "(%s(%s, %s)" % ( reldict["untagged_filler"], reldict["subjclass"], reldict["objclass"], ) ) result.append(reldict) pairs = pairs[1:] return result def extract_rels(subjclass, objclass, doc, corpus="ace", pattern=None, window=10): """ Filter the output of ``semi_rel2reldict`` according to specified NE classes and a filler pattern. The parameters ``subjclass`` and ``objclass`` can be used to restrict the Named Entities to particular types (any of 'LOCATION', 'ORGANIZATION', 'PERSON', 'DURATION', 'DATE', 'CARDINAL', 'PERCENT', 'MONEY', 'MEASURE'). :param subjclass: the class of the subject Named Entity. :type subjclass: str :param objclass: the class of the object Named Entity. :type objclass: str :param doc: input document :type doc: ieer document or a list of chunk trees :param corpus: name of the corpus to take as input; possible values are 'ieer' and 'conll2002' :type corpus: str :param pattern: a regular expression for filtering the fillers of retrieved triples. :type pattern: SRE_Pattern :param window: filters out fillers which exceed this threshold :type window: int :return: see ``mk_reldicts`` :rtype: list(defaultdict) """ if subjclass and subjclass not in NE_CLASSES[corpus]: if _expand(subjclass) in NE_CLASSES[corpus]: subjclass = _expand(subjclass) else: raise ValueError( "your value for the subject type has not been recognized: %s" % subjclass ) if objclass and objclass not in NE_CLASSES[corpus]: if _expand(objclass) in NE_CLASSES[corpus]: objclass = _expand(objclass) else: raise ValueError( "your value for the object type has not been recognized: %s" % objclass ) if corpus == "ace" or corpus == "conll2002": pairs = tree2semi_rel(doc) elif corpus == "ieer": pairs = tree2semi_rel(doc.text) + tree2semi_rel(doc.headline) else: raise ValueError("corpus type not recognized") reldicts = semi_rel2reldict(pairs) relfilter = lambda x: ( x["subjclass"] == subjclass and len(x["filler"].split()) <= window and pattern.match(x["filler"]) and x["objclass"] == objclass ) return list(filter(relfilter, reldicts)) def rtuple(reldict, lcon=False, rcon=False): """ Pretty print the reldict as an rtuple. :param reldict: a relation dictionary :type reldict: defaultdict """ items = [ class_abbrev(reldict["subjclass"]), reldict["subjtext"], reldict["filler"], class_abbrev(reldict["objclass"]), reldict["objtext"], ] format = "[%s: %r] %r [%s: %r]" if lcon: items = [reldict["lcon"]] + items format = "...%r)" + format if rcon: items.append(reldict["rcon"]) format = format + "(%r..." printargs = tuple(items) return format % printargs def clause(reldict, relsym): """ Print the relation in clausal form. 
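# Hedged usage sketch for extract_rels/rtuple above, essentially what the
# in_demo further below does; it assumes the 'ieer' corpus has been
# downloaded (e.g. nltk.download('ieer')).
import re
from nltk.corpus import ieer
from nltk.sem.relextract import extract_rels, rtuple

IN = re.compile(r".*\bin\b(?!\b.+ing)")
for fileid in ieer.fileids():
    for doc in ieer.parsed_docs(fileid):
        for rel in extract_rels("ORG", "LOC", doc, corpus="ieer", pattern=IN):
            print(rtuple(rel))  # e.g. [ORG: '...'] '... in ...' [LOC: '...']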
:param reldict: a relation dictionary :type reldict: defaultdict :param relsym: a label for the relation :type relsym: str """ items = (relsym, reldict["subjsym"], reldict["objsym"]) return "%s(%r, %r)" % items ####################################################### # Demos of relation extraction with regular expressions ####################################################### ############################################ # Example of in(ORG, LOC) ############################################ def in_demo(trace=0, sql=True): """ Select pairs of organizations and locations whose mentions occur with an intervening occurrence of the preposition "in". If the sql parameter is set to True, then the entity pairs are loaded into an in-memory database, and subsequently pulled out using an SQL "SELECT" query. """ from nltk.corpus import ieer if sql: try: import sqlite3 connection = sqlite3.connect(":memory:") connection.text_factory = sqlite3.OptimizedUnicode cur = connection.cursor() cur.execute( """create table Locations (OrgName text, LocationName text, DocID text)""" ) except ImportError: import warnings warnings.warn("Cannot import sqlite; sql flag will be ignored.") IN = re.compile(r".*\bin\b(?!\b.+ing)") print() print("IEER: in(ORG, LOC) -- just the clauses:") print("=" * 45) for file in ieer.fileids(): for doc in ieer.parsed_docs(file): if trace: print(doc.docno) print("=" * 15) for rel in extract_rels("ORG", "LOC", doc, corpus="ieer", pattern=IN): print(clause(rel, relsym="IN")) if sql: try: rtuple = (rel["subjtext"], rel["objtext"], doc.docno) cur.execute( """insert into Locations values (?, ?, ?)""", rtuple, ) connection.commit() except NameError: pass if sql: try: cur.execute( """select OrgName from Locations where LocationName = 'Atlanta'""" ) print() print("Extract data from SQL table: ORGs in Atlanta") print("-" * 15) for row in cur: print(row) except NameError: pass ############################################ # Example of has_role(PER, LOC) ############################################ def roles_demo(trace=0): from nltk.corpus import ieer roles = r""" (.*( # assorted roles analyst| chair(wo)?man| commissioner| counsel| director| economist| editor| executive| foreman| governor| head| lawyer| leader| librarian).*)| manager| partner| president| producer| professor| researcher| spokes(wo)?man| writer| ,\sof\sthe?\s* # "X, of (the) Y" """ ROLES = re.compile(roles, re.VERBOSE) print() print("IEER: has_role(PER, ORG) -- raw rtuples:") print("=" * 45) for file in ieer.fileids(): for doc in ieer.parsed_docs(file): lcon = rcon = False if trace: print(doc.docno) print("=" * 15) lcon = rcon = True for rel in extract_rels("PER", "ORG", doc, corpus="ieer", pattern=ROLES): print(rtuple(rel, lcon=lcon, rcon=rcon)) ############################################## ### Show what's in the IEER Headlines ############################################## def ieer_headlines(): from nltk.corpus import ieer from nltk.tree import Tree print("IEER: First 20 Headlines") print("=" * 45) trees = [ (doc.docno, doc.headline) for file in ieer.fileids() for doc in ieer.parsed_docs(file) ] for tree in trees[:20]: print() print("%s:\n%s" % tree) ############################################# ## Dutch CONLL2002: take_on_role(PER, ORG ############################################# def conllned(trace=1): """ Find the copula+'van' relation ('of') in the Dutch tagged training corpus from CoNLL 2002. 
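# A quick check (sketch) of the IN pattern used by in_demo above: the
# negative lookahead rejects fillers in which "in" is later followed by an
# "...ing" form, so fillers like "in winning" are filtered out.
import re

IN = re.compile(r".*\bin\b(?!\b.+ing)")
print(bool(IN.match("has a branch in ")))           # True
print(bool(IN.match("is interested in winning ")))  # False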
""" from nltk.corpus import conll2002 vnv = """ ( is/V| # 3rd sing present and was/V| # past forms of the verb zijn ('be') werd/V| # and also present wordt/V # past of worden ('become) ) .* # followed by anything van/Prep # followed by van ('of') """ VAN = re.compile(vnv, re.VERBOSE) print() print("Dutch CoNLL2002: van(PER, ORG) -- raw rtuples with context:") print("=" * 45) for doc in conll2002.chunked_sents("ned.train"): lcon = rcon = False if trace: lcon = rcon = True for rel in extract_rels( "PER", "ORG", doc, corpus="conll2002", pattern=VAN, window=10 ): print(rtuple(rel, lcon=lcon, rcon=rcon)) ############################################# ## Spanish CONLL2002: (PER, ORG) ############################################# def conllesp(): from nltk.corpus import conll2002 de = """ .* ( de/SP| del/SP ) """ DE = re.compile(de, re.VERBOSE) print() print("Spanish CoNLL2002: de(ORG, LOC) -- just the first 10 clauses:") print("=" * 45) rels = [ rel for doc in conll2002.chunked_sents("esp.train") for rel in extract_rels("ORG", "LOC", doc, corpus="conll2002", pattern=DE) ] for r in rels[:10]: print(clause(r, relsym="DE")) print() def ne_chunked(): print() print("1500 Sentences from Penn Treebank, as processed by NLTK NE Chunker") print("=" * 45) ROLE = re.compile( r".*(chairman|president|trader|scientist|economist|analyst|partner).*" ) rels = [] for i, sent in enumerate(nltk.corpus.treebank.tagged_sents()[:1500]): sent = nltk.ne_chunk(sent) rels = extract_rels("PER", "ORG", sent, corpus="ace", pattern=ROLE, window=7) for rel in rels: print(f"{i:<5}{rtuple(rel)}") if __name__ == "__main__": import nltk from nltk.sem import relextract in_demo(trace=0) roles_demo(trace=0) conllned() conllesp() ieer_headlines() ne_chunked() nltk-3.7/nltk/sem/skolemize.py000066400000000000000000000131321420073152400164410ustar00rootroot00000000000000# Natural Language Toolkit: Semantic Interpretation # # Author: Ewan Klein # # Copyright (C) 2001-2022 NLTK Project # URL: # For license information, see LICENSE.TXT from nltk.sem.logic import ( AllExpression, AndExpression, ApplicationExpression, EqualityExpression, ExistsExpression, IffExpression, ImpExpression, NegatedExpression, OrExpression, VariableExpression, skolem_function, unique_variable, ) def skolemize(expression, univ_scope=None, used_variables=None): """ Skolemize the expression and convert to conjunctive normal form (CNF) """ if univ_scope is None: univ_scope = set() if used_variables is None: used_variables = set() if isinstance(expression, AllExpression): term = skolemize( expression.term, univ_scope | {expression.variable}, used_variables | {expression.variable}, ) return term.replace( expression.variable, VariableExpression(unique_variable(ignore=used_variables)), ) elif isinstance(expression, AndExpression): return skolemize(expression.first, univ_scope, used_variables) & skolemize( expression.second, univ_scope, used_variables ) elif isinstance(expression, OrExpression): return to_cnf( skolemize(expression.first, univ_scope, used_variables), skolemize(expression.second, univ_scope, used_variables), ) elif isinstance(expression, ImpExpression): return to_cnf( skolemize(-expression.first, univ_scope, used_variables), skolemize(expression.second, univ_scope, used_variables), ) elif isinstance(expression, IffExpression): return to_cnf( skolemize(-expression.first, univ_scope, used_variables), skolemize(expression.second, univ_scope, used_variables), ) & to_cnf( skolemize(expression.first, univ_scope, used_variables), skolemize(-expression.second, 
univ_scope, used_variables), ) elif isinstance(expression, EqualityExpression): return expression elif isinstance(expression, NegatedExpression): negated = expression.term if isinstance(negated, AllExpression): term = skolemize( -negated.term, univ_scope, used_variables | {negated.variable} ) if univ_scope: return term.replace(negated.variable, skolem_function(univ_scope)) else: skolem_constant = VariableExpression( unique_variable(ignore=used_variables) ) return term.replace(negated.variable, skolem_constant) elif isinstance(negated, AndExpression): return to_cnf( skolemize(-negated.first, univ_scope, used_variables), skolemize(-negated.second, univ_scope, used_variables), ) elif isinstance(negated, OrExpression): return skolemize(-negated.first, univ_scope, used_variables) & skolemize( -negated.second, univ_scope, used_variables ) elif isinstance(negated, ImpExpression): return skolemize(negated.first, univ_scope, used_variables) & skolemize( -negated.second, univ_scope, used_variables ) elif isinstance(negated, IffExpression): return to_cnf( skolemize(-negated.first, univ_scope, used_variables), skolemize(-negated.second, univ_scope, used_variables), ) & to_cnf( skolemize(negated.first, univ_scope, used_variables), skolemize(negated.second, univ_scope, used_variables), ) elif isinstance(negated, EqualityExpression): return expression elif isinstance(negated, NegatedExpression): return skolemize(negated.term, univ_scope, used_variables) elif isinstance(negated, ExistsExpression): term = skolemize( -negated.term, univ_scope | {negated.variable}, used_variables | {negated.variable}, ) return term.replace( negated.variable, VariableExpression(unique_variable(ignore=used_variables)), ) elif isinstance(negated, ApplicationExpression): return expression else: raise Exception("'%s' cannot be skolemized" % expression) elif isinstance(expression, ExistsExpression): term = skolemize( expression.term, univ_scope, used_variables | {expression.variable} ) if univ_scope: return term.replace(expression.variable, skolem_function(univ_scope)) else: skolem_constant = VariableExpression(unique_variable(ignore=used_variables)) return term.replace(expression.variable, skolem_constant) elif isinstance(expression, ApplicationExpression): return expression else: raise Exception("'%s' cannot be skolemized" % expression) def to_cnf(first, second): """ Convert this split disjunction to conjunctive normal form (CNF) """ if isinstance(first, AndExpression): r_first = to_cnf(first.first, second) r_second = to_cnf(first.second, second) return r_first & r_second elif isinstance(second, AndExpression): r_first = to_cnf(first, second.first) r_second = to_cnf(first, second.second) return r_first & r_second else: return first | second nltk-3.7/nltk/sem/util.py000066400000000000000000000210611420073152400154140ustar00rootroot00000000000000# Natural Language Toolkit: Semantic Interpretation # # Author: Ewan Klein # # Copyright (C) 2001-2022 NLTK Project # URL: # For license information, see LICENSE.TXT """ Utility functions for batch-processing sentences: parsing and extraction of the semantic representation of the root node of the the syntax tree, followed by evaluation of the semantic representation in a first-order model. 
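# A brief sketch of skolemize/to_cnf above (the exact fresh-variable names,
# e.g. z1 or F1, vary from run to run): a top-level existential becomes a
# fresh constant, while an existential under a universal becomes a Skolem
# function of the universally bound variable.
from nltk.sem.logic import Expression
from nltk.sem.skolemize import skolemize

read_expr = Expression.fromstring
print(skolemize(read_expr(r"exists x.dog(x)")))           # e.g. dog(z1)
print(skolemize(read_expr(r"all x.exists y.sees(x,y)")))  # e.g. sees(z2,F1(z2))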
""" import codecs from nltk.sem import evaluate ############################################################## ## Utility functions for connecting parse output to semantics ############################################################## def parse_sents(inputs, grammar, trace=0): """ Convert input sentences into syntactic trees. :param inputs: sentences to be parsed :type inputs: list(str) :param grammar: ``FeatureGrammar`` or name of feature-based grammar :type grammar: nltk.grammar.FeatureGrammar :rtype: list(nltk.tree.Tree) or dict(list(str)): list(Tree) :return: a mapping from input sentences to a list of ``Tree`` instances. """ # put imports here to avoid circult dependencies from nltk.grammar import FeatureGrammar from nltk.parse import FeatureChartParser, load_parser if isinstance(grammar, FeatureGrammar): cp = FeatureChartParser(grammar) else: cp = load_parser(grammar, trace=trace) parses = [] for sent in inputs: tokens = sent.split() # use a tokenizer? syntrees = list(cp.parse(tokens)) parses.append(syntrees) return parses def root_semrep(syntree, semkey="SEM"): """ Find the semantic representation at the root of a tree. :param syntree: a parse ``Tree`` :param semkey: the feature label to use for the root semantics in the tree :return: the semantic representation at the root of a ``Tree`` :rtype: sem.Expression """ from nltk.grammar import FeatStructNonterminal node = syntree.label() assert isinstance(node, FeatStructNonterminal) try: return node[semkey] except KeyError: print(node, end=" ") print("has no specification for the feature %s" % semkey) raise def interpret_sents(inputs, grammar, semkey="SEM", trace=0): """ Add the semantic representation to each syntactic parse tree of each input sentence. :param inputs: a list of sentences :type inputs: list(str) :param grammar: ``FeatureGrammar`` or name of feature-based grammar :type grammar: nltk.grammar.FeatureGrammar :return: a mapping from sentences to lists of pairs (parse-tree, semantic-representations) :rtype: list(list(tuple(nltk.tree.Tree, nltk.sem.logic.ConstantExpression))) """ return [ [(syn, root_semrep(syn, semkey)) for syn in syntrees] for syntrees in parse_sents(inputs, grammar, trace=trace) ] def evaluate_sents(inputs, grammar, model, assignment, trace=0): """ Add the truth-in-a-model value to each semantic representation for each syntactic parse of each input sentences. 
:param inputs: a list of sentences :type inputs: list(str) :param grammar: ``FeatureGrammar`` or name of feature-based grammar :type grammar: nltk.grammar.FeatureGrammar :return: a mapping from sentences to lists of triples (parse-tree, semantic-representations, evaluation-in-model) :rtype: list(list(tuple(nltk.tree.Tree, nltk.sem.logic.ConstantExpression, bool or dict(str): bool))) """ return [ [ (syn, sem, model.evaluate("%s" % sem, assignment, trace=trace)) for (syn, sem) in interpretations ] for interpretations in interpret_sents(inputs, grammar) ] def demo_model0(): global m0, g0 # Initialize a valuation of non-logical constants.""" v = [ ("john", "b1"), ("mary", "g1"), ("suzie", "g2"), ("fido", "d1"), ("tess", "d2"), ("noosa", "n"), ("girl", {"g1", "g2"}), ("boy", {"b1", "b2"}), ("dog", {"d1", "d2"}), ("bark", {"d1", "d2"}), ("walk", {"b1", "g2", "d1"}), ("chase", {("b1", "g1"), ("b2", "g1"), ("g1", "d1"), ("g2", "d2")}), ( "see", {("b1", "g1"), ("b2", "d2"), ("g1", "b1"), ("d2", "b1"), ("g2", "n")}, ), ("in", {("b1", "n"), ("b2", "n"), ("d2", "n")}), ("with", {("b1", "g1"), ("g1", "b1"), ("d1", "b1"), ("b1", "d1")}), ] # Read in the data from ``v`` val = evaluate.Valuation(v) # Bind ``dom`` to the ``domain`` property of ``val`` dom = val.domain # Initialize a model with parameters ``dom`` and ``val``. m0 = evaluate.Model(dom, val) # Initialize a variable assignment with parameter ``dom`` g0 = evaluate.Assignment(dom) def read_sents(filename, encoding="utf8"): with codecs.open(filename, "r", encoding) as fp: sents = [l.rstrip() for l in fp] # get rid of blank lines sents = [l for l in sents if len(l) > 0] sents = [l for l in sents if not l[0] == "#"] return sents def demo_legacy_grammar(): """ Check that interpret_sents() is compatible with legacy grammars that use a lowercase 'sem' feature. Define 'test.fcfg' to be the following """ from nltk.grammar import FeatureGrammar g = FeatureGrammar.fromstring( """ % start S S[sem=] -> 'hello' """ ) print("Reading grammar: %s" % g) print("*" * 20) for reading in interpret_sents(["hello"], g, semkey="sem"): syn, sem = reading[0] print() print("output: ", sem) def demo(): import sys from optparse import OptionParser description = """ Parse and evaluate some sentences. 
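# A hedged end-to-end sketch of the helpers above: build a small model in the
# style of demo_model0, then parse and evaluate one sentence with
# evaluate_sents. It assumes the 'sample_grammars' data package (providing
# sem2.fcfg) is installed, e.g. via nltk.download('sample_grammars').
from nltk.sem import evaluate
from nltk.sem.util import evaluate_sents

v = [("john", "b1"), ("girl", {"g1"}), ("dog", {"d1"}), ("chase", {("g1", "d1")})]
val = evaluate.Valuation(v)
model = evaluate.Model(val.domain, val)
g = evaluate.Assignment(val.domain)

for syntree, semrep, value in evaluate_sents(
    ["every girl chases a dog"], "grammars/sample_grammars/sem2.fcfg", model, g
)[0]:
    print(semrep, value)  # e.g. all x.(girl(x) -> exists z1.(dog(z1) & chase(x,z1))) True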
""" opts = OptionParser(description=description) opts.set_defaults( evaluate=True, beta=True, syntrace=0, semtrace=0, demo="default", grammar="", sentences="", ) opts.add_option( "-d", "--demo", dest="demo", help="choose demo D; omit this for the default demo, or specify 'chat80'", metavar="D", ) opts.add_option( "-g", "--gram", dest="grammar", help="read in grammar G", metavar="G" ) opts.add_option( "-m", "--model", dest="model", help="import model M (omit '.py' suffix)", metavar="M", ) opts.add_option( "-s", "--sentences", dest="sentences", help="read in a file of test sentences S", metavar="S", ) opts.add_option( "-e", "--no-eval", action="store_false", dest="evaluate", help="just do a syntactic analysis", ) opts.add_option( "-b", "--no-beta-reduction", action="store_false", dest="beta", help="don't carry out beta-reduction", ) opts.add_option( "-t", "--syntrace", action="count", dest="syntrace", help="set syntactic tracing on; requires '-e' option", ) opts.add_option( "-T", "--semtrace", action="count", dest="semtrace", help="set semantic tracing on", ) (options, args) = opts.parse_args() SPACER = "-" * 30 demo_model0() sents = [ "Fido sees a boy with Mary", "John sees Mary", "every girl chases a dog", "every boy chases a girl", "John walks with a girl in Noosa", "who walks", ] gramfile = "grammars/sample_grammars/sem2.fcfg" if options.sentences: sentsfile = options.sentences if options.grammar: gramfile = options.grammar if options.model: exec("import %s as model" % options.model) if sents is None: sents = read_sents(sentsfile) # Set model and assignment model = m0 g = g0 if options.evaluate: evaluations = evaluate_sents(sents, gramfile, model, g, trace=options.semtrace) else: semreps = interpret_sents(sents, gramfile, trace=options.syntrace) for i, sent in enumerate(sents): n = 1 print("\nSentence: %s" % sent) print(SPACER) if options.evaluate: for (syntree, semrep, value) in evaluations[i]: if isinstance(value, dict): value = set(value.keys()) print("%d: %s" % (n, semrep)) print(value) n += 1 else: for (syntree, semrep) in semreps[i]: print("%d: %s" % (n, semrep)) n += 1 if __name__ == "__main__": demo() demo_legacy_grammar() nltk-3.7/nltk/sentiment/000077500000000000000000000000001420073152400153075ustar00rootroot00000000000000nltk-3.7/nltk/sentiment/__init__.py000066400000000000000000000005611420073152400174220ustar00rootroot00000000000000# Natural Language Toolkit: Sentiment Analysis # # Copyright (C) 2001-2022 NLTK Project # Author: Ewan Klein # URL: # For license information, see LICENSE.TXT """ NLTK Sentiment Analysis Package """ from nltk.sentiment.sentiment_analyzer import SentimentAnalyzer from nltk.sentiment.vader import SentimentIntensityAnalyzer nltk-3.7/nltk/sentiment/sentiment_analyzer.py000066400000000000000000000237011420073152400215770ustar00rootroot00000000000000# # Natural Language Toolkit: Sentiment Analyzer # # Copyright (C) 2001-2022 NLTK Project # Author: Pierpaolo Pantone <24alsecondo@gmail.com> # URL: # For license information, see LICENSE.TXT """ A SentimentAnalyzer is a tool to implement and facilitate Sentiment Analysis tasks using NLTK features and classifiers, especially for teaching and demonstrative purposes. 
""" import sys from collections import defaultdict from nltk.classify.util import accuracy as eval_accuracy from nltk.classify.util import apply_features from nltk.collocations import BigramCollocationFinder from nltk.metrics import BigramAssocMeasures from nltk.metrics import f_measure as eval_f_measure from nltk.metrics import precision as eval_precision from nltk.metrics import recall as eval_recall from nltk.probability import FreqDist class SentimentAnalyzer: """ A Sentiment Analysis tool based on machine learning approaches. """ def __init__(self, classifier=None): self.feat_extractors = defaultdict(list) self.classifier = classifier def all_words(self, documents, labeled=None): """ Return all words/tokens from the documents (with duplicates). :param documents: a list of (words, label) tuples. :param labeled: if `True`, assume that each document is represented by a (words, label) tuple: (list(str), str). If `False`, each document is considered as being a simple list of strings: list(str). :rtype: list(str) :return: A list of all words/tokens in `documents`. """ all_words = [] if labeled is None: labeled = documents and isinstance(documents[0], tuple) if labeled: for words, _sentiment in documents: all_words.extend(words) elif not labeled: for words in documents: all_words.extend(words) return all_words def apply_features(self, documents, labeled=None): """ Apply all feature extractor functions to the documents. This is a wrapper around `nltk.classify.util.apply_features`. If `labeled=False`, return featuresets as: [feature_func(doc) for doc in documents] If `labeled=True`, return featuresets as: [(feature_func(tok), label) for (tok, label) in toks] :param documents: a list of documents. `If labeled=True`, the method expects a list of (words, label) tuples. :rtype: LazyMap """ return apply_features(self.extract_features, documents, labeled) def unigram_word_feats(self, words, top_n=None, min_freq=0): """ Return most common top_n word features. :param words: a list of words/tokens. :param top_n: number of best words/tokens to use, sorted by frequency. :rtype: list(str) :return: A list of `top_n` words/tokens (with no duplicates) sorted by frequency. """ # Stopwords are not removed unigram_feats_freqs = FreqDist(word for word in words) return [ w for w, f in unigram_feats_freqs.most_common(top_n) if unigram_feats_freqs[w] > min_freq ] def bigram_collocation_feats( self, documents, top_n=None, min_freq=3, assoc_measure=BigramAssocMeasures.pmi ): """ Return `top_n` bigram features (using `assoc_measure`). Note that this method is based on bigram collocations measures, and not on simple bigram frequency. :param documents: a list (or iterable) of tokens. :param top_n: number of best words/tokens to use, sorted by association measure. :param assoc_measure: bigram association measure to use as score function. :param min_freq: the minimum number of occurrencies of bigrams to take into consideration. :return: `top_n` ngrams scored by the given association measure. """ finder = BigramCollocationFinder.from_documents(documents) finder.apply_freq_filter(min_freq) return finder.nbest(assoc_measure, top_n) def classify(self, instance): """ Classify a single instance applying the features that have already been stored in the SentimentAnalyzer. :param instance: a list (or iterable) of tokens. :return: the classification result given by applying the classifier. 
""" instance_feats = self.apply_features([instance], labeled=False) return self.classifier.classify(instance_feats[0]) def add_feat_extractor(self, function, **kwargs): """ Add a new function to extract features from a document. This function will be used in extract_features(). Important: in this step our kwargs are only representing additional parameters, and NOT the document we have to parse. The document will always be the first parameter in the parameter list, and it will be added in the extract_features() function. :param function: the extractor function to add to the list of feature extractors. :param kwargs: additional parameters required by the `function` function. """ self.feat_extractors[function].append(kwargs) def extract_features(self, document): """ Apply extractor functions (and their parameters) to the present document. We pass `document` as the first parameter of the extractor functions. If we want to use the same extractor function multiple times, we have to add it to the extractors with `add_feat_extractor` using multiple sets of parameters (one for each call of the extractor function). :param document: the document that will be passed as argument to the feature extractor functions. :return: A dictionary of populated features extracted from the document. :rtype: dict """ all_features = {} for extractor in self.feat_extractors: for param_set in self.feat_extractors[extractor]: feats = extractor(document, **param_set) all_features.update(feats) return all_features def train(self, trainer, training_set, save_classifier=None, **kwargs): """ Train classifier on the training set, optionally saving the output in the file specified by `save_classifier`. Additional arguments depend on the specific trainer used. For example, a MaxentClassifier can use `max_iter` parameter to specify the number of iterations, while a NaiveBayesClassifier cannot. :param trainer: `train` method of a classifier. E.g.: NaiveBayesClassifier.train :param training_set: the training set to be passed as argument to the classifier `train` method. :param save_classifier: the filename of the file where the classifier will be stored (optional). :param kwargs: additional parameters that will be passed as arguments to the classifier `train` function. :return: A classifier instance trained on the training set. :rtype: """ print("Training classifier") self.classifier = trainer(training_set, **kwargs) if save_classifier: self.save_file(self.classifier, save_classifier) return self.classifier def save_file(self, content, filename): """ Store `content` in `filename`. Can be used to store a SentimentAnalyzer. """ print("Saving", filename, file=sys.stderr) with open(filename, "wb") as storage_file: import pickle # The protocol=2 parameter is for python2 compatibility pickle.dump(content, storage_file, protocol=2) def evaluate( self, test_set, classifier=None, accuracy=True, f_measure=True, precision=True, recall=True, verbose=False, ): """ Evaluate and print classifier performance on the test set. :param test_set: A list of (tokens, label) tuples to use as gold set. :param classifier: a classifier instance (previously trained). :param accuracy: if `True`, evaluate classifier accuracy. :param f_measure: if `True`, evaluate classifier f_measure. :param precision: if `True`, evaluate classifier precision. :param recall: if `True`, evaluate classifier recall. :return: evaluation results. 
:rtype: dict(str): float """ if classifier is None: classifier = self.classifier print(f"Evaluating {type(classifier).__name__} results...") metrics_results = {} if accuracy: accuracy_score = eval_accuracy(classifier, test_set) metrics_results["Accuracy"] = accuracy_score gold_results = defaultdict(set) test_results = defaultdict(set) labels = set() for i, (feats, label) in enumerate(test_set): labels.add(label) gold_results[label].add(i) observed = classifier.classify(feats) test_results[observed].add(i) for label in labels: if precision: precision_score = eval_precision( gold_results[label], test_results[label] ) metrics_results[f"Precision [{label}]"] = precision_score if recall: recall_score = eval_recall(gold_results[label], test_results[label]) metrics_results[f"Recall [{label}]"] = recall_score if f_measure: f_measure_score = eval_f_measure( gold_results[label], test_results[label] ) metrics_results[f"F-measure [{label}]"] = f_measure_score # Print evaluation results (in alphabetical order) if verbose: for result in sorted(metrics_results): print(f"{result}: {metrics_results[result]}") return metrics_results nltk-3.7/nltk/sentiment/util.py000066400000000000000000000732231420073152400166450ustar00rootroot00000000000000# # Natural Language Toolkit: Sentiment Analyzer # # Copyright (C) 2001-2022 NLTK Project # Author: Pierpaolo Pantone <24alsecondo@gmail.com> # URL: # For license information, see LICENSE.TXT """ Utility methods for Sentiment Analysis. """ import codecs import csv import json import pickle import random import re import sys import time from copy import deepcopy import nltk from nltk.corpus import CategorizedPlaintextCorpusReader from nltk.data import load from nltk.tokenize.casual import EMOTICON_RE # //////////////////////////////////////////////////////////// # { Regular expressions # //////////////////////////////////////////////////////////// # Regular expression for negation by Christopher Potts NEGATION = r""" (?: ^(?:never|no|nothing|nowhere|noone|none|not| havent|hasnt|hadnt|cant|couldnt|shouldnt| wont|wouldnt|dont|doesnt|didnt|isnt|arent|aint )$ ) | n't""" NEGATION_RE = re.compile(NEGATION, re.VERBOSE) CLAUSE_PUNCT = r"^[.:;!?]$" CLAUSE_PUNCT_RE = re.compile(CLAUSE_PUNCT) # Happy and sad emoticons HAPPY = { ":-)", ":)", ";)", ":o)", ":]", ":3", ":c)", ":>", "=]", "8)", "=)", ":}", ":^)", ":-D", ":D", "8-D", "8D", "x-D", "xD", "X-D", "XD", "=-D", "=D", "=-3", "=3", ":-))", ":'-)", ":')", ":*", ":^*", ">:P", ":-P", ":P", "X-P", "x-p", "xp", "XP", ":-p", ":p", "=p", ":-b", ":b", ">:)", ">;)", ">:-)", "<3", } SAD = { ":L", ":-/", ">:/", ":S", ">:[", ":@", ":-(", ":[", ":-||", "=L", ":<", ":-[", ":-<", "=\\", "=/", ">:(", ":(", ">.<", ":'-(", ":'(", ":\\", ":-c", ":c", ":{", ">:\\", ";(", } def timer(method): """ A timer decorator to measure execution performance of methods. 
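# A hedged, minimal end-to-end run of the SentimentAnalyzer API whose
# evaluate() method is shown just above; it mirrors demo_subjectivity further
# below with far fewer instances, and assumes the 'subjectivity' corpus has
# been downloaded.
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import extract_unigram_feats

subj = [(sent, "subj") for sent in subjectivity.sents(categories="subj")[:100]]
obj = [(sent, "obj") for sent in subjectivity.sents(categories="obj")[:100]]
train_docs = subj[:80] + obj[:80]
test_docs = subj[80:] + obj[80:]

sa = SentimentAnalyzer()
unigrams = sa.unigram_word_feats(sa.all_words(train_docs), min_freq=4)
sa.add_feat_extractor(extract_unigram_feats, unigrams=unigrams)
classifier = sa.train(NaiveBayesClassifier.train, sa.apply_features(train_docs))
print(sa.evaluate(sa.apply_features(test_docs)))  # Accuracy / Precision / Recall / F-measure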
def timed(*args, **kw): start = time.time() result = method(*args, **kw) end = time.time() tot_time = end - start hours = tot_time // 3600 mins = tot_time // 60 % 60 # in Python 2.x round() will return a float, so we convert it to int secs = int(round(tot_time % 60)) if hours == 0 and mins == 0 and secs < 10: print(f"[TIMER] {method.__name__}(): {tot_time:.3f} seconds") else: print(f"[TIMER] {method.__name__}(): {hours}h {mins}m {secs}s") return result return timed # //////////////////////////////////////////////////////////// # { Feature extractor functions # //////////////////////////////////////////////////////////// """ Feature extractor functions are declared outside the SentimentAnalyzer class. Users should have the possibility to create their own feature extractors without modifying SentimentAnalyzer. """ def extract_unigram_feats(document, unigrams, handle_negation=False): """ Populate a dictionary of unigram features, reflecting the presence/absence in the document of each of the tokens in `unigrams`. :param document: a list of words/tokens. :param unigrams: a list of words/tokens whose presence/absence has to be checked in `document`. :param handle_negation: if `handle_negation == True` apply `mark_negation` method to `document` before checking for unigram presence/absence. :return: a dictionary of unigram features {unigram : boolean}. >>> words = ['ice', 'police', 'riot'] >>> document = 'ice is melting due to global warming'.split() >>> sorted(extract_unigram_feats(document, words).items()) [('contains(ice)', True), ('contains(police)', False), ('contains(riot)', False)] """ features = {} if handle_negation: document = mark_negation(document) for word in unigrams: features[f"contains({word})"] = word in set(document) return features def extract_bigram_feats(document, bigrams): """ Populate a dictionary of bigram features, reflecting the presence/absence in the document of each of the tokens in `bigrams`. This extractor function only considers contiguous bigrams obtained by `nltk.bigrams`. :param document: a list of words/tokens. :param bigrams: a list of bigrams whose presence/absence has to be checked in `document`. :return: a dictionary of bigram features {bigram : boolean}. >>> bigrams = [('global', 'warming'), ('police', 'prevented'), ('love', 'you')] >>> document = 'ice is melting due to global warming'.split() >>> sorted(extract_bigram_feats(document, bigrams).items()) [('contains(global - warming)', True), ('contains(love - you)', False), ('contains(police - prevented)', False)] """ features = {} for bigr in bigrams: features[f"contains({bigr[0]} - {bigr[1]})"] = bigr in nltk.bigrams(document) return features # //////////////////////////////////////////////////////////// # { Helper Functions # //////////////////////////////////////////////////////////// def mark_negation(document, double_neg_flip=False, shallow=False): """ Append _NEG suffix to words that appear in the scope between a negation and a punctuation mark. :param document: a list of words/tokens, or a tuple (words, label). :param shallow: if True, the method will modify the original document in place. :param double_neg_flip: if True, double negation is considered affirmation (we activate/deactivate negation scope every time we find a negation). :return: if `shallow == True` the method will modify the original document and return it. If `shallow == False` the method will return a modified document, leaving the original unmodified. >>> sent = "I didn't like this movie .
It was bad .".split() >>> mark_negation(sent) ['I', "didn't", 'like_NEG', 'this_NEG', 'movie_NEG', '.', 'It', 'was', 'bad', '.'] """ if not shallow: document = deepcopy(document) # check if the document is labeled. If so, do not consider the label. labeled = document and isinstance(document[0], (tuple, list)) if labeled: doc = document[0] else: doc = document neg_scope = False for i, word in enumerate(doc): if NEGATION_RE.search(word): if not neg_scope or (neg_scope and double_neg_flip): neg_scope = not neg_scope continue else: doc[i] += "_NEG" elif neg_scope and CLAUSE_PUNCT_RE.search(word): neg_scope = not neg_scope elif neg_scope and not CLAUSE_PUNCT_RE.search(word): doc[i] += "_NEG" return document def output_markdown(filename, **kwargs): """ Write the output of an analysis to a file. """ with codecs.open(filename, "at") as outfile: text = "\n*** \n\n" text += "{} \n\n".format(time.strftime("%d/%m/%Y, %H:%M")) for k in sorted(kwargs): if isinstance(kwargs[k], dict): dictionary = kwargs[k] text += f" - **{k}:**\n" for entry in sorted(dictionary): text += f" - {entry}: {dictionary[entry]} \n" elif isinstance(kwargs[k], list): text += f" - **{k}:**\n" for entry in kwargs[k]: text += f" - {entry}\n" else: text += f" - **{k}:** {kwargs[k]} \n" outfile.write(text) def split_train_test(all_instances, n=None): """ Randomly split `n` instances of the dataset into train and test sets. :param all_instances: a list of instances (e.g. documents) that will be split. :param n: the number of instances to consider (in case we want to use only a subset). :return: two lists of instances. Train set is 8/10 of the total and test set is 2/10 of the total. """ random.seed(12345) random.shuffle(all_instances) if not n or n > len(all_instances): n = len(all_instances) train_set = all_instances[: int(0.8 * n)] test_set = all_instances[int(0.8 * n) : n] return train_set, test_set def _show_plot(x_values, y_values, x_labels=None, y_labels=None): try: import matplotlib.pyplot as plt except ImportError as e: raise ImportError( "The plot function requires matplotlib to be installed." "See https://matplotlib.org/" ) from e plt.locator_params(axis="y", nbins=3) axes = plt.axes() axes.yaxis.grid() plt.plot(x_values, y_values, "ro", color="red") plt.ylim(ymin=-1.2, ymax=1.2) plt.tight_layout(pad=5) if x_labels: plt.xticks(x_values, x_labels, rotation="vertical") if y_labels: plt.yticks([-1, 0, 1], y_labels, rotation="horizontal") # Pad margins so that markers are not clipped by the axes plt.margins(0.2) plt.show() # //////////////////////////////////////////////////////////// # { Parsing and conversion functions # //////////////////////////////////////////////////////////// def json2csv_preprocess( json_file, outfile, fields, encoding="utf8", errors="replace", gzip_compress=False, skip_retweets=True, skip_tongue_tweets=True, skip_ambiguous_tweets=True, strip_off_emoticons=True, remove_duplicates=True, limit=None, ): """ Convert json file to csv file, preprocessing each row to obtain a suitable dataset for tweets Semantic Analysis. :param json_file: the original json file containing tweets. :param outfile: the output csv filename. :param fields: a list of fields that will be extracted from the json file and kept in the output csv file. :param encoding: the encoding of the files. :param errors: the error handling strategy for the output writer. :param gzip_compress: if True, create a compressed GZIP file. :param skip_retweets: if True, remove retweets. 
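# A short sketch of the scope handling above: NEGATION_RE opens a _NEG scope,
# and with double_neg_flip=True a second negation closes it again instead of
# being marked itself.
from nltk.sentiment.util import mark_negation

sent = "I did not say it was not good .".split()
print(mark_negation(sent))
# ['I', 'did', 'not', 'say_NEG', 'it_NEG', 'was_NEG', 'not_NEG', 'good_NEG', '.']
print(mark_negation(sent, double_neg_flip=True))
# ['I', 'did', 'not', 'say_NEG', 'it_NEG', 'was_NEG', 'not', 'good', '.']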
:param skip_tongue_tweets: if True, remove tweets containing ":P" and ":-P" emoticons. :param skip_ambiguous_tweets: if True, remove tweets containing both happy and sad emoticons. :param strip_off_emoticons: if True, strip off emoticons from all tweets. :param remove_duplicates: if True, remove tweets appearing more than once. :param limit: an integer to set the number of tweets to convert. After the limit is reached the conversion will stop. It can be useful to create subsets of the original tweets json data. """ with codecs.open(json_file, encoding=encoding) as fp: (writer, outf) = _outf_writer(outfile, encoding, errors, gzip_compress) # write the list of fields as header writer.writerow(fields) if remove_duplicates == True: tweets_cache = [] i = 0 for line in fp: tweet = json.loads(line) row = extract_fields(tweet, fields) try: text = row[fields.index("text")] # Remove retweets if skip_retweets == True: if re.search(r"\bRT\b", text): continue # Remove tweets containing ":P" and ":-P" emoticons if skip_tongue_tweets == True: if re.search(r"\:\-?P\b", text): continue # Remove tweets containing both happy and sad emoticons if skip_ambiguous_tweets == True: all_emoticons = EMOTICON_RE.findall(text) if all_emoticons: if (set(all_emoticons) & HAPPY) and (set(all_emoticons) & SAD): continue # Strip off emoticons from all tweets if strip_off_emoticons == True: row[fields.index("text")] = re.sub( r"(?!\n)\s+", " ", EMOTICON_RE.sub("", text) ) # Remove duplicate tweets if remove_duplicates == True: if row[fields.index("text")] in tweets_cache: continue else: tweets_cache.append(row[fields.index("text")]) except ValueError: pass writer.writerow(row) i += 1 if limit and i >= limit: break outf.close() def parse_tweets_set( filename, label, word_tokenizer=None, sent_tokenizer=None, skip_header=True ): """ Parse csv file containing tweets and output data a list of (text, label) tuples. :param filename: the input csv filename. :param label: the label to be appended to each tweet contained in the csv file. :param word_tokenizer: the tokenizer instance that will be used to tokenize each sentence into tokens (e.g. WordPunctTokenizer() or BlanklineTokenizer()). If no word_tokenizer is specified, tweets will not be tokenized. :param sent_tokenizer: the tokenizer that will be used to split each tweet into sentences. :param skip_header: if True, skip the first line of the csv file (which usually contains headers). :return: a list of (text, label) tuples. """ tweets = [] if not sent_tokenizer: sent_tokenizer = load("tokenizers/punkt/english.pickle") with codecs.open(filename, "rt") as csvfile: reader = csv.reader(csvfile) if skip_header == True: next(reader, None) # skip the header i = 0 for tweet_id, text in reader: # text = text[1] i += 1 sys.stdout.write(f"Loaded {i} tweets\r") # Apply sentence and word tokenizer to text if word_tokenizer: tweet = [ w for sent in sent_tokenizer.tokenize(text) for w in word_tokenizer.tokenize(sent) ] else: tweet = text tweets.append((tweet, label)) print(f"Loaded {i} tweets") return tweets # //////////////////////////////////////////////////////////// # { Demos # //////////////////////////////////////////////////////////// def demo_tweets(trainer, n_instances=None, output=None): """ Train and test Naive Bayes classifier on 10000 tweets, tokenized using TweetTokenizer. Features are composed of: - 1000 most frequent unigrams - 100 top bigrams (using BigramAssocMeasures.pmi) :param trainer: `train` method of a classifier. 
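# A hedged sketch of parse_tweets_set above on a tiny hand-made two-column
# (id, text) CSV; 'tiny_tweets.csv' is just an illustrative filename, and the
# default sentence splitter requires the 'punkt' tokenizer data.
import csv
from nltk.sentiment.util import parse_tweets_set
from nltk.tokenize import TweetTokenizer

with open("tiny_tweets.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["id", "text"])              # header row, skipped by default
    writer.writerow(["1", "I love this :)"])
    writer.writerow(["2", "What a great day"])

docs = parse_tweets_set("tiny_tweets.csv", label="pos",
                        word_tokenizer=TweetTokenizer(preserve_case=False))
print(docs)  # roughly [(['i', 'love', 'this', ':)'], 'pos'), (['what', 'a', 'great', 'day'], 'pos')]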
:param n_instances: the number of total tweets that have to be used for training and testing. Tweets will be equally split between positive and negative. :param output: the output file where results have to be reported. """ from nltk.corpus import stopwords, twitter_samples from nltk.sentiment import SentimentAnalyzer from nltk.tokenize import TweetTokenizer # Different customizations for the TweetTokenizer tokenizer = TweetTokenizer(preserve_case=False) # tokenizer = TweetTokenizer(preserve_case=True, strip_handles=True) # tokenizer = TweetTokenizer(reduce_len=True, strip_handles=True) if n_instances is not None: n_instances = int(n_instances / 2) fields = ["id", "text"] positive_json = twitter_samples.abspath("positive_tweets.json") positive_csv = "positive_tweets.csv" json2csv_preprocess(positive_json, positive_csv, fields, limit=n_instances) negative_json = twitter_samples.abspath("negative_tweets.json") negative_csv = "negative_tweets.csv" json2csv_preprocess(negative_json, negative_csv, fields, limit=n_instances) neg_docs = parse_tweets_set(negative_csv, label="neg", word_tokenizer=tokenizer) pos_docs = parse_tweets_set(positive_csv, label="pos", word_tokenizer=tokenizer) # We separately split subjective and objective instances to keep a balanced # uniform class distribution in both train and test sets. train_pos_docs, test_pos_docs = split_train_test(pos_docs) train_neg_docs, test_neg_docs = split_train_test(neg_docs) training_tweets = train_pos_docs + train_neg_docs testing_tweets = test_pos_docs + test_neg_docs sentim_analyzer = SentimentAnalyzer() # stopwords = stopwords.words('english') # all_words = [word for word in sentim_analyzer.all_words(training_tweets) if word.lower() not in stopwords] all_words = [word for word in sentim_analyzer.all_words(training_tweets)] # Add simple unigram word features unigram_feats = sentim_analyzer.unigram_word_feats(all_words, top_n=1000) sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats) # Add bigram collocation features bigram_collocs_feats = sentim_analyzer.bigram_collocation_feats( [tweet[0] for tweet in training_tweets], top_n=100, min_freq=12 ) sentim_analyzer.add_feat_extractor( extract_bigram_feats, bigrams=bigram_collocs_feats ) training_set = sentim_analyzer.apply_features(training_tweets) test_set = sentim_analyzer.apply_features(testing_tweets) classifier = sentim_analyzer.train(trainer, training_set) # classifier = sentim_analyzer.train(trainer, training_set, max_iter=4) try: classifier.show_most_informative_features() except AttributeError: print( "Your classifier does not provide a show_most_informative_features() method." ) results = sentim_analyzer.evaluate(test_set) if output: extr = [f.__name__ for f in sentim_analyzer.feat_extractors] output_markdown( output, Dataset="labeled_tweets", Classifier=type(classifier).__name__, Tokenizer=tokenizer.__class__.__name__, Feats=extr, Results=results, Instances=n_instances, ) def demo_movie_reviews(trainer, n_instances=None, output=None): """ Train classifier on all instances of the Movie Reviews dataset. The corpus has been preprocessed using the default sentence tokenizer and WordPunctTokenizer. Features are composed of: - most frequent unigrams :param trainer: `train` method of a classifier. :param n_instances: the number of total reviews that have to be used for training and testing. Reviews will be equally split between positive and negative. :param output: the output file where results have to be reported. 
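# Hedged usage sketch: the movie-review demo documented just above can be run
# directly once the 'movie_reviews' corpus is available; a small n_instances
# keeps training quick.
from nltk.classify import NaiveBayesClassifier
from nltk.sentiment.util import demo_movie_reviews

demo_movie_reviews(NaiveBayesClassifier.train, n_instances=200)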
""" from nltk.corpus import movie_reviews from nltk.sentiment import SentimentAnalyzer if n_instances is not None: n_instances = int(n_instances / 2) pos_docs = [ (list(movie_reviews.words(pos_id)), "pos") for pos_id in movie_reviews.fileids("pos")[:n_instances] ] neg_docs = [ (list(movie_reviews.words(neg_id)), "neg") for neg_id in movie_reviews.fileids("neg")[:n_instances] ] # We separately split positive and negative instances to keep a balanced # uniform class distribution in both train and test sets. train_pos_docs, test_pos_docs = split_train_test(pos_docs) train_neg_docs, test_neg_docs = split_train_test(neg_docs) training_docs = train_pos_docs + train_neg_docs testing_docs = test_pos_docs + test_neg_docs sentim_analyzer = SentimentAnalyzer() all_words = sentim_analyzer.all_words(training_docs) # Add simple unigram word features unigram_feats = sentim_analyzer.unigram_word_feats(all_words, min_freq=4) sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats) # Apply features to obtain a feature-value representation of our datasets training_set = sentim_analyzer.apply_features(training_docs) test_set = sentim_analyzer.apply_features(testing_docs) classifier = sentim_analyzer.train(trainer, training_set) try: classifier.show_most_informative_features() except AttributeError: print( "Your classifier does not provide a show_most_informative_features() method." ) results = sentim_analyzer.evaluate(test_set) if output: extr = [f.__name__ for f in sentim_analyzer.feat_extractors] output_markdown( output, Dataset="Movie_reviews", Classifier=type(classifier).__name__, Tokenizer="WordPunctTokenizer", Feats=extr, Results=results, Instances=n_instances, ) def demo_subjectivity(trainer, save_analyzer=False, n_instances=None, output=None): """ Train and test a classifier on instances of the Subjective Dataset by Pang and Lee. The dataset is made of 5000 subjective and 5000 objective sentences. All tokens (words and punctuation marks) are separated by a whitespace, so we use the basic WhitespaceTokenizer to parse the data. :param trainer: `train` method of a classifier. :param save_analyzer: if `True`, store the SentimentAnalyzer in a pickle file. :param n_instances: the number of total sentences that have to be used for training and testing. Sentences will be equally split between positive and negative. :param output: the output file where results have to be reported. """ from nltk.corpus import subjectivity from nltk.sentiment import SentimentAnalyzer if n_instances is not None: n_instances = int(n_instances / 2) subj_docs = [ (sent, "subj") for sent in subjectivity.sents(categories="subj")[:n_instances] ] obj_docs = [ (sent, "obj") for sent in subjectivity.sents(categories="obj")[:n_instances] ] # We separately split subjective and objective instances to keep a balanced # uniform class distribution in both train and test sets. 
train_subj_docs, test_subj_docs = split_train_test(subj_docs) train_obj_docs, test_obj_docs = split_train_test(obj_docs) training_docs = train_subj_docs + train_obj_docs testing_docs = test_subj_docs + test_obj_docs sentim_analyzer = SentimentAnalyzer() all_words_neg = sentim_analyzer.all_words( [mark_negation(doc) for doc in training_docs] ) # Add simple unigram word features handling negation unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4) sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats) # Apply features to obtain a feature-value representation of our datasets training_set = sentim_analyzer.apply_features(training_docs) test_set = sentim_analyzer.apply_features(testing_docs) classifier = sentim_analyzer.train(trainer, training_set) try: classifier.show_most_informative_features() except AttributeError: print( "Your classifier does not provide a show_most_informative_features() method." ) results = sentim_analyzer.evaluate(test_set) if save_analyzer == True: sentim_analyzer.save_file(sentim_analyzer, "sa_subjectivity.pickle") if output: extr = [f.__name__ for f in sentim_analyzer.feat_extractors] output_markdown( output, Dataset="subjectivity", Classifier=type(classifier).__name__, Tokenizer="WhitespaceTokenizer", Feats=extr, Instances=n_instances, Results=results, ) return sentim_analyzer def demo_sent_subjectivity(text): """ Classify a single sentence as subjective or objective using a stored SentimentAnalyzer. :param text: a sentence whose subjectivity has to be classified. """ from nltk.classify import NaiveBayesClassifier from nltk.tokenize import regexp word_tokenizer = regexp.WhitespaceTokenizer() try: sentim_analyzer = load("sa_subjectivity.pickle") except LookupError: print("Cannot find the sentiment analyzer you want to load.") print("Training a new one using NaiveBayesClassifier.") sentim_analyzer = demo_subjectivity(NaiveBayesClassifier.train, True) # Tokenize and convert to lower case tokenized_text = [word.lower() for word in word_tokenizer.tokenize(text)] print(sentim_analyzer.classify(tokenized_text)) def demo_liu_hu_lexicon(sentence, plot=False): """ Basic example of sentiment classification using Liu and Hu opinion lexicon. This function simply counts the number of positive, negative and neutral words in the sentence and classifies it depending on which polarity is more represented. Words that do not appear in the lexicon are considered as neutral. :param sentence: a sentence whose polarity has to be classified. :param plot: if True, plot a visual representation of the sentence polarity. """ from nltk.corpus import opinion_lexicon from nltk.tokenize import treebank tokenizer = treebank.TreebankWordTokenizer() pos_words = 0 neg_words = 0 tokenized_sent = [word.lower() for word in tokenizer.tokenize(sentence)] x = list(range(len(tokenized_sent))) # x axis for the plot y = [] for word in tokenized_sent: if word in opinion_lexicon.positive(): pos_words += 1 y.append(1) # positive elif word in opinion_lexicon.negative(): neg_words += 1 y.append(-1) # negative else: y.append(0) # neutral if pos_words > neg_words: print("Positive") elif pos_words < neg_words: print("Negative") elif pos_words == neg_words: print("Neutral") if plot == True: _show_plot( x, y, x_labels=tokenized_sent, y_labels=["Negative", "Neutral", "Positive"] ) def demo_vader_instance(text): """ Output polarity scores for a text using Vader approach. :param text: a text whose polarity has to be evaluated. 
""" from nltk.sentiment import SentimentIntensityAnalyzer vader_analyzer = SentimentIntensityAnalyzer() print(vader_analyzer.polarity_scores(text)) def demo_vader_tweets(n_instances=None, output=None): """ Classify 10000 positive and negative tweets using Vader approach. :param n_instances: the number of total tweets that have to be classified. :param output: the output file where results have to be reported. """ from collections import defaultdict from nltk.corpus import twitter_samples from nltk.metrics import accuracy as eval_accuracy from nltk.metrics import f_measure as eval_f_measure from nltk.metrics import precision as eval_precision from nltk.metrics import recall as eval_recall from nltk.sentiment import SentimentIntensityAnalyzer if n_instances is not None: n_instances = int(n_instances / 2) fields = ["id", "text"] positive_json = twitter_samples.abspath("positive_tweets.json") positive_csv = "positive_tweets.csv" json2csv_preprocess( positive_json, positive_csv, fields, strip_off_emoticons=False, limit=n_instances, ) negative_json = twitter_samples.abspath("negative_tweets.json") negative_csv = "negative_tweets.csv" json2csv_preprocess( negative_json, negative_csv, fields, strip_off_emoticons=False, limit=n_instances, ) pos_docs = parse_tweets_set(positive_csv, label="pos") neg_docs = parse_tweets_set(negative_csv, label="neg") # We separately split subjective and objective instances to keep a balanced # uniform class distribution in both train and test sets. train_pos_docs, test_pos_docs = split_train_test(pos_docs) train_neg_docs, test_neg_docs = split_train_test(neg_docs) training_tweets = train_pos_docs + train_neg_docs testing_tweets = test_pos_docs + test_neg_docs vader_analyzer = SentimentIntensityAnalyzer() gold_results = defaultdict(set) test_results = defaultdict(set) acc_gold_results = [] acc_test_results = [] labels = set() num = 0 for i, (text, label) in enumerate(testing_tweets): labels.add(label) gold_results[label].add(i) acc_gold_results.append(label) score = vader_analyzer.polarity_scores(text)["compound"] if score > 0: observed = "pos" else: observed = "neg" num += 1 acc_test_results.append(observed) test_results[observed].add(i) metrics_results = {} for label in labels: accuracy_score = eval_accuracy(acc_gold_results, acc_test_results) metrics_results["Accuracy"] = accuracy_score precision_score = eval_precision(gold_results[label], test_results[label]) metrics_results[f"Precision [{label}]"] = precision_score recall_score = eval_recall(gold_results[label], test_results[label]) metrics_results[f"Recall [{label}]"] = recall_score f_measure_score = eval_f_measure(gold_results[label], test_results[label]) metrics_results[f"F-measure [{label}]"] = f_measure_score for result in sorted(metrics_results): print(f"{result}: {metrics_results[result]}") if output: output_markdown( output, Approach="Vader", Dataset="labeled_tweets", Instances=n_instances, Results=metrics_results, ) if __name__ == "__main__": from sklearn.svm import LinearSVC from nltk.classify import MaxentClassifier, NaiveBayesClassifier from nltk.classify.scikitlearn import SklearnClassifier from nltk.twitter.common import _outf_writer, extract_fields naive_bayes = NaiveBayesClassifier.train svm = SklearnClassifier(LinearSVC()).train maxent = MaxentClassifier.train demo_tweets(naive_bayes) # demo_movie_reviews(svm) # demo_subjectivity(svm) # demo_sent_subjectivity("she's an artist , but hasn't picked up a brush in a year . 
") # demo_liu_hu_lexicon("This movie was actually neither that funny, nor super witty.", plot=True) # demo_vader_instance("This movie was actually neither that funny, nor super witty.") # demo_vader_tweets() nltk-3.7/nltk/sentiment/vader.py000066400000000000000000000505001420073152400167620ustar00rootroot00000000000000# Natural Language Toolkit: vader # # Copyright (C) 2001-2022 NLTK Project # Author: C.J. Hutto # Ewan Klein (modifications) # Pierpaolo Pantone <24alsecondo@gmail.com> (modifications) # George Berry (modifications) # Malavika Suresh (modifications) # URL: # For license information, see LICENSE.TXT # # Modifications to the original VADER code have been made in order to # integrate it into NLTK. These have involved changes to # ensure Python 3 compatibility, and refactoring to achieve greater modularity. """ If you use the VADER sentiment analysis tools, please cite: Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014. """ import math import re import string from itertools import product import nltk.data from nltk.util import pairwise class VaderConstants: """ A class to keep the Vader lists and constants. """ ##Constants## # (empirically derived mean sentiment intensity rating increase for booster words) B_INCR = 0.293 B_DECR = -0.293 # (empirically derived mean sentiment intensity rating increase for using # ALLCAPs to emphasize a word) C_INCR = 0.733 N_SCALAR = -0.74 NEGATE = { "aint", "arent", "cannot", "cant", "couldnt", "darent", "didnt", "doesnt", "ain't", "aren't", "can't", "couldn't", "daren't", "didn't", "doesn't", "dont", "hadnt", "hasnt", "havent", "isnt", "mightnt", "mustnt", "neither", "don't", "hadn't", "hasn't", "haven't", "isn't", "mightn't", "mustn't", "neednt", "needn't", "never", "none", "nope", "nor", "not", "nothing", "nowhere", "oughtnt", "shant", "shouldnt", "uhuh", "wasnt", "werent", "oughtn't", "shan't", "shouldn't", "uh-uh", "wasn't", "weren't", "without", "wont", "wouldnt", "won't", "wouldn't", "rarely", "seldom", "despite", } # booster/dampener 'intensifiers' or 'degree adverbs' # https://en.wiktionary.org/wiki/Category:English_degree_adverbs BOOSTER_DICT = { "absolutely": B_INCR, "amazingly": B_INCR, "awfully": B_INCR, "completely": B_INCR, "considerably": B_INCR, "decidedly": B_INCR, "deeply": B_INCR, "effing": B_INCR, "enormously": B_INCR, "entirely": B_INCR, "especially": B_INCR, "exceptionally": B_INCR, "extremely": B_INCR, "fabulously": B_INCR, "flipping": B_INCR, "flippin": B_INCR, "fricking": B_INCR, "frickin": B_INCR, "frigging": B_INCR, "friggin": B_INCR, "fully": B_INCR, "fucking": B_INCR, "greatly": B_INCR, "hella": B_INCR, "highly": B_INCR, "hugely": B_INCR, "incredibly": B_INCR, "intensely": B_INCR, "majorly": B_INCR, "more": B_INCR, "most": B_INCR, "particularly": B_INCR, "purely": B_INCR, "quite": B_INCR, "really": B_INCR, "remarkably": B_INCR, "so": B_INCR, "substantially": B_INCR, "thoroughly": B_INCR, "totally": B_INCR, "tremendously": B_INCR, "uber": B_INCR, "unbelievably": B_INCR, "unusually": B_INCR, "utterly": B_INCR, "very": B_INCR, "almost": B_DECR, "barely": B_DECR, "hardly": B_DECR, "just enough": B_DECR, "kind of": B_DECR, "kinda": B_DECR, "kindof": B_DECR, "kind-of": B_DECR, "less": B_DECR, "little": B_DECR, "marginally": B_DECR, "occasionally": B_DECR, "partly": B_DECR, "scarcely": B_DECR, "slightly": B_DECR, "somewhat": B_DECR, "sort of": B_DECR, "sorta": 
B_DECR, "sortof": B_DECR, "sort-of": B_DECR, } # check for special case idioms using a sentiment-laden keyword known to SAGE SPECIAL_CASE_IDIOMS = { "the shit": 3, "the bomb": 3, "bad ass": 1.5, "yeah right": -2, "cut the mustard": 2, "kiss of death": -1.5, "hand to mouth": -2, } # for removing punctuation REGEX_REMOVE_PUNCTUATION = re.compile(f"[{re.escape(string.punctuation)}]") PUNC_LIST = [ ".", "!", "?", ",", ";", ":", "-", "'", '"', "!!", "!!!", "??", "???", "?!?", "!?!", "?!?!", "!?!?", ] def __init__(self): pass def negated(self, input_words, include_nt=True): """ Determine if input contains negation words """ neg_words = self.NEGATE if any(word.lower() in neg_words for word in input_words): return True if include_nt: if any("n't" in word.lower() for word in input_words): return True for first, second in pairwise(input_words): if second.lower() == "least" and first.lower() != "at": return True return False def normalize(self, score, alpha=15): """ Normalize the score to be between -1 and 1 using an alpha that approximates the max expected value """ norm_score = score / math.sqrt((score * score) + alpha) return norm_score def scalar_inc_dec(self, word, valence, is_cap_diff): """ Check if the preceding words increase, decrease, or negate/nullify the valence """ scalar = 0.0 word_lower = word.lower() if word_lower in self.BOOSTER_DICT: scalar = self.BOOSTER_DICT[word_lower] if valence < 0: scalar *= -1 # check if booster/dampener word is in ALLCAPS (while others aren't) if word.isupper() and is_cap_diff: if valence > 0: scalar += self.C_INCR else: scalar -= self.C_INCR return scalar class SentiText: """ Identify sentiment-relevant string-level properties of input text. """ def __init__(self, text, punc_list, regex_remove_punctuation): if not isinstance(text, str): text = str(text.encode("utf-8")) self.text = text self.PUNC_LIST = punc_list self.REGEX_REMOVE_PUNCTUATION = regex_remove_punctuation self.words_and_emoticons = self._words_and_emoticons() # doesn't separate words from # adjacent punctuation (keeps emoticons & contractions) self.is_cap_diff = self.allcap_differential(self.words_and_emoticons) def _words_plus_punc(self): """ Returns mapping of form: { 'cat,': 'cat', ',cat': 'cat', } """ no_punc_text = self.REGEX_REMOVE_PUNCTUATION.sub("", self.text) # removes punctuation (but loses emoticons & contractions) words_only = no_punc_text.split() # remove singletons words_only = {w for w in words_only if len(w) > 1} # the product gives ('cat', ',') and (',', 'cat') punc_before = {"".join(p): p[1] for p in product(self.PUNC_LIST, words_only)} punc_after = {"".join(p): p[0] for p in product(words_only, self.PUNC_LIST)} words_punc_dict = punc_before words_punc_dict.update(punc_after) return words_punc_dict def _words_and_emoticons(self): """ Removes leading and trailing puncutation Leaves contractions and most emoticons Does not preserve punc-plus-letter emoticons (e.g. 
:D) """ wes = self.text.split() words_punc_dict = self._words_plus_punc() wes = [we for we in wes if len(we) > 1] for i, we in enumerate(wes): if we in words_punc_dict: wes[i] = words_punc_dict[we] return wes def allcap_differential(self, words): """ Check whether just some words in the input are ALL CAPS :param list words: The words to inspect :returns: `True` if some but not all items in `words` are ALL CAPS """ is_different = False allcap_words = 0 for word in words: if word.isupper(): allcap_words += 1 cap_differential = len(words) - allcap_words if 0 < cap_differential < len(words): is_different = True return is_different class SentimentIntensityAnalyzer: """ Give a sentiment intensity score to sentences. """ def __init__( self, lexicon_file="sentiment/vader_lexicon.zip/vader_lexicon/vader_lexicon.txt", ): self.lexicon_file = nltk.data.load(lexicon_file) self.lexicon = self.make_lex_dict() self.constants = VaderConstants() def make_lex_dict(self): """ Convert lexicon file to a dictionary """ lex_dict = {} for line in self.lexicon_file.split("\n"): (word, measure) = line.strip().split("\t")[0:2] lex_dict[word] = float(measure) return lex_dict def polarity_scores(self, text): """ Return a float for sentiment strength based on the input text. Positive values are positive valence, negative value are negative valence. """ # text, words_and_emoticons, is_cap_diff = self.preprocess(text) sentitext = SentiText( text, self.constants.PUNC_LIST, self.constants.REGEX_REMOVE_PUNCTUATION ) sentiments = [] words_and_emoticons = sentitext.words_and_emoticons for item in words_and_emoticons: valence = 0 i = words_and_emoticons.index(item) if ( i < len(words_and_emoticons) - 1 and item.lower() == "kind" and words_and_emoticons[i + 1].lower() == "of" ) or item.lower() in self.constants.BOOSTER_DICT: sentiments.append(valence) continue sentiments = self.sentiment_valence(valence, sentitext, item, i, sentiments) sentiments = self._but_check(words_and_emoticons, sentiments) return self.score_valence(sentiments, text) def sentiment_valence(self, valence, sentitext, item, i, sentiments): is_cap_diff = sentitext.is_cap_diff words_and_emoticons = sentitext.words_and_emoticons item_lowercase = item.lower() if item_lowercase in self.lexicon: # get the sentiment valence valence = self.lexicon[item_lowercase] # check if sentiment laden word is in ALL CAPS (while others aren't) if item.isupper() and is_cap_diff: if valence > 0: valence += self.constants.C_INCR else: valence -= self.constants.C_INCR for start_i in range(0, 3): if ( i > start_i and words_and_emoticons[i - (start_i + 1)].lower() not in self.lexicon ): # dampen the scalar modifier of preceding words and emoticons # (excluding the ones that immediately preceed the item) based # on their distance from the current item. 
s = self.constants.scalar_inc_dec( words_and_emoticons[i - (start_i + 1)], valence, is_cap_diff ) if start_i == 1 and s != 0: s = s * 0.95 if start_i == 2 and s != 0: s = s * 0.9 valence = valence + s valence = self._never_check( valence, words_and_emoticons, start_i, i ) if start_i == 2: valence = self._idioms_check(valence, words_and_emoticons, i) # future work: consider other sentiment-laden idioms # other_idioms = # {"back handed": -2, "blow smoke": -2, "blowing smoke": -2, # "upper hand": 1, "break a leg": 2, # "cooking with gas": 2, "in the black": 2, "in the red": -2, # "on the ball": 2,"under the weather": -2} valence = self._least_check(valence, words_and_emoticons, i) sentiments.append(valence) return sentiments def _least_check(self, valence, words_and_emoticons, i): # check for negation case using "least" if ( i > 1 and words_and_emoticons[i - 1].lower() not in self.lexicon and words_and_emoticons[i - 1].lower() == "least" ): if ( words_and_emoticons[i - 2].lower() != "at" and words_and_emoticons[i - 2].lower() != "very" ): valence = valence * self.constants.N_SCALAR elif ( i > 0 and words_and_emoticons[i - 1].lower() not in self.lexicon and words_and_emoticons[i - 1].lower() == "least" ): valence = valence * self.constants.N_SCALAR return valence def _but_check(self, words_and_emoticons, sentiments): words_and_emoticons = [w_e.lower() for w_e in words_and_emoticons] but = {"but"} & set(words_and_emoticons) if but: bi = words_and_emoticons.index(next(iter(but))) for sidx, sentiment in enumerate(sentiments): if sidx < bi: sentiments[sidx] = sentiment * 0.5 elif sidx > bi: sentiments[sidx] = sentiment * 1.5 return sentiments def _idioms_check(self, valence, words_and_emoticons, i): onezero = f"{words_and_emoticons[i - 1]} {words_and_emoticons[i]}" twoonezero = "{} {} {}".format( words_and_emoticons[i - 2], words_and_emoticons[i - 1], words_and_emoticons[i], ) twoone = f"{words_and_emoticons[i - 2]} {words_and_emoticons[i - 1]}" threetwoone = "{} {} {}".format( words_and_emoticons[i - 3], words_and_emoticons[i - 2], words_and_emoticons[i - 1], ) threetwo = "{} {}".format( words_and_emoticons[i - 3], words_and_emoticons[i - 2] ) sequences = [onezero, twoonezero, twoone, threetwoone, threetwo] for seq in sequences: if seq in self.constants.SPECIAL_CASE_IDIOMS: valence = self.constants.SPECIAL_CASE_IDIOMS[seq] break if len(words_and_emoticons) - 1 > i: zeroone = f"{words_and_emoticons[i]} {words_and_emoticons[i + 1]}" if zeroone in self.constants.SPECIAL_CASE_IDIOMS: valence = self.constants.SPECIAL_CASE_IDIOMS[zeroone] if len(words_and_emoticons) - 1 > i + 1: zeroonetwo = "{} {} {}".format( words_and_emoticons[i], words_and_emoticons[i + 1], words_and_emoticons[i + 2], ) if zeroonetwo in self.constants.SPECIAL_CASE_IDIOMS: valence = self.constants.SPECIAL_CASE_IDIOMS[zeroonetwo] # check for booster/dampener bi-grams such as 'sort of' or 'kind of' if ( threetwo in self.constants.BOOSTER_DICT or twoone in self.constants.BOOSTER_DICT ): valence = valence + self.constants.B_DECR return valence def _never_check(self, valence, words_and_emoticons, start_i, i): if start_i == 0: if self.constants.negated([words_and_emoticons[i - 1]]): valence = valence * self.constants.N_SCALAR if start_i == 1: if words_and_emoticons[i - 2] == "never" and ( words_and_emoticons[i - 1] == "so" or words_and_emoticons[i - 1] == "this" ): valence = valence * 1.5 elif self.constants.negated([words_and_emoticons[i - (start_i + 1)]]): valence = valence * self.constants.N_SCALAR if start_i == 2: if ( 
words_and_emoticons[i - 3] == "never" and ( words_and_emoticons[i - 2] == "so" or words_and_emoticons[i - 2] == "this" ) or ( words_and_emoticons[i - 1] == "so" or words_and_emoticons[i - 1] == "this" ) ): valence = valence * 1.25 elif self.constants.negated([words_and_emoticons[i - (start_i + 1)]]): valence = valence * self.constants.N_SCALAR return valence def _punctuation_emphasis(self, sum_s, text): # add emphasis from exclamation points and question marks ep_amplifier = self._amplify_ep(text) qm_amplifier = self._amplify_qm(text) punct_emph_amplifier = ep_amplifier + qm_amplifier return punct_emph_amplifier def _amplify_ep(self, text): # check for added emphasis resulting from exclamation points (up to 4 of them) ep_count = text.count("!") if ep_count > 4: ep_count = 4 # (empirically derived mean sentiment intensity rating increase for # exclamation points) ep_amplifier = ep_count * 0.292 return ep_amplifier def _amplify_qm(self, text): # check for added emphasis resulting from question marks (2 or 3+) qm_count = text.count("?") qm_amplifier = 0 if qm_count > 1: if qm_count <= 3: # (empirically derived mean sentiment intensity rating increase for # question marks) qm_amplifier = qm_count * 0.18 else: qm_amplifier = 0.96 return qm_amplifier def _sift_sentiment_scores(self, sentiments): # want separate positive versus negative sentiment scores pos_sum = 0.0 neg_sum = 0.0 neu_count = 0 for sentiment_score in sentiments: if sentiment_score > 0: pos_sum += ( float(sentiment_score) + 1 ) # compensates for neutral words that are counted as 1 if sentiment_score < 0: neg_sum += ( float(sentiment_score) - 1 ) # when used with math.fabs(), compensates for neutrals if sentiment_score == 0: neu_count += 1 return pos_sum, neg_sum, neu_count def score_valence(self, sentiments, text): if sentiments: sum_s = float(sum(sentiments)) # compute and add emphasis from punctuation in text punct_emph_amplifier = self._punctuation_emphasis(sum_s, text) if sum_s > 0: sum_s += punct_emph_amplifier elif sum_s < 0: sum_s -= punct_emph_amplifier compound = self.constants.normalize(sum_s) # discriminate between positive, negative and neutral sentiment scores pos_sum, neg_sum, neu_count = self._sift_sentiment_scores(sentiments) if pos_sum > math.fabs(neg_sum): pos_sum += punct_emph_amplifier elif pos_sum < math.fabs(neg_sum): neg_sum -= punct_emph_amplifier total = pos_sum + math.fabs(neg_sum) + neu_count pos = math.fabs(pos_sum / total) neg = math.fabs(neg_sum / total) neu = math.fabs(neu_count / total) else: compound = 0.0 pos = 0.0 neg = 0.0 neu = 0.0 sentiment_dict = { "neg": round(neg, 3), "neu": round(neu, 3), "pos": round(pos, 3), "compound": round(compound, 4), } return sentiment_dict nltk-3.7/nltk/stem/000077500000000000000000000000001420073152400142515ustar00rootroot00000000000000nltk-3.7/nltk/stem/__init__.py000066400000000000000000000023561420073152400163700ustar00rootroot00000000000000# Natural Language Toolkit: Stemmers # # Copyright (C) 2001-2022 NLTK Project # Author: Trevor Cohn # Edward Loper # Steven Bird # URL: # For license information, see LICENSE.TXT """ NLTK Stemmers Interfaces used to remove morphological affixes from words, leaving only the word stem. Stemming algorithms aim to remove those affixes required for eg. grammatical role, tense, derivational morphology leaving only the stem of the word. This is a difficult problem due to irregular words (eg. common verbs in English), complicated morphological rules, and part-of-speech and sense ambiguities (eg. 
``ceil-`` is not the stem of ``ceiling``). StemmerI defines a standard interface for stemmers. """ from nltk.stem.api import StemmerI from nltk.stem.arlstem import ARLSTem from nltk.stem.arlstem2 import ARLSTem2 from nltk.stem.cistem import Cistem from nltk.stem.isri import ISRIStemmer from nltk.stem.lancaster import LancasterStemmer from nltk.stem.porter import PorterStemmer from nltk.stem.regexp import RegexpStemmer from nltk.stem.rslp import RSLPStemmer from nltk.stem.snowball import SnowballStemmer from nltk.stem.wordnet import WordNetLemmatizer nltk-3.7/nltk/stem/api.py000066400000000000000000000013121420073152400153710ustar00rootroot00000000000000# Natural Language Toolkit: Stemmer Interface # # Copyright (C) 2001-2022 NLTK Project # Author: Trevor Cohn # Edward Loper # Steven Bird # URL: # For license information, see LICENSE.TXT from abc import ABCMeta, abstractmethod class StemmerI(metaclass=ABCMeta): """ A processing interface for removing morphological affixes from words. This process is known as stemming. """ @abstractmethod def stem(self, token): """ Strip affixes from the token and return the stem. :param token: The token that should be stemmed. :type token: str """ nltk-3.7/nltk/stem/arlstem.py000066400000000000000000000305451420073152400163010ustar00rootroot00000000000000# # Natural Language Toolkit: ARLSTem Stemmer # # Copyright (C) 2001-2022 NLTK Project # # Author: Kheireddine Abainia (x-programer) # Algorithms: Kheireddine Abainia # Siham Ouamour # Halim Sayoud # URL: # For license information, see LICENSE.TXT """ ARLSTem Arabic Stemmer The details about the implementation of this algorithm are described in: K. Abainia, S. Ouamour and H. Sayoud, A Novel Robust Arabic Light Stemmer , Journal of Experimental & Theoretical Artificial Intelligence (JETAI'17), Vol. 29, No. 3, 2017, pp. 557-573. The ARLSTem is a light Arabic stemmer that is based on removing the affixes from the word (i.e. prefixes, suffixes and infixes). It was evaluated and compared to several other stemmers using Paice's parameters (under-stemming index, over-stemming index and stemming weight), and the results showed that ARLSTem is promising and producing high performances. This stemmer is not based on any dictionary and can be used on-line effectively. """ import re from nltk.stem.api import StemmerI class ARLSTem(StemmerI): """ ARLSTem stemmer : a light Arabic Stemming algorithm without any dictionary. Department of Telecommunication & Information Processing. USTHB University, Algiers, Algeria. ARLSTem.stem(token) returns the Arabic stem for the input token. The ARLSTem Stemmer requires that all tokens are encoded using Unicode encoding. 
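    A minimal usage sketch (illustrative only, not a doctest; the sample word is
    arbitrary and no particular stem output is assumed)::

        from nltk.stem import ARLSTem
        stemmer = ARLSTem()
        stem = stemmer.stem("يستعملون")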
""" def __init__(self): # different Alif with hamza self.re_hamzated_alif = re.compile(r"[\u0622\u0623\u0625]") self.re_alifMaqsura = re.compile(r"[\u0649]") self.re_diacritics = re.compile(r"[\u064B-\u065F]") # Alif Laam, Laam Laam, Fa Laam, Fa Ba self.pr2 = ["\u0627\u0644", "\u0644\u0644", "\u0641\u0644", "\u0641\u0628"] # Ba Alif Laam, Kaaf Alif Laam, Waaw Alif Laam self.pr3 = ["\u0628\u0627\u0644", "\u0643\u0627\u0644", "\u0648\u0627\u0644"] # Fa Laam Laam, Waaw Laam Laam self.pr32 = ["\u0641\u0644\u0644", "\u0648\u0644\u0644"] # Fa Ba Alif Laam, Waaw Ba Alif Laam, Fa Kaaf Alif Laam self.pr4 = [ "\u0641\u0628\u0627\u0644", "\u0648\u0628\u0627\u0644", "\u0641\u0643\u0627\u0644", ] # Kaf Yaa, Kaf Miim self.su2 = ["\u0643\u064A", "\u0643\u0645"] # Ha Alif, Ha Miim self.su22 = ["\u0647\u0627", "\u0647\u0645"] # Kaf Miim Alif, Kaf Noon Shadda self.su3 = ["\u0643\u0645\u0627", "\u0643\u0646\u0651"] # Ha Miim Alif, Ha Noon Shadda self.su32 = ["\u0647\u0645\u0627", "\u0647\u0646\u0651"] # Alif Noon, Ya Noon, Waaw Noon self.pl_si2 = ["\u0627\u0646", "\u064A\u0646", "\u0648\u0646"] # Taa Alif Noon, Taa Ya Noon self.pl_si3 = ["\u062A\u0627\u0646", "\u062A\u064A\u0646"] # Alif Noon, Waaw Noon self.verb_su2 = ["\u0627\u0646", "\u0648\u0646"] # Siin Taa, Siin Yaa self.verb_pr2 = ["\u0633\u062A", "\u0633\u064A"] # Siin Alif, Siin Noon self.verb_pr22 = ["\u0633\u0627", "\u0633\u0646"] # Lam Noon, Lam Taa, Lam Yaa, Lam Hamza self.verb_pr33 = [ "\u0644\u0646", "\u0644\u062A", "\u0644\u064A", "\u0644\u0623", ] # Taa Miim Alif, Taa Noon Shadda self.verb_suf3 = ["\u062A\u0645\u0627", "\u062A\u0646\u0651"] # Noon Alif, Taa Miim, Taa Alif, Waaw Alif self.verb_suf2 = [ "\u0646\u0627", "\u062A\u0645", "\u062A\u0627", "\u0648\u0627", ] # Taa, Alif, Noon self.verb_suf1 = ["\u062A", "\u0627", "\u0646"] def stem(self, token): """ call this function to get the word's stem based on ARLSTem . """ try: if token is None: raise ValueError( "The word could not be stemmed, because \ it is empty !" ) # remove Arabic diacritics and replace some letters with others token = self.norm(token) # strip common prefixes of the nouns pre = self.pref(token) if pre is not None: token = pre # strip the suffixes which are common to nouns and verbs token = self.suff(token) # transform a plural noun to a singular noun ps = self.plur2sing(token) if ps is None: # transform from the feminine form to the masculine form fm = self.fem2masc(token) if fm is not None: return fm else: if pre is None: # if the prefixes are not stripped # strip the verb prefixes and suffixes return self.verb(token) else: return ps return token except ValueError as e: print(e) def norm(self, token): """ normalize the word by removing diacritics, replacing hamzated Alif with Alif replacing AlifMaqsura with Yaa and removing Waaw at the beginning. """ # strip Arabic diacritics token = self.re_diacritics.sub("", token) # replace Hamzated Alif with Alif bare token = self.re_hamzated_alif.sub("\u0627", token) # replace alifMaqsura with Yaa token = self.re_alifMaqsura.sub("\u064A", token) # strip the Waaw from the word beginning if the remaining is 3 letters # at least if token.startswith("\u0648") and len(token) > 3: token = token[1:] return token def pref(self, token): """ remove prefixes from the words' beginning. 
""" if len(token) > 5: for p3 in self.pr3: if token.startswith(p3): return token[3:] if len(token) > 6: for p4 in self.pr4: if token.startswith(p4): return token[4:] if len(token) > 5: for p3 in self.pr32: if token.startswith(p3): return token[3:] if len(token) > 4: for p2 in self.pr2: if token.startswith(p2): return token[2:] def suff(self, token): """ remove suffixes from the word's end. """ if token.endswith("\u0643") and len(token) > 3: return token[:-1] if len(token) > 4: for s2 in self.su2: if token.endswith(s2): return token[:-2] if len(token) > 5: for s3 in self.su3: if token.endswith(s3): return token[:-3] if token.endswith("\u0647") and len(token) > 3: token = token[:-1] return token if len(token) > 4: for s2 in self.su22: if token.endswith(s2): return token[:-2] if len(token) > 5: for s3 in self.su32: if token.endswith(s3): return token[:-3] if token.endswith("\u0646\u0627") and len(token) > 4: return token[:-2] return token def fem2masc(self, token): """ transform the word from the feminine form to the masculine form. """ if token.endswith("\u0629") and len(token) > 3: return token[:-1] def plur2sing(self, token): """ transform the word from the plural form to the singular form. """ if len(token) > 4: for ps2 in self.pl_si2: if token.endswith(ps2): return token[:-2] if len(token) > 5: for ps3 in self.pl_si3: if token.endswith(ps3): return token[:-3] if len(token) > 3 and token.endswith("\u0627\u062A"): return token[:-2] if len(token) > 3 and token.startswith("\u0627") and token[2] == "\u0627": return token[:2] + token[3:] if len(token) > 4 and token.startswith("\u0627") and token[-2] == "\u0627": return token[1:-2] + token[-1] def verb(self, token): """ stem the verb prefixes and suffixes or both """ vb = self.verb_t1(token) if vb is not None: return vb vb = self.verb_t2(token) if vb is not None: return vb vb = self.verb_t3(token) if vb is not None: return vb vb = self.verb_t4(token) if vb is not None: return vb vb = self.verb_t5(token) if vb is not None: return vb return self.verb_t6(token) def verb_t1(self, token): """ stem the present prefixes and suffixes """ if len(token) > 5 and token.startswith("\u062A"): # Taa for s2 in self.pl_si2: if token.endswith(s2): return token[1:-2] if len(token) > 5 and token.startswith("\u064A"): # Yaa for s2 in self.verb_su2: if token.endswith(s2): return token[1:-2] if len(token) > 4 and token.startswith("\u0627"): # Alif # Waaw Alif if len(token) > 5 and token.endswith("\u0648\u0627"): return token[1:-2] # Yaa if token.endswith("\u064A"): return token[1:-1] # Alif if token.endswith("\u0627"): return token[1:-1] # Noon if token.endswith("\u0646"): return token[1:-1] # ^Yaa, Noon$ if len(token) > 4 and token.startswith("\u064A") and token.endswith("\u0646"): return token[1:-1] # ^Taa, Noon$ if len(token) > 4 and token.startswith("\u062A") and token.endswith("\u0646"): return token[1:-1] def verb_t2(self, token): """ stem the future prefixes and suffixes """ if len(token) > 6: for s2 in self.pl_si2: # ^Siin Taa if token.startswith(self.verb_pr2[0]) and token.endswith(s2): return token[2:-2] # ^Siin Yaa, Alif Noon$ if token.startswith(self.verb_pr2[1]) and token.endswith(self.pl_si2[0]): return token[2:-2] # ^Siin Yaa, Waaw Noon$ if token.startswith(self.verb_pr2[1]) and token.endswith(self.pl_si2[2]): return token[2:-2] # ^Siin Taa, Noon$ if ( len(token) > 5 and token.startswith(self.verb_pr2[0]) and token.endswith("\u0646") ): return token[2:-1] # ^Siin Yaa, Noon$ if ( len(token) > 5 and token.startswith(self.verb_pr2[1]) and 
token.endswith("\u0646") ): return token[2:-1] def verb_t3(self, token): """ stem the present suffixes """ if len(token) > 5: for su3 in self.verb_suf3: if token.endswith(su3): return token[:-3] if len(token) > 4: for su2 in self.verb_suf2: if token.endswith(su2): return token[:-2] if len(token) > 3: for su1 in self.verb_suf1: if token.endswith(su1): return token[:-1] def verb_t4(self, token): """ stem the present prefixes """ if len(token) > 3: for pr1 in self.verb_suf1: if token.startswith(pr1): return token[1:] if token.startswith("\u064A"): return token[1:] def verb_t5(self, token): """ stem the future prefixes """ if len(token) > 4: for pr2 in self.verb_pr22: if token.startswith(pr2): return token[2:] for pr2 in self.verb_pr2: if token.startswith(pr2): return token[2:] return token def verb_t6(self, token): """ stem the order prefixes """ if len(token) > 4: for pr3 in self.verb_pr33: if token.startswith(pr3): return token[2:] return token nltk-3.7/nltk/stem/arlstem2.py000066400000000000000000000372771420073152400163740ustar00rootroot00000000000000# # Natural Language Toolkit: ARLSTem Stemmer v2 # # Copyright (C) 2001-2022 NLTK Project # # Author: Kheireddine Abainia (x-programer) # Algorithms: Kheireddine Abainia # Hamza Rebbani # URL: # For license information, see LICENSE.TXT """ ARLSTem2 Arabic Light Stemmer The details about the implementation of this algorithm are described in: K. Abainia and H. Rebbani, Comparing the Effectiveness of the Improved ARLSTem Algorithm with Existing Arabic Light Stemmers, International Conference on Theoretical and Applicative Aspects of Computer Science (ICTAACS'19), Skikda, Algeria, December 15-16, 2019. ARLSTem2 is an Arabic light stemmer based on removing the affixes from the words (i.e. prefixes, suffixes and infixes). It is an improvement of the previous Arabic light stemmer (ARLSTem). The new version was compared to the original algorithm and several existing Arabic light stemmers, where the results showed that the new version considerably improves the under-stemming errors that are common to light stemmers. Both ARLSTem and ARLSTem2 can be run online and do not use any dictionary. """ import re from nltk.stem.api import StemmerI class ARLSTem2(StemmerI): """ Return a stemmed Arabic word after removing affixes. This an improved version of the previous algorithm, which reduces under-stemming errors. Typically used in Arabic search engine, information retrieval and NLP. 
>>> from nltk.stem import arlstem2 >>> stemmer = ARLSTem2() >>> word = stemmer.stem('يعمل') >>> print(word) :param token: The input Arabic word (unicode) to be stemmed :type token: unicode :return: A unicode Arabic word """ def __init__(self): # different Alif with hamza self.re_hamzated_alif = re.compile(r"[\u0622\u0623\u0625]") self.re_alifMaqsura = re.compile(r"[\u0649]") self.re_diacritics = re.compile(r"[\u064B-\u065F]") # Alif Laam, Laam Laam, Fa Laam, Fa Ba self.pr2 = ["\u0627\u0644", "\u0644\u0644", "\u0641\u0644", "\u0641\u0628"] # Ba Alif Laam, Kaaf Alif Laam, Waaw Alif Laam self.pr3 = ["\u0628\u0627\u0644", "\u0643\u0627\u0644", "\u0648\u0627\u0644"] # Fa Laam Laam, Waaw Laam Laam self.pr32 = ["\u0641\u0644\u0644", "\u0648\u0644\u0644"] # Fa Ba Alif Laam, Waaw Ba Alif Laam, Fa Kaaf Alif Laam self.pr4 = [ "\u0641\u0628\u0627\u0644", "\u0648\u0628\u0627\u0644", "\u0641\u0643\u0627\u0644", ] # Kaf Yaa, Kaf Miim self.su2 = ["\u0643\u064A", "\u0643\u0645"] # Ha Alif, Ha Miim self.su22 = ["\u0647\u0627", "\u0647\u0645"] # Kaf Miim Alif, Kaf Noon Shadda self.su3 = ["\u0643\u0645\u0627", "\u0643\u0646\u0651"] # Ha Miim Alif, Ha Noon Shadda self.su32 = ["\u0647\u0645\u0627", "\u0647\u0646\u0651"] # Alif Noon, Ya Noon, Waaw Noon self.pl_si2 = ["\u0627\u0646", "\u064A\u0646", "\u0648\u0646"] # Taa Alif Noon, Taa Ya Noon self.pl_si3 = ["\u062A\u0627\u0646", "\u062A\u064A\u0646"] # Alif Noon, Waaw Noon self.verb_su2 = ["\u0627\u0646", "\u0648\u0646"] # Siin Taa, Siin Yaa self.verb_pr2 = ["\u0633\u062A", "\u0633\u064A"] # Siin Alif, Siin Noon self.verb_pr22 = ["\u0633\u0627", "\u0633\u0646"] # Lam Noon, Lam Taa, Lam Yaa, Lam Hamza self.verb_pr33 = [ "\u0644\u0646", "\u0644\u062A", "\u0644\u064A", "\u0644\u0623", ] # Taa Miim Alif, Taa Noon Shadda self.verb_suf3 = ["\u062A\u0645\u0627", "\u062A\u0646\u0651"] # Noon Alif, Taa Miim, Taa Alif, Waaw Alif self.verb_suf2 = [ "\u0646\u0627", "\u062A\u0645", "\u062A\u0627", "\u0648\u0627", ] # Taa, Alif, Noon self.verb_suf1 = ["\u062A", "\u0627", "\u0646"] def stem1(self, token): """ call this function to get the first stem """ try: if token is None: raise ValueError( "The word could not be stemmed, because \ it is empty !" ) self.is_verb = False # remove Arabic diacritics and replace some letters with others token = self.norm(token) # strip the common noun prefixes pre = self.pref(token) if pre is not None: token = pre # transform the feminine form to masculine form fm = self.fem2masc(token) if fm is not None: return fm # strip the adjective affixes adj = self.adjective(token) if adj is not None: return adj # strip the suffixes that are common to nouns and verbs token = self.suff(token) # transform a plural noun to a singular noun ps = self.plur2sing(token) if ps is None: if pre is None: # if the noun prefixes are not stripped # strip the verb prefixes and suffixes verb = self.verb(token) if verb is not None: self.is_verb = True return verb else: return ps return token except ValueError as e: print(e) def stem(self, token): # stem the input word try: if token is None: raise ValueError( "The word could not be stemmed, because \ it is empty !" 
) # run the first round of stemming token = self.stem1(token) # check if there is some additional noun affixes if len(token) > 4: # ^Taa, $Yaa + char if token.startswith("\u062A") and token[-2] == "\u064A": token = token[1:-2] + token[-1] return token # ^Miim, $Waaw + char if token.startswith("\u0645") and token[-2] == "\u0648": token = token[1:-2] + token[-1] return token if len(token) > 3: # !^Alif, $Yaa if not token.startswith("\u0627") and token.endswith("\u064A"): token = token[:-1] return token # $Laam if token.startswith("\u0644"): return token[1:] return token except ValueError as e: print(e) def norm(self, token): """ normalize the word by removing diacritics, replace hamzated Alif with Alif bare, replace AlifMaqsura with Yaa and remove Waaw at the beginning. """ # strip Arabic diacritics token = self.re_diacritics.sub("", token) # replace Hamzated Alif with Alif bare token = self.re_hamzated_alif.sub("\u0627", token) # replace alifMaqsura with Yaa token = self.re_alifMaqsura.sub("\u064A", token) # strip the Waaw from the word beginning if the remaining is # tri-literal at least if token.startswith("\u0648") and len(token) > 3: token = token[1:] return token def pref(self, token): """ remove prefixes from the words' beginning. """ if len(token) > 5: for p3 in self.pr3: if token.startswith(p3): return token[3:] if len(token) > 6: for p4 in self.pr4: if token.startswith(p4): return token[4:] if len(token) > 5: for p3 in self.pr32: if token.startswith(p3): return token[3:] if len(token) > 4: for p2 in self.pr2: if token.startswith(p2): return token[2:] def adjective(self, token): """ remove the infixes from adjectives """ # ^Alif, Alif, $Yaa if len(token) > 5: if ( token.startswith("\u0627") and token[-3] == "\u0627" and token.endswith("\u064A") ): return token[:-3] + token[-2] def suff(self, token): """ remove the suffixes from the word's ending. """ if token.endswith("\u0643") and len(token) > 3: return token[:-1] if len(token) > 4: for s2 in self.su2: if token.endswith(s2): return token[:-2] if len(token) > 5: for s3 in self.su3: if token.endswith(s3): return token[:-3] if token.endswith("\u0647") and len(token) > 3: token = token[:-1] return token if len(token) > 4: for s2 in self.su22: if token.endswith(s2): return token[:-2] if len(token) > 5: for s3 in self.su32: if token.endswith(s3): return token[:-3] # $Noon and Alif if token.endswith("\u0646\u0627") and len(token) > 4: return token[:-2] return token def fem2masc(self, token): """ transform the word from the feminine form to the masculine form. """ if len(token) > 6: # ^Taa, Yaa, $Yaa and Taa Marbuta if ( token.startswith("\u062A") and token[-4] == "\u064A" and token.endswith("\u064A\u0629") ): return token[1:-4] + token[-3] # ^Alif, Yaa, $Yaa and Taa Marbuta if ( token.startswith("\u0627") and token[-4] == "\u0627" and token.endswith("\u064A\u0629") ): return token[:-4] + token[-3] # $Alif, Yaa and Taa Marbuta if token.endswith("\u0627\u064A\u0629") and len(token) > 5: return token[:-2] if len(token) > 4: # Alif, $Taa Marbuta if token[1] == "\u0627" and token.endswith("\u0629"): return token[0] + token[2:-1] # $Yaa and Taa Marbuta if token.endswith("\u064A\u0629"): return token[:-2] # $Taa Marbuta if token.endswith("\u0629") and len(token) > 3: return token[:-1] def plur2sing(self, token): """ transform the word from the plural form to the singular form. 
""" # ^Haa, $Noon, Waaw if len(token) > 5: if token.startswith("\u0645") and token.endswith("\u0648\u0646"): return token[1:-2] if len(token) > 4: for ps2 in self.pl_si2: if token.endswith(ps2): return token[:-2] if len(token) > 5: for ps3 in self.pl_si3: if token.endswith(ps3): return token[:-3] if len(token) > 4: # $Alif, Taa if token.endswith("\u0627\u062A"): return token[:-2] # ^Alif Alif if token.startswith("\u0627") and token[2] == "\u0627": return token[:2] + token[3:] # ^Alif Alif if token.startswith("\u0627") and token[-2] == "\u0627": return token[1:-2] + token[-1] def verb(self, token): """ stem the verb prefixes and suffixes or both """ vb = self.verb_t1(token) if vb is not None: return vb vb = self.verb_t2(token) if vb is not None: return vb vb = self.verb_t3(token) if vb is not None: return vb vb = self.verb_t4(token) if vb is not None: return vb vb = self.verb_t5(token) if vb is not None: return vb vb = self.verb_t6(token) return vb def verb_t1(self, token): """ stem the present tense co-occurred prefixes and suffixes """ if len(token) > 5 and token.startswith("\u062A"): # Taa for s2 in self.pl_si2: if token.endswith(s2): return token[1:-2] if len(token) > 5 and token.startswith("\u064A"): # Yaa for s2 in self.verb_su2: if token.endswith(s2): return token[1:-2] if len(token) > 4 and token.startswith("\u0627"): # Alif # Waaw Alif if len(token) > 5 and token.endswith("\u0648\u0627"): return token[1:-2] # Yaa if token.endswith("\u064A"): return token[1:-1] # Alif if token.endswith("\u0627"): return token[1:-1] # Noon if token.endswith("\u0646"): return token[1:-1] # ^Yaa, Noon$ if len(token) > 4 and token.startswith("\u064A") and token.endswith("\u0646"): return token[1:-1] # ^Taa, Noon$ if len(token) > 4 and token.startswith("\u062A") and token.endswith("\u0646"): return token[1:-1] def verb_t2(self, token): """ stem the future tense co-occurred prefixes and suffixes """ if len(token) > 6: for s2 in self.pl_si2: # ^Siin Taa if token.startswith(self.verb_pr2[0]) and token.endswith(s2): return token[2:-2] # ^Siin Yaa, Alif Noon$ if token.startswith(self.verb_pr2[1]) and token.endswith(self.pl_si2[0]): return token[2:-2] # ^Siin Yaa, Waaw Noon$ if token.startswith(self.verb_pr2[1]) and token.endswith(self.pl_si2[2]): return token[2:-2] # ^Siin Taa, Noon$ if ( len(token) > 5 and token.startswith(self.verb_pr2[0]) and token.endswith("\u0646") ): return token[2:-1] # ^Siin Yaa, Noon$ if ( len(token) > 5 and token.startswith(self.verb_pr2[1]) and token.endswith("\u0646") ): return token[2:-1] def verb_t3(self, token): """ stem the present tense suffixes """ if len(token) > 5: for su3 in self.verb_suf3: if token.endswith(su3): return token[:-3] if len(token) > 4: for su2 in self.verb_suf2: if token.endswith(su2): return token[:-2] if len(token) > 3: for su1 in self.verb_suf1: if token.endswith(su1): return token[:-1] def verb_t4(self, token): """ stem the present tense prefixes """ if len(token) > 3: for pr1 in self.verb_suf1: if token.startswith(pr1): return token[1:] if token.startswith("\u064A"): return token[1:] def verb_t5(self, token): """ stem the future tense prefixes """ if len(token) > 4: for pr2 in self.verb_pr22: if token.startswith(pr2): return token[2:] for pr2 in self.verb_pr2: if token.startswith(pr2): return token[2:] def verb_t6(self, token): """ stem the imperative tense prefixes """ if len(token) > 4: for pr3 in self.verb_pr33: if token.startswith(pr3): return token[2:] return token 
nltk-3.7/nltk/stem/cistem.py000066400000000000000000000156121420073152400161140ustar00rootroot00000000000000# Natural Language Toolkit: CISTEM Stemmer for German # Copyright (C) 2001-2022 NLTK Project # Author: Leonie Weissweiler # Tom Aarsen <> (modifications) # Algorithm: Leonie Weissweiler # Alexander Fraser # URL: # For license information, see LICENSE.TXT import re from typing import Tuple from nltk.stem.api import StemmerI class Cistem(StemmerI): """ CISTEM Stemmer for German This is the official Python implementation of the CISTEM stemmer. It is based on the paper Leonie Weissweiler, Alexander Fraser (2017). Developing a Stemmer for German Based on a Comparative Analysis of Publicly Available Stemmers. In Proceedings of the German Society for Computational Linguistics and Language Technology (GSCL) which can be read here: https://www.cis.lmu.de/~weissweiler/cistem/ In the paper, we conducted an analysis of publicly available stemmers, developed two gold standards for German stemming and evaluated the stemmers based on the two gold standards. We then proposed the stemmer implemented here and show that it achieves slightly better f-measure than the other stemmers and is thrice as fast as the Snowball stemmer for German while being about as fast as most other stemmers. case_insensitive is a a boolean specifying if case-insensitive stemming should be used. Case insensitivity improves performance only if words in the text may be incorrectly upper case. For all-lowercase and correctly cased text, best performance is achieved by setting case_insensitive for false. :param case_insensitive: if True, the stemming is case insensitive. False by default. :type case_insensitive: bool """ strip_ge = re.compile(r"^ge(.{4,})") repl_xx = re.compile(r"(.)\1") strip_emr = re.compile(r"e[mr]$") strip_nd = re.compile(r"nd$") strip_t = re.compile(r"t$") strip_esn = re.compile(r"[esn]$") repl_xx_back = re.compile(r"(.)\*") def __init__(self, case_insensitive: bool = False): self._case_insensitive = case_insensitive @staticmethod def replace_to(word: str) -> str: word = word.replace("sch", "$") word = word.replace("ei", "%") word = word.replace("ie", "&") word = Cistem.repl_xx.sub(r"\1*", word) return word @staticmethod def replace_back(word: str) -> str: word = Cistem.repl_xx_back.sub(r"\1\1", word) word = word.replace("%", "ei") word = word.replace("&", "ie") word = word.replace("$", "sch") return word def stem(self, word: str) -> str: """Stems the input word. :param word: The word that is to be stemmed. :type word: str :return: The stemmed word. :rtype: str >>> from nltk.stem.cistem import Cistem >>> stemmer = Cistem() >>> s1 = "Speicherbehältern" >>> stemmer.stem(s1) 'speicherbehalt' >>> s2 = "Grenzpostens" >>> stemmer.stem(s2) 'grenzpost' >>> s3 = "Ausgefeiltere" >>> stemmer.stem(s3) 'ausgefeilt' >>> stemmer = Cistem(True) >>> stemmer.stem(s1) 'speicherbehal' >>> stemmer.stem(s2) 'grenzpo' >>> stemmer.stem(s3) 'ausgefeil' """ if len(word) == 0: return word upper = word[0].isupper() word = word.lower() word = word.replace("ü", "u") word = word.replace("ö", "o") word = word.replace("ä", "a") word = word.replace("ß", "ss") word = Cistem.strip_ge.sub(r"\1", word) return self._segment_inner(word, upper)[0] def segment(self, word: str) -> Tuple[str, str]: """ This method works very similarly to stem (:func:'cistem.stem'). The difference is that in addition to returning the stem, it also returns the rest that was removed at the end. 
To be able to return the stem unchanged so the stem and the rest can be concatenated to form the original word, all subsitutions that altered the stem in any other way than by removing letters at the end were left out. :param word: The word that is to be stemmed. :type word: str :return: A tuple of the stemmed word and the removed suffix. :rtype: Tuple[str, str] >>> from nltk.stem.cistem import Cistem >>> stemmer = Cistem() >>> s1 = "Speicherbehältern" >>> stemmer.segment(s1) ('speicherbehält', 'ern') >>> s2 = "Grenzpostens" >>> stemmer.segment(s2) ('grenzpost', 'ens') >>> s3 = "Ausgefeiltere" >>> stemmer.segment(s3) ('ausgefeilt', 'ere') >>> stemmer = Cistem(True) >>> stemmer.segment(s1) ('speicherbehäl', 'tern') >>> stemmer.segment(s2) ('grenzpo', 'stens') >>> stemmer.segment(s3) ('ausgefeil', 'tere') """ if len(word) == 0: return ("", "") upper = word[0].isupper() word = word.lower() return self._segment_inner(word, upper) def _segment_inner(self, word: str, upper: bool): """Inner method for iteratively applying the code stemming regexes. This method receives a pre-processed variant of the word to be stemmed, or the word to be segmented, and returns a tuple of the word and the removed suffix. :param word: A pre-processed variant of the word that is to be stemmed. :type word: str :param upper: Whether the original word started with a capital letter. :type upper: bool :return: A tuple of the stemmed word and the removed suffix. :rtype: Tuple[str, str] """ rest_length = 0 word_copy = word[:] # Pre-processing before applying the substitution patterns word = Cistem.replace_to(word) rest = "" # Apply the substitution patterns while len(word) > 3: if len(word) > 5: word, n = Cistem.strip_emr.subn("", word) if n != 0: rest_length += 2 continue word, n = Cistem.strip_nd.subn("", word) if n != 0: rest_length += 2 continue if not upper or self._case_insensitive: word, n = Cistem.strip_t.subn("", word) if n != 0: rest_length += 1 continue word, n = Cistem.strip_esn.subn("", word) if n != 0: rest_length += 1 continue else: break # Post-processing after applying the substitution patterns word = Cistem.replace_back(word) if rest_length: rest = word_copy[-rest_length:] return (word, rest) nltk-3.7/nltk/stem/isri.py000066400000000000000000000344031420073152400155750ustar00rootroot00000000000000# # Natural Language Toolkit: The ISRI Arabic Stemmer # # Copyright (C) 2001-2022 NLTK Project # Algorithm: Kazem Taghva, Rania Elkhoury, and Jeffrey Coombs (2005) # Author: Hosam Algasaier # URL: # For license information, see LICENSE.TXT """ ISRI Arabic Stemmer The algorithm for this stemmer is described in: Taghva, K., Elkoury, R., and Coombs, J. 2005. Arabic Stemming without a root dictionary. Information Science Research Institute. University of Nevada, Las Vegas, USA. The Information Science Research Institute’s (ISRI) Arabic stemmer shares many features with the Khoja stemmer. However, the main difference is that ISRI stemmer does not use root dictionary. Also, if a root is not found, ISRI stemmer returned normalized form, rather than returning the original unmodified word. Additional adjustments were made to improve the algorithm: 1- Adding 60 stop words. 2- Adding the pattern (تفاعيل) to ISRI pattern set. 3- The step 2 in the original algorithm was normalizing all hamza. This step is discarded because it increases the word ambiguities and changes the original root. 
""" import re from nltk.stem.api import StemmerI class ISRIStemmer(StemmerI): """ ISRI Arabic stemmer based on algorithm: Arabic Stemming without a root dictionary. Information Science Research Institute. University of Nevada, Las Vegas, USA. A few minor modifications have been made to ISRI basic algorithm. See the source code of this module for more information. isri.stem(token) returns Arabic root for the given token. The ISRI Stemmer requires that all tokens have Unicode string types. If you use Python IDLE on Arabic Windows you have to decode text first using Arabic '1256' coding. """ def __init__(self): # length three prefixes self.p3 = [ "\u0643\u0627\u0644", "\u0628\u0627\u0644", "\u0648\u0644\u0644", "\u0648\u0627\u0644", ] # length two prefixes self.p2 = ["\u0627\u0644", "\u0644\u0644"] # length one prefixes self.p1 = [ "\u0644", "\u0628", "\u0641", "\u0633", "\u0648", "\u064a", "\u062a", "\u0646", "\u0627", ] # length three suffixes self.s3 = [ "\u062a\u0645\u0644", "\u0647\u0645\u0644", "\u062a\u0627\u0646", "\u062a\u064a\u0646", "\u0643\u0645\u0644", ] # length two suffixes self.s2 = [ "\u0648\u0646", "\u0627\u062a", "\u0627\u0646", "\u064a\u0646", "\u062a\u0646", "\u0643\u0645", "\u0647\u0646", "\u0646\u0627", "\u064a\u0627", "\u0647\u0627", "\u062a\u0645", "\u0643\u0646", "\u0646\u064a", "\u0648\u0627", "\u0645\u0627", "\u0647\u0645", ] # length one suffixes self.s1 = ["\u0629", "\u0647", "\u064a", "\u0643", "\u062a", "\u0627", "\u0646"] # groups of length four patterns self.pr4 = { 0: ["\u0645"], 1: ["\u0627"], 2: ["\u0627", "\u0648", "\u064A"], 3: ["\u0629"], } # Groups of length five patterns and length three roots self.pr53 = { 0: ["\u0627", "\u062a"], 1: ["\u0627", "\u064a", "\u0648"], 2: ["\u0627", "\u062a", "\u0645"], 3: ["\u0645", "\u064a", "\u062a"], 4: ["\u0645", "\u062a"], 5: ["\u0627", "\u0648"], 6: ["\u0627", "\u0645"], } self.re_short_vowels = re.compile(r"[\u064B-\u0652]") self.re_hamza = re.compile(r"[\u0621\u0624\u0626]") self.re_initial_hamza = re.compile(r"^[\u0622\u0623\u0625]") self.stop_words = [ "\u064a\u0643\u0648\u0646", "\u0648\u0644\u064a\u0633", "\u0648\u0643\u0627\u0646", "\u0643\u0630\u0644\u0643", "\u0627\u0644\u062a\u064a", "\u0648\u0628\u064a\u0646", "\u0639\u0644\u064a\u0647\u0627", "\u0645\u0633\u0627\u0621", "\u0627\u0644\u0630\u064a", "\u0648\u0643\u0627\u0646\u062a", "\u0648\u0644\u0643\u0646", "\u0648\u0627\u0644\u062a\u064a", "\u062a\u0643\u0648\u0646", "\u0627\u0644\u064a\u0648\u0645", "\u0627\u0644\u0644\u0630\u064a\u0646", "\u0639\u0644\u064a\u0647", "\u0643\u0627\u0646\u062a", "\u0644\u0630\u0644\u0643", "\u0623\u0645\u0627\u0645", "\u0647\u0646\u0627\u0643", "\u0645\u0646\u0647\u0627", "\u0645\u0627\u0632\u0627\u0644", "\u0644\u0627\u0632\u0627\u0644", "\u0644\u0627\u064a\u0632\u0627\u0644", "\u0645\u0627\u064a\u0632\u0627\u0644", "\u0627\u0635\u0628\u062d", "\u0623\u0635\u0628\u062d", "\u0623\u0645\u0633\u0649", "\u0627\u0645\u0633\u0649", "\u0623\u0636\u062d\u0649", "\u0627\u0636\u062d\u0649", "\u0645\u0627\u0628\u0631\u062d", "\u0645\u0627\u0641\u062a\u0626", "\u0645\u0627\u0627\u0646\u0641\u0643", "\u0644\u0627\u0633\u064a\u0645\u0627", "\u0648\u0644\u0627\u064a\u0632\u0627\u0644", "\u0627\u0644\u062d\u0627\u0644\u064a", "\u0627\u0644\u064a\u0647\u0627", "\u0627\u0644\u0630\u064a\u0646", "\u0641\u0627\u0646\u0647", "\u0648\u0627\u0644\u0630\u064a", "\u0648\u0647\u0630\u0627", "\u0644\u0647\u0630\u0627", "\u0641\u0643\u0627\u0646", "\u0633\u062a\u0643\u0648\u0646", "\u0627\u0644\u064a\u0647", "\u064a\u0645\u0643\u0646", 
"\u0628\u0647\u0630\u0627", "\u0627\u0644\u0630\u0649", ] def stem(self, token): """ Stemming a word token using the ISRI stemmer. """ token = self.norm( token, 1 ) # remove diacritics which representing Arabic short vowels if token in self.stop_words: return token # exclude stop words from being processed token = self.pre32( token ) # remove length three and length two prefixes in this order token = self.suf32( token ) # remove length three and length two suffixes in this order token = self.waw( token ) # remove connective ‘و’ if it precedes a word beginning with ‘و’ token = self.norm(token, 2) # normalize initial hamza to bare alif # if 4 <= word length <= 7, then stem; otherwise, no stemming if len(token) == 4: # length 4 word token = self.pro_w4(token) elif len(token) == 5: # length 5 word token = self.pro_w53(token) token = self.end_w5(token) elif len(token) == 6: # length 6 word token = self.pro_w6(token) token = self.end_w6(token) elif len(token) == 7: # length 7 word token = self.suf1(token) if len(token) == 7: token = self.pre1(token) if len(token) == 6: token = self.pro_w6(token) token = self.end_w6(token) return token def norm(self, word, num=3): """ normalization: num=1 normalize diacritics num=2 normalize initial hamza num=3 both 1&2 """ if num == 1: word = self.re_short_vowels.sub("", word) elif num == 2: word = self.re_initial_hamza.sub("\u0627", word) elif num == 3: word = self.re_short_vowels.sub("", word) word = self.re_initial_hamza.sub("\u0627", word) return word def pre32(self, word): """remove length three and length two prefixes in this order""" if len(word) >= 6: for pre3 in self.p3: if word.startswith(pre3): return word[3:] if len(word) >= 5: for pre2 in self.p2: if word.startswith(pre2): return word[2:] return word def suf32(self, word): """remove length three and length two suffixes in this order""" if len(word) >= 6: for suf3 in self.s3: if word.endswith(suf3): return word[:-3] if len(word) >= 5: for suf2 in self.s2: if word.endswith(suf2): return word[:-2] return word def waw(self, word): """remove connective ‘و’ if it precedes a word beginning with ‘و’""" if len(word) >= 4 and word[:2] == "\u0648\u0648": word = word[1:] return word def pro_w4(self, word): """process length four patterns and extract length three roots""" if word[0] in self.pr4[0]: # مفعل word = word[1:] elif word[1] in self.pr4[1]: # فاعل word = word[:1] + word[2:] elif word[2] in self.pr4[2]: # فعال - فعول - فعيل word = word[:2] + word[3] elif word[3] in self.pr4[3]: # فعلة word = word[:-1] else: word = self.suf1(word) # do - normalize short sufix if len(word) == 4: word = self.pre1(word) # do - normalize short prefix return word def pro_w53(self, word): """process length five patterns and extract length three roots""" if word[2] in self.pr53[0] and word[0] == "\u0627": # افتعل - افاعل word = word[1] + word[3:] elif word[3] in self.pr53[1] and word[0] == "\u0645": # مفعول - مفعال - مفعيل word = word[1:3] + word[4] elif word[0] in self.pr53[2] and word[4] == "\u0629": # مفعلة - تفعلة - افعلة word = word[1:4] elif word[0] in self.pr53[3] and word[2] == "\u062a": # مفتعل - يفتعل - تفتعل word = word[1] + word[3:] elif word[0] in self.pr53[4] and word[2] == "\u0627": # مفاعل - تفاعل word = word[1] + word[3:] elif word[2] in self.pr53[5] and word[4] == "\u0629": # فعولة - فعالة word = word[:2] + word[3] elif word[0] in self.pr53[6] and word[1] == "\u0646": # انفعل - منفعل word = word[2:] elif word[3] == "\u0627" and word[0] == "\u0627": # افعال word = word[1:3] + word[4] elif word[4] == "\u0646" and 
word[3] == "\u0627": # فعلان word = word[:3] elif word[3] == "\u064a" and word[0] == "\u062a": # تفعيل word = word[1:3] + word[4] elif word[3] == "\u0648" and word[1] == "\u0627": # فاعول word = word[0] + word[2] + word[4] elif word[2] == "\u0627" and word[1] == "\u0648": # فواعل word = word[0] + word[3:] elif word[3] == "\u0626" and word[2] == "\u0627": # فعائل word = word[:2] + word[4] elif word[4] == "\u0629" and word[1] == "\u0627": # فاعلة word = word[0] + word[2:4] elif word[4] == "\u064a" and word[2] == "\u0627": # فعالي word = word[:2] + word[3] else: word = self.suf1(word) # do - normalize short sufix if len(word) == 5: word = self.pre1(word) # do - normalize short prefix return word def pro_w54(self, word): """process length five patterns and extract length four roots""" if word[0] in self.pr53[2]: # تفعلل - افعلل - مفعلل word = word[1:] elif word[4] == "\u0629": # فعللة word = word[:4] elif word[2] == "\u0627": # فعالل word = word[:2] + word[3:] return word def end_w5(self, word): """ending step (word of length five)""" if len(word) == 4: word = self.pro_w4(word) elif len(word) == 5: word = self.pro_w54(word) return word def pro_w6(self, word): """process length six patterns and extract length three roots""" if word.startswith("\u0627\u0633\u062a") or word.startswith( "\u0645\u0633\u062a" ): # مستفعل - استفعل word = word[3:] elif ( word[0] == "\u0645" and word[3] == "\u0627" and word[5] == "\u0629" ): # مفعالة word = word[1:3] + word[4] elif ( word[0] == "\u0627" and word[2] == "\u062a" and word[4] == "\u0627" ): # افتعال word = word[1] + word[3] + word[5] elif ( word[0] == "\u0627" and word[3] == "\u0648" and word[2] == word[4] ): # افعوعل word = word[1] + word[4:] elif ( word[0] == "\u062a" and word[2] == "\u0627" and word[4] == "\u064a" ): # تفاعيل new pattern word = word[1] + word[3] + word[5] else: word = self.suf1(word) # do - normalize short sufix if len(word) == 6: word = self.pre1(word) # do - normalize short prefix return word def pro_w64(self, word): """process length six patterns and extract length four roots""" if word[0] == "\u0627" and word[4] == "\u0627": # افعلال word = word[1:4] + word[5] elif word.startswith("\u0645\u062a"): # متفعلل word = word[2:] return word def end_w6(self, word): """ending step (word of length six)""" if len(word) == 5: word = self.pro_w53(word) word = self.end_w5(word) elif len(word) == 6: word = self.pro_w64(word) return word def suf1(self, word): """normalize short sufix""" for sf1 in self.s1: if word.endswith(sf1): return word[:-1] return word def pre1(self, word): """normalize short prefix""" for sp1 in self.p1: if word.startswith(sp1): return word[1:] return word nltk-3.7/nltk/stem/lancaster.py000066400000000000000000000277241420073152400166130ustar00rootroot00000000000000# Natural Language Toolkit: Stemmers # # Copyright (C) 2001-2022 NLTK Project # Author: Steven Tomcavage # URL: # For license information, see LICENSE.TXT """ A word stemmer based on the Lancaster (Paice/Husk) stemming algorithm. Paice, Chris D. "Another Stemmer." ACM SIGIR Forum 24.3 (1990): 56-61. 
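# The ISRIStemmer whose methods end just above first strips diacritics, then
# removes length-three and length-two affixes, and only applies the pattern
# rules to tokens whose remaining length is between four and seven characters;
# stop words are returned unchanged.  A minimal usage sketch -- the two
# example words (الكتاب "the book", يكتبون "they write") are illustrative only
# and no particular stems are asserted:
from nltk.stem.isri import ISRIStemmer

isri = ISRIStemmer()
for token in ["\u0627\u0644\u0643\u062a\u0627\u0628", "\u064a\u0643\u062a\u0628\u0648\u0646"]:
    print(token, "->", isri.stem(token))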
""" import re from nltk.stem.api import StemmerI class LancasterStemmer(StemmerI): """ Lancaster Stemmer >>> from nltk.stem.lancaster import LancasterStemmer >>> st = LancasterStemmer() >>> st.stem('maximum') # Remove "-um" when word is intact 'maxim' >>> st.stem('presumably') # Don't remove "-um" when word is not intact 'presum' >>> st.stem('multiply') # No action taken if word ends with "-ply" 'multiply' >>> st.stem('provision') # Replace "-sion" with "-j" to trigger "j" set of rules 'provid' >>> st.stem('owed') # Word starting with vowel must contain at least 2 letters 'ow' >>> st.stem('ear') # ditto 'ear' >>> st.stem('saying') # Words starting with consonant must contain at least 3 'say' >>> st.stem('crying') # letters and one of those letters must be a vowel 'cry' >>> st.stem('string') # ditto 'string' >>> st.stem('meant') # ditto 'meant' >>> st.stem('cement') # ditto 'cem' >>> st_pre = LancasterStemmer(strip_prefix_flag=True) >>> st_pre.stem('kilometer') # Test Prefix 'met' >>> st_custom = LancasterStemmer(rule_tuple=("ssen4>", "s1t.")) >>> st_custom.stem("ness") # Change s to t 'nest' """ # The rule list is static since it doesn't change between instances default_rule_tuple = ( "ai*2.", # -ia > - if intact "a*1.", # -a > - if intact "bb1.", # -bb > -b "city3s.", # -ytic > -ys "ci2>", # -ic > - "cn1t>", # -nc > -nt "dd1.", # -dd > -d "dei3y>", # -ied > -y "deec2ss.", # -ceed >", -cess "dee1.", # -eed > -ee "de2>", # -ed > - "dooh4>", # -hood > - "e1>", # -e > - "feil1v.", # -lief > -liev "fi2>", # -if > - "gni3>", # -ing > - "gai3y.", # -iag > -y "ga2>", # -ag > - "gg1.", # -gg > -g "ht*2.", # -th > - if intact "hsiug5ct.", # -guish > -ct "hsi3>", # -ish > - "i*1.", # -i > - if intact "i1y>", # -i > -y "ji1d.", # -ij > -id -- see nois4j> & vis3j> "juf1s.", # -fuj > -fus "ju1d.", # -uj > -ud "jo1d.", # -oj > -od "jeh1r.", # -hej > -her "jrev1t.", # -verj > -vert "jsim2t.", # -misj > -mit "jn1d.", # -nj > -nd "j1s.", # -j > -s "lbaifi6.", # -ifiabl > - "lbai4y.", # -iabl > -y "lba3>", # -abl > - "lbi3.", # -ibl > - "lib2l>", # -bil > -bl "lc1.", # -cl > c "lufi4y.", # -iful > -y "luf3>", # -ful > - "lu2.", # -ul > - "lai3>", # -ial > - "lau3>", # -ual > - "la2>", # -al > - "ll1.", # -ll > -l "mui3.", # -ium > - "mu*2.", # -um > - if intact "msi3>", # -ism > - "mm1.", # -mm > -m "nois4j>", # -sion > -j "noix4ct.", # -xion > -ct "noi3>", # -ion > - "nai3>", # -ian > - "na2>", # -an > - "nee0.", # protect -een "ne2>", # -en > - "nn1.", # -nn > -n "pihs4>", # -ship > - "pp1.", # -pp > -p "re2>", # -er > - "rae0.", # protect -ear "ra2.", # -ar > - "ro2>", # -or > - "ru2>", # -ur > - "rr1.", # -rr > -r "rt1>", # -tr > -t "rei3y>", # -ier > -y "sei3y>", # -ies > -y "sis2.", # -sis > -s "si2>", # -is > - "ssen4>", # -ness > - "ss0.", # protect -ss "suo3>", # -ous > - "su*2.", # -us > - if intact "s*1>", # -s > - if intact "s0.", # -s > -s "tacilp4y.", # -plicat > -ply "ta2>", # -at > - "tnem4>", # -ment > - "tne3>", # -ent > - "tna3>", # -ant > - "tpir2b.", # -ript > -rib "tpro2b.", # -orpt > -orb "tcud1.", # -duct > -duc "tpmus2.", # -sumpt > -sum "tpec2iv.", # -cept > -ceiv "tulo2v.", # -olut > -olv "tsis0.", # protect -sist "tsi3>", # -ist > - "tt1.", # -tt > -t "uqi3.", # -iqu > - "ugo1.", # -ogu > -og "vis3j>", # -siv > -j "vie0.", # protect -eiv "vi2>", # -iv > - "ylb1>", # -bly > -bl "yli3y>", # -ily > -y "ylp0.", # protect -ply "yl2>", # -ly > - "ygo1.", # -ogy > -og "yhp1.", # -phy > -ph "ymo1.", # -omy > -om "ypo1.", # -opy > -op "yti3>", # -ity > - "yte3>", # -ety > - "ytl2.", # 
-lty > -l "yrtsi5.", # -istry > - "yra3>", # -ary > - "yro3>", # -ory > - "yfi3.", # -ify > - "ycn2t>", # -ncy > -nt "yca3>", # -acy > - "zi2>", # -iz > - "zy1s.", # -yz > -ys ) def __init__(self, rule_tuple=None, strip_prefix_flag=False): """Create an instance of the Lancaster stemmer.""" # Setup an empty rule dictionary - this will be filled in later self.rule_dictionary = {} # Check if a user wants to strip prefix self._strip_prefix = strip_prefix_flag # Check if a user wants to use his/her own rule tuples. self._rule_tuple = rule_tuple if rule_tuple else self.default_rule_tuple def parseRules(self, rule_tuple=None): """Validate the set of rules used in this stemmer. If this function is called as an individual method, without using stem method, rule_tuple argument will be compiled into self.rule_dictionary. If this function is called within stem, self._rule_tuple will be used. """ # If there is no argument for the function, use class' own rule tuple. rule_tuple = rule_tuple if rule_tuple else self._rule_tuple valid_rule = re.compile(r"^[a-z]+\*?\d[a-z]*[>\.]?$") # Empty any old rules from the rule set before adding new ones self.rule_dictionary = {} for rule in rule_tuple: if not valid_rule.match(rule): raise ValueError(f"The rule {rule} is invalid") first_letter = rule[0:1] if first_letter in self.rule_dictionary: self.rule_dictionary[first_letter].append(rule) else: self.rule_dictionary[first_letter] = [rule] def stem(self, word): """Stem a word using the Lancaster stemmer.""" # Lower-case the word, since all the rules are lower-cased word = word.lower() word = self.__stripPrefix(word) if self._strip_prefix else word # Save a copy of the original word intact_word = word # If rule dictionary is empty, parse rule tuple. if not self.rule_dictionary: self.parseRules() return self.__doStemming(word, intact_word) def __doStemming(self, word, intact_word): """Perform the actual word stemming""" valid_rule = re.compile(r"^([a-z]+)(\*?)(\d)([a-z]*)([>\.]?)$") proceed = True while proceed: # Find the position of the last letter of the word to be stemmed last_letter_position = self.__getLastLetter(word) # Only stem the word if it has a last letter and a rule matching that last letter if ( last_letter_position < 0 or word[last_letter_position] not in self.rule_dictionary ): proceed = False else: rule_was_applied = False # Go through each rule that matches the word's final letter for rule in self.rule_dictionary[word[last_letter_position]]: rule_match = valid_rule.match(rule) if rule_match: ( ending_string, intact_flag, remove_total, append_string, cont_flag, ) = rule_match.groups() # Convert the number of chars to remove when stemming # from a string to an integer remove_total = int(remove_total) # Proceed if word's ending matches rule's word ending if word.endswith(ending_string[::-1]): if intact_flag: if word == intact_word and self.__isAcceptable( word, remove_total ): word = self.__applyRule( word, remove_total, append_string ) rule_was_applied = True if cont_flag == ".": proceed = False break elif self.__isAcceptable(word, remove_total): word = self.__applyRule( word, remove_total, append_string ) rule_was_applied = True if cont_flag == ".": proceed = False break # If no rules apply, the word doesn't need any more stemming if rule_was_applied == False: proceed = False return word def __getLastLetter(self, word): """Get the zero-based index of the last alphabetic character in this string""" last_letter = -1 for position in range(len(word)): if word[position].isalpha(): last_letter = position 
else: break return last_letter def __isAcceptable(self, word, remove_total): """Determine if the word is acceptable for stemming.""" word_is_acceptable = False # If the word starts with a vowel, it must be at least 2 # characters long to be stemmed if word[0] in "aeiouy": if len(word) - remove_total >= 2: word_is_acceptable = True # If the word starts with a consonant, it must be at least 3 # characters long (including one vowel) to be stemmed elif len(word) - remove_total >= 3: if word[1] in "aeiouy": word_is_acceptable = True elif word[2] in "aeiouy": word_is_acceptable = True return word_is_acceptable def __applyRule(self, word, remove_total, append_string): """Apply the stemming rule to the word""" # Remove letters from the end of the word new_word_length = len(word) - remove_total word = word[0:new_word_length] # And add new letters to the end of the truncated word if append_string: word += append_string return word def __stripPrefix(self, word): """Remove prefix from a word. This function originally taken from Whoosh. """ for prefix in ( "kilo", "micro", "milli", "intra", "ultra", "mega", "nano", "pico", "pseudo", ): if word.startswith(prefix): return word[len(prefix) :] return word def __repr__(self): return "" nltk-3.7/nltk/stem/porter.py000066400000000000000000000660111420073152400161420ustar00rootroot00000000000000""" Porter Stemmer This is the Porter stemming algorithm. It follows the algorithm presented in Porter, M. "An algorithm for suffix stripping." Program 14.3 (1980): 130-137. with some optional deviations that can be turned on or off with the `mode` argument to the constructor. Martin Porter, the algorithm's inventor, maintains a web page about the algorithm at https://www.tartarus.org/~martin/PorterStemmer/ which includes another Python implementation and other implementations in many languages. """ __docformat__ = "plaintext" import re from nltk.stem.api import StemmerI class PorterStemmer(StemmerI): """ A word stemmer based on the Porter stemming algorithm. Porter, M. "An algorithm for suffix stripping." Program 14.3 (1980): 130-137. See https://www.tartarus.org/~martin/PorterStemmer/ for the homepage of the algorithm. Martin Porter has endorsed several modifications to the Porter algorithm since writing his original paper, and those extensions are included in the implementations on his website. Additionally, others have proposed further improvements to the algorithm, including NLTK contributors. There are thus three modes that can be selected by passing the appropriate constant to the class constructor's `mode` attribute: - PorterStemmer.ORIGINAL_ALGORITHM An implementation that is faithful to the original paper. Note that Martin Porter has deprecated this version of the algorithm. Martin distributes implementations of the Porter Stemmer in many languages, hosted at: https://www.tartarus.org/~martin/PorterStemmer/ and all of these implementations include his extensions. He strongly recommends against using the original, published version of the algorithm; only use this mode if you clearly understand why you are choosing to do so. - PorterStemmer.MARTIN_EXTENSIONS An implementation that only uses the modifications to the algorithm that are included in the implementations on Martin Porter's website. He has declared Porter frozen, so the behaviour of those implementations should never change. 
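# Each rule string in the Lancaster table above packs five fields into one
# token: the affected ending spelled in reverse, an optional '*' meaning
# "only if the word is still intact", the number of characters to remove, an
# optional replacement string, and a terminator ('>' = keep stemming,
# '.' = stop).  A small sketch that decodes one default rule with the same
# regular expression __doStemming uses, then runs the stemmer with the default
# table and with the custom rules from the doctest above:
import re

from nltk.stem.lancaster import LancasterStemmer

rule_re = re.compile(r"^([a-z]+)(\*?)(\d)([a-z]*)([>\.]?)$")
ending, intact_only, remove, append, cont = rule_re.match("ssen4>").groups()
print(ending[::-1], remove, cont)   # prints: ness 4 >   (strip "-ness", keep going)

print(LancasterStemmer().stem("maximum"))                            # 'maxim' (doctest above)
print(LancasterStemmer(rule_tuple=("ssen4>", "s1t.")).stem("ness"))  # 'nest' (doctest above)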
- PorterStemmer.NLTK_EXTENSIONS (default) An implementation that includes further improvements devised by NLTK contributors or taken from other modified implementations found on the web. For the best stemming, you should use the default NLTK_EXTENSIONS version. However, if you need to get the same results as either the original algorithm or one of Martin Porter's hosted versions for compatibility with an existing implementation or dataset, you can use one of the other modes instead. """ # Modes the Stemmer can be instantiated in NLTK_EXTENSIONS = "NLTK_EXTENSIONS" MARTIN_EXTENSIONS = "MARTIN_EXTENSIONS" ORIGINAL_ALGORITHM = "ORIGINAL_ALGORITHM" def __init__(self, mode=NLTK_EXTENSIONS): if mode not in ( self.NLTK_EXTENSIONS, self.MARTIN_EXTENSIONS, self.ORIGINAL_ALGORITHM, ): raise ValueError( "Mode must be one of PorterStemmer.NLTK_EXTENSIONS, " "PorterStemmer.MARTIN_EXTENSIONS, or " "PorterStemmer.ORIGINAL_ALGORITHM" ) self.mode = mode if self.mode == self.NLTK_EXTENSIONS: # This is a table of irregular forms. It is quite short, # but still reflects the errors actually drawn to Martin # Porter's attention over a 20 year period! irregular_forms = { "sky": ["sky", "skies"], "die": ["dying"], "lie": ["lying"], "tie": ["tying"], "news": ["news"], "inning": ["innings", "inning"], "outing": ["outings", "outing"], "canning": ["cannings", "canning"], "howe": ["howe"], "proceed": ["proceed"], "exceed": ["exceed"], "succeed": ["succeed"], } self.pool = {} for key in irregular_forms: for val in irregular_forms[key]: self.pool[val] = key self.vowels = frozenset(["a", "e", "i", "o", "u"]) def _is_consonant(self, word, i): """Returns True if word[i] is a consonant, False otherwise A consonant is defined in the paper as follows: A consonant in a word is a letter other than A, E, I, O or U, and other than Y preceded by a consonant. (The fact that the term `consonant' is defined to some extent in terms of itself does not make it ambiguous.) So in TOY the consonants are T and Y, and in SYZYGY they are S, Z and G. If a letter is not a consonant it is a vowel. """ if word[i] in self.vowels: return False if word[i] == "y": if i == 0: return True else: return not self._is_consonant(word, i - 1) return True def _measure(self, stem): r"""Returns the 'measure' of stem, per definition in the paper From the paper: A consonant will be denoted by c, a vowel by v. A list ccc... of length greater than 0 will be denoted by C, and a list vvv... of length greater than 0 will be denoted by V. Any word, or part of a word, therefore has one of the four forms: CVCV ... C CVCV ... V VCVC ... C VCVC ... V These may all be represented by the single form [C]VCVC ... [V] where the square brackets denote arbitrary presence of their contents. Using (VC){m} to denote VC repeated m times, this may again be written as [C](VC){m}[V]. m will be called the \measure\ of any word or word part when represented in this form. The case m = 0 covers the null word. Here are some examples: m=0 TR, EE, TREE, Y, BY. m=1 TROUBLE, OATS, TREES, IVY. m=2 TROUBLES, PRIVATE, OATEN, ORRERY. """ cv_sequence = "" # Construct a string of 'c's and 'v's representing whether each # character in `stem` is a consonant or a vowel. # e.g. 
'falafel' becomes 'cvcvcvc', # 'architecture' becomes 'vcccvcvccvcv' for i in range(len(stem)): if self._is_consonant(stem, i): cv_sequence += "c" else: cv_sequence += "v" # Count the number of 'vc' occurrences, which is equivalent to # the number of 'VC' occurrences in Porter's reduced form in the # docstring above, which is in turn equivalent to `m` return cv_sequence.count("vc") def _has_positive_measure(self, stem): return self._measure(stem) > 0 def _contains_vowel(self, stem): """Returns True if stem contains a vowel, else False""" for i in range(len(stem)): if not self._is_consonant(stem, i): return True return False def _ends_double_consonant(self, word): """Implements condition *d from the paper Returns True if word ends with a double consonant """ return ( len(word) >= 2 and word[-1] == word[-2] and self._is_consonant(word, len(word) - 1) ) def _ends_cvc(self, word): """Implements condition *o from the paper From the paper: *o - the stem ends cvc, where the second c is not W, X or Y (e.g. -WIL, -HOP). """ return ( len(word) >= 3 and self._is_consonant(word, len(word) - 3) and not self._is_consonant(word, len(word) - 2) and self._is_consonant(word, len(word) - 1) and word[-1] not in ("w", "x", "y") ) or ( self.mode == self.NLTK_EXTENSIONS and len(word) == 2 and not self._is_consonant(word, 0) and self._is_consonant(word, 1) ) def _replace_suffix(self, word, suffix, replacement): """Replaces `suffix` of `word` with `replacement""" assert word.endswith(suffix), "Given word doesn't end with given suffix" if suffix == "": return word + replacement else: return word[: -len(suffix)] + replacement def _apply_rule_list(self, word, rules): """Applies the first applicable suffix-removal rule to the word Takes a word and a list of suffix-removal rules represented as 3-tuples, with the first element being the suffix to remove, the second element being the string to replace it with, and the final element being the condition for the rule to be applicable, or None if the rule is unconditional. 
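# _apply_rule_list(), whose docstring this is, consumes (suffix, replacement,
# condition) triples, and the conditions are normally tests on the measure m
# computed by _measure() above.  A sketch that pokes at those helpers
# directly; they are private (underscore-prefixed), so this is illustrative
# only and not a supported API:
from nltk.stem.porter import PorterStemmer

p = PorterStemmer()
print(p._measure("tr"), p._measure("trouble"), p._measure("troubles"))   # 0 1 2
rules = [("ational", "ate", p._has_positive_measure)]
print(p._apply_rule_list("relational", rules))                           # 'relate'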
""" for rule in rules: suffix, replacement, condition = rule if suffix == "*d" and self._ends_double_consonant(word): stem = word[:-2] if condition is None or condition(stem): return stem + replacement else: # Don't try any further rules return word if word.endswith(suffix): stem = self._replace_suffix(word, suffix, "") if condition is None or condition(stem): return stem + replacement else: # Don't try any further rules return word return word def _step1a(self, word): """Implements Step 1a from "An algorithm for suffix stripping" From the paper: SSES -> SS caresses -> caress IES -> I ponies -> poni ties -> ti SS -> SS caress -> caress S -> cats -> cat """ # this NLTK-only rule extends the original algorithm, so # that 'flies'->'fli' but 'dies'->'die' etc if self.mode == self.NLTK_EXTENSIONS: if word.endswith("ies") and len(word) == 4: return self._replace_suffix(word, "ies", "ie") return self._apply_rule_list( word, [ ("sses", "ss", None), # SSES -> SS ("ies", "i", None), # IES -> I ("ss", "ss", None), # SS -> SS ("s", "", None), # S -> ], ) def _step1b(self, word): """Implements Step 1b from "An algorithm for suffix stripping" From the paper: (m>0) EED -> EE feed -> feed agreed -> agree (*v*) ED -> plastered -> plaster bled -> bled (*v*) ING -> motoring -> motor sing -> sing If the second or third of the rules in Step 1b is successful, the following is done: AT -> ATE conflat(ed) -> conflate BL -> BLE troubl(ed) -> trouble IZ -> IZE siz(ed) -> size (*d and not (*L or *S or *Z)) -> single letter hopp(ing) -> hop tann(ed) -> tan fall(ing) -> fall hiss(ing) -> hiss fizz(ed) -> fizz (m=1 and *o) -> E fail(ing) -> fail fil(ing) -> file The rule to map to a single letter causes the removal of one of the double letter pair. The -E is put back on -AT, -BL and -IZ, so that the suffixes -ATE, -BLE and -IZE can be recognised later. This E may be removed in step 4. """ # this NLTK-only block extends the original algorithm, so that # 'spied'->'spi' but 'died'->'die' etc if self.mode == self.NLTK_EXTENSIONS: if word.endswith("ied"): if len(word) == 4: return self._replace_suffix(word, "ied", "ie") else: return self._replace_suffix(word, "ied", "i") # (m>0) EED -> EE if word.endswith("eed"): stem = self._replace_suffix(word, "eed", "") if self._measure(stem) > 0: return stem + "ee" else: return word rule_2_or_3_succeeded = False for suffix in ["ed", "ing"]: if word.endswith(suffix): intermediate_stem = self._replace_suffix(word, suffix, "") if self._contains_vowel(intermediate_stem): rule_2_or_3_succeeded = True break if not rule_2_or_3_succeeded: return word return self._apply_rule_list( intermediate_stem, [ ("at", "ate", None), # AT -> ATE ("bl", "ble", None), # BL -> BLE ("iz", "ize", None), # IZ -> IZE # (*d and not (*L or *S or *Z)) # -> single letter ( "*d", intermediate_stem[-1], lambda stem: intermediate_stem[-1] not in ("l", "s", "z"), ), # (m=1 and *o) -> E ( "", "e", lambda stem: (self._measure(stem) == 1 and self._ends_cvc(stem)), ), ], ) def _step1c(self, word): """Implements Step 1c from "An algorithm for suffix stripping" From the paper: Step 1c (*v*) Y -> I happy -> happi sky -> sky """ def nltk_condition(stem): """ This has been modified from the original Porter algorithm so that y->i is only done when y is preceded by a consonant, but not if the stem is only a single consonant, i.e. (*c and not c) Y -> I So 'happy' -> 'happi', but 'enjoy' -> 'enjoy' etc This is a much better rule. Formerly 'enjoy'->'enjoi' and 'enjoyment'->'enjoy'. 
Step 1c is perhaps done too soon; but with this modification that no longer really matters. Also, the removal of the contains_vowel(z) condition means that 'spy', 'fly', 'try' ... stem to 'spi', 'fli', 'tri' and conflate with 'spied', 'tried', 'flies' ... """ return len(stem) > 1 and self._is_consonant(stem, len(stem) - 1) def original_condition(stem): return self._contains_vowel(stem) return self._apply_rule_list( word, [ ( "y", "i", nltk_condition if self.mode == self.NLTK_EXTENSIONS else original_condition, ) ], ) def _step2(self, word): """Implements Step 2 from "An algorithm for suffix stripping" From the paper: Step 2 (m>0) ATIONAL -> ATE relational -> relate (m>0) TIONAL -> TION conditional -> condition rational -> rational (m>0) ENCI -> ENCE valenci -> valence (m>0) ANCI -> ANCE hesitanci -> hesitance (m>0) IZER -> IZE digitizer -> digitize (m>0) ABLI -> ABLE conformabli -> conformable (m>0) ALLI -> AL radicalli -> radical (m>0) ENTLI -> ENT differentli -> different (m>0) ELI -> E vileli - > vile (m>0) OUSLI -> OUS analogousli -> analogous (m>0) IZATION -> IZE vietnamization -> vietnamize (m>0) ATION -> ATE predication -> predicate (m>0) ATOR -> ATE operator -> operate (m>0) ALISM -> AL feudalism -> feudal (m>0) IVENESS -> IVE decisiveness -> decisive (m>0) FULNESS -> FUL hopefulness -> hopeful (m>0) OUSNESS -> OUS callousness -> callous (m>0) ALITI -> AL formaliti -> formal (m>0) IVITI -> IVE sensitiviti -> sensitive (m>0) BILITI -> BLE sensibiliti -> sensible """ if self.mode == self.NLTK_EXTENSIONS: # Instead of applying the ALLI -> AL rule after '(a)bli' per # the published algorithm, instead we apply it first, and, # if it succeeds, run the result through step2 again. if word.endswith("alli") and self._has_positive_measure( self._replace_suffix(word, "alli", "") ): return self._step2(self._replace_suffix(word, "alli", "al")) bli_rule = ("bli", "ble", self._has_positive_measure) abli_rule = ("abli", "able", self._has_positive_measure) rules = [ ("ational", "ate", self._has_positive_measure), ("tional", "tion", self._has_positive_measure), ("enci", "ence", self._has_positive_measure), ("anci", "ance", self._has_positive_measure), ("izer", "ize", self._has_positive_measure), abli_rule if self.mode == self.ORIGINAL_ALGORITHM else bli_rule, ("alli", "al", self._has_positive_measure), ("entli", "ent", self._has_positive_measure), ("eli", "e", self._has_positive_measure), ("ousli", "ous", self._has_positive_measure), ("ization", "ize", self._has_positive_measure), ("ation", "ate", self._has_positive_measure), ("ator", "ate", self._has_positive_measure), ("alism", "al", self._has_positive_measure), ("iveness", "ive", self._has_positive_measure), ("fulness", "ful", self._has_positive_measure), ("ousness", "ous", self._has_positive_measure), ("aliti", "al", self._has_positive_measure), ("iviti", "ive", self._has_positive_measure), ("biliti", "ble", self._has_positive_measure), ] if self.mode == self.NLTK_EXTENSIONS: rules.append(("fulli", "ful", self._has_positive_measure)) # The 'l' of the 'logi' -> 'log' rule is put with the stem, # so that short stems like 'geo' 'theo' etc work like # 'archaeo' 'philo' etc. 
rules.append( ("logi", "log", lambda stem: self._has_positive_measure(word[:-3])) ) if self.mode == self.MARTIN_EXTENSIONS: rules.append(("logi", "log", self._has_positive_measure)) return self._apply_rule_list(word, rules) def _step3(self, word): """Implements Step 3 from "An algorithm for suffix stripping" From the paper: Step 3 (m>0) ICATE -> IC triplicate -> triplic (m>0) ATIVE -> formative -> form (m>0) ALIZE -> AL formalize -> formal (m>0) ICITI -> IC electriciti -> electric (m>0) ICAL -> IC electrical -> electric (m>0) FUL -> hopeful -> hope (m>0) NESS -> goodness -> good """ return self._apply_rule_list( word, [ ("icate", "ic", self._has_positive_measure), ("ative", "", self._has_positive_measure), ("alize", "al", self._has_positive_measure), ("iciti", "ic", self._has_positive_measure), ("ical", "ic", self._has_positive_measure), ("ful", "", self._has_positive_measure), ("ness", "", self._has_positive_measure), ], ) def _step4(self, word): """Implements Step 4 from "An algorithm for suffix stripping" Step 4 (m>1) AL -> revival -> reviv (m>1) ANCE -> allowance -> allow (m>1) ENCE -> inference -> infer (m>1) ER -> airliner -> airlin (m>1) IC -> gyroscopic -> gyroscop (m>1) ABLE -> adjustable -> adjust (m>1) IBLE -> defensible -> defens (m>1) ANT -> irritant -> irrit (m>1) EMENT -> replacement -> replac (m>1) MENT -> adjustment -> adjust (m>1) ENT -> dependent -> depend (m>1 and (*S or *T)) ION -> adoption -> adopt (m>1) OU -> homologou -> homolog (m>1) ISM -> communism -> commun (m>1) ATE -> activate -> activ (m>1) ITI -> angulariti -> angular (m>1) OUS -> homologous -> homolog (m>1) IVE -> effective -> effect (m>1) IZE -> bowdlerize -> bowdler The suffixes are now removed. All that remains is a little tidying up. """ measure_gt_1 = lambda stem: self._measure(stem) > 1 return self._apply_rule_list( word, [ ("al", "", measure_gt_1), ("ance", "", measure_gt_1), ("ence", "", measure_gt_1), ("er", "", measure_gt_1), ("ic", "", measure_gt_1), ("able", "", measure_gt_1), ("ible", "", measure_gt_1), ("ant", "", measure_gt_1), ("ement", "", measure_gt_1), ("ment", "", measure_gt_1), ("ent", "", measure_gt_1), # (m>1 and (*S or *T)) ION -> ( "ion", "", lambda stem: self._measure(stem) > 1 and stem[-1] in ("s", "t"), ), ("ou", "", measure_gt_1), ("ism", "", measure_gt_1), ("ate", "", measure_gt_1), ("iti", "", measure_gt_1), ("ous", "", measure_gt_1), ("ive", "", measure_gt_1), ("ize", "", measure_gt_1), ], ) def _step5a(self, word): """Implements Step 5a from "An algorithm for suffix stripping" From the paper: Step 5a (m>1) E -> probate -> probat rate -> rate (m=1 and not *o) E -> cease -> ceas """ # Note that Martin's test vocabulary and reference # implementations are inconsistent in how they handle the case # where two rules both refer to a suffix that matches the word # to be stemmed, but only the condition of the second one is # true. # Earlier in step2b we had the rules: # (m>0) EED -> EE # (*v*) ED -> # but the examples in the paper included "feed"->"feed", even # though (*v*) is true for "fe" and therefore the second rule # alone would map "feed"->"fe". # However, in THIS case, we need to handle the consecutive rules # differently and try both conditions (obviously; the second # rule here would be redundant otherwise). Martin's paper makes # no explicit mention of the inconsistency; you have to infer it # from the examples. # For this reason, we can't use _apply_rule_list here. 
if word.endswith("e"): stem = self._replace_suffix(word, "e", "") if self._measure(stem) > 1: return stem if self._measure(stem) == 1 and not self._ends_cvc(stem): return stem return word def _step5b(self, word): """Implements Step 5a from "An algorithm for suffix stripping" From the paper: Step 5b (m > 1 and *d and *L) -> single letter controll -> control roll -> roll """ return self._apply_rule_list( word, [("ll", "l", lambda stem: self._measure(word[:-1]) > 1)] ) def stem(self, word, to_lowercase=True): """ :param to_lowercase: if `to_lowercase=True` the word always lowercase """ stem = word.lower() if to_lowercase else word if self.mode == self.NLTK_EXTENSIONS and word in self.pool: return self.pool[stem] if self.mode != self.ORIGINAL_ALGORITHM and len(word) <= 2: # With this line, strings of length 1 or 2 don't go through # the stemming process, although no mention is made of this # in the published algorithm. return stem stem = self._step1a(stem) stem = self._step1b(stem) stem = self._step1c(stem) stem = self._step2(stem) stem = self._step3(stem) stem = self._step4(stem) stem = self._step5a(stem) stem = self._step5b(stem) return stem def __repr__(self): return "" def demo(): """ A demonstration of the porter stemmer on a sample from the Penn Treebank corpus. """ from nltk import stem from nltk.corpus import treebank stemmer = stem.PorterStemmer() orig = [] stemmed = [] for item in treebank.fileids()[:3]: for (word, tag) in treebank.tagged_words(item): orig.append(word) stemmed.append(stemmer.stem(word)) # Convert the results to a string, and word-wrap them. results = " ".join(stemmed) results = re.sub(r"(.{,70})\s", r"\1\n", results + " ").rstrip() # Convert the original to a string, and word wrap it. original = " ".join(orig) original = re.sub(r"(.{,70})\s", r"\1\n", original + " ").rstrip() # Print the results. print("-Original-".center(70).replace(" ", "*").replace("-", " ")) print(original) print("-Results-".center(70).replace(" ", "*").replace("-", " ")) print(results) print("*" * 70) nltk-3.7/nltk/stem/regexp.py000066400000000000000000000027621420073152400161240ustar00rootroot00000000000000# Natural Language Toolkit: Stemmers # # Copyright (C) 2001-2022 NLTK Project # Author: Trevor Cohn # Edward Loper # Steven Bird # URL: # For license information, see LICENSE.TXT import re from nltk.stem.api import StemmerI class RegexpStemmer(StemmerI): """ A stemmer that uses regular expressions to identify morphological affixes. Any substrings that match the regular expressions will be removed. >>> from nltk.stem import RegexpStemmer >>> st = RegexpStemmer('ing$|s$|e$|able$', min=4) >>> st.stem('cars') 'car' >>> st.stem('mass') 'mas' >>> st.stem('was') 'was' >>> st.stem('bee') 'bee' >>> st.stem('compute') 'comput' >>> st.stem('advisable') 'advis' :type regexp: str or regexp :param regexp: The regular expression that should be used to identify morphological affixes. 
:type min: int :param min: The minimum length of string to stem """ def __init__(self, regexp, min=0): if not hasattr(regexp, "pattern"): regexp = re.compile(regexp) self._regexp = regexp self._min = min def stem(self, word): if len(word) < self._min: return word else: return self._regexp.sub("", word) def __repr__(self): return f"" nltk-3.7/nltk/stem/rslp.py000066400000000000000000000123351420073152400156070ustar00rootroot00000000000000# Natural Language Toolkit: RSLP Stemmer # # Copyright (C) 2001-2022 NLTK Project # Author: Tiago Tresoldi # URL: # For license information, see LICENSE.TXT # This code is based on the algorithm presented in the paper "A Stemming # Algorithm for the Portuguese Language" by Viviane Moreira Orengo and # Christian Huyck, which unfortunately I had no access to. The code is a # Python version, with some minor modifications of mine, to the description # presented at https://www.webcitation.org/5NnvdIzOb and to the C source code # available at http://www.inf.ufrgs.br/~arcoelho/rslp/integrando_rslp.html. # Please note that this stemmer is intended for demonstration and educational # purposes only. Feel free to write me for any comments, including the # development of a different and/or better stemmer for Portuguese. I also # suggest using NLTK's mailing list for Portuguese for any discussion. # Este código é baseado no algoritmo apresentado no artigo "A Stemming # Algorithm for the Portuguese Language" de Viviane Moreira Orengo e # Christian Huyck, o qual infelizmente não tive a oportunidade de ler. O # código é uma conversão para Python, com algumas pequenas modificações # minhas, daquele apresentado em https://www.webcitation.org/5NnvdIzOb e do # código para linguagem C disponível em # http://www.inf.ufrgs.br/~arcoelho/rslp/integrando_rslp.html. Por favor, # lembre-se de que este stemmer foi desenvolvido com finalidades unicamente # de demonstração e didáticas. Sinta-se livre para me escrever para qualquer # comentário, inclusive sobre o desenvolvimento de um stemmer diferente # e/ou melhor para o português. Também sugiro utilizar-se a lista de discussão # do NLTK para o português para qualquer debate. from nltk.data import load from nltk.stem.api import StemmerI class RSLPStemmer(StemmerI): """ A stemmer for Portuguese. >>> from nltk.stem import RSLPStemmer >>> st = RSLPStemmer() >>> # opening lines of Erico Verissimo's "Música ao Longe" >>> text = ''' ... Clarissa risca com giz no quadro-negro a paisagem que os alunos ... devem copiar . Uma casinha de porta e janela , em cima duma ... coxilha .''' >>> for token in text.split(): ... print(st.stem(token)) clariss risc com giz no quadro-negr a pais que os alun dev copi . uma cas de port e janel , em cim dum coxilh . 
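# RegexpStemmer.__init__ above only compiles its argument when it does not
# already carry a .pattern attribute, so a precompiled regular expression can
# be passed in directly (and shared between stemmers).  A small sketch reusing
# the pattern from the RegexpStemmer doctest above:
import re

from nltk.stem import RegexpStemmer

suffix_pattern = re.compile(r"ing$|s$|e$|able$")
st = RegexpStemmer(suffix_pattern, min=4)
print(st.stem("cars"))   # 'car', as in the doctest
print(st.stem("was"))    # 'was'  -- shorter than min=4, left untouched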
""" def __init__(self): self._model = [] self._model.append(self.read_rule("step0.pt")) self._model.append(self.read_rule("step1.pt")) self._model.append(self.read_rule("step2.pt")) self._model.append(self.read_rule("step3.pt")) self._model.append(self.read_rule("step4.pt")) self._model.append(self.read_rule("step5.pt")) self._model.append(self.read_rule("step6.pt")) def read_rule(self, filename): rules = load("nltk:stemmers/rslp/" + filename, format="raw").decode("utf8") lines = rules.split("\n") lines = [line for line in lines if line != ""] # remove blank lines lines = [line for line in lines if line[0] != "#"] # remove comments # NOTE: a simple but ugly hack to make this parser happy with double '\t's lines = [line.replace("\t\t", "\t") for line in lines] # parse rules rules = [] for line in lines: rule = [] tokens = line.split("\t") # text to be searched for at the end of the string rule.append(tokens[0][1:-1]) # remove quotes # minimum stem size to perform the replacement rule.append(int(tokens[1])) # text to be replaced into rule.append(tokens[2][1:-1]) # remove quotes # exceptions to this rule rule.append([token[1:-1] for token in tokens[3].split(",")]) # append to the results rules.append(rule) return rules def stem(self, word): word = word.lower() # the word ends in 's'? apply rule for plural reduction if word[-1] == "s": word = self.apply_rule(word, 0) # the word ends in 'a'? apply rule for feminine reduction if word[-1] == "a": word = self.apply_rule(word, 1) # augmentative reduction word = self.apply_rule(word, 3) # adverb reduction word = self.apply_rule(word, 2) # noun reduction prev_word = word word = self.apply_rule(word, 4) if word == prev_word: # verb reduction prev_word = word word = self.apply_rule(word, 5) if word == prev_word: # vowel removal word = self.apply_rule(word, 6) return word def apply_rule(self, word, rule_index): rules = self._model[rule_index] for rule in rules: suffix_length = len(rule[0]) if word[-suffix_length:] == rule[0]: # if suffix matches if len(word) >= suffix_length + rule[1]: # if we have minimum size if word not in rule[3]: # if not an exception word = word[:-suffix_length] + rule[2] break return word nltk-3.7/nltk/stem/snowball.py000066400000000000000000005333111420073152400164520ustar00rootroot00000000000000# # Natural Language Toolkit: Snowball Stemmer # # Copyright (C) 2001-2022 NLTK Project # Author: Peter Michael Stahl # Peter Ljunglof (revisions) # Lakhdar Benzahia (co-writer) # Assem Chelli (reviewer arabicstemmer) # Abdelkrim Aries (reviewer arabicstemmer) # Algorithms: Dr Martin Porter # Assem Chelli arabic stemming algorithm # Benzahia Lakhdar # URL: # For license information, see LICENSE.TXT """ Snowball stemmers This module provides a port of the Snowball stemmers developed by Martin Porter. There is also a demo function: `snowball.demo()`. """ import re from nltk.corpus import stopwords from nltk.stem import porter from nltk.stem.api import StemmerI from nltk.stem.util import prefix_replace, suffix_replace class SnowballStemmer(StemmerI): """ Snowball Stemmer The following languages are supported: Arabic, Danish, Dutch, English, Finnish, French, German, Hungarian, Italian, Norwegian, Portuguese, Romanian, Russian, Spanish and Swedish. The algorithm for English is documented here: Porter, M. \"An algorithm for suffix stripping.\" Program 14.3 (1980): 130-137. The algorithms have been developed by Martin Porter. 
These stemmers are called Snowball, because Porter created a programming language with this name for creating new stemming algorithms. There is more information available at http://snowball.tartarus.org/ The stemmer is invoked as shown below: >>> from nltk.stem import SnowballStemmer >>> print(" ".join(SnowballStemmer.languages)) # See which languages are supported arabic danish dutch english finnish french german hungarian italian norwegian porter portuguese romanian russian spanish swedish >>> stemmer = SnowballStemmer("german") # Choose a language >>> stemmer.stem("Autobahnen") # Stem a word 'autobahn' Invoking the stemmers that way is useful if you do not know the language to be stemmed at runtime. Alternatively, if you already know the language, then you can invoke the language specific stemmer directly: >>> from nltk.stem.snowball import GermanStemmer >>> stemmer = GermanStemmer() >>> stemmer.stem("Autobahnen") 'autobahn' :param language: The language whose subclass is instantiated. :type language: str or unicode :param ignore_stopwords: If set to True, stopwords are not stemmed and returned unchanged. Set to False by default. :type ignore_stopwords: bool :raise ValueError: If there is no stemmer for the specified language, a ValueError is raised. """ languages = ( "arabic", "danish", "dutch", "english", "finnish", "french", "german", "hungarian", "italian", "norwegian", "porter", "portuguese", "romanian", "russian", "spanish", "swedish", ) def __init__(self, language, ignore_stopwords=False): if language not in self.languages: raise ValueError(f"The language '{language}' is not supported.") stemmerclass = globals()[language.capitalize() + "Stemmer"] self.stemmer = stemmerclass(ignore_stopwords) self.stem = self.stemmer.stem self.stopwords = self.stemmer.stopwords def stem(self, token): return self.stemmer.stem(self, token) class _LanguageSpecificStemmer(StemmerI): """ This helper subclass offers the possibility to invoke a specific stemmer directly. This is useful if you already know the language to be stemmed at runtime. Create an instance of the Snowball stemmer. :param ignore_stopwords: If set to True, stopwords are not stemmed and returned unchanged. Set to False by default. :type ignore_stopwords: bool """ def __init__(self, ignore_stopwords=False): # The language is the name of the class, minus the final "Stemmer". language = type(self).__name__.lower() if language.endswith("stemmer"): language = language[:-7] self.stopwords = set() if ignore_stopwords: try: for word in stopwords.words(language): self.stopwords.add(word) except OSError as e: raise ValueError( "{!r} has no list of stopwords. Please set" " 'ignore_stopwords' to 'False'.".format(self) ) from e def __repr__(self): """ Print out the string representation of the respective class. """ return f"<{type(self).__name__}>" class PorterStemmer(_LanguageSpecificStemmer, porter.PorterStemmer): """ A word stemmer based on the original Porter stemming algorithm. Porter, M. \"An algorithm for suffix stripping.\" Program 14.3 (1980): 130-137. A few minor modifications have been made to Porter's basic algorithm. See the source code of the module nltk.stem.porter for more information. """ def __init__(self, ignore_stopwords=False): _LanguageSpecificStemmer.__init__(self, ignore_stopwords) porter.PorterStemmer.__init__(self) class _ScandinavianStemmer(_LanguageSpecificStemmer): """ This subclass encapsulates a method for defining the string region R1. It is used by the Danish, Norwegian, and Swedish stemmer. 
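# SnowballStemmer above is only a thin dispatcher: it capitalises the language
# name, looks up "<Language>Stemmer" in the module globals, instantiates it,
# and re-binds stem() to that instance.  Choosing the language at runtime and
# importing the concrete class directly are therefore equivalent -- a sketch
# (the Danish example word is arbitrary):
from nltk.stem import SnowballStemmer
from nltk.stem.snowball import DanishStemmer

via_dispatch = SnowballStemmer("danish")
direct = DanishStemmer()
print(via_dispatch.stem("undervisning") == direct.stem("undervisning"))  # True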
""" def _r1_scandinavian(self, word, vowels): """ Return the region R1 that is used by the Scandinavian stemmers. R1 is the region after the first non-vowel following a vowel, or is the null region at the end of the word if there is no such non-vowel. But then R1 is adjusted so that the region before it contains at least three letters. :param word: The word whose region R1 is determined. :type word: str or unicode :param vowels: The vowels of the respective language that are used to determine the region R1. :type vowels: unicode :return: the region R1 for the respective word. :rtype: unicode :note: This helper method is invoked by the respective stem method of the subclasses DanishStemmer, NorwegianStemmer, and SwedishStemmer. It is not to be invoked directly! """ r1 = "" for i in range(1, len(word)): if word[i] not in vowels and word[i - 1] in vowels: if 3 > len(word[: i + 1]) > 0: r1 = word[3:] elif len(word[: i + 1]) >= 3: r1 = word[i + 1 :] else: return word break return r1 class _StandardStemmer(_LanguageSpecificStemmer): """ This subclass encapsulates two methods for defining the standard versions of the string regions R1, R2, and RV. """ def _r1r2_standard(self, word, vowels): """ Return the standard interpretations of the string regions R1 and R2. R1 is the region after the first non-vowel following a vowel, or is the null region at the end of the word if there is no such non-vowel. R2 is the region after the first non-vowel following a vowel in R1, or is the null region at the end of the word if there is no such non-vowel. :param word: The word whose regions R1 and R2 are determined. :type word: str or unicode :param vowels: The vowels of the respective language that are used to determine the regions R1 and R2. :type vowels: unicode :return: (r1,r2), the regions R1 and R2 for the respective word. :rtype: tuple :note: This helper method is invoked by the respective stem method of the subclasses DutchStemmer, FinnishStemmer, FrenchStemmer, GermanStemmer, ItalianStemmer, PortugueseStemmer, RomanianStemmer, and SpanishStemmer. It is not to be invoked directly! :note: A detailed description of how to define R1 and R2 can be found at http://snowball.tartarus.org/texts/r1r2.html """ r1 = "" r2 = "" for i in range(1, len(word)): if word[i] not in vowels and word[i - 1] in vowels: r1 = word[i + 1 :] break for i in range(1, len(r1)): if r1[i] not in vowels and r1[i - 1] in vowels: r2 = r1[i + 1 :] break return (r1, r2) def _rv_standard(self, word, vowels): """ Return the standard interpretation of the string region RV. If the second letter is a consonant, RV is the region after the next following vowel. If the first two letters are vowels, RV is the region after the next following consonant. Otherwise, RV is the region after the third letter. :param word: The word whose region RV is determined. :type word: str or unicode :param vowels: The vowels of the respective language that are used to determine the region RV. :type vowels: unicode :return: the region RV for the respective word. :rtype: unicode :note: This helper method is invoked by the respective stem method of the subclasses ItalianStemmer, PortugueseStemmer, RomanianStemmer, and SpanishStemmer. It is not to be invoked directly! 
""" rv = "" if len(word) >= 2: if word[1] not in vowels: for i in range(2, len(word)): if word[i] in vowels: rv = word[i + 1 :] break elif word[0] in vowels and word[1] in vowels: for i in range(2, len(word)): if word[i] not in vowels: rv = word[i + 1 :] break else: rv = word[3:] return rv class ArabicStemmer(_StandardStemmer): """ https://github.com/snowballstem/snowball/blob/master/algorithms/arabic/stem_Unicode.sbl (Original Algorithm) The Snowball Arabic light Stemmer Algorithm: - Assem Chelli - Abdelkrim Aries - Lakhdar Benzahia NLTK Version Author: - Lakhdar Benzahia """ # Normalize_pre stes __vocalization = re.compile( r"[\u064b-\u064c-\u064d-\u064e-\u064f-\u0650-\u0651-\u0652]" ) # ً، ٌ، ٍ، َ، ُ، ِ، ّ، ْ __kasheeda = re.compile(r"[\u0640]") # ـ tatweel/kasheeda __arabic_punctuation_marks = re.compile(r"[\u060C-\u061B-\u061F]") # ؛ ، ؟ # Normalize_post __last_hamzat = ("\u0623", "\u0625", "\u0622", "\u0624", "\u0626") # أ، إ، آ، ؤ، ئ # normalize other hamza's __initial_hamzat = re.compile(r"^[\u0622\u0623\u0625]") # أ، إ، آ __waw_hamza = re.compile(r"[\u0624]") # ؤ __yeh_hamza = re.compile(r"[\u0626]") # ئ __alefat = re.compile(r"[\u0623\u0622\u0625]") # أ، إ، آ # Checks __checks1 = ( "\u0643\u0627\u0644", "\u0628\u0627\u0644", # بال، كال "\u0627\u0644", "\u0644\u0644", # لل، ال ) __checks2 = ("\u0629", "\u0627\u062a") # ة # female plural ات # Suffixes __suffix_noun_step1a = ( "\u064a", "\u0643", "\u0647", # ي، ك، ه "\u0646\u0627", "\u0643\u0645", "\u0647\u0627", "\u0647\u0646", "\u0647\u0645", # نا، كم، ها، هن، هم "\u0643\u0645\u0627", "\u0647\u0645\u0627", # كما، هما ) __suffix_noun_step1b = "\u0646" # ن __suffix_noun_step2a = ("\u0627", "\u064a", "\u0648") # ا، ي، و __suffix_noun_step2b = "\u0627\u062a" # ات __suffix_noun_step2c1 = "\u062a" # ت __suffix_noun_step2c2 = "\u0629" # ة __suffix_noun_step3 = "\u064a" # ي __suffix_verb_step1 = ( "\u0647", "\u0643", # ه، ك "\u0646\u064a", "\u0646\u0627", "\u0647\u0627", "\u0647\u0645", # ني، نا، ها، هم "\u0647\u0646", "\u0643\u0645", "\u0643\u0646", # هن، كم، كن "\u0647\u0645\u0627", "\u0643\u0645\u0627", "\u0643\u0645\u0648", # هما، كما، كمو ) __suffix_verb_step2a = ( "\u062a", "\u0627", "\u0646", "\u064a", # ت، ا، ن، ي "\u0646\u0627", "\u062a\u0627", "\u062a\u0646", # نا، تا، تن Past "\u0627\u0646", "\u0648\u0646", "\u064a\u0646", # ان، هن، ين Present "\u062a\u0645\u0627", # تما ) __suffix_verb_step2b = ("\u0648\u0627", "\u062a\u0645") # وا، تم __suffix_verb_step2c = ("\u0648", "\u062a\u0645\u0648") # و # تمو __suffix_all_alef_maqsura = "\u0649" # ى # Prefixes __prefix_step1 = ( "\u0623", # أ "\u0623\u0623", "\u0623\u0622", "\u0623\u0624", "\u0623\u0627", "\u0623\u0625", # أأ، أآ، أؤ، أا، أإ ) __prefix_step2a = ("\u0641\u0627\u0644", "\u0648\u0627\u0644") # فال، وال __prefix_step2b = ("\u0641", "\u0648") # ف، و __prefix_step3a_noun = ( "\u0627\u0644", "\u0644\u0644", # لل، ال "\u0643\u0627\u0644", "\u0628\u0627\u0644", # بال، كال ) __prefix_step3b_noun = ( "\u0628", "\u0643", "\u0644", # ب، ك، ل "\u0628\u0628", "\u0643\u0643", # بب، كك ) __prefix_step3_verb = ( "\u0633\u064a", "\u0633\u062a", "\u0633\u0646", "\u0633\u0623", ) # سي، ست، سن، سأ __prefix_step4_verb = ( "\u064a\u0633\u062a", "\u0646\u0633\u062a", "\u062a\u0633\u062a", ) # يست، نست، تست # Suffixes added due to Conjugation Verbs __conjugation_suffix_verb_1 = ("\u0647", "\u0643") # ه، ك __conjugation_suffix_verb_2 = ( "\u0646\u064a", "\u0646\u0627", "\u0647\u0627", # ني، نا، ها "\u0647\u0645", "\u0647\u0646", "\u0643\u0645", # هم، هن، كم "\u0643\u0646", # كن ) 
__conjugation_suffix_verb_3 = ( "\u0647\u0645\u0627", "\u0643\u0645\u0627", "\u0643\u0645\u0648", ) # هما، كما، كمو __conjugation_suffix_verb_4 = ("\u0627", "\u0646", "\u064a") # ا، ن، ي __conjugation_suffix_verb_past = ( "\u0646\u0627", "\u062a\u0627", "\u062a\u0646", ) # نا، تا، تن __conjugation_suffix_verb_present = ( "\u0627\u0646", "\u0648\u0646", "\u064a\u0646", ) # ان، ون، ين # Suffixes added due to derivation Names __conjugation_suffix_noun_1 = ("\u064a", "\u0643", "\u0647") # ي، ك، ه __conjugation_suffix_noun_2 = ( "\u0646\u0627", "\u0643\u0645", # نا، كم "\u0647\u0627", "\u0647\u0646", "\u0647\u0645", # ها، هن، هم ) __conjugation_suffix_noun_3 = ( "\u0643\u0645\u0627", "\u0647\u0645\u0627", ) # كما، هما # Prefixes added due to derivation Names __prefixes1 = ("\u0648\u0627", "\u0641\u0627") # فا، وا __articles_3len = ("\u0643\u0627\u0644", "\u0628\u0627\u0644") # بال كال __articles_2len = ("\u0627\u0644", "\u0644\u0644") # ال لل # Prepositions letters __prepositions1 = ("\u0643", "\u0644") # ك، ل __prepositions2 = ("\u0628\u0628", "\u0643\u0643") # بب، كك is_verb = True is_noun = True is_defined = False suffixes_verb_step1_success = False suffix_verb_step2a_success = False suffix_verb_step2b_success = False suffix_noun_step2c2_success = False suffix_noun_step1a_success = False suffix_noun_step2a_success = False suffix_noun_step2b_success = False suffixe_noun_step1b_success = False prefix_step2a_success = False prefix_step3a_noun_success = False prefix_step3b_noun_success = False def __normalize_pre(self, token): """ :param token: string :return: normalized token type string """ # strip diacritics token = self.__vocalization.sub("", token) # strip kasheeda token = self.__kasheeda.sub("", token) # strip punctuation marks token = self.__arabic_punctuation_marks.sub("", token) return token def __normalize_post(self, token): # normalize last hamza for hamza in self.__last_hamzat: if token.endswith(hamza): token = suffix_replace(token, hamza, "\u0621") break # normalize other hamzat token = self.__initial_hamzat.sub("\u0627", token) token = self.__waw_hamza.sub("\u0648", token) token = self.__yeh_hamza.sub("\u064a", token) token = self.__alefat.sub("\u0627", token) return token def __checks_1(self, token): for prefix in self.__checks1: if token.startswith(prefix): if prefix in self.__articles_3len and len(token) > 4: self.is_noun = True self.is_verb = False self.is_defined = True break if prefix in self.__articles_2len and len(token) > 3: self.is_noun = True self.is_verb = False self.is_defined = True break def __checks_2(self, token): for suffix in self.__checks2: if token.endswith(suffix): if suffix == "\u0629" and len(token) > 2: self.is_noun = True self.is_verb = False break if suffix == "\u0627\u062a" and len(token) > 3: self.is_noun = True self.is_verb = False break def __Suffix_Verb_Step1(self, token): for suffix in self.__suffix_verb_step1: if token.endswith(suffix): if suffix in self.__conjugation_suffix_verb_1 and len(token) >= 4: token = token[:-1] self.suffixes_verb_step1_success = True break if suffix in self.__conjugation_suffix_verb_2 and len(token) >= 5: token = token[:-2] self.suffixes_verb_step1_success = True break if suffix in self.__conjugation_suffix_verb_3 and len(token) >= 6: token = token[:-3] self.suffixes_verb_step1_success = True break return token def __Suffix_Verb_Step2a(self, token): for suffix in self.__suffix_verb_step2a: if token.endswith(suffix) and len(token) > 3: if suffix == "\u062a" and len(token) >= 4: token = token[:-1] 
self.suffix_verb_step2a_success = True break if suffix in self.__conjugation_suffix_verb_4 and len(token) >= 4: token = token[:-1] self.suffix_verb_step2a_success = True break if suffix in self.__conjugation_suffix_verb_past and len(token) >= 5: token = token[:-2] # past self.suffix_verb_step2a_success = True break if suffix in self.__conjugation_suffix_verb_present and len(token) > 5: token = token[:-2] # present self.suffix_verb_step2a_success = True break if suffix == "\u062a\u0645\u0627" and len(token) >= 6: token = token[:-3] self.suffix_verb_step2a_success = True break return token def __Suffix_Verb_Step2c(self, token): for suffix in self.__suffix_verb_step2c: if token.endswith(suffix): if suffix == "\u062a\u0645\u0648" and len(token) >= 6: token = token[:-3] break if suffix == "\u0648" and len(token) >= 4: token = token[:-1] break return token def __Suffix_Verb_Step2b(self, token): for suffix in self.__suffix_verb_step2b: if token.endswith(suffix) and len(token) >= 5: token = token[:-2] self.suffix_verb_step2b_success = True break return token def __Suffix_Noun_Step2c2(self, token): for suffix in self.__suffix_noun_step2c2: if token.endswith(suffix) and len(token) >= 3: token = token[:-1] self.suffix_noun_step2c2_success = True break return token def __Suffix_Noun_Step1a(self, token): for suffix in self.__suffix_noun_step1a: if token.endswith(suffix): if suffix in self.__conjugation_suffix_noun_1 and len(token) >= 4: token = token[:-1] self.suffix_noun_step1a_success = True break if suffix in self.__conjugation_suffix_noun_2 and len(token) >= 5: token = token[:-2] self.suffix_noun_step1a_success = True break if suffix in self.__conjugation_suffix_noun_3 and len(token) >= 6: token = token[:-3] self.suffix_noun_step1a_success = True break return token def __Suffix_Noun_Step2a(self, token): for suffix in self.__suffix_noun_step2a: if token.endswith(suffix) and len(token) > 4: token = token[:-1] self.suffix_noun_step2a_success = True break return token def __Suffix_Noun_Step2b(self, token): for suffix in self.__suffix_noun_step2b: if token.endswith(suffix) and len(token) >= 5: token = token[:-2] self.suffix_noun_step2b_success = True break return token def __Suffix_Noun_Step2c1(self, token): for suffix in self.__suffix_noun_step2c1: if token.endswith(suffix) and len(token) >= 4: token = token[:-1] break return token def __Suffix_Noun_Step1b(self, token): for suffix in self.__suffix_noun_step1b: if token.endswith(suffix) and len(token) > 5: token = token[:-1] self.suffixe_noun_step1b_success = True break return token def __Suffix_Noun_Step3(self, token): for suffix in self.__suffix_noun_step3: if token.endswith(suffix) and len(token) >= 3: token = token[:-1] # ya' nisbiya break return token def __Suffix_All_alef_maqsura(self, token): for suffix in self.__suffix_all_alef_maqsura: if token.endswith(suffix): token = suffix_replace(token, suffix, "\u064a") return token def __Prefix_Step1(self, token): for prefix in self.__prefix_step1: if token.startswith(prefix) and len(token) > 3: if prefix == "\u0623\u0623": token = prefix_replace(token, prefix, "\u0623") break elif prefix == "\u0623\u0622": token = prefix_replace(token, prefix, "\u0622") break elif prefix == "\u0623\u0624": token = prefix_replace(token, prefix, "\u0624") break elif prefix == "\u0623\u0627": token = prefix_replace(token, prefix, "\u0627") break elif prefix == "\u0623\u0625": token = prefix_replace(token, prefix, "\u0625") break return token def __Prefix_Step2a(self, token): for prefix in self.__prefix_step2a: if 
token.startswith(prefix) and len(token) > 5: token = token[len(prefix) :] self.prefix_step2a_success = True break return token def __Prefix_Step2b(self, token): for prefix in self.__prefix_step2b: if token.startswith(prefix) and len(token) > 3: if token[:2] not in self.__prefixes1: token = token[len(prefix) :] break return token def __Prefix_Step3a_Noun(self, token): for prefix in self.__prefix_step3a_noun: if token.startswith(prefix): if prefix in self.__articles_2len and len(token) > 4: token = token[len(prefix) :] self.prefix_step3a_noun_success = True break if prefix in self.__articles_3len and len(token) > 5: token = token[len(prefix) :] break return token def __Prefix_Step3b_Noun(self, token): for prefix in self.__prefix_step3b_noun: if token.startswith(prefix): if len(token) > 3: if prefix == "\u0628": token = token[len(prefix) :] self.prefix_step3b_noun_success = True break if prefix in self.__prepositions2: token = prefix_replace(token, prefix, prefix[1]) self.prefix_step3b_noun_success = True break if prefix in self.__prepositions1 and len(token) > 4: token = token[len(prefix) :] # BUG: cause confusion self.prefix_step3b_noun_success = True break return token def __Prefix_Step3_Verb(self, token): for prefix in self.__prefix_step3_verb: if token.startswith(prefix) and len(token) > 4: token = prefix_replace(token, prefix, prefix[1]) break return token def __Prefix_Step4_Verb(self, token): for prefix in self.__prefix_step4_verb: if token.startswith(prefix) and len(token) > 4: token = prefix_replace(token, prefix, "\u0627\u0633\u062a") self.is_verb = True self.is_noun = False break return token def stem(self, word): """ Stem an Arabic word and return the stemmed form. :param word: string :return: string """ # set initial values self.is_verb = True self.is_noun = True self.is_defined = False self.suffix_verb_step2a_success = False self.suffix_verb_step2b_success = False self.suffix_noun_step2c2_success = False self.suffix_noun_step1a_success = False self.suffix_noun_step2a_success = False self.suffix_noun_step2b_success = False self.suffixe_noun_step1b_success = False self.prefix_step2a_success = False self.prefix_step3a_noun_success = False self.prefix_step3b_noun_success = False modified_word = word # guess type and properties # checks1 self.__checks_1(modified_word) # checks2 self.__checks_2(modified_word) # Pre_Normalization modified_word = self.__normalize_pre(modified_word) # Avoid stopwords if modified_word in self.stopwords or len(modified_word) <= 2: return modified_word # Start stemming if self.is_verb: modified_word = self.__Suffix_Verb_Step1(modified_word) if self.suffixes_verb_step1_success: modified_word = self.__Suffix_Verb_Step2a(modified_word) if not self.suffix_verb_step2a_success: modified_word = self.__Suffix_Verb_Step2c(modified_word) # or next TODO: How to deal with or next instruction else: modified_word = self.__Suffix_Verb_Step2b(modified_word) if not self.suffix_verb_step2b_success: modified_word = self.__Suffix_Verb_Step2a(modified_word) if self.is_noun: modified_word = self.__Suffix_Noun_Step2c2(modified_word) if not self.suffix_noun_step2c2_success: if not self.is_defined: modified_word = self.__Suffix_Noun_Step1a(modified_word) # if self.suffix_noun_step1a_success: modified_word = self.__Suffix_Noun_Step2a(modified_word) if not self.suffix_noun_step2a_success: modified_word = self.__Suffix_Noun_Step2b(modified_word) if ( not self.suffix_noun_step2b_success and not self.suffix_noun_step2a_success ): modified_word = self.__Suffix_Noun_Step2c1(modified_word) 
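# Taken as a whole, ArabicStemmer.stem() applies the light stemmer in a fixed
# order: guess whether the token looks like a defined noun or a verb
# (__checks_1/__checks_2), strip diacritics and tatweel (__normalize_pre),
# leave stop words and tokens of two characters or fewer untouched, strip verb
# and/or noun suffixes, strip prefixes, and finally unify the hamza spellings
# (__normalize_post).  A minimal usage sketch -- the example word المدرسة
# ("the school") is arbitrary and no particular stem is asserted:
from nltk.stem.snowball import ArabicStemmer

arabic = ArabicStemmer()
print(arabic.stem("\u0627\u0644\u0645\u062f\u0631\u0633\u0629"))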
# or next ? todo : how to deal with or next else: modified_word = self.__Suffix_Noun_Step1b(modified_word) if self.suffixe_noun_step1b_success: modified_word = self.__Suffix_Noun_Step2a(modified_word) if not self.suffix_noun_step2a_success: modified_word = self.__Suffix_Noun_Step2b(modified_word) if ( not self.suffix_noun_step2b_success and not self.suffix_noun_step2a_success ): modified_word = self.__Suffix_Noun_Step2c1(modified_word) else: if not self.is_defined: modified_word = self.__Suffix_Noun_Step2a(modified_word) modified_word = self.__Suffix_Noun_Step2b(modified_word) modified_word = self.__Suffix_Noun_Step3(modified_word) if not self.is_noun and self.is_verb: modified_word = self.__Suffix_All_alef_maqsura(modified_word) # prefixes modified_word = self.__Prefix_Step1(modified_word) modified_word = self.__Prefix_Step2a(modified_word) if not self.prefix_step2a_success: modified_word = self.__Prefix_Step2b(modified_word) modified_word = self.__Prefix_Step3a_Noun(modified_word) if not self.prefix_step3a_noun_success and self.is_noun: modified_word = self.__Prefix_Step3b_Noun(modified_word) else: if not self.prefix_step3b_noun_success and self.is_verb: modified_word = self.__Prefix_Step3_Verb(modified_word) modified_word = self.__Prefix_Step4_Verb(modified_word) # post normalization stemming modified_word = self.__normalize_post(modified_word) stemmed_word = modified_word return stemmed_word class DanishStemmer(_ScandinavianStemmer): """ The Danish Snowball stemmer. :cvar __vowels: The Danish vowels. :type __vowels: unicode :cvar __consonants: The Danish consonants. :type __consonants: unicode :cvar __double_consonants: The Danish double consonants. :type __double_consonants: tuple :cvar __s_ending: Letters that may directly appear before a word final 's'. :type __s_ending: unicode :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. :type __step1_suffixes: tuple :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. :type __step2_suffixes: tuple :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. :type __step3_suffixes: tuple :note: A detailed description of the Danish stemming algorithm can be found under http://snowball.tartarus.org/algorithms/danish/stemmer.html """ # The language's vowels and other important characters are defined. __vowels = "aeiouy\xE6\xE5\xF8" __consonants = "bcdfghjklmnpqrstvwxz" __double_consonants = ( "bb", "cc", "dd", "ff", "gg", "hh", "jj", "kk", "ll", "mm", "nn", "pp", "qq", "rr", "ss", "tt", "vv", "ww", "xx", "zz", ) __s_ending = "abcdfghjklmnoprtvyz\xE5" # The different suffixes, divided into the algorithm's steps # and organized by length, are listed in tuples. __step1_suffixes = ( "erendes", "erende", "hedens", "ethed", "erede", "heden", "heder", "endes", "ernes", "erens", "erets", "ered", "ende", "erne", "eren", "erer", "heds", "enes", "eres", "eret", "hed", "ene", "ere", "ens", "ers", "ets", "en", "er", "es", "et", "e", "s", ) __step2_suffixes = ("gd", "dt", "gt", "kt") __step3_suffixes = ("elig", "l\xF8st", "lig", "els", "ig") def stem(self, word): """ Stem a Danish word and return the stemmed form. :param word: The word that is stemmed. :type word: str or unicode :return: The stemmed form. :rtype: unicode """ # Every word is put into lower case for normalization. word = word.lower() if word in self.stopwords: return word # After this, the required regions are generated # by the respective helper method. 
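# Minimal usage sketch for the Arabic stemmer whose ``stem`` method ends just
# above, assuming the public ``nltk.stem.snowball`` import path and the
# ``ArabicStemmer`` class name. The sample word is illustrative only; no
# particular stemmed form is asserted here.
#
#     >>> from nltk.stem.snowball import ArabicStemmer
#     >>> stemmer = ArabicStemmer()
#     >>> stemmer.stem("\u0627\u0644\u0643\u062a\u0627\u0628")  # doctest: +SKIP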
r1 = self._r1_scandinavian(word, self.__vowels) # Then the actual stemming process starts. # Every new step is explicitly indicated # according to the descriptions on the Snowball website. # STEP 1 for suffix in self.__step1_suffixes: if r1.endswith(suffix): if suffix == "s": if word[-2] in self.__s_ending: word = word[:-1] r1 = r1[:-1] else: word = word[: -len(suffix)] r1 = r1[: -len(suffix)] break # STEP 2 for suffix in self.__step2_suffixes: if r1.endswith(suffix): word = word[:-1] r1 = r1[:-1] break # STEP 3 if r1.endswith("igst"): word = word[:-2] r1 = r1[:-2] for suffix in self.__step3_suffixes: if r1.endswith(suffix): if suffix == "l\xF8st": word = word[:-1] r1 = r1[:-1] else: word = word[: -len(suffix)] r1 = r1[: -len(suffix)] if r1.endswith(self.__step2_suffixes): word = word[:-1] r1 = r1[:-1] break # STEP 4: Undouble for double_cons in self.__double_consonants: if word.endswith(double_cons) and len(word) > 3: word = word[:-1] break return word class DutchStemmer(_StandardStemmer): """ The Dutch Snowball stemmer. :cvar __vowels: The Dutch vowels. :type __vowels: unicode :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. :type __step1_suffixes: tuple :cvar __step3b_suffixes: Suffixes to be deleted in step 3b of the algorithm. :type __step3b_suffixes: tuple :note: A detailed description of the Dutch stemming algorithm can be found under http://snowball.tartarus.org/algorithms/dutch/stemmer.html """ __vowels = "aeiouy\xE8" __step1_suffixes = ("heden", "ene", "en", "se", "s") __step3b_suffixes = ("baar", "lijk", "bar", "end", "ing", "ig") def stem(self, word): """ Stem a Dutch word and return the stemmed form. :param word: The word that is stemmed. :type word: str or unicode :return: The stemmed form. :rtype: unicode """ word = word.lower() if word in self.stopwords: return word step2_success = False # Vowel accents are removed. word = ( word.replace("\xE4", "a") .replace("\xE1", "a") .replace("\xEB", "e") .replace("\xE9", "e") .replace("\xED", "i") .replace("\xEF", "i") .replace("\xF6", "o") .replace("\xF3", "o") .replace("\xFC", "u") .replace("\xFA", "u") ) # An initial 'y', a 'y' after a vowel, # and an 'i' between self.__vowels is put into upper case. # As from now these are treated as consonants. if word.startswith("y"): word = "".join(("Y", word[1:])) for i in range(1, len(word)): if word[i - 1] in self.__vowels and word[i] == "y": word = "".join((word[:i], "Y", word[i + 1 :])) for i in range(1, len(word) - 1): if ( word[i - 1] in self.__vowels and word[i] == "i" and word[i + 1] in self.__vowels ): word = "".join((word[:i], "I", word[i + 1 :])) r1, r2 = self._r1r2_standard(word, self.__vowels) # R1 is adjusted so that the region before it # contains at least 3 letters. 
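# Usage sketch for the DanishStemmer defined above; the same algorithm is also
# reachable through the generic ``SnowballStemmer("danish")`` wrapper in this
# module. The word below is illustrative and its stemmed form is not asserted.
#
#     >>> from nltk.stem.snowball import DanishStemmer
#     >>> DanishStemmer().stem("undervisning")  # doctest: +SKIP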
for i in range(1, len(word)): if word[i] not in self.__vowels and word[i - 1] in self.__vowels: if 3 > len(word[: i + 1]) > 0: r1 = word[3:] elif len(word[: i + 1]) == 0: return word break # STEP 1 for suffix in self.__step1_suffixes: if r1.endswith(suffix): if suffix == "heden": word = suffix_replace(word, suffix, "heid") r1 = suffix_replace(r1, suffix, "heid") if r2.endswith("heden"): r2 = suffix_replace(r2, suffix, "heid") elif ( suffix in ("ene", "en") and not word.endswith("heden") and word[-len(suffix) - 1] not in self.__vowels and word[-len(suffix) - 3 : -len(suffix)] != "gem" ): word = word[: -len(suffix)] r1 = r1[: -len(suffix)] r2 = r2[: -len(suffix)] if word.endswith(("kk", "dd", "tt")): word = word[:-1] r1 = r1[:-1] r2 = r2[:-1] elif ( suffix in ("se", "s") and word[-len(suffix) - 1] not in self.__vowels and word[-len(suffix) - 1] != "j" ): word = word[: -len(suffix)] r1 = r1[: -len(suffix)] r2 = r2[: -len(suffix)] break # STEP 2 if r1.endswith("e") and word[-2] not in self.__vowels: step2_success = True word = word[:-1] r1 = r1[:-1] r2 = r2[:-1] if word.endswith(("kk", "dd", "tt")): word = word[:-1] r1 = r1[:-1] r2 = r2[:-1] # STEP 3a if r2.endswith("heid") and word[-5] != "c": word = word[:-4] r1 = r1[:-4] r2 = r2[:-4] if ( r1.endswith("en") and word[-3] not in self.__vowels and word[-5:-2] != "gem" ): word = word[:-2] r1 = r1[:-2] r2 = r2[:-2] if word.endswith(("kk", "dd", "tt")): word = word[:-1] r1 = r1[:-1] r2 = r2[:-1] # STEP 3b: Derivational suffixes for suffix in self.__step3b_suffixes: if r2.endswith(suffix): if suffix in ("end", "ing"): word = word[:-3] r2 = r2[:-3] if r2.endswith("ig") and word[-3] != "e": word = word[:-2] else: if word.endswith(("kk", "dd", "tt")): word = word[:-1] elif suffix == "ig" and word[-3] != "e": word = word[:-2] elif suffix == "lijk": word = word[:-4] r1 = r1[:-4] if r1.endswith("e") and word[-2] not in self.__vowels: word = word[:-1] if word.endswith(("kk", "dd", "tt")): word = word[:-1] elif suffix == "baar": word = word[:-4] elif suffix == "bar" and step2_success: word = word[:-3] break # STEP 4: Undouble vowel if len(word) >= 4: if word[-1] not in self.__vowels and word[-1] != "I": if word[-3:-1] in ("aa", "ee", "oo", "uu"): if word[-4] not in self.__vowels: word = "".join((word[:-3], word[-3], word[-1])) # All occurrences of 'I' and 'Y' are put back into lower case. word = word.replace("I", "i").replace("Y", "y") return word class EnglishStemmer(_StandardStemmer): """ The English Snowball stemmer. :cvar __vowels: The English vowels. :type __vowels: unicode :cvar __double_consonants: The English double consonants. :type __double_consonants: tuple :cvar __li_ending: Letters that may directly appear before a word final 'li'. :type __li_ending: unicode :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm. :type __step0_suffixes: tuple :cvar __step1a_suffixes: Suffixes to be deleted in step 1a of the algorithm. :type __step1a_suffixes: tuple :cvar __step1b_suffixes: Suffixes to be deleted in step 1b of the algorithm. :type __step1b_suffixes: tuple :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. :type __step2_suffixes: tuple :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. :type __step3_suffixes: tuple :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm. :type __step4_suffixes: tuple :cvar __step5_suffixes: Suffixes to be deleted in step 5 of the algorithm. 
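# Usage sketch for the DutchStemmer defined above (note the temporary 'I'/'Y'
# marking of vowel-adjacent 'i'/'y' that the algorithm undoes at the end).
# The sample word is illustrative; its output is not asserted.
#
#     >>> from nltk.stem.snowball import DutchStemmer
#     >>> DutchStemmer().stem("lichamelijkheden")  # doctest: +SKIP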
:type __step5_suffixes: tuple :cvar __special_words: A dictionary containing words which have to be stemmed specially. :type __special_words: dict :note: A detailed description of the English stemming algorithm can be found under http://snowball.tartarus.org/algorithms/english/stemmer.html """ __vowels = "aeiouy" __double_consonants = ("bb", "dd", "ff", "gg", "mm", "nn", "pp", "rr", "tt") __li_ending = "cdeghkmnrt" __step0_suffixes = ("'s'", "'s", "'") __step1a_suffixes = ("sses", "ied", "ies", "us", "ss", "s") __step1b_suffixes = ("eedly", "ingly", "edly", "eed", "ing", "ed") __step2_suffixes = ( "ization", "ational", "fulness", "ousness", "iveness", "tional", "biliti", "lessli", "entli", "ation", "alism", "aliti", "ousli", "iviti", "fulli", "enci", "anci", "abli", "izer", "ator", "alli", "bli", "ogi", "li", ) __step3_suffixes = ( "ational", "tional", "alize", "icate", "iciti", "ative", "ical", "ness", "ful", ) __step4_suffixes = ( "ement", "ance", "ence", "able", "ible", "ment", "ant", "ent", "ism", "ate", "iti", "ous", "ive", "ize", "ion", "al", "er", "ic", ) __step5_suffixes = ("e", "l") __special_words = { "skis": "ski", "skies": "sky", "dying": "die", "lying": "lie", "tying": "tie", "idly": "idl", "gently": "gentl", "ugly": "ugli", "early": "earli", "only": "onli", "singly": "singl", "sky": "sky", "news": "news", "howe": "howe", "atlas": "atlas", "cosmos": "cosmos", "bias": "bias", "andes": "andes", "inning": "inning", "innings": "inning", "outing": "outing", "outings": "outing", "canning": "canning", "cannings": "canning", "herring": "herring", "herrings": "herring", "earring": "earring", "earrings": "earring", "proceed": "proceed", "proceeds": "proceed", "proceeded": "proceed", "proceeding": "proceed", "exceed": "exceed", "exceeds": "exceed", "exceeded": "exceed", "exceeding": "exceed", "succeed": "succeed", "succeeds": "succeed", "succeeded": "succeed", "succeeding": "succeed", } def stem(self, word): """ Stem an English word and return the stemmed form. :param word: The word that is stemmed. :type word: str or unicode :return: The stemmed form. 
:rtype: unicode """ word = word.lower() if word in self.stopwords or len(word) <= 2: return word elif word in self.__special_words: return self.__special_words[word] # Map the different apostrophe characters to a single consistent one word = ( word.replace("\u2019", "\x27") .replace("\u2018", "\x27") .replace("\u201B", "\x27") ) if word.startswith("\x27"): word = word[1:] if word.startswith("y"): word = "".join(("Y", word[1:])) for i in range(1, len(word)): if word[i - 1] in self.__vowels and word[i] == "y": word = "".join((word[:i], "Y", word[i + 1 :])) step1a_vowel_found = False step1b_vowel_found = False r1 = "" r2 = "" if word.startswith(("gener", "commun", "arsen")): if word.startswith(("gener", "arsen")): r1 = word[5:] else: r1 = word[6:] for i in range(1, len(r1)): if r1[i] not in self.__vowels and r1[i - 1] in self.__vowels: r2 = r1[i + 1 :] break else: r1, r2 = self._r1r2_standard(word, self.__vowels) # STEP 0 for suffix in self.__step0_suffixes: if word.endswith(suffix): word = word[: -len(suffix)] r1 = r1[: -len(suffix)] r2 = r2[: -len(suffix)] break # STEP 1a for suffix in self.__step1a_suffixes: if word.endswith(suffix): if suffix == "sses": word = word[:-2] r1 = r1[:-2] r2 = r2[:-2] elif suffix in ("ied", "ies"): if len(word[: -len(suffix)]) > 1: word = word[:-2] r1 = r1[:-2] r2 = r2[:-2] else: word = word[:-1] r1 = r1[:-1] r2 = r2[:-1] elif suffix == "s": for letter in word[:-2]: if letter in self.__vowels: step1a_vowel_found = True break if step1a_vowel_found: word = word[:-1] r1 = r1[:-1] r2 = r2[:-1] break # STEP 1b for suffix in self.__step1b_suffixes: if word.endswith(suffix): if suffix in ("eed", "eedly"): if r1.endswith(suffix): word = suffix_replace(word, suffix, "ee") if len(r1) >= len(suffix): r1 = suffix_replace(r1, suffix, "ee") else: r1 = "" if len(r2) >= len(suffix): r2 = suffix_replace(r2, suffix, "ee") else: r2 = "" else: for letter in word[: -len(suffix)]: if letter in self.__vowels: step1b_vowel_found = True break if step1b_vowel_found: word = word[: -len(suffix)] r1 = r1[: -len(suffix)] r2 = r2[: -len(suffix)] if word.endswith(("at", "bl", "iz")): word = "".join((word, "e")) r1 = "".join((r1, "e")) if len(word) > 5 or len(r1) >= 3: r2 = "".join((r2, "e")) elif word.endswith(self.__double_consonants): word = word[:-1] r1 = r1[:-1] r2 = r2[:-1] elif ( r1 == "" and len(word) >= 3 and word[-1] not in self.__vowels and word[-1] not in "wxY" and word[-2] in self.__vowels and word[-3] not in self.__vowels ) or ( r1 == "" and len(word) == 2 and word[0] in self.__vowels and word[1] not in self.__vowels ): word = "".join((word, "e")) if len(r1) > 0: r1 = "".join((r1, "e")) if len(r2) > 0: r2 = "".join((r2, "e")) break # STEP 1c if len(word) > 2 and word[-1] in "yY" and word[-2] not in self.__vowels: word = "".join((word[:-1], "i")) if len(r1) >= 1: r1 = "".join((r1[:-1], "i")) else: r1 = "" if len(r2) >= 1: r2 = "".join((r2[:-1], "i")) else: r2 = "" # STEP 2 for suffix in self.__step2_suffixes: if word.endswith(suffix): if r1.endswith(suffix): if suffix == "tional": word = word[:-2] r1 = r1[:-2] r2 = r2[:-2] elif suffix in ("enci", "anci", "abli"): word = "".join((word[:-1], "e")) if len(r1) >= 1: r1 = "".join((r1[:-1], "e")) else: r1 = "" if len(r2) >= 1: r2 = "".join((r2[:-1], "e")) else: r2 = "" elif suffix == "entli": word = word[:-2] r1 = r1[:-2] r2 = r2[:-2] elif suffix in ("izer", "ization"): word = suffix_replace(word, suffix, "ize") if len(r1) >= len(suffix): r1 = suffix_replace(r1, suffix, "ize") else: r1 = "" if len(r2) >= len(suffix): r2 = 
suffix_replace(r2, suffix, "ize") else: r2 = "" elif suffix in ("ational", "ation", "ator"): word = suffix_replace(word, suffix, "ate") if len(r1) >= len(suffix): r1 = suffix_replace(r1, suffix, "ate") else: r1 = "" if len(r2) >= len(suffix): r2 = suffix_replace(r2, suffix, "ate") else: r2 = "e" elif suffix in ("alism", "aliti", "alli"): word = suffix_replace(word, suffix, "al") if len(r1) >= len(suffix): r1 = suffix_replace(r1, suffix, "al") else: r1 = "" if len(r2) >= len(suffix): r2 = suffix_replace(r2, suffix, "al") else: r2 = "" elif suffix == "fulness": word = word[:-4] r1 = r1[:-4] r2 = r2[:-4] elif suffix in ("ousli", "ousness"): word = suffix_replace(word, suffix, "ous") if len(r1) >= len(suffix): r1 = suffix_replace(r1, suffix, "ous") else: r1 = "" if len(r2) >= len(suffix): r2 = suffix_replace(r2, suffix, "ous") else: r2 = "" elif suffix in ("iveness", "iviti"): word = suffix_replace(word, suffix, "ive") if len(r1) >= len(suffix): r1 = suffix_replace(r1, suffix, "ive") else: r1 = "" if len(r2) >= len(suffix): r2 = suffix_replace(r2, suffix, "ive") else: r2 = "e" elif suffix in ("biliti", "bli"): word = suffix_replace(word, suffix, "ble") if len(r1) >= len(suffix): r1 = suffix_replace(r1, suffix, "ble") else: r1 = "" if len(r2) >= len(suffix): r2 = suffix_replace(r2, suffix, "ble") else: r2 = "" elif suffix == "ogi" and word[-4] == "l": word = word[:-1] r1 = r1[:-1] r2 = r2[:-1] elif suffix in ("fulli", "lessli"): word = word[:-2] r1 = r1[:-2] r2 = r2[:-2] elif suffix == "li" and word[-3] in self.__li_ending: word = word[:-2] r1 = r1[:-2] r2 = r2[:-2] break # STEP 3 for suffix in self.__step3_suffixes: if word.endswith(suffix): if r1.endswith(suffix): if suffix == "tional": word = word[:-2] r1 = r1[:-2] r2 = r2[:-2] elif suffix == "ational": word = suffix_replace(word, suffix, "ate") if len(r1) >= len(suffix): r1 = suffix_replace(r1, suffix, "ate") else: r1 = "" if len(r2) >= len(suffix): r2 = suffix_replace(r2, suffix, "ate") else: r2 = "" elif suffix == "alize": word = word[:-3] r1 = r1[:-3] r2 = r2[:-3] elif suffix in ("icate", "iciti", "ical"): word = suffix_replace(word, suffix, "ic") if len(r1) >= len(suffix): r1 = suffix_replace(r1, suffix, "ic") else: r1 = "" if len(r2) >= len(suffix): r2 = suffix_replace(r2, suffix, "ic") else: r2 = "" elif suffix in ("ful", "ness"): word = word[: -len(suffix)] r1 = r1[: -len(suffix)] r2 = r2[: -len(suffix)] elif suffix == "ative" and r2.endswith(suffix): word = word[:-5] r1 = r1[:-5] r2 = r2[:-5] break # STEP 4 for suffix in self.__step4_suffixes: if word.endswith(suffix): if r2.endswith(suffix): if suffix == "ion": if word[-4] in "st": word = word[:-3] r1 = r1[:-3] r2 = r2[:-3] else: word = word[: -len(suffix)] r1 = r1[: -len(suffix)] r2 = r2[: -len(suffix)] break # STEP 5 if r2.endswith("l") and word[-2] == "l": word = word[:-1] elif r2.endswith("e"): word = word[:-1] elif r1.endswith("e"): if len(word) >= 4 and ( word[-2] in self.__vowels or word[-2] in "wxY" or word[-3] not in self.__vowels or word[-4] in self.__vowels ): word = word[:-1] word = word.replace("Y", "y") return word class FinnishStemmer(_StandardStemmer): """ The Finnish Snowball stemmer. :cvar __vowels: The Finnish vowels. :type __vowels: unicode :cvar __restricted_vowels: A subset of the Finnish vowels. :type __restricted_vowels: unicode :cvar __long_vowels: The Finnish vowels in their long forms. :type __long_vowels: tuple :cvar __consonants: The Finnish consonants. :type __consonants: unicode :cvar __double_consonants: The Finnish double consonants. 
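# Usage sketch for the EnglishStemmer defined above. "running" reduces to
# "run" via step 1b plus consonant undoubling, and "skies"/"dying" are served
# directly from the __special_words table.
#
#     >>> from nltk.stem.snowball import EnglishStemmer
#     >>> st = EnglishStemmer()
#     >>> st.stem("running")
#     'run'
#     >>> [st.stem(w) for w in ["skies", "dying"]]
#     ['sky', 'die']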
:type __double_consonants: tuple :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. :type __step1_suffixes: tuple :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. :type __step2_suffixes: tuple :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. :type __step3_suffixes: tuple :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm. :type __step4_suffixes: tuple :note: A detailed description of the Finnish stemming algorithm can be found under http://snowball.tartarus.org/algorithms/finnish/stemmer.html """ __vowels = "aeiouy\xE4\xF6" __restricted_vowels = "aeiou\xE4\xF6" __long_vowels = ("aa", "ee", "ii", "oo", "uu", "\xE4\xE4", "\xF6\xF6") __consonants = "bcdfghjklmnpqrstvwxz" __double_consonants = ( "bb", "cc", "dd", "ff", "gg", "hh", "jj", "kk", "ll", "mm", "nn", "pp", "qq", "rr", "ss", "tt", "vv", "ww", "xx", "zz", ) __step1_suffixes = ( "kaan", "k\xE4\xE4n", "sti", "kin", "han", "h\xE4n", "ko", "k\xF6", "pa", "p\xE4", ) __step2_suffixes = ("nsa", "ns\xE4", "mme", "nne", "si", "ni", "an", "\xE4n", "en") __step3_suffixes = ( "siin", "tten", "seen", "han", "hen", "hin", "hon", "h\xE4n", "h\xF6n", "den", "tta", "tt\xE4", "ssa", "ss\xE4", "sta", "st\xE4", "lla", "ll\xE4", "lta", "lt\xE4", "lle", "ksi", "ine", "ta", "t\xE4", "na", "n\xE4", "a", "\xE4", "n", ) __step4_suffixes = ( "impi", "impa", "imp\xE4", "immi", "imma", "imm\xE4", "mpi", "mpa", "mp\xE4", "mmi", "mma", "mm\xE4", "eja", "ej\xE4", ) def stem(self, word): """ Stem a Finnish word and return the stemmed form. :param word: The word that is stemmed. :type word: str or unicode :return: The stemmed form. :rtype: unicode """ word = word.lower() if word in self.stopwords: return word step3_success = False r1, r2 = self._r1r2_standard(word, self.__vowels) # STEP 1: Particles etc. 
for suffix in self.__step1_suffixes: if r1.endswith(suffix): if suffix == "sti": if suffix in r2: word = word[:-3] r1 = r1[:-3] r2 = r2[:-3] else: if word[-len(suffix) - 1] in "ntaeiouy\xE4\xF6": word = word[: -len(suffix)] r1 = r1[: -len(suffix)] r2 = r2[: -len(suffix)] break # STEP 2: Possessives for suffix in self.__step2_suffixes: if r1.endswith(suffix): if suffix == "si": if word[-3] != "k": word = word[:-2] r1 = r1[:-2] r2 = r2[:-2] elif suffix == "ni": word = word[:-2] r1 = r1[:-2] r2 = r2[:-2] if word.endswith("kse"): word = suffix_replace(word, "kse", "ksi") if r1.endswith("kse"): r1 = suffix_replace(r1, "kse", "ksi") if r2.endswith("kse"): r2 = suffix_replace(r2, "kse", "ksi") elif suffix == "an": if word[-4:-2] in ("ta", "na") or word[-5:-2] in ( "ssa", "sta", "lla", "lta", ): word = word[:-2] r1 = r1[:-2] r2 = r2[:-2] elif suffix == "\xE4n": if word[-4:-2] in ("t\xE4", "n\xE4") or word[-5:-2] in ( "ss\xE4", "st\xE4", "ll\xE4", "lt\xE4", ): word = word[:-2] r1 = r1[:-2] r2 = r2[:-2] elif suffix == "en": if word[-5:-2] in ("lle", "ine"): word = word[:-2] r1 = r1[:-2] r2 = r2[:-2] else: word = word[:-3] r1 = r1[:-3] r2 = r2[:-3] break # STEP 3: Cases for suffix in self.__step3_suffixes: if r1.endswith(suffix): if suffix in ("han", "hen", "hin", "hon", "h\xE4n", "h\xF6n"): if ( (suffix == "han" and word[-4] == "a") or (suffix == "hen" and word[-4] == "e") or (suffix == "hin" and word[-4] == "i") or (suffix == "hon" and word[-4] == "o") or (suffix == "h\xE4n" and word[-4] == "\xE4") or (suffix == "h\xF6n" and word[-4] == "\xF6") ): word = word[:-3] r1 = r1[:-3] r2 = r2[:-3] step3_success = True elif suffix in ("siin", "den", "tten"): if ( word[-len(suffix) - 1] == "i" and word[-len(suffix) - 2] in self.__restricted_vowels ): word = word[: -len(suffix)] r1 = r1[: -len(suffix)] r2 = r2[: -len(suffix)] step3_success = True else: continue elif suffix == "seen": if word[-6:-4] in self.__long_vowels: word = word[:-4] r1 = r1[:-4] r2 = r2[:-4] step3_success = True else: continue elif suffix in ("a", "\xE4"): if word[-2] in self.__vowels and word[-3] in self.__consonants: word = word[:-1] r1 = r1[:-1] r2 = r2[:-1] step3_success = True elif suffix in ("tta", "tt\xE4"): if word[-4] == "e": word = word[:-3] r1 = r1[:-3] r2 = r2[:-3] step3_success = True elif suffix == "n": word = word[:-1] r1 = r1[:-1] r2 = r2[:-1] step3_success = True if word[-2:] == "ie" or word[-2:] in self.__long_vowels: word = word[:-1] r1 = r1[:-1] r2 = r2[:-1] else: word = word[: -len(suffix)] r1 = r1[: -len(suffix)] r2 = r2[: -len(suffix)] step3_success = True break # STEP 4: Other endings for suffix in self.__step4_suffixes: if r2.endswith(suffix): if suffix in ("mpi", "mpa", "mp\xE4", "mmi", "mma", "mm\xE4"): if word[-5:-3] != "po": word = word[:-3] r1 = r1[:-3] r2 = r2[:-3] else: word = word[: -len(suffix)] r1 = r1[: -len(suffix)] r2 = r2[: -len(suffix)] break # STEP 5: Plurals if step3_success and len(r1) >= 1 and r1[-1] in "ij": word = word[:-1] r1 = r1[:-1] elif ( not step3_success and len(r1) >= 2 and r1[-1] == "t" and r1[-2] in self.__vowels ): word = word[:-1] r1 = r1[:-1] r2 = r2[:-1] if r2.endswith("imma"): word = word[:-4] r1 = r1[:-4] elif r2.endswith("mma") and r2[-5:-3] != "po": word = word[:-3] r1 = r1[:-3] # STEP 6: Tidying up if r1[-2:] in self.__long_vowels: word = word[:-1] r1 = r1[:-1] if len(r1) >= 2 and r1[-2] in self.__consonants and r1[-1] in "a\xE4ei": word = word[:-1] r1 = r1[:-1] if r1.endswith(("oj", "uj")): word = word[:-1] r1 = r1[:-1] if r1.endswith("jo"): word = word[:-1] r1 = r1[:-1] # 
If the word ends with a double consonant # followed by zero or more vowels, the last consonant is removed. for i in range(1, len(word)): if word[-i] in self.__vowels: continue else: if i == 1: if word[-i - 1 :] in self.__double_consonants: word = word[:-1] else: if word[-i - 1 : -i + 1] in self.__double_consonants: word = "".join((word[:-i], word[-i + 1 :])) break return word class FrenchStemmer(_StandardStemmer): """ The French Snowball stemmer. :cvar __vowels: The French vowels. :type __vowels: unicode :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. :type __step1_suffixes: tuple :cvar __step2a_suffixes: Suffixes to be deleted in step 2a of the algorithm. :type __step2a_suffixes: tuple :cvar __step2b_suffixes: Suffixes to be deleted in step 2b of the algorithm. :type __step2b_suffixes: tuple :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm. :type __step4_suffixes: tuple :note: A detailed description of the French stemming algorithm can be found under http://snowball.tartarus.org/algorithms/french/stemmer.html """ __vowels = "aeiouy\xE2\xE0\xEB\xE9\xEA\xE8\xEF\xEE\xF4\xFB\xF9" __step1_suffixes = ( "issements", "issement", "atrices", "atrice", "ateurs", "ations", "logies", "usions", "utions", "ements", "amment", "emment", "ances", "iqUes", "ismes", "ables", "istes", "ateur", "ation", "logie", "usion", "ution", "ences", "ement", "euses", "ments", "ance", "iqUe", "isme", "able", "iste", "ence", "it\xE9s", "ives", "eaux", "euse", "ment", "eux", "it\xE9", "ive", "ifs", "aux", "if", ) __step2a_suffixes = ( "issaIent", "issantes", "iraIent", "issante", "issants", "issions", "irions", "issais", "issait", "issant", "issent", "issiez", "issons", "irais", "irait", "irent", "iriez", "irons", "iront", "isses", "issez", "\xEEmes", "\xEEtes", "irai", "iras", "irez", "isse", "ies", "ira", "\xEEt", "ie", "ir", "is", "it", "i", ) __step2b_suffixes = ( "eraIent", "assions", "erions", "assent", "assiez", "\xE8rent", "erais", "erait", "eriez", "erons", "eront", "aIent", "antes", "asses", "ions", "erai", "eras", "erez", "\xE2mes", "\xE2tes", "ante", "ants", "asse", "\xE9es", "era", "iez", "ais", "ait", "ant", "\xE9e", "\xE9s", "er", "ez", "\xE2t", "ai", "as", "\xE9", "a", ) __step4_suffixes = ("i\xE8re", "I\xE8re", "ion", "ier", "Ier", "e", "\xEB") def stem(self, word): """ Stem a French word and return the stemmed form. :param word: The word that is stemmed. :type word: str or unicode :return: The stemmed form. :rtype: unicode """ word = word.lower() if word in self.stopwords: return word step1_success = False rv_ending_found = False step2a_success = False step2b_success = False # Every occurrence of 'u' after 'q' is put into upper case. for i in range(1, len(word)): if word[i - 1] == "q" and word[i] == "u": word = "".join((word[:i], "U", word[i + 1 :])) # Every occurrence of 'u' and 'i' # between vowels is put into upper case. # Every occurrence of 'y' preceded or # followed by a vowel is also put into upper case. 
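# Usage sketch for the FinnishStemmer defined above, which finishes with the
# consonant-undoubling loop just before this point. Sample words are
# illustrative; their stemmed forms are not asserted.
#
#     >>> from nltk.stem.snowball import FinnishStemmer
#     >>> st = FinnishStemmer()
#     >>> [st.stem(w) for w in ["taloissa", "kirjastoissa"]]  # doctest: +SKIP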
for i in range(1, len(word) - 1): if word[i - 1] in self.__vowels and word[i + 1] in self.__vowels: if word[i] == "u": word = "".join((word[:i], "U", word[i + 1 :])) elif word[i] == "i": word = "".join((word[:i], "I", word[i + 1 :])) if word[i - 1] in self.__vowels or word[i + 1] in self.__vowels: if word[i] == "y": word = "".join((word[:i], "Y", word[i + 1 :])) r1, r2 = self._r1r2_standard(word, self.__vowels) rv = self.__rv_french(word, self.__vowels) # STEP 1: Standard suffix removal for suffix in self.__step1_suffixes: if word.endswith(suffix): if suffix == "eaux": word = word[:-1] step1_success = True elif suffix in ("euse", "euses"): if suffix in r2: word = word[: -len(suffix)] step1_success = True elif suffix in r1: word = suffix_replace(word, suffix, "eux") step1_success = True elif suffix in ("ement", "ements") and suffix in rv: word = word[: -len(suffix)] step1_success = True if word[-2:] == "iv" and "iv" in r2: word = word[:-2] if word[-2:] == "at" and "at" in r2: word = word[:-2] elif word[-3:] == "eus": if "eus" in r2: word = word[:-3] elif "eus" in r1: word = "".join((word[:-1], "x")) elif word[-3:] in ("abl", "iqU"): if "abl" in r2 or "iqU" in r2: word = word[:-3] elif word[-3:] in ("i\xE8r", "I\xE8r"): if "i\xE8r" in rv or "I\xE8r" in rv: word = "".join((word[:-3], "i")) elif suffix == "amment" and suffix in rv: word = suffix_replace(word, "amment", "ant") rv = suffix_replace(rv, "amment", "ant") rv_ending_found = True elif suffix == "emment" and suffix in rv: word = suffix_replace(word, "emment", "ent") rv_ending_found = True elif ( suffix in ("ment", "ments") and suffix in rv and not rv.startswith(suffix) and rv[rv.rindex(suffix) - 1] in self.__vowels ): word = word[: -len(suffix)] rv = rv[: -len(suffix)] rv_ending_found = True elif suffix == "aux" and suffix in r1: word = "".join((word[:-2], "l")) step1_success = True elif ( suffix in ("issement", "issements") and suffix in r1 and word[-len(suffix) - 1] not in self.__vowels ): word = word[: -len(suffix)] step1_success = True elif ( suffix in ( "ance", "iqUe", "isme", "able", "iste", "eux", "ances", "iqUes", "ismes", "ables", "istes", ) and suffix in r2 ): word = word[: -len(suffix)] step1_success = True elif ( suffix in ("atrice", "ateur", "ation", "atrices", "ateurs", "ations") and suffix in r2 ): word = word[: -len(suffix)] step1_success = True if word[-2:] == "ic": if "ic" in r2: word = word[:-2] else: word = "".join((word[:-2], "iqU")) elif suffix in ("logie", "logies") and suffix in r2: word = suffix_replace(word, suffix, "log") step1_success = True elif suffix in ("usion", "ution", "usions", "utions") and suffix in r2: word = suffix_replace(word, suffix, "u") step1_success = True elif suffix in ("ence", "ences") and suffix in r2: word = suffix_replace(word, suffix, "ent") step1_success = True elif suffix in ("it\xE9", "it\xE9s") and suffix in r2: word = word[: -len(suffix)] step1_success = True if word[-4:] == "abil": if "abil" in r2: word = word[:-4] else: word = "".join((word[:-2], "l")) elif word[-2:] == "ic": if "ic" in r2: word = word[:-2] else: word = "".join((word[:-2], "iqU")) elif word[-2:] == "iv": if "iv" in r2: word = word[:-2] elif suffix in ("if", "ive", "ifs", "ives") and suffix in r2: word = word[: -len(suffix)] step1_success = True if word[-2:] == "at" and "at" in r2: word = word[:-2] if word[-2:] == "ic": if "ic" in r2: word = word[:-2] else: word = "".join((word[:-2], "iqU")) break # STEP 2a: Verb suffixes beginning 'i' if not step1_success or rv_ending_found: for suffix in 
self.__step2a_suffixes: if word.endswith(suffix): if ( suffix in rv and len(rv) > len(suffix) and rv[rv.rindex(suffix) - 1] not in self.__vowels ): word = word[: -len(suffix)] step2a_success = True break # STEP 2b: Other verb suffixes if not step2a_success: for suffix in self.__step2b_suffixes: if rv.endswith(suffix): if suffix == "ions" and "ions" in r2: word = word[:-4] step2b_success = True elif suffix in ( "eraIent", "erions", "\xE8rent", "erais", "erait", "eriez", "erons", "eront", "erai", "eras", "erez", "\xE9es", "era", "iez", "\xE9e", "\xE9s", "er", "ez", "\xE9", ): word = word[: -len(suffix)] step2b_success = True elif suffix in ( "assions", "assent", "assiez", "aIent", "antes", "asses", "\xE2mes", "\xE2tes", "ante", "ants", "asse", "ais", "ait", "ant", "\xE2t", "ai", "as", "a", ): word = word[: -len(suffix)] rv = rv[: -len(suffix)] step2b_success = True if rv.endswith("e"): word = word[:-1] break # STEP 3 if step1_success or step2a_success or step2b_success: if word[-1] == "Y": word = "".join((word[:-1], "i")) elif word[-1] == "\xE7": word = "".join((word[:-1], "c")) # STEP 4: Residual suffixes else: if len(word) >= 2 and word[-1] == "s" and word[-2] not in "aiou\xE8s": word = word[:-1] for suffix in self.__step4_suffixes: if word.endswith(suffix): if suffix in rv: if suffix == "ion" and suffix in r2 and rv[-4] in "st": word = word[:-3] elif suffix in ("ier", "i\xE8re", "Ier", "I\xE8re"): word = suffix_replace(word, suffix, "i") elif suffix == "e": word = word[:-1] elif suffix == "\xEB" and word[-3:-1] == "gu": word = word[:-1] break # STEP 5: Undouble if word.endswith(("enn", "onn", "ett", "ell", "eill")): word = word[:-1] # STEP 6: Un-accent for i in range(1, len(word)): if word[-i] not in self.__vowels: i += 1 else: if i != 1 and word[-i] in ("\xE9", "\xE8"): word = "".join((word[:-i], "e", word[-i + 1 :])) break word = word.replace("I", "i").replace("U", "u").replace("Y", "y") return word def __rv_french(self, word, vowels): """ Return the region RV that is used by the French stemmer. If the word begins with two vowels, RV is the region after the third letter. Otherwise, it is the region after the first vowel not at the beginning of the word, or the end of the word if these positions cannot be found. (Exceptionally, u'par', u'col' or u'tap' at the beginning of a word is also taken to define RV as the region to their right.) :param word: The French word whose region RV is determined. :type word: str or unicode :param vowels: The French vowels that are used to determine the region RV. :type vowels: unicode :return: the region RV for the respective French word. :rtype: unicode :note: This helper method is invoked by the stem method of the subclass FrenchStemmer. It is not to be invoked directly! """ rv = "" if len(word) >= 2: if word.startswith(("par", "col", "tap")) or ( word[0] in vowels and word[1] in vowels ): rv = word[3:] else: for i in range(1, len(word)): if word[i] in vowels: rv = word[i + 1 :] break return rv class GermanStemmer(_StandardStemmer): """ The German Snowball stemmer. :cvar __vowels: The German vowels. :type __vowels: unicode :cvar __s_ending: Letters that may directly appear before a word final 's'. :type __s_ending: unicode :cvar __st_ending: Letter that may directly appear before a word final 'st'. :type __st_ending: unicode :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. :type __step1_suffixes: tuple :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. 
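# Usage sketch for the FrenchStemmer defined above; its region RV is computed
# by the private ``__rv_french`` helper rather than the standard routine.
# Sample words are illustrative; outputs are not asserted.
#
#     >>> from nltk.stem.snowball import FrenchStemmer
#     >>> st = FrenchStemmer()
#     >>> [st.stem(w) for w in ["continuellement", "continuation"]]  # doctest: +SKIP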
:type __step2_suffixes: tuple :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. :type __step3_suffixes: tuple :note: A detailed description of the German stemming algorithm can be found under http://snowball.tartarus.org/algorithms/german/stemmer.html """ __vowels = "aeiouy\xE4\xF6\xFC" __s_ending = "bdfghklmnrt" __st_ending = "bdfghklmnt" __step1_suffixes = ("ern", "em", "er", "en", "es", "e", "s") __step2_suffixes = ("est", "en", "er", "st") __step3_suffixes = ("isch", "lich", "heit", "keit", "end", "ung", "ig", "ik") def stem(self, word): """ Stem a German word and return the stemmed form. :param word: The word that is stemmed. :type word: str or unicode :return: The stemmed form. :rtype: unicode """ word = word.lower() if word in self.stopwords: return word word = word.replace("\xDF", "ss") # Every occurrence of 'u' and 'y' # between vowels is put into upper case. for i in range(1, len(word) - 1): if word[i - 1] in self.__vowels and word[i + 1] in self.__vowels: if word[i] == "u": word = "".join((word[:i], "U", word[i + 1 :])) elif word[i] == "y": word = "".join((word[:i], "Y", word[i + 1 :])) r1, r2 = self._r1r2_standard(word, self.__vowels) # R1 is adjusted so that the region before it # contains at least 3 letters. for i in range(1, len(word)): if word[i] not in self.__vowels and word[i - 1] in self.__vowels: if 3 > len(word[: i + 1]) > 0: r1 = word[3:] elif len(word[: i + 1]) == 0: return word break # STEP 1 for suffix in self.__step1_suffixes: if r1.endswith(suffix): if ( suffix in ("en", "es", "e") and word[-len(suffix) - 4 : -len(suffix)] == "niss" ): word = word[: -len(suffix) - 1] r1 = r1[: -len(suffix) - 1] r2 = r2[: -len(suffix) - 1] elif suffix == "s": if word[-2] in self.__s_ending: word = word[:-1] r1 = r1[:-1] r2 = r2[:-1] else: word = word[: -len(suffix)] r1 = r1[: -len(suffix)] r2 = r2[: -len(suffix)] break # STEP 2 for suffix in self.__step2_suffixes: if r1.endswith(suffix): if suffix == "st": if word[-3] in self.__st_ending and len(word[:-3]) >= 3: word = word[:-2] r1 = r1[:-2] r2 = r2[:-2] else: word = word[: -len(suffix)] r1 = r1[: -len(suffix)] r2 = r2[: -len(suffix)] break # STEP 3: Derivational suffixes for suffix in self.__step3_suffixes: if r2.endswith(suffix): if suffix in ("end", "ung"): if ( "ig" in r2[-len(suffix) - 2 : -len(suffix)] and "e" not in r2[-len(suffix) - 3 : -len(suffix) - 2] ): word = word[: -len(suffix) - 2] else: word = word[: -len(suffix)] elif ( suffix in ("ig", "ik", "isch") and "e" not in r2[-len(suffix) - 1 : -len(suffix)] ): word = word[: -len(suffix)] elif suffix in ("lich", "heit"): if ( "er" in r1[-len(suffix) - 2 : -len(suffix)] or "en" in r1[-len(suffix) - 2 : -len(suffix)] ): word = word[: -len(suffix) - 2] else: word = word[: -len(suffix)] elif suffix == "keit": if "lich" in r2[-len(suffix) - 4 : -len(suffix)]: word = word[: -len(suffix) - 4] elif "ig" in r2[-len(suffix) - 2 : -len(suffix)]: word = word[: -len(suffix) - 2] else: word = word[: -len(suffix)] break # Umlaut accents are removed and # 'u' and 'y' are put back into lower case. word = ( word.replace("\xE4", "a") .replace("\xF6", "o") .replace("\xFC", "u") .replace("U", "u") .replace("Y", "y") ) return word class HungarianStemmer(_LanguageSpecificStemmer): """ The Hungarian Snowball stemmer. :cvar __vowels: The Hungarian vowels. :type __vowels: unicode :cvar __digraphs: The Hungarian digraphs. :type __digraphs: tuple :cvar __double_consonants: The Hungarian double consonants. 
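# Usage sketch for the GermanStemmer defined above. Like the other stemmers in
# this module it is assumed to accept an ``ignore_stopwords`` flag at
# construction time (handled by the shared language-specific base class); the
# words below are illustrative and their outputs are not asserted.
#
#     >>> from nltk.stem.snowball import GermanStemmer
#     >>> st = GermanStemmer(ignore_stopwords=True)
#     >>> [st.stem(w) for w in ["keinen", "Aufeinanderfolge"]]  # doctest: +SKIP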
:type __double_consonants: tuple :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. :type __step1_suffixes: tuple :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. :type __step2_suffixes: tuple :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. :type __step3_suffixes: tuple :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm. :type __step4_suffixes: tuple :cvar __step5_suffixes: Suffixes to be deleted in step 5 of the algorithm. :type __step5_suffixes: tuple :cvar __step6_suffixes: Suffixes to be deleted in step 6 of the algorithm. :type __step6_suffixes: tuple :cvar __step7_suffixes: Suffixes to be deleted in step 7 of the algorithm. :type __step7_suffixes: tuple :cvar __step8_suffixes: Suffixes to be deleted in step 8 of the algorithm. :type __step8_suffixes: tuple :cvar __step9_suffixes: Suffixes to be deleted in step 9 of the algorithm. :type __step9_suffixes: tuple :note: A detailed description of the Hungarian stemming algorithm can be found under http://snowball.tartarus.org/algorithms/hungarian/stemmer.html """ __vowels = "aeiou\xF6\xFC\xE1\xE9\xED\xF3\xF5\xFA\xFB" __digraphs = ("cs", "dz", "dzs", "gy", "ly", "ny", "ty", "zs") __double_consonants = ( "bb", "cc", "ccs", "dd", "ff", "gg", "ggy", "jj", "kk", "ll", "lly", "mm", "nn", "nny", "pp", "rr", "ss", "ssz", "tt", "tty", "vv", "zz", "zzs", ) __step1_suffixes = ("al", "el") __step2_suffixes = ( "k\xE9ppen", "onk\xE9nt", "enk\xE9nt", "ank\xE9nt", "k\xE9pp", "k\xE9nt", "ban", "ben", "nak", "nek", "val", "vel", "t\xF3l", "t\xF5l", "r\xF3l", "r\xF5l", "b\xF3l", "b\xF5l", "hoz", "hez", "h\xF6z", "n\xE1l", "n\xE9l", "\xE9rt", "kor", "ba", "be", "ra", "re", "ig", "at", "et", "ot", "\xF6t", "ul", "\xFCl", "v\xE1", "v\xE9", "en", "on", "an", "\xF6n", "n", "t", ) __step3_suffixes = ("\xE1nk\xE9nt", "\xE1n", "\xE9n") __step4_suffixes = ( "astul", "est\xFCl", "\xE1stul", "\xE9st\xFCl", "stul", "st\xFCl", ) __step5_suffixes = ("\xE1", "\xE9") __step6_suffixes = ( "ok\xE9", "\xF6k\xE9", "ak\xE9", "ek\xE9", "\xE1k\xE9", "\xE1\xE9i", "\xE9k\xE9", "\xE9\xE9i", "k\xE9", "\xE9i", "\xE9\xE9", "\xE9", ) __step7_suffixes = ( "\xE1juk", "\xE9j\xFCk", "\xFCnk", "unk", "juk", "j\xFCk", "\xE1nk", "\xE9nk", "nk", "uk", "\xFCk", "em", "om", "am", "od", "ed", "ad", "\xF6d", "ja", "je", "\xE1m", "\xE1d", "\xE9m", "\xE9d", "m", "d", "a", "e", "o", "\xE1", "\xE9", ) __step8_suffixes = ( "jaitok", "jeitek", "jaink", "jeink", "aitok", "eitek", "\xE1itok", "\xE9itek", "jaim", "jeim", "jaid", "jeid", "eink", "aink", "itek", "jeik", "jaik", "\xE1ink", "\xE9ink", "aim", "eim", "aid", "eid", "jai", "jei", "ink", "aik", "eik", "\xE1im", "\xE1id", "\xE1ik", "\xE9im", "\xE9id", "\xE9ik", "im", "id", "ai", "ei", "ik", "\xE1i", "\xE9i", "i", ) __step9_suffixes = ("\xE1k", "\xE9k", "\xF6k", "ok", "ek", "ak", "k") def stem(self, word): """ Stem an Hungarian word and return the stemmed form. :param word: The word that is stemmed. :type word: str or unicode :return: The stemmed form. 
:rtype: unicode """ word = word.lower() if word in self.stopwords: return word r1 = self.__r1_hungarian(word, self.__vowels, self.__digraphs) # STEP 1: Remove instrumental case if r1.endswith(self.__step1_suffixes): for double_cons in self.__double_consonants: if word[-2 - len(double_cons) : -2] == double_cons: word = "".join((word[:-4], word[-3])) if r1[-2 - len(double_cons) : -2] == double_cons: r1 = "".join((r1[:-4], r1[-3])) break # STEP 2: Remove frequent cases for suffix in self.__step2_suffixes: if word.endswith(suffix): if r1.endswith(suffix): word = word[: -len(suffix)] r1 = r1[: -len(suffix)] if r1.endswith("\xE1"): word = "".join((word[:-1], "a")) r1 = suffix_replace(r1, "\xE1", "a") elif r1.endswith("\xE9"): word = "".join((word[:-1], "e")) r1 = suffix_replace(r1, "\xE9", "e") break # STEP 3: Remove special cases for suffix in self.__step3_suffixes: if r1.endswith(suffix): if suffix == "\xE9n": word = suffix_replace(word, suffix, "e") r1 = suffix_replace(r1, suffix, "e") else: word = suffix_replace(word, suffix, "a") r1 = suffix_replace(r1, suffix, "a") break # STEP 4: Remove other cases for suffix in self.__step4_suffixes: if r1.endswith(suffix): if suffix == "\xE1stul": word = suffix_replace(word, suffix, "a") r1 = suffix_replace(r1, suffix, "a") elif suffix == "\xE9st\xFCl": word = suffix_replace(word, suffix, "e") r1 = suffix_replace(r1, suffix, "e") else: word = word[: -len(suffix)] r1 = r1[: -len(suffix)] break # STEP 5: Remove factive case for suffix in self.__step5_suffixes: if r1.endswith(suffix): for double_cons in self.__double_consonants: if word[-1 - len(double_cons) : -1] == double_cons: word = "".join((word[:-3], word[-2])) if r1[-1 - len(double_cons) : -1] == double_cons: r1 = "".join((r1[:-3], r1[-2])) break # STEP 6: Remove owned for suffix in self.__step6_suffixes: if r1.endswith(suffix): if suffix in ("\xE1k\xE9", "\xE1\xE9i"): word = suffix_replace(word, suffix, "a") r1 = suffix_replace(r1, suffix, "a") elif suffix in ("\xE9k\xE9", "\xE9\xE9i", "\xE9\xE9"): word = suffix_replace(word, suffix, "e") r1 = suffix_replace(r1, suffix, "e") else: word = word[: -len(suffix)] r1 = r1[: -len(suffix)] break # STEP 7: Remove singular owner suffixes for suffix in self.__step7_suffixes: if word.endswith(suffix): if r1.endswith(suffix): if suffix in ("\xE1nk", "\xE1juk", "\xE1m", "\xE1d", "\xE1"): word = suffix_replace(word, suffix, "a") r1 = suffix_replace(r1, suffix, "a") elif suffix in ("\xE9nk", "\xE9j\xFCk", "\xE9m", "\xE9d", "\xE9"): word = suffix_replace(word, suffix, "e") r1 = suffix_replace(r1, suffix, "e") else: word = word[: -len(suffix)] r1 = r1[: -len(suffix)] break # STEP 8: Remove plural owner suffixes for suffix in self.__step8_suffixes: if word.endswith(suffix): if r1.endswith(suffix): if suffix in ( "\xE1im", "\xE1id", "\xE1i", "\xE1ink", "\xE1itok", "\xE1ik", ): word = suffix_replace(word, suffix, "a") r1 = suffix_replace(r1, suffix, "a") elif suffix in ( "\xE9im", "\xE9id", "\xE9i", "\xE9ink", "\xE9itek", "\xE9ik", ): word = suffix_replace(word, suffix, "e") r1 = suffix_replace(r1, suffix, "e") else: word = word[: -len(suffix)] r1 = r1[: -len(suffix)] break # STEP 9: Remove plural suffixes for suffix in self.__step9_suffixes: if word.endswith(suffix): if r1.endswith(suffix): if suffix == "\xE1k": word = suffix_replace(word, suffix, "a") elif suffix == "\xE9k": word = suffix_replace(word, suffix, "e") else: word = word[: -len(suffix)] break return word def __r1_hungarian(self, word, vowels, digraphs): """ Return the region R1 that is used by the 
Hungarian stemmer. If the word begins with a vowel, R1 is defined as the region after the first consonant or digraph (= two letters stand for one phoneme) in the word. If the word begins with a consonant, it is defined as the region after the first vowel in the word. If the word does not contain both a vowel and consonant, R1 is the null region at the end of the word. :param word: The Hungarian word whose region R1 is determined. :type word: str or unicode :param vowels: The Hungarian vowels that are used to determine the region R1. :type vowels: unicode :param digraphs: The digraphs that are used to determine the region R1. :type digraphs: tuple :return: the region R1 for the respective word. :rtype: unicode :note: This helper method is invoked by the stem method of the subclass HungarianStemmer. It is not to be invoked directly! """ r1 = "" if word[0] in vowels: for digraph in digraphs: if digraph in word[1:]: r1 = word[word.index(digraph[-1]) + 1 :] return r1 for i in range(1, len(word)): if word[i] not in vowels: r1 = word[i + 1 :] break else: for i in range(1, len(word)): if word[i] in vowels: r1 = word[i + 1 :] break return r1 class ItalianStemmer(_StandardStemmer): """ The Italian Snowball stemmer. :cvar __vowels: The Italian vowels. :type __vowels: unicode :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm. :type __step0_suffixes: tuple :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. :type __step1_suffixes: tuple :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. :type __step2_suffixes: tuple :note: A detailed description of the Italian stemming algorithm can be found under http://snowball.tartarus.org/algorithms/italian/stemmer.html """ __vowels = "aeiou\xE0\xE8\xEC\xF2\xF9" __step0_suffixes = ( "gliela", "gliele", "glieli", "glielo", "gliene", "sene", "mela", "mele", "meli", "melo", "mene", "tela", "tele", "teli", "telo", "tene", "cela", "cele", "celi", "celo", "cene", "vela", "vele", "veli", "velo", "vene", "gli", "ci", "la", "le", "li", "lo", "mi", "ne", "si", "ti", "vi", ) __step1_suffixes = ( "atrice", "atrici", "azione", "azioni", "uzione", "uzioni", "usione", "usioni", "amento", "amenti", "imento", "imenti", "amente", "abile", "abili", "ibile", "ibili", "mente", "atore", "atori", "logia", "logie", "anza", "anze", "iche", "ichi", "ismo", "ismi", "ista", "iste", "isti", "ist\xE0", "ist\xE8", "ist\xEC", "ante", "anti", "enza", "enze", "ico", "ici", "ica", "ice", "oso", "osi", "osa", "ose", "it\xE0", "ivo", "ivi", "iva", "ive", ) __step2_suffixes = ( "erebbero", "irebbero", "assero", "assimo", "eranno", "erebbe", "eremmo", "ereste", "eresti", "essero", "iranno", "irebbe", "iremmo", "ireste", "iresti", "iscano", "iscono", "issero", "arono", "avamo", "avano", "avate", "eremo", "erete", "erono", "evamo", "evano", "evate", "iremo", "irete", "irono", "ivamo", "ivano", "ivate", "ammo", "ando", "asse", "assi", "emmo", "enda", "ende", "endi", "endo", "erai", "erei", "Yamo", "iamo", "immo", "irai", "irei", "isca", "isce", "isci", "isco", "ano", "are", "ata", "ate", "ati", "ato", "ava", "avi", "avo", "er\xE0", "ere", "er\xF2", "ete", "eva", "evi", "evo", "ir\xE0", "ire", "ir\xF2", "ita", "ite", "iti", "ito", "iva", "ivi", "ivo", "ono", "uta", "ute", "uti", "uto", "ar", "ir", ) def stem(self, word): """ Stem an Italian word and return the stemmed form. :param word: The word that is stemmed. :type word: str or unicode :return: The stemmed form. 
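# Usage sketch for the HungarianStemmer defined above; its region R1 comes
# from the private ``__r1_hungarian`` helper (vowel/digraph aware) instead of
# the standard R1/R2 computation. The sample word is illustrative; the output
# is not asserted.
#
#     >>> from nltk.stem.snowball import HungarianStemmer
#     >>> HungarianStemmer().stem("babakocsi")  # doctest: +SKIP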
:rtype: unicode """ word = word.lower() if word in self.stopwords: return word step1_success = False # All acute accents are replaced by grave accents. word = ( word.replace("\xE1", "\xE0") .replace("\xE9", "\xE8") .replace("\xED", "\xEC") .replace("\xF3", "\xF2") .replace("\xFA", "\xF9") ) # Every occurrence of 'u' after 'q' # is put into upper case. for i in range(1, len(word)): if word[i - 1] == "q" and word[i] == "u": word = "".join((word[:i], "U", word[i + 1 :])) # Every occurrence of 'u' and 'i' # between vowels is put into upper case. for i in range(1, len(word) - 1): if word[i - 1] in self.__vowels and word[i + 1] in self.__vowels: if word[i] == "u": word = "".join((word[:i], "U", word[i + 1 :])) elif word[i] == "i": word = "".join((word[:i], "I", word[i + 1 :])) r1, r2 = self._r1r2_standard(word, self.__vowels) rv = self._rv_standard(word, self.__vowels) # STEP 0: Attached pronoun for suffix in self.__step0_suffixes: if rv.endswith(suffix): if rv[-len(suffix) - 4 : -len(suffix)] in ("ando", "endo"): word = word[: -len(suffix)] r1 = r1[: -len(suffix)] r2 = r2[: -len(suffix)] rv = rv[: -len(suffix)] elif rv[-len(suffix) - 2 : -len(suffix)] in ("ar", "er", "ir"): word = suffix_replace(word, suffix, "e") r1 = suffix_replace(r1, suffix, "e") r2 = suffix_replace(r2, suffix, "e") rv = suffix_replace(rv, suffix, "e") break # STEP 1: Standard suffix removal for suffix in self.__step1_suffixes: if word.endswith(suffix): if suffix == "amente" and r1.endswith(suffix): step1_success = True word = word[:-6] r2 = r2[:-6] rv = rv[:-6] if r2.endswith("iv"): word = word[:-2] r2 = r2[:-2] rv = rv[:-2] if r2.endswith("at"): word = word[:-2] rv = rv[:-2] elif r2.endswith(("os", "ic")): word = word[:-2] rv = rv[:-2] elif r2.endswith("abil"): word = word[:-4] rv = rv[:-4] elif suffix in ("amento", "amenti", "imento", "imenti") and rv.endswith( suffix ): step1_success = True word = word[:-6] rv = rv[:-6] elif r2.endswith(suffix): step1_success = True if suffix in ("azione", "azioni", "atore", "atori"): word = word[: -len(suffix)] r2 = r2[: -len(suffix)] rv = rv[: -len(suffix)] if r2.endswith("ic"): word = word[:-2] rv = rv[:-2] elif suffix in ("logia", "logie"): word = word[:-2] rv = word[:-2] elif suffix in ("uzione", "uzioni", "usione", "usioni"): word = word[:-5] rv = rv[:-5] elif suffix in ("enza", "enze"): word = suffix_replace(word, suffix, "te") rv = suffix_replace(rv, suffix, "te") elif suffix == "it\xE0": word = word[:-3] r2 = r2[:-3] rv = rv[:-3] if r2.endswith(("ic", "iv")): word = word[:-2] rv = rv[:-2] elif r2.endswith("abil"): word = word[:-4] rv = rv[:-4] elif suffix in ("ivo", "ivi", "iva", "ive"): word = word[:-3] r2 = r2[:-3] rv = rv[:-3] if r2.endswith("at"): word = word[:-2] r2 = r2[:-2] rv = rv[:-2] if r2.endswith("ic"): word = word[:-2] rv = rv[:-2] else: word = word[: -len(suffix)] rv = rv[: -len(suffix)] break # STEP 2: Verb suffixes if not step1_success: for suffix in self.__step2_suffixes: if rv.endswith(suffix): word = word[: -len(suffix)] rv = rv[: -len(suffix)] break # STEP 3a if rv.endswith(("a", "e", "i", "o", "\xE0", "\xE8", "\xEC", "\xF2")): word = word[:-1] rv = rv[:-1] if rv.endswith("i"): word = word[:-1] rv = rv[:-1] # STEP 3b if rv.endswith(("ch", "gh")): word = word[:-1] word = word.replace("I", "i").replace("U", "u") return word class NorwegianStemmer(_ScandinavianStemmer): """ The Norwegian Snowball stemmer. :cvar __vowels: The Norwegian vowels. :type __vowels: unicode :cvar __s_ending: Letters that may directly appear before a word final 's'. 
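# Usage sketch for the ItalianStemmer defined above, which first strips
# attached pronouns (step 0) before the standard suffix steps. Sample words
# are illustrative; outputs are not asserted.
#
#     >>> from nltk.stem.snowball import ItalianStemmer
#     >>> st = ItalianStemmer()
#     >>> [st.stem(w) for w in ["mangiandolo", "pronunziare"]]  # doctest: +SKIP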
:type __s_ending: unicode :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. :type __step1_suffixes: tuple :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. :type __step2_suffixes: tuple :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. :type __step3_suffixes: tuple :note: A detailed description of the Norwegian stemming algorithm can be found under http://snowball.tartarus.org/algorithms/norwegian/stemmer.html """ __vowels = "aeiouy\xE6\xE5\xF8" __s_ending = "bcdfghjlmnoprtvyz" __step1_suffixes = ( "hetenes", "hetene", "hetens", "heter", "heten", "endes", "ande", "ende", "edes", "enes", "erte", "ede", "ane", "ene", "ens", "ers", "ets", "het", "ast", "ert", "en", "ar", "er", "as", "es", "et", "a", "e", "s", ) __step2_suffixes = ("dt", "vt") __step3_suffixes = ( "hetslov", "eleg", "elig", "elov", "slov", "leg", "eig", "lig", "els", "lov", "ig", ) def stem(self, word): """ Stem a Norwegian word and return the stemmed form. :param word: The word that is stemmed. :type word: str or unicode :return: The stemmed form. :rtype: unicode """ word = word.lower() if word in self.stopwords: return word r1 = self._r1_scandinavian(word, self.__vowels) # STEP 1 for suffix in self.__step1_suffixes: if r1.endswith(suffix): if suffix in ("erte", "ert"): word = suffix_replace(word, suffix, "er") r1 = suffix_replace(r1, suffix, "er") elif suffix == "s": if word[-2] in self.__s_ending or ( word[-2] == "k" and word[-3] not in self.__vowels ): word = word[:-1] r1 = r1[:-1] else: word = word[: -len(suffix)] r1 = r1[: -len(suffix)] break # STEP 2 for suffix in self.__step2_suffixes: if r1.endswith(suffix): word = word[:-1] r1 = r1[:-1] break # STEP 3 for suffix in self.__step3_suffixes: if r1.endswith(suffix): word = word[: -len(suffix)] break return word class PortugueseStemmer(_StandardStemmer): """ The Portuguese Snowball stemmer. :cvar __vowels: The Portuguese vowels. :type __vowels: unicode :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. :type __step1_suffixes: tuple :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. :type __step2_suffixes: tuple :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm. 
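# Usage sketch for the NorwegianStemmer defined above. The sample word is
# illustrative; its output is not asserted.
#
#     >>> from nltk.stem.snowball import NorwegianStemmer
#     >>> NorwegianStemmer().stem("opplysningene")  # doctest: +SKIP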
:type __step4_suffixes: tuple :note: A detailed description of the Portuguese stemming algorithm can be found under http://snowball.tartarus.org/algorithms/portuguese/stemmer.html """ __vowels = "aeiou\xE1\xE9\xED\xF3\xFA\xE2\xEA\xF4" __step1_suffixes = ( "amentos", "imentos", "uço~es", "amento", "imento", "adoras", "adores", "a\xE7o~es", "logias", "\xEAncias", "amente", "idades", "an\xE7as", "ismos", "istas", "adora", "a\xE7a~o", "antes", "\xE2ncia", "logia", "uça~o", "\xEAncia", "mente", "idade", "an\xE7a", "ezas", "icos", "icas", "ismo", "\xE1vel", "\xEDvel", "ista", "osos", "osas", "ador", "ante", "ivas", "ivos", "iras", "eza", "ico", "ica", "oso", "osa", "iva", "ivo", "ira", ) __step2_suffixes = ( "ar\xEDamos", "er\xEDamos", "ir\xEDamos", "\xE1ssemos", "\xEAssemos", "\xEDssemos", "ar\xEDeis", "er\xEDeis", "ir\xEDeis", "\xE1sseis", "\xE9sseis", "\xEDsseis", "\xE1ramos", "\xE9ramos", "\xEDramos", "\xE1vamos", "aremos", "eremos", "iremos", "ariam", "eriam", "iriam", "assem", "essem", "issem", "ara~o", "era~o", "ira~o", "arias", "erias", "irias", "ardes", "erdes", "irdes", "asses", "esses", "isses", "astes", "estes", "istes", "\xE1reis", "areis", "\xE9reis", "ereis", "\xEDreis", "ireis", "\xE1veis", "\xEDamos", "armos", "ermos", "irmos", "aria", "eria", "iria", "asse", "esse", "isse", "aste", "este", "iste", "arei", "erei", "irei", "aram", "eram", "iram", "avam", "arem", "erem", "irem", "ando", "endo", "indo", "adas", "idas", "ar\xE1s", "aras", "er\xE1s", "eras", "ir\xE1s", "avas", "ares", "eres", "ires", "\xEDeis", "ados", "idos", "\xE1mos", "amos", "emos", "imos", "iras", "ada", "ida", "ar\xE1", "ara", "er\xE1", "era", "ir\xE1", "ava", "iam", "ado", "ido", "ias", "ais", "eis", "ira", "ia", "ei", "am", "em", "ar", "er", "ir", "as", "es", "is", "eu", "iu", "ou", ) __step4_suffixes = ("os", "a", "i", "o", "\xE1", "\xED", "\xF3") def stem(self, word): """ Stem a Portuguese word and return the stemmed form. :param word: The word that is stemmed. :type word: str or unicode :return: The stemmed form. 
:rtype: unicode """ word = word.lower() if word in self.stopwords: return word step1_success = False step2_success = False word = ( word.replace("\xE3", "a~") .replace("\xF5", "o~") .replace("q\xFC", "qu") .replace("g\xFC", "gu") ) r1, r2 = self._r1r2_standard(word, self.__vowels) rv = self._rv_standard(word, self.__vowels) # STEP 1: Standard suffix removal for suffix in self.__step1_suffixes: if word.endswith(suffix): if suffix == "amente" and r1.endswith(suffix): step1_success = True word = word[:-6] r2 = r2[:-6] rv = rv[:-6] if r2.endswith("iv"): word = word[:-2] r2 = r2[:-2] rv = rv[:-2] if r2.endswith("at"): word = word[:-2] rv = rv[:-2] elif r2.endswith(("os", "ic", "ad")): word = word[:-2] rv = rv[:-2] elif ( suffix in ("ira", "iras") and rv.endswith(suffix) and word[-len(suffix) - 1 : -len(suffix)] == "e" ): step1_success = True word = suffix_replace(word, suffix, "ir") rv = suffix_replace(rv, suffix, "ir") elif r2.endswith(suffix): step1_success = True if suffix in ("logia", "logias"): word = suffix_replace(word, suffix, "log") rv = suffix_replace(rv, suffix, "log") elif suffix in ("uça~o", "uço~es"): word = suffix_replace(word, suffix, "u") rv = suffix_replace(rv, suffix, "u") elif suffix in ("\xEAncia", "\xEAncias"): word = suffix_replace(word, suffix, "ente") rv = suffix_replace(rv, suffix, "ente") elif suffix == "mente": word = word[:-5] r2 = r2[:-5] rv = rv[:-5] if r2.endswith(("ante", "avel", "ivel")): word = word[:-4] rv = rv[:-4] elif suffix in ("idade", "idades"): word = word[: -len(suffix)] r2 = r2[: -len(suffix)] rv = rv[: -len(suffix)] if r2.endswith(("ic", "iv")): word = word[:-2] rv = rv[:-2] elif r2.endswith("abil"): word = word[:-4] rv = rv[:-4] elif suffix in ("iva", "ivo", "ivas", "ivos"): word = word[: -len(suffix)] r2 = r2[: -len(suffix)] rv = rv[: -len(suffix)] if r2.endswith("at"): word = word[:-2] rv = rv[:-2] else: word = word[: -len(suffix)] rv = rv[: -len(suffix)] break # STEP 2: Verb suffixes if not step1_success: for suffix in self.__step2_suffixes: if rv.endswith(suffix): step2_success = True word = word[: -len(suffix)] rv = rv[: -len(suffix)] break # STEP 3 if step1_success or step2_success: if rv.endswith("i") and word[-2] == "c": word = word[:-1] rv = rv[:-1] ### STEP 4: Residual suffix if not step1_success and not step2_success: for suffix in self.__step4_suffixes: if rv.endswith(suffix): word = word[: -len(suffix)] rv = rv[: -len(suffix)] break # STEP 5 if rv.endswith(("e", "\xE9", "\xEA")): word = word[:-1] rv = rv[:-1] if (word.endswith("gu") and rv.endswith("u")) or ( word.endswith("ci") and rv.endswith("i") ): word = word[:-1] elif word.endswith("\xE7"): word = suffix_replace(word, "\xE7", "c") word = word.replace("a~", "\xE3").replace("o~", "\xF5") return word class RomanianStemmer(_StandardStemmer): """ The Romanian Snowball stemmer. :cvar __vowels: The Romanian vowels. :type __vowels: unicode :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm. :type __step0_suffixes: tuple :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. :type __step1_suffixes: tuple :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. :type __step2_suffixes: tuple :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. 
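# Usage sketch for the PortugueseStemmer defined above. As implemented in its
# ``stem`` method, nasal vowels are rewritten internally ("\xE3" -> "a~",
# "\xF5" -> "o~") before the suffix steps and restored at the end. The sample
# word is illustrative; its output is not asserted.
#
#     >>> from nltk.stem.snowball import PortugueseStemmer
#     >>> PortugueseStemmer().stem("quilometros")  # doctest: +SKIP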
:type __step3_suffixes: tuple :note: A detailed description of the Romanian stemming algorithm can be found under http://snowball.tartarus.org/algorithms/romanian/stemmer.html """ __vowels = "aeiou\u0103\xE2\xEE" __step0_suffixes = ( "iilor", "ului", "elor", "iile", "ilor", "atei", "a\u0163ie", "a\u0163ia", "aua", "ele", "iua", "iei", "ile", "ul", "ea", "ii", ) __step1_suffixes = ( "abilitate", "abilitati", "abilit\u0103\u0163i", "ibilitate", "abilit\u0103i", "ivitate", "ivitati", "ivit\u0103\u0163i", "icitate", "icitati", "icit\u0103\u0163i", "icatori", "ivit\u0103i", "icit\u0103i", "icator", "a\u0163iune", "atoare", "\u0103toare", "i\u0163iune", "itoare", "iciva", "icive", "icivi", "iciv\u0103", "icala", "icale", "icali", "ical\u0103", "ativa", "ative", "ativi", "ativ\u0103", "atori", "\u0103tori", "itiva", "itive", "itivi", "itiv\u0103", "itori", "iciv", "ical", "ativ", "ator", "\u0103tor", "itiv", "itor", ) __step2_suffixes = ( "abila", "abile", "abili", "abil\u0103", "ibila", "ibile", "ibili", "ibil\u0103", "atori", "itate", "itati", "it\u0103\u0163i", "abil", "ibil", "oasa", "oas\u0103", "oase", "anta", "ante", "anti", "ant\u0103", "ator", "it\u0103i", "iune", "iuni", "isme", "ista", "iste", "isti", "ist\u0103", "i\u015Fti", "ata", "at\u0103", "ati", "ate", "uta", "ut\u0103", "uti", "ute", "ita", "it\u0103", "iti", "ite", "ica", "ice", "ici", "ic\u0103", "osi", "o\u015Fi", "ant", "iva", "ive", "ivi", "iv\u0103", "ism", "ist", "at", "ut", "it", "ic", "os", "iv", ) __step3_suffixes = ( "seser\u0103\u0163i", "aser\u0103\u0163i", "iser\u0103\u0163i", "\xE2ser\u0103\u0163i", "user\u0103\u0163i", "seser\u0103m", "aser\u0103m", "iser\u0103m", "\xE2ser\u0103m", "user\u0103m", "ser\u0103\u0163i", "sese\u015Fi", "seser\u0103", "easc\u0103", "ar\u0103\u0163i", "ur\u0103\u0163i", "ir\u0103\u0163i", "\xE2r\u0103\u0163i", "ase\u015Fi", "aser\u0103", "ise\u015Fi", "iser\u0103", "\xe2se\u015Fi", "\xE2ser\u0103", "use\u015Fi", "user\u0103", "ser\u0103m", "sesem", "indu", "\xE2ndu", "eaz\u0103", "e\u015Fti", "e\u015Fte", "\u0103\u015Fti", "\u0103\u015Fte", "ea\u0163i", "ia\u0163i", "ar\u0103m", "ur\u0103m", "ir\u0103m", "\xE2r\u0103m", "asem", "isem", "\xE2sem", "usem", "se\u015Fi", "ser\u0103", "sese", "are", "ere", "ire", "\xE2re", "ind", "\xE2nd", "eze", "ezi", "esc", "\u0103sc", "eam", "eai", "eau", "iam", "iai", "iau", "a\u015Fi", "ar\u0103", "u\u015Fi", "ur\u0103", "i\u015Fi", "ir\u0103", "\xE2\u015Fi", "\xe2r\u0103", "ase", "ise", "\xE2se", "use", "a\u0163i", "e\u0163i", "i\u0163i", "\xe2\u0163i", "sei", "ez", "am", "ai", "au", "ea", "ia", "ui", "\xE2i", "\u0103m", "em", "im", "\xE2m", "se", ) def stem(self, word): """ Stem a Romanian word and return the stemmed form. :param word: The word that is stemmed. :type word: str or unicode :return: The stemmed form. 
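# Minimal usage sketch for the RomanianStemmer defined above (assumes NLTK is
# importable; the example words are arbitrary and the stems follow the steps
# implemented below).
from nltk.stem.snowball import RomanianStemmer

stemmer = RomanianStemmer()
for word in ["copiilor", "frumoasele", "abilitate"]:
    print(word, "->", stemmer.stem(word))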
:rtype: unicode """ word = word.lower() if word in self.stopwords: return word step1_success = False step2_success = False for i in range(1, len(word) - 1): if word[i - 1] in self.__vowels and word[i + 1] in self.__vowels: if word[i] == "u": word = "".join((word[:i], "U", word[i + 1 :])) elif word[i] == "i": word = "".join((word[:i], "I", word[i + 1 :])) r1, r2 = self._r1r2_standard(word, self.__vowels) rv = self._rv_standard(word, self.__vowels) # STEP 0: Removal of plurals and other simplifications for suffix in self.__step0_suffixes: if word.endswith(suffix): if suffix in r1: if suffix in ("ul", "ului"): word = word[: -len(suffix)] if suffix in rv: rv = rv[: -len(suffix)] else: rv = "" elif ( suffix == "aua" or suffix == "atei" or (suffix == "ile" and word[-5:-3] != "ab") ): word = word[:-2] elif suffix in ("ea", "ele", "elor"): word = suffix_replace(word, suffix, "e") if suffix in rv: rv = suffix_replace(rv, suffix, "e") else: rv = "" elif suffix in ("ii", "iua", "iei", "iile", "iilor", "ilor"): word = suffix_replace(word, suffix, "i") if suffix in rv: rv = suffix_replace(rv, suffix, "i") else: rv = "" elif suffix in ("a\u0163ie", "a\u0163ia"): word = word[:-1] break # STEP 1: Reduction of combining suffixes while True: replacement_done = False for suffix in self.__step1_suffixes: if word.endswith(suffix): if suffix in r1: step1_success = True replacement_done = True if suffix in ( "abilitate", "abilitati", "abilit\u0103i", "abilit\u0103\u0163i", ): word = suffix_replace(word, suffix, "abil") elif suffix == "ibilitate": word = word[:-5] elif suffix in ( "ivitate", "ivitati", "ivit\u0103i", "ivit\u0103\u0163i", ): word = suffix_replace(word, suffix, "iv") elif suffix in ( "icitate", "icitati", "icit\u0103i", "icit\u0103\u0163i", "icator", "icatori", "iciv", "iciva", "icive", "icivi", "iciv\u0103", "ical", "icala", "icale", "icali", "ical\u0103", ): word = suffix_replace(word, suffix, "ic") elif suffix in ( "ativ", "ativa", "ative", "ativi", "ativ\u0103", "a\u0163iune", "atoare", "ator", "atori", "\u0103toare", "\u0103tor", "\u0103tori", ): word = suffix_replace(word, suffix, "at") if suffix in r2: r2 = suffix_replace(r2, suffix, "at") elif suffix in ( "itiv", "itiva", "itive", "itivi", "itiv\u0103", "i\u0163iune", "itoare", "itor", "itori", ): word = suffix_replace(word, suffix, "it") if suffix in r2: r2 = suffix_replace(r2, suffix, "it") else: step1_success = False break if not replacement_done: break # STEP 2: Removal of standard suffixes for suffix in self.__step2_suffixes: if word.endswith(suffix): if suffix in r2: step2_success = True if suffix in ("iune", "iuni"): if word[-5] == "\u0163": word = "".join((word[:-5], "t")) elif suffix in ( "ism", "isme", "ist", "ista", "iste", "isti", "ist\u0103", "i\u015Fti", ): word = suffix_replace(word, suffix, "ist") else: word = word[: -len(suffix)] break # STEP 3: Removal of verb suffixes if not step1_success and not step2_success: for suffix in self.__step3_suffixes: if word.endswith(suffix): if suffix in rv: if suffix in ( "seser\u0103\u0163i", "seser\u0103m", "ser\u0103\u0163i", "sese\u015Fi", "seser\u0103", "ser\u0103m", "sesem", "se\u015Fi", "ser\u0103", "sese", "a\u0163i", "e\u0163i", "i\u0163i", "\xE2\u0163i", "sei", "\u0103m", "em", "im", "\xE2m", "se", ): word = word[: -len(suffix)] rv = rv[: -len(suffix)] else: if ( not rv.startswith(suffix) and rv[rv.index(suffix) - 1] not in "aeio\u0103\xE2\xEE" ): word = word[: -len(suffix)] break # STEP 4: Removal of final vowel for suffix in ("ie", "a", "e", "i", "\u0103"): if 
word.endswith(suffix): if suffix in rv: word = word[: -len(suffix)] break word = word.replace("I", "i").replace("U", "u") return word class RussianStemmer(_LanguageSpecificStemmer): """ The Russian Snowball stemmer. :cvar __perfective_gerund_suffixes: Suffixes to be deleted. :type __perfective_gerund_suffixes: tuple :cvar __adjectival_suffixes: Suffixes to be deleted. :type __adjectival_suffixes: tuple :cvar __reflexive_suffixes: Suffixes to be deleted. :type __reflexive_suffixes: tuple :cvar __verb_suffixes: Suffixes to be deleted. :type __verb_suffixes: tuple :cvar __noun_suffixes: Suffixes to be deleted. :type __noun_suffixes: tuple :cvar __superlative_suffixes: Suffixes to be deleted. :type __superlative_suffixes: tuple :cvar __derivational_suffixes: Suffixes to be deleted. :type __derivational_suffixes: tuple :note: A detailed description of the Russian stemming algorithm can be found under http://snowball.tartarus.org/algorithms/russian/stemmer.html """ __perfective_gerund_suffixes = ( "ivshis'", "yvshis'", "vshis'", "ivshi", "yvshi", "vshi", "iv", "yv", "v", ) __adjectival_suffixes = ( "ui^ushchi^ui^u", "ui^ushchi^ai^a", "ui^ushchimi", "ui^ushchymi", "ui^ushchego", "ui^ushchogo", "ui^ushchemu", "ui^ushchomu", "ui^ushchikh", "ui^ushchykh", "ui^ushchui^u", "ui^ushchaia", "ui^ushchoi^u", "ui^ushchei^u", "i^ushchi^ui^u", "i^ushchi^ai^a", "ui^ushchee", "ui^ushchie", "ui^ushchye", "ui^ushchoe", "ui^ushchei`", "ui^ushchii`", "ui^ushchyi`", "ui^ushchoi`", "ui^ushchem", "ui^ushchim", "ui^ushchym", "ui^ushchom", "i^ushchimi", "i^ushchymi", "i^ushchego", "i^ushchogo", "i^ushchemu", "i^ushchomu", "i^ushchikh", "i^ushchykh", "i^ushchui^u", "i^ushchai^a", "i^ushchoi^u", "i^ushchei^u", "i^ushchee", "i^ushchie", "i^ushchye", "i^ushchoe", "i^ushchei`", "i^ushchii`", "i^ushchyi`", "i^ushchoi`", "i^ushchem", "i^ushchim", "i^ushchym", "i^ushchom", "shchi^ui^u", "shchi^ai^a", "ivshi^ui^u", "ivshi^ai^a", "yvshi^ui^u", "yvshi^ai^a", "shchimi", "shchymi", "shchego", "shchogo", "shchemu", "shchomu", "shchikh", "shchykh", "shchui^u", "shchai^a", "shchoi^u", "shchei^u", "ivshimi", "ivshymi", "ivshego", "ivshogo", "ivshemu", "ivshomu", "ivshikh", "ivshykh", "ivshui^u", "ivshai^a", "ivshoi^u", "ivshei^u", "yvshimi", "yvshymi", "yvshego", "yvshogo", "yvshemu", "yvshomu", "yvshikh", "yvshykh", "yvshui^u", "yvshai^a", "yvshoi^u", "yvshei^u", "vshi^ui^u", "vshi^ai^a", "shchee", "shchie", "shchye", "shchoe", "shchei`", "shchii`", "shchyi`", "shchoi`", "shchem", "shchim", "shchym", "shchom", "ivshee", "ivshie", "ivshye", "ivshoe", "ivshei`", "ivshii`", "ivshyi`", "ivshoi`", "ivshem", "ivshim", "ivshym", "ivshom", "yvshee", "yvshie", "yvshye", "yvshoe", "yvshei`", "yvshii`", "yvshyi`", "yvshoi`", "yvshem", "yvshim", "yvshym", "yvshom", "vshimi", "vshymi", "vshego", "vshogo", "vshemu", "vshomu", "vshikh", "vshykh", "vshui^u", "vshai^a", "vshoi^u", "vshei^u", "emi^ui^u", "emi^ai^a", "nni^ui^u", "nni^ai^a", "vshee", "vshie", "vshye", "vshoe", "vshei`", "vshii`", "vshyi`", "vshoi`", "vshem", "vshim", "vshym", "vshom", "emimi", "emymi", "emego", "emogo", "ememu", "emomu", "emikh", "emykh", "emui^u", "emai^a", "emoi^u", "emei^u", "nnimi", "nnymi", "nnego", "nnogo", "nnemu", "nnomu", "nnikh", "nnykh", "nnui^u", "nnai^a", "nnoi^u", "nnei^u", "emee", "emie", "emye", "emoe", "emei`", "emii`", "emyi`", "emoi`", "emem", "emim", "emym", "emom", "nnee", "nnie", "nnye", "nnoe", "nnei`", "nnii`", "nnyi`", "nnoi`", "nnem", "nnim", "nnym", "nnom", "i^ui^u", "i^ai^a", "imi", "ymi", "ego", "ogo", "emu", "omu", "ikh", "ykh", "ui^u", 
"ai^a", "oi^u", "ei^u", "ee", "ie", "ye", "oe", "ei`", "ii`", "yi`", "oi`", "em", "im", "ym", "om", ) __reflexive_suffixes = ("si^a", "s'") __verb_suffixes = ( "esh'", "ei`te", "ui`te", "ui^ut", "ish'", "ete", "i`te", "i^ut", "nno", "ila", "yla", "ena", "ite", "ili", "yli", "ilo", "ylo", "eno", "i^at", "uet", "eny", "it'", "yt'", "ui^u", "la", "na", "li", "em", "lo", "no", "et", "ny", "t'", "ei`", "ui`", "il", "yl", "im", "ym", "en", "it", "yt", "i^u", "i`", "l", "n", ) __noun_suffixes = ( "ii^ami", "ii^akh", "i^ami", "ii^am", "i^akh", "ami", "iei`", "i^am", "iem", "akh", "ii^u", "'i^u", "ii^a", "'i^a", "ev", "ov", "ie", "'e", "ei", "ii", "ei`", "oi`", "ii`", "em", "am", "om", "i^u", "i^a", "a", "e", "i", "i`", "o", "u", "y", "'", ) __superlative_suffixes = ("ei`she", "ei`sh") __derivational_suffixes = ("ost'", "ost") def stem(self, word): """ Stem a Russian word and return the stemmed form. :param word: The word that is stemmed. :type word: str or unicode :return: The stemmed form. :rtype: unicode """ if word in self.stopwords: return word chr_exceeded = False for i in range(len(word)): if ord(word[i]) > 255: chr_exceeded = True break if not chr_exceeded: return word word = self.__cyrillic_to_roman(word) step1_success = False adjectival_removed = False verb_removed = False undouble_success = False superlative_removed = False rv, r2 = self.__regions_russian(word) # Step 1 for suffix in self.__perfective_gerund_suffixes: if rv.endswith(suffix): if suffix in ("v", "vshi", "vshis'"): if ( rv[-len(suffix) - 3 : -len(suffix)] == "i^a" or rv[-len(suffix) - 1 : -len(suffix)] == "a" ): word = word[: -len(suffix)] r2 = r2[: -len(suffix)] rv = rv[: -len(suffix)] step1_success = True break else: word = word[: -len(suffix)] r2 = r2[: -len(suffix)] rv = rv[: -len(suffix)] step1_success = True break if not step1_success: for suffix in self.__reflexive_suffixes: if rv.endswith(suffix): word = word[: -len(suffix)] r2 = r2[: -len(suffix)] rv = rv[: -len(suffix)] break for suffix in self.__adjectival_suffixes: if rv.endswith(suffix): if suffix in ( "i^ushchi^ui^u", "i^ushchi^ai^a", "i^ushchui^u", "i^ushchai^a", "i^ushchoi^u", "i^ushchei^u", "i^ushchimi", "i^ushchymi", "i^ushchego", "i^ushchogo", "i^ushchemu", "i^ushchomu", "i^ushchikh", "i^ushchykh", "shchi^ui^u", "shchi^ai^a", "i^ushchee", "i^ushchie", "i^ushchye", "i^ushchoe", "i^ushchei`", "i^ushchii`", "i^ushchyi`", "i^ushchoi`", "i^ushchem", "i^ushchim", "i^ushchym", "i^ushchom", "vshi^ui^u", "vshi^ai^a", "shchui^u", "shchai^a", "shchoi^u", "shchei^u", "emi^ui^u", "emi^ai^a", "nni^ui^u", "nni^ai^a", "shchimi", "shchymi", "shchego", "shchogo", "shchemu", "shchomu", "shchikh", "shchykh", "vshui^u", "vshai^a", "vshoi^u", "vshei^u", "shchee", "shchie", "shchye", "shchoe", "shchei`", "shchii`", "shchyi`", "shchoi`", "shchem", "shchim", "shchym", "shchom", "vshimi", "vshymi", "vshego", "vshogo", "vshemu", "vshomu", "vshikh", "vshykh", "emui^u", "emai^a", "emoi^u", "emei^u", "nnui^u", "nnai^a", "nnoi^u", "nnei^u", "vshee", "vshie", "vshye", "vshoe", "vshei`", "vshii`", "vshyi`", "vshoi`", "vshem", "vshim", "vshym", "vshom", "emimi", "emymi", "emego", "emogo", "ememu", "emomu", "emikh", "emykh", "nnimi", "nnymi", "nnego", "nnogo", "nnemu", "nnomu", "nnikh", "nnykh", "emee", "emie", "emye", "emoe", "emei`", "emii`", "emyi`", "emoi`", "emem", "emim", "emym", "emom", "nnee", "nnie", "nnye", "nnoe", "nnei`", "nnii`", "nnyi`", "nnoi`", "nnem", "nnim", "nnym", "nnom", ): if ( rv[-len(suffix) - 3 : -len(suffix)] == "i^a" or rv[-len(suffix) - 1 : -len(suffix)] == "a" 
): word = word[: -len(suffix)] r2 = r2[: -len(suffix)] rv = rv[: -len(suffix)] adjectival_removed = True break else: word = word[: -len(suffix)] r2 = r2[: -len(suffix)] rv = rv[: -len(suffix)] adjectival_removed = True break if not adjectival_removed: for suffix in self.__verb_suffixes: if rv.endswith(suffix): if suffix in ( "la", "na", "ete", "i`te", "li", "i`", "l", "em", "n", "lo", "no", "et", "i^ut", "ny", "t'", "esh'", "nno", ): if ( rv[-len(suffix) - 3 : -len(suffix)] == "i^a" or rv[-len(suffix) - 1 : -len(suffix)] == "a" ): word = word[: -len(suffix)] r2 = r2[: -len(suffix)] rv = rv[: -len(suffix)] verb_removed = True break else: word = word[: -len(suffix)] r2 = r2[: -len(suffix)] rv = rv[: -len(suffix)] verb_removed = True break if not adjectival_removed and not verb_removed: for suffix in self.__noun_suffixes: if rv.endswith(suffix): word = word[: -len(suffix)] r2 = r2[: -len(suffix)] rv = rv[: -len(suffix)] break # Step 2 if rv.endswith("i"): word = word[:-1] r2 = r2[:-1] # Step 3 for suffix in self.__derivational_suffixes: if r2.endswith(suffix): word = word[: -len(suffix)] break # Step 4 if word.endswith("nn"): word = word[:-1] undouble_success = True if not undouble_success: for suffix in self.__superlative_suffixes: if word.endswith(suffix): word = word[: -len(suffix)] superlative_removed = True break if word.endswith("nn"): word = word[:-1] if not undouble_success and not superlative_removed: if word.endswith("'"): word = word[:-1] word = self.__roman_to_cyrillic(word) return word def __regions_russian(self, word): """ Return the regions RV and R2 which are used by the Russian stemmer. In any word, RV is the region after the first vowel, or the end of the word if it contains no vowel. R2 is the region after the first non-vowel following a vowel in R1, or the end of the word if there is no such non-vowel. R1 is the region after the first non-vowel following a vowel, or the end of the word if there is no such non-vowel. :param word: The Russian word whose regions RV and R2 are determined. :type word: str or unicode :return: the regions RV and R2 for the respective Russian word. :rtype: tuple :note: This helper method is invoked by the stem method of the subclass RussianStemmer. It is not to be invoked directly! """ r1 = "" r2 = "" rv = "" vowels = ("A", "U", "E", "a", "e", "i", "o", "u", "y") word = word.replace("i^a", "A").replace("i^u", "U").replace("e`", "E") for i in range(1, len(word)): if word[i] not in vowels and word[i - 1] in vowels: r1 = word[i + 1 :] break for i in range(1, len(r1)): if r1[i] not in vowels and r1[i - 1] in vowels: r2 = r1[i + 1 :] break for i in range(len(word)): if word[i] in vowels: rv = word[i + 1 :] break r2 = r2.replace("A", "i^a").replace("U", "i^u").replace("E", "e`") rv = rv.replace("A", "i^a").replace("U", "i^u").replace("E", "e`") return (rv, r2) def __cyrillic_to_roman(self, word): """ Transliterate a Russian word into the Roman alphabet. A Russian word whose letters consist of the Cyrillic alphabet are transliterated into the Roman alphabet in order to ease the forthcoming stemming process. :param word: The word that is transliterated. :type word: unicode :return: the transliterated word. :rtype: unicode :note: This helper method is invoked by the stem method of the subclass RussianStemmer. It is not to be invoked directly! 
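# Minimal usage sketch for the RussianStemmer above. stem() transliterates
# Cyrillic input to the Roman alphabet with the private helper documented
# here, applies the suffix steps, and transliterates the result back, so
# callers simply pass Cyrillic strings (assumes NLTK is importable; the
# example words are arbitrary).
from nltk.stem.snowball import RussianStemmer

stemmer = RussianStemmer()
for word in ["книги", "красивые", "бегущий"]:
    print(word, "->", stemmer.stem(word))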
""" word = ( word.replace("\u0410", "a") .replace("\u0430", "a") .replace("\u0411", "b") .replace("\u0431", "b") .replace("\u0412", "v") .replace("\u0432", "v") .replace("\u0413", "g") .replace("\u0433", "g") .replace("\u0414", "d") .replace("\u0434", "d") .replace("\u0415", "e") .replace("\u0435", "e") .replace("\u0401", "e") .replace("\u0451", "e") .replace("\u0416", "zh") .replace("\u0436", "zh") .replace("\u0417", "z") .replace("\u0437", "z") .replace("\u0418", "i") .replace("\u0438", "i") .replace("\u0419", "i`") .replace("\u0439", "i`") .replace("\u041A", "k") .replace("\u043A", "k") .replace("\u041B", "l") .replace("\u043B", "l") .replace("\u041C", "m") .replace("\u043C", "m") .replace("\u041D", "n") .replace("\u043D", "n") .replace("\u041E", "o") .replace("\u043E", "o") .replace("\u041F", "p") .replace("\u043F", "p") .replace("\u0420", "r") .replace("\u0440", "r") .replace("\u0421", "s") .replace("\u0441", "s") .replace("\u0422", "t") .replace("\u0442", "t") .replace("\u0423", "u") .replace("\u0443", "u") .replace("\u0424", "f") .replace("\u0444", "f") .replace("\u0425", "kh") .replace("\u0445", "kh") .replace("\u0426", "t^s") .replace("\u0446", "t^s") .replace("\u0427", "ch") .replace("\u0447", "ch") .replace("\u0428", "sh") .replace("\u0448", "sh") .replace("\u0429", "shch") .replace("\u0449", "shch") .replace("\u042A", "''") .replace("\u044A", "''") .replace("\u042B", "y") .replace("\u044B", "y") .replace("\u042C", "'") .replace("\u044C", "'") .replace("\u042D", "e`") .replace("\u044D", "e`") .replace("\u042E", "i^u") .replace("\u044E", "i^u") .replace("\u042F", "i^a") .replace("\u044F", "i^a") ) return word def __roman_to_cyrillic(self, word): """ Transliterate a Russian word back into the Cyrillic alphabet. A Russian word formerly transliterated into the Roman alphabet in order to ease the stemming process, is transliterated back into the Cyrillic alphabet, its original form. :param word: The word that is transliterated. :type word: str or unicode :return: word, the transliterated word. :rtype: unicode :note: This helper method is invoked by the stem method of the subclass RussianStemmer. It is not to be invoked directly! """ word = ( word.replace("i^u", "\u044E") .replace("i^a", "\u044F") .replace("shch", "\u0449") .replace("kh", "\u0445") .replace("t^s", "\u0446") .replace("ch", "\u0447") .replace("e`", "\u044D") .replace("i`", "\u0439") .replace("sh", "\u0448") .replace("k", "\u043A") .replace("e", "\u0435") .replace("zh", "\u0436") .replace("a", "\u0430") .replace("b", "\u0431") .replace("v", "\u0432") .replace("g", "\u0433") .replace("d", "\u0434") .replace("e", "\u0435") .replace("z", "\u0437") .replace("i", "\u0438") .replace("l", "\u043B") .replace("m", "\u043C") .replace("n", "\u043D") .replace("o", "\u043E") .replace("p", "\u043F") .replace("r", "\u0440") .replace("s", "\u0441") .replace("t", "\u0442") .replace("u", "\u0443") .replace("f", "\u0444") .replace("''", "\u044A") .replace("y", "\u044B") .replace("'", "\u044C") ) return word class SpanishStemmer(_StandardStemmer): """ The Spanish Snowball stemmer. :cvar __vowels: The Spanish vowels. :type __vowels: unicode :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm. :type __step0_suffixes: tuple :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. :type __step1_suffixes: tuple :cvar __step2a_suffixes: Suffixes to be deleted in step 2a of the algorithm. :type __step2a_suffixes: tuple :cvar __step2b_suffixes: Suffixes to be deleted in step 2b of the algorithm. 
:type __step2b_suffixes: tuple :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. :type __step3_suffixes: tuple :note: A detailed description of the Spanish stemming algorithm can be found under http://snowball.tartarus.org/algorithms/spanish/stemmer.html """ __vowels = "aeiou\xE1\xE9\xED\xF3\xFA\xFC" __step0_suffixes = ( "selas", "selos", "sela", "selo", "las", "les", "los", "nos", "me", "se", "la", "le", "lo", ) __step1_suffixes = ( "amientos", "imientos", "amiento", "imiento", "aciones", "uciones", "adoras", "adores", "ancias", "log\xEDas", "encias", "amente", "idades", "anzas", "ismos", "ables", "ibles", "istas", "adora", "aci\xF3n", "antes", "ancia", "log\xEDa", "uci\xf3n", "encia", "mente", "anza", "icos", "icas", "ismo", "able", "ible", "ista", "osos", "osas", "ador", "ante", "idad", "ivas", "ivos", "ico", "ica", "oso", "osa", "iva", "ivo", ) __step2a_suffixes = ( "yeron", "yendo", "yamos", "yais", "yan", "yen", "yas", "yes", "ya", "ye", "yo", "y\xF3", ) __step2b_suffixes = ( "ar\xEDamos", "er\xEDamos", "ir\xEDamos", "i\xE9ramos", "i\xE9semos", "ar\xEDais", "aremos", "er\xEDais", "eremos", "ir\xEDais", "iremos", "ierais", "ieseis", "asteis", "isteis", "\xE1bamos", "\xE1ramos", "\xE1semos", "ar\xEDan", "ar\xEDas", "ar\xE9is", "er\xEDan", "er\xEDas", "er\xE9is", "ir\xEDan", "ir\xEDas", "ir\xE9is", "ieran", "iesen", "ieron", "iendo", "ieras", "ieses", "abais", "arais", "aseis", "\xE9amos", "ar\xE1n", "ar\xE1s", "ar\xEDa", "er\xE1n", "er\xE1s", "er\xEDa", "ir\xE1n", "ir\xE1s", "ir\xEDa", "iera", "iese", "aste", "iste", "aban", "aran", "asen", "aron", "ando", "abas", "adas", "idas", "aras", "ases", "\xEDais", "ados", "idos", "amos", "imos", "emos", "ar\xE1", "ar\xE9", "er\xE1", "er\xE9", "ir\xE1", "ir\xE9", "aba", "ada", "ida", "ara", "ase", "\xEDan", "ado", "ido", "\xEDas", "\xE1is", "\xE9is", "\xEDa", "ad", "ed", "id", "an", "i\xF3", "ar", "er", "ir", "as", "\xEDs", "en", "es", ) __step3_suffixes = ("os", "a", "e", "o", "\xE1", "\xE9", "\xED", "\xF3") def stem(self, word): """ Stem a Spanish word and return the stemmed form. :param word: The word that is stemmed. :type word: str or unicode :return: The stemmed form. 
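# Minimal usage sketch for the SpanishStemmer whose stem() method is defined
# here (assumes NLTK is importable; the stems follow the steps implemented
# below, ending with the accent stripping done by __replace_accented).
from nltk.stem.snowball import SpanishStemmer

stemmer = SpanishStemmer()
for word in ["corriendo", "canciones", "fácilmente"]:
    print(word, "->", stemmer.stem(word))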
:rtype: unicode """ word = word.lower() if word in self.stopwords: return word step1_success = False r1, r2 = self._r1r2_standard(word, self.__vowels) rv = self._rv_standard(word, self.__vowels) # STEP 0: Attached pronoun for suffix in self.__step0_suffixes: if not (word.endswith(suffix) and rv.endswith(suffix)): continue if ( rv[: -len(suffix)].endswith( ( "ando", "\xE1ndo", "ar", "\xE1r", "er", "\xE9r", "iendo", "i\xE9ndo", "ir", "\xEDr", ) ) ) or ( rv[: -len(suffix)].endswith("yendo") and word[: -len(suffix)].endswith("uyendo") ): word = self.__replace_accented(word[: -len(suffix)]) r1 = self.__replace_accented(r1[: -len(suffix)]) r2 = self.__replace_accented(r2[: -len(suffix)]) rv = self.__replace_accented(rv[: -len(suffix)]) break # STEP 1: Standard suffix removal for suffix in self.__step1_suffixes: if not word.endswith(suffix): continue if suffix == "amente" and r1.endswith(suffix): step1_success = True word = word[:-6] r2 = r2[:-6] rv = rv[:-6] if r2.endswith("iv"): word = word[:-2] r2 = r2[:-2] rv = rv[:-2] if r2.endswith("at"): word = word[:-2] rv = rv[:-2] elif r2.endswith(("os", "ic", "ad")): word = word[:-2] rv = rv[:-2] elif r2.endswith(suffix): step1_success = True if suffix in ( "adora", "ador", "aci\xF3n", "adoras", "adores", "aciones", "ante", "antes", "ancia", "ancias", ): word = word[: -len(suffix)] r2 = r2[: -len(suffix)] rv = rv[: -len(suffix)] if r2.endswith("ic"): word = word[:-2] rv = rv[:-2] elif suffix in ("log\xEDa", "log\xEDas"): word = suffix_replace(word, suffix, "log") rv = suffix_replace(rv, suffix, "log") elif suffix in ("uci\xF3n", "uciones"): word = suffix_replace(word, suffix, "u") rv = suffix_replace(rv, suffix, "u") elif suffix in ("encia", "encias"): word = suffix_replace(word, suffix, "ente") rv = suffix_replace(rv, suffix, "ente") elif suffix == "mente": word = word[: -len(suffix)] r2 = r2[: -len(suffix)] rv = rv[: -len(suffix)] if r2.endswith(("ante", "able", "ible")): word = word[:-4] rv = rv[:-4] elif suffix in ("idad", "idades"): word = word[: -len(suffix)] r2 = r2[: -len(suffix)] rv = rv[: -len(suffix)] for pre_suff in ("abil", "ic", "iv"): if r2.endswith(pre_suff): word = word[: -len(pre_suff)] rv = rv[: -len(pre_suff)] elif suffix in ("ivo", "iva", "ivos", "ivas"): word = word[: -len(suffix)] r2 = r2[: -len(suffix)] rv = rv[: -len(suffix)] if r2.endswith("at"): word = word[:-2] rv = rv[:-2] else: word = word[: -len(suffix)] rv = rv[: -len(suffix)] break # STEP 2a: Verb suffixes beginning 'y' if not step1_success: for suffix in self.__step2a_suffixes: if rv.endswith(suffix) and word[-len(suffix) - 1 : -len(suffix)] == "u": word = word[: -len(suffix)] rv = rv[: -len(suffix)] break # STEP 2b: Other verb suffixes for suffix in self.__step2b_suffixes: if rv.endswith(suffix): word = word[: -len(suffix)] rv = rv[: -len(suffix)] if suffix in ("en", "es", "\xE9is", "emos"): if word.endswith("gu"): word = word[:-1] if rv.endswith("gu"): rv = rv[:-1] break # STEP 3: Residual suffix for suffix in self.__step3_suffixes: if rv.endswith(suffix): word = word[: -len(suffix)] if suffix in ("e", "\xE9"): rv = rv[: -len(suffix)] if word[-2:] == "gu" and rv.endswith("u"): word = word[:-1] break word = self.__replace_accented(word) return word def __replace_accented(self, word): """ Replaces all accented letters on a word with their non-accented counterparts. 
:param word: A spanish word, with or without accents :type word: str or unicode :return: a word with the accented letters (á, é, í, ó, ú) replaced with their non-accented counterparts (a, e, i, o, u) :rtype: str or unicode """ return ( word.replace("\xE1", "a") .replace("\xE9", "e") .replace("\xED", "i") .replace("\xF3", "o") .replace("\xFA", "u") ) class SwedishStemmer(_ScandinavianStemmer): """ The Swedish Snowball stemmer. :cvar __vowels: The Swedish vowels. :type __vowels: unicode :cvar __s_ending: Letters that may directly appear before a word final 's'. :type __s_ending: unicode :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. :type __step1_suffixes: tuple :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. :type __step2_suffixes: tuple :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. :type __step3_suffixes: tuple :note: A detailed description of the Swedish stemming algorithm can be found under http://snowball.tartarus.org/algorithms/swedish/stemmer.html """ __vowels = "aeiouy\xE4\xE5\xF6" __s_ending = "bcdfghjklmnoprtvy" __step1_suffixes = ( "heterna", "hetens", "heter", "heten", "anden", "arnas", "ernas", "ornas", "andes", "andet", "arens", "arna", "erna", "orna", "ande", "arne", "aste", "aren", "ades", "erns", "ade", "are", "ern", "ens", "het", "ast", "ad", "en", "ar", "er", "or", "as", "es", "at", "a", "e", "s", ) __step2_suffixes = ("dd", "gd", "nn", "dt", "gt", "kt", "tt") __step3_suffixes = ("fullt", "l\xF6st", "els", "lig", "ig") def stem(self, word): """ Stem a Swedish word and return the stemmed form. :param word: The word that is stemmed. :type word: str or unicode :return: The stemmed form. :rtype: unicode """ word = word.lower() if word in self.stopwords: return word r1 = self._r1_scandinavian(word, self.__vowels) # STEP 1 for suffix in self.__step1_suffixes: if r1.endswith(suffix): if suffix == "s": if word[-2] in self.__s_ending: word = word[:-1] r1 = r1[:-1] else: word = word[: -len(suffix)] r1 = r1[: -len(suffix)] break # STEP 2 for suffix in self.__step2_suffixes: if r1.endswith(suffix): word = word[:-1] r1 = r1[:-1] break # STEP 3 for suffix in self.__step3_suffixes: if r1.endswith(suffix): if suffix in ("els", "lig", "ig"): word = word[: -len(suffix)] elif suffix in ("fullt", "l\xF6st"): word = word[:-1] break return word def demo(): """ This function provides a demonstration of the Snowball stemmers. After invoking this function and specifying a language, it stems an excerpt of the Universal Declaration of Human Rights (which is a part of the NLTK corpus collection) and then prints out the original and the stemmed text. 
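# Non-interactive sketch of what demo() below does: choose a language, let the
# SnowballStemmer dispatcher build the matching stemmer, and stem some words
# (assumes NLTK is importable; pass ignore_stopwords=True only if the
# "stopwords" corpus has been downloaded with nltk.download("stopwords")).
from nltk.stem.snowball import SnowballStemmer

print(SnowballStemmer.languages)  # language names accepted by the dispatcher
swedish = SnowballStemmer("swedish")
print([swedish.stem(w) for w in ["flickorna", "springande", "vackert"]])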
""" from nltk.corpus import udhr udhr_corpus = { "arabic": "Arabic_Alarabia-Arabic", "danish": "Danish_Dansk-Latin1", "dutch": "Dutch_Nederlands-Latin1", "english": "English-Latin1", "finnish": "Finnish_Suomi-Latin1", "french": "French_Francais-Latin1", "german": "German_Deutsch-Latin1", "hungarian": "Hungarian_Magyar-UTF8", "italian": "Italian_Italiano-Latin1", "norwegian": "Norwegian-Latin1", "porter": "English-Latin1", "portuguese": "Portuguese_Portugues-Latin1", "romanian": "Romanian_Romana-Latin2", "russian": "Russian-UTF8", "spanish": "Spanish-Latin1", "swedish": "Swedish_Svenska-Latin1", } print("\n") print("******************************") print("Demo for the Snowball stemmers") print("******************************") while True: language = input( "Please enter the name of the language " + "to be demonstrated\n" + "/".join(SnowballStemmer.languages) + "\n" + "(enter 'exit' in order to leave): " ) if language == "exit": break if language not in SnowballStemmer.languages: print( "\nOops, there is no stemmer for this language. " + "Please try again.\n" ) continue stemmer = SnowballStemmer(language) excerpt = udhr.words(udhr_corpus[language])[:300] stemmed = " ".join(stemmer.stem(word) for word in excerpt) stemmed = re.sub(r"(.{,70})\s", r"\1\n", stemmed + " ").rstrip() excerpt = " ".join(excerpt) excerpt = re.sub(r"(.{,70})\s", r"\1\n", excerpt + " ").rstrip() print("\n") print("-" * 70) print("ORIGINAL".center(70)) print(excerpt) print("\n\n") print("STEMMED RESULTS".center(70)) print(stemmed) print("-" * 70) print("\n") nltk-3.7/nltk/stem/util.py000066400000000000000000000011531420073152400156000ustar00rootroot00000000000000# Natural Language Toolkit: Stemmer Utilities # # Copyright (C) 2001-2022 NLTK Project # Author: Helder # URL: # For license information, see LICENSE.TXT def suffix_replace(original, old, new): """ Replaces the old suffix of the original string by a new suffix """ return original[: -len(old)] + new def prefix_replace(original, old, new): """ Replaces the old prefix of the original string by a new suffix :param original: string :param old: string :param new: string :return: string """ return new + original[len(old) :] nltk-3.7/nltk/stem/wordnet.py000066400000000000000000000031061420073152400163050ustar00rootroot00000000000000# Natural Language Toolkit: WordNet stemmer interface # # Copyright (C) 2001-2022 NLTK Project # Author: Steven Bird # Edward Loper # URL: # For license information, see LICENSE.TXT from nltk.corpus import wordnet as wn class WordNetLemmatizer: """ WordNet Lemmatizer Lemmatize using WordNet's built-in morphy function. Returns the input word unchanged if it cannot be found in WordNet. >>> from nltk.stem import WordNetLemmatizer >>> wnl = WordNetLemmatizer() >>> print(wnl.lemmatize('dogs')) dog >>> print(wnl.lemmatize('churches')) church >>> print(wnl.lemmatize('aardwolves')) aardwolf >>> print(wnl.lemmatize('abaci')) abacus >>> print(wnl.lemmatize('hardrock')) hardrock """ def lemmatize(self, word: str, pos: str = "n") -> str: """Lemmatize `word` using WordNet's built-in morphy function. Returns the input word unchanged if it cannot be found in WordNet. :param word: The input word to lemmatize. :type word: str :param pos: The Part Of Speech tag. Valid options are `"n"` for nouns, `"v"` for verbs, `"a"` for adjectives, `"r"` for adverbs and `"s"` for satellite adjectives. :param pos: str :return: The lemma of `word`, for the given `pos`. 
""" lemmas = wn._morphy(word, pos) return min(lemmas, key=len) if lemmas else word def __repr__(self): return "" nltk-3.7/nltk/tag/000077500000000000000000000000001420073152400140545ustar00rootroot00000000000000nltk-3.7/nltk/tag/__init__.py000066400000000000000000000157011420073152400161710ustar00rootroot00000000000000# Natural Language Toolkit: Taggers # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # Steven Bird (minor additions) # URL: # For license information, see LICENSE.TXT """ NLTK Taggers This package contains classes and interfaces for part-of-speech tagging, or simply "tagging". A "tag" is a case-sensitive string that specifies some property of a token, such as its part of speech. Tagged tokens are encoded as tuples ``(tag, token)``. For example, the following tagged token combines the word ``'fly'`` with a noun part of speech tag (``'NN'``): >>> tagged_tok = ('fly', 'NN') An off-the-shelf tagger is available for English. It uses the Penn Treebank tagset: >>> from nltk import pos_tag, word_tokenize >>> pos_tag(word_tokenize("John's big idea isn't all that bad.")) # doctest: +NORMALIZE_WHITESPACE [('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is', 'VBZ'), ("n't", 'RB'), ('all', 'PDT'), ('that', 'DT'), ('bad', 'JJ'), ('.', '.')] A Russian tagger is also available if you specify lang="rus". It uses the Russian National Corpus tagset: >>> pos_tag(word_tokenize("Илья оторопел и дважды перечитал бумажку."), lang='rus') # doctest: +SKIP [('Илья', 'S'), ('оторопел', 'V'), ('и', 'CONJ'), ('дважды', 'ADV'), ('перечитал', 'V'), ('бумажку', 'S'), ('.', 'NONLEX')] This package defines several taggers, which take a list of tokens, assign a tag to each one, and return the resulting list of tagged tokens. Most of the taggers are built automatically based on a training corpus. For example, the unigram tagger tags each word *w* by checking what the most frequent tag for *w* was in a training corpus: >>> from nltk.corpus import brown >>> from nltk.tag import UnigramTagger >>> tagger = UnigramTagger(brown.tagged_sents(categories='news')[:500]) >>> sent = ['Mitchell', 'decried', 'the', 'high', 'rate', 'of', 'unemployment'] >>> for word, tag in tagger.tag(sent): ... print(word, '->', tag) Mitchell -> NP decried -> None the -> AT high -> JJ rate -> NN of -> IN unemployment -> None Note that words that the tagger has not seen during training receive a tag of ``None``. We evaluate a tagger on data that was not seen during training: >>> tagger.accuracy(brown.tagged_sents(categories='news')[500:600]) 0.7... For more information, please consult chapter 5 of the NLTK Book. 
isort:skip_file """ from nltk.tag.api import TaggerI from nltk.tag.util import str2tuple, tuple2str, untag from nltk.tag.sequential import ( SequentialBackoffTagger, ContextTagger, DefaultTagger, NgramTagger, UnigramTagger, BigramTagger, TrigramTagger, AffixTagger, RegexpTagger, ClassifierBasedTagger, ClassifierBasedPOSTagger, ) from nltk.tag.brill import BrillTagger from nltk.tag.brill_trainer import BrillTaggerTrainer from nltk.tag.tnt import TnT from nltk.tag.hunpos import HunposTagger from nltk.tag.stanford import StanfordTagger, StanfordPOSTagger, StanfordNERTagger from nltk.tag.hmm import HiddenMarkovModelTagger, HiddenMarkovModelTrainer from nltk.tag.senna import SennaTagger, SennaChunkTagger, SennaNERTagger from nltk.tag.mapping import tagset_mapping, map_tag from nltk.tag.crf import CRFTagger from nltk.tag.perceptron import PerceptronTagger from nltk.data import load, find RUS_PICKLE = ( "taggers/averaged_perceptron_tagger_ru/averaged_perceptron_tagger_ru.pickle" ) def _get_tagger(lang=None): if lang == "rus": tagger = PerceptronTagger(False) ap_russian_model_loc = "file:" + str(find(RUS_PICKLE)) tagger.load(ap_russian_model_loc) else: tagger = PerceptronTagger() return tagger def _pos_tag(tokens, tagset=None, tagger=None, lang=None): # Currently only supports English and Russian. if lang not in ["eng", "rus"]: raise NotImplementedError( "Currently, NLTK pos_tag only supports English and Russian " "(i.e. lang='eng' or lang='rus')" ) # Throws Error if tokens is of string type elif isinstance(tokens, str): raise TypeError("tokens: expected a list of strings, got a string") else: tagged_tokens = tagger.tag(tokens) if tagset: # Maps to the specified tagset. if lang == "eng": tagged_tokens = [ (token, map_tag("en-ptb", tagset, tag)) for (token, tag) in tagged_tokens ] elif lang == "rus": # Note that the new Russian pos tags from the model contains suffixes, # see https://github.com/nltk/nltk/issues/2151#issuecomment-430709018 tagged_tokens = [ (token, map_tag("ru-rnc-new", tagset, tag.partition("=")[0])) for (token, tag) in tagged_tokens ] return tagged_tokens def pos_tag(tokens, tagset=None, lang="eng"): """ Use NLTK's currently recommended part of speech tagger to tag the given list of tokens. >>> from nltk.tag import pos_tag >>> from nltk.tokenize import word_tokenize >>> pos_tag(word_tokenize("John's big idea isn't all that bad.")) # doctest: +NORMALIZE_WHITESPACE [('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is', 'VBZ'), ("n't", 'RB'), ('all', 'PDT'), ('that', 'DT'), ('bad', 'JJ'), ('.', '.')] >>> pos_tag(word_tokenize("John's big idea isn't all that bad."), tagset='universal') # doctest: +NORMALIZE_WHITESPACE [('John', 'NOUN'), ("'s", 'PRT'), ('big', 'ADJ'), ('idea', 'NOUN'), ('is', 'VERB'), ("n't", 'ADV'), ('all', 'DET'), ('that', 'DET'), ('bad', 'ADJ'), ('.', '.')] NB. Use `pos_tag_sents()` for efficient tagging of more than one sentence. :param tokens: Sequence of tokens to be tagged :type tokens: list(str) :param tagset: the tagset to be used, e.g. universal, wsj, brown :type tagset: str :param lang: the ISO 639 code of the language, e.g. 'eng' for English, 'rus' for Russian :type lang: str :return: The tagged tokens :rtype: list(tuple(str, str)) """ tagger = _get_tagger(lang) return _pos_tag(tokens, tagset, tagger, lang) def pos_tag_sents(sentences, tagset=None, lang="eng"): """ Use NLTK's currently recommended part of speech tagger to tag the given list of sentences, each consisting of a list of tokens. 
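# Sketch of tagging several sentences at once with pos_tag_sents(), which
# builds the recommended tagger once and reuses it, instead of once per
# pos_tag() call (assumes the "punkt" and "averaged_perceptron_tagger" data
# packages have been downloaded with nltk.download()).
from nltk import pos_tag_sents, word_tokenize

sentences = ["The cat sat on the mat.", "Dogs bark loudly."]
print(pos_tag_sents([word_tokenize(s) for s in sentences]))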
:param sentences: List of sentences to be tagged :type sentences: list(list(str)) :param tagset: the tagset to be used, e.g. universal, wsj, brown :type tagset: str :param lang: the ISO 639 code of the language, e.g. 'eng' for English, 'rus' for Russian :type lang: str :return: The list of tagged sentences :rtype: list(list(tuple(str, str))) """ tagger = _get_tagger(lang) return [_pos_tag(sent, tagset, tagger, lang) for sent in sentences] nltk-3.7/nltk/tag/api.py000066400000000000000000000342621420073152400152060ustar00rootroot00000000000000# Natural Language Toolkit: Tagger Interface # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # Steven Bird (minor additions) # Tom Aarsen <> # URL: # For license information, see LICENSE.TXT """ Interface for tagging each token in a sentence with supplementary information, such as its part of speech. """ from abc import ABCMeta, abstractmethod from functools import lru_cache from itertools import chain from typing import Dict from nltk.internals import deprecated, overridden from nltk.metrics import ConfusionMatrix, accuracy from nltk.tag.util import untag class TaggerI(metaclass=ABCMeta): """ A processing interface for assigning a tag to each token in a list. Tags are case sensitive strings that identify some property of each token, such as its part of speech or its sense. Some taggers require specific types for their tokens. This is generally indicated by the use of a sub-interface to ``TaggerI``. For example, featureset taggers, which are subclassed from ``FeaturesetTagger``, require that each token be a ``featureset``. Subclasses must define: - either ``tag()`` or ``tag_sents()`` (or both) """ @abstractmethod def tag(self, tokens): """ Determine the most appropriate tag sequence for the given token sequence, and return a corresponding list of tagged tokens. A tagged token is encoded as a tuple ``(token, tag)``. :rtype: list(tuple(str, str)) """ if overridden(self.tag_sents): return self.tag_sents([tokens])[0] def tag_sents(self, sentences): """ Apply ``self.tag()`` to each element of *sentences*. I.e.:: return [self.tag(sent) for sent in sentences] """ return [self.tag(sent) for sent in sentences] @deprecated("Use accuracy(gold) instead.") def evaluate(self, gold): return self.accuracy(gold) def accuracy(self, gold): """ Score the accuracy of the tagger against the gold standard. Strip the tags from the gold standard text, retag it using the tagger, then compute the accuracy score. :param gold: The list of tagged sentences to score the tagger on. :type gold: list(list(tuple(str, str))) :rtype: float """ tagged_sents = self.tag_sents(untag(sent) for sent in gold) gold_tokens = list(chain.from_iterable(gold)) test_tokens = list(chain.from_iterable(tagged_sents)) return accuracy(gold_tokens, test_tokens) @lru_cache(maxsize=1) def _confusion_cached(self, gold): """ Inner function used after ``gold`` is converted to a ``tuple(tuple(tuple(str, str)))``. That way, we can use caching on creating a ConfusionMatrix. :param gold: The list of tagged sentences to run the tagger with, also used as the reference values in the generated confusion matrix. 
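# The TaggerI contract above only requires tag(); tag_sents(), accuracy(),
# confusion() and the other evaluation helpers are inherited. A toy subclass
# for illustration only:
from nltk.tag.api import TaggerI

class EverythingIsANoun(TaggerI):
    """Toy tagger that labels every token 'NN'."""

    def tag(self, tokens):
        return [(token, "NN") for token in tokens]

toy = EverythingIsANoun()
print(toy.tag(["colorless", "green", "ideas"]))
print(toy.tag_sents([["sleep"], ["furiously"]]))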
:type gold: tuple(tuple(tuple(str, str))) :rtype: ConfusionMatrix """ tagged_sents = self.tag_sents(untag(sent) for sent in gold) gold_tokens = [token for _word, token in chain.from_iterable(gold)] test_tokens = [token for _word, token in chain.from_iterable(tagged_sents)] return ConfusionMatrix(gold_tokens, test_tokens) def confusion(self, gold): """ Return a ConfusionMatrix with the tags from ``gold`` as the reference values, with the predictions from ``tag_sents`` as the predicted values. >>> from nltk.tag import PerceptronTagger >>> from nltk.corpus import treebank >>> tagger = PerceptronTagger() >>> gold_data = treebank.tagged_sents()[:10] >>> print(tagger.confusion(gold_data)) | - | | N | | O P | | N J J N N P P R R V V V V V W | | ' E C C D E I J J J M N N N O R P R B R T V B B B B B D ` | | ' , - . C D T X N J R S D N P S S P $ B R P O B D G N P Z T ` | -------+----------------------------------------------------------------------------------------------+ '' | <1> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | , | .<15> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | -NONE- | . . <.> . . 2 . . . 2 . . . 5 1 . . . . 2 . . . . . . . . . . . | . | . . .<10> . . . . . . . . . . . . . . . . . . . . . . . . . . . | CC | . . . . <1> . . . . . . . . . . . . . . . . . . . . . . . . . . | CD | . . . . . <5> . . . . . . . . . . . . . . . . . . . . . . . . . | DT | . . . . . .<20> . . . . . . . . . . . . . . . . . . . . . . . . | EX | . . . . . . . <1> . . . . . . . . . . . . . . . . . . . . . . . | IN | . . . . . . . .<22> . . . . . . . . . . 3 . . . . . . . . . . . | JJ | . . . . . . . . .<16> . . . . 1 . . . . 1 . . . . . . . . . . . | JJR | . . . . . . . . . . <.> . . . . . . . . . . . . . . . . . . . . | JJS | . . . . . . . . . . . <1> . . . . . . . . . . . . . . . . . . . | MD | . . . . . . . . . . . . <1> . . . . . . . . . . . . . . . . . . | NN | . . . . . . . . . . . . .<28> 1 1 . . . . . . . . . . . . . . . | NNP | . . . . . . . . . . . . . .<25> . . . . . . . . . . . . . . . . | NNS | . . . . . . . . . . . . . . .<19> . . . . . . . . . . . . . . . | POS | . . . . . . . . . . . . . . . . <1> . . . . . . . . . . . . . . | PRP | . . . . . . . . . . . . . . . . . <4> . . . . . . . . . . . . . | PRP$ | . . . . . . . . . . . . . . . . . . <2> . . . . . . . . . . . . | RB | . . . . . . . . . . . . . . . . . . . <4> . . . . . . . . . . . | RBR | . . . . . . . . . . 1 . . . . . . . . . <1> . . . . . . . . . . | RP | . . . . . . . . . . . . . . . . . . . . . <1> . . . . . . . . . | TO | . . . . . . . . . . . . . . . . . . . . . . <5> . . . . . . . . | VB | . . . . . . . . . . . . . . . . . . . . . . . <3> . . . . . . . | VBD | . . . . . . . . . . . . . 1 . . . . . . . . . . <6> . . . . . . | VBG | . . . . . . . . . . . . . 1 . . . . . . . . . . . <4> . . . . . | VBN | . . . . . . . . . . . . . . . . . . . . . . . . 1 . <4> . . . . | VBP | . . . . . . . . . . . . . . . . . . . . . . . . . . . <3> . . . | VBZ | . . . . . . . . . . . . . . . . . . . . . . . . . . . . <7> . . | WDT | . . . . . . . . 2 . . . . . . . . . . . . . . . . . . . . <.> . | `` | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . <1>| -------+----------------------------------------------------------------------------------------------+ (row = reference; col = test) :param gold: The list of tagged sentences to run the tagger with, also used as the reference values in the generated confusion matrix. 
:type gold: list(list(tuple(str, str))) :rtype: ConfusionMatrix """ return self._confusion_cached(tuple(tuple(sent) for sent in gold)) def recall(self, gold) -> Dict[str, float]: """ Compute the recall for each tag from ``gold`` or from running ``tag`` on the tokenized sentences from ``gold``. Then, return the dictionary with mappings from tag to recall. The recall is defined as: - *r* = true positive / (true positive + false positive) :param gold: The list of tagged sentences to score the tagger on. :type gold: list(list(tuple(str, str))) :return: A mapping from tags to recall :rtype: Dict[str, float] """ cm = self.confusion(gold) return {tag: cm.recall(tag) for tag in cm._values} def precision(self, gold): """ Compute the precision for each tag from ``gold`` or from running ``tag`` on the tokenized sentences from ``gold``. Then, return the dictionary with mappings from tag to precision. The precision is defined as: - *p* = true positive / (true positive + false negative) :param gold: The list of tagged sentences to score the tagger on. :type gold: list(list(tuple(str, str))) :return: A mapping from tags to precision :rtype: Dict[str, float] """ cm = self.confusion(gold) return {tag: cm.precision(tag) for tag in cm._values} def f_measure(self, gold, alpha=0.5): """ Compute the f-measure for each tag from ``gold`` or from running ``tag`` on the tokenized sentences from ``gold``. Then, return the dictionary with mappings from tag to f-measure. The f-measure is the harmonic mean of the ``precision`` and ``recall``, weighted by ``alpha``. In particular, given the precision *p* and recall *r* defined by: - *p* = true positive / (true positive + false negative) - *r* = true positive / (true positive + false positive) The f-measure is: - *1/(alpha/p + (1-alpha)/r)* With ``alpha = 0.5``, this reduces to: - *2pr / (p + r)* :param gold: The list of tagged sentences to score the tagger on. :type gold: list(list(tuple(str, str))) :param alpha: Ratio of the cost of false negative compared to false positives. Defaults to 0.5, where the costs are equal. :type alpha: float :return: A mapping from tags to precision :rtype: Dict[str, float] """ cm = self.confusion(gold) return {tag: cm.f_measure(tag, alpha) for tag in cm._values} def evaluate_per_tag(self, gold, alpha=0.5, truncate=None, sort_by_count=False): """Tabulate the **recall**, **precision** and **f-measure** for each tag from ``gold`` or from running ``tag`` on the tokenized sentences from ``gold``. >>> from nltk.tag import PerceptronTagger >>> from nltk.corpus import treebank >>> tagger = PerceptronTagger() >>> gold_data = treebank.tagged_sents()[:10] >>> print(tagger.evaluate_per_tag(gold_data)) Tag | Prec. | Recall | F-measure -------+--------+--------+----------- '' | 1.0000 | 1.0000 | 1.0000 , | 1.0000 | 1.0000 | 1.0000 -NONE- | 0.0000 | 0.0000 | 0.0000 . 
| 1.0000 | 1.0000 | 1.0000 CC | 1.0000 | 1.0000 | 1.0000 CD | 0.7143 | 1.0000 | 0.8333 DT | 1.0000 | 1.0000 | 1.0000 EX | 1.0000 | 1.0000 | 1.0000 IN | 0.9167 | 0.8800 | 0.8980 JJ | 0.8889 | 0.8889 | 0.8889 JJR | 0.0000 | 0.0000 | 0.0000 JJS | 1.0000 | 1.0000 | 1.0000 MD | 1.0000 | 1.0000 | 1.0000 NN | 0.8000 | 0.9333 | 0.8615 NNP | 0.8929 | 1.0000 | 0.9434 NNS | 0.9500 | 1.0000 | 0.9744 POS | 1.0000 | 1.0000 | 1.0000 PRP | 1.0000 | 1.0000 | 1.0000 PRP$ | 1.0000 | 1.0000 | 1.0000 RB | 0.4000 | 1.0000 | 0.5714 RBR | 1.0000 | 0.5000 | 0.6667 RP | 1.0000 | 1.0000 | 1.0000 TO | 1.0000 | 1.0000 | 1.0000 VB | 1.0000 | 1.0000 | 1.0000 VBD | 0.8571 | 0.8571 | 0.8571 VBG | 1.0000 | 0.8000 | 0.8889 VBN | 1.0000 | 0.8000 | 0.8889 VBP | 1.0000 | 1.0000 | 1.0000 VBZ | 1.0000 | 1.0000 | 1.0000 WDT | 0.0000 | 0.0000 | 0.0000 `` | 1.0000 | 1.0000 | 1.0000 :param gold: The list of tagged sentences to score the tagger on. :type gold: list(list(tuple(str, str))) :param alpha: Ratio of the cost of false negative compared to false positives, as used in the f-measure computation. Defaults to 0.5, where the costs are equal. :type alpha: float :param truncate: If specified, then only show the specified number of values. Any sorting (e.g., sort_by_count) will be performed before truncation. Defaults to None :type truncate: int, optional :param sort_by_count: Whether to sort the outputs on number of occurrences of that tag in the ``gold`` data, defaults to False :type sort_by_count: bool, optional :return: A tabulated recall, precision and f-measure string :rtype: str """ cm = self.confusion(gold) return cm.evaluate(alpha=alpha, truncate=truncate, sort_by_count=sort_by_count) def _check_params(self, train, model): if (train and model) or (not train and not model): raise ValueError("Must specify either training data or trained model.") class FeaturesetTaggerI(TaggerI): """ A tagger that requires tokens to be ``featuresets``. A featureset is a dictionary that maps from feature names to feature values. See ``nltk.classify`` for more information about features and featuresets. """ nltk-3.7/nltk/tag/brill.py000066400000000000000000000377741420073152400155540ustar00rootroot00000000000000# Natural Language Toolkit: Transformation-based learning # # Copyright (C) 2001-2022 NLTK Project # Author: Marcus Uneson # based on previous (nltk2) version by # Christopher Maloof, Edward Loper, Steven Bird # URL: # For license information, see LICENSE.TXT from collections import Counter, defaultdict from nltk import jsontags from nltk.tag import TaggerI from nltk.tbl import Feature, Template ###################################################################### # Brill Templates ###################################################################### @jsontags.register_tag class Word(Feature): """ Feature which examines the text (word) of nearby tokens. """ json_tag = "nltk.tag.brill.Word" @staticmethod def extract_property(tokens, index): """@return: The given token's text.""" return tokens[index][0] @jsontags.register_tag class Pos(Feature): """ Feature which examines the tags of nearby tokens. 
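# Word (above) and Pos (being defined here) are Feature subclasses that a
# Template combines into candidate transformation rules. A sketch of a small
# custom template list; these two templates are an illustrative choice, not a
# recommended set:
from nltk.tbl import Template
from nltk.tag.brill import Pos, Word

my_templates = [
    Template(Pos([-1])),             # condition on the previous tag
    Template(Pos([-1]), Word([0])),  # previous tag plus the current word
]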
""" json_tag = "nltk.tag.brill.Pos" @staticmethod def extract_property(tokens, index): """@return: The given token's tag.""" return tokens[index][1] def nltkdemo18(): """ Return 18 templates, from the original nltk demo, in multi-feature syntax """ return [ Template(Pos([-1])), Template(Pos([1])), Template(Pos([-2])), Template(Pos([2])), Template(Pos([-2, -1])), Template(Pos([1, 2])), Template(Pos([-3, -2, -1])), Template(Pos([1, 2, 3])), Template(Pos([-1]), Pos([1])), Template(Word([-1])), Template(Word([1])), Template(Word([-2])), Template(Word([2])), Template(Word([-2, -1])), Template(Word([1, 2])), Template(Word([-3, -2, -1])), Template(Word([1, 2, 3])), Template(Word([-1]), Word([1])), ] def nltkdemo18plus(): """ Return 18 templates, from the original nltk demo, and additionally a few multi-feature ones (the motivation is easy comparison with nltkdemo18) """ return nltkdemo18() + [ Template(Word([-1]), Pos([1])), Template(Pos([-1]), Word([1])), Template(Word([-1]), Word([0]), Pos([1])), Template(Pos([-1]), Word([0]), Word([1])), Template(Pos([-1]), Word([0]), Pos([1])), ] def fntbl37(): """ Return 37 templates taken from the postagging task of the fntbl distribution https://www.cs.jhu.edu/~rflorian/fntbl/ (37 is after excluding a handful which do not condition on Pos[0]; fntbl can do that but the current nltk implementation cannot.) """ return [ Template(Word([0]), Word([1]), Word([2])), Template(Word([-1]), Word([0]), Word([1])), Template(Word([0]), Word([-1])), Template(Word([0]), Word([1])), Template(Word([0]), Word([2])), Template(Word([0]), Word([-2])), Template(Word([1, 2])), Template(Word([-2, -1])), Template(Word([1, 2, 3])), Template(Word([-3, -2, -1])), Template(Word([0]), Pos([2])), Template(Word([0]), Pos([-2])), Template(Word([0]), Pos([1])), Template(Word([0]), Pos([-1])), Template(Word([0])), Template(Word([-2])), Template(Word([2])), Template(Word([1])), Template(Word([-1])), Template(Pos([-1]), Pos([1])), Template(Pos([1]), Pos([2])), Template(Pos([-1]), Pos([-2])), Template(Pos([1])), Template(Pos([-1])), Template(Pos([-2])), Template(Pos([2])), Template(Pos([1, 2, 3])), Template(Pos([1, 2])), Template(Pos([-3, -2, -1])), Template(Pos([-2, -1])), Template(Pos([1]), Word([0]), Word([1])), Template(Pos([1]), Word([0]), Word([-1])), Template(Pos([-1]), Word([-1]), Word([0])), Template(Pos([-1]), Word([0]), Word([1])), Template(Pos([-2]), Pos([-1])), Template(Pos([1]), Pos([2])), Template(Pos([1]), Pos([2]), Word([1])), ] def brill24(): """ Return 24 templates of the seminal TBL paper, Brill (1995) """ return [ Template(Pos([-1])), Template(Pos([1])), Template(Pos([-2])), Template(Pos([2])), Template(Pos([-2, -1])), Template(Pos([1, 2])), Template(Pos([-3, -2, -1])), Template(Pos([1, 2, 3])), Template(Pos([-1]), Pos([1])), Template(Pos([-2]), Pos([-1])), Template(Pos([1]), Pos([2])), Template(Word([-1])), Template(Word([1])), Template(Word([-2])), Template(Word([2])), Template(Word([-2, -1])), Template(Word([1, 2])), Template(Word([-1, 0])), Template(Word([0, 1])), Template(Word([0])), Template(Word([-1]), Pos([-1])), Template(Word([1]), Pos([1])), Template(Word([0]), Word([-1]), Pos([-1])), Template(Word([0]), Word([1]), Pos([1])), ] def describe_template_sets(): """ Print the available template sets in this demo, with a short description" """ import inspect import sys # a bit of magic to get all functions in this module templatesets = inspect.getmembers(sys.modules[__name__], inspect.isfunction) for (name, obj) in templatesets: if name == "describe_template_sets": 
continue print(name, obj.__doc__, "\n") ###################################################################### # The Brill Tagger ###################################################################### @jsontags.register_tag class BrillTagger(TaggerI): """ Brill's transformational rule-based tagger. Brill taggers use an initial tagger (such as ``tag.DefaultTagger``) to assign an initial tag sequence to a text; and then apply an ordered list of transformational rules to correct the tags of individual tokens. These transformation rules are specified by the ``TagRule`` interface. Brill taggers can be created directly, from an initial tagger and a list of transformational rules; but more often, Brill taggers are created by learning rules from a training corpus, using one of the TaggerTrainers available. """ json_tag = "nltk.tag.BrillTagger" def __init__(self, initial_tagger, rules, training_stats=None): """ :param initial_tagger: The initial tagger :type initial_tagger: TaggerI :param rules: An ordered list of transformation rules that should be used to correct the initial tagging. :type rules: list(TagRule) :param training_stats: A dictionary of statistics collected during training, for possible later use :type training_stats: dict """ self._initial_tagger = initial_tagger self._rules = tuple(rules) self._training_stats = training_stats def encode_json_obj(self): return self._initial_tagger, self._rules, self._training_stats @classmethod def decode_json_obj(cls, obj): _initial_tagger, _rules, _training_stats = obj return cls(_initial_tagger, _rules, _training_stats) def rules(self): """ Return the ordered list of transformation rules that this tagger has learnt :return: the ordered list of transformation rules that correct the initial tagging :rtype: list of Rules """ return self._rules def train_stats(self, statistic=None): """ Return a named statistic collected during training, or a dictionary of all available statistics if no name given :param statistic: name of statistic :type statistic: str :return: some statistic collected during training of this tagger :rtype: any (but usually a number) """ if statistic is None: return self._training_stats else: return self._training_stats.get(statistic) def tag(self, tokens): # Inherit documentation from TaggerI # Run the initial tagger. tagged_tokens = self._initial_tagger.tag(tokens) # Create a dictionary that maps each tag to a list of the # indices of tokens that have that tag. tag_to_positions = defaultdict(set) for i, (token, tag) in enumerate(tagged_tokens): tag_to_positions[tag].add(i) # Apply each rule, in order. Only try to apply rules at # positions that have the desired original tag. for rule in self._rules: # Find the positions where it might apply positions = tag_to_positions.get(rule.original_tag, []) # Apply the rule at those positions. changed = rule.apply(tagged_tokens, positions) # Update tag_to_positions with the positions of tags that # were modified. for i in changed: tag_to_positions[rule.original_tag].remove(i) tag_to_positions[rule.replacement_tag].add(i) return tagged_tokens def print_template_statistics(self, test_stats=None, printunused=True): """ Print a list of all templates, ranked according to efficiency. If test_stats is available, the templates are ranked according to their relative contribution (summed for all rules created from a given template, weighted by score) to the performance on the test set. If no test_stats, then statistics collected during training are used instead. 
There is also an unweighted measure (just counting the rules). This is less informative, though, as many low-score rules will appear towards end of training. :param test_stats: dictionary of statistics collected during testing :type test_stats: dict of str -> any (but usually numbers) :param printunused: if True, print a list of all unused templates :type printunused: bool :return: None :rtype: None """ tids = [r.templateid for r in self._rules] train_stats = self.train_stats() trainscores = train_stats["rulescores"] assert len(trainscores) == len( tids ), "corrupt statistics: " "{} train scores for {} rules".format( trainscores, tids ) template_counts = Counter(tids) weighted_traincounts = Counter() for (tid, score) in zip(tids, trainscores): weighted_traincounts[tid] += score tottrainscores = sum(trainscores) # det_tplsort() is for deterministic sorting; # the otherwise convenient Counter.most_common() unfortunately # does not break ties deterministically # between python versions and will break cross-version tests def det_tplsort(tpl_value): return (tpl_value[1], repr(tpl_value[0])) def print_train_stats(): print( "TEMPLATE STATISTICS (TRAIN) {} templates, {} rules)".format( len(template_counts), len(tids) ) ) print( "TRAIN ({tokencount:7d} tokens) initial {initialerrors:5d} {initialacc:.4f} " "final: {finalerrors:5d} {finalacc:.4f}".format(**train_stats) ) head = "#ID | Score (train) | #Rules | Template" print(head, "\n", "-" * len(head), sep="") train_tplscores = sorted( weighted_traincounts.items(), key=det_tplsort, reverse=True ) for (tid, trainscore) in train_tplscores: s = "{} | {:5d} {:5.3f} |{:4d} {:.3f} | {}".format( tid, trainscore, trainscore / tottrainscores, template_counts[tid], template_counts[tid] / len(tids), Template.ALLTEMPLATES[int(tid)], ) print(s) def print_testtrain_stats(): testscores = test_stats["rulescores"] print( "TEMPLATE STATISTICS (TEST AND TRAIN) ({} templates, {} rules)".format( len(template_counts), len(tids) ) ) print( "TEST ({tokencount:7d} tokens) initial {initialerrors:5d} {initialacc:.4f} " "final: {finalerrors:5d} {finalacc:.4f} ".format(**test_stats) ) print( "TRAIN ({tokencount:7d} tokens) initial {initialerrors:5d} {initialacc:.4f} " "final: {finalerrors:5d} {finalacc:.4f} ".format(**train_stats) ) weighted_testcounts = Counter() for (tid, score) in zip(tids, testscores): weighted_testcounts[tid] += score tottestscores = sum(testscores) head = "#ID | Score (test) | Score (train) | #Rules | Template" print(head, "\n", "-" * len(head), sep="") test_tplscores = sorted( weighted_testcounts.items(), key=det_tplsort, reverse=True ) for (tid, testscore) in test_tplscores: s = "{:s} |{:5d} {:6.3f} | {:4d} {:.3f} |{:4d} {:.3f} | {:s}".format( tid, testscore, testscore / tottestscores, weighted_traincounts[tid], weighted_traincounts[tid] / tottrainscores, template_counts[tid], template_counts[tid] / len(tids), Template.ALLTEMPLATES[int(tid)], ) print(s) def print_unused_templates(): usedtpls = {int(tid) for tid in tids} unused = [ (tid, tpl) for (tid, tpl) in enumerate(Template.ALLTEMPLATES) if tid not in usedtpls ] print(f"UNUSED TEMPLATES ({len(unused)})") for (tid, tpl) in unused: print(f"{tid:03d} {str(tpl):s}") if test_stats is None: print_train_stats() else: print_testtrain_stats() print() if printunused: print_unused_templates() print() def batch_tag_incremental(self, sequences, gold): """ Tags by applying each rule to the entire corpus (rather than all rules to a single sequence). 
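# --------------------------------------------------------------------------
# Illustrative sketch (added): how the reporting methods above might be used
# once a tagger has been trained.  The arguments `tagger`, `testing_data`
# and `gold_data` are assumed to come from a training session like the one
# shown in the BrillTaggerTrainer doctest later in this package.
def _demo_template_statistics(tagger, testing_data, gold_data):
    # Per-rule scores collected corpus-wide on the test set:
    tagged, test_stats = tagger.batch_tag_incremental(testing_data, gold_data)
    print(test_stats["initialacc"], test_stats["finalacc"])
    # Rank templates by their weighted contribution on train and test:
    tagger.print_template_statistics(test_stats, printunused=False)
    return tagged
# --------------------------------------------------------------------------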
The point is to collect statistics on the test set for individual rules. NOTE: This is inefficient (does not build any index, so will traverse the entire corpus N times for N rules) -- usually you would not care about statistics for individual rules and thus use batch_tag() instead :param sequences: lists of token sequences (sentences, in some applications) to be tagged :type sequences: list of list of strings :param gold: the gold standard :type gold: list of list of strings :returns: tuple of (tagged_sequences, ordered list of rule scores (one for each rule)) """ def counterrors(xs): return sum(t[1] != g[1] for pair in zip(xs, gold) for (t, g) in zip(*pair)) testing_stats = {} testing_stats["tokencount"] = sum(len(t) for t in sequences) testing_stats["sequencecount"] = len(sequences) tagged_tokenses = [self._initial_tagger.tag(tokens) for tokens in sequences] testing_stats["initialerrors"] = counterrors(tagged_tokenses) testing_stats["initialacc"] = ( 1 - testing_stats["initialerrors"] / testing_stats["tokencount"] ) # Apply each rule to the entire corpus, in order errors = [testing_stats["initialerrors"]] for rule in self._rules: for tagged_tokens in tagged_tokenses: rule.apply(tagged_tokens) errors.append(counterrors(tagged_tokenses)) testing_stats["rulescores"] = [ err0 - err1 for (err0, err1) in zip(errors, errors[1:]) ] testing_stats["finalerrors"] = errors[-1] testing_stats["finalacc"] = ( 1 - testing_stats["finalerrors"] / testing_stats["tokencount"] ) return (tagged_tokenses, testing_stats) nltk-3.7/nltk/tag/brill_trainer.py000066400000000000000000000652511420073152400172670ustar00rootroot00000000000000# Natural Language Toolkit: Transformation-based learning # # Copyright (C) 2001-2013 NLTK Project # Author: Marcus Uneson # based on previous (nltk2) version by # Christopher Maloof, Edward Loper, Steven Bird # URL: # For license information, see LICENSE.TXT import bisect import textwrap from collections import defaultdict from nltk.tag import BrillTagger, untag ###################################################################### # Brill Tagger Trainer ###################################################################### class BrillTaggerTrainer: """ A trainer for tbl taggers. """ def __init__( self, initial_tagger, templates, trace=0, deterministic=None, ruleformat="str" ): """ Construct a Brill tagger from a baseline tagger and a set of templates :param initial_tagger: the baseline tagger :type initial_tagger: Tagger :param templates: templates to be used in training :type templates: list of Templates :param trace: verbosity level :type trace: int :param deterministic: if True, adjudicate ties deterministically :type deterministic: bool :param ruleformat: format of reported Rules :type ruleformat: str :return: An untrained BrillTagger :rtype: BrillTagger """ if deterministic is None: deterministic = trace > 0 self._initial_tagger = initial_tagger self._templates = templates self._trace = trace self._deterministic = deterministic self._ruleformat = ruleformat self._tag_positions = None """Mapping from tags to lists of positions that use that tag.""" self._rules_by_position = None """Mapping from positions to the set of rules that are known to occur at that position. Position is (sentnum, wordnum). 
Initially, this will only contain positions where each rule applies in a helpful way; but when we examine a rule, we'll extend this list to also include positions where each rule applies in a harmful or neutral way.""" self._positions_by_rule = None """Mapping from rule to position to effect, specifying the effect that each rule has on the overall score, at each position. Position is (sentnum, wordnum); and effect is -1, 0, or 1. As with _rules_by_position, this mapping starts out only containing rules with positive effects; but when we examine a rule, we'll extend this mapping to include the positions where the rule is harmful or neutral.""" self._rules_by_score = None """Mapping from scores to the set of rules whose effect on the overall score is upper bounded by that score. Invariant: rulesByScore[s] will contain r iff the sum of _positions_by_rule[r] is s.""" self._rule_scores = None """Mapping from rules to upper bounds on their effects on the overall score. This is the inverse mapping to _rules_by_score. Invariant: ruleScores[r] = sum(_positions_by_rule[r])""" self._first_unknown_position = None """Mapping from rules to the first position where we're unsure if the rule applies. This records the next position we need to check to see if the rule messed anything up.""" # Training def train(self, train_sents, max_rules=200, min_score=2, min_acc=None): r""" Trains the Brill tagger on the corpus *train_sents*, producing at most *max_rules* transformations, each of which reduces the net number of errors in the corpus by at least *min_score*, and each of which has accuracy not lower than *min_acc*. >>> # Relevant imports >>> from nltk.tbl.template import Template >>> from nltk.tag.brill import Pos, Word >>> from nltk.tag import untag, RegexpTagger, BrillTaggerTrainer >>> # Load some data >>> from nltk.corpus import treebank >>> training_data = treebank.tagged_sents()[:100] >>> baseline_data = treebank.tagged_sents()[100:200] >>> gold_data = treebank.tagged_sents()[200:300] >>> testing_data = [untag(s) for s in gold_data] >>> backoff = RegexpTagger([ ... (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'), # cardinal numbers ... (r'(The|the|A|a|An|an)$', 'AT'), # articles ... (r'.*able$', 'JJ'), # adjectives ... (r'.*ness$', 'NN'), # nouns formed from adjectives ... (r'.*ly$', 'RB'), # adverbs ... (r'.*s$', 'NNS'), # plural nouns ... (r'.*ing$', 'VBG'), # gerunds ... (r'.*ed$', 'VBD'), # past tense verbs ... (r'.*', 'NN') # nouns (default) ... ]) >>> baseline = backoff #see NOTE1 >>> baseline.accuracy(gold_data) #doctest: +ELLIPSIS 0.2450142... >>> # Set up templates >>> Template._cleartemplates() #clear any templates created in earlier tests >>> templates = [Template(Pos([-1])), Template(Pos([-1]), Word([0]))] >>> # Construct a BrillTaggerTrainer >>> tt = BrillTaggerTrainer(baseline, templates, trace=3) >>> tagger1 = tt.train(training_data, max_rules=10) TBL train (fast) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: None) Finding initial useful rules... Found 847 useful rules. B | S F r O | Score = Fixed - Broken c i o t | R Fixed = num tags changed incorrect -> correct o x k h | u Broken = num tags changed correct -> incorrect r e e e | l Other = num tags changed incorrect -> incorrect e d n r | e ------------------+------------------------------------------------------- 132 132 0 0 | AT->DT if Pos:NN@[-1] 85 85 0 0 | NN->, if Pos:NN@[-1] & Word:,@[0] 69 69 0 0 | NN->. 
if Pos:NN@[-1] & Word:.@[0] 51 51 0 0 | NN->IN if Pos:NN@[-1] & Word:of@[0] 47 63 16 162 | NN->IN if Pos:NNS@[-1] 33 33 0 0 | NN->TO if Pos:NN@[-1] & Word:to@[0] 26 26 0 0 | IN->. if Pos:NNS@[-1] & Word:.@[0] 24 24 0 0 | IN->, if Pos:NNS@[-1] & Word:,@[0] 22 27 5 24 | NN->-NONE- if Pos:VBD@[-1] 17 17 0 0 | NN->CC if Pos:NN@[-1] & Word:and@[0] >>> tagger1.rules()[1:3] (Rule('001', 'NN', ',', [(Pos([-1]),'NN'), (Word([0]),',')]), Rule('001', 'NN', '.', [(Pos([-1]),'NN'), (Word([0]),'.')])) >>> train_stats = tagger1.train_stats() >>> [train_stats[stat] for stat in ['initialerrors', 'finalerrors', 'rulescores']] [1776, 1270, [132, 85, 69, 51, 47, 33, 26, 24, 22, 17]] >>> tagger1.print_template_statistics(printunused=False) TEMPLATE STATISTICS (TRAIN) 2 templates, 10 rules) TRAIN ( 2417 tokens) initial 1776 0.2652 final: 1270 0.4746 #ID | Score (train) | #Rules | Template -------------------------------------------- 001 | 305 0.603 | 7 0.700 | Template(Pos([-1]),Word([0])) 000 | 201 0.397 | 3 0.300 | Template(Pos([-1])) >>> tagger1.accuracy(gold_data) # doctest: +ELLIPSIS 0.43996... >>> tagged, test_stats = tagger1.batch_tag_incremental(testing_data, gold_data) >>> tagged[33][12:] == [('foreign', 'IN'), ('debt', 'NN'), ('of', 'IN'), ('$', 'NN'), ('64', 'CD'), ... ('billion', 'NN'), ('*U*', 'NN'), ('--', 'NN'), ('the', 'DT'), ('third-highest', 'NN'), ('in', 'NN'), ... ('the', 'DT'), ('developing', 'VBG'), ('world', 'NN'), ('.', '.')] True >>> [test_stats[stat] for stat in ['initialerrors', 'finalerrors', 'rulescores']] [1859, 1380, [100, 85, 67, 58, 27, 36, 27, 16, 31, 32]] >>> # A high-accuracy tagger >>> tagger2 = tt.train(training_data, max_rules=10, min_acc=0.99) TBL train (fast) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: 0.99) Finding initial useful rules... Found 847 useful rules. B | S F r O | Score = Fixed - Broken c i o t | R Fixed = num tags changed incorrect -> correct o x k h | u Broken = num tags changed correct -> incorrect r e e e | l Other = num tags changed incorrect -> incorrect e d n r | e ------------------+------------------------------------------------------- 132 132 0 0 | AT->DT if Pos:NN@[-1] 85 85 0 0 | NN->, if Pos:NN@[-1] & Word:,@[0] 69 69 0 0 | NN->. if Pos:NN@[-1] & Word:.@[0] 51 51 0 0 | NN->IN if Pos:NN@[-1] & Word:of@[0] 36 36 0 0 | NN->TO if Pos:NN@[-1] & Word:to@[0] 26 26 0 0 | NN->. if Pos:NNS@[-1] & Word:.@[0] 24 24 0 0 | NN->, if Pos:NNS@[-1] & Word:,@[0] 19 19 0 6 | NN->VB if Pos:TO@[-1] 18 18 0 0 | CD->-NONE- if Pos:NN@[-1] & Word:0@[0] 18 18 0 0 | NN->CC if Pos:NN@[-1] & Word:and@[0] >>> tagger2.accuracy(gold_data) # doctest: +ELLIPSIS 0.44159544... >>> tagger2.rules()[2:4] (Rule('001', 'NN', '.', [(Pos([-1]),'NN'), (Word([0]),'.')]), Rule('001', 'NN', 'IN', [(Pos([-1]),'NN'), (Word([0]),'of')])) # NOTE1: (!!FIXME) A far better baseline uses nltk.tag.UnigramTagger, # with a RegexpTagger only as backoff. For instance, # >>> baseline = UnigramTagger(baseline_data, backoff=backoff) # However, as of Nov 2013, nltk.tag.UnigramTagger does not yield consistent results # between python versions. The simplistic backoff above is a workaround to make doctests # get consistent input. 
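# --------------------------------------------------------------------------
# Illustrative sketch (added): the stronger baseline that NOTE1 above
# describes -- a UnigramTagger that backs off to a RegexpTagger -- which is
# the usual choice outside of doctests.  The regexp patterns are a subset of
# those in the doctest; `baseline_data` is assumed to be tagged sentences.
def _demo_unigram_baseline(baseline_data):
    from nltk.tag import RegexpTagger, UnigramTagger

    backoff = RegexpTagger(
        [
            (r"^-?[0-9]+(\.[0-9]+)?$", "CD"),  # cardinal numbers
            (r"(The|the|A|a|An|an)$", "AT"),  # articles
            (r".*s$", "NNS"),  # plural nouns
            (r".*ing$", "VBG"),  # gerunds
            (r".*", "NN"),  # nouns (default)
        ]
    )
    return UnigramTagger(baseline_data, backoff=backoff)
# --------------------------------------------------------------------------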
:param train_sents: training data :type train_sents: list(list(tuple)) :param max_rules: output at most max_rules rules :type max_rules: int :param min_score: stop training when no rules better than min_score can be found :type min_score: int :param min_acc: discard any rule with lower accuracy than min_acc :type min_acc: float or None :return: the learned tagger :rtype: BrillTagger """ # FIXME: several tests are a bit too dependent on tracing format # FIXME: tests in trainer.fast and trainer.brillorig are exact duplicates # Basic idea: Keep track of the rules that apply at each position. # And keep track of the positions to which each rule applies. # Create a new copy of the training corpus, and run the # initial tagger on it. We will progressively update this # test corpus to look more like the training corpus. test_sents = [ list(self._initial_tagger.tag(untag(sent))) for sent in train_sents ] # Collect some statistics on the training process trainstats = {} trainstats["min_acc"] = min_acc trainstats["min_score"] = min_score trainstats["tokencount"] = sum(len(t) for t in test_sents) trainstats["sequencecount"] = len(test_sents) trainstats["templatecount"] = len(self._templates) trainstats["rulescores"] = [] trainstats["initialerrors"] = sum( tag[1] != truth[1] for paired in zip(test_sents, train_sents) for (tag, truth) in zip(*paired) ) trainstats["initialacc"] = ( 1 - trainstats["initialerrors"] / trainstats["tokencount"] ) if self._trace > 0: print( "TBL train (fast) (seqs: {sequencecount}; tokens: {tokencount}; " "tpls: {templatecount}; min score: {min_score}; min acc: {min_acc})".format( **trainstats ) ) # Initialize our mappings. This will find any errors made # by the initial tagger, and use those to generate repair # rules, which are added to the rule mappings. if self._trace: print("Finding initial useful rules...") self._init_mappings(test_sents, train_sents) if self._trace: print(f" Found {len(self._rule_scores)} useful rules.") # Let the user know what we're up to. if self._trace > 2: self._trace_header() elif self._trace == 1: print("Selecting rules...") # Repeatedly select the best rule, and add it to `rules`. rules = [] try: while len(rules) < max_rules: # Find the best rule, and add it to our rule list. rule = self._best_rule(train_sents, test_sents, min_score, min_acc) if rule: rules.append(rule) score = self._rule_scores[rule] trainstats["rulescores"].append(score) else: break # No more good rules left! # Report the rule that we found. if self._trace > 1: self._trace_rule(rule) # Apply the new rule at the relevant sites self._apply_rule(rule, test_sents) # Update _tag_positions[rule.original_tag] and # _tag_positions[rule.replacement_tag] for the affected # positions (i.e., self._positions_by_rule[rule]). self._update_tag_positions(rule) # Update rules that were affected by the change. self._update_rules(rule, train_sents, test_sents) # The user can cancel training manually: except KeyboardInterrupt: print(f"Training stopped manually -- {len(rules)} rules found") # Discard our tag position mapping & rule mappings. self._clean() trainstats["finalerrors"] = trainstats["initialerrors"] - sum( trainstats["rulescores"] ) trainstats["finalacc"] = ( 1 - trainstats["finalerrors"] / trainstats["tokencount"] ) # Create and return a tagger from the rules we found. return BrillTagger(self._initial_tagger, rules, trainstats) def _init_mappings(self, test_sents, train_sents): """ Initialize the tag position mapping & the rule related mappings. 
For each error in test_sents, find new rules that would correct them, and add them to the rule mappings. """ self._tag_positions = defaultdict(list) self._rules_by_position = defaultdict(set) self._positions_by_rule = defaultdict(dict) self._rules_by_score = defaultdict(set) self._rule_scores = defaultdict(int) self._first_unknown_position = defaultdict(int) # Scan through the corpus, initializing the tag_positions # mapping and all the rule-related mappings. for sentnum, sent in enumerate(test_sents): for wordnum, (word, tag) in enumerate(sent): # Initialize tag_positions self._tag_positions[tag].append((sentnum, wordnum)) # If it's an error token, update the rule-related mappings. correct_tag = train_sents[sentnum][wordnum][1] if tag != correct_tag: for rule in self._find_rules(sent, wordnum, correct_tag): self._update_rule_applies(rule, sentnum, wordnum, train_sents) def _clean(self): self._tag_positions = None self._rules_by_position = None self._positions_by_rule = None self._rules_by_score = None self._rule_scores = None self._first_unknown_position = None def _find_rules(self, sent, wordnum, new_tag): """ Use the templates to find rules that apply at index *wordnum* in the sentence *sent* and generate the tag *new_tag*. """ for template in self._templates: yield from template.applicable_rules(sent, wordnum, new_tag) def _update_rule_applies(self, rule, sentnum, wordnum, train_sents): """ Update the rule data tables to reflect the fact that *rule* applies at the position *(sentnum, wordnum)*. """ pos = sentnum, wordnum # If the rule is already known to apply here, ignore. # (This only happens if the position's tag hasn't changed.) if pos in self._positions_by_rule[rule]: return # Update self._positions_by_rule. correct_tag = train_sents[sentnum][wordnum][1] if rule.replacement_tag == correct_tag: self._positions_by_rule[rule][pos] = 1 elif rule.original_tag == correct_tag: self._positions_by_rule[rule][pos] = -1 else: # was wrong, remains wrong self._positions_by_rule[rule][pos] = 0 # Update _rules_by_position self._rules_by_position[pos].add(rule) # Update _rule_scores. old_score = self._rule_scores[rule] self._rule_scores[rule] += self._positions_by_rule[rule][pos] # Update _rules_by_score. self._rules_by_score[old_score].discard(rule) self._rules_by_score[self._rule_scores[rule]].add(rule) def _update_rule_not_applies(self, rule, sentnum, wordnum): """ Update the rule data tables to reflect the fact that *rule* does not apply at the position *(sentnum, wordnum)*. """ pos = sentnum, wordnum # Update _rule_scores. old_score = self._rule_scores[rule] self._rule_scores[rule] -= self._positions_by_rule[rule][pos] # Update _rules_by_score. self._rules_by_score[old_score].discard(rule) self._rules_by_score[self._rule_scores[rule]].add(rule) # Update _positions_by_rule del self._positions_by_rule[rule][pos] self._rules_by_position[pos].remove(rule) # Optional addition: if the rule now applies nowhere, delete # all its dictionary entries. def _best_rule(self, train_sents, test_sents, min_score, min_acc): """ Find the next best rule. This is done by repeatedly taking a rule with the highest score and stepping through the corpus to see where it applies. When it makes an error (decreasing its score) it's bumped down, and we try a new rule with the highest score. When we find a rule which has the highest score *and* which has been tested against the entire corpus, we can conclude that it's the next best rule. 
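# --------------------------------------------------------------------------
# Illustrative sketch (added): the score bookkeeping described above in
# miniature.  Each (rule, position) pair gets an effect of +1 (fixes a tag),
# -1 (breaks a correct tag) or 0 (wrong stays wrong); a rule's score is the
# sum of its effects, and rules are bucketed by score so a best candidate
# can be located quickly.  The "rules" here are plain strings, not TagRules.
def _demo_rule_score_bookkeeping():
    from collections import defaultdict

    positions_by_rule = defaultdict(dict)
    rule_scores = defaultdict(int)
    rules_by_score = defaultdict(set)

    def record(rule, pos, effect):
        old = rule_scores[rule]
        positions_by_rule[rule][pos] = effect
        rule_scores[rule] += effect
        rules_by_score[old].discard(rule)
        rules_by_score[rule_scores[rule]].add(rule)

    record("NN->DT if prev is sent-start", (0, 0), +1)
    record("NN->DT if prev is sent-start", (0, 3), +1)
    record("NN->DT if prev is sent-start", (1, 2), -1)
    record("NN->VB if prev tag is TO", (0, 5), +1)
    best_score = max(score for score, rules in rules_by_score.items() if rules)
    print(best_score, rules_by_score[best_score])
    return rule_scores
# --------------------------------------------------------------------------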
""" for max_score in sorted(self._rules_by_score.keys(), reverse=True): if len(self._rules_by_score) == 0: return None if max_score < min_score or max_score <= 0: return None best_rules = list(self._rules_by_score[max_score]) if self._deterministic: best_rules.sort(key=repr) for rule in best_rules: positions = self._tag_positions[rule.original_tag] unk = self._first_unknown_position.get(rule, (0, -1)) start = bisect.bisect_left(positions, unk) for i in range(start, len(positions)): sentnum, wordnum = positions[i] if rule.applies(test_sents[sentnum], wordnum): self._update_rule_applies(rule, sentnum, wordnum, train_sents) if self._rule_scores[rule] < max_score: self._first_unknown_position[rule] = (sentnum, wordnum + 1) break # The update demoted the rule. if self._rule_scores[rule] == max_score: self._first_unknown_position[rule] = (len(train_sents) + 1, 0) # optimization: if no min_acc threshold given, don't bother computing accuracy if min_acc is None: return rule else: changes = self._positions_by_rule[rule].values() num_fixed = len([c for c in changes if c == 1]) num_broken = len([c for c in changes if c == -1]) # acc here is fixed/(fixed+broken); could also be # fixed/(fixed+broken+other) == num_fixed/len(changes) acc = num_fixed / (num_fixed + num_broken) if acc >= min_acc: return rule # else: rule too inaccurate, discard and try next # We demoted (or skipped due to < min_acc, if that was given) # all the rules with score==max_score. assert min_acc is not None or not self._rules_by_score[max_score] if not self._rules_by_score[max_score]: del self._rules_by_score[max_score] def _apply_rule(self, rule, test_sents): """ Update *test_sents* by applying *rule* everywhere where its conditions are met. """ update_positions = set(self._positions_by_rule[rule]) new_tag = rule.replacement_tag if self._trace > 3: self._trace_apply(len(update_positions)) # Update test_sents. for (sentnum, wordnum) in update_positions: text = test_sents[sentnum][wordnum][0] test_sents[sentnum][wordnum] = (text, new_tag) def _update_tag_positions(self, rule): """ Update _tag_positions to reflect the changes to tags that are made by *rule*. """ # Update the tag index. for pos in self._positions_by_rule[rule]: # Delete the old tag. old_tag_positions = self._tag_positions[rule.original_tag] old_index = bisect.bisect_left(old_tag_positions, pos) del old_tag_positions[old_index] # Insert the new tag. new_tag_positions = self._tag_positions[rule.replacement_tag] bisect.insort_left(new_tag_positions, pos) def _update_rules(self, rule, train_sents, test_sents): """ Check if we should add or remove any rules from consideration, given the changes made by *rule*. """ # Collect a list of all positions that might be affected. neighbors = set() for sentnum, wordnum in self._positions_by_rule[rule]: for template in self._templates: n = template.get_neighborhood(test_sents[sentnum], wordnum) neighbors.update([(sentnum, i) for i in n]) # Update the rules at each position. num_obsolete = num_new = num_unseen = 0 for sentnum, wordnum in neighbors: test_sent = test_sents[sentnum] correct_tag = train_sents[sentnum][wordnum][1] # Check if the change causes any rule at this position to # stop matching; if so, then update our rule mappings # accordingly. 
old_rules = set(self._rules_by_position[sentnum, wordnum]) for old_rule in old_rules: if not old_rule.applies(test_sent, wordnum): num_obsolete += 1 self._update_rule_not_applies(old_rule, sentnum, wordnum) # Check if the change causes our templates to propose any # new rules for this position. for template in self._templates: for new_rule in template.applicable_rules( test_sent, wordnum, correct_tag ): if new_rule not in old_rules: num_new += 1 if new_rule not in self._rule_scores: num_unseen += 1 old_rules.add(new_rule) self._update_rule_applies( new_rule, sentnum, wordnum, train_sents ) # We may have caused other rules to match here, that are # not proposed by our templates -- in particular, rules # that are harmful or neutral. We therefore need to # update any rule whose first_unknown_position is past # this rule. for new_rule, pos in self._first_unknown_position.items(): if pos > (sentnum, wordnum): if new_rule not in old_rules: num_new += 1 if new_rule.applies(test_sent, wordnum): self._update_rule_applies( new_rule, sentnum, wordnum, train_sents ) if self._trace > 3: self._trace_update_rules(num_obsolete, num_new, num_unseen) # Tracing def _trace_header(self): print( """ B | S F r O | Score = Fixed - Broken c i o t | R Fixed = num tags changed incorrect -> correct o x k h | u Broken = num tags changed correct -> incorrect r e e e | l Other = num tags changed incorrect -> incorrect e d n r | e ------------------+------------------------------------------------------- """.rstrip() ) def _trace_rule(self, rule): assert self._rule_scores[rule] == sum(self._positions_by_rule[rule].values()) changes = self._positions_by_rule[rule].values() num_fixed = len([c for c in changes if c == 1]) num_broken = len([c for c in changes if c == -1]) num_other = len([c for c in changes if c == 0]) score = self._rule_scores[rule] rulestr = rule.format(self._ruleformat) if self._trace > 2: print( "{:4d}{:4d}{:4d}{:4d} |".format( score, num_fixed, num_broken, num_other ), end=" ", ) print( textwrap.fill( rulestr, initial_indent=" " * 20, width=79, subsequent_indent=" " * 18 + "| ", ).strip() ) else: print(rulestr) def _trace_apply(self, num_updates): prefix = " " * 18 + "|" print(prefix) print(prefix, f"Applying rule to {num_updates} positions.") def _trace_update_rules(self, num_obsolete, num_new, num_unseen): prefix = " " * 18 + "|" print(prefix, "Updated rule tables:") print(prefix, (f" - {num_obsolete} rule applications removed")) print( prefix, (f" - {num_new} rule applications added ({num_unseen} novel)"), ) print(prefix) nltk-3.7/nltk/tag/crf.py000066400000000000000000000167131420073152400152100ustar00rootroot00000000000000# Natural Language Toolkit: Interface to the CRFSuite Tagger # # Copyright (C) 2001-2022 NLTK Project # Author: Long Duong # URL: # For license information, see LICENSE.TXT """ A module for POS tagging using CRFSuite """ import re import unicodedata from nltk.tag.api import TaggerI try: import pycrfsuite except ImportError: pass class CRFTagger(TaggerI): """ A module for POS tagging using CRFSuite https://pypi.python.org/pypi/python-crfsuite >>> from nltk.tag import CRFTagger >>> ct = CRFTagger() >>> train_data = [[('University','Noun'), ('is','Verb'), ('a','Det'), ('good','Adj'), ('place','Noun')], ... 
[('dog','Noun'),('eat','Verb'),('meat','Noun')]] >>> ct.train(train_data,'model.crf.tagger') >>> ct.tag_sents([['dog','is','good'], ['Cat','eat','meat']]) [[('dog', 'Noun'), ('is', 'Verb'), ('good', 'Adj')], [('Cat', 'Noun'), ('eat', 'Verb'), ('meat', 'Noun')]] >>> gold_sentences = [[('dog','Noun'),('is','Verb'),('good','Adj')] , [('Cat','Noun'),('eat','Verb'), ('meat','Noun')]] >>> ct.accuracy(gold_sentences) 1.0 Setting learned model file >>> ct = CRFTagger() >>> ct.set_model_file('model.crf.tagger') >>> ct.accuracy(gold_sentences) 1.0 """ def __init__(self, feature_func=None, verbose=False, training_opt={}): """ Initialize the CRFSuite tagger :param feature_func: The function that extracts features for each token of a sentence. This function should take 2 parameters: tokens and index which extract features at index position from tokens list. See the build in _get_features function for more detail. :param verbose: output the debugging messages during training. :type verbose: boolean :param training_opt: python-crfsuite training options :type training_opt: dictionary Set of possible training options (using LBFGS training algorithm). :'feature.minfreq': The minimum frequency of features. :'feature.possible_states': Force to generate possible state features. :'feature.possible_transitions': Force to generate possible transition features. :'c1': Coefficient for L1 regularization. :'c2': Coefficient for L2 regularization. :'max_iterations': The maximum number of iterations for L-BFGS optimization. :'num_memories': The number of limited memories for approximating the inverse hessian matrix. :'epsilon': Epsilon for testing the convergence of the objective. :'period': The duration of iterations to test the stopping criterion. :'delta': The threshold for the stopping criterion; an L-BFGS iteration stops when the improvement of the log likelihood over the last ${period} iterations is no greater than this threshold. :'linesearch': The line search algorithm used in L-BFGS updates: - 'MoreThuente': More and Thuente's method, - 'Backtracking': Backtracking method with regular Wolfe condition, - 'StrongBacktracking': Backtracking method with strong Wolfe condition :'max_linesearch': The maximum number of trials for the line search algorithm. """ self._model_file = "" self._tagger = pycrfsuite.Tagger() if feature_func is None: self._feature_func = self._get_features else: self._feature_func = feature_func self._verbose = verbose self._training_options = training_opt self._pattern = re.compile(r"\d") def set_model_file(self, model_file): self._model_file = model_file self._tagger.open(self._model_file) def _get_features(self, tokens, idx): """ Extract basic features about this word including - Current word - is it capitalized? - Does it have punctuation? - Does it have a number? - Suffixes up to length 3 Note that : we might include feature over previous word, next word etc. 
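# --------------------------------------------------------------------------
# Illustrative sketch (added): a custom feature function of the kind hinted
# at above, adding previous/next-word context on top of a few basic
# features.  It follows the (tokens, idx) -> list(str) contract expected by
# CRFTagger; python-crfsuite must be installed, and the model file name is
# an arbitrary choice for the example.
def _demo_custom_feature_func():
    from nltk.tag import CRFTagger

    def window_features(tokens, idx):
        token = tokens[idx]
        feats = ["WORD_" + token, "SUF_" + token[-2:]]
        if token[0].isupper():
            feats.append("CAPITALIZATION")
        # Context features over the neighbouring words:
        feats.append("PREV_" + (tokens[idx - 1] if idx > 0 else "<BOS>"))
        feats.append("NEXT_" + (tokens[idx + 1] if idx < len(tokens) - 1 else "<EOS>"))
        return feats

    train_data = [
        [("University", "Noun"), ("is", "Verb"), ("a", "Det"), ("good", "Adj"), ("place", "Noun")],
        [("dog", "Noun"), ("eat", "Verb"), ("meat", "Noun")],
    ]
    ct = CRFTagger(feature_func=window_features, training_opt={"c2": 0.1})
    ct.train(train_data, "model.window.crf.tagger")
    return ct.tag(["dog", "is", "good"])
# --------------------------------------------------------------------------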
:return: a list which contains the features :rtype: list(str) """ token = tokens[idx] feature_list = [] if not token: return feature_list # Capitalization if token[0].isupper(): feature_list.append("CAPITALIZATION") # Number if re.search(self._pattern, token) is not None: feature_list.append("HAS_NUM") # Punctuation punc_cat = {"Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po"} if all(unicodedata.category(x) in punc_cat for x in token): feature_list.append("PUNCTUATION") # Suffix up to length 3 if len(token) > 1: feature_list.append("SUF_" + token[-1:]) if len(token) > 2: feature_list.append("SUF_" + token[-2:]) if len(token) > 3: feature_list.append("SUF_" + token[-3:]) feature_list.append("WORD_" + token) return feature_list def tag_sents(self, sents): """ Tag a list of sentences. NB before using this function, user should specify the mode_file either by - Train a new model using ``train`` function - Use the pre-trained model which is set via ``set_model_file`` function :params sentences: list of sentences needed to tag. :type sentences: list(list(str)) :return: list of tagged sentences. :rtype: list(list(tuple(str,str))) """ if self._model_file == "": raise Exception( " No model file is found !! Please use train or set_model_file function" ) # We need the list of sentences instead of the list generator for matching the input and output result = [] for tokens in sents: features = [self._feature_func(tokens, i) for i in range(len(tokens))] labels = self._tagger.tag(features) if len(labels) != len(tokens): raise Exception(" Predicted Length Not Matched, Expect Errors !") tagged_sent = list(zip(tokens, labels)) result.append(tagged_sent) return result def train(self, train_data, model_file): """ Train the CRF tagger using CRFSuite :params train_data : is the list of annotated sentences. :type train_data : list (list(tuple(str,str))) :params model_file : the model will be saved to this file. """ trainer = pycrfsuite.Trainer(verbose=self._verbose) trainer.set_params(self._training_options) for sent in train_data: tokens, labels = zip(*sent) features = [self._feature_func(tokens, i) for i in range(len(tokens))] trainer.append(features, labels) # Now train the model, the output should be model_file trainer.train(model_file) # Save the model file self.set_model_file(model_file) def tag(self, tokens): """ Tag a sentence using Python CRFSuite Tagger. NB before using this function, user should specify the mode_file either by - Train a new model using ``train`` function - Use the pre-trained model which is set via ``set_model_file`` function :params tokens: list of tokens needed to tag. :type tokens: list(str) :return: list of tagged tokens. :rtype: list(tuple(str,str)) """ return self.tag_sents([tokens])[0] nltk-3.7/nltk/tag/hmm.py000066400000000000000000001375741420073152400152300ustar00rootroot00000000000000# Natural Language Toolkit: Hidden Markov Model # # Copyright (C) 2001-2022 NLTK Project # Author: Trevor Cohn # Philip Blunsom # Tiago Tresoldi (fixes) # Steven Bird (fixes) # Joseph Frazee (fixes) # Steven Xu (fixes) # URL: # For license information, see LICENSE.TXT """ Hidden Markov Models (HMMs) largely used to assign the correct label sequence to sequential data or assess the probability of a given label and data sequence. These models are finite state machines characterised by a number of states, transitions between these states, and output symbols emitted while in each state. The HMM is an extension to the Markov chain, where each state corresponds deterministically to a given event. 
In the HMM the observation is a probabilistic function of the state. HMMs share the Markov chain's assumption, being that the probability of transition from one state to another only depends on the current state - i.e. the series of states that led to the current state are not used. They are also time invariant. The HMM is a directed graph, with probability weighted edges (representing the probability of a transition between the source and sink states) where each vertex emits an output symbol when entered. The symbol (or observation) is non-deterministically generated. For this reason, knowing that a sequence of output observations was generated by a given HMM does not mean that the corresponding sequence of states (and what the current state is) is known. This is the 'hidden' in the hidden markov model. Formally, a HMM can be characterised by: - the output observation alphabet. This is the set of symbols which may be observed as output of the system. - the set of states. - the transition probabilities *a_{ij} = P(s_t = j | s_{t-1} = i)*. These represent the probability of transition to each state from a given state. - the output probability matrix *b_i(k) = P(X_t = o_k | s_t = i)*. These represent the probability of observing each symbol in a given state. - the initial state distribution. This gives the probability of starting in each state. To ground this discussion, take a common NLP application, part-of-speech (POS) tagging. An HMM is desirable for this task as the highest probability tag sequence can be calculated for a given sequence of word forms. This differs from other tagging techniques which often tag each word individually, seeking to optimise each individual tagging greedily without regard to the optimal combination of tags for a larger unit, such as a sentence. The HMM does this with the Viterbi algorithm, which efficiently computes the optimal path through the graph given the sequence of words forms. In POS tagging the states usually have a 1:1 correspondence with the tag alphabet - i.e. each state represents a single tag. The output observation alphabet is the set of word forms (the lexicon), and the remaining three parameters are derived by a training regime. With this information the probability of a given sentence can be easily derived, by simply summing the probability of each distinct path through the model. Similarly, the highest probability tagging sequence can be derived with the Viterbi algorithm, yielding a state sequence which can be mapped into a tag sequence. This discussion assumes that the HMM has been trained. This is probably the most difficult task with the model, and requires either MLE estimates of the parameters or unsupervised learning using the Baum-Welch algorithm, a variant of EM. For more information, please consult the source code for this module, which includes extensive demonstration code. """ import itertools import re try: import numpy as np except ImportError: pass from nltk.metrics import accuracy from nltk.probability import ( ConditionalFreqDist, ConditionalProbDist, DictionaryConditionalProbDist, DictionaryProbDist, FreqDist, LidstoneProbDist, MLEProbDist, MutableProbDist, RandomProbDist, ) from nltk.tag.api import TaggerI from nltk.util import LazyMap, unique_list _TEXT = 0 # index of text in a tuple _TAG = 1 # index of tag in a tuple def _identity(labeled_symbols): return labeled_symbols class HiddenMarkovModelTagger(TaggerI): """ Hidden Markov model class, a generative model for labelling sequence data. 
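# --------------------------------------------------------------------------
# Illustrative sketch (added): the three parameter sets that characterise an
# HMM (initial distribution pi, transitions A, emissions B) written out as
# plain numpy arrays, together with the joint probability of one
# state/symbol sequence.  The two-state toy numbers are invented for the
# example.
def _demo_hmm_parameters():
    import numpy as np

    states = ["DET", "NOUN"]  # state alphabet
    symbols = ["the", "dog", "cat"]  # output observation alphabet
    pi = np.array([0.8, 0.2])  # pi[i] = P(s_0 = i)
    A = np.array([[0.1, 0.9], [0.6, 0.4]])  # A[i, j] = P(s_t = j | s_{t-1} = i)
    B = np.array([[0.90, 0.05, 0.05], [0.05, 0.55, 0.40]])  # B[i, k] = P(o_t = k | s_t = i)

    # Joint probability of observing "the dog" tagged DET NOUN:
    obs = [symbols.index("the"), symbols.index("dog")]
    path = [states.index("DET"), states.index("NOUN")]
    p = pi[path[0]] * B[path[0], obs[0]]
    for t in range(1, len(obs)):
        p *= A[path[t - 1], path[t]] * B[path[t], obs[t]]
    print(p)  # 0.8 * 0.9 * 0.9 * 0.55
    return p
# --------------------------------------------------------------------------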
These models define the joint probability of a sequence of symbols and their labels (state transitions) as the product of the starting state probability, the probability of each state transition, and the probability of each observation being generated from each state. This is described in more detail in the module documentation. This implementation is based on the HMM description in Chapter 8, Huang, Acero and Hon, Spoken Language Processing and includes an extension for training shallow HMM parsers or specialized HMMs as in Molina et. al, 2002. A specialized HMM modifies training data by applying a specialization function to create a new training set that is more appropriate for sequential tagging with an HMM. A typical use case is chunking. :param symbols: the set of output symbols (alphabet) :type symbols: seq of any :param states: a set of states representing state space :type states: seq of any :param transitions: transition probabilities; Pr(s_i | s_j) is the probability of transition from state i given the model is in state_j :type transitions: ConditionalProbDistI :param outputs: output probabilities; Pr(o_k | s_i) is the probability of emitting symbol k when entering state i :type outputs: ConditionalProbDistI :param priors: initial state distribution; Pr(s_i) is the probability of starting in state i :type priors: ProbDistI :param transform: an optional function for transforming training instances, defaults to the identity function. :type transform: callable """ def __init__( self, symbols, states, transitions, outputs, priors, transform=_identity ): self._symbols = unique_list(symbols) self._states = unique_list(states) self._transitions = transitions self._outputs = outputs self._priors = priors self._cache = None self._transform = transform @classmethod def _train( cls, labeled_sequence, test_sequence=None, unlabeled_sequence=None, transform=_identity, estimator=None, **kwargs, ): if estimator is None: def estimator(fd, bins): return LidstoneProbDist(fd, 0.1, bins) labeled_sequence = LazyMap(transform, labeled_sequence) symbols = unique_list(word for sent in labeled_sequence for word, tag in sent) tag_set = unique_list(tag for sent in labeled_sequence for word, tag in sent) trainer = HiddenMarkovModelTrainer(tag_set, symbols) hmm = trainer.train_supervised(labeled_sequence, estimator=estimator) hmm = cls( hmm._symbols, hmm._states, hmm._transitions, hmm._outputs, hmm._priors, transform=transform, ) if test_sequence: hmm.test(test_sequence, verbose=kwargs.get("verbose", False)) if unlabeled_sequence: max_iterations = kwargs.get("max_iterations", 5) hmm = trainer.train_unsupervised( unlabeled_sequence, model=hmm, max_iterations=max_iterations ) if test_sequence: hmm.test(test_sequence, verbose=kwargs.get("verbose", False)) return hmm @classmethod def train( cls, labeled_sequence, test_sequence=None, unlabeled_sequence=None, **kwargs ): """ Train a new HiddenMarkovModelTagger using the given labeled and unlabeled training instances. Testing will be performed if test instances are provided. :return: a hidden markov model tagger :rtype: HiddenMarkovModelTagger :param labeled_sequence: a sequence of labeled training instances, i.e. a list of sentences represented as tuples :type labeled_sequence: list(list) :param test_sequence: a sequence of labeled test instances :type test_sequence: list(list) :param unlabeled_sequence: a sequence of unlabeled training instances, i.e. 
a list of sentences represented as words :type unlabeled_sequence: list(list) :param transform: an optional function for transforming training instances, defaults to the identity function, see ``transform()`` :type transform: function :param estimator: an optional function or class that maps a condition's frequency distribution to its probability distribution, defaults to a Lidstone distribution with gamma = 0.1 :type estimator: class or function :param verbose: boolean flag indicating whether training should be verbose or include printed output :type verbose: bool :param max_iterations: number of Baum-Welch iterations to perform :type max_iterations: int """ return cls._train(labeled_sequence, test_sequence, unlabeled_sequence, **kwargs) def probability(self, sequence): """ Returns the probability of the given symbol sequence. If the sequence is labelled, then returns the joint probability of the symbol, state sequence. Otherwise, uses the forward algorithm to find the probability over all label sequences. :return: the probability of the sequence :rtype: float :param sequence: the sequence of symbols which must contain the TEXT property, and optionally the TAG property :type sequence: Token """ return 2 ** (self.log_probability(self._transform(sequence))) def log_probability(self, sequence): """ Returns the log-probability of the given symbol sequence. If the sequence is labelled, then returns the joint log-probability of the symbol, state sequence. Otherwise, uses the forward algorithm to find the log-probability over all label sequences. :return: the log-probability of the sequence :rtype: float :param sequence: the sequence of symbols which must contain the TEXT property, and optionally the TAG property :type sequence: Token """ sequence = self._transform(sequence) T = len(sequence) if T > 0 and sequence[0][_TAG]: last_state = sequence[0][_TAG] p = self._priors.logprob(last_state) + self._output_logprob( last_state, sequence[0][_TEXT] ) for t in range(1, T): state = sequence[t][_TAG] p += self._transitions[last_state].logprob( state ) + self._output_logprob(state, sequence[t][_TEXT]) last_state = state return p else: alpha = self._forward_probability(sequence) p = logsumexp2(alpha[T - 1]) return p def tag(self, unlabeled_sequence): """ Tags the sequence with the highest probability state sequence. This uses the best_path method to find the Viterbi path. :return: a labelled sequence of symbols :rtype: list :param unlabeled_sequence: the sequence of unlabeled symbols :type unlabeled_sequence: list """ unlabeled_sequence = self._transform(unlabeled_sequence) return self._tag(unlabeled_sequence) def _tag(self, unlabeled_sequence): path = self._best_path(unlabeled_sequence) return list(zip(unlabeled_sequence, path)) def _output_logprob(self, state, symbol): """ :return: the log probability of the symbol being observed in the given state :rtype: float """ return self._outputs[state].logprob(symbol) def _create_cache(self): """ The cache is a tuple (P, O, X, S) where: - S maps symbols to integers. 
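# --------------------------------------------------------------------------
# Illustrative sketch (added): the supervised entry point above in use.  The
# treebank slices and the test sentence are arbitrary choices, and the
# corpus data must be downloaded for this to run.
def _demo_hmm_tagger_train():
    from nltk.corpus import treebank

    train_sents = treebank.tagged_sents()[:300]
    test_sents = treebank.tagged_sents()[300:320]
    tagger = HiddenMarkovModelTagger.train(train_sents, test_sequence=test_sents)
    tagged = tagger.tag(["Pierre", "Vinken", "will", "join", "the", "board", "."])
    print(tagged)
    # Joint log-probability of the tagging the model just produced:
    print(tagger.log_probability(tagged))
    return tagger
# --------------------------------------------------------------------------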
I.e., it is the inverse mapping from self._symbols; for each symbol s in self._symbols, the following is true:: self._symbols[S[s]] == s - O is the log output probabilities:: O[i,k] = log( P(token[t]=sym[k]|tag[t]=state[i]) ) - X is the log transition probabilities:: X[i,j] = log( P(tag[t]=state[j]|tag[t-1]=state[i]) ) - P is the log prior probabilities:: P[i] = log( P(tag[0]=state[i]) ) """ if not self._cache: N = len(self._states) M = len(self._symbols) P = np.zeros(N, np.float32) X = np.zeros((N, N), np.float32) O = np.zeros((N, M), np.float32) for i in range(N): si = self._states[i] P[i] = self._priors.logprob(si) for j in range(N): X[i, j] = self._transitions[si].logprob(self._states[j]) for k in range(M): O[i, k] = self._output_logprob(si, self._symbols[k]) S = {} for k in range(M): S[self._symbols[k]] = k self._cache = (P, O, X, S) def _update_cache(self, symbols): # add new symbols to the symbol table and repopulate the output # probabilities and symbol table mapping if symbols: self._create_cache() P, O, X, S = self._cache for symbol in symbols: if symbol not in self._symbols: self._cache = None self._symbols.append(symbol) # don't bother with the work if there aren't any new symbols if not self._cache: N = len(self._states) M = len(self._symbols) Q = O.shape[1] # add new columns to the output probability table without # destroying the old probabilities O = np.hstack([O, np.zeros((N, M - Q), np.float32)]) for i in range(N): si = self._states[i] # only calculate probabilities for new symbols for k in range(Q, M): O[i, k] = self._output_logprob(si, self._symbols[k]) # only create symbol mappings for new symbols for k in range(Q, M): S[self._symbols[k]] = k self._cache = (P, O, X, S) def reset_cache(self): self._cache = None def best_path(self, unlabeled_sequence): """ Returns the state sequence of the optimal (most probable) path through the HMM. Uses the Viterbi algorithm to calculate this part by dynamic programming. :return: the state sequence :rtype: sequence of any :param unlabeled_sequence: the sequence of unlabeled symbols :type unlabeled_sequence: list """ unlabeled_sequence = self._transform(unlabeled_sequence) return self._best_path(unlabeled_sequence) def _best_path(self, unlabeled_sequence): T = len(unlabeled_sequence) N = len(self._states) self._create_cache() self._update_cache(unlabeled_sequence) P, O, X, S = self._cache V = np.zeros((T, N), np.float32) B = -np.ones((T, N), int) V[0] = P + O[:, S[unlabeled_sequence[0]]] for t in range(1, T): for j in range(N): vs = V[t - 1, :] + X[:, j] best = np.argmax(vs) V[t, j] = vs[best] + O[j, S[unlabeled_sequence[t]]] B[t, j] = best current = np.argmax(V[T - 1, :]) sequence = [current] for t in range(T - 1, 0, -1): last = B[t, current] sequence.append(last) current = last sequence.reverse() return list(map(self._states.__getitem__, sequence)) def best_path_simple(self, unlabeled_sequence): """ Returns the state sequence of the optimal (most probable) path through the HMM. Uses the Viterbi algorithm to calculate this part by dynamic programming. This uses a simple, direct method, and is included for teaching purposes. 
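# --------------------------------------------------------------------------
# Illustrative sketch (added): the Viterbi recursion used by _best_path
# above, written as a standalone function over plain log-probability
# arrays.  The layout follows the (P, O, X, S) cache described above, minus
# the symbol map S: P[i] are log priors, X[i, j] = log P(state j | state i),
# O[i, k] are log emissions, and obs already holds symbol indices.
def _demo_viterbi(P, X, O, obs):
    import numpy as np

    P, X, O = (np.asarray(a, dtype=float) for a in (P, X, O))
    T, N = len(obs), len(P)
    V = np.zeros((T, N))
    B = -np.ones((T, N), dtype=int)  # back-pointers
    V[0] = P + O[:, obs[0]]
    for t in range(1, T):
        for j in range(N):
            scores = V[t - 1] + X[:, j]  # best log-score of reaching state j
            B[t, j] = np.argmax(scores)
            V[t, j] = scores[B[t, j]] + O[j, obs[t]]
    # Follow the back-pointers from the best final state:
    state = int(np.argmax(V[T - 1]))
    path = [state]
    for t in range(T - 1, 0, -1):
        state = int(B[t, state])
        path.append(state)
    return list(reversed(path))
# A usage note: log2-transformed versions of the toy pi/A/B arrays from the
# parameter sketch above could be passed in, with obs a list such as [0, 1].
# --------------------------------------------------------------------------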
:return: the state sequence :rtype: sequence of any :param unlabeled_sequence: the sequence of unlabeled symbols :type unlabeled_sequence: list """ unlabeled_sequence = self._transform(unlabeled_sequence) return self._best_path_simple(unlabeled_sequence) def _best_path_simple(self, unlabeled_sequence): T = len(unlabeled_sequence) N = len(self._states) V = np.zeros((T, N), np.float64) B = {} # find the starting log probabilities for each state symbol = unlabeled_sequence[0] for i, state in enumerate(self._states): V[0, i] = self._priors.logprob(state) + self._output_logprob(state, symbol) B[0, state] = None # find the maximum log probabilities for reaching each state at time t for t in range(1, T): symbol = unlabeled_sequence[t] for j in range(N): sj = self._states[j] best = None for i in range(N): si = self._states[i] va = V[t - 1, i] + self._transitions[si].logprob(sj) if not best or va > best[0]: best = (va, si) V[t, j] = best[0] + self._output_logprob(sj, symbol) B[t, sj] = best[1] # find the highest probability final state best = None for i in range(N): val = V[T - 1, i] if not best or val > best[0]: best = (val, self._states[i]) # traverse the back-pointers B to find the state sequence current = best[1] sequence = [current] for t in range(T - 1, 0, -1): last = B[t, current] sequence.append(last) current = last sequence.reverse() return sequence def random_sample(self, rng, length): """ Randomly sample the HMM to generate a sentence of a given length. This samples the prior distribution then the observation distribution and transition distribution for each subsequent observation and state. This will mostly generate unintelligible garbage, but can provide some amusement. :return: the randomly created state/observation sequence, generated according to the HMM's probability distributions. The SUBTOKENS have TEXT and TAG properties containing the observation and state respectively. :rtype: list :param rng: random number generator :type rng: Random (or any object with a random() method) :param length: desired output length :type length: int """ # sample the starting state and symbol prob dists tokens = [] state = self._sample_probdist(self._priors, rng.random(), self._states) symbol = self._sample_probdist( self._outputs[state], rng.random(), self._symbols ) tokens.append((symbol, state)) for i in range(1, length): # sample the state transition and symbol prob dists state = self._sample_probdist( self._transitions[state], rng.random(), self._states ) symbol = self._sample_probdist( self._outputs[state], rng.random(), self._symbols ) tokens.append((symbol, state)) return tokens def _sample_probdist(self, probdist, p, samples): cum_p = 0 for sample in samples: add_p = probdist.prob(sample) if cum_p <= p <= cum_p + add_p: return sample cum_p += add_p raise Exception("Invalid probability distribution - " "does not sum to one") def entropy(self, unlabeled_sequence): """ Returns the entropy over labellings of the given sequence. This is given by:: H(O) = - sum_S Pr(S | O) log Pr(S | O) where the summation ranges over all state sequences, S. Let *Z = Pr(O) = sum_S Pr(S, O)}* where the summation ranges over all state sequences and O is the observation sequence. As such the entropy can be re-expressed as:: H = - sum_S Pr(S | O) log [ Pr(S, O) / Z ] = log Z - sum_S Pr(S | O) log Pr(S, 0) = log Z - sum_S Pr(S | O) [ log Pr(S_0) + sum_t Pr(S_t | S_{t-1}) + sum_t Pr(O_t | S_t) ] The order of summation for the log terms can be flipped, allowing dynamic programming to be used to calculate the entropy. 
Specifically, we use the forward and backward probabilities (alpha, beta) giving:: H = log Z - sum_s0 alpha_0(s0) beta_0(s0) / Z * log Pr(s0) + sum_t,si,sj alpha_t(si) Pr(sj | si) Pr(O_t+1 | sj) beta_t(sj) / Z * log Pr(sj | si) + sum_t,st alpha_t(st) beta_t(st) / Z * log Pr(O_t | st) This simply uses alpha and beta to find the probabilities of partial sequences, constrained to include the given state(s) at some point in time. """ unlabeled_sequence = self._transform(unlabeled_sequence) T = len(unlabeled_sequence) N = len(self._states) alpha = self._forward_probability(unlabeled_sequence) beta = self._backward_probability(unlabeled_sequence) normalisation = logsumexp2(alpha[T - 1]) entropy = normalisation # starting state, t = 0 for i, state in enumerate(self._states): p = 2 ** (alpha[0, i] + beta[0, i] - normalisation) entropy -= p * self._priors.logprob(state) # print('p(s_0 = %s) =' % state, p) # state transitions for t0 in range(T - 1): t1 = t0 + 1 for i0, s0 in enumerate(self._states): for i1, s1 in enumerate(self._states): p = 2 ** ( alpha[t0, i0] + self._transitions[s0].logprob(s1) + self._outputs[s1].logprob(unlabeled_sequence[t1][_TEXT]) + beta[t1, i1] - normalisation ) entropy -= p * self._transitions[s0].logprob(s1) # print('p(s_%d = %s, s_%d = %s) =' % (t0, s0, t1, s1), p) # symbol emissions for t in range(T): for i, state in enumerate(self._states): p = 2 ** (alpha[t, i] + beta[t, i] - normalisation) entropy -= p * self._outputs[state].logprob( unlabeled_sequence[t][_TEXT] ) # print('p(s_%d = %s) =' % (t, state), p) return entropy def point_entropy(self, unlabeled_sequence): """ Returns the pointwise entropy over the possible states at each position in the chain, given the observation sequence. """ unlabeled_sequence = self._transform(unlabeled_sequence) T = len(unlabeled_sequence) N = len(self._states) alpha = self._forward_probability(unlabeled_sequence) beta = self._backward_probability(unlabeled_sequence) normalisation = logsumexp2(alpha[T - 1]) entropies = np.zeros(T, np.float64) probs = np.zeros(N, np.float64) for t in range(T): for s in range(N): probs[s] = alpha[t, s] + beta[t, s] - normalisation for s in range(N): entropies[t] -= 2 ** (probs[s]) * probs[s] return entropies def _exhaustive_entropy(self, unlabeled_sequence): unlabeled_sequence = self._transform(unlabeled_sequence) T = len(unlabeled_sequence) N = len(self._states) labellings = [[state] for state in self._states] for t in range(T - 1): current = labellings labellings = [] for labelling in current: for state in self._states: labellings.append(labelling + [state]) log_probs = [] for labelling in labellings: labeled_sequence = unlabeled_sequence[:] for t, label in enumerate(labelling): labeled_sequence[t] = (labeled_sequence[t][_TEXT], label) lp = self.log_probability(labeled_sequence) log_probs.append(lp) normalisation = _log_add(*log_probs) entropy = 0 for lp in log_probs: lp -= normalisation entropy -= 2 ** (lp) * lp return entropy def _exhaustive_point_entropy(self, unlabeled_sequence): unlabeled_sequence = self._transform(unlabeled_sequence) T = len(unlabeled_sequence) N = len(self._states) labellings = [[state] for state in self._states] for t in range(T - 1): current = labellings labellings = [] for labelling in current: for state in self._states: labellings.append(labelling + [state]) log_probs = [] for labelling in labellings: labelled_sequence = unlabeled_sequence[:] for t, label in enumerate(labelling): labelled_sequence[t] = (labelled_sequence[t][_TEXT], label) lp = 
self.log_probability(labelled_sequence) log_probs.append(lp) normalisation = _log_add(*log_probs) probabilities = _ninf_array((T, N)) for labelling, lp in zip(labellings, log_probs): lp -= normalisation for t, label in enumerate(labelling): index = self._states.index(label) probabilities[t, index] = _log_add(probabilities[t, index], lp) entropies = np.zeros(T, np.float64) for t in range(T): for s in range(N): entropies[t] -= 2 ** (probabilities[t, s]) * probabilities[t, s] return entropies def _transitions_matrix(self): """Return a matrix of transition log probabilities.""" trans_iter = ( self._transitions[sj].logprob(si) for sj in self._states for si in self._states ) transitions_logprob = np.fromiter(trans_iter, dtype=np.float64) N = len(self._states) return transitions_logprob.reshape((N, N)).T def _outputs_vector(self, symbol): """ Return a vector with log probabilities of emitting a symbol when entering states. """ out_iter = (self._output_logprob(sj, symbol) for sj in self._states) return np.fromiter(out_iter, dtype=np.float64) def _forward_probability(self, unlabeled_sequence): """ Return the forward probability matrix, a T by N array of log-probabilities, where T is the length of the sequence and N is the number of states. Each entry (t, s) gives the probability of being in state s at time t after observing the partial symbol sequence up to and including t. :param unlabeled_sequence: the sequence of unlabeled symbols :type unlabeled_sequence: list :return: the forward log probability matrix :rtype: array """ T = len(unlabeled_sequence) N = len(self._states) alpha = _ninf_array((T, N)) transitions_logprob = self._transitions_matrix() # Initialization symbol = unlabeled_sequence[0][_TEXT] for i, state in enumerate(self._states): alpha[0, i] = self._priors.logprob(state) + self._output_logprob( state, symbol ) # Induction for t in range(1, T): symbol = unlabeled_sequence[t][_TEXT] output_logprob = self._outputs_vector(symbol) for i in range(N): summand = alpha[t - 1] + transitions_logprob[i] alpha[t, i] = logsumexp2(summand) + output_logprob[i] return alpha def _backward_probability(self, unlabeled_sequence): """ Return the backward probability matrix, a T by N array of log-probabilities, where T is the length of the sequence and N is the number of states. Each entry (t, s) gives the probability of being in state s at time t after observing the partial symbol sequence from t .. T. :return: the backward log probability matrix :rtype: array :param unlabeled_sequence: the sequence of unlabeled symbols :type unlabeled_sequence: list """ T = len(unlabeled_sequence) N = len(self._states) beta = _ninf_array((T, N)) transitions_logprob = self._transitions_matrix().T # initialise the backward values; # "1" is an arbitrarily chosen value from Rabiner tutorial beta[T - 1, :] = np.log2(1) # inductively calculate remaining backward values for t in range(T - 2, -1, -1): symbol = unlabeled_sequence[t + 1][_TEXT] outputs = self._outputs_vector(symbol) for i in range(N): summand = transitions_logprob[i] + beta[t + 1] + outputs beta[t, i] = logsumexp2(summand) return beta def test(self, test_sequence, verbose=False, **kwargs): """ Tests the HiddenMarkovModelTagger instance. 
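# --------------------------------------------------------------------------
# Illustrative sketch (added): the forward recursion above as a standalone
# function over plain arrays.  alpha[t, i] accumulates, in log2 space, the
# probability of the first t+1 observations ending in state i; a final
# log-sum-exp over the last row gives log2 P(observations).  Conventions:
# P[i] are log2 priors, X[i, j] = log2 P(state j | state i), O[i, k] are
# log2 emissions, and obs holds symbol indices.
def _demo_forward(P, X, O, obs):
    import numpy as np

    def logsumexp2_(arr):
        m = arr.max()
        return np.log2(np.sum(2.0 ** (arr - m))) + m

    P, X, O = (np.asarray(a, dtype=float) for a in (P, X, O))
    T, N = len(obs), len(P)
    alpha = np.full((T, N), -np.inf)
    alpha[0] = P + O[:, obs[0]]
    for t in range(1, T):
        for i in range(N):
            # X[:, i] holds log2 P(state i | each previous state).
            alpha[t, i] = logsumexp2_(alpha[t - 1] + X[:, i]) + O[i, obs[t]]
    return logsumexp2_(alpha[T - 1])  # log2 of the sequence probability
# --------------------------------------------------------------------------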
:param test_sequence: a sequence of labeled test instances :type test_sequence: list(list) :param verbose: boolean flag indicating whether training should be verbose or include printed output :type verbose: bool """ def words(sent): return [word for (word, tag) in sent] def tags(sent): return [tag for (word, tag) in sent] def flatten(seq): return list(itertools.chain(*seq)) test_sequence = self._transform(test_sequence) predicted_sequence = list(map(self._tag, map(words, test_sequence))) if verbose: for test_sent, predicted_sent in zip(test_sequence, predicted_sequence): print( "Test:", " ".join(f"{token}/{tag}" for (token, tag) in test_sent), ) print() print("Untagged:", " ".join("%s" % token for (token, tag) in test_sent)) print() print( "HMM-tagged:", " ".join(f"{token}/{tag}" for (token, tag) in predicted_sent), ) print() print( "Entropy:", self.entropy([(token, None) for (token, tag) in predicted_sent]), ) print() print("-" * 60) test_tags = flatten(map(tags, test_sequence)) predicted_tags = flatten(map(tags, predicted_sequence)) acc = accuracy(test_tags, predicted_tags) count = sum(len(sent) for sent in test_sequence) print("accuracy over %d tokens: %.2f" % (count, acc * 100)) def __repr__(self): return "" % ( len(self._states), len(self._symbols), ) class HiddenMarkovModelTrainer: """ Algorithms for learning HMM parameters from training data. These include both supervised learning (MLE) and unsupervised learning (Baum-Welch). Creates an HMM trainer to induce an HMM with the given states and output symbol alphabet. A supervised and unsupervised training method may be used. If either of the states or symbols are not given, these may be derived from supervised training. :param states: the set of state labels :type states: sequence of any :param symbols: the set of observation symbols :type symbols: sequence of any """ def __init__(self, states=None, symbols=None): self._states = states if states else [] self._symbols = symbols if symbols else [] def train(self, labeled_sequences=None, unlabeled_sequences=None, **kwargs): """ Trains the HMM using both (or either of) supervised and unsupervised techniques. :return: the trained model :rtype: HiddenMarkovModelTagger :param labelled_sequences: the supervised training data, a set of labelled sequences of observations ex: [ (word_1, tag_1),...,(word_n,tag_n) ] :type labelled_sequences: list :param unlabeled_sequences: the unsupervised training data, a set of sequences of observations ex: [ word_1, ..., word_n ] :type unlabeled_sequences: list :param kwargs: additional arguments to pass to the training methods """ assert labeled_sequences or unlabeled_sequences model = None if labeled_sequences: model = self.train_supervised(labeled_sequences, **kwargs) if unlabeled_sequences: if model: kwargs["model"] = model model = self.train_unsupervised(unlabeled_sequences, **kwargs) return model def _baum_welch_step(self, sequence, model, symbol_to_number): N = len(model._states) M = len(model._symbols) T = len(sequence) # compute forward and backward probabilities alpha = model._forward_probability(sequence) beta = model._backward_probability(sequence) # find the log probability of the sequence lpk = logsumexp2(alpha[T - 1]) A_numer = _ninf_array((N, N)) B_numer = _ninf_array((N, M)) A_denom = _ninf_array(N) B_denom = _ninf_array(N) transitions_logprob = model._transitions_matrix().T for t in range(T): symbol = sequence[t][_TEXT] # not found? FIXME next_symbol = None if t < T - 1: next_symbol = sequence[t + 1][_TEXT] # not found? 
FIXME xi = symbol_to_number[symbol] next_outputs_logprob = model._outputs_vector(next_symbol) alpha_plus_beta = alpha[t] + beta[t] if t < T - 1: numer_add = ( transitions_logprob + next_outputs_logprob + beta[t + 1] + alpha[t].reshape(N, 1) ) A_numer = np.logaddexp2(A_numer, numer_add) A_denom = np.logaddexp2(A_denom, alpha_plus_beta) else: B_denom = np.logaddexp2(A_denom, alpha_plus_beta) B_numer[:, xi] = np.logaddexp2(B_numer[:, xi], alpha_plus_beta) return lpk, A_numer, A_denom, B_numer, B_denom def train_unsupervised(self, unlabeled_sequences, update_outputs=True, **kwargs): """ Trains the HMM using the Baum-Welch algorithm to maximise the probability of the data sequence. This is a variant of the EM algorithm, and is unsupervised in that it doesn't need the state sequences for the symbols. The code is based on 'A Tutorial on Hidden Markov Models and Selected Applications in Speech Recognition', Lawrence Rabiner, IEEE, 1989. :return: the trained model :rtype: HiddenMarkovModelTagger :param unlabeled_sequences: the training data, a set of sequences of observations :type unlabeled_sequences: list kwargs may include following parameters: :param model: a HiddenMarkovModelTagger instance used to begin the Baum-Welch algorithm :param max_iterations: the maximum number of EM iterations :param convergence_logprob: the maximum change in log probability to allow convergence """ # create a uniform HMM, which will be iteratively refined, unless # given an existing model model = kwargs.get("model") if not model: priors = RandomProbDist(self._states) transitions = DictionaryConditionalProbDist( {state: RandomProbDist(self._states) for state in self._states} ) outputs = DictionaryConditionalProbDist( {state: RandomProbDist(self._symbols) for state in self._states} ) model = HiddenMarkovModelTagger( self._symbols, self._states, transitions, outputs, priors ) self._states = model._states self._symbols = model._symbols N = len(self._states) M = len(self._symbols) symbol_numbers = {sym: i for i, sym in enumerate(self._symbols)} # update model prob dists so that they can be modified # model._priors = MutableProbDist(model._priors, self._states) model._transitions = DictionaryConditionalProbDist( { s: MutableProbDist(model._transitions[s], self._states) for s in self._states } ) if update_outputs: model._outputs = DictionaryConditionalProbDist( { s: MutableProbDist(model._outputs[s], self._symbols) for s in self._states } ) model.reset_cache() # iterate until convergence converged = False last_logprob = None iteration = 0 max_iterations = kwargs.get("max_iterations", 1000) epsilon = kwargs.get("convergence_logprob", 1e-6) while not converged and iteration < max_iterations: A_numer = _ninf_array((N, N)) B_numer = _ninf_array((N, M)) A_denom = _ninf_array(N) B_denom = _ninf_array(N) logprob = 0 for sequence in unlabeled_sequences: sequence = list(sequence) if not sequence: continue ( lpk, seq_A_numer, seq_A_denom, seq_B_numer, seq_B_denom, ) = self._baum_welch_step(sequence, model, symbol_numbers) # add these sums to the global A and B values for i in range(N): A_numer[i] = np.logaddexp2(A_numer[i], seq_A_numer[i] - lpk) B_numer[i] = np.logaddexp2(B_numer[i], seq_B_numer[i] - lpk) A_denom = np.logaddexp2(A_denom, seq_A_denom - lpk) B_denom = np.logaddexp2(B_denom, seq_B_denom - lpk) logprob += lpk # use the calculated values to update the transition and output # probability values for i in range(N): logprob_Ai = A_numer[i] - A_denom[i] logprob_Bi = B_numer[i] - B_denom[i] # We should normalize all 
probabilities (see p.391 Huang et al) # Let sum(P) be K. # We can divide each Pi by K to make sum(P) == 1. # Pi' = Pi/K # log2(Pi') = log2(Pi) - log2(K) logprob_Ai -= logsumexp2(logprob_Ai) logprob_Bi -= logsumexp2(logprob_Bi) # update output and transition probabilities si = self._states[i] for j in range(N): sj = self._states[j] model._transitions[si].update(sj, logprob_Ai[j]) if update_outputs: for k in range(M): ok = self._symbols[k] model._outputs[si].update(ok, logprob_Bi[k]) # Rabiner says the priors don't need to be updated. I don't # believe him. FIXME # test for convergence if iteration > 0 and abs(logprob - last_logprob) < epsilon: converged = True print("iteration", iteration, "logprob", logprob) iteration += 1 last_logprob = logprob return model def train_supervised(self, labelled_sequences, estimator=None): """ Supervised training maximising the joint probability of the symbol and state sequences. This is done via collecting frequencies of transitions between states, symbol observations while within each state and which states start a sentence. These frequency distributions are then normalised into probability estimates, which can be smoothed if desired. :return: the trained model :rtype: HiddenMarkovModelTagger :param labelled_sequences: the training data, a set of labelled sequences of observations :type labelled_sequences: list :param estimator: a function taking a FreqDist and a number of bins and returning a CProbDistI; otherwise a MLE estimate is used """ # default to the MLE estimate if estimator is None: estimator = lambda fdist, bins: MLEProbDist(fdist) # count occurrences of starting states, transitions out of each state # and output symbols observed in each state known_symbols = set(self._symbols) known_states = set(self._states) starting = FreqDist() transitions = ConditionalFreqDist() outputs = ConditionalFreqDist() for sequence in labelled_sequences: lasts = None for token in sequence: state = token[_TAG] symbol = token[_TEXT] if lasts is None: starting[state] += 1 else: transitions[lasts][state] += 1 outputs[state][symbol] += 1 lasts = state # update the state and symbol lists if state not in known_states: self._states.append(state) known_states.add(state) if symbol not in known_symbols: self._symbols.append(symbol) known_symbols.add(symbol) # create probability distributions (with smoothing) N = len(self._states) pi = estimator(starting, N) A = ConditionalProbDist(transitions, estimator, N) B = ConditionalProbDist(outputs, estimator, len(self._symbols)) return HiddenMarkovModelTagger(self._symbols, self._states, A, B, pi) def _ninf_array(shape): res = np.empty(shape, np.float64) res.fill(-np.inf) return res def logsumexp2(arr): max_ = arr.max() return np.log2(np.sum(2 ** (arr - max_))) + max_ def _log_add(*values): """ Adds the logged values, returning the logarithm of the addition. 
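    To avoid underflow, the values are shifted by their maximum before
    exponentiating, i.e. log2(sum_i 2**v_i) = m + log2(sum_i 2**(v_i - m))
    with m = max(values).  For example, adding two logged halves,
    ``_log_add(-1.0, -1.0)``, gives ``0.0``, since 2**-1 + 2**-1 == 2**0.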
""" x = max(values) if x > -np.inf: sum_diffs = 0 for value in values: sum_diffs += 2 ** (value - x) return x + np.log2(sum_diffs) else: return x def _create_hmm_tagger(states, symbols, A, B, pi): def pd(values, samples): d = dict(zip(samples, values)) return DictionaryProbDist(d) def cpd(array, conditions, samples): d = {} for values, condition in zip(array, conditions): d[condition] = pd(values, samples) return DictionaryConditionalProbDist(d) A = cpd(A, states, states) B = cpd(B, states, symbols) pi = pd(pi, states) return HiddenMarkovModelTagger( symbols=symbols, states=states, transitions=A, outputs=B, priors=pi ) def _market_hmm_example(): """ Return an example HMM (described at page 381, Huang et al) """ states = ["bull", "bear", "static"] symbols = ["up", "down", "unchanged"] A = np.array([[0.6, 0.2, 0.2], [0.5, 0.3, 0.2], [0.4, 0.1, 0.5]], np.float64) B = np.array([[0.7, 0.1, 0.2], [0.1, 0.6, 0.3], [0.3, 0.3, 0.4]], np.float64) pi = np.array([0.5, 0.2, 0.3], np.float64) model = _create_hmm_tagger(states, symbols, A, B, pi) return model, states, symbols def demo(): # demonstrates HMM probability calculation print() print("HMM probability calculation demo") print() model, states, symbols = _market_hmm_example() print("Testing", model) for test in [ ["up", "up"], ["up", "down", "up"], ["down"] * 5, ["unchanged"] * 5 + ["up"], ]: sequence = [(t, None) for t in test] print("Testing with state sequence", test) print("probability =", model.probability(sequence)) print("tagging = ", model.tag([word for (word, tag) in sequence])) print("p(tagged) = ", model.probability(sequence)) print("H = ", model.entropy(sequence)) print("H_exh = ", model._exhaustive_entropy(sequence)) print("H(point) = ", model.point_entropy(sequence)) print("H_exh(point)=", model._exhaustive_point_entropy(sequence)) print() def load_pos(num_sents): from nltk.corpus import brown sentences = brown.tagged_sents(categories="news")[:num_sents] tag_re = re.compile(r"[*]|--|[^+*-]+") tag_set = set() symbols = set() cleaned_sentences = [] for sentence in sentences: for i in range(len(sentence)): word, tag = sentence[i] word = word.lower() # normalize symbols.add(word) # log this word # Clean up the tag. tag = tag_re.match(tag).group() tag_set.add(tag) sentence[i] = (word, tag) # store cleaned-up tagged token cleaned_sentences += [sentence] return cleaned_sentences, list(tag_set), list(symbols) def demo_pos(): # demonstrates POS tagging using supervised training print() print("HMM POS tagging demo") print() print("Training HMM...") labelled_sequences, tag_set, symbols = load_pos(20000) trainer = HiddenMarkovModelTrainer(tag_set, symbols) hmm = trainer.train_supervised( labelled_sequences[10:], estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins), ) print("Testing...") hmm.test(labelled_sequences[:10], verbose=True) def _untag(sentences): unlabeled = [] for sentence in sentences: unlabeled.append([(token[_TEXT], None) for token in sentence]) return unlabeled def demo_pos_bw( test=10, supervised=20, unsupervised=10, verbose=True, max_iterations=5 ): # demonstrates the Baum-Welch algorithm in POS tagging print() print("Baum-Welch demo for POS tagging") print() print("Training HMM (supervised, %d sentences)..." 
% supervised) sentences, tag_set, symbols = load_pos(test + supervised + unsupervised) symbols = set() for sentence in sentences: for token in sentence: symbols.add(token[_TEXT]) trainer = HiddenMarkovModelTrainer(tag_set, list(symbols)) hmm = trainer.train_supervised( sentences[test : test + supervised], estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins), ) hmm.test(sentences[:test], verbose=verbose) print("Training (unsupervised, %d sentences)..." % unsupervised) # it's rather slow - so only use 10 samples by default unlabeled = _untag(sentences[test + supervised :]) hmm = trainer.train_unsupervised( unlabeled, model=hmm, max_iterations=max_iterations ) hmm.test(sentences[:test], verbose=verbose) def demo_bw(): # demo Baum Welch by generating some sequences and then performing # unsupervised training on them print() print("Baum-Welch demo for market example") print() model, states, symbols = _market_hmm_example() # generate some random sequences training = [] import random rng = random.Random() rng.seed(0) for i in range(10): item = model.random_sample(rng, 5) training.append([(i[0], None) for i in item]) # train on those examples, starting with the model that generated them trainer = HiddenMarkovModelTrainer(states, symbols) hmm = trainer.train_unsupervised(training, model=model, max_iterations=1000) nltk-3.7/nltk/tag/hunpos.py000066400000000000000000000114321420073152400157430ustar00rootroot00000000000000# Natural Language Toolkit: Interface to the HunPos POS-tagger # # Copyright (C) 2001-2022 NLTK Project # Author: Peter Ljunglöf # Dávid Márk Nemeskey (modifications) # Attila Zséder (modifications) # URL: # For license information, see LICENSE.TXT """ A module for interfacing with the HunPos open-source POS-tagger. """ import os from subprocess import PIPE, Popen from nltk.internals import find_binary, find_file from nltk.tag.api import TaggerI _hunpos_url = "https://code.google.com/p/hunpos/" _hunpos_charset = "ISO-8859-1" """The default encoding used by hunpos: ISO-8859-1.""" class HunposTagger(TaggerI): """ A class for pos tagging with HunPos. The input is the paths to: - a model trained on training data - (optionally) the path to the hunpos-tag binary - (optionally) the encoding of the training data (default: ISO-8859-1) Example: >>> from nltk.tag import HunposTagger >>> ht = HunposTagger('en_wsj.model') >>> ht.tag('What is the airspeed of an unladen swallow ?'.split()) [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'NN'), ('swallow', 'VB'), ('?', '.')] >>> ht.close() This class communicates with the hunpos-tag binary via pipes. When the tagger object is no longer needed, the close() method should be called to free system resources. The class supports the context manager interface; if used in a with statement, the close() method is invoked automatically: >>> with HunposTagger('en_wsj.model') as ht: ... ht.tag('What is the airspeed of an unladen swallow ?'.split()) ... [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'NN'), ('swallow', 'VB'), ('?', '.')] """ def __init__( self, path_to_model, path_to_bin=None, encoding=_hunpos_charset, verbose=False ): """ Starts the hunpos-tag executable and establishes a connection with it. :param path_to_model: The model file. :param path_to_bin: The hunpos-tag binary. :param encoding: The encoding used by the model. 
Unicode tokens passed to the tag() and tag_sents() methods are converted to this charset when they are sent to hunpos-tag. The default is ISO-8859-1 (Latin-1). This parameter is ignored for str tokens, which are sent as-is. The caller must ensure that tokens are encoded in the right charset. """ self._closed = True hunpos_paths = [ ".", "/usr/bin", "/usr/local/bin", "/opt/local/bin", "/Applications/bin", "~/bin", "~/Applications/bin", ] hunpos_paths = list(map(os.path.expanduser, hunpos_paths)) self._hunpos_bin = find_binary( "hunpos-tag", path_to_bin, env_vars=("HUNPOS_TAGGER",), searchpath=hunpos_paths, url=_hunpos_url, verbose=verbose, ) self._hunpos_model = find_file( path_to_model, env_vars=("HUNPOS_TAGGER",), verbose=verbose ) self._encoding = encoding self._hunpos = Popen( [self._hunpos_bin, self._hunpos_model], shell=False, stdin=PIPE, stdout=PIPE, stderr=PIPE, ) self._closed = False def __del__(self): self.close() def close(self): """Closes the pipe to the hunpos executable.""" if not self._closed: self._hunpos.communicate() self._closed = True def __enter__(self): return self def __exit__(self, exc_type, exc_value, traceback): self.close() def tag(self, tokens): """Tags a single sentence: a list of words. The tokens should not contain any newline characters. """ for token in tokens: assert "\n" not in token, "Tokens should not contain newlines" if isinstance(token, str): token = token.encode(self._encoding) self._hunpos.stdin.write(token + b"\n") # We write a final empty line to tell hunpos that the sentence is finished: self._hunpos.stdin.write(b"\n") self._hunpos.stdin.flush() tagged_tokens = [] for token in tokens: tagged = self._hunpos.stdout.readline().strip().split(b"\t") tag = tagged[1] if len(tagged) > 1 else None tagged_tokens.append((token, tag)) # We have to read (and dismiss) the final empty line: self._hunpos.stdout.readline() return tagged_tokens nltk-3.7/nltk/tag/mapping.py000066400000000000000000000074601420073152400160700ustar00rootroot00000000000000# Natural Language Toolkit: Tagset Mapping # # Copyright (C) 2001-2022 NLTK Project # Author: Nathan Schneider # Steven Bird # URL: # For license information, see LICENSE.TXT """ Interface for converting POS tags from various treebanks to the universal tagset of Petrov, Das, & McDonald. The tagset consists of the following 12 coarse tags: VERB - verbs (all tenses and modes) NOUN - nouns (common and proper) PRON - pronouns ADJ - adjectives ADV - adverbs ADP - adpositions (prepositions and postpositions) CONJ - conjunctions DET - determiners NUM - cardinal numbers PRT - particles or other function words X - other: foreign words, typos, abbreviations . 
- punctuation @see: https://arxiv.org/abs/1104.2086 and https://code.google.com/p/universal-pos-tags/ """ from collections import defaultdict from os.path import join from nltk.data import load _UNIVERSAL_DATA = "taggers/universal_tagset" _UNIVERSAL_TAGS = ( "VERB", "NOUN", "PRON", "ADJ", "ADV", "ADP", "CONJ", "DET", "NUM", "PRT", "X", ".", ) # _MAPPINGS = defaultdict(lambda: defaultdict(dict)) # the mapping between tagset T1 and T2 returns UNK if applied to an unrecognized tag _MAPPINGS = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: "UNK"))) def _load_universal_map(fileid): contents = load(join(_UNIVERSAL_DATA, fileid + ".map"), format="text") # When mapping to the Universal Tagset, # map unknown inputs to 'X' not 'UNK' _MAPPINGS[fileid]["universal"].default_factory = lambda: "X" for line in contents.splitlines(): line = line.strip() if line == "": continue fine, coarse = line.split("\t") assert coarse in _UNIVERSAL_TAGS, f"Unexpected coarse tag: {coarse}" assert ( fine not in _MAPPINGS[fileid]["universal"] ), f"Multiple entries for original tag: {fine}" _MAPPINGS[fileid]["universal"][fine] = coarse def tagset_mapping(source, target): """ Retrieve the mapping dictionary between tagsets. >>> tagset_mapping('ru-rnc', 'universal') == {'!': '.', 'A': 'ADJ', 'C': 'CONJ', 'AD': 'ADV',\ 'NN': 'NOUN', 'VG': 'VERB', 'COMP': 'CONJ', 'NC': 'NUM', 'VP': 'VERB', 'P': 'ADP',\ 'IJ': 'X', 'V': 'VERB', 'Z': 'X', 'VI': 'VERB', 'YES_NO_SENT': 'X', 'PTCL': 'PRT'} True """ if source not in _MAPPINGS or target not in _MAPPINGS[source]: if target == "universal": _load_universal_map(source) # Added the new Russian National Corpus mappings because the # Russian model for nltk.pos_tag() uses it. _MAPPINGS["ru-rnc-new"]["universal"] = { "A": "ADJ", "A-PRO": "PRON", "ADV": "ADV", "ADV-PRO": "PRON", "ANUM": "ADJ", "CONJ": "CONJ", "INTJ": "X", "NONLEX": ".", "NUM": "NUM", "PARENTH": "PRT", "PART": "PRT", "PR": "ADP", "PRAEDIC": "PRT", "PRAEDIC-PRO": "PRON", "S": "NOUN", "S-PRO": "PRON", "V": "VERB", } return _MAPPINGS[source][target] def map_tag(source, target, source_tag): """ Maps the tag from the source tagset to the target tagset. >>> map_tag('en-ptb', 'universal', 'VBZ') 'VERB' >>> map_tag('en-ptb', 'universal', 'VBP') 'VERB' >>> map_tag('en-ptb', 'universal', '``') '.' """ # we need a systematic approach to naming if target == "universal": if source == "wsj": source = "en-ptb" if source == "brown": source = "en-brown" return tagset_mapping(source, target)[source_tag] nltk-3.7/nltk/tag/perceptron.py000066400000000000000000000313761420073152400166210ustar00rootroot00000000000000# This module is a port of the Textblob Averaged Perceptron Tagger # Author: Matthew Honnibal , # Long Duong (NLTK port) # URL: # # Copyright 2013 Matthew Honnibal # NLTK modifications Copyright 2015 The NLTK Project # # This module is provided under the terms of the MIT License. import logging import pickle import random from collections import defaultdict from nltk import jsontags from nltk.data import find, load from nltk.tag.api import TaggerI try: import numpy as np except ImportError: pass PICKLE = "averaged_perceptron_tagger.pickle" @jsontags.register_tag class AveragedPerceptron: """An averaged perceptron, as implemented by Matthew Honnibal. 
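    Each feature maps to a dictionary of per-class weights, and a prediction
    sums the weights of the active features for each class.  A minimal usage
    sketch (the feature names and labels below are made up for illustration)::

        from nltk.tag.perceptron import AveragedPerceptron

        model = AveragedPerceptron()
        model.classes = {"NN", "VB"}
        feats = {"bias": 1, "suffix=ing": 1}
        guess, _ = model.predict(feats)        # best label under current weights
        model.update(truth="NN", guess=guess, features=feats)  # adjust weights if the guess was wrong
        model.average_weights()                # average weights over all update steps

    After training, ``average_weights()`` replaces each weight with its mean
    value over all update steps (rounded, and dropped if it rounds to zero),
    which makes the final model less sensitive to the last few training
    examples.
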
See more implementation details here: https://explosion.ai/blog/part-of-speech-pos-tagger-in-python """ json_tag = "nltk.tag.perceptron.AveragedPerceptron" def __init__(self, weights=None): # Each feature gets its own weight vector, so weights is a dict-of-dicts self.weights = weights if weights else {} self.classes = set() # The accumulated values, for the averaging. These will be keyed by # feature/clas tuples self._totals = defaultdict(int) # The last time the feature was changed, for the averaging. Also # keyed by feature/clas tuples # (tstamps is short for timestamps) self._tstamps = defaultdict(int) # Number of instances seen self.i = 0 def _softmax(self, scores): s = np.fromiter(scores.values(), dtype=float) exps = np.exp(s) return exps / np.sum(exps) def predict(self, features, return_conf=False): """Dot-product the features and current weights and return the best label.""" scores = defaultdict(float) for feat, value in features.items(): if feat not in self.weights or value == 0: continue weights = self.weights[feat] for label, weight in weights.items(): scores[label] += value * weight # Do a secondary alphabetic sort, for stability best_label = max(self.classes, key=lambda label: (scores[label], label)) # compute the confidence conf = max(self._softmax(scores)) if return_conf == True else None return best_label, conf def update(self, truth, guess, features): """Update the feature weights.""" def upd_feat(c, f, w, v): param = (f, c) self._totals[param] += (self.i - self._tstamps[param]) * w self._tstamps[param] = self.i self.weights[f][c] = w + v self.i += 1 if truth == guess: return None for f in features: weights = self.weights.setdefault(f, {}) upd_feat(truth, f, weights.get(truth, 0.0), 1.0) upd_feat(guess, f, weights.get(guess, 0.0), -1.0) def average_weights(self): """Average weights from all iterations.""" for feat, weights in self.weights.items(): new_feat_weights = {} for clas, weight in weights.items(): param = (feat, clas) total = self._totals[param] total += (self.i - self._tstamps[param]) * weight averaged = round(total / self.i, 3) if averaged: new_feat_weights[clas] = averaged self.weights[feat] = new_feat_weights def save(self, path): """Save the pickled model weights.""" with open(path, "wb") as fout: return pickle.dump(dict(self.weights), fout) def load(self, path): """Load the pickled model weights.""" self.weights = load(path) def encode_json_obj(self): return self.weights @classmethod def decode_json_obj(cls, obj): return cls(obj) @jsontags.register_tag class PerceptronTagger(TaggerI): """ Greedy Averaged Perceptron tagger, as implemented by Matthew Honnibal. See more implementation details here: https://explosion.ai/blog/part-of-speech-pos-tagger-in-python >>> from nltk.tag.perceptron import PerceptronTagger Train the model >>> tagger = PerceptronTagger(load=False) >>> tagger.train([[('today','NN'),('is','VBZ'),('good','JJ'),('day','NN')], ... 
[('yes','NNS'),('it','PRP'),('beautiful','JJ')]]) >>> tagger.tag(['today','is','a','beautiful','day']) [('today', 'NN'), ('is', 'PRP'), ('a', 'PRP'), ('beautiful', 'JJ'), ('day', 'NN')] Use the pretrain model (the default constructor) >>> pretrain = PerceptronTagger() >>> pretrain.tag('The quick brown fox jumps over the lazy dog'.split()) [('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN')] >>> pretrain.tag("The red cat".split()) [('The', 'DT'), ('red', 'JJ'), ('cat', 'NN')] """ json_tag = "nltk.tag.sequential.PerceptronTagger" START = ["-START-", "-START2-"] END = ["-END-", "-END2-"] def __init__(self, load=True): """ :param load: Load the pickled model upon instantiation. """ self.model = AveragedPerceptron() self.tagdict = {} self.classes = set() if load: AP_MODEL_LOC = "file:" + str( find("taggers/averaged_perceptron_tagger/" + PICKLE) ) self.load(AP_MODEL_LOC) def tag(self, tokens, return_conf=False, use_tagdict=True): """ Tag tokenized sentences. :params tokens: list of word :type tokens: list(str) """ prev, prev2 = self.START output = [] context = self.START + [self.normalize(w) for w in tokens] + self.END for i, word in enumerate(tokens): tag, conf = ( (self.tagdict.get(word), 1.0) if use_tagdict == True else (None, None) ) if not tag: features = self._get_features(i, word, context, prev, prev2) tag, conf = self.model.predict(features, return_conf) output.append((word, tag, conf) if return_conf == True else (word, tag)) prev2 = prev prev = tag return output def train(self, sentences, save_loc=None, nr_iter=5): """Train a model from sentences, and save it at ``save_loc``. ``nr_iter`` controls the number of Perceptron training iterations. :param sentences: A list or iterator of sentences, where each sentence is a list of (words, tags) tuples. :param save_loc: If not ``None``, saves a pickled model in this location. :param nr_iter: Number of training iterations. """ # We'd like to allow ``sentences`` to be either a list or an iterator, # the latter being especially important for a large training dataset. # Because ``self._make_tagdict(sentences)`` runs regardless, we make # it populate ``self._sentences`` (a list) with all the sentences. # This saves the overheard of just iterating through ``sentences`` to # get the list by ``sentences = list(sentences)``. self._sentences = list() # to be populated by self._make_tagdict... self._make_tagdict(sentences) self.model.classes = self.classes for iter_ in range(nr_iter): c = 0 n = 0 for sentence in self._sentences: words, tags = zip(*sentence) prev, prev2 = self.START context = self.START + [self.normalize(w) for w in words] + self.END for i, word in enumerate(words): guess = self.tagdict.get(word) if not guess: feats = self._get_features(i, word, context, prev, prev2) guess, _ = self.model.predict(feats) self.model.update(tags[i], guess, feats) prev2 = prev prev = guess c += guess == tags[i] n += 1 random.shuffle(self._sentences) logging.info(f"Iter {iter_}: {c}/{n}={_pc(c, n)}") # We don't need the training sentences anymore, and we don't want to # waste space on them when we pickle the trained tagger. self._sentences = None self.model.average_weights() # Pickle as a binary file if save_loc is not None: with open(save_loc, "wb") as fout: # changed protocol from -1 to 2 to make pickling Python 2 compatible pickle.dump((self.model.weights, self.tagdict, self.classes), fout, 2) def load(self, loc): """ :param loc: Load a pickled model at location. 
:type loc: str """ self.model.weights, self.tagdict, self.classes = load(loc) self.model.classes = self.classes def encode_json_obj(self): return self.model.weights, self.tagdict, list(self.classes) @classmethod def decode_json_obj(cls, obj): tagger = cls(load=False) tagger.model.weights, tagger.tagdict, tagger.classes = obj tagger.classes = set(tagger.classes) tagger.model.classes = tagger.classes return tagger def normalize(self, word): """ Normalization used in pre-processing. - All words are lower cased - Groups of digits of length 4 are represented as !YEAR; - Other digits are represented as !DIGITS :rtype: str """ if "-" in word and word[0] != "-": return "!HYPHEN" if word.isdigit() and len(word) == 4: return "!YEAR" if word and word[0].isdigit(): return "!DIGITS" return word.lower() def _get_features(self, i, word, context, prev, prev2): """Map tokens into a feature representation, implemented as a {hashable: int} dict. If the features change, a new model must be trained. """ def add(name, *args): features[" ".join((name,) + tuple(args))] += 1 i += len(self.START) features = defaultdict(int) # It's useful to have a constant feature, which acts sort of like a prior add("bias") add("i suffix", word[-3:]) add("i pref1", word[0] if word else "") add("i-1 tag", prev) add("i-2 tag", prev2) add("i tag+i-2 tag", prev, prev2) add("i word", context[i]) add("i-1 tag+i word", prev, context[i]) add("i-1 word", context[i - 1]) add("i-1 suffix", context[i - 1][-3:]) add("i-2 word", context[i - 2]) add("i+1 word", context[i + 1]) add("i+1 suffix", context[i + 1][-3:]) add("i+2 word", context[i + 2]) return features def _make_tagdict(self, sentences): """ Make a tag dictionary for single-tag words. :param sentences: A list of list of (word, tag) tuples. """ counts = defaultdict(lambda: defaultdict(int)) for sentence in sentences: self._sentences.append(sentence) for word, tag in sentence: counts[word][tag] += 1 self.classes.add(tag) freq_thresh = 20 ambiguity_thresh = 0.97 for word, tag_freqs in counts.items(): tag, mode = max(tag_freqs.items(), key=lambda item: item[1]) n = sum(tag_freqs.values()) # Don't add rare words to the tag dictionary # Only add quite unambiguous words if n >= freq_thresh and (mode / n) >= ambiguity_thresh: self.tagdict[word] = tag def _pc(n, d): return (n / d) * 100 def _load_data_conll_format(filename): print("Read from file: ", filename) with open(filename, "rb") as fin: sentences = [] sentence = [] for line in fin.readlines(): line = line.strip() # print line if len(line) == 0: sentences.append(sentence) sentence = [] continue tokens = line.split("\t") word = tokens[1] tag = tokens[4] sentence.append((word, tag)) return sentences def _get_pretrain_model(): # Train and test on English part of ConLL data (WSJ part of Penn Treebank) # Train: section 2-11 # Test : section 23 tagger = PerceptronTagger() training = _load_data_conll_format("english_ptb_train.conll") testing = _load_data_conll_format("english_ptb_test.conll") print("Size of training and testing (sentence)", len(training), len(testing)) # Train and save the model tagger.train(training, PICKLE) print("Accuracy : ", tagger.accuracy(testing)) if __name__ == "__main__": # _get_pretrain_model() pass nltk-3.7/nltk/tag/senna.py000066400000000000000000000130761420073152400155410ustar00rootroot00000000000000# Natural Language Toolkit: Senna POS Tagger # # Copyright (C) 2001-2022 NLTK Project # Author: Rami Al-Rfou' # URL: # For license information, see LICENSE.TXT """ Senna POS tagger, NER Tagger, Chunk Tagger The input 
is: - path to the directory that contains SENNA executables. If the path is incorrect, SennaTagger will automatically search for executable file specified in SENNA environment variable - (optionally) the encoding of the input data (default:utf-8) Note: Unit tests for this module can be found in test/unit/test_senna.py >>> from nltk.tag import SennaTagger >>> tagger = SennaTagger('/usr/share/senna-v3.0') >>> tagger.tag('What is the airspeed of an unladen swallow ?'.split()) # doctest: +SKIP [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'NN'), ('swallow', 'NN'), ('?', '.')] >>> from nltk.tag import SennaChunkTagger >>> chktagger = SennaChunkTagger('/usr/share/senna-v3.0') >>> chktagger.tag('What is the airspeed of an unladen swallow ?'.split()) # doctest: +SKIP [('What', 'B-NP'), ('is', 'B-VP'), ('the', 'B-NP'), ('airspeed', 'I-NP'), ('of', 'B-PP'), ('an', 'B-NP'), ('unladen', 'I-NP'), ('swallow', 'I-NP'), ('?', 'O')] >>> from nltk.tag import SennaNERTagger >>> nertagger = SennaNERTagger('/usr/share/senna-v3.0') >>> nertagger.tag('Shakespeare theatre was in London .'.split()) # doctest: +SKIP [('Shakespeare', 'B-PER'), ('theatre', 'O'), ('was', 'O'), ('in', 'O'), ('London', 'B-LOC'), ('.', 'O')] >>> nertagger.tag('UN headquarters are in NY , USA .'.split()) # doctest: +SKIP [('UN', 'B-ORG'), ('headquarters', 'O'), ('are', 'O'), ('in', 'O'), ('NY', 'B-LOC'), (',', 'O'), ('USA', 'B-LOC'), ('.', 'O')] """ from nltk.classify import Senna class SennaTagger(Senna): def __init__(self, path, encoding="utf-8"): super().__init__(path, ["pos"], encoding) def tag_sents(self, sentences): """ Applies the tag method over a list of sentences. This method will return for each sentence a list of tuples of (word, tag). """ tagged_sents = super().tag_sents(sentences) for i in range(len(tagged_sents)): for j in range(len(tagged_sents[i])): annotations = tagged_sents[i][j] tagged_sents[i][j] = (annotations["word"], annotations["pos"]) return tagged_sents class SennaChunkTagger(Senna): def __init__(self, path, encoding="utf-8"): super().__init__(path, ["chk"], encoding) def tag_sents(self, sentences): """ Applies the tag method over a list of sentences. This method will return for each sentence a list of tuples of (word, tag). """ tagged_sents = super().tag_sents(sentences) for i in range(len(tagged_sents)): for j in range(len(tagged_sents[i])): annotations = tagged_sents[i][j] tagged_sents[i][j] = (annotations["word"], annotations["chk"]) return tagged_sents def bio_to_chunks(self, tagged_sent, chunk_type): """ Extracts the chunks in a BIO chunk-tagged sentence. >>> from nltk.tag import SennaChunkTagger >>> chktagger = SennaChunkTagger('/usr/share/senna-v3.0') >>> sent = 'What is the airspeed of an unladen swallow ?'.split() >>> tagged_sent = chktagger.tag(sent) # doctest: +SKIP >>> tagged_sent # doctest: +SKIP [('What', 'B-NP'), ('is', 'B-VP'), ('the', 'B-NP'), ('airspeed', 'I-NP'), ('of', 'B-PP'), ('an', 'B-NP'), ('unladen', 'I-NP'), ('swallow', 'I-NP'), ('?', 'O')] >>> list(chktagger.bio_to_chunks(tagged_sent, chunk_type='NP')) # doctest: +SKIP [('What', '0'), ('the airspeed', '2-3'), ('an unladen swallow', '5-6-7')] :param tagged_sent: A list of tuples of word and BIO chunk tag. :type tagged_sent: list(tuple) :param tagged_sent: The chunk tag that users want to extract, e.g. 'NP' or 'VP' :type tagged_sent: str :return: An iterable of tuples of chunks that users want to extract and their corresponding indices. 
:rtype: iter(tuple(str)) """ current_chunk = [] current_chunk_position = [] for idx, word_pos in enumerate(tagged_sent): word, pos = word_pos if "-" + chunk_type in pos: # Append the word to the current_chunk. current_chunk.append(word) current_chunk_position.append(idx) else: if current_chunk: # Flush the full chunk when out of an NP. _chunk_str = " ".join(current_chunk) _chunk_pos_str = "-".join(map(str, current_chunk_position)) yield _chunk_str, _chunk_pos_str current_chunk = [] current_chunk_position = [] if current_chunk: # Flush the last chunk. yield " ".join(current_chunk), "-".join(map(str, current_chunk_position)) class SennaNERTagger(Senna): def __init__(self, path, encoding="utf-8"): super().__init__(path, ["ner"], encoding) def tag_sents(self, sentences): """ Applies the tag method over a list of sentences. This method will return for each sentence a list of tuples of (word, tag). """ tagged_sents = super().tag_sents(sentences) for i in range(len(tagged_sents)): for j in range(len(tagged_sents[i])): annotations = tagged_sents[i][j] tagged_sents[i][j] = (annotations["word"], annotations["ner"]) return tagged_sents nltk-3.7/nltk/tag/sequential.py000066400000000000000000000663321420073152400166120ustar00rootroot00000000000000# Natural Language Toolkit: Sequential Backoff Taggers # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # Steven Bird (minor additions) # Tiago Tresoldi (original affix tagger) # URL: # For license information, see LICENSE.TXT """ Classes for tagging sentences sequentially, left to right. The abstract base class SequentialBackoffTagger serves as the base class for all the taggers in this module. Tagging of individual words is performed by the method ``choose_tag()``, which is defined by subclasses of SequentialBackoffTagger. If a tagger is unable to determine a tag for the specified token, then its backoff tagger is consulted instead. Any SequentialBackoffTagger may serve as a backoff tagger for any other SequentialBackoffTagger. """ import ast import re from abc import abstractmethod from typing import List, Optional, Tuple from nltk import jsontags from nltk.classify import NaiveBayesClassifier from nltk.probability import ConditionalFreqDist from nltk.tag.api import FeaturesetTaggerI, TaggerI ###################################################################### # Abstract Base Classes ###################################################################### class SequentialBackoffTagger(TaggerI): """ An abstract base class for taggers that tags words sequentially, left to right. Tagging of individual words is performed by the ``choose_tag()`` method, which should be defined by subclasses. If a tagger is unable to determine a tag for the specified token, then its backoff tagger is consulted. :ivar _taggers: A list of all the taggers that should be tried to tag a token (i.e., self and its backoff taggers). """ def __init__(self, backoff=None): if backoff is None: self._taggers = [self] else: self._taggers = [self] + backoff._taggers @property def backoff(self): """The backoff tagger for this tagger.""" return self._taggers[1] if len(self._taggers) > 1 else None def tag(self, tokens): # docs inherited from TaggerI tags = [] for i in range(len(tokens)): tags.append(self.tag_one(tokens, i, tags)) return list(zip(tokens, tags)) def tag_one(self, tokens, index, history): """ Determine an appropriate tag for the specified token, and return that tag. 
If this tagger is unable to determine a tag for the specified token, then its backoff tagger is consulted. :rtype: str :type tokens: list :param tokens: The list of words that are being tagged. :type index: int :param index: The index of the word whose tag should be returned. :type history: list(str) :param history: A list of the tags for all words before *index*. """ tag = None for tagger in self._taggers: tag = tagger.choose_tag(tokens, index, history) if tag is not None: break return tag @abstractmethod def choose_tag(self, tokens, index, history): """ Decide which tag should be used for the specified token, and return that tag. If this tagger is unable to determine a tag for the specified token, return None -- do not consult the backoff tagger. This method should be overridden by subclasses of SequentialBackoffTagger. :rtype: str :type tokens: list :param tokens: The list of words that are being tagged. :type index: int :param index: The index of the word whose tag should be returned. :type history: list(str) :param history: A list of the tags for all words before *index*. """ class ContextTagger(SequentialBackoffTagger): """ An abstract base class for sequential backoff taggers that choose a tag for a token based on the value of its "context". Different subclasses are used to define different contexts. A ContextTagger chooses the tag for a token by calculating the token's context, and looking up the corresponding tag in a table. This table can be constructed manually; or it can be automatically constructed based on a training corpus, using the ``_train()`` factory method. :ivar _context_to_tag: Dictionary mapping contexts to tags. """ def __init__(self, context_to_tag, backoff=None): """ :param context_to_tag: A dictionary mapping contexts to tags. :param backoff: The backoff tagger that should be used for this tagger. """ super().__init__(backoff) self._context_to_tag = context_to_tag if context_to_tag else {} @abstractmethod def context(self, tokens, index, history): """ :return: the context that should be used to look up the tag for the specified token; or None if the specified token should not be handled by this tagger. :rtype: (hashable) """ def choose_tag(self, tokens, index, history): context = self.context(tokens, index, history) return self._context_to_tag.get(context) def size(self): """ :return: The number of entries in the table used by this tagger to map from contexts to tags. """ return len(self._context_to_tag) def __repr__(self): return f"<{self.__class__.__name__}: size={self.size()}>" def _train(self, tagged_corpus, cutoff=0, verbose=False): """ Initialize this ContextTagger's ``_context_to_tag`` table based on the given training data. In particular, for each context ``c`` in the training data, set ``_context_to_tag[c]`` to the most frequent tag for that context. However, exclude any contexts that are already tagged perfectly by the backoff tagger(s). The old value of ``self._context_to_tag`` (if any) is discarded. :param tagged_corpus: A tagged corpus. Each item should be a list of (word, tag tuples. :param cutoff: If the most likely tag for a context occurs fewer than cutoff times, then exclude it from the context-to-tag table for the new tagger. """ token_count = hit_count = 0 # A context is considered 'useful' if it's not already tagged # perfectly by the backoff tagger. useful_contexts = set() # Count how many times each tag occurs in each context. 
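        # Overall procedure, as implemented below:
        #   1. tally, for every context, how often each tag follows it;
        #   2. keep only the "useful" contexts -- those where the backoff
        #      tagger (if any) makes at least one mistake;
        #   3. for each useful context, record its single most frequent tag,
        #      provided that tag was seen more than `cutoff` times.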
fd = ConditionalFreqDist() for sentence in tagged_corpus: tokens, tags = zip(*sentence) for index, (token, tag) in enumerate(sentence): # Record the event. token_count += 1 context = self.context(tokens, index, tags[:index]) if context is None: continue fd[context][tag] += 1 # If the backoff got it wrong, this context is useful: if self.backoff is None or tag != self.backoff.tag_one( tokens, index, tags[:index] ): useful_contexts.add(context) # Build the context_to_tag table -- for each context, figure # out what the most likely tag is. Only include contexts that # we've seen at least `cutoff` times. for context in useful_contexts: best_tag = fd[context].max() hits = fd[context][best_tag] if hits > cutoff: self._context_to_tag[context] = best_tag hit_count += hits # Display some stats, if requested. if verbose: size = len(self._context_to_tag) backoff = 100 - (hit_count * 100.0) / token_count pruning = 100 - (size * 100.0) / len(fd.conditions()) print("[Trained Unigram tagger:", end=" ") print( "size={}, backoff={:.2f}%, pruning={:.2f}%]".format( size, backoff, pruning ) ) ###################################################################### # Tagger Classes ###################################################################### @jsontags.register_tag class DefaultTagger(SequentialBackoffTagger): """ A tagger that assigns the same tag to every token. >>> from nltk.tag import DefaultTagger >>> default_tagger = DefaultTagger('NN') >>> list(default_tagger.tag('This is a test'.split())) [('This', 'NN'), ('is', 'NN'), ('a', 'NN'), ('test', 'NN')] This tagger is recommended as a backoff tagger, in cases where a more powerful tagger is unable to assign a tag to the word (e.g. because the word was not seen during training). :param tag: The tag to assign to each token :type tag: str """ json_tag = "nltk.tag.sequential.DefaultTagger" def __init__(self, tag): self._tag = tag super().__init__(None) def encode_json_obj(self): return self._tag @classmethod def decode_json_obj(cls, obj): tag = obj return cls(tag) def choose_tag(self, tokens, index, history): return self._tag # ignore token and history def __repr__(self): return f"" @jsontags.register_tag class NgramTagger(ContextTagger): """ A tagger that chooses a token's tag based on its word string and on the preceding n word's tags. In particular, a tuple (tags[i-n:i-1], words[i]) is looked up in a table, and the corresponding tag is returned. N-gram taggers are typically trained on a tagged corpus. Train a new NgramTagger using the given training data or the supplied model. In particular, construct a new tagger whose table maps from each context (tag[i-n:i-1], word[i]) to the most frequent tag for that context. But exclude any contexts that are already tagged perfectly by the backoff tagger. :param train: A tagged corpus consisting of a list of tagged sentences, where each sentence is a list of (word, tag) tuples. :param backoff: A backoff tagger, to be used by the new tagger if it encounters an unknown context. :param cutoff: If the most likely tag for a context occurs fewer than *cutoff* times, then exclude it from the context-to-tag table for the new tagger. 
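    A typical backoff chain, sketched here with the Brown corpus (the corpus
    and the size of the training slice are arbitrary choices, shown only for
    illustration)::

        from nltk.corpus import brown
        from nltk.tag import DefaultTagger, NgramTagger, UnigramTagger

        train_sents = brown.tagged_sents(categories='news')[:500]
        t0 = DefaultTagger('NN')                      # last resort
        t1 = UnigramTagger(train_sents, backoff=t0)
        t2 = NgramTagger(2, train_sents, backoff=t1)  # bigram contexts
        t2.tag('The jury praised the administration'.split())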
""" json_tag = "nltk.tag.sequential.NgramTagger" def __init__( self, n, train=None, model=None, backoff=None, cutoff=0, verbose=False ): self._n = n self._check_params(train, model) super().__init__(model, backoff) if train: self._train(train, cutoff, verbose) def encode_json_obj(self): _context_to_tag = {repr(k): v for k, v in self._context_to_tag.items()} if "NgramTagger" in self.__class__.__name__: return self._n, _context_to_tag, self.backoff else: return _context_to_tag, self.backoff @classmethod def decode_json_obj(cls, obj): try: _n, _context_to_tag, backoff = obj except ValueError: _context_to_tag, backoff = obj if not _context_to_tag: return backoff _context_to_tag = {ast.literal_eval(k): v for k, v in _context_to_tag.items()} if "NgramTagger" in cls.__name__: return cls(_n, model=_context_to_tag, backoff=backoff) else: return cls(model=_context_to_tag, backoff=backoff) def context(self, tokens, index, history): tag_context = tuple(history[max(0, index - self._n + 1) : index]) return tag_context, tokens[index] @jsontags.register_tag class UnigramTagger(NgramTagger): """ Unigram Tagger The UnigramTagger finds the most likely tag for each word in a training corpus, and then uses that information to assign tags to new tokens. >>> from nltk.corpus import brown >>> from nltk.tag import UnigramTagger >>> test_sent = brown.sents(categories='news')[0] >>> unigram_tagger = UnigramTagger(brown.tagged_sents(categories='news')[:500]) >>> for tok, tag in unigram_tagger.tag(test_sent): ... print("({}, {}), ".format(tok, tag)) # doctest: +NORMALIZE_WHITESPACE (The, AT), (Fulton, NP-TL), (County, NN-TL), (Grand, JJ-TL), (Jury, NN-TL), (said, VBD), (Friday, NR), (an, AT), (investigation, NN), (of, IN), (Atlanta's, NP$), (recent, JJ), (primary, NN), (election, NN), (produced, VBD), (``, ``), (no, AT), (evidence, NN), ('', ''), (that, CS), (any, DTI), (irregularities, NNS), (took, VBD), (place, NN), (., .), :param train: The corpus of training data, a list of tagged sentences :type train: list(list(tuple(str, str))) :param model: The tagger model :type model: dict :param backoff: Another tagger which this tagger will consult when it is unable to tag a word :type backoff: TaggerI :param cutoff: The number of instances of training data the tagger must see in order not to use the backoff tagger :type cutoff: int """ json_tag = "nltk.tag.sequential.UnigramTagger" def __init__(self, train=None, model=None, backoff=None, cutoff=0, verbose=False): super().__init__(1, train, model, backoff, cutoff, verbose) def context(self, tokens, index, history): return tokens[index] @jsontags.register_tag class BigramTagger(NgramTagger): """ A tagger that chooses a token's tag based its word string and on the preceding words' tag. In particular, a tuple consisting of the previous tag and the word is looked up in a table, and the corresponding tag is returned. 
:param train: The corpus of training data, a list of tagged sentences :type train: list(list(tuple(str, str))) :param model: The tagger model :type model: dict :param backoff: Another tagger which this tagger will consult when it is unable to tag a word :type backoff: TaggerI :param cutoff: The number of instances of training data the tagger must see in order not to use the backoff tagger :type cutoff: int """ json_tag = "nltk.tag.sequential.BigramTagger" def __init__(self, train=None, model=None, backoff=None, cutoff=0, verbose=False): super().__init__(2, train, model, backoff, cutoff, verbose) @jsontags.register_tag class TrigramTagger(NgramTagger): """ A tagger that chooses a token's tag based its word string and on the preceding two words' tags. In particular, a tuple consisting of the previous two tags and the word is looked up in a table, and the corresponding tag is returned. :param train: The corpus of training data, a list of tagged sentences :type train: list(list(tuple(str, str))) :param model: The tagger model :type model: dict :param backoff: Another tagger which this tagger will consult when it is unable to tag a word :type backoff: TaggerI :param cutoff: The number of instances of training data the tagger must see in order not to use the backoff tagger :type cutoff: int """ json_tag = "nltk.tag.sequential.TrigramTagger" def __init__(self, train=None, model=None, backoff=None, cutoff=0, verbose=False): super().__init__(3, train, model, backoff, cutoff, verbose) @jsontags.register_tag class AffixTagger(ContextTagger): """ A tagger that chooses a token's tag based on a leading or trailing substring of its word string. (It is important to note that these substrings are not necessarily "true" morphological affixes). In particular, a fixed-length substring of the word is looked up in a table, and the corresponding tag is returned. Affix taggers are typically constructed by training them on a tagged corpus. Construct a new affix tagger. :param affix_length: The length of the affixes that should be considered during training and tagging. Use negative numbers for suffixes. :param min_stem_length: Any words whose length is less than min_stem_length+abs(affix_length) will be assigned a tag of None by this tagger. """ json_tag = "nltk.tag.sequential.AffixTagger" def __init__( self, train=None, model=None, affix_length=-3, min_stem_length=2, backoff=None, cutoff=0, verbose=False, ): self._check_params(train, model) super().__init__(model, backoff) self._affix_length = affix_length self._min_word_length = min_stem_length + abs(affix_length) if train: self._train(train, cutoff, verbose) def encode_json_obj(self): return ( self._affix_length, self._min_word_length, self._context_to_tag, self.backoff, ) @classmethod def decode_json_obj(cls, obj): _affix_length, _min_word_length, _context_to_tag, backoff = obj return cls( affix_length=_affix_length, min_stem_length=_min_word_length - abs(_affix_length), model=_context_to_tag, backoff=backoff, ) def context(self, tokens, index, history): token = tokens[index] if len(token) < self._min_word_length: return None elif self._affix_length > 0: return token[: self._affix_length] else: return token[self._affix_length :] @jsontags.register_tag class RegexpTagger(SequentialBackoffTagger): r""" Regular Expression Tagger The RegexpTagger assigns tags to tokens by comparing their word strings to a series of regular expressions. 
The following tagger uses word suffixes to make guesses about the correct Brown Corpus part of speech tag: >>> from nltk.corpus import brown >>> from nltk.tag import RegexpTagger >>> test_sent = brown.sents(categories='news')[0] >>> regexp_tagger = RegexpTagger( ... [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'), # cardinal numbers ... (r'(The|the|A|a|An|an)$', 'AT'), # articles ... (r'.*able$', 'JJ'), # adjectives ... (r'.*ness$', 'NN'), # nouns formed from adjectives ... (r'.*ly$', 'RB'), # adverbs ... (r'.*s$', 'NNS'), # plural nouns ... (r'.*ing$', 'VBG'), # gerunds ... (r'.*ed$', 'VBD'), # past tense verbs ... (r'.*', 'NN') # nouns (default) ... ]) >>> regexp_tagger >>> regexp_tagger.tag(test_sent) # doctest: +NORMALIZE_WHITESPACE [('The', 'AT'), ('Fulton', 'NN'), ('County', 'NN'), ('Grand', 'NN'), ('Jury', 'NN'), ('said', 'NN'), ('Friday', 'NN'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'NN'), ("Atlanta's", 'NNS'), ('recent', 'NN'), ('primary', 'NN'), ('election', 'NN'), ('produced', 'VBD'), ('``', 'NN'), ('no', 'NN'), ('evidence', 'NN'), ("''", 'NN'), ('that', 'NN'), ('any', 'NN'), ('irregularities', 'NNS'), ('took', 'NN'), ('place', 'NN'), ('.', 'NN')] :type regexps: list(tuple(str, str)) :param regexps: A list of ``(regexp, tag)`` pairs, each of which indicates that a word matching ``regexp`` should be tagged with ``tag``. The pairs will be evaluated in order. If none of the regexps match a word, then the optional backoff tagger is invoked, else it is assigned the tag None. """ json_tag = "nltk.tag.sequential.RegexpTagger" def __init__( self, regexps: List[Tuple[str, str]], backoff: Optional[TaggerI] = None ): super().__init__(backoff) self._regexps = [] for regexp, tag in regexps: try: self._regexps.append((re.compile(regexp), tag)) except Exception as e: raise Exception( f"Invalid RegexpTagger regexp: {e}\n- regexp: {regexp!r}\n- tag: {tag!r}" ) from e def encode_json_obj(self): return [(regexp.pattern, tag) for regexp, tag in self._regexps], self.backoff @classmethod def decode_json_obj(cls, obj): regexps, backoff = obj return cls(regexps, backoff) def choose_tag(self, tokens, index, history): for regexp, tag in self._regexps: if re.match(regexp, tokens[index]): return tag return None def __repr__(self): return f"" class ClassifierBasedTagger(SequentialBackoffTagger, FeaturesetTaggerI): """ A sequential tagger that uses a classifier to choose the tag for each token in a sentence. The featureset input for the classifier is generated by a feature detector function:: feature_detector(tokens, index, history) -> featureset Where tokens is the list of unlabeled tokens in the sentence; index is the index of the token for which feature detection should be performed; and history is list of the tags for all tokens before index. Construct a new classifier-based sequential tagger. :param feature_detector: A function used to generate the featureset input for the classifier:: feature_detector(tokens, index, history) -> featureset :param train: A tagged corpus consisting of a list of tagged sentences, where each sentence is a list of (word, tag) tuples. :param backoff: A backoff tagger, to be used by the new tagger if it encounters an unknown context. :param classifier_builder: A function used to train a new classifier based on the data in *train*. It should take one argument, a list of labeled featuresets (i.e., (featureset, label) tuples). :param classifier: The classifier that should be used by the tagger. 
This is only useful if you want to manually construct the classifier; normally, you would use *train* instead. :param backoff: A backoff tagger, used if this tagger is unable to determine a tag for a given token. :param cutoff_prob: If specified, then this tagger will fall back on its backoff tagger if the probability of the most likely tag is less than *cutoff_prob*. """ def __init__( self, feature_detector=None, train=None, classifier_builder=NaiveBayesClassifier.train, classifier=None, backoff=None, cutoff_prob=None, verbose=False, ): self._check_params(train, classifier) super().__init__(backoff) if (train and classifier) or (not train and not classifier): raise ValueError( "Must specify either training data or " "trained classifier." ) if feature_detector is not None: self._feature_detector = feature_detector # The feature detector function, used to generate a featureset # or each token: feature_detector(tokens, index, history) -> featureset self._cutoff_prob = cutoff_prob """Cutoff probability for tagging -- if the probability of the most likely tag is less than this, then use backoff.""" self._classifier = classifier """The classifier used to choose a tag for each token.""" if train: self._train(train, classifier_builder, verbose) def choose_tag(self, tokens, index, history): # Use our feature detector to get the featureset. featureset = self.feature_detector(tokens, index, history) # Use the classifier to pick a tag. If a cutoff probability # was specified, then check that the tag's probability is # higher than that cutoff first; otherwise, return None. if self._cutoff_prob is None: return self._classifier.classify(featureset) pdist = self._classifier.prob_classify(featureset) tag = pdist.max() return tag if pdist.prob(tag) >= self._cutoff_prob else None def _train(self, tagged_corpus, classifier_builder, verbose): """ Build a new classifier, based on the given training data *tagged_corpus*. """ classifier_corpus = [] if verbose: print("Constructing training corpus for classifier.") for sentence in tagged_corpus: history = [] untagged_sentence, tags = zip(*sentence) for index in range(len(sentence)): featureset = self.feature_detector(untagged_sentence, index, history) classifier_corpus.append((featureset, tags[index])) history.append(tags[index]) if verbose: print(f"Training classifier ({len(classifier_corpus)} instances)") self._classifier = classifier_builder(classifier_corpus) def __repr__(self): return f"" def feature_detector(self, tokens, index, history): """ Return the feature detector that this tagger uses to generate featuresets for its classifier. The feature detector is a function with the signature:: feature_detector(tokens, index, history) -> featureset See ``classifier()`` """ return self._feature_detector(tokens, index, history) def classifier(self): """ Return the classifier that this tagger uses to choose a tag for each word in a sentence. The input for this classifier is generated using this tagger's feature detector. See ``feature_detector()`` """ return self._classifier class ClassifierBasedPOSTagger(ClassifierBasedTagger): """ A classifier based part of speech tagger. 
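    By default it trains a NaiveBayesClassifier over the features produced by
    ``feature_detector()`` below.  A training sketch (the corpus choice and
    the slice size are arbitrary, for illustration only)::

        from nltk.corpus import brown
        from nltk.tag.sequential import ClassifierBasedPOSTagger

        train_sents = brown.tagged_sents(categories='news')[:200]
        tagger = ClassifierBasedPOSTagger(train=train_sents)
        tagger.tag('The quick brown fox jumps'.split())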
""" def feature_detector(self, tokens, index, history): word = tokens[index] if index == 0: prevword = prevprevword = None prevtag = prevprevtag = None elif index == 1: prevword = tokens[index - 1].lower() prevprevword = None prevtag = history[index - 1] prevprevtag = None else: prevword = tokens[index - 1].lower() prevprevword = tokens[index - 2].lower() prevtag = history[index - 1] prevprevtag = history[index - 2] if re.match(r"[0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$", word): shape = "number" elif re.match(r"\W+$", word): shape = "punct" elif re.match("[A-Z][a-z]+$", word): shape = "upcase" elif re.match("[a-z]+$", word): shape = "downcase" elif re.match(r"\w+$", word): shape = "mixedcase" else: shape = "other" features = { "prevtag": prevtag, "prevprevtag": prevprevtag, "word": word, "word.lower": word.lower(), "suffix3": word.lower()[-3:], "suffix2": word.lower()[-2:], "suffix1": word.lower()[-1:], "prevprevword": prevprevword, "prevword": prevword, "prevtag+word": f"{prevtag}+{word.lower()}", "prevprevtag+word": f"{prevprevtag}+{word.lower()}", "prevword+word": f"{prevword}+{word.lower()}", "shape": shape, } return features nltk-3.7/nltk/tag/stanford.py000066400000000000000000000177351420073152400162630ustar00rootroot00000000000000# Natural Language Toolkit: Interface to the Stanford Part-of-speech and Named-Entity Taggers # # Copyright (C) 2001-2022 NLTK Project # Author: Nitin Madnani # Rami Al-Rfou' # URL: # For license information, see LICENSE.TXT """ A module for interfacing with the Stanford taggers. Tagger models need to be downloaded from https://nlp.stanford.edu/software and the STANFORD_MODELS environment variable set (a colon-separated list of paths). For more details see the documentation for StanfordPOSTagger and StanfordNERTagger. """ import os import tempfile import warnings from abc import abstractmethod from subprocess import PIPE from nltk.internals import _java_options, config_java, find_file, find_jar, java from nltk.tag.api import TaggerI _stanford_url = "https://nlp.stanford.edu/software" class StanfordTagger(TaggerI): """ An interface to Stanford taggers. Subclasses must define: - ``_cmd`` property: A property that returns the command that will be executed. - ``_SEPARATOR``: Class constant that represents that character that is used to separate the tokens from their tags. - ``_JAR`` file: Class constant that represents the jar file name. """ _SEPARATOR = "" _JAR = "" def __init__( self, model_filename, path_to_jar=None, encoding="utf8", verbose=False, java_options="-mx1000m", ): # Raise deprecation warning. warnings.warn( str( "\nThe StanfordTokenizer will " "be deprecated in version 3.2.6.\n" "Please use \033[91mnltk.parse.corenlp.CoreNLPParser\033[0m instead." ), DeprecationWarning, stacklevel=2, ) if not self._JAR: warnings.warn( "The StanfordTagger class is not meant to be " "instantiated directly. Did you mean " "StanfordPOSTagger or StanfordNERTagger?" ) self._stanford_jar = find_jar( self._JAR, path_to_jar, searchpath=(), url=_stanford_url, verbose=verbose ) self._stanford_model = find_file( model_filename, env_vars=("STANFORD_MODELS",), verbose=verbose ) self._encoding = encoding self.java_options = java_options @property @abstractmethod def _cmd(self): """ A property that returns the command that will be executed. 
""" def tag(self, tokens): # This function should return list of tuple rather than list of list return sum(self.tag_sents([tokens]), []) def tag_sents(self, sentences): encoding = self._encoding default_options = " ".join(_java_options) config_java(options=self.java_options, verbose=False) # Create a temporary input file _input_fh, self._input_file_path = tempfile.mkstemp(text=True) cmd = list(self._cmd) cmd.extend(["-encoding", encoding]) # Write the actual sentences to the temporary input file _input_fh = os.fdopen(_input_fh, "wb") _input = "\n".join(" ".join(x) for x in sentences) if isinstance(_input, str) and encoding: _input = _input.encode(encoding) _input_fh.write(_input) _input_fh.close() # Run the tagger and get the output stanpos_output, _stderr = java( cmd, classpath=self._stanford_jar, stdout=PIPE, stderr=PIPE ) stanpos_output = stanpos_output.decode(encoding) # Delete the temporary file os.unlink(self._input_file_path) # Return java configurations to their default values config_java(options=default_options, verbose=False) return self.parse_output(stanpos_output, sentences) def parse_output(self, text, sentences=None): # Output the tagged sentences tagged_sentences = [] for tagged_sentence in text.strip().split("\n"): sentence = [] for tagged_word in tagged_sentence.strip().split(): word_tags = tagged_word.strip().split(self._SEPARATOR) sentence.append( ("".join(word_tags[:-1]), word_tags[-1].replace("0", "").upper()) ) tagged_sentences.append(sentence) return tagged_sentences class StanfordPOSTagger(StanfordTagger): """ A class for pos tagging with Stanford Tagger. The input is the paths to: - a model trained on training data - (optionally) the path to the stanford tagger jar file. If not specified here, then this jar file must be specified in the CLASSPATH environment variable. - (optionally) the encoding of the training data (default: UTF-8) Example: >>> from nltk.tag import StanfordPOSTagger >>> st = StanfordPOSTagger('english-bidirectional-distsim.tagger') >>> st.tag('What is the airspeed of an unladen swallow ?'.split()) [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')] """ _SEPARATOR = "_" _JAR = "stanford-postagger.jar" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @property def _cmd(self): return [ "edu.stanford.nlp.tagger.maxent.MaxentTagger", "-model", self._stanford_model, "-textFile", self._input_file_path, "-tokenize", "false", "-outputFormatOptions", "keepEmptySentences", ] class StanfordNERTagger(StanfordTagger): """ A class for Named-Entity Tagging with Stanford Tagger. The input is the paths to: - a model trained on training data - (optionally) the path to the stanford tagger jar file. If not specified here, then this jar file must be specified in the CLASSPATH environment variable. 
- (optionally) the encoding of the training data (default: UTF-8) Example: >>> from nltk.tag import StanfordNERTagger >>> st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz') # doctest: +SKIP >>> st.tag('Rami Eid is studying at Stony Brook University in NY'.split()) # doctest: +SKIP [('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ('studying', 'O'), ('at', 'O'), ('Stony', 'ORGANIZATION'), ('Brook', 'ORGANIZATION'), ('University', 'ORGANIZATION'), ('in', 'O'), ('NY', 'LOCATION')] """ _SEPARATOR = "/" _JAR = "stanford-ner.jar" _FORMAT = "slashTags" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @property def _cmd(self): # Adding -tokenizerFactory edu.stanford.nlp.process.WhitespaceTokenizer -tokenizerOptions tokenizeNLs=false for not using stanford Tokenizer return [ "edu.stanford.nlp.ie.crf.CRFClassifier", "-loadClassifier", self._stanford_model, "-textFile", self._input_file_path, "-outputFormat", self._FORMAT, "-tokenizerFactory", "edu.stanford.nlp.process.WhitespaceTokenizer", "-tokenizerOptions", '"tokenizeNLs=false"', ] def parse_output(self, text, sentences): if self._FORMAT == "slashTags": # Joint together to a big list tagged_sentences = [] for tagged_sentence in text.strip().split("\n"): for tagged_word in tagged_sentence.strip().split(): word_tags = tagged_word.strip().split(self._SEPARATOR) tagged_sentences.append(("".join(word_tags[:-1]), word_tags[-1])) # Separate it according to the input result = [] start = 0 for sent in sentences: result.append(tagged_sentences[start : start + len(sent)]) start += len(sent) return result raise NotImplementedError nltk-3.7/nltk/tag/tnt.py000077500000000000000000000426751420073152400152540ustar00rootroot00000000000000# Natural Language Toolkit: TnT Tagger # # Copyright (C) 2001-2022 NLTK Project # Author: Sam Huston # # URL: # For license information, see LICENSE.TXT """ Implementation of 'TnT - A Statisical Part of Speech Tagger' by Thorsten Brants https://aclanthology.org/A00-1031.pdf """ from math import log from operator import itemgetter from nltk.probability import ConditionalFreqDist, FreqDist from nltk.tag.api import TaggerI class TnT(TaggerI): """ TnT - Statistical POS tagger IMPORTANT NOTES: * DOES NOT AUTOMATICALLY DEAL WITH UNSEEN WORDS - It is possible to provide an untrained POS tagger to create tags for unknown words, see __init__ function * SHOULD BE USED WITH SENTENCE-DELIMITED INPUT - Due to the nature of this tagger, it works best when trained over sentence delimited input. - However it still produces good results if the training data and testing data are separated on all punctuation eg: [,.?!] - Input for training is expected to be a list of sentences where each sentence is a list of (word, tag) tuples - Input for tag function is a single sentence Input for tagdata function is a list of sentences Output is of a similar form * Function provided to process text that is unsegmented - Please see basic_sent_chop() TnT uses a second order Markov model to produce tags for a sequence of input, specifically: argmax [Proj(P(t_i|t_i-1,t_i-2)P(w_i|t_i))] P(t_T+1 | t_T) IE: the maximum projection of a set of probabilities The set of possible tags for a given word is derived from the training data. It is the set of all tags that exact word has been assigned. 
To speed up and get more precision, we can use log addition to instead multiplication, specifically: argmax [Sigma(log(P(t_i|t_i-1,t_i-2))+log(P(w_i|t_i)))] + log(P(t_T+1|t_T)) The probability of a tag for a given word is the linear interpolation of 3 markov models; a zero-order, first-order, and a second order model. P(t_i| t_i-1, t_i-2) = l1*P(t_i) + l2*P(t_i| t_i-1) + l3*P(t_i| t_i-1, t_i-2) A beam search is used to limit the memory usage of the algorithm. The degree of the beam can be changed using N in the initialization. N represents the maximum number of possible solutions to maintain while tagging. It is possible to differentiate the tags which are assigned to capitalized words. However this does not result in a significant gain in the accuracy of the results. """ def __init__(self, unk=None, Trained=False, N=1000, C=False): """ Construct a TnT statistical tagger. Tagger must be trained before being used to tag input. :param unk: instance of a POS tagger, conforms to TaggerI :type unk: TaggerI :param Trained: Indication that the POS tagger is trained or not :type Trained: bool :param N: Beam search degree (see above) :type N: int :param C: Capitalization flag :type C: bool Initializer, creates frequency distributions to be used for tagging _lx values represent the portion of the tri/bi/uni taggers to be used to calculate the probability N value is the number of possible solutions to maintain while tagging. A good value for this is 1000 C is a boolean value which specifies to use or not use the Capitalization of the word as additional information for tagging. NOTE: using capitalization may not increase the accuracy of the tagger """ self._uni = FreqDist() self._bi = ConditionalFreqDist() self._tri = ConditionalFreqDist() self._wd = ConditionalFreqDist() self._eos = ConditionalFreqDist() self._l1 = 0.0 self._l2 = 0.0 self._l3 = 0.0 self._N = N self._C = C self._T = Trained self._unk = unk # statistical tools (ignore or delete me) self.unknown = 0 self.known = 0 def train(self, data): """ Uses a set of tagged data to train the tagger. If an unknown word tagger is specified, it is trained on the same data. 
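        Added illustration of the deleted-interpolation step performed by
        ``_compute_lambda()`` after counting (all counts below are invented):
        suppose the trigram (DT, JJ, NN) was seen 10 times, its history
        bigram (DT, JJ) 40 times, the bigram (JJ, NN) 20 times, the unigram
        JJ 30 times, the unigram NN 50 times, and N = 1000 tags were counted
        in total.  Then

            c3 = (10 - 1) / (40 - 1)   ~ 0.23
            c2 = (20 - 1) / (30 - 1)   ~ 0.66
            c1 = (50 - 1) / (1000 - 1) ~ 0.05

        c2 is the largest, so the bigram accumulator l2 is incremented by the
        trigram count 10; once every trigram has been processed, l1, l2 and
        l3 are normalised so that they sum to 1.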
:param data: List of lists of (word, tag) tuples :type data: tuple(str) """ # Ensure that local C flag is initialized before use C = False if self._unk is not None and self._T == False: self._unk.train(data) for sent in data: history = [("BOS", False), ("BOS", False)] for w, t in sent: # if capitalization is requested, # and the word begins with a capital # set local flag C to True if self._C and w[0].isupper(): C = True self._wd[w][t] += 1 self._uni[(t, C)] += 1 self._bi[history[1]][(t, C)] += 1 self._tri[tuple(history)][(t, C)] += 1 history.append((t, C)) history.pop(0) # set local flag C to false for the next word C = False self._eos[t]["EOS"] += 1 # compute lambda values from the trained frequency distributions self._compute_lambda() def _compute_lambda(self): """ creates lambda values based upon training data NOTE: no need to explicitly reference C, it is contained within the tag variable :: tag == (tag,C) for each tag trigram (t1, t2, t3) depending on the maximum value of - f(t1,t2,t3)-1 / f(t1,t2)-1 - f(t2,t3)-1 / f(t2)-1 - f(t3)-1 / N-1 increment l3,l2, or l1 by f(t1,t2,t3) ISSUES -- Resolutions: if 2 values are equal, increment both lambda values by (f(t1,t2,t3) / 2) """ # temporary lambda variables tl1 = 0.0 tl2 = 0.0 tl3 = 0.0 # for each t1,t2 in system for history in self._tri.conditions(): (h1, h2) = history # for each t3 given t1,t2 in system # (NOTE: tag actually represents (tag,C)) # However no effect within this function for tag in self._tri[history].keys(): # if there has only been 1 occurrence of this tag in the data # then ignore this trigram. if self._uni[tag] == 1: continue # safe_div provides a safe floating point division # it returns -1 if the denominator is 0 c3 = self._safe_div( (self._tri[history][tag] - 1), (self._tri[history].N() - 1) ) c2 = self._safe_div((self._bi[h2][tag] - 1), (self._bi[h2].N() - 1)) c1 = self._safe_div((self._uni[tag] - 1), (self._uni.N() - 1)) # if c1 is the maximum value: if (c1 > c3) and (c1 > c2): tl1 += self._tri[history][tag] # if c2 is the maximum value elif (c2 > c3) and (c2 > c1): tl2 += self._tri[history][tag] # if c3 is the maximum value elif (c3 > c2) and (c3 > c1): tl3 += self._tri[history][tag] # if c3, and c2 are equal and larger than c1 elif (c3 == c2) and (c3 > c1): tl2 += self._tri[history][tag] / 2.0 tl3 += self._tri[history][tag] / 2.0 # if c1, and c2 are equal and larger than c3 # this might be a dumb thing to do....(not sure yet) elif (c2 == c1) and (c1 > c3): tl1 += self._tri[history][tag] / 2.0 tl2 += self._tri[history][tag] / 2.0 # otherwise there might be a problem # eg: all values = 0 else: pass # Lambda normalisation: # ensures that l1+l2+l3 = 1 self._l1 = tl1 / (tl1 + tl2 + tl3) self._l2 = tl2 / (tl1 + tl2 + tl3) self._l3 = tl3 / (tl1 + tl2 + tl3) def _safe_div(self, v1, v2): """ Safe floating point division function, does not allow division by 0 returns -1 if the denominator is 0 """ if v2 == 0: return -1 else: return v1 / v2 def tagdata(self, data): """ Tags each sentence in a list of sentences :param data:list of list of words :type data: [[string,],] :return: list of list of (word, tag) tuples Invokes tag(sent) function for each sentence compiles the results into a list of tagged sentences each tagged sentence is a list of (word, tag) tuples """ res = [] for sent in data: res1 = self.tag(sent) res.append(res1) return res def tag(self, data): """ Tags a single sentence :param data: list of words :type data: [string,] :return: [(word, tag),] Calls recursive function '_tagword' to produce a list of tags 
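        An added end-to-end sketch (requires the ``treebank`` corpus to have
        been downloaded; the training slice and the example sentence are
        arbitrary):

            >>> from nltk.corpus import treebank  # doctest: +SKIP
            >>> from nltk.tag.tnt import TnT  # doctest: +SKIP
            >>> tnt = TnT()  # doctest: +SKIP
            >>> tnt.train(treebank.tagged_sents()[:200])  # doctest: +SKIP
            >>> tagged = tnt.tag("The board approved the plan .".split())  # doctest: +SKIP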
Associates the sequence of returned tags with the correct words in the input sequence returns a list of (word, tag) tuples """ current_state = [(["BOS", "BOS"], 0.0)] sent = list(data) tags = self._tagword(sent, current_state) res = [] for i in range(len(sent)): # unpack and discard the C flags (t, C) = tags[i + 2] res.append((sent[i], t)) return res def _tagword(self, sent, current_states): """ :param sent : List of words remaining in the sentence :type sent : [word,] :param current_states : List of possible tag combinations for the sentence so far, and the log probability associated with each tag combination :type current_states : [([tag, ], logprob), ] Tags the first word in the sentence and recursively tags the reminder of sentence Uses formula specified above to calculate the probability of a particular tag """ # if this word marks the end of the sentence, # return the most probable tag if sent == []: (h, logp) = current_states[0] return h # otherwise there are more words to be tagged word = sent[0] sent = sent[1:] new_states = [] # if the Capitalisation is requested, # initialise the flag for this word C = False if self._C and word[0].isupper(): C = True # if word is known # compute the set of possible tags # and their associated log probabilities if word in self._wd: self.known += 1 for (history, curr_sent_logprob) in current_states: logprobs = [] for t in self._wd[word].keys(): tC = (t, C) p_uni = self._uni.freq(tC) p_bi = self._bi[history[-1]].freq(tC) p_tri = self._tri[tuple(history[-2:])].freq(tC) p_wd = self._wd[word][t] / self._uni[tC] p = self._l1 * p_uni + self._l2 * p_bi + self._l3 * p_tri p2 = log(p, 2) + log(p_wd, 2) # compute the result of appending each tag to this history new_states.append((history + [tC], curr_sent_logprob + p2)) # otherwise a new word, set of possible tags is unknown else: self.unknown += 1 # since a set of possible tags, # and the probability of each specific tag # can not be returned from most classifiers: # specify that any unknown words are tagged with certainty p = 1 # if no unknown word tagger has been specified # then use the tag 'Unk' if self._unk is None: tag = ("Unk", C) # otherwise apply the unknown word tagger else: [(_w, t)] = list(self._unk.tag([word])) tag = (t, C) for (history, logprob) in current_states: history.append(tag) new_states = current_states # now have computed a set of possible new_states # sort states by log prob # set is now ordered greatest to least log probability new_states.sort(reverse=True, key=itemgetter(1)) # del everything after N (threshold) # this is the beam search cut if len(new_states) > self._N: new_states = new_states[: self._N] # compute the tags for the rest of the sentence # return the best list of tags for the sentence return self._tagword(sent, new_states) ######################################## # helper function -- basic sentence tokenizer ######################################## def basic_sent_chop(data, raw=True): """ Basic method for tokenizing input into sentences for this tagger: :param data: list of tokens (words or (word, tag) tuples) :type data: str or tuple(str, str) :param raw: boolean flag marking the input data as a list of words or a list of tagged words :type raw: bool :return: list of sentences sentences are a list of tokens tokens are the same as the input Function takes a list of tokens and separates the tokens into lists where each list represents a sentence fragment This function can separate both tagged and raw sequences into basic sentences. 
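    An added example (``raw=True``, the default; the tokens are arbitrary):

        >>> from nltk.tag.tnt import basic_sent_chop
        >>> basic_sent_chop(["He", "left", ".", "She", "stayed", "."])
        [['He', 'left', '.'], ['She', 'stayed', '.']]

    Note that tokens following the last sentence marker are dropped, so the
    input is best terminated with one of the markers.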
Sentence markers are the set of [,.!?] This is a simple method which enhances the performance of the TnT tagger. Better sentence tokenization will further enhance the results. """ new_data = [] curr_sent = [] sent_mark = [",", ".", "?", "!"] if raw: for word in data: if word in sent_mark: curr_sent.append(word) new_data.append(curr_sent) curr_sent = [] else: curr_sent.append(word) else: for (word, tag) in data: if word in sent_mark: curr_sent.append((word, tag)) new_data.append(curr_sent) curr_sent = [] else: curr_sent.append((word, tag)) return new_data def demo(): from nltk.corpus import brown sents = list(brown.tagged_sents()) test = list(brown.sents()) tagger = TnT() tagger.train(sents[200:1000]) tagged_data = tagger.tagdata(test[100:120]) for j in range(len(tagged_data)): s = tagged_data[j] t = sents[j + 100] for i in range(len(s)): print(s[i], "--", t[i]) print() def demo2(): from nltk.corpus import treebank d = list(treebank.tagged_sents()) t = TnT(N=1000, C=False) s = TnT(N=1000, C=True) t.train(d[(11) * 100 :]) s.train(d[(11) * 100 :]) for i in range(10): tacc = t.accuracy(d[i * 100 : ((i + 1) * 100)]) tp_un = t.unknown / (t.known + t.unknown) tp_kn = t.known / (t.known + t.unknown) t.unknown = 0 t.known = 0 print("Capitalization off:") print("Accuracy:", tacc) print("Percentage known:", tp_kn) print("Percentage unknown:", tp_un) print("Accuracy over known words:", (tacc / tp_kn)) sacc = s.accuracy(d[i * 100 : ((i + 1) * 100)]) sp_un = s.unknown / (s.known + s.unknown) sp_kn = s.known / (s.known + s.unknown) s.unknown = 0 s.known = 0 print("Capitalization on:") print("Accuracy:", sacc) print("Percentage known:", sp_kn) print("Percentage unknown:", sp_un) print("Accuracy over known words:", (sacc / sp_kn)) def demo3(): from nltk.corpus import brown, treebank d = list(treebank.tagged_sents()) e = list(brown.tagged_sents()) d = d[:1000] e = e[:1000] d10 = int(len(d) * 0.1) e10 = int(len(e) * 0.1) tknacc = 0 sknacc = 0 tallacc = 0 sallacc = 0 tknown = 0 sknown = 0 for i in range(10): t = TnT(N=1000, C=False) s = TnT(N=1000, C=False) dtest = d[(i * d10) : ((i + 1) * d10)] etest = e[(i * e10) : ((i + 1) * e10)] dtrain = d[: (i * d10)] + d[((i + 1) * d10) :] etrain = e[: (i * e10)] + e[((i + 1) * e10) :] t.train(dtrain) s.train(etrain) tacc = t.accuracy(dtest) tp_un = t.unknown / (t.known + t.unknown) tp_kn = t.known / (t.known + t.unknown) tknown += tp_kn t.unknown = 0 t.known = 0 sacc = s.accuracy(etest) sp_un = s.unknown / (s.known + s.unknown) sp_kn = s.known / (s.known + s.unknown) sknown += sp_kn s.unknown = 0 s.known = 0 tknacc += tacc / tp_kn sknacc += sacc / tp_kn tallacc += tacc sallacc += sacc # print(i+1, (tacc / tp_kn), i+1, (sacc / tp_kn), i+1, tacc, i+1, sacc) print("brown: acc over words known:", 10 * tknacc) print(" : overall accuracy:", 10 * tallacc) print(" : words known:", 10 * tknown) print("treebank: acc over words known:", 10 * sknacc) print(" : overall accuracy:", 10 * sallacc) print(" : words known:", 10 * sknown) nltk-3.7/nltk/tag/util.py000066400000000000000000000043511420073152400154060ustar00rootroot00000000000000# Natural Language Toolkit: Tagger Utilities # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # Steven Bird # URL: # For license information, see LICENSE.TXT def str2tuple(s, sep="/"): """ Given the string representation of a tagged token, return the corresponding tuple representation. The rightmost occurrence of *sep* in *s* will be used to divide *s* into a word string and a tag string. 
If *sep* does not occur in *s*, return (s, None). >>> from nltk.tag.util import str2tuple >>> str2tuple('fly/NN') ('fly', 'NN') :type s: str :param s: The string representation of a tagged token. :type sep: str :param sep: The separator string used to separate word strings from tags. """ loc = s.rfind(sep) if loc >= 0: return (s[:loc], s[loc + len(sep) :].upper()) else: return (s, None) def tuple2str(tagged_token, sep="/"): """ Given the tuple representation of a tagged token, return the corresponding string representation. This representation is formed by concatenating the token's word string, followed by the separator, followed by the token's tag. (If the tag is None, then just return the bare word string.) >>> from nltk.tag.util import tuple2str >>> tagged_token = ('fly', 'NN') >>> tuple2str(tagged_token) 'fly/NN' :type tagged_token: tuple(str, str) :param tagged_token: The tuple representation of a tagged token. :type sep: str :param sep: The separator string used to separate word strings from tags. """ word, tag = tagged_token if tag is None: return word else: assert sep not in tag, "tag may not contain sep!" return f"{word}{sep}{tag}" def untag(tagged_sentence): """ Given a tagged sentence, return an untagged version of that sentence. I.e., return a list containing the first element of each tuple in *tagged_sentence*. >>> from nltk.tag.util import untag >>> untag([('John', 'NNP'), ('saw', 'VBD'), ('Mary', 'NNP')]) ['John', 'saw', 'Mary'] """ return [w for (w, t) in tagged_sentence] nltk-3.7/nltk/tbl/000077500000000000000000000000001420073152400140625ustar00rootroot00000000000000nltk-3.7/nltk/tbl/__init__.py000066400000000000000000000013671420073152400162020ustar00rootroot00000000000000# Natural Language Toolkit: Transformation-based learning # # Copyright (C) 2001-2022 NLTK Project # Author: Marcus Uneson # based on previous (nltk2) version by # Christopher Maloof, Edward Loper, Steven Bird # URL: # For license information, see LICENSE.TXT """ Transformation Based Learning A general purpose package for Transformation Based Learning, currently used by nltk.tag.BrillTagger. isort:skip_file """ from nltk.tbl.template import Template # API: Template(...), Template.expand(...) from nltk.tbl.feature import Feature # API: Feature(...), Feature.expand(...) from nltk.tbl.rule import Rule # API: Rule.format(...), Rule.templatetid from nltk.tbl.erroranalysis import error_list nltk-3.7/nltk/tbl/api.py000066400000000000000000000000001420073152400151730ustar00rootroot00000000000000nltk-3.7/nltk/tbl/demo.py000066400000000000000000000351101420073152400153600ustar00rootroot00000000000000# Natural Language Toolkit: Transformation-based learning # # Copyright (C) 2001-2022 NLTK Project # Author: Marcus Uneson # based on previous (nltk2) version by # Christopher Maloof, Edward Loper, Steven Bird # URL: # For license information, see LICENSE.TXT import os import pickle import random import time from nltk.corpus import treebank from nltk.tag import BrillTaggerTrainer, RegexpTagger, UnigramTagger from nltk.tag.brill import Pos, Word from nltk.tbl import Template, error_list def demo(): """ Run a demo with defaults. See source comments for details, or docstrings of any of the more specific demo_* functions. 
""" postag() def demo_repr_rule_format(): """ Exemplify repr(Rule) (see also str(Rule) and Rule.format("verbose")) """ postag(ruleformat="repr") def demo_str_rule_format(): """ Exemplify repr(Rule) (see also str(Rule) and Rule.format("verbose")) """ postag(ruleformat="str") def demo_verbose_rule_format(): """ Exemplify Rule.format("verbose") """ postag(ruleformat="verbose") def demo_multiposition_feature(): """ The feature/s of a template takes a list of positions relative to the current word where the feature should be looked for, conceptually joined by logical OR. For instance, Pos([-1, 1]), given a value V, will hold whenever V is found one step to the left and/or one step to the right. For contiguous ranges, a 2-arg form giving inclusive end points can also be used: Pos(-3, -1) is the same as the arg below. """ postag(templates=[Template(Pos([-3, -2, -1]))]) def demo_multifeature_template(): """ Templates can have more than a single feature. """ postag(templates=[Template(Word([0]), Pos([-2, -1]))]) def demo_template_statistics(): """ Show aggregate statistics per template. Little used templates are candidates for deletion, much used templates may possibly be refined. Deleting unused templates is mostly about saving time and/or space: training is basically O(T) in the number of templates T (also in terms of memory usage, which often will be the limiting factor). """ postag(incremental_stats=True, template_stats=True) def demo_generated_templates(): """ Template.expand and Feature.expand are class methods facilitating generating large amounts of templates. See their documentation for details. Note: training with 500 templates can easily fill all available even on relatively small corpora """ wordtpls = Word.expand([-1, 0, 1], [1, 2], excludezero=False) tagtpls = Pos.expand([-2, -1, 0, 1], [1, 2], excludezero=True) templates = list(Template.expand([wordtpls, tagtpls], combinations=(1, 3))) print( "Generated {} templates for transformation-based learning".format( len(templates) ) ) postag(templates=templates, incremental_stats=True, template_stats=True) def demo_learning_curve(): """ Plot a learning curve -- the contribution on tagging accuracy of the individual rules. Note: requires matplotlib """ postag( incremental_stats=True, separate_baseline_data=True, learning_curve_output="learningcurve.png", ) def demo_error_analysis(): """ Writes a file with context for each erroneous word after tagging testing data """ postag(error_output="errors.txt") def demo_serialize_tagger(): """ Serializes the learned tagger to a file in pickle format; reloads it and validates the process. """ postag(serialize_output="tagger.pcl") def demo_high_accuracy_rules(): """ Discard rules with low accuracy. This may hurt performance a bit, but will often produce rules which are more interesting read to a human. 
""" postag(num_sents=3000, min_acc=0.96, min_score=10) def postag( templates=None, tagged_data=None, num_sents=1000, max_rules=300, min_score=3, min_acc=None, train=0.8, trace=3, randomize=False, ruleformat="str", incremental_stats=False, template_stats=False, error_output=None, serialize_output=None, learning_curve_output=None, learning_curve_take=300, baseline_backoff_tagger=None, separate_baseline_data=False, cache_baseline_tagger=None, ): """ Brill Tagger Demonstration :param templates: how many sentences of training and testing data to use :type templates: list of Template :param tagged_data: maximum number of rule instances to create :type tagged_data: C{int} :param num_sents: how many sentences of training and testing data to use :type num_sents: C{int} :param max_rules: maximum number of rule instances to create :type max_rules: C{int} :param min_score: the minimum score for a rule in order for it to be considered :type min_score: C{int} :param min_acc: the minimum score for a rule in order for it to be considered :type min_acc: C{float} :param train: the fraction of the the corpus to be used for training (1=all) :type train: C{float} :param trace: the level of diagnostic tracing output to produce (0-4) :type trace: C{int} :param randomize: whether the training data should be a random subset of the corpus :type randomize: C{bool} :param ruleformat: rule output format, one of "str", "repr", "verbose" :type ruleformat: C{str} :param incremental_stats: if true, will tag incrementally and collect stats for each rule (rather slow) :type incremental_stats: C{bool} :param template_stats: if true, will print per-template statistics collected in training and (optionally) testing :type template_stats: C{bool} :param error_output: the file where errors will be saved :type error_output: C{string} :param serialize_output: the file where the learned tbl tagger will be saved :type serialize_output: C{string} :param learning_curve_output: filename of plot of learning curve(s) (train and also test, if available) :type learning_curve_output: C{string} :param learning_curve_take: how many rules plotted :type learning_curve_take: C{int} :param baseline_backoff_tagger: the file where rules will be saved :type baseline_backoff_tagger: tagger :param separate_baseline_data: use a fraction of the training data exclusively for training baseline :type separate_baseline_data: C{bool} :param cache_baseline_tagger: cache baseline tagger to this file (only interesting as a temporary workaround to get deterministic output from the baseline unigram tagger between python versions) :type cache_baseline_tagger: C{string} Note on separate_baseline_data: if True, reuse training data both for baseline and rule learner. This is fast and fine for a demo, but is likely to generalize worse on unseen data. Also cannot be sensibly used for learning curves on training data (the baseline will be artificially high). """ # defaults baseline_backoff_tagger = baseline_backoff_tagger or REGEXP_TAGGER if templates is None: from nltk.tag.brill import brill24, describe_template_sets # some pre-built template sets taken from typical systems or publications are # available. 
Print a list with describe_template_sets() # for instance: templates = brill24() (training_data, baseline_data, gold_data, testing_data) = _demo_prepare_data( tagged_data, train, num_sents, randomize, separate_baseline_data ) # creating (or reloading from cache) a baseline tagger (unigram tagger) # this is just a mechanism for getting deterministic output from the baseline between # python versions if cache_baseline_tagger: if not os.path.exists(cache_baseline_tagger): baseline_tagger = UnigramTagger( baseline_data, backoff=baseline_backoff_tagger ) with open(cache_baseline_tagger, "w") as print_rules: pickle.dump(baseline_tagger, print_rules) print( "Trained baseline tagger, pickled it to {}".format( cache_baseline_tagger ) ) with open(cache_baseline_tagger) as print_rules: baseline_tagger = pickle.load(print_rules) print(f"Reloaded pickled tagger from {cache_baseline_tagger}") else: baseline_tagger = UnigramTagger(baseline_data, backoff=baseline_backoff_tagger) print("Trained baseline tagger") if gold_data: print( " Accuracy on test set: {:0.4f}".format( baseline_tagger.accuracy(gold_data) ) ) # creating a Brill tagger tbrill = time.time() trainer = BrillTaggerTrainer( baseline_tagger, templates, trace, ruleformat=ruleformat ) print("Training tbl tagger...") brill_tagger = trainer.train(training_data, max_rules, min_score, min_acc) print(f"Trained tbl tagger in {time.time() - tbrill:0.2f} seconds") if gold_data: print(" Accuracy on test set: %.4f" % brill_tagger.accuracy(gold_data)) # printing the learned rules, if learned silently if trace == 1: print("\nLearned rules: ") for (ruleno, rule) in enumerate(brill_tagger.rules(), 1): print(f"{ruleno:4d} {rule.format(ruleformat):s}") # printing template statistics (optionally including comparison with the training data) # note: if not separate_baseline_data, then baseline accuracy will be artificially high if incremental_stats: print( "Incrementally tagging the test data, collecting individual rule statistics" ) (taggedtest, teststats) = brill_tagger.batch_tag_incremental( testing_data, gold_data ) print(" Rule statistics collected") if not separate_baseline_data: print( "WARNING: train_stats asked for separate_baseline_data=True; the baseline " "will be artificially high" ) trainstats = brill_tagger.train_stats() if template_stats: brill_tagger.print_template_statistics(teststats) if learning_curve_output: _demo_plot( learning_curve_output, teststats, trainstats, take=learning_curve_take ) print(f"Wrote plot of learning curve to {learning_curve_output}") else: print("Tagging the test data") taggedtest = brill_tagger.tag_sents(testing_data) if template_stats: brill_tagger.print_template_statistics() # writing error analysis to file if error_output is not None: with open(error_output, "w") as f: f.write("Errors for Brill Tagger %r\n\n" % serialize_output) f.write("\n".join(error_list(gold_data, taggedtest)).encode("utf-8") + "\n") print(f"Wrote tagger errors including context to {error_output}") # serializing the tagger to a pickle file and reloading (just to see it works) if serialize_output is not None: taggedtest = brill_tagger.tag_sents(testing_data) with open(serialize_output, "w") as print_rules: pickle.dump(brill_tagger, print_rules) print(f"Wrote pickled tagger to {serialize_output}") with open(serialize_output) as print_rules: brill_tagger_reloaded = pickle.load(print_rules) print(f"Reloaded pickled tagger from {serialize_output}") taggedtest_reloaded = brill_tagger.tag_sents(testing_data) if taggedtest == taggedtest_reloaded: 
print("Reloaded tagger tried on test set, results identical") else: print("PROBLEM: Reloaded tagger gave different results on test set") def _demo_prepare_data( tagged_data, train, num_sents, randomize, separate_baseline_data ): # train is the proportion of data used in training; the rest is reserved # for testing. if tagged_data is None: print("Loading tagged data from treebank... ") tagged_data = treebank.tagged_sents() if num_sents is None or len(tagged_data) <= num_sents: num_sents = len(tagged_data) if randomize: random.seed(len(tagged_data)) random.shuffle(tagged_data) cutoff = int(num_sents * train) training_data = tagged_data[:cutoff] gold_data = tagged_data[cutoff:num_sents] testing_data = [[t[0] for t in sent] for sent in gold_data] if not separate_baseline_data: baseline_data = training_data else: bl_cutoff = len(training_data) // 3 (baseline_data, training_data) = ( training_data[:bl_cutoff], training_data[bl_cutoff:], ) (trainseqs, traintokens) = corpus_size(training_data) (testseqs, testtokens) = corpus_size(testing_data) (bltrainseqs, bltraintokens) = corpus_size(baseline_data) print(f"Read testing data ({testseqs:d} sents/{testtokens:d} wds)") print(f"Read training data ({trainseqs:d} sents/{traintokens:d} wds)") print( "Read baseline data ({:d} sents/{:d} wds) {:s}".format( bltrainseqs, bltraintokens, "" if separate_baseline_data else "[reused the training set]", ) ) return (training_data, baseline_data, gold_data, testing_data) def _demo_plot(learning_curve_output, teststats, trainstats=None, take=None): testcurve = [teststats["initialerrors"]] for rulescore in teststats["rulescores"]: testcurve.append(testcurve[-1] - rulescore) testcurve = [1 - x / teststats["tokencount"] for x in testcurve[:take]] traincurve = [trainstats["initialerrors"]] for rulescore in trainstats["rulescores"]: traincurve.append(traincurve[-1] - rulescore) traincurve = [1 - x / trainstats["tokencount"] for x in traincurve[:take]] import matplotlib.pyplot as plt r = list(range(len(testcurve))) plt.plot(r, testcurve, r, traincurve) plt.axis([None, None, None, 1.0]) plt.savefig(learning_curve_output) NN_CD_TAGGER = RegexpTagger([(r"^-?[0-9]+(\.[0-9]+)?$", "CD"), (r".*", "NN")]) REGEXP_TAGGER = RegexpTagger( [ (r"^-?[0-9]+(\.[0-9]+)?$", "CD"), # cardinal numbers (r"(The|the|A|a|An|an)$", "AT"), # articles (r".*able$", "JJ"), # adjectives (r".*ness$", "NN"), # nouns formed from adjectives (r".*ly$", "RB"), # adverbs (r".*s$", "NNS"), # plural nouns (r".*ing$", "VBG"), # gerunds (r".*ed$", "VBD"), # past tense verbs (r".*", "NN"), # nouns (default) ] ) def corpus_size(seqs): return (len(seqs), sum(len(x) for x in seqs)) if __name__ == "__main__": demo_learning_curve() nltk-3.7/nltk/tbl/erroranalysis.py000066400000000000000000000026101420073152400173300ustar00rootroot00000000000000# Natural Language Toolkit: Transformation-based learning # # Copyright (C) 2001-2022 NLTK Project # Author: Marcus Uneson # based on previous (nltk2) version by # Christopher Maloof, Edward Loper, Steven Bird # URL: # For license information, see LICENSE.TXT # returns a list of errors in string format def error_list(train_sents, test_sents): """ Returns a list of human-readable strings indicating the errors in the given tagging of the corpus. 
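    An added minimal example (the words and tags are invented):

        >>> from nltk.tbl import error_list
        >>> gold = [[('The', 'DT'), ('dog', 'NN'), ('barked', 'VBD')]]
        >>> test = [[('The', 'DT'), ('dog', 'VB'), ('barked', 'VBD')]]
        >>> errors = error_list(gold, test)
        >>> len(errors)  # one header line plus one mistagged word
        2
        >>> 'dog/VB->NN' in errors[1]
        True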
:param train_sents: The correct tagging of the corpus :type train_sents: list(tuple) :param test_sents: The tagged corpus :type test_sents: list(tuple) """ hdr = ("%25s | %s | %s\n" + "-" * 26 + "+" + "-" * 24 + "+" + "-" * 26) % ( "left context", "word/test->gold".center(22), "right context", ) errors = [hdr] for (train_sent, test_sent) in zip(train_sents, test_sents): for wordnum, (word, train_pos) in enumerate(train_sent): test_pos = test_sent[wordnum][1] if train_pos != test_pos: left = " ".join("%s/%s" % w for w in train_sent[:wordnum]) right = " ".join("%s/%s" % w for w in train_sent[wordnum + 1 :]) mid = f"{word}/{test_pos}->{train_pos}" errors.append(f"{left[-25:]:>25} | {mid.center(22)} | {right[:25]}") return errors nltk-3.7/nltk/tbl/feature.py000066400000000000000000000223171420073152400160740ustar00rootroot00000000000000# Natural Language Toolkit: Transformation-based learning # # Copyright (C) 2001-2022 NLTK Project # Author: Marcus Uneson # based on previous (nltk2) version by # Christopher Maloof, Edward Loper, Steven Bird # URL: # For license information, see LICENSE.TXT from abc import ABCMeta, abstractmethod class Feature(metaclass=ABCMeta): """ An abstract base class for Features. A Feature is a combination of a specific property-computing method and a list of relative positions to apply that method to. The property-computing method, M{extract_property(tokens, index)}, must be implemented by every subclass. It extracts or computes a specific property for the token at the current index. Typical extract_property() methods return features such as the token text or tag; but more involved methods may consider the entire sequence M{tokens} and for instance compute the length of the sentence the token belongs to. In addition, the subclass may have a PROPERTY_NAME, which is how it will be printed (in Rules and Templates, etc). If not given, defaults to the classname. """ json_tag = "nltk.tbl.Feature" PROPERTY_NAME = None def __init__(self, positions, end=None): """ Construct a Feature which may apply at C{positions}. >>> # For instance, importing some concrete subclasses (Feature is abstract) >>> from nltk.tag.brill import Word, Pos >>> # Feature Word, applying at one of [-2, -1] >>> Word([-2,-1]) Word([-2, -1]) >>> # Positions need not be contiguous >>> Word([-2,-1, 1]) Word([-2, -1, 1]) >>> # Contiguous ranges can alternatively be specified giving the >>> # two endpoints (inclusive) >>> Pos(-3, -1) Pos([-3, -2, -1]) >>> # In two-arg form, start <= end is enforced >>> Pos(2, 1) Traceback (most recent call last): File "", line 1, in File "nltk/tbl/template.py", line 306, in __init__ raise TypeError ValueError: illegal interval specification: (start=2, end=1) :type positions: list of int :param positions: the positions at which this features should apply :raises ValueError: illegal position specifications An alternative calling convention, for contiguous positions only, is Feature(start, end): :type start: int :param start: start of range where this feature should apply :type end: int :param end: end of range (NOTE: inclusive!) 
where this feature should apply """ self.positions = None # to avoid warnings if end is None: self.positions = tuple(sorted({int(i) for i in positions})) else: # positions was actually not a list, but only the start index try: if positions > end: raise TypeError self.positions = tuple(range(positions, end + 1)) except TypeError as e: # let any kind of erroneous spec raise ValueError raise ValueError( "illegal interval specification: (start={}, end={})".format( positions, end ) ) from e # set property name given in subclass, or otherwise name of subclass self.PROPERTY_NAME = self.__class__.PROPERTY_NAME or self.__class__.__name__ def encode_json_obj(self): return self.positions @classmethod def decode_json_obj(cls, obj): positions = obj return cls(positions) def __repr__(self): return f"{self.__class__.__name__}({list(self.positions)!r})" @classmethod def expand(cls, starts, winlens, excludezero=False): """ Return a list of features, one for each start point in starts and for each window length in winlen. If excludezero is True, no Features containing 0 in its positions will be generated (many tbl trainers have a special representation for the target feature at [0]) For instance, importing a concrete subclass (Feature is abstract) >>> from nltk.tag.brill import Word First argument gives the possible start positions, second the possible window lengths >>> Word.expand([-3,-2,-1], [1]) [Word([-3]), Word([-2]), Word([-1])] >>> Word.expand([-2,-1], [1]) [Word([-2]), Word([-1])] >>> Word.expand([-3,-2,-1], [1,2]) [Word([-3]), Word([-2]), Word([-1]), Word([-3, -2]), Word([-2, -1])] >>> Word.expand([-2,-1], [1]) [Word([-2]), Word([-1])] A third optional argument excludes all Features whose positions contain zero >>> Word.expand([-2,-1,0], [1,2], excludezero=False) [Word([-2]), Word([-1]), Word([0]), Word([-2, -1]), Word([-1, 0])] >>> Word.expand([-2,-1,0], [1,2], excludezero=True) [Word([-2]), Word([-1]), Word([-2, -1])] All window lengths must be positive >>> Word.expand([-2,-1], [0]) Traceback (most recent call last): File "", line 1, in File "nltk/tag/tbl/template.py", line 371, in expand :param starts: where to start looking for Feature ValueError: non-positive window length in [0] :param starts: where to start looking for Feature :type starts: list of ints :param winlens: window lengths where to look for Feature :type starts: list of ints :param excludezero: do not output any Feature with 0 in any of its positions. :type excludezero: bool :returns: list of Features :raises ValueError: for non-positive window lengths """ if not all(x > 0 for x in winlens): raise ValueError(f"non-positive window length in {winlens}") xs = (starts[i : i + w] for w in winlens for i in range(len(starts) - w + 1)) return [cls(x) for x in xs if not (excludezero and 0 in x)] def issuperset(self, other): """ Return True if this Feature always returns True when other does More precisely, return True if this feature refers to the same property as other; and this Feature looks at all positions that other does (and possibly other positions in addition). 
#For instance, importing a concrete subclass (Feature is abstract) >>> from nltk.tag.brill import Word, Pos >>> Word([-3,-2,-1]).issuperset(Word([-3,-2])) True >>> Word([-3,-2,-1]).issuperset(Word([-3,-2, 0])) False #Feature subclasses must agree >>> Word([-3,-2,-1]).issuperset(Pos([-3,-2])) False :param other: feature with which to compare :type other: (subclass of) Feature :return: True if this feature is superset, otherwise False :rtype: bool """ return self.__class__ is other.__class__ and set(self.positions) >= set( other.positions ) def intersects(self, other): """ Return True if the positions of this Feature intersects with those of other More precisely, return True if this feature refers to the same property as other; and there is some overlap in the positions they look at. #For instance, importing a concrete subclass (Feature is abstract) >>> from nltk.tag.brill import Word, Pos >>> Word([-3,-2,-1]).intersects(Word([-3,-2])) True >>> Word([-3,-2,-1]).intersects(Word([-3,-2, 0])) True >>> Word([-3,-2,-1]).intersects(Word([0])) False #Feature subclasses must agree >>> Word([-3,-2,-1]).intersects(Pos([-3,-2])) False :param other: feature with which to compare :type other: (subclass of) Feature :return: True if feature classes agree and there is some overlap in the positions they look at :rtype: bool """ return bool( self.__class__ is other.__class__ and set(self.positions) & set(other.positions) ) # Rich comparisons for Features. With @functools.total_ordering (Python 2.7+), # it will be enough to define __lt__ and __eq__ def __eq__(self, other): return self.__class__ is other.__class__ and self.positions == other.positions def __lt__(self, other): return ( self.__class__.__name__ < other.__class__.__name__ or # self.positions is a sorted tuple of ints self.positions < other.positions ) def __ne__(self, other): return not (self == other) def __gt__(self, other): return other < self def __ge__(self, other): return not self < other def __le__(self, other): return self < other or self == other @staticmethod @abstractmethod def extract_property(tokens, index): """ Any subclass of Feature must define static method extract_property(tokens, index) :param tokens: the sequence of tokens :type tokens: list of tokens :param index: the current index :type index: int :return: feature value :rtype: any (but usually scalar) """ nltk-3.7/nltk/tbl/rule.py000066400000000000000000000256711420073152400154160ustar00rootroot00000000000000# Natural Language Toolkit: Transformation-based learning # # Copyright (C) 2001-2022 NLTK Project # Author: Marcus Uneson # based on previous (nltk2) version by # Christopher Maloof, Edward Loper, Steven Bird # URL: # For license information, see LICENSE.TXT from abc import ABCMeta, abstractmethod from nltk import jsontags ###################################################################### # Tag Rules ###################################################################### class TagRule(metaclass=ABCMeta): """ An interface for tag transformations on a tagged corpus, as performed by tbl taggers. Each transformation finds all tokens in the corpus that are tagged with a specific original tag and satisfy a specific condition, and replaces their tags with a replacement tag. For any given transformation, the original tag, replacement tag, and condition are fixed. Conditions may depend on the token under consideration, as well as any other tokens in the corpus. Tag rules must be comparable and hashable. 
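    An added illustration using the concrete ``Rule`` subclass defined below
    (the template id, tags, and sentence are arbitrary):

        >>> from nltk.tbl.rule import Rule
        >>> from nltk.tag.brill import Pos
        >>> r = Rule("001", "VB", "NN", [(Pos([-2, -1]), "DT")])
        >>> sent = [("The", "DT"), ("big", "JJ"), ("dog", "VB")]
        >>> r.applies(sent, 2)
        True
        >>> r.apply(sent)
        [2]
        >>> sent
        [('The', 'DT'), ('big', 'JJ'), ('dog', 'NN')]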
""" def __init__(self, original_tag, replacement_tag): self.original_tag = original_tag """The tag which this TagRule may cause to be replaced.""" self.replacement_tag = replacement_tag """The tag with which this TagRule may replace another tag.""" def apply(self, tokens, positions=None): """ Apply this rule at every position in positions where it applies to the given sentence. I.e., for each position p in *positions*, if *tokens[p]* is tagged with this rule's original tag, and satisfies this rule's condition, then set its tag to be this rule's replacement tag. :param tokens: The tagged sentence :type tokens: list(tuple(str, str)) :type positions: list(int) :param positions: The positions where the transformation is to be tried. If not specified, try it at all positions. :return: The indices of tokens whose tags were changed by this rule. :rtype: int """ if positions is None: positions = list(range(len(tokens))) # Determine the indices at which this rule applies. change = [i for i in positions if self.applies(tokens, i)] # Make the changes. Note: this must be done in a separate # step from finding applicable locations, since we don't want # the rule to interact with itself. for i in change: tokens[i] = (tokens[i][0], self.replacement_tag) return change @abstractmethod def applies(self, tokens, index): """ :return: True if the rule would change the tag of ``tokens[index]``, False otherwise :rtype: bool :param tokens: A tagged sentence :type tokens: list(str) :param index: The index to check :type index: int """ # Rules must be comparable and hashable for the algorithm to work def __eq__(self, other): raise TypeError("Rules must implement __eq__()") def __ne__(self, other): raise TypeError("Rules must implement __ne__()") def __hash__(self): raise TypeError("Rules must implement __hash__()") @jsontags.register_tag class Rule(TagRule): """ A Rule checks the current corpus position for a certain set of conditions; if they are all fulfilled, the Rule is triggered, meaning that it will change tag A to tag B. For other tags than A, nothing happens. The conditions are parameters to the Rule instance. Each condition is a feature-value pair, with a set of positions to check for the value of the corresponding feature. Conceptually, the positions are joined by logical OR, and the feature set by logical AND. More formally, the Rule is then applicable to the M{n}th token iff: - The M{n}th token is tagged with the Rule's original tag; and - For each (Feature(positions), M{value}) tuple: - The value of Feature of at least one token in {n+p for p in positions} is M{value}. """ json_tag = "nltk.tbl.Rule" def __init__(self, templateid, original_tag, replacement_tag, conditions): """ Construct a new Rule that changes a token's tag from C{original_tag} to C{replacement_tag} if all of the properties specified in C{conditions} hold. :param templateid: the template id (a zero-padded string, '001' etc, so it will sort nicely) :type templateid: string :param conditions: A list of Feature(positions), each of which specifies that the property (computed by Feature.extract_property()) of at least one token in M{n} + p in positions is C{value}. 
:type conditions: C{iterable} of C{Feature} """ TagRule.__init__(self, original_tag, replacement_tag) self._conditions = conditions self.templateid = templateid def encode_json_obj(self): return { "templateid": self.templateid, "original": self.original_tag, "replacement": self.replacement_tag, "conditions": self._conditions, } @classmethod def decode_json_obj(cls, obj): return cls( obj["templateid"], obj["original"], obj["replacement"], tuple(tuple(feat) for feat in obj["conditions"]), ) def applies(self, tokens, index): # Inherit docs from TagRule # Does the given token have this Rule's "original tag"? if tokens[index][1] != self.original_tag: return False # Check to make sure that every condition holds. for (feature, val) in self._conditions: # Look for *any* token that satisfies the condition. for pos in feature.positions: if not (0 <= index + pos < len(tokens)): continue if feature.extract_property(tokens, index + pos) == val: break else: # No token satisfied the condition; return false. return False # Every condition checked out, so the Rule is applicable. return True def __eq__(self, other): return self is other or ( other is not None and other.__class__ == self.__class__ and self.original_tag == other.original_tag and self.replacement_tag == other.replacement_tag and self._conditions == other._conditions ) def __ne__(self, other): return not (self == other) def __hash__(self): # Cache our hash value (justified by profiling.) try: return self.__hash except AttributeError: self.__hash = hash(repr(self)) return self.__hash def __repr__(self): # Cache the repr (justified by profiling -- this is used as # a sort key when deterministic=True.) try: return self.__repr except AttributeError: self.__repr = "{}('{}', {}, {}, [{}])".format( self.__class__.__name__, self.templateid, repr(self.original_tag), repr(self.replacement_tag), # list(self._conditions) would be simpler but will not generate # the same Rule.__repr__ in python 2 and 3 and thus break some tests ", ".join(f"({f},{repr(v)})" for (f, v) in self._conditions), ) return self.__repr def __str__(self): def _condition_to_logic(feature, value): """ Return a compact, predicate-logic styled string representation of the given condition. """ return "{}:{}@[{}]".format( feature.PROPERTY_NAME, value, ",".join(str(w) for w in feature.positions), ) conditions = " & ".join( [_condition_to_logic(f, v) for (f, v) in self._conditions] ) s = f"{self.original_tag}->{self.replacement_tag} if {conditions}" return s def format(self, fmt): """ Return a string representation of this rule. 
>>> from nltk.tbl.rule import Rule >>> from nltk.tag.brill import Pos >>> r = Rule("23", "VB", "NN", [(Pos([-2,-1]), 'DT')]) r.format("str") == str(r) True >>> r.format("str") 'VB->NN if Pos:DT@[-2,-1]' r.format("repr") == repr(r) True >>> r.format("repr") "Rule('23', 'VB', 'NN', [(Pos([-2, -1]),'DT')])" >>> r.format("verbose") 'VB -> NN if the Pos of words i-2...i-1 is "DT"' >>> r.format("not_found") Traceback (most recent call last): File "", line 1, in File "nltk/tbl/rule.py", line 256, in format raise ValueError("unknown rule format spec: {0}".format(fmt)) ValueError: unknown rule format spec: not_found >>> :param fmt: format specification :type fmt: str :return: string representation :rtype: str """ if fmt == "str": return self.__str__() elif fmt == "repr": return self.__repr__() elif fmt == "verbose": return self._verbose_format() else: raise ValueError(f"unknown rule format spec: {fmt}") def _verbose_format(self): """ Return a wordy, human-readable string representation of the given rule. Not sure how useful this is. """ def condition_to_str(feature, value): return 'the {} of {} is "{}"'.format( feature.PROPERTY_NAME, range_to_str(feature.positions), value, ) def range_to_str(positions): if len(positions) == 1: p = positions[0] if p == 0: return "this word" if p == -1: return "the preceding word" elif p == 1: return "the following word" elif p < 0: return "word i-%d" % -p elif p > 0: return "word i+%d" % p else: # for complete compatibility with the wordy format of nltk2 mx = max(positions) mn = min(positions) if mx - mn == len(positions) - 1: return "words i%+d...i%+d" % (mn, mx) else: return "words {{{}}}".format( ",".join("i%+d" % d for d in positions) ) replacement = f"{self.original_tag} -> {self.replacement_tag}" conditions = (" if " if self._conditions else "") + ", and ".join( condition_to_str(f, v) for (f, v) in self._conditions ) return replacement + conditions nltk-3.7/nltk/tbl/template.py000066400000000000000000000304271420073152400162550ustar00rootroot00000000000000# Natural Language Toolkit: Transformation-based learning # # Copyright (C) 2001-2022 NLTK Project # Author: Marcus Uneson # based on previous (nltk2) version by # Christopher Maloof, Edward Loper, Steven Bird # URL: # For license information, see LICENSE.TXT import itertools as it from abc import ABCMeta, abstractmethod from nltk.tbl.feature import Feature from nltk.tbl.rule import Rule class BrillTemplateI(metaclass=ABCMeta): """ An interface for generating lists of transformational rules that apply at given sentence positions. ``BrillTemplateI`` is used by ``Brill`` training algorithms to generate candidate rules. """ @abstractmethod def applicable_rules(self, tokens, i, correctTag): """ Return a list of the transformational rules that would correct the ``i``-th subtoken's tag in the given token. In particular, return a list of zero or more rules that would change ``tokens[i][1]`` to ``correctTag``, if applied to ``token[i]``. If the ``i``-th token already has the correct tag (i.e., if ``tagged_tokens[i][1] == correctTag``), then ``applicable_rules()`` should return the empty list. :param tokens: The tagged tokens being tagged. :type tokens: list(tuple) :param i: The index of the token whose tag should be corrected. :type i: int :param correctTag: The correct tag for the ``i``-th token. 
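        An added illustration using the concrete ``Template`` class defined
        below (the feature, sentence, and target tag are arbitrary; the rules
        are shown via ``Rule.format("str")`` to avoid the run-dependent
        template id):

            >>> from nltk.tbl.template import Template
            >>> from nltk.tag.brill import Pos
            >>> tpl = Template(Pos([-1]))
            >>> sent = [("The", "DT"), ("dog", "VB")]
            >>> [r.format("str") for r in tpl.applicable_rules(sent, 1, "NN")]
            ['VB->NN if Pos:DT@[-1]']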
:type correctTag: any :rtype: list(BrillRule) """ @abstractmethod def get_neighborhood(self, token, index): """ Returns the set of indices *i* such that ``applicable_rules(token, i, ...)`` depends on the value of the *index*th token of *token*. This method is used by the "fast" Brill tagger trainer. :param token: The tokens being tagged. :type token: list(tuple) :param index: The index whose neighborhood should be returned. :type index: int :rtype: set """ class Template(BrillTemplateI): """ A tbl Template that generates a list of L{Rule}s that apply at a given sentence position. In particular, each C{Template} is parameterized by a list of independent features (a combination of a specific property to extract and a list C{L} of relative positions at which to extract it) and generates all Rules that: - use the given features, each at its own independent position; and - are applicable to the given token. """ ALLTEMPLATES = [] # record a unique id of form "001", for each template created # _ids = it.count(0) def __init__(self, *features): """ Construct a Template for generating Rules. Takes a list of Features. A C{Feature} is a combination of a specific property and its relative positions and should be a subclass of L{nltk.tbl.feature.Feature}. An alternative calling convention (kept for backwards compatibility, but less expressive as it only permits one feature type) is Template(Feature, (start1, end1), (start2, end2), ...) In new code, that would be better written Template(Feature(start1, end1), Feature(start2, end2), ...) For instance, importing some features >>> from nltk.tbl.template import Template >>> from nltk.tag.brill import Word, Pos Create some features >>> wfeat1, wfeat2, pfeat = (Word([-1]), Word([1,2]), Pos([-2,-1])) Create a single-feature template >>> Template(wfeat1) Template(Word([-1])) Or a two-feature one >>> Template(wfeat1, wfeat2) Template(Word([-1]),Word([1, 2])) Or a three-feature one with two different feature types >>> Template(wfeat1, wfeat2, pfeat) Template(Word([-1]),Word([1, 2]),Pos([-2, -1])) deprecated api: Feature subclass, followed by list of (start,end) pairs (permits only a single Feature) >>> Template(Word, (-2,-1), (0,0)) Template(Word([-2, -1]),Word([0])) Incorrect specification raises TypeError >>> Template(Word, (-2,-1), Pos, (0,0)) Traceback (most recent call last): File "", line 1, in File "nltk/tag/tbl/template.py", line 143, in __init__ raise TypeError( TypeError: expected either Feature1(args), Feature2(args), ... or Feature, (start1, end1), (start2, end2), ... :type features: list of Features :param features: the features to build this Template on """ # determine the calling form: either # Template(Feature, args1, [args2, ...)] # Template(Feature1(args), Feature2(args), ...) if all(isinstance(f, Feature) for f in features): self._features = features elif issubclass(features[0], Feature) and all( isinstance(a, tuple) for a in features[1:] ): self._features = [features[0](*tp) for tp in features[1:]] else: raise TypeError( "expected either Feature1(args), Feature2(args), ... or Feature, (start1, end1), (start2, end2), ..." ) self.id = f"{len(self.ALLTEMPLATES):03d}" self.ALLTEMPLATES.append(self) def __repr__(self): return "{}({})".format( self.__class__.__name__, ",".join([str(f) for f in self._features]), ) def applicable_rules(self, tokens, index, correct_tag): if tokens[index][1] == correct_tag: return [] # For each of this Template's features, find the conditions # that are applicable for the given token. 
# Then, generate one Rule for each combination of features # (the crossproduct of the conditions). applicable_conditions = self._applicable_conditions(tokens, index) xs = list(it.product(*applicable_conditions)) return [Rule(self.id, tokens[index][1], correct_tag, tuple(x)) for x in xs] def _applicable_conditions(self, tokens, index): """ :returns: A set of all conditions for rules that are applicable to C{tokens[index]}. """ conditions = [] for feature in self._features: conditions.append([]) for pos in feature.positions: if not (0 <= index + pos < len(tokens)): continue value = feature.extract_property(tokens, index + pos) conditions[-1].append((feature, value)) return conditions def get_neighborhood(self, tokens, index): # inherit docs from BrillTemplateI # applicable_rules(tokens, index, ...) depends on index. neighborhood = {index} # set literal for python 2.7+ # applicable_rules(tokens, i, ...) depends on index if # i+start < index <= i+end. allpositions = [0] + [p for feat in self._features for p in feat.positions] start, end = min(allpositions), max(allpositions) s = max(0, index + (-end)) e = min(index + (-start) + 1, len(tokens)) for i in range(s, e): neighborhood.add(i) return neighborhood @classmethod def expand(cls, featurelists, combinations=None, skipintersecting=True): """ Factory method to mass generate Templates from a list L of lists of Features. #With combinations=(k1, k2), the function will in all possible ways choose k1 ... k2 #of the sublists in L; it will output all Templates formed by the Cartesian product #of this selection, with duplicates and other semantically equivalent #forms removed. Default for combinations is (1, len(L)). The feature lists may have been specified manually, or generated from Feature.expand(). For instance, >>> from nltk.tbl.template import Template >>> from nltk.tag.brill import Word, Pos #creating some features >>> (wd_0, wd_01) = (Word([0]), Word([0,1])) >>> (pos_m2, pos_m33) = (Pos([-2]), Pos([3-2,-1,0,1,2,3])) >>> list(Template.expand([[wd_0], [pos_m2]])) [Template(Word([0])), Template(Pos([-2])), Template(Pos([-2]),Word([0]))] >>> list(Template.expand([[wd_0, wd_01], [pos_m2]])) [Template(Word([0])), Template(Word([0, 1])), Template(Pos([-2])), Template(Pos([-2]),Word([0])), Template(Pos([-2]),Word([0, 1]))] #note: with Feature.expand(), it is very easy to generate more templates #than your system can handle -- for instance, >>> wordtpls = Word.expand([-2,-1,0,1], [1,2], excludezero=False) >>> len(wordtpls) 7 >>> postpls = Pos.expand([-3,-2,-1,0,1,2], [1,2,3], excludezero=True) >>> len(postpls) 9 #and now the Cartesian product of all non-empty combinations of two wordtpls and #two postpls, with semantic equivalents removed >>> templates = list(Template.expand([wordtpls, wordtpls, postpls, postpls])) >>> len(templates) 713 will return a list of eight templates Template(Word([0])), Template(Word([0, 1])), Template(Pos([-2])), Template(Pos([-1])), Template(Pos([-2]),Word([0])), Template(Pos([-1]),Word([0])), Template(Pos([-2]),Word([0, 1])), Template(Pos([-1]),Word([0, 1]))] #Templates where one feature is a subset of another, such as #Template(Word([0,1]), Word([1]), will not appear in the output. #By default, this non-subset constraint is tightened to disjointness: #Templates of type Template(Word([0,1]), Word([1,2]) will also be filtered out. 
#With skipintersecting=False, then such Templates are allowed WARNING: this method makes it very easy to fill all your memory when training generated templates on any real-world corpus :param featurelists: lists of Features, whose Cartesian product will return a set of Templates :type featurelists: list of (list of Features) :param combinations: given n featurelists: if combinations=k, all generated Templates will have k features; if combinations=(k1,k2) they will have k1..k2 features; if None, defaults to 1..n :type combinations: None, int, or (int, int) :param skipintersecting: if True, do not output intersecting Templates (non-disjoint positions for some feature) :type skipintersecting: bool :returns: generator of Templates """ def nonempty_powerset(xs): # xs is a list # itertools docnonempty_powerset([1,2,3]) --> (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3) # find the correct tuple given combinations, one of {None, k, (k1,k2)} k = combinations # for brevity combrange = ( (1, len(xs) + 1) if k is None else (k, k + 1) # n over 1 .. n over n (all non-empty combinations) if isinstance(k, int) else (k[0], k[1] + 1) # n over k (only ) # n over k1, n over k1+1... n over k2 return it.chain.from_iterable( it.combinations(xs, r) for r in range(*combrange) ) seentemplates = set() for picks in nonempty_powerset(featurelists): for pick in it.product(*picks): if any( i != j and x.issuperset(y) for (i, x) in enumerate(pick) for (j, y) in enumerate(pick) ): continue if skipintersecting and any( i != j and x.intersects(y) for (i, x) in enumerate(pick) for (j, y) in enumerate(pick) ): continue thistemplate = cls(*sorted(pick)) strpick = str(thistemplate) #!!FIXME --this is hackish if strpick in seentemplates: # already added cls._poptemplate() continue seentemplates.add(strpick) yield thistemplate @classmethod def _cleartemplates(cls): cls.ALLTEMPLATES = [] @classmethod def _poptemplate(cls): return cls.ALLTEMPLATES.pop() if cls.ALLTEMPLATES else None nltk-3.7/nltk/test/000077500000000000000000000000001420073152400142605ustar00rootroot00000000000000nltk-3.7/nltk/test/FX8.xml000066400000000000000000000234751420073152400154220ustar00rootroot00000000000000 General practitioner's surgery: medical consultation. Sample containing about 125 words speech recorded in public context Data capture and transcription Longman ELT BNC XML Edition, December 2006 125 tokens; 130 w-units; 15 s-units Distributed under licence by Oxford University Computing Services on behalf of the BNC Consortium. This material is protected by international copyright laws and may not be copied or redistributed in any way. Consult the BNC Web Site at http://www.natcorp.ox.ac.uk for full licencing and distribution conditions.FX8 093802 0000-00-00 Origination/creation date not known Doctor doctor other participants are doctors patientsUnknown speakerGroup of unknown speakersStrathclyde: Lanarkshire G.P.'s surgery Medical consultation S consult medicine medical consultation Tag usage updated for BNC-XMLLast check for BNC World first releaseCheck all tagcountsResequenced s-units and added headersRevised participant detailsAdded date infoUpdated all catrefsUpdated REC elements to include tape numberUpdated titlescorrected tagUsagePOS codes revised for BNC-2; header updatedInitial accession to corpus Ah there we are,. Right abdominal wound, she's a wee bit confused. She didn't bother to tell me that she'd only got to call you, right? 
Erm she wasn't in her nightdress but she only dressed herself, she said And you She said she went to buy something herself, she phoned the clinic and the clinic . She's here and says she should be fortnightly . So I don't know whether you want to go and see her rather than, I could get a doctor to go and see her and phone,. it's just that I'm never gonna get to up to. ? Yeah. Okay. Yeah. erm, first twelve weeks pregnant so should I mark at the bottom when she types . Erm this one. nltk-3.7/nltk/test/Makefile000066400000000000000000000006161420073152400157230ustar00rootroot00000000000000.SUFFIXES: .doctest .errs .html TESTS = $(wildcard *.doctest) ERRS := $(TESTS:.doctest=.errs) HTML = $(TESTS:.doctest=.html) # $(IPYNB:.ipynb=.html) IPYNB = $(wildcard *.ipynb) .doctest.errs: pytest $< > $@ .doctest.html: rst2html.py $< > $@ %.html: %.ipynb ipython nbconvert $< all: $(ERRS) html: $(HTML) install_html: cp $(HTML) ../../../nltk.github.com/howto clean: rm -f *.errs nltk-3.7/nltk/test/__init__.py000066400000000000000000000007251420073152400163750ustar00rootroot00000000000000# Natural Language Toolkit: Unit Tests # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # URL: # For license information, see LICENSE.TXT """ Unit tests for the NLTK modules. These tests are intended to ensure that source code changes don't accidentally introduce bugs. For instructions, please see: ../../web/dev/local_testing.rst https://github.com/nltk/nltk/blob/develop/web/dev/local_testing.rst """ nltk-3.7/nltk/test/all.py000066400000000000000000000014321420073152400154020ustar00rootroot00000000000000"""Test suite that runs all NLTK tests. This module, `nltk.test.all`, is named as the NLTK ``test_suite`` in the project's ``setup-eggs.py`` file. Here, we create a test suite that runs all of our doctests, and return it for processing by the setuptools test harness. """ import doctest import os.path import unittest from glob import glob def additional_tests(): # print("here-000000000000000") # print("-----", glob(os.path.join(os.path.dirname(__file__), '*.doctest'))) dir = os.path.dirname(__file__) paths = glob(os.path.join(dir, "*.doctest")) files = [os.path.basename(path) for path in paths] return unittest.TestSuite([doctest.DocFileSuite(file) for file in files]) # if os.path.split(path)[-1] != 'index.rst' # skips time-dependent doctest in index.rst nltk-3.7/nltk/test/bleu.doctest000066400000000000000000000015011420073152400165730ustar00rootroot00000000000000========== BLEU tests ========== >>> from nltk.translate import bleu If the candidate has no alignment to any of the references, the BLEU score is 0. >>> bleu( ... ['The candidate has no alignment to any of the references'.split()], ... 'John loves Mary'.split(), ... (1,), ... ) 0 This is an implementation of the smoothing techniques for segment-level BLEU scores that was presented in Boxing Chen and Collin Cherry (2014) A Systematic Comparison of Smoothing Techniques for Sentence-Level BLEU. In WMT14. http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf >>> from nltk.translate.bleu_score import sentence_bleu,SmoothingFunction >>> sentence_bleu( ... ['It is a place of quiet contemplation .'.split()], ... 'It is .'.split(), ... smoothing_function=SmoothingFunction().method4, ... )*100 4.4267... nltk-3.7/nltk/test/bnc.doctest000066400000000000000000000037071420073152400164200ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. 
For license information, see LICENSE.TXT >>> import os.path >>> from nltk.corpus.reader import BNCCorpusReader >>> import nltk.test >>> root = os.path.dirname(nltk.test.__file__) >>> bnc = BNCCorpusReader(root=root, fileids='FX8.xml') Checking the word access. ------------------------- >>> len(bnc.words()) 151 >>> bnc.words()[:6] ['Ah', 'there', 'we', 'are', ',', '.'] >>> bnc.words(stem=True)[:6] ['ah', 'there', 'we', 'be', ',', '.'] >>> bnc.tagged_words()[:6] [('Ah', 'INTERJ'), ('there', 'ADV'), ('we', 'PRON'), ('are', 'VERB'), (',', 'PUN'), ('.', 'PUN')] >>> bnc.tagged_words(c5=True)[:6] [('Ah', 'ITJ'), ('there', 'AV0'), ('we', 'PNP'), ('are', 'VBB'), (',', 'PUN'), ('.', 'PUN')] Testing access to the sentences. -------------------------------- >>> len(bnc.sents()) 15 >>> bnc.sents()[0] ['Ah', 'there', 'we', 'are', ',', '.'] >>> bnc.sents(stem=True)[0] ['ah', 'there', 'we', 'be', ',', '.'] >>> bnc.tagged_sents()[0] [('Ah', 'INTERJ'), ('there', 'ADV'), ('we', 'PRON'), ('are', 'VERB'), (',', 'PUN'), ('.', 'PUN')] >>> bnc.tagged_sents(c5=True)[0] [('Ah', 'ITJ'), ('there', 'AV0'), ('we', 'PNP'), ('are', 'VBB'), (',', 'PUN'), ('.', 'PUN')] A not lazy loader. ------------------ >>> eager = BNCCorpusReader(root=root, fileids=r'FX8.xml', lazy=False) >>> len(eager.words()) 151 >>> eager.words(stem=True)[6:17] ['right', 'abdominal', 'wound', ',', 'she', 'be', 'a', 'wee', 'bit', 'confuse', '.'] >>> eager.tagged_words()[6:11] [('Right', 'ADV'), ('abdominal', 'ADJ'), ('wound', 'SUBST'), (',', 'PUN'), ('she', 'PRON')] >>> eager.tagged_words(c5=True)[6:17] [('Right', 'AV0'), ('abdominal', 'AJ0'), ('wound', 'NN1'), (',', 'PUN'), ('she', 'PNP'), ("'s", 'VBZ'), ('a', 'AT0'), ('wee', 'AJ0-NN1'), ('bit', 'NN1'), ('confused', 'VVN-AJ0'), ('.', 'PUN')] >>> len(eager.sents()) 15 nltk-3.7/nltk/test/ccg.doctest000066400000000000000000000457221420073152400164150ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. For license information, see LICENSE.TXT ============================== Combinatory Categorial Grammar ============================== Relative Clauses ---------------- >>> from nltk.ccg import chart, lexicon Construct a lexicon: >>> lex = lexicon.fromstring(''' ... :- S, NP, N, VP ... ... Det :: NP/N ... Pro :: NP ... Modal :: S\\NP/VP ... ... TV :: VP/NP ... DTV :: TV/NP ... ... the => Det ... ... that => Det ... that => NP ... ... I => Pro ... you => Pro ... we => Pro ... ... chef => N ... cake => N ... children => N ... dough => N ... ... will => Modal ... should => Modal ... might => Modal ... must => Modal ... ... and => var\\.,var/.,var ... ... to => VP[to]/VP ... ... without => (VP\\VP)/VP[ing] ... ... be => TV ... cook => TV ... eat => TV ... ... cooking => VP[ing]/NP ... ... give => DTV ... ... is => (S\\NP)/NP ... prefer => (S\\NP)/NP ... ... which => (N\\N)/(S/NP) ... ... persuade => (VP/VP[to])/NP ... ''') >>> parser = chart.CCGChartParser(lex, chart.DefaultRuleSet) >>> for parse in parser.parse("you prefer that cake".split()): ... chart.printCCGDerivation(parse) ... break ... you prefer that cake NP ((S\NP)/NP) (NP/N) N --------------> NP ---------------------------> (S\NP) --------------------------------< S >>> for parse in parser.parse("that is the cake which you prefer".split()): ... chart.printCCGDerivation(parse) ... break ... 
that is the cake which you prefer NP ((S\NP)/NP) (NP/N) N ((N\N)/(S/NP)) NP ((S\NP)/NP) ----->T (S/(S\NP)) ------------------>B (S/NP) ----------------------------------> (N\N) ----------------------------------------< N ------------------------------------------------> NP -------------------------------------------------------------> (S\NP) -------------------------------------------------------------------< S Some other sentences to try: "that is the cake which we will persuade the chef to cook" "that is the cake which we will persuade the chef to give the children" >>> sent = "that is the dough which you will eat without cooking".split() >>> nosub_parser = chart.CCGChartParser(lex, chart.ApplicationRuleSet + ... chart.CompositionRuleSet + chart.TypeRaiseRuleSet) Without Substitution (no output) >>> for parse in nosub_parser.parse(sent): ... chart.printCCGDerivation(parse) With Substitution: >>> for parse in parser.parse(sent): ... chart.printCCGDerivation(parse) ... break ... that is the dough which you will eat without cooking NP ((S\NP)/NP) (NP/N) N ((N\N)/(S/NP)) NP ((S\NP)/VP) (VP/NP) ((VP\VP)/VP['ing']) (VP['ing']/NP) ----->T (S/(S\NP)) ------------------------------------->B ((VP\VP)/NP) ----------------------------------------------B ((S\NP)/NP) ---------------------------------------------------------------->B (S/NP) --------------------------------------------------------------------------------> (N\N) ---------------------------------------------------------------------------------------< N -----------------------------------------------------------------------------------------------> NP ------------------------------------------------------------------------------------------------------------> (S\NP) ------------------------------------------------------------------------------------------------------------------< S Conjunction ----------- >>> from nltk.ccg.chart import CCGChartParser, ApplicationRuleSet, CompositionRuleSet >>> from nltk.ccg.chart import SubstitutionRuleSet, TypeRaiseRuleSet, printCCGDerivation >>> from nltk.ccg import lexicon Lexicons for the tests: >>> test1_lex = ''' ... :- S,N,NP,VP ... I => NP ... you => NP ... will => S\\NP/VP ... cook => VP/NP ... which => (N\\N)/(S/NP) ... and => var\\.,var/.,var ... might => S\\NP/VP ... eat => VP/NP ... the => NP/N ... mushrooms => N ... parsnips => N''' >>> test2_lex = ''' ... :- N, S, NP, VP ... articles => N ... the => NP/N ... and => var\\.,var/.,var ... which => (N\\N)/(S/NP) ... I => NP ... anyone => NP ... will => (S/VP)\\NP ... file => VP/NP ... without => (VP\\VP)/VP[ing] ... forget => VP/NP ... reading => VP[ing]/NP ... ''' Tests handling of conjunctions. Note that while the two derivations are different, they are semantically equivalent. >>> lex = lexicon.fromstring(test1_lex) >>> parser = CCGChartParser(lex, ApplicationRuleSet + CompositionRuleSet + SubstitutionRuleSet) >>> for parse in parser.parse("I will cook and might eat the mushrooms and parsnips".split()): ... 
printCCGDerivation(parse) I will cook and might eat the mushrooms and parsnips NP ((S\NP)/VP) (VP/NP) ((_var0\.,_var0)/.,_var0) ((S\NP)/VP) (VP/NP) (NP/N) N ((_var0\.,_var0)/.,_var0) N ---------------------->B ((S\NP)/NP) ---------------------->B ((S\NP)/NP) -------------------------------------------------> (((S\NP)/NP)\.,((S\NP)/NP)) -----------------------------------------------------------------------< ((S\NP)/NP) -------------------------------------> (N\.,N) ------------------------------------------------< N --------------------------------------------------------> NP -------------------------------------------------------------------------------------------------------------------------------> (S\NP) -----------------------------------------------------------------------------------------------------------------------------------< S I will cook and might eat the mushrooms and parsnips NP ((S\NP)/VP) (VP/NP) ((_var0\.,_var0)/.,_var0) ((S\NP)/VP) (VP/NP) (NP/N) N ((_var0\.,_var0)/.,_var0) N ---------------------->B ((S\NP)/NP) ---------------------->B ((S\NP)/NP) -------------------------------------------------> (((S\NP)/NP)\.,((S\NP)/NP)) -----------------------------------------------------------------------< ((S\NP)/NP) ------------------------------------------------------------------------------->B ((S\NP)/N) -------------------------------------> (N\.,N) ------------------------------------------------< N -------------------------------------------------------------------------------------------------------------------------------> (S\NP) -----------------------------------------------------------------------------------------------------------------------------------< S Tests handling subject extraction. Interesting to point that the two parses are clearly semantically different. >>> lex = lexicon.fromstring(test2_lex) >>> parser = CCGChartParser(lex, ApplicationRuleSet + CompositionRuleSet + SubstitutionRuleSet) >>> for parse in parser.parse("articles which I will file and forget without reading".split()): ... 
printCCGDerivation(parse) articles which I will file and forget without reading N ((N\N)/(S/NP)) NP ((S/VP)\NP) (VP/NP) ((_var0\.,_var0)/.,_var0) (VP/NP) ((VP\VP)/VP['ing']) (VP['ing']/NP) -----------------< (S/VP) ------------------------------------->B ((VP\VP)/NP) ---------------------------------------------- ((VP/NP)\.,(VP/NP)) ----------------------------------------------------------------------------------< (VP/NP) --------------------------------------------------------------------------------------------------->B (S/NP) -------------------------------------------------------------------------------------------------------------------> (N\N) -----------------------------------------------------------------------------------------------------------------------------< N articles which I will file and forget without reading N ((N\N)/(S/NP)) NP ((S/VP)\NP) (VP/NP) ((_var0\.,_var0)/.,_var0) (VP/NP) ((VP\VP)/VP['ing']) (VP['ing']/NP) -----------------< (S/VP) ------------------------------------> ((VP/NP)\.,(VP/NP)) ---------------------------------------------< (VP/NP) ------------------------------------->B ((VP\VP)/NP) ----------------------------------------------------------------------------------B (S/NP) -------------------------------------------------------------------------------------------------------------------> (N\N) -----------------------------------------------------------------------------------------------------------------------------< N Unicode support --------------- Unicode words are supported. >>> from nltk.ccg import chart, lexicon Lexicons for the tests: >>> lex = lexicon.fromstring(''' ... :- S, N, NP, PP ... ... AdjI :: N\\N ... AdjD :: N/N ... AdvD :: S/S ... AdvI :: S\\S ... Det :: NP/N ... PrepNPCompl :: PP/NP ... PrepNAdjN :: S\\S/N ... PrepNAdjNP :: S\\S/NP ... VPNP :: S\\NP/NP ... VPPP :: S\\NP/PP ... VPser :: S\\NP/AdjI ... ... auto => N ... bebidas => N ... cine => N ... ley => N ... libro => N ... ministro => N ... panadería => N ... presidente => N ... super => N ... ... el => Det ... la => Det ... las => Det ... un => Det ... ... Ana => NP ... Pablo => NP ... ... y => var\\.,var/.,var ... ... pero => (S/NP)\\(S/NP)/(S/NP) ... ... anunció => VPNP ... compró => VPNP ... cree => S\\NP/S[dep] ... desmintió => VPNP ... lee => VPNP ... fueron => VPPP ... ... es => VPser ... ... interesante => AdjD ... interesante => AdjI ... nueva => AdjD ... nueva => AdjI ... ... a => PrepNPCompl ... en => PrepNAdjN ... en => PrepNAdjNP ... ... ayer => AdvI ... ... que => (NP\\NP)/(S/NP) ... que => S[dep]/S ... ''') >>> parser = chart.CCGChartParser(lex, chart.DefaultRuleSet) >>> for parse in parser.parse(u"el ministro anunció pero el presidente desmintió la nueva ley".split()): ... printCCGDerivation(parse) # doctest: +SKIP ... # it fails on python2.7 because of the unicode problem explained in https://github.com/nltk/nltk/pull/1354 ... break el ministro anunció pero el presidente desmintió la nueva ley (NP/N) N ((S\NP)/NP) (((S/NP)\(S/NP))/(S/NP)) (NP/N) N ((S\NP)/NP) (NP/N) (N/N) N ------------------> NP ------------------>T (S/(S\NP)) --------------------> NP -------------------->T (S/(S\NP)) --------------------------------->B (S/NP) -----------------------------------------------------------> ((S/NP)\(S/NP)) ------------> N --------------------> NP -------------------- S nltk-3.7/nltk/test/ccg_semantics.doctest000066400000000000000000000734621420073152400204650ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. 
For license information, see LICENSE.TXT ============================================== Combinatory Categorial Grammar with semantics ============================================== ----- Chart ----- >>> from nltk.ccg import chart, lexicon >>> from nltk.ccg.chart import printCCGDerivation No semantics ------------------- >>> lex = lexicon.fromstring(''' ... :- S, NP, N ... She => NP ... has => (S\\NP)/NP ... books => NP ... ''', ... False) >>> parser = chart.CCGChartParser(lex, chart.DefaultRuleSet) >>> parses = list(parser.parse("She has books".split())) >>> print(str(len(parses)) + " parses") 3 parses >>> printCCGDerivation(parses[0]) She has books NP ((S\NP)/NP) NP --------------------> (S\NP) -------------------------< S >>> printCCGDerivation(parses[1]) She has books NP ((S\NP)/NP) NP ----->T (S/(S\NP)) --------------------> (S\NP) -------------------------> S >>> printCCGDerivation(parses[2]) She has books NP ((S\NP)/NP) NP ----->T (S/(S\NP)) ------------------>B (S/NP) -------------------------> S Simple semantics ------------------- >>> lex = lexicon.fromstring(''' ... :- S, NP, N ... She => NP {she} ... has => (S\\NP)/NP {\\x y.have(y, x)} ... a => NP/N {\\P.exists z.P(z)} ... book => N {book} ... ''', ... True) >>> parser = chart.CCGChartParser(lex, chart.DefaultRuleSet) >>> parses = list(parser.parse("She has a book".split())) >>> print(str(len(parses)) + " parses") 7 parses >>> printCCGDerivation(parses[0]) She has a book NP {she} ((S\NP)/NP) {\x y.have(y,x)} (NP/N) {\P.exists z.P(z)} N {book} -------------------------------------> NP {exists z.book(z)} -------------------------------------------------------------------> (S\NP) {\y.have(y,exists z.book(z))} -----------------------------------------------------------------------------< S {have(she,exists z.book(z))} >>> printCCGDerivation(parses[1]) She has a book NP {she} ((S\NP)/NP) {\x y.have(y,x)} (NP/N) {\P.exists z.P(z)} N {book} --------------------------------------------------------->B ((S\NP)/N) {\P y.have(y,exists z.P(z))} -------------------------------------------------------------------> (S\NP) {\y.have(y,exists z.book(z))} -----------------------------------------------------------------------------< S {have(she,exists z.book(z))} >>> printCCGDerivation(parses[2]) She has a book NP {she} ((S\NP)/NP) {\x y.have(y,x)} (NP/N) {\P.exists z.P(z)} N {book} ---------->T (S/(S\NP)) {\F.F(she)} -------------------------------------> NP {exists z.book(z)} -------------------------------------------------------------------> (S\NP) {\y.have(y,exists z.book(z))} -----------------------------------------------------------------------------> S {have(she,exists z.book(z))} >>> printCCGDerivation(parses[3]) She has a book NP {she} ((S\NP)/NP) {\x y.have(y,x)} (NP/N) {\P.exists z.P(z)} N {book} ---------->T (S/(S\NP)) {\F.F(she)} --------------------------------------------------------->B ((S\NP)/N) {\P y.have(y,exists z.P(z))} -------------------------------------------------------------------> (S\NP) {\y.have(y,exists z.book(z))} -----------------------------------------------------------------------------> S {have(she,exists z.book(z))} >>> printCCGDerivation(parses[4]) She has a book NP {she} ((S\NP)/NP) {\x y.have(y,x)} (NP/N) {\P.exists z.P(z)} N {book} ---------->T (S/(S\NP)) {\F.F(she)} ---------------------------------------->B (S/NP) {\x.have(she,x)} -------------------------------------> NP {exists z.book(z)} -----------------------------------------------------------------------------> S {have(she,exists z.book(z))} >>> 
printCCGDerivation(parses[5]) She has a book NP {she} ((S\NP)/NP) {\x y.have(y,x)} (NP/N) {\P.exists z.P(z)} N {book} ---------->T (S/(S\NP)) {\F.F(she)} --------------------------------------------------------->B ((S\NP)/N) {\P y.have(y,exists z.P(z))} ------------------------------------------------------------------->B (S/N) {\P.have(she,exists z.P(z))} -----------------------------------------------------------------------------> S {have(she,exists z.book(z))} >>> printCCGDerivation(parses[6]) She has a book NP {she} ((S\NP)/NP) {\x y.have(y,x)} (NP/N) {\P.exists z.P(z)} N {book} ---------->T (S/(S\NP)) {\F.F(she)} ---------------------------------------->B (S/NP) {\x.have(she,x)} ------------------------------------------------------------------->B (S/N) {\P.have(she,exists z.P(z))} -----------------------------------------------------------------------------> S {have(she,exists z.book(z))} Complex semantics ------------------- >>> lex = lexicon.fromstring(''' ... :- S, NP, N ... She => NP {she} ... has => (S\\NP)/NP {\\x y.have(y, x)} ... a => ((S\\NP)\\((S\\NP)/NP))/N {\\P R x.(exists z.P(z) & R(z,x))} ... book => N {book} ... ''', ... True) >>> parser = chart.CCGChartParser(lex, chart.DefaultRuleSet) >>> parses = list(parser.parse("She has a book".split())) >>> print(str(len(parses)) + " parses") 2 parses >>> printCCGDerivation(parses[0]) She has a book NP {she} ((S\NP)/NP) {\x y.have(y,x)} (((S\NP)\((S\NP)/NP))/N) {\P R x.(exists z.P(z) & R(z,x))} N {book} ----------------------------------------------------------------------> ((S\NP)\((S\NP)/NP)) {\R x.(exists z.book(z) & R(z,x))} ----------------------------------------------------------------------------------------------------< (S\NP) {\x.(exists z.book(z) & have(x,z))} --------------------------------------------------------------------------------------------------------------< S {(exists z.book(z) & have(she,z))} >>> printCCGDerivation(parses[1]) She has a book NP {she} ((S\NP)/NP) {\x y.have(y,x)} (((S\NP)\((S\NP)/NP))/N) {\P R x.(exists z.P(z) & R(z,x))} N {book} ---------->T (S/(S\NP)) {\F.F(she)} ----------------------------------------------------------------------> ((S\NP)\((S\NP)/NP)) {\R x.(exists z.book(z) & R(z,x))} ----------------------------------------------------------------------------------------------------< (S\NP) {\x.(exists z.book(z) & have(x,z))} --------------------------------------------------------------------------------------------------------------> S {(exists z.book(z) & have(she,z))} Using conjunctions --------------------- # TODO: The semantics of "and" should have been more flexible >>> lex = lexicon.fromstring(''' ... :- S, NP, N ... I => NP {I} ... cook => (S\\NP)/NP {\\x y.cook(x,y)} ... and => var\\.,var/.,var {\\P Q x y.(P(x,y) & Q(x,y))} ... eat => (S\\NP)/NP {\\x y.eat(x,y)} ... the => NP/N {\\x.the(x)} ... bacon => N {bacon} ... ''', ... 
True) >>> parser = chart.CCGChartParser(lex, chart.DefaultRuleSet) >>> parses = list(parser.parse("I cook and eat the bacon".split())) >>> print(str(len(parses)) + " parses") 7 parses >>> printCCGDerivation(parses[0]) I cook and eat the bacon NP {I} ((S\NP)/NP) {\x y.cook(x,y)} ((_var0\.,_var0)/.,_var0) {\P Q x y.(P(x,y) & Q(x,y))} ((S\NP)/NP) {\x y.eat(x,y)} (NP/N) {\x.the(x)} N {bacon} -------------------------------------------------------------------------------------> (((S\NP)/NP)\.,((S\NP)/NP)) {\Q x y.(eat(x,y) & Q(x,y))} -------------------------------------------------------------------------------------------------------------------< ((S\NP)/NP) {\x y.(eat(x,y) & cook(x,y))} -------------------------------> NP {the(bacon)} --------------------------------------------------------------------------------------------------------------------------------------------------> (S\NP) {\y.(eat(the(bacon),y) & cook(the(bacon),y))} ----------------------------------------------------------------------------------------------------------------------------------------------------------< S {(eat(the(bacon),I) & cook(the(bacon),I))} >>> printCCGDerivation(parses[1]) I cook and eat the bacon NP {I} ((S\NP)/NP) {\x y.cook(x,y)} ((_var0\.,_var0)/.,_var0) {\P Q x y.(P(x,y) & Q(x,y))} ((S\NP)/NP) {\x y.eat(x,y)} (NP/N) {\x.the(x)} N {bacon} -------------------------------------------------------------------------------------> (((S\NP)/NP)\.,((S\NP)/NP)) {\Q x y.(eat(x,y) & Q(x,y))} -------------------------------------------------------------------------------------------------------------------< ((S\NP)/NP) {\x y.(eat(x,y) & cook(x,y))} --------------------------------------------------------------------------------------------------------------------------------------->B ((S\NP)/N) {\x y.(eat(the(x),y) & cook(the(x),y))} --------------------------------------------------------------------------------------------------------------------------------------------------> (S\NP) {\y.(eat(the(bacon),y) & cook(the(bacon),y))} ----------------------------------------------------------------------------------------------------------------------------------------------------------< S {(eat(the(bacon),I) & cook(the(bacon),I))} >>> printCCGDerivation(parses[2]) I cook and eat the bacon NP {I} ((S\NP)/NP) {\x y.cook(x,y)} ((_var0\.,_var0)/.,_var0) {\P Q x y.(P(x,y) & Q(x,y))} ((S\NP)/NP) {\x y.eat(x,y)} (NP/N) {\x.the(x)} N {bacon} -------->T (S/(S\NP)) {\F.F(I)} -------------------------------------------------------------------------------------> (((S\NP)/NP)\.,((S\NP)/NP)) {\Q x y.(eat(x,y) & Q(x,y))} -------------------------------------------------------------------------------------------------------------------< ((S\NP)/NP) {\x y.(eat(x,y) & cook(x,y))} -------------------------------> NP {the(bacon)} --------------------------------------------------------------------------------------------------------------------------------------------------> (S\NP) {\y.(eat(the(bacon),y) & cook(the(bacon),y))} ----------------------------------------------------------------------------------------------------------------------------------------------------------> S {(eat(the(bacon),I) & cook(the(bacon),I))} >>> printCCGDerivation(parses[3]) I cook and eat the bacon NP {I} ((S\NP)/NP) {\x y.cook(x,y)} ((_var0\.,_var0)/.,_var0) {\P Q x y.(P(x,y) & Q(x,y))} ((S\NP)/NP) {\x y.eat(x,y)} (NP/N) {\x.the(x)} N {bacon} -------->T (S/(S\NP)) {\F.F(I)} 
-------------------------------------------------------------------------------------> (((S\NP)/NP)\.,((S\NP)/NP)) {\Q x y.(eat(x,y) & Q(x,y))} -------------------------------------------------------------------------------------------------------------------< ((S\NP)/NP) {\x y.(eat(x,y) & cook(x,y))} --------------------------------------------------------------------------------------------------------------------------------------->B ((S\NP)/N) {\x y.(eat(the(x),y) & cook(the(x),y))} --------------------------------------------------------------------------------------------------------------------------------------------------> (S\NP) {\y.(eat(the(bacon),y) & cook(the(bacon),y))} ----------------------------------------------------------------------------------------------------------------------------------------------------------> S {(eat(the(bacon),I) & cook(the(bacon),I))} >>> printCCGDerivation(parses[4]) I cook and eat the bacon NP {I} ((S\NP)/NP) {\x y.cook(x,y)} ((_var0\.,_var0)/.,_var0) {\P Q x y.(P(x,y) & Q(x,y))} ((S\NP)/NP) {\x y.eat(x,y)} (NP/N) {\x.the(x)} N {bacon} -------->T (S/(S\NP)) {\F.F(I)} -------------------------------------------------------------------------------------> (((S\NP)/NP)\.,((S\NP)/NP)) {\Q x y.(eat(x,y) & Q(x,y))} -------------------------------------------------------------------------------------------------------------------< ((S\NP)/NP) {\x y.(eat(x,y) & cook(x,y))} --------------------------------------------------------------------------------------------------------------------------->B (S/NP) {\x.(eat(x,I) & cook(x,I))} -------------------------------> NP {the(bacon)} ----------------------------------------------------------------------------------------------------------------------------------------------------------> S {(eat(the(bacon),I) & cook(the(bacon),I))} >>> printCCGDerivation(parses[5]) I cook and eat the bacon NP {I} ((S\NP)/NP) {\x y.cook(x,y)} ((_var0\.,_var0)/.,_var0) {\P Q x y.(P(x,y) & Q(x,y))} ((S\NP)/NP) {\x y.eat(x,y)} (NP/N) {\x.the(x)} N {bacon} -------->T (S/(S\NP)) {\F.F(I)} -------------------------------------------------------------------------------------> (((S\NP)/NP)\.,((S\NP)/NP)) {\Q x y.(eat(x,y) & Q(x,y))} -------------------------------------------------------------------------------------------------------------------< ((S\NP)/NP) {\x y.(eat(x,y) & cook(x,y))} --------------------------------------------------------------------------------------------------------------------------------------->B ((S\NP)/N) {\x y.(eat(the(x),y) & cook(the(x),y))} ----------------------------------------------------------------------------------------------------------------------------------------------->B (S/N) {\x.(eat(the(x),I) & cook(the(x),I))} ----------------------------------------------------------------------------------------------------------------------------------------------------------> S {(eat(the(bacon),I) & cook(the(bacon),I))} >>> printCCGDerivation(parses[6]) I cook and eat the bacon NP {I} ((S\NP)/NP) {\x y.cook(x,y)} ((_var0\.,_var0)/.,_var0) {\P Q x y.(P(x,y) & Q(x,y))} ((S\NP)/NP) {\x y.eat(x,y)} (NP/N) {\x.the(x)} N {bacon} -------->T (S/(S\NP)) {\F.F(I)} -------------------------------------------------------------------------------------> (((S\NP)/NP)\.,((S\NP)/NP)) {\Q x y.(eat(x,y) & Q(x,y))} -------------------------------------------------------------------------------------------------------------------< ((S\NP)/NP) {\x y.(eat(x,y) & cook(x,y))} 
--------------------------------------------------------------------------------------------------------------------------->B (S/NP) {\x.(eat(x,I) & cook(x,I))} ----------------------------------------------------------------------------------------------------------------------------------------------->B (S/N) {\x.(eat(the(x),I) & cook(the(x),I))} ----------------------------------------------------------------------------------------------------------------------------------------------------------> S {(eat(the(bacon),I) & cook(the(bacon),I))} Tests from published papers ------------------------------ An example from "CCGbank: A Corpus of CCG Derivations and Dependency Structures Extracted from the Penn Treebank", Hockenmaier and Steedman, 2007, Page 359, https://www.aclweb.org/anthology/J/J07/J07-3004.pdf >>> lex = lexicon.fromstring(''' ... :- S, NP ... I => NP {I} ... give => ((S\\NP)/NP)/NP {\\x y z.give(y,x,z)} ... them => NP {them} ... money => NP {money} ... ''', ... True) >>> parser = chart.CCGChartParser(lex, chart.DefaultRuleSet) >>> parses = list(parser.parse("I give them money".split())) >>> print(str(len(parses)) + " parses") 3 parses >>> printCCGDerivation(parses[0]) I give them money NP {I} (((S\NP)/NP)/NP) {\x y z.give(y,x,z)} NP {them} NP {money} --------------------------------------------------> ((S\NP)/NP) {\y z.give(y,them,z)} --------------------------------------------------------------> (S\NP) {\z.give(money,them,z)} ----------------------------------------------------------------------< S {give(money,them,I)} >>> printCCGDerivation(parses[1]) I give them money NP {I} (((S\NP)/NP)/NP) {\x y z.give(y,x,z)} NP {them} NP {money} -------->T (S/(S\NP)) {\F.F(I)} --------------------------------------------------> ((S\NP)/NP) {\y z.give(y,them,z)} --------------------------------------------------------------> (S\NP) {\z.give(money,them,z)} ----------------------------------------------------------------------> S {give(money,them,I)} >>> printCCGDerivation(parses[2]) I give them money NP {I} (((S\NP)/NP)/NP) {\x y z.give(y,x,z)} NP {them} NP {money} -------->T (S/(S\NP)) {\F.F(I)} --------------------------------------------------> ((S\NP)/NP) {\y z.give(y,them,z)} ---------------------------------------------------------->B (S/NP) {\y.give(y,them,I)} ----------------------------------------------------------------------> S {give(money,them,I)} An example from "CCGbank: A Corpus of CCG Derivations and Dependency Structures Extracted from the Penn Treebank", Hockenmaier and Steedman, 2007, Page 359, https://www.aclweb.org/anthology/J/J07/J07-3004.pdf >>> lex = lexicon.fromstring(''' ... :- N, NP, S ... money => N {money} ... that => (N\\N)/(S/NP) {\\P Q x.(P(x) & Q(x))} ... I => NP {I} ... give => ((S\\NP)/NP)/NP {\\x y z.give(y,x,z)} ... them => NP {them} ... ''', ... 
True) >>> parser = chart.CCGChartParser(lex, chart.DefaultRuleSet) >>> parses = list(parser.parse("money that I give them".split())) >>> print(str(len(parses)) + " parses") 3 parses >>> printCCGDerivation(parses[0]) money that I give them N {money} ((N\N)/(S/NP)) {\P Q x.(P(x) & Q(x))} NP {I} (((S\NP)/NP)/NP) {\x y z.give(y,x,z)} NP {them} -------->T (S/(S\NP)) {\F.F(I)} --------------------------------------------------> ((S\NP)/NP) {\y z.give(y,them,z)} ---------------------------------------------------------->B (S/NP) {\y.give(y,them,I)} -------------------------------------------------------------------------------------------------> (N\N) {\Q x.(give(x,them,I) & Q(x))} ------------------------------------------------------------------------------------------------------------< N {\x.(give(x,them,I) & money(x))} >>> printCCGDerivation(parses[1]) money that I give them N {money} ((N\N)/(S/NP)) {\P Q x.(P(x) & Q(x))} NP {I} (((S\NP)/NP)/NP) {\x y z.give(y,x,z)} NP {them} ----------->T (N/(N\N)) {\F.F(money)} -------->T (S/(S\NP)) {\F.F(I)} --------------------------------------------------> ((S\NP)/NP) {\y z.give(y,them,z)} ---------------------------------------------------------->B (S/NP) {\y.give(y,them,I)} -------------------------------------------------------------------------------------------------> (N\N) {\Q x.(give(x,them,I) & Q(x))} ------------------------------------------------------------------------------------------------------------> N {\x.(give(x,them,I) & money(x))} >>> printCCGDerivation(parses[2]) money that I give them N {money} ((N\N)/(S/NP)) {\P Q x.(P(x) & Q(x))} NP {I} (((S\NP)/NP)/NP) {\x y z.give(y,x,z)} NP {them} ----------->T (N/(N\N)) {\F.F(money)} -------------------------------------------------->B (N/(S/NP)) {\P x.(P(x) & money(x))} -------->T (S/(S\NP)) {\F.F(I)} --------------------------------------------------> ((S\NP)/NP) {\y z.give(y,them,z)} ---------------------------------------------------------->B (S/NP) {\y.give(y,them,I)} ------------------------------------------------------------------------------------------------------------> N {\x.(give(x,them,I) & money(x))} ------- Lexicon ------- >>> from nltk.ccg import lexicon Parse lexicon with semantics >>> print(str(lexicon.fromstring( ... ''' ... :- S,NP ... ... IntransVsg :: S\\NP[sg] ... ... sleeps => IntransVsg {\\x.sleep(x)} ... eats => S\\NP[sg]/NP {\\x y.eat(x,y)} ... ... and => var\\var/var {\\x y.x & y} ... ''', ... True ... ))) and => ((_var0\_var0)/_var0) {(\x y.x & y)} eats => ((S\NP['sg'])/NP) {\x y.eat(x,y)} sleeps => (S\NP['sg']) {\x.sleep(x)} Parse lexicon without semantics >>> print(str(lexicon.fromstring( ... ''' ... :- S,NP ... ... IntransVsg :: S\\NP[sg] ... ... sleeps => IntransVsg ... eats => S\\NP[sg]/NP {sem=\\x y.eat(x,y)} ... ... and => var\\var/var ... ''', ... False ... ))) and => ((_var0\_var0)/_var0) eats => ((S\NP['sg'])/NP) sleeps => (S\NP['sg']) Semantics are missing >>> print(str(lexicon.fromstring( ... ''' ... :- S,NP ... ... eats => S\\NP[sg]/NP ... ''', ... True ... ))) Traceback (most recent call last): ... 
AssertionError: eats => S\NP[sg]/NP must contain semantics because include_semantics is set to True ------------------------------------ CCG combinator semantics computation ------------------------------------ >>> from nltk.sem.logic import * >>> from nltk.ccg.logic import * >>> read_expr = Expression.fromstring Compute semantics from function application >>> print(str(compute_function_semantics(read_expr(r'\x.P(x)'), read_expr(r'book')))) P(book) >>> print(str(compute_function_semantics(read_expr(r'\P.P(book)'), read_expr(r'read')))) read(book) >>> print(str(compute_function_semantics(read_expr(r'\P.P(book)'), read_expr(r'\x.read(x)')))) read(book) Compute semantics from composition >>> print(str(compute_composition_semantics(read_expr(r'\x.P(x)'), read_expr(r'\x.Q(x)')))) \x.P(Q(x)) >>> print(str(compute_composition_semantics(read_expr(r'\x.P(x)'), read_expr(r'read')))) Traceback (most recent call last): ... AssertionError: `read` must be a lambda expression Compute semantics from substitution >>> print(str(compute_substitution_semantics(read_expr(r'\x y.P(x,y)'), read_expr(r'\x.Q(x)')))) \x.P(x,Q(x)) >>> print(str(compute_substitution_semantics(read_expr(r'\x.P(x)'), read_expr(r'read')))) Traceback (most recent call last): ... AssertionError: `\x.P(x)` must be a lambda expression with 2 arguments Compute type-raise semantics >>> print(str(compute_type_raised_semantics(read_expr(r'\x.P(x)')))) \F x.F(P(x)) >>> print(str(compute_type_raised_semantics(read_expr(r'\x.F(x)')))) \F1 x.F1(F(x)) >>> print(str(compute_type_raised_semantics(read_expr(r'\x y z.P(x,y,z)')))) \F x y z.F(P(x,y,z)) nltk-3.7/nltk/test/chat80.doctest000066400000000000000000000204671420073152400167470ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. For license information, see LICENSE.TXT ======= Chat-80 ======= Chat-80 was a natural language system which allowed the user to interrogate a Prolog knowledge base in the domain of world geography. It was developed in the early '80s by Warren and Pereira; see ``_ for a description and ``_ for the source files. The ``chat80`` module contains functions to extract data from the Chat-80 relation files ('the world database'), and convert then into a format that can be incorporated in the FOL models of ``nltk.sem.evaluate``. The code assumes that the Prolog input files are available in the NLTK corpora directory. The Chat-80 World Database consists of the following files:: world0.pl rivers.pl cities.pl countries.pl contain.pl borders.pl This module uses a slightly modified version of ``world0.pl``, in which a set of Prolog rules have been omitted. The modified file is named ``world1.pl``. Currently, the file ``rivers.pl`` is not read in, since it uses a list rather than a string in the second field. Reading Chat-80 Files ===================== Chat-80 relations are like tables in a relational database. The relation acts as the name of the table; the first argument acts as the 'primary key'; and subsequent arguments are further fields in the table. In general, the name of the table provides a label for a unary predicate whose extension is all the primary keys. For example, relations in ``cities.pl`` are of the following form:: 'city(athens,greece,1368).' Here, ``'athens'`` is the key, and will be mapped to a member of the unary predicate *city*. By analogy with NLTK corpora, ``chat80`` defines a number of 'items' which correspond to the relations. 
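Before turning to the module itself, the table-to-predicate mapping just described can be pictured with a small self-contained sketch; the helper below is purely illustrative and is not part of ``chat80``::

    # A self-contained sketch of the table-to-predicate mapping: the relation
    # label becomes a unary predicate over the primary keys, and each further
    # field becomes a binary predicate.  The helper name is illustrative only.
    def tuples_to_predicates(label, schema, rows):
        unary = {label: {row[0] for row in rows}}
        binary = {
            f"{field}_of": {(row[0], row[i]) for row in rows}
            for i, field in enumerate(schema[1:], start=1)
        }
        return unary, binary

    rows = [("athens", "greece", "1368"), ("bangkok", "thailand", "1178")]
    unary, binary = tuples_to_predicates("city", ["city", "country", "population"], rows)
    # unary["city"]        -> {'athens', 'bangkok'}
    # binary["country_of"] -> {('athens', 'greece'), ('bangkok', 'thailand')}

The examples that follow use the real extraction machinery.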
>>> from nltk.sem import chat80 >>> print(chat80.items) ('borders', 'circle_of_lat', 'circle_of_long', 'city', ...) The fields in the table are mapped to binary predicates. The first argument of the predicate is the primary key, while the second argument is the data in the relevant field. Thus, in the above example, the third field is mapped to the binary predicate *population_of*, whose extension is a set of pairs such as ``'(athens, 1368)'``. An exception to this general framework is required by the relations in the files ``borders.pl`` and ``contains.pl``. These contain facts of the following form:: 'borders(albania,greece).' 'contains0(africa,central_africa).' We do not want to form a unary concept out the element in the first field of these records, and we want the label of the binary relation just to be ``'border'``/``'contain'`` respectively. In order to drive the extraction process, we use 'relation metadata bundles' which are Python dictionaries such as the following:: city = {'label': 'city', 'closures': [], 'schema': ['city', 'country', 'population'], 'filename': 'cities.pl'} According to this, the file ``city['filename']`` contains a list of relational tuples (or more accurately, the corresponding strings in Prolog form) whose predicate symbol is ``city['label']`` and whose relational schema is ``city['schema']``. The notion of a ``closure`` is discussed in the next section. Concepts ======== In order to encapsulate the results of the extraction, a class of ``Concept``\ s is introduced. A ``Concept`` object has a number of attributes, in particular a ``prefLabel``, an arity and ``extension``. >>> c1 = chat80.Concept('dog', arity=1, extension=set(['d1', 'd2'])) >>> print(c1) Label = 'dog' Arity = 1 Extension = ['d1', 'd2'] The ``extension`` attribute makes it easier to inspect the output of the extraction. >>> schema = ['city', 'country', 'population'] >>> concepts = chat80.clause2concepts('cities.pl', 'city', schema) >>> concepts [Concept('city'), Concept('country_of'), Concept('population_of')] >>> for c in concepts: ... print("%s:\n\t%s" % (c.prefLabel, c.extension[:4])) city: ['athens', 'bangkok', 'barcelona', 'berlin'] country_of: [('athens', 'greece'), ('bangkok', 'thailand'), ('barcelona', 'spain'), ('berlin', 'east_germany')] population_of: [('athens', '1368'), ('bangkok', '1178'), ('barcelona', '1280'), ('berlin', '3481')] In addition, the ``extension`` can be further processed: in the case of the ``'border'`` relation, we check that the relation is **symmetric**, and in the case of the ``'contain'`` relation, we carry out the **transitive closure**. The closure properties associated with a concept is indicated in the relation metadata, as indicated earlier. >>> borders = set([('a1', 'a2'), ('a2', 'a3')]) >>> c2 = chat80.Concept('borders', arity=2, extension=borders) >>> print(c2) Label = 'borders' Arity = 2 Extension = [('a1', 'a2'), ('a2', 'a3')] >>> c3 = chat80.Concept('borders', arity=2, closures=['symmetric'], extension=borders) >>> c3.close() >>> print(c3) Label = 'borders' Arity = 2 Extension = [('a1', 'a2'), ('a2', 'a1'), ('a2', 'a3'), ('a3', 'a2')] The ``extension`` of a ``Concept`` object is then incorporated into a ``Valuation`` object. Persistence =========== The functions ``val_dump`` and ``val_load`` are provided to allow a valuation to be stored in a persistent database and re-loaded, rather than having to be re-computed each time. 
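As a hedged sketch of how such a round trip might look (the database name ``chat80_valuation`` is arbitrary, and the module-level ``chat80.rels`` list of relation metadata bundles is assumed here)::

    # Sketch only: dump the Chat-80 valuation to a small on-disk database
    # and load it back instead of re-extracting it.
    from nltk.sem import chat80

    chat80.val_dump(chat80.rels, "chat80_valuation")  # assumed metadata list; writes an on-disk database
    val = chat80.val_load("chat80_valuation")         # re-load the stored Valuation
    print(len(val))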
Individuals and Lexical Items ============================= As well as deriving relations from the Chat-80 data, we also create a set of individual constants, one for each entity in the domain. The individual constants are string-identical to the entities. For example, given a data item such as ``'zloty'``, we add to the valuation a pair ``('zloty', 'zloty')``. In order to parse English sentences that refer to these entities, we also create a lexical item such as the following for each individual constant:: PropN[num=sg, sem=<\P.(P zloty)>] -> 'Zloty' The set of rules is written to the file ``chat_pnames.fcfg`` in the current directory. SQL Query ========= The ``city`` relation is also available in RDB form and can be queried using SQL statements. >>> import nltk >>> q = "SELECT City, Population FROM city_table WHERE Country = 'china' and Population > 1000" >>> for answer in chat80.sql_query('corpora/city_database/city.db', q): ... print("%-10s %4s" % answer) canton 1496 chungking 1100 mukden 1551 peking 2031 shanghai 5407 tientsin 1795 The (deliberately naive) grammar ``sql.fcfg`` translates from English to SQL: >>> nltk.data.show_cfg('grammars/book_grammars/sql0.fcfg') % start S S[SEM=(?np + WHERE + ?vp)] -> NP[SEM=?np] VP[SEM=?vp] VP[SEM=(?v + ?pp)] -> IV[SEM=?v] PP[SEM=?pp] VP[SEM=(?v + ?ap)] -> IV[SEM=?v] AP[SEM=?ap] NP[SEM=(?det + ?n)] -> Det[SEM=?det] N[SEM=?n] PP[SEM=(?p + ?np)] -> P[SEM=?p] NP[SEM=?np] AP[SEM=?pp] -> A[SEM=?a] PP[SEM=?pp] NP[SEM='Country="greece"'] -> 'Greece' NP[SEM='Country="china"'] -> 'China' Det[SEM='SELECT'] -> 'Which' | 'What' N[SEM='City FROM city_table'] -> 'cities' IV[SEM=''] -> 'are' A[SEM=''] -> 'located' P[SEM=''] -> 'in' Given this grammar, we can express, and then execute, queries in English. >>> cp = nltk.parse.load_parser('grammars/book_grammars/sql0.fcfg') >>> query = 'What cities are in China' >>> for tree in cp.parse(query.split()): ... answer = tree.label()['SEM'] ... q = " ".join(answer) ... print(q) ... SELECT City FROM city_table WHERE Country="china" >>> rows = chat80.sql_query('corpora/city_database/city.db', q) >>> for r in rows: print("%s" % r, end=' ') canton chungking dairen harbin kowloon mukden peking shanghai sian tientsin Using Valuations ----------------- In order to convert such an extension into a valuation, we use the ``make_valuation()`` method; setting ``read=True`` creates and returns a new ``Valuation`` object which contains the results. >>> val = chat80.make_valuation(concepts, read=True) >>> 'calcutta' in val['city'] True >>> [town for (town, country) in val['country_of'] if country == 'india'] ['bombay', 'calcutta', 'delhi', 'hyderabad', 'madras'] >>> dom = val.domain >>> g = nltk.sem.Assignment(dom) >>> m = nltk.sem.Model(dom, val) >>> m.evaluate(r'population_of(jakarta, 533)', g) True nltk-3.7/nltk/test/childes.doctest000066400000000000000000000217251420073152400172710ustar00rootroot00000000000000======================= CHILDES Corpus Readers ======================= Read the XML version of the CHILDES corpus. Setup ===== >>> from nltk.test.childes_fixt import setup_module >>> setup_module() How to use CHILDESCorpusReader ============================== Read the CHILDESCorpusReader class and read the CHILDES corpus saved in the nltk_data directory. >>> import nltk >>> from nltk.corpus.reader import CHILDESCorpusReader >>> corpus_root = nltk.data.find('corpora/childes/data-xml/Eng-USA-MOR/') Reading files in the Valian corpus (Valian, 1991). 
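Since the CHILDES XML data has to be downloaded and unpacked by hand, it can help to guard the setup above, in the spirit of ``childes_fixt.py`` shown later in this collection; this is only a sketch::

    # Sketch: build the reader only if the manually installed CHILDES data
    # can be found, otherwise skip the examples that follow.
    import nltk
    from nltk.corpus.reader import CHILDESCorpusReader

    try:
        corpus_root = nltk.data.find('corpora/childes/data-xml/Eng-USA-MOR/')
    except LookupError:
        corpus_root = None  # corpus not installed

    if corpus_root is not None:
        valian = CHILDESCorpusReader(corpus_root, 'Valian/.*.xml')
        print(len(valian.fileids()), "files found")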
>>> valian = CHILDESCorpusReader(corpus_root, 'Valian/.*.xml') >>> valian.fileids() ['Valian/01a.xml', 'Valian/01b.xml', 'Valian/02a.xml', 'Valian/02b.xml',... Count the number of files >>> len(valian.fileids()) 43 Printing properties of the corpus files. >>> corpus_data = valian.corpus(valian.fileids()) >>> print(corpus_data[0]['Lang']) eng >>> for key in sorted(corpus_data[0].keys()): ... print(key, ": ", corpus_data[0][key]) Corpus : valian Date : 1986-03-04 Id : 01a Lang : eng Version : 2.0.1 {http://www.w3.org/2001/XMLSchema-instance}schemaLocation : http://www.talkbank.org/ns/talkbank http://talkbank.org/software/talkbank.xsd Printing information of participants of the corpus. The most common codes for the participants are 'CHI' (target child), 'MOT' (mother), and 'INV' (investigator). >>> corpus_participants = valian.participants(valian.fileids()) >>> for this_corpus_participants in corpus_participants[:2]: ... for key in sorted(this_corpus_participants.keys()): ... dct = this_corpus_participants[key] ... print(key, ": ", [(k, dct[k]) for k in sorted(dct.keys())]) CHI : [('age', 'P2Y1M3D'), ('group', 'normal'), ('id', 'CHI'), ('language', 'eng'), ('role', 'Target_Child'), ('sex', 'female')] INV : [('id', 'INV'), ('language', 'eng'), ('role', 'Investigator')] MOT : [('id', 'MOT'), ('language', 'eng'), ('role', 'Mother')] CHI : [('age', 'P2Y1M12D'), ('group', 'normal'), ('id', 'CHI'), ('language', 'eng'), ('role', 'Target_Child'), ('sex', 'female')] INV : [('id', 'INV'), ('language', 'eng'), ('role', 'Investigator')] MOT : [('id', 'MOT'), ('language', 'eng'), ('role', 'Mother')] printing words. >>> valian.words('Valian/01a.xml') ['at', 'Parent', "Lastname's", 'house', 'with', 'Child', 'Lastname', ... printing sentences. >>> valian.sents('Valian/01a.xml') [['at', 'Parent', "Lastname's", 'house', 'with', 'Child', 'Lastname', 'and', 'it', 'is', 'March', 'fourth', 'I', 'believe', 'and', 'when', 'was', "Parent's", 'birthday'], ["Child's"], ['oh', "I'm", 'sorry'], ["that's", 'okay'], ... You can specify the participants with the argument *speaker*. >>> valian.words('Valian/01a.xml',speaker=['INV']) ['at', 'Parent', "Lastname's", 'house', 'with', 'Child', 'Lastname', ... >>> valian.words('Valian/01a.xml',speaker=['MOT']) ["Child's", "that's", 'okay', 'February', 'first', 'nineteen', ... >>> valian.words('Valian/01a.xml',speaker=['CHI']) ['tape', 'it', 'up', 'and', 'two', 'tape', 'players', 'have',... tagged_words() and tagged_sents() return the usual (word,pos) tuple lists. POS tags in the CHILDES are automatically assigned by MOR and POST programs (MacWhinney, 2000). 
>>> valian.tagged_words('Valian/01a.xml')[:30] [('at', 'prep'), ('Parent', 'n:prop'), ("Lastname's", 'n:prop'), ('house', 'n'), ('with', 'prep'), ('Child', 'n:prop'), ('Lastname', 'n:prop'), ('and', 'coord'), ('it', 'pro'), ('is', 'v:cop'), ('March', 'n:prop'), ('fourth', 'adj'), ('I', 'pro:sub'), ('believe', 'v'), ('and', 'coord'), ('when', 'adv:wh'), ('was', 'v:cop'), ("Parent's", 'n:prop'), ('birthday', 'n'), ("Child's", 'n:prop'), ('oh', 'co'), ("I'm", 'pro:sub'), ('sorry', 'adj'), ("that's", 'pro:dem'), ('okay', 'adj'), ('February', 'n:prop'), ('first', 'adj'), ('nineteen', 'det:num'), ('eighty', 'det:num'), ('four', 'det:num')] >>> valian.tagged_sents('Valian/01a.xml')[:10] [[('at', 'prep'), ('Parent', 'n:prop'), ("Lastname's", 'n:prop'), ('house', 'n'), ('with', 'prep'), ('Child', 'n:prop'), ('Lastname', 'n:prop'), ('and', 'coord'), ('it', 'pro'), ('is', 'v:cop'), ('March', 'n:prop'), ('fourth', 'adj'), ('I', 'pro:sub'), ('believe', 'v'), ('and', 'coord'), ('when', 'adv:wh'), ('was', 'v:cop'), ("Parent's", 'n:prop'), ('birthday', 'n')], [("Child's", 'n:prop')], [('oh', 'co'), ("I'm", 'pro:sub'), ('sorry', 'adj')], [("that's", 'pro:dem'), ('okay', 'adj')], [('February', 'n:prop'), ('first', 'adj'), ('nineteen', 'det:num'), ('eighty', 'det:num'), ('four', 'det:num')], [('great', 'adj')], [('and', 'coord'), ("she's", 'pro:sub'), ('two', 'det:num'), ('years', 'n'), ('old', 'adj')], [('correct', 'adj')], [('okay', 'co')], [('she', 'pro:sub'), ('just', 'adv:int'), ('turned', 'part'), ('two', 'det:num'), ('a', 'det'), ('month', 'n'), ('ago', 'adv')]] When the argument *stem* is true, the word stems (e.g., 'is' -> 'be-3PS') are used instead of the original words. >>> valian.words('Valian/01a.xml')[:30] ['at', 'Parent', "Lastname's", 'house', 'with', 'Child', 'Lastname', 'and', 'it', 'is', ... >>> valian.words('Valian/01a.xml',stem=True)[:30] ['at', 'Parent', 'Lastname', 's', 'house', 'with', 'Child', 'Lastname', 'and', 'it', 'be-3S', ... When the argument *replace* is true, the replaced words are used instead of the original words. >>> valian.words('Valian/01a.xml',speaker='CHI')[247] 'tikteat' >>> valian.words('Valian/01a.xml',speaker='CHI',replace=True)[247] 'trick' When the argument *relation* is true, the relational relationships in the sentence are returned. See Sagae et al. (2010) for details of the relational structure adopted in the CHILDES. 
>>> valian.words('Valian/01a.xml',relation=True)[:10] [[('at', 'prep', '1|0|ROOT'), ('Parent', 'n', '2|5|VOC'), ('Lastname', 'n', '3|5|MOD'), ('s', 'poss', '4|5|MOD'), ('house', 'n', '5|1|POBJ'), ('with', 'prep', '6|1|JCT'), ('Child', 'n', '7|8|NAME'), ('Lastname', 'n', '8|6|POBJ'), ('and', 'coord', '9|8|COORD'), ('it', 'pro', '10|11|SUBJ'), ('be-3S', 'v', '11|9|COMP'), ('March', 'n', '12|11|PRED'), ('fourth', 'adj', '13|12|MOD'), ('I', 'pro', '15|16|SUBJ'), ('believe', 'v', '16|14|ROOT'), ('and', 'coord', '18|17|ROOT'), ('when', 'adv', '19|20|PRED'), ('be-PAST', 'v', '20|18|COMP'), ('Parent', 'n', '21|23|MOD'), ('s', 'poss', '22|23|MOD'), ('birth', 'n', '23|20|SUBJ')], [('Child', 'n', '1|2|MOD'), ('s', 'poss', '2|0|ROOT')], [('oh', 'co', '1|4|COM'), ('I', 'pro', '3|4|SUBJ'), ('be', 'v', '4|0|ROOT'), ('sorry', 'adj', '5|4|PRED')], [('that', 'pro', '1|2|SUBJ'), ('be', 'v', '2|0|ROOT'), ('okay', 'adj', '3|2|PRED')], [('February', 'n', '1|6|VOC'), ('first', 'adj', '2|6|ENUM'), ('nineteen', 'det', '4|6|ENUM'), ('eighty', 'det', '5|6|ENUM'), ('four', 'det', '6|0|ROOT')], [('great', 'adj', '1|0|ROOT')], [('and', 'coord', '1|0|ROOT'), ('she', 'pro', '2|1|ROOT'), ('be', 'aux', '3|5|AUX'), ('two', 'det', '4|5|QUANT'), ('year-PL', 'n', '5|2|ROOT'), ('old', 'adj', '6|5|MOD')], [('correct', 'adj', '1|0|ROOT')], [('okay', 'co', '1|0|ROOT')], [('she', 'pro', '1|0|ROOT'), ('just', 'adv', '2|3|JCT'), ('turn-PERF', 'part', '3|1|XCOMP'), ('two', 'det', '4|6|QUANT'), ('a', 'det', '5|6|DET'), ('month', 'n', '6|3|OBJ'), ('ago', 'adv', '7|3|JCT')]] Printing age. When the argument *month* is true, the age information in the CHILDES format is converted into the number of months. >>> valian.age() ['P2Y1M3D', 'P2Y1M12D', 'P1Y9M21D', 'P1Y9M28D', 'P2Y1M23D', ... >>> valian.age('Valian/01a.xml') ['P2Y1M3D'] >>> valian.age('Valian/01a.xml',month=True) [25] Printing MLU. The criteria for the MLU computation is broadly based on Brown (1973). >>> valian.MLU() [2.3574660633484..., 2.292682926829..., 3.492857142857..., 2.961783439490..., 2.0842696629213..., 3.169811320754..., 3.137404580152..., 3.0578034682080..., 4.090163934426..., 3.488372093023..., 2.8773584905660..., 3.4792899408284..., 4.0111940298507..., 3.456790123456..., 4.487603305785..., 4.007936507936..., 5.25, 5.154696132596..., ...] >>> valian.MLU('Valian/01a.xml') [2.35746606334...] Basic stuff ============================== Count the number of words and sentences of each file. >>> valian = CHILDESCorpusReader(corpus_root, 'Valian/.*.xml') >>> for this_file in valian.fileids()[:6]: ... print(valian.corpus(this_file)[0]['Corpus'], valian.corpus(this_file)[0]['Id']) ... print("num of words: %i" % len(valian.words(this_file))) ... print("num of sents: %i" % len(valian.sents(this_file))) valian 01a num of words: 3606 num of sents: 1027 valian 01b num of words: 4376 num of sents: 1274 valian 02a num of words: 2673 num of sents: 801 valian 02b num of words: 5020 num of sents: 1583 valian 03a num of words: 2743 num of sents: 988 valian 03b num of words: 4409 num of sents: 1397 nltk-3.7/nltk/test/childes_fixt.py000066400000000000000000000005471420073152400173050ustar00rootroot00000000000000def setup_module(): import pytest import nltk.data try: nltk.data.find("corpora/childes/data-xml/Eng-USA-MOR/") except LookupError as e: pytest.skip( "The CHILDES corpus is not found. 
" "It should be manually downloaded and saved/unpacked " "to [NLTK_Data_Dir]/corpora/childes/" ) nltk-3.7/nltk/test/chunk.doctest000066400000000000000000000256031420073152400167650ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. For license information, see LICENSE.TXT ========== Chunking ========== >>> from nltk.chunk import * >>> from nltk.chunk.util import * >>> from nltk.chunk.regexp import * >>> from nltk import Tree >>> tagged_text = "[ The/DT cat/NN ] sat/VBD on/IN [ the/DT mat/NN ] [ the/DT dog/NN ] chewed/VBD ./." >>> gold_chunked_text = tagstr2tree(tagged_text) >>> unchunked_text = gold_chunked_text.flatten() Chunking uses a special regexp syntax for rules that delimit the chunks. These rules must be converted to 'regular' regular expressions before a sentence can be chunked. >>> tag_pattern = "
    ?*" >>> regexp_pattern = tag_pattern2re_pattern(tag_pattern) >>> regexp_pattern '(<(DT)>)?(<(JJ)>)*(<(NN[^\\{\\}<>]*)>)' Construct some new chunking rules. >>> chunk_rule = ChunkRule(r"<.*>+", "Chunk everything") >>> strip_rule = StripRule(r"", "Strip on verbs/prepositions") >>> split_rule = SplitRule("
    ", "
    ", ... "Split successive determiner/noun pairs") Create and score a series of chunk parsers, successively more complex. >>> chunk_parser = RegexpChunkParser([chunk_rule], chunk_label='NP') >>> chunked_text = chunk_parser.parse(unchunked_text) >>> print(chunked_text) (S (NP The/DT cat/NN sat/VBD on/IN the/DT mat/NN the/DT dog/NN chewed/VBD ./.)) >>> chunkscore = ChunkScore() >>> chunkscore.score(gold_chunked_text, chunked_text) >>> print(chunkscore.precision()) 0.0 >>> print(chunkscore.recall()) 0.0 >>> print(chunkscore.f_measure()) 0 >>> for chunk in sorted(chunkscore.missed()): print(chunk) (NP The/DT cat/NN) (NP the/DT dog/NN) (NP the/DT mat/NN) >>> for chunk in chunkscore.incorrect(): print(chunk) (NP The/DT cat/NN sat/VBD on/IN the/DT mat/NN the/DT dog/NN chewed/VBD ./.) >>> chunk_parser = RegexpChunkParser([chunk_rule, strip_rule], ... chunk_label='NP') >>> chunked_text = chunk_parser.parse(unchunked_text) >>> print(chunked_text) (S (NP The/DT cat/NN) sat/VBD on/IN (NP the/DT mat/NN the/DT dog/NN) chewed/VBD ./.) >>> assert chunked_text == chunk_parser.parse(list(unchunked_text)) >>> chunkscore = ChunkScore() >>> chunkscore.score(gold_chunked_text, chunked_text) >>> chunkscore.precision() 0.5 >>> print(chunkscore.recall()) 0.33333333... >>> print(chunkscore.f_measure()) 0.4 >>> for chunk in sorted(chunkscore.missed()): print(chunk) (NP the/DT dog/NN) (NP the/DT mat/NN) >>> for chunk in chunkscore.incorrect(): print(chunk) (NP the/DT mat/NN the/DT dog/NN) >>> chunk_parser = RegexpChunkParser([chunk_rule, strip_rule, split_rule], ... chunk_label='NP') >>> chunked_text = chunk_parser.parse(unchunked_text, trace=True) # Input:
<DT>  <NN>  <VBD>  <IN>  <DT>  <NN>  <DT>  <NN>  <VBD>  <.> # Chunk everything: {<DT>  <NN>  <VBD>  <IN>  <DT>  <NN>  <DT>  <NN>  <VBD>  <.>} # Strip on verbs/prepositions: {<DT>  <NN>} <VBD>  <IN> {<DT>  <NN>  <DT>  <NN>} <VBD>  <.> # Split successive determiner/noun pairs: {<DT>  <NN>} <VBD>  <IN> {<DT>  <NN>}{<DT>  <NN>
    } <.> >>> print(chunked_text) (S (NP The/DT cat/NN) sat/VBD on/IN (NP the/DT mat/NN) (NP the/DT dog/NN) chewed/VBD ./.) >>> chunkscore = ChunkScore() >>> chunkscore.score(gold_chunked_text, chunked_text) >>> chunkscore.precision() 1.0 >>> chunkscore.recall() 1.0 >>> chunkscore.f_measure() 1.0 >>> chunkscore.missed() [] >>> chunkscore.incorrect() [] >>> chunk_parser.rules() [+'>, '>, ', '
<DT><NN>'>] Printing parsers: >>> print(repr(chunk_parser)) <RegexpChunkParser with 3 rules> >>> print(chunk_parser) RegexpChunkParser with 3 rules: Chunk everything <ChunkRule: '<.*>+'> Strip on verbs/prepositions <StripRule: '<VBD|IN|\\.>'> Split successive determiner/noun pairs <SplitRule: '<DT><NN>', '<DT><NN>
    '> Regression Tests ~~~~~~~~~~~~~~~~ ChunkParserI ------------ `ChunkParserI` is an abstract interface -- it is not meant to be instantiated directly. >>> ChunkParserI().parse([]) Traceback (most recent call last): . . . NotImplementedError ChunkString ----------- ChunkString can be built from a tree of tagged tuples, a tree of trees, or a mixed list of both: >>> t1 = Tree('S', [('w%d' % i, 't%d' % i) for i in range(10)]) >>> t2 = Tree('S', [Tree('t0', []), Tree('t1', ['c1'])]) >>> t3 = Tree('S', [('w0', 't0'), Tree('t1', ['c1'])]) >>> ChunkString(t1) '> >>> ChunkString(t2) '> >>> ChunkString(t3) '> Other values generate an error: >>> ChunkString(Tree('S', ['x'])) Traceback (most recent call last): . . . ValueError: chunk structures must contain tagged tokens or trees The `str()` for a chunk string adds spaces to it, which makes it line up with `str()` output for other chunk strings over the same underlying input. >>> cs = ChunkString(t1) >>> print(cs) >>> cs.xform('', '{}') >>> print(cs) {} The `_verify()` method makes sure that our transforms don't corrupt the chunk string. By setting debug_level=2, `_verify()` will be called at the end of every call to `xform`. >>> cs = ChunkString(t1, debug_level=3) >>> # tag not marked with <...>: >>> cs.xform('', 't3') Traceback (most recent call last): . . . ValueError: Transformation generated invalid chunkstring: t3 >>> # brackets not balanced: >>> cs.xform('', '{') Traceback (most recent call last): . . . ValueError: Transformation generated invalid chunkstring: { >>> # nested brackets: >>> cs.xform('', '{{}}') Traceback (most recent call last): . . . ValueError: Transformation generated invalid chunkstring: {{}} >>> # modified tags: >>> cs.xform('', '') Traceback (most recent call last): . . . ValueError: Transformation generated invalid chunkstring: tag changed >>> # added tags: >>> cs.xform('', '') Traceback (most recent call last): . . . ValueError: Transformation generated invalid chunkstring: tag changed Chunking Rules -------------- Test the different rule constructors & __repr__ methods: >>> r1 = RegexpChunkRule(''+ChunkString.IN_STRIP_PATTERN, ... '{}', 'chunk and ') >>> r2 = RegexpChunkRule(re.compile(''+ChunkString.IN_STRIP_PATTERN), ... '{}', 'chunk and ') >>> r3 = ChunkRule('', 'chunk and ') >>> r4 = StripRule('', 'strip and ') >>> r5 = UnChunkRule('', 'unchunk and ') >>> r6 = MergeRule('', '', 'merge w/ ') >>> r7 = SplitRule('', '', 'split from ') >>> r8 = ExpandLeftRule('', '', 'expand left ') >>> r9 = ExpandRightRule('', '', 'expand right ') >>> for rule in r1, r2, r3, r4, r5, r6, r7, r8, r9: ... print(rule) (?=[^\\}]*(\\{|$))'->'{}'> (?=[^\\}]*(\\{|$))'->'{}'> '> '> '> ', ''> ', ''> ', ''> ', ''> `tag_pattern2re_pattern()` complains if the tag pattern looks problematic: >>> tag_pattern2re_pattern('{}') Traceback (most recent call last): . . . ValueError: Bad tag pattern: '{}' RegexpChunkParser ----------------- A warning is printed when parsing an empty sentence: >>> parser = RegexpChunkParser([ChunkRule('', '')]) >>> parser.parse(Tree('S', [])) Warning: parsing empty text Tree('S', []) RegexpParser ------------ >>> parser = RegexpParser(''' ... NP: {
<DT>? <JJ>* <NN>*} # NP ... P: {<IN>} # Preposition ... V: {<V.*>} # Verb ... PP: {<P> <NP>} # PP -> P NP ... VP: {<V> <NP|PP>*} # VP -> V (NP|PP)* ... ''') >>> print(repr(parser)) <chunk.RegexpParser with 5 stages> >>> print(parser) chunk.RegexpParser with 5 stages: RegexpChunkParser with 1 rules: NP <ChunkRule: '<DT>? <JJ>* <NN>*'> RegexpChunkParser with 1 rules: Preposition <ChunkRule: '<IN>'> RegexpChunkParser with 1 rules: Verb <ChunkRule: '<V.*>'> RegexpChunkParser with 1 rules: PP -> P NP <ChunkRule: '<P> <NP>'> RegexpChunkParser with 1 rules: VP -> V (NP|PP)* <ChunkRule: '<V> <NP|PP>*'> >>> print(parser.parse(unchunked_text, trace=True)) # Input:

<DT>  <NN>  <VBD>  <IN>  <DT>  <NN>  <DT>  <NN>  <VBD>  <.> # NP: {<DT>  <NN>} <VBD>  <IN> {<DT>  <NN>}{<DT>  <NN>} <VBD>  <.> # Input: <NP>  <VBD>  <IN>  <NP>  <NP>  <VBD>  <.> # Preposition: <NP>  <VBD> {<IN>} <NP>  <NP>  <VBD>  <.> # Input: <NP>  <VBD>  <P>  <NP>  <NP>  <VBD>  <.> # Verb: <NP> {<VBD>} <P>  <NP>  <NP> {<VBD>} <.> # Input: <NP>  <V>  <P>  <NP>  <NP>  <V>  <.> # PP -> P NP: <NP>  <V> {<P>  <NP>
    } <.> # Input: <.> # VP -> V (NP|PP)*: { }{} <.> (S (NP The/DT cat/NN) (VP (V sat/VBD) (PP (P on/IN) (NP the/DT mat/NN)) (NP the/DT dog/NN)) (VP (V chewed/VBD)) ./.) Test parsing of other rule types: >>> print(RegexpParser(''' ... X: ... }{ # strip rule ... }{ # split rule ... {} # merge rule ... {} # chunk rule w/ context ... ''')) chunk.RegexpParser with 1 stages: RegexpChunkParser with 4 rules: strip rule '> split rule ', ''> merge rule ', ''> chunk rule w/ context ', '', ''> Illegal patterns give an error message: >>> print(RegexpParser('X: {} {}')) Traceback (most recent call last): . . . ValueError: Illegal chunk pattern: {} {} nltk-3.7/nltk/test/classify.doctest000066400000000000000000000165111420073152400174700ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. For license information, see LICENSE.TXT ============= Classifiers ============= >>> from nltk.test.classify_fixt import setup_module >>> setup_module() Classifiers label tokens with category labels (or *class labels*). Typically, labels are represented with strings (such as ``"health"`` or ``"sports"``. In NLTK, classifiers are defined using classes that implement the `ClassifierI` interface, which supports the following operations: - self.classify(featureset) - self.classify_many(featuresets) - self.labels() - self.prob_classify(featureset) - self.prob_classify_many(featuresets) NLTK defines several classifier classes: - `ConditionalExponentialClassifier` - `DecisionTreeClassifier` - `MaxentClassifier` - `NaiveBayesClassifier` - `WekaClassifier` Classifiers are typically created by training them on a training corpus. Regression Tests ~~~~~~~~~~~~~~~~ We define a very simple training corpus with 3 binary features: ['a', 'b', 'c'], and are two labels: ['x', 'y']. We use a simple feature set so that the correct answers can be calculated analytically (although we haven't done this yet for all tests). >>> import nltk >>> train = [ ... (dict(a=1,b=1,c=1), 'y'), ... (dict(a=1,b=1,c=1), 'x'), ... (dict(a=1,b=1,c=0), 'y'), ... (dict(a=0,b=1,c=1), 'x'), ... (dict(a=0,b=1,c=1), 'y'), ... (dict(a=0,b=0,c=1), 'y'), ... (dict(a=0,b=1,c=0), 'x'), ... (dict(a=0,b=0,c=0), 'x'), ... (dict(a=0,b=1,c=1), 'y'), ... (dict(a=None,b=1,c=0), 'x'), ... ] >>> test = [ ... (dict(a=1,b=0,c=1)), # unseen ... (dict(a=1,b=0,c=0)), # unseen ... (dict(a=0,b=1,c=1)), # seen 3 times, labels=y,y,x ... (dict(a=0,b=1,c=0)), # seen 1 time, label=x ... ] Test the Naive Bayes classifier: >>> classifier = nltk.classify.NaiveBayesClassifier.train(train) >>> sorted(classifier.labels()) ['x', 'y'] >>> classifier.classify_many(test) ['y', 'x', 'y', 'x'] >>> for pdist in classifier.prob_classify_many(test): ... print('%.4f %.4f' % (pdist.prob('x'), pdist.prob('y'))) 0.2500 0.7500 0.5833 0.4167 0.3571 0.6429 0.7000 0.3000 >>> classifier.show_most_informative_features() Most Informative Features c = 0 x : y = 2.3 : 1.0 c = 1 y : x = 1.8 : 1.0 a = 1 y : x = 1.7 : 1.0 a = 0 x : y = 1.0 : 1.0 b = 0 x : y = 1.0 : 1.0 b = 1 x : y = 1.0 : 1.0 Test the Decision Tree classifier (without None): >>> classifier = nltk.classify.DecisionTreeClassifier.train( ... train[:-1], entropy_cutoff=0, ... support_cutoff=0) >>> sorted(classifier.labels()) ['x', 'y'] >>> print(classifier) c=0? .................................................. x a=0? ................................................ x a=1? ................................................ y c=1? .................................................. 
y >>> classifier.classify_many(test) ['y', 'y', 'y', 'x'] >>> for pdist in classifier.prob_classify_many(test): ... print('%.4f %.4f' % (pdist.prob('x'), pdist.prob('y'))) Traceback (most recent call last): . . . NotImplementedError Test the Decision Tree classifier (with None): >>> classifier = nltk.classify.DecisionTreeClassifier.train( ... train, entropy_cutoff=0, ... support_cutoff=0) >>> sorted(classifier.labels()) ['x', 'y'] >>> print(classifier) c=0? .................................................. x a=0? ................................................ x a=1? ................................................ y a=None? ............................................. x c=1? .................................................. y Test SklearnClassifier, which requires the scikit-learn package. >>> from nltk.classify import SklearnClassifier >>> from sklearn.naive_bayes import BernoulliNB >>> from sklearn.svm import SVC >>> train_data = [({"a": 4, "b": 1, "c": 0}, "ham"), ... ({"a": 5, "b": 2, "c": 1}, "ham"), ... ({"a": 0, "b": 3, "c": 4}, "spam"), ... ({"a": 5, "b": 1, "c": 1}, "ham"), ... ({"a": 1, "b": 4, "c": 3}, "spam")] >>> classif = SklearnClassifier(BernoulliNB()).train(train_data) >>> test_data = [{"a": 3, "b": 2, "c": 1}, ... {"a": 0, "b": 3, "c": 7}] >>> classif.classify_many(test_data) ['ham', 'spam'] >>> classif = SklearnClassifier(SVC(), sparse=False).train(train_data) >>> classif.classify_many(test_data) ['ham', 'spam'] Test the Maximum Entropy classifier training algorithms; they should all generate the same results. >>> def print_maxent_test_header(): ... print(' '*11+''.join([' test[%s] ' % i ... for i in range(len(test))])) ... print(' '*11+' p(x) p(y)'*len(test)) ... print('-'*(11+15*len(test))) >>> def test_maxent(algorithm): ... print('%11s' % algorithm, end=' ') ... try: ... classifier = nltk.classify.MaxentClassifier.train( ... train, algorithm, trace=0, max_iter=1000) ... except Exception as e: ... print('Error: %r' % e) ... return ... ... for featureset in test: ... pdist = classifier.prob_classify(featureset) ... print('%8.2f%6.2f' % (pdist.prob('x'), pdist.prob('y')), end=' ') ... print() >>> print_maxent_test_header(); test_maxent('GIS'); test_maxent('IIS') test[0] test[1] test[2] test[3] p(x) p(y) p(x) p(y) p(x) p(y) p(x) p(y) ----------------------------------------------------------------------- GIS 0.16 0.84 0.46 0.54 0.41 0.59 0.76 0.24 IIS 0.16 0.84 0.46 0.54 0.41 0.59 0.76 0.24 >>> test_maxent('MEGAM'); test_maxent('TADM') # doctest: +SKIP MEGAM 0.16 0.84 0.46 0.54 0.41 0.59 0.76 0.24 TADM 0.16 0.84 0.46 0.54 0.41 0.59 0.76 0.24 Regression tests for TypedMaxentFeatureEncoding ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ >>> from nltk.classify import maxent >>> train = [ ... ({'a': 1, 'b': 1, 'c': 1}, 'y'), ... ({'a': 5, 'b': 5, 'c': 5}, 'x'), ... ({'a': 0.9, 'b': 0.9, 'c': 0.9}, 'y'), ... ({'a': 5.5, 'b': 5.4, 'c': 5.3}, 'x'), ... ({'a': 0.8, 'b': 1.2, 'c': 1}, 'y'), ... ({'a': 5.1, 'b': 4.9, 'c': 5.2}, 'x') ... ] >>> test = [ ... {'a': 1, 'b': 0.8, 'c': 1.2}, ... {'a': 5.2, 'b': 5.1, 'c': 5} ... ] >>> encoding = maxent.TypedMaxentFeatureEncoding.train( ... train, count_cutoff=3, alwayson_features=True) >>> classifier = maxent.MaxentClassifier.train( ... 
train, bernoulli=False, encoding=encoding, trace=0) >>> classifier.classify_many(test) ['y', 'x'] nltk-3.7/nltk/test/classify_fixt.py000066400000000000000000000001621420073152400175000ustar00rootroot00000000000000# most of classify.doctest requires numpy def setup_module(): import pytest pytest.importorskip("numpy") nltk-3.7/nltk/test/collections.doctest000066400000000000000000000011171420073152400201650ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. For license information, see LICENSE.TXT =========== Collections =========== >>> import nltk >>> from nltk.collections import * Trie ---- Trie can be pickled: >>> import pickle >>> trie = nltk.collections.Trie(['a']) >>> s = pickle.dumps(trie) >>> pickle.loads(s) {'a': {True: None}} LazyIteratorList ---------------- Fetching the length of a LazyIteratorList object does not throw a StopIteration exception: >>> lil = LazyIteratorList(i for i in range(1, 11)) >>> lil[-1] 10 >>> len(lil) 10 nltk-3.7/nltk/test/collocations.doctest000066400000000000000000000260251420073152400203450ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. For license information, see LICENSE.TXT ============== Collocations ============== Overview ~~~~~~~~ Collocations are expressions of multiple words which commonly co-occur. For example, the top ten bigram collocations in Genesis are listed below, as measured using Pointwise Mutual Information. >>> import nltk >>> from nltk.collocations import * >>> bigram_measures = nltk.collocations.BigramAssocMeasures() >>> trigram_measures = nltk.collocations.TrigramAssocMeasures() >>> fourgram_measures = nltk.collocations.QuadgramAssocMeasures() >>> finder = BigramCollocationFinder.from_words( ... nltk.corpus.genesis.words('english-web.txt')) >>> finder.nbest(bigram_measures.pmi, 10) [('Allon', 'Bacuth'), ('Ashteroth', 'Karnaim'), ('Ben', 'Ammi'), ('En', 'Mishpat'), ('Jegar', 'Sahadutha'), ('Salt', 'Sea'), ('Whoever', 'sheds'), ('appoint', 'overseers'), ('aromatic', 'resin'), ('cutting', 'instrument')] While these words are highly collocated, the expressions are also very infrequent. Therefore it is useful to apply filters, such as ignoring all bigrams which occur less than three times in the corpus: >>> finder.apply_freq_filter(3) >>> finder.nbest(bigram_measures.pmi, 10) [('Beer', 'Lahai'), ('Lahai', 'Roi'), ('gray', 'hairs'), ('ewe', 'lambs'), ('Most', 'High'), ('many', 'colors'), ('burnt', 'offering'), ('Paddan', 'Aram'), ('east', 'wind'), ('living', 'creature')] We may similarly find collocations among tagged words: >>> finder = BigramCollocationFinder.from_words( ... nltk.corpus.brown.tagged_words('ca01', tagset='universal')) >>> finder.nbest(bigram_measures.pmi, 5) [(('1,119', 'NUM'), ('votes', 'NOUN')), (('1962', 'NUM'), ("governor's", 'NOUN')), (('637', 'NUM'), ('E.', 'NOUN')), (('Alpharetta', 'NOUN'), ('prison', 'NOUN')), (('Bar', 'NOUN'), ('Association', 'NOUN'))] Or tags alone: >>> finder = BigramCollocationFinder.from_words(t for w, t in ... nltk.corpus.brown.tagged_words('ca01', tagset='universal')) >>> finder.nbest(bigram_measures.pmi, 10) [('PRT', 'VERB'), ('PRON', 'VERB'), ('ADP', 'DET'), ('.', 'PRON'), ('DET', 'ADJ'), ('CONJ', 'PRON'), ('ADP', 'NUM'), ('NUM', '.'), ('ADV', 'ADV'), ('VERB', 'ADV')] Or spanning intervening words: >>> finder = BigramCollocationFinder.from_words( ... nltk.corpus.genesis.words('english-web.txt'), ... 
window_size = 20) >>> finder.apply_freq_filter(2) >>> ignored_words = nltk.corpus.stopwords.words('english') >>> finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words) >>> finder.nbest(bigram_measures.likelihood_ratio, 10) [('chief', 'chief'), ('became', 'father'), ('years', 'became'), ('hundred', 'years'), ('lived', 'became'), ('king', 'king'), ('lived', 'years'), ('became', 'became'), ('chief', 'chiefs'), ('hundred', 'became')] Finders ~~~~~~~ The collocations package provides collocation finders which by default consider all ngrams in a text as candidate collocations: >>> text = "I do not like green eggs and ham, I do not like them Sam I am!" >>> tokens = nltk.wordpunct_tokenize(text) >>> finder = BigramCollocationFinder.from_words(tokens) >>> scored = finder.score_ngrams(bigram_measures.raw_freq) >>> sorted(bigram for bigram, score in scored) [(',', 'I'), ('I', 'am'), ('I', 'do'), ('Sam', 'I'), ('am', '!'), ('and', 'ham'), ('do', 'not'), ('eggs', 'and'), ('green', 'eggs'), ('ham', ','), ('like', 'green'), ('like', 'them'), ('not', 'like'), ('them', 'Sam')] We could otherwise construct the collocation finder from manually-derived FreqDists: >>> word_fd = nltk.FreqDist(tokens) >>> bigram_fd = nltk.FreqDist(nltk.bigrams(tokens)) >>> finder = BigramCollocationFinder(word_fd, bigram_fd) >>> scored == finder.score_ngrams(bigram_measures.raw_freq) True A similar interface is provided for trigrams: >>> finder = TrigramCollocationFinder.from_words(tokens) >>> scored = finder.score_ngrams(trigram_measures.raw_freq) >>> set(trigram for trigram, score in scored) == set(nltk.trigrams(tokens)) True We may want to select only the top n results: >>> sorted(finder.nbest(trigram_measures.raw_freq, 2)) [('I', 'do', 'not'), ('do', 'not', 'like')] Alternatively, we can select those above a minimum score value: >>> sorted(finder.above_score(trigram_measures.raw_freq, ... 1.0 / len(tuple(nltk.trigrams(tokens))))) [('I', 'do', 'not'), ('do', 'not', 'like')] Now spanning intervening words: >>> finder = TrigramCollocationFinder.from_words(tokens) >>> finder = TrigramCollocationFinder.from_words(tokens, window_size=4) >>> sorted(finder.nbest(trigram_measures.raw_freq, 4)) [('I', 'do', 'like'), ('I', 'do', 'not'), ('I', 'not', 'like'), ('do', 'not', 'like')] A closer look at the finder's ngram frequencies: >>> sorted(finder.ngram_fd.items(), key=lambda t: (-t[1], t[0]))[:10] [(('I', 'do', 'like'), 2), (('I', 'do', 'not'), 2), (('I', 'not', 'like'), 2), (('do', 'not', 'like'), 2), ((',', 'I', 'do'), 1), ((',', 'I', 'not'), 1), ((',', 'do', 'not'), 1), (('I', 'am', '!'), 1), (('Sam', 'I', '!'), 1), (('Sam', 'I', 'am'), 1)] A similar interface is provided for fourgrams: >>> finder_4grams = QuadgramCollocationFinder.from_words(tokens) >>> scored_4grams = finder_4grams.score_ngrams(fourgram_measures.raw_freq) >>> set(fourgram for fourgram, score in scored_4grams) == set(nltk.ngrams(tokens, n=4)) True Filtering candidates ~~~~~~~~~~~~~~~~~~~~ All the ngrams in a text are often too many to be useful when finding collocations. It is generally useful to remove some words or punctuation, and to require a minimum frequency for candidate collocations. 
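For instance, a typical pipeline chains a frequency cutoff and a stop-word filter before ranking; the following is an added sketch (a plain script rather than a doctest, assuming the ``stopwords`` and ``genesis`` corpora are installed)::

    from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
    from nltk.corpus import genesis, stopwords

    bigram_measures = BigramAssocMeasures()
    ignored = set(stopwords.words('english'))

    finder = BigramCollocationFinder.from_words(genesis.words('english-web.txt'))
    finder.apply_freq_filter(3)  # require at least three occurrences
    finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored)

    # rank the surviving bigrams by PMI and keep the ten best
    print(finder.nbest(bigram_measures.pmi, 10))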
Given our sample text above, if we remove all trigrams containing personal pronouns from candidature, score_ngrams should return 6 less results, and 'do not like' will be the only candidate which occurs more than once: >>> finder = TrigramCollocationFinder.from_words(tokens) >>> len(finder.score_ngrams(trigram_measures.raw_freq)) 14 >>> finder.apply_word_filter(lambda w: w in ('I', 'me')) >>> len(finder.score_ngrams(trigram_measures.raw_freq)) 8 >>> sorted(finder.above_score(trigram_measures.raw_freq, ... 1.0 / len(tuple(nltk.trigrams(tokens))))) [('do', 'not', 'like')] Sometimes a filter is a function on the whole ngram, rather than each word, such as if we may permit 'and' to appear in the middle of a trigram, but not on either edge: >>> finder.apply_ngram_filter(lambda w1, w2, w3: 'and' in (w1, w3)) >>> len(finder.score_ngrams(trigram_measures.raw_freq)) 6 Finally, it is often important to remove low frequency candidates, as we lack sufficient evidence about their significance as collocations: >>> finder.apply_freq_filter(2) >>> len(finder.score_ngrams(trigram_measures.raw_freq)) 1 Association measures ~~~~~~~~~~~~~~~~~~~~ A number of measures are available to score collocations or other associations. The arguments to measure functions are marginals of a contingency table, in the bigram case (n_ii, (n_ix, n_xi), n_xx):: w1 ~w1 ------ ------ w2 | n_ii | n_oi | = n_xi ------ ------ ~w2 | n_io | n_oo | ------ ------ = n_ix TOTAL = n_xx We test their calculation using some known values presented in Manning and Schutze's text and other papers. Student's t: examples from Manning and Schutze 5.3.2 >>> print('%0.4f' % bigram_measures.student_t(8, (15828, 4675), 14307668)) 0.9999 >>> print('%0.4f' % bigram_measures.student_t(20, (42, 20), 14307668)) 4.4721 Chi-square: examples from Manning and Schutze 5.3.3 >>> print('%0.2f' % bigram_measures.chi_sq(8, (15828, 4675), 14307668)) 1.55 >>> print('%0.0f' % bigram_measures.chi_sq(59, (67, 65), 571007)) 456400 Likelihood ratios: examples from Dunning, CL, 1993 >>> print('%0.2f' % bigram_measures.likelihood_ratio(110, (2552, 221), 31777)) 270.72 >>> print('%0.2f' % bigram_measures.likelihood_ratio(8, (13, 32), 31777)) 95.29 Pointwise Mutual Information: examples from Manning and Schutze 5.4 >>> print('%0.2f' % bigram_measures.pmi(20, (42, 20), 14307668)) 18.38 >>> print('%0.2f' % bigram_measures.pmi(20, (15019, 15629), 14307668)) 0.29 TODO: Find authoritative results for trigrams. Using contingency table values ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ While frequency counts make marginals readily available for collocation finding, it is common to find published contingency table values. The collocations package therefore provides a wrapper, ContingencyMeasures, which wraps an association measures class, providing association measures which take contingency values as arguments, (n_ii, n_io, n_oi, n_oo) in the bigram case. >>> from nltk.metrics import ContingencyMeasures >>> cont_bigram_measures = ContingencyMeasures(bigram_measures) >>> print('%0.2f' % cont_bigram_measures.likelihood_ratio(8, 5, 24, 31740)) 95.29 >>> print('%0.2f' % cont_bigram_measures.chi_sq(8, 15820, 4667, 14287173)) 1.55 Ranking and correlation ~~~~~~~~~~~~~~~~~~~~~~~ It is useful to consider the results of finding collocations as a ranking, and the rankings output using different association measures can be compared using the Spearman correlation coefficient. 
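As a concrete illustration of that comparison, the added sketch below correlates the PMI and likelihood-ratio rankings of the same candidate set, using the ``ranks_from_sequence`` and ``spearman_correlation`` helpers documented just below::

    from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
    from nltk.corpus import genesis
    from nltk.metrics.spearman import ranks_from_sequence, spearman_correlation

    measures = BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(genesis.words('english-web.txt'))
    finder.apply_freq_filter(3)

    pmi_ranking = finder.nbest(measures.pmi, 50)
    llr_ranking = finder.nbest(measures.likelihood_ratio, 50)

    # a coefficient near 1.0 means the two measures order the candidates similarly
    print(spearman_correlation(ranks_from_sequence(pmi_ranking),
                               ranks_from_sequence(llr_ranking)))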
Ranks can be assigned to a sorted list of results trivially by assigning strictly increasing ranks to each result: >>> from nltk.metrics.spearman import * >>> results_list = ['item1', 'item2', 'item3', 'item4', 'item5'] >>> print(list(ranks_from_sequence(results_list))) [('item1', 0), ('item2', 1), ('item3', 2), ('item4', 3), ('item5', 4)] If scores are available for each result, we may allow sufficiently similar results (differing by no more than rank_gap) to be assigned the same rank: >>> results_scored = [('item1', 50.0), ('item2', 40.0), ('item3', 38.0), ... ('item4', 35.0), ('item5', 14.0)] >>> print(list(ranks_from_scores(results_scored, rank_gap=5))) [('item1', 0), ('item2', 1), ('item3', 1), ('item4', 1), ('item5', 4)] The Spearman correlation coefficient gives a number from -1.0 to 1.0 comparing two rankings. A coefficient of 1.0 indicates identical rankings; -1.0 indicates exact opposite rankings. >>> print('%0.1f' % spearman_correlation( ... ranks_from_sequence(results_list), ... ranks_from_sequence(results_list))) 1.0 >>> print('%0.1f' % spearman_correlation( ... ranks_from_sequence(reversed(results_list)), ... ranks_from_sequence(results_list))) -1.0 >>> results_list2 = ['item2', 'item3', 'item1', 'item5', 'item4'] >>> print('%0.1f' % spearman_correlation( ... ranks_from_sequence(results_list), ... ranks_from_sequence(results_list2))) 0.6 >>> print('%0.1f' % spearman_correlation( ... ranks_from_sequence(reversed(results_list)), ... ranks_from_sequence(results_list2))) -0.6 nltk-3.7/nltk/test/concordance.doctest000066400000000000000000000066151420073152400201350ustar00rootroot00000000000000.. Copyright (C) 2001-2016 NLTK Project .. For license information, see LICENSE.TXT ================================== Concordance Example ================================== A concordance view shows us every occurrence of a given word, together with some context. Here we look up the word monstrous in Moby Dick by entering text1 followed by a period, then the term concordance, and then placing "monstrous" in parentheses: >>> from nltk.corpus import gutenberg >>> from nltk.text import Text >>> corpus = gutenberg.words('melville-moby_dick.txt') >>> text = Text(corpus) >>> text.concordance("monstrous") Displaying 11 of 11 matches: ong the former , one was of a most monstrous size . ... This came towards us , ON OF THE PSALMS . " Touching that monstrous bulk of the whale or ork we have r ll over with a heathenish array of monstrous clubs and spears . Some were thick d as you gazed , and wondered what monstrous cannibal and savage could ever hav that has survived the flood ; most monstrous and most mountainous ! That Himmal they might scout at Moby Dick as a monstrous fable , or still worse and more de th of Radney .'" CHAPTER 55 Of the Monstrous Pictures of Whales . I shall ere l ing Scenes . In connexion with the monstrous pictures of whales , I am strongly ere to enter upon those still more monstrous stories of them which are to be fo ght have been rummaged out of this monstrous cabinet there is no telling . But of Whale - Bones ; for Whales of a monstrous size are oftentimes cast up dead u >>> text.concordance("monstrous") Displaying 11 of 11 matches: ong the former , one was of a most monstrous size . ... This came towards us , ON OF THE PSALMS . " Touching that monstrous bulk of the whale or ork we have r ll over with a heathenish array of monstrous clubs and spears . Some were thick ... 
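The display can also be tightened up: ``concordance()`` accepts optional ``width`` and ``lines`` arguments. An added sketch using another Gutenberg text::

    from nltk.corpus import gutenberg
    from nltk.text import Text

    emma = Text(gutenberg.words('austen-emma.txt'))

    # show at most five matches, each trimmed to a 60-character window
    emma.concordance("surprise", width=60, lines=5)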
We can also search for a multi-word phrase by passing a list of strings: >>> text.concordance(["monstrous", "size"]) Displaying 2 of 2 matches: the former , one was of a most monstrous size . ... This came towards us , op Whale - Bones ; for Whales of a monstrous size are oftentimes cast up dead upo ================================= Concordance List ================================= Often we need to store the results of concordance for further usage. To do so, call the concordance function with the stdout argument set to false: >>> from nltk.corpus import gutenberg >>> from nltk.text import Text >>> corpus = gutenberg.words('melville-moby_dick.txt') >>> text = Text(corpus) >>> con_list = text.concordance_list("monstrous") >>> con_list[2].line 'll over with a heathenish array of monstrous clubs and spears . Some were thick' >>> len(con_list) 11 ================================= Patching Issue #2088 ================================= Patching https://github.com/nltk/nltk/issues/2088 The left slice of the left context should be clip to 0 if the `i-context` < 0. >>> from nltk import Text, word_tokenize >>> jane_eyre = 'Chapter 1\nTHERE was no possibility of taking a walk that day. We had been wandering, indeed, in the leafless shrubbery an hour in the morning; but since dinner (Mrs. Reed, when there was no company, dined early) the cold winter wind had brought with it clouds so sombre, and a rain so penetrating, that further outdoor exercise was now out of the question.' >>> text = Text(word_tokenize(jane_eyre)) >>> text.concordance_list('taking')[0].left ['Chapter', '1', 'THERE', 'was', 'no', 'possibility', 'of'] nltk-3.7/nltk/test/conftest.py000066400000000000000000000014031420073152400164550ustar00rootroot00000000000000import pytest from nltk.corpus.reader import CorpusReader @pytest.fixture(autouse=True) def mock_plot(mocker): """Disable matplotlib plotting in test code""" try: import matplotlib.pyplot as plt mocker.patch.object(plt, "gca") mocker.patch.object(plt, "show") except ImportError: pass @pytest.fixture(scope="module", autouse=True) def teardown_loaded_corpora(): """ After each test session ends (either doctest or unit test), unload any loaded corpora """ yield # first, wait for the test to end import nltk.corpus for name in dir(nltk.corpus): obj = getattr(nltk.corpus, name, None) if isinstance(obj, CorpusReader) and hasattr(obj, "_unload"): obj._unload() nltk-3.7/nltk/test/corpus.doctest000066400000000000000000002752731420073152400172020ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. For license information, see LICENSE.TXT ================ Corpus Readers ================ The `nltk.corpus` package defines a collection of *corpus reader* classes, which can be used to access the contents of a diverse set of corpora. The list of available corpora is given at: https://www.nltk.org/nltk_data/ Each corpus reader class is specialized to handle a specific corpus format. In addition, the `nltk.corpus` package automatically creates a set of corpus reader instances that can be used to access the corpora in the NLTK data package. Section `Corpus Reader Objects`_ ("Corpus Reader Objects") describes the corpus reader instances that can be used to read the corpora in the NLTK data package. Section `Corpus Reader Classes`_ ("Corpus Reader Classes") describes the corpus reader classes themselves, and discusses the issues involved in creating new corpus reader objects and new corpus reader classes. 
Section `Regression Tests`_ ("Regression Tests") contains regression tests for the corpus readers and associated functions and classes. .. contents:: **Table of Contents** :depth: 4 :backlinks: none --------------------- Corpus Reader Objects --------------------- Overview ======== NLTK includes a diverse set of corpora which can be read using the ``nltk.corpus`` package. Each corpus is accessed by means of a "corpus reader" object from ``nltk.corpus``: >>> import nltk.corpus >>> # The Brown corpus: >>> print(str(nltk.corpus.brown).replace('\\\\','/')) >>> # The Penn Treebank Corpus: >>> print(str(nltk.corpus.treebank).replace('\\\\','/')) >>> # The Name Genders Corpus: >>> print(str(nltk.corpus.names).replace('\\\\','/')) >>> # The Inaugural Address Corpus: >>> print(str(nltk.corpus.inaugural).replace('\\\\','/')) Most corpora consist of a set of files, each containing a document (or other pieces of text). A list of identifiers for these files is accessed via the ``fileids()`` method of the corpus reader: >>> nltk.corpus.treebank.fileids() ['wsj_0001.mrg', 'wsj_0002.mrg', 'wsj_0003.mrg', 'wsj_0004.mrg', ...] >>> nltk.corpus.inaugural.fileids() ['1789-Washington.txt', '1793-Washington.txt', '1797-Adams.txt', ...] Each corpus reader provides a variety of methods to read data from the corpus, depending on the format of the corpus. For example, plaintext corpora support methods to read the corpus as raw text, a list of words, a list of sentences, or a list of paragraphs. >>> from nltk.corpus import inaugural >>> inaugural.raw('1789-Washington.txt') 'Fellow-Citizens of the Senate ...' >>> inaugural.words('1789-Washington.txt') ['Fellow', '-', 'Citizens', 'of', 'the', ...] >>> inaugural.sents('1789-Washington.txt') [['Fellow', '-', 'Citizens'...], ['Among', 'the', 'vicissitudes'...]...] >>> inaugural.paras('1789-Washington.txt') [[['Fellow', '-', 'Citizens'...]], [['Among', 'the', 'vicissitudes'...], ['On', 'the', 'one', 'hand', ',', 'I'...]...]...] Each of these reader methods may be given a single document's item name or a list of document item names. When given a list of document item names, the reader methods will concatenate together the contents of the individual documents. >>> l1 = len(inaugural.words('1789-Washington.txt')) >>> l2 = len(inaugural.words('1793-Washington.txt')) >>> l3 = len(inaugural.words(['1789-Washington.txt', '1793-Washington.txt'])) >>> print('%s+%s == %s' % (l1, l2, l3)) 1538+147 == 1685 If the reader methods are called without any arguments, they will typically load all documents in the corpus. >>> len(inaugural.words()) 152901 If a corpus contains a README file, it can be accessed with a ``readme()`` method: >>> inaugural.readme()[:32] 'C-Span Inaugural Address Corpus\n' Plaintext Corpora ================= Here are the first few words from each of NLTK's plaintext corpora: >>> nltk.corpus.abc.words() ['PM', 'denies', 'knowledge', 'of', 'AWB', ...] >>> nltk.corpus.genesis.words() ['In', 'the', 'beginning', 'God', 'created', ...] >>> nltk.corpus.gutenberg.words(fileids='austen-emma.txt') ['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ...] >>> nltk.corpus.inaugural.words() ['Fellow', '-', 'Citizens', 'of', 'the', ...] >>> nltk.corpus.state_union.words() ['PRESIDENT', 'HARRY', 'S', '.', 'TRUMAN', "'", ...] >>> nltk.corpus.webtext.words() ['Cookie', 'Manager', ':', '"', 'Don', "'", 't', ...] Tagged Corpora ============== In addition to the plaintext corpora, NLTK's data package also contains a wide variety of annotated corpora. 
For example, the Brown Corpus is annotated with part-of-speech tags, and defines additional methods ``tagged_*()`` which words as `(word,tag)` tuples, rather than just bare word strings. >>> from nltk.corpus import brown >>> print(brown.words()) ['The', 'Fulton', 'County', 'Grand', 'Jury', ...] >>> print(brown.tagged_words()) [('The', 'AT'), ('Fulton', 'NP-TL'), ...] >>> print(brown.sents()) [['The', 'Fulton', 'County'...], ['The', 'jury', 'further'...], ...] >>> print(brown.tagged_sents()) [[('The', 'AT'), ('Fulton', 'NP-TL')...], [('The', 'AT'), ('jury', 'NN'), ('further', 'RBR')...]...] >>> print(brown.paras(categories='reviews')) [[['It', 'is', 'not', 'news', 'that', 'Nathan', 'Milstein'...], ['Certainly', 'not', 'in', 'Orchestra', 'Hall', 'where'...]], [['There', 'was', 'about', 'that', 'song', 'something', ...], ['Not', 'the', 'noblest', 'performance', 'we', 'have', ...], ...], ...] >>> print(brown.tagged_paras(categories='reviews')) [[[('It', 'PPS'), ('is', 'BEZ'), ('not', '*'), ...], [('Certainly', 'RB'), ('not', '*'), ('in', 'IN'), ...]], [[('There', 'EX'), ('was', 'BEDZ'), ('about', 'IN'), ...], [('Not', '*'), ('the', 'AT'), ('noblest', 'JJT'), ...], ...], ...] Similarly, the Indian Language POS-Tagged Corpus includes samples of Indian text annotated with part-of-speech tags: >>> from nltk.corpus import indian >>> print(indian.words()) # doctest: +SKIP ['\xe0\xa6\xae\xe0\xa6\xb9\xe0\xa6\xbf\...', '\xe0\xa6\xb8\xe0\xa6\xa8\xe0\xa7\x8d\xe0...', ...] >>> print(indian.tagged_words()) # doctest: +SKIP [('\xe0\xa6\xae\xe0\xa6\xb9\xe0\xa6\xbf...', 'NN'), ('\xe0\xa6\xb8\xe0\xa6\xa8\xe0\xa7\x8d\xe0...', 'NN'), ...] Several tagged corpora support access to a simplified, universal tagset, e.g. where all nouns tags are collapsed to a single category ``NOUN``: >>> print(brown.tagged_sents(tagset='universal')) [[('The', 'DET'), ('Fulton', 'NOUN'), ('County', 'NOUN'), ('Grand', 'ADJ'), ('Jury', 'NOUN'), ...], [('The', 'DET'), ('jury', 'NOUN'), ('further', 'ADV'), ('said', 'VERB'), ('in', 'ADP'), ...]...] >>> from nltk.corpus import conll2000, switchboard >>> print(conll2000.tagged_words(tagset='universal')) [('Confidence', 'NOUN'), ('in', 'ADP'), ...] Use ``nltk.app.pos_concordance()`` to access a GUI for searching tagged corpora. Chunked Corpora =============== The CoNLL corpora also provide chunk structures, which are encoded as flat trees. The CoNLL 2000 Corpus includes phrasal chunks; and the CoNLL 2002 Corpus includes named entity chunks. >>> from nltk.corpus import conll2000, conll2002 >>> print(conll2000.sents()) [['Confidence', 'in', 'the', 'pound', 'is', 'widely', ...], ['Chancellor', 'of', 'the', 'Exchequer', ...], ...] >>> for tree in conll2000.chunked_sents()[:2]: ... print(tree) (S (NP Confidence/NN) (PP in/IN) (NP the/DT pound/NN) (VP is/VBZ widely/RB expected/VBN to/TO take/VB) (NP another/DT sharp/JJ dive/NN) if/IN ...) (S Chancellor/NNP (PP of/IN) (NP the/DT Exchequer/NNP) ...) >>> print(conll2002.sents()) [['Sao', 'Paulo', '(', 'Brasil', ')', ',', ...], ['-'], ...] >>> for tree in conll2002.chunked_sents()[:2]: ... print(tree) (S (LOC Sao/NC Paulo/VMI) (/Fpa (LOC Brasil/NC) )/Fpt ...) (S -/Fg) .. note:: Since the CONLL corpora do not contain paragraph break information, these readers do not support the ``para()`` method.) .. warning:: if you call the conll corpora reader methods without any arguments, they will return the contents of the entire corpus, *including* the 'test' portions of the corpus.) 
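The chunk trees can be flattened back to per-token IOB tags for downstream use; a small added sketch using ``tree2conlltags`` from ``nltk.chunk.util``::

    from nltk.chunk.util import tree2conlltags
    from nltk.corpus import conll2000

    # convert the first chunked sentence into (word, pos, IOB-tag) triples
    tree = conll2000.chunked_sents()[0]
    print(tree2conlltags(tree)[:5])

    # collect the noun-phrase chunks as plain word lists
    np_chunks = [[word for word, tag in subtree.leaves()]
                 for subtree in tree.subtrees()
                 if subtree.label() == 'NP']
    print(np_chunks[:3])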
SemCor is a subset of the Brown corpus tagged with WordNet senses and named entities. Both kinds of lexical items include multiword units, which are encoded as chunks (senses and part-of-speech tags pertain to the entire chunk). >>> from nltk.corpus import semcor >>> semcor.words() ['The', 'Fulton', 'County', 'Grand', 'Jury', ...] >>> semcor.chunks() [['The'], ['Fulton', 'County', 'Grand', 'Jury'], ...] >>> semcor.sents() [['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...], ['The', 'jury', 'further', 'said', ...], ...] >>> semcor.chunk_sents() [[['The'], ['Fulton', 'County', 'Grand', 'Jury'], ['said'], ... ['.']], [['The'], ['jury'], ['further'], ['said'], ... ['.']], ...] >>> list(map(str, semcor.tagged_chunks(tag='both')[:3])) ['(DT The)', "(Lemma('group.n.01.group') (NE (NNP Fulton County Grand Jury)))", "(Lemma('state.v.01.say') (VB said))"] >>> [[str(c) for c in s] for s in semcor.tagged_sents(tag='both')[:2]] [['(DT The)', "(Lemma('group.n.01.group') (NE (NNP Fulton County Grand Jury)))", ... '(None .)'], ['(DT The)', ... '(None .)']] The IEER corpus is another chunked corpus. This corpus is unusual in that each corpus item contains multiple documents. (This reflects the fact that each corpus file contains multiple documents.) The IEER corpus defines the `parsed_docs` method, which returns the documents in a given item as `IEERDocument` objects: >>> from nltk.corpus import ieer >>> ieer.fileids() ['APW_19980314', 'APW_19980424', 'APW_19980429', 'NYT_19980315', 'NYT_19980403', 'NYT_19980407'] >>> docs = ieer.parsed_docs('APW_19980314') >>> print(docs[0]) >>> print(docs[0].docno) APW19980314.0391 >>> print(docs[0].doctype) NEWS STORY >>> print(docs[0].date_time) 03/14/1998 10:36:00 >>> print(docs[0].headline) (DOCUMENT Kenyans protest tax hikes) >>> print(docs[0].text) (DOCUMENT (LOCATION NAIROBI) , (LOCATION Kenya) ( (ORGANIZATION AP) ) _ (CARDINAL Thousands) of laborers, ... on (DATE Saturday) ...) Parsed Corpora ============== The Treebank corpora provide a syntactic parse for each sentence. The NLTK data package includes a 10% sample of the Penn Treebank (in ``treebank``), as well as the Sinica Treebank (in ``sinica_treebank``). Reading the Penn Treebank (Wall Street Journal sample): >>> from nltk.corpus import treebank >>> print(treebank.fileids()) ['wsj_0001.mrg', 'wsj_0002.mrg', 'wsj_0003.mrg', 'wsj_0004.mrg', ...] >>> print(treebank.words('wsj_0003.mrg')) ['A', 'form', 'of', 'asbestos', 'once', 'used', ...] >>> print(treebank.tagged_words('wsj_0003.mrg')) [('A', 'DT'), ('form', 'NN'), ('of', 'IN'), ...] >>> print(treebank.parsed_sents('wsj_0003.mrg')[0]) (S (S-TPC-1 (NP-SBJ (NP (NP (DT A) (NN form)) (PP (IN of) (NP (NN asbestos)))) (RRC ...)...)...) ... (VP (VBD reported) (SBAR (-NONE- 0) (S (-NONE- *T*-1)))) (. .)) If you have access to a full installation of the Penn Treebank, NLTK can be configured to load it as well. Download the ``ptb`` package, and in the directory ``nltk_data/corpora/ptb`` place the ``BROWN`` and ``WSJ`` directories of the Treebank installation (symlinks work as well). Then use the ``ptb`` module instead of ``treebank``: >>> from nltk.corpus import ptb >>> print(ptb.fileids()) # doctest: +SKIP ['BROWN/CF/CF01.MRG', 'BROWN/CF/CF02.MRG', 'BROWN/CF/CF03.MRG', 'BROWN/CF/CF04.MRG', ...] >>> print(ptb.words('WSJ/00/WSJ_0003.MRG')) # doctest: +SKIP ['A', 'form', 'of', 'asbestos', 'once', 'used', '*', ...] >>> print(ptb.tagged_words('WSJ/00/WSJ_0003.MRG')) # doctest: +SKIP [('A', 'DT'), ('form', 'NN'), ('of', 'IN'), ...] 
...and so forth, like ``treebank`` but with extended fileids. Categories specified in ``allcats.txt`` can be used to filter by genre; they consist of ``news`` (for WSJ articles) and names of the Brown subcategories (``fiction``, ``humor``, ``romance``, etc.): >>> ptb.categories() # doctest: +SKIP ['adventure', 'belles_lettres', 'fiction', 'humor', 'lore', 'mystery', 'news', 'romance', 'science_fiction'] >>> print(ptb.fileids('news')) # doctest: +SKIP ['WSJ/00/WSJ_0001.MRG', 'WSJ/00/WSJ_0002.MRG', 'WSJ/00/WSJ_0003.MRG', ...] >>> print(ptb.words(categories=['humor','fiction'])) # doctest: +SKIP ['Thirty-three', 'Scotty', 'did', 'not', 'go', 'back', ...] As PropBank and NomBank depend on the (WSJ portion of the) Penn Treebank, the modules ``propbank_ptb`` and ``nombank_ptb`` are provided for access to a full PTB installation. Reading the Sinica Treebank: >>> from nltk.corpus import sinica_treebank >>> print(sinica_treebank.sents()) # doctest: +SKIP [['\xe4\xb8\x80'], ['\xe5\x8f\x8b\xe6\x83\x85'], ...] >>> sinica_treebank.parsed_sents()[25] # doctest: +SKIP Tree('S', [Tree('NP', [Tree('Nba', ['\xe5\x98\x89\xe7\x8f\x8d'])]), Tree('V\xe2\x80\xa7\xe5\x9c\xb0', [Tree('VA11', ['\xe4\xb8\x8d\xe5\x81\x9c']), Tree('DE', ['\xe7\x9a\x84'])]), Tree('VA4', ['\xe5\x93\xad\xe6\xb3\xa3'])]) Reading the CoNLL 2007 Dependency Treebanks: >>> from nltk.corpus import conll2007 >>> conll2007.sents('esp.train')[0] # doctest: +SKIP ['El', 'aumento', 'del', 'índice', 'de', 'desempleo', ...] >>> conll2007.parsed_sents('esp.train')[0] # doctest: +SKIP >>> print(conll2007.parsed_sents('esp.train')[0].tree()) # doctest: +SKIP (fortaleció (aumento El (del (índice (de (desempleo estadounidense))))) hoy considerablemente (al (euro (cotizaba , que (a (15.35 las GMT)) se (en (mercado el (de divisas) (de Fráncfort))) (a 0,9452_dólares) (frente_a , (0,9349_dólares los (de (mañana esta))))))) .) Word Lists and Lexicons ======================= The NLTK data package also includes a number of lexicons and word lists. These are accessed just like text corpora. The following examples illustrate the use of the wordlist corpora: >>> from nltk.corpus import names, stopwords, words >>> words.fileids() ['en', 'en-basic'] >>> words.words('en') ['A', 'a', 'aa', 'aal', 'aalii', 'aam', 'Aani', 'aardvark', 'aardwolf', ...] >>> stopwords.fileids() # doctest: +SKIP ['arabic', 'azerbaijani', 'bengali', 'danish', 'dutch', 'english', 'finnish', 'french', ...] >>> sorted(stopwords.words('portuguese')) ['a', 'ao', 'aos', 'aquela', 'aquelas', 'aquele', 'aqueles', ...] >>> names.fileids() ['female.txt', 'male.txt'] >>> names.words('male.txt') ['Aamir', 'Aaron', 'Abbey', 'Abbie', 'Abbot', 'Abbott', ...] >>> names.words('female.txt') ['Abagael', 'Abagail', 'Abbe', 'Abbey', 'Abbi', 'Abbie', ...] The CMU Pronunciation Dictionary corpus contains pronunciation transcriptions for over 100,000 words. It can be accessed as a list of entries (where each entry consists of a word, an identifier, and a transcription) or as a dictionary from words to lists of transcriptions. Transcriptions are encoded as tuples of phoneme strings. 
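One common use of the pronouncing dictionary (the basic access methods are shown in the examples that follow) is approximating syllable counts by counting phonemes that carry a stress digit; this added sketch assumes the standard ARPAbet convention that vowel phonemes end in 0, 1 or 2::

    from nltk.corpus import cmudict

    pronunciations = cmudict.dict()

    def syllable_count(word):
        """Count vowel phonemes in the first listed pronunciation."""
        phones = pronunciations[word.lower()][0]
        # vowel phonemes carry a trailing stress digit, e.g. 'AH0'
        return sum(1 for phone in phones if phone[-1].isdigit())

    print(syllable_count("natural"))   # 3 (N AE1 CH ER0 AH0 L)
    print(syllable_count("language"))  # 2 (L AE1 NG G W AH0 JH)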
>>> from nltk.corpus import cmudict >>> print(cmudict.entries()[653:659]) [('acetate', ['AE1', 'S', 'AH0', 'T', 'EY2', 'T']), ('acetic', ['AH0', 'S', 'EH1', 'T', 'IH0', 'K']), ('acetic', ['AH0', 'S', 'IY1', 'T', 'IH0', 'K']), ('aceto', ['AA0', 'S', 'EH1', 'T', 'OW0']), ('acetochlor', ['AA0', 'S', 'EH1', 'T', 'OW0', 'K', 'L', 'AO2', 'R']), ('acetone', ['AE1', 'S', 'AH0', 'T', 'OW2', 'N'])] >>> # Load the entire cmudict corpus into a Python dictionary: >>> transcr = cmudict.dict() >>> print([transcr[w][0] for w in 'Natural Language Tool Kit'.lower().split()]) [['N', 'AE1', 'CH', 'ER0', 'AH0', 'L'], ['L', 'AE1', 'NG', 'G', 'W', 'AH0', 'JH'], ['T', 'UW1', 'L'], ['K', 'IH1', 'T']] WordNet ======= Please see the separate WordNet howto. FrameNet ======== Please see the separate FrameNet howto. PropBank ======== Please see the separate PropBank howto. SentiWordNet ============ Please see the separate SentiWordNet howto. Categorized Corpora =================== Several corpora included with NLTK contain documents that have been categorized for topic, genre, polarity, etc. In addition to the standard corpus interface, these corpora provide access to the list of categories and the mapping between the documents and their categories (in both directions). Access the categories using the ``categories()`` method, e.g.: >>> from nltk.corpus import brown, movie_reviews, reuters >>> brown.categories() ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction'] >>> movie_reviews.categories() ['neg', 'pos'] >>> reuters.categories() ['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', 'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn', 'cotton', 'cotton-oil', 'cpi', 'cpu', 'crude', 'dfl', 'dlr', ...] This method has an optional argument that specifies a document or a list of documents, allowing us to map from (one or more) documents to (one or more) categories: >>> brown.categories('ca01') ['news'] >>> brown.categories(['ca01','cb01']) ['editorial', 'news'] >>> reuters.categories('training/9865') ['barley', 'corn', 'grain', 'wheat'] >>> reuters.categories(['training/9865', 'training/9880']) ['barley', 'corn', 'grain', 'money-fx', 'wheat'] We can go back the other way using the optional argument of the ``fileids()`` method: >>> reuters.fileids('barley') ['test/15618', 'test/15649', 'test/15676', 'test/15728', 'test/15871', ...] Both the ``categories()`` and ``fileids()`` methods return a sorted list containing no duplicates. In addition to mapping between categories and documents, these corpora permit direct access to their contents via the categories. Instead of accessing a subset of a corpus by specifying one or more fileids, we can identify one or more categories, e.g.: >>> brown.tagged_words(categories='news') [('The', 'AT'), ('Fulton', 'NP-TL'), ...] >>> brown.sents(categories=['editorial','reviews']) [['Assembly', 'session', 'brought', 'much', 'good'], ['The', 'General', 'Assembly', ',', 'which', 'adjourns', 'today', ',', 'has', 'performed', 'in', 'an', 'atmosphere', 'of', 'crisis', 'and', 'struggle', 'from', 'the', 'day', 'it', 'convened', '.'], ...] Note that it is an error to specify both documents and categories. 
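Category-conditioned access makes it easy to build per-genre statistics; the following added sketch tabulates a few modal verbs across two Brown categories with a conditional frequency distribution::

    import nltk
    from nltk.corpus import brown

    genres = ['news', 'romance']
    modals = ['can', 'could', 'may', 'might', 'must', 'will']

    # count every (genre, word) pair in one pass over the selected categories
    cfd = nltk.ConditionalFreqDist(
        (genre, word.lower())
        for genre in genres
        for word in brown.words(categories=genre)
    )

    for genre in genres:
        print(genre, [(m, cfd[genre][m]) for m in modals])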
In the context of a text categorization system, we can easily test if the category assigned to a document is correct as follows: >>> def classify(doc): return 'news' # Trivial classifier >>> doc = 'ca01' >>> classify(doc) in brown.categories(doc) True Other Corpora ============= comparative_sentences --------------------- A list of sentences from various sources, especially reviews and articles. Each line contains one sentence; sentences were separated by using a sentence tokenizer. Comparative sentences have been annotated with their type, entities, features and keywords. >>> from nltk.corpus import comparative_sentences >>> comparison = comparative_sentences.comparisons()[0] >>> comparison.text ['its', 'fast-forward', 'and', 'rewind', 'work', 'much', 'more', 'smoothly', 'and', 'consistently', 'than', 'those', 'of', 'other', 'models', 'i', "'ve", 'had', '.'] >>> comparison.entity_2 'models' >>> (comparison.feature, comparison.keyword) ('rewind', 'more') >>> len(comparative_sentences.comparisons()) 853 opinion_lexicon --------------- A list of positive and negative opinion words or sentiment words for English. >>> from nltk.corpus import opinion_lexicon >>> opinion_lexicon.words()[:4] ['2-faced', '2-faces', 'abnormal', 'abolish'] The OpinionLexiconCorpusReader also provides shortcuts to retrieve positive/negative words: >>> opinion_lexicon.negative()[:4] ['2-faced', '2-faces', 'abnormal', 'abolish'] Note that words from `words()` method in opinion_lexicon are sorted by file id, not alphabetically: >>> opinion_lexicon.words()[0:10] ['2-faced', '2-faces', 'abnormal', 'abolish', 'abominable', 'abominably', 'abominate', 'abomination', 'abort', 'aborted'] >>> sorted(opinion_lexicon.words())[0:10] ['2-faced', '2-faces', 'a+', 'abnormal', 'abolish', 'abominable', 'abominably', 'abominate', 'abomination', 'abort'] ppattach -------- The Prepositional Phrase Attachment corpus is a corpus of prepositional phrase attachment decisions. Each instance in the corpus is encoded as a ``PPAttachment`` object: >>> from nltk.corpus import ppattach >>> ppattach.attachments('training') [PPAttachment(sent='0', verb='join', noun1='board', prep='as', noun2='director', attachment='V'), PPAttachment(sent='1', verb='is', noun1='chairman', prep='of', noun2='N.V.', attachment='N'), ...] >>> inst = ppattach.attachments('training')[0] >>> (inst.sent, inst.verb, inst.noun1, inst.prep, inst.noun2) ('0', 'join', 'board', 'as', 'director') >>> inst.attachment 'V' product_reviews_1 and product_reviews_2 --------------------------------------- These two datasets respectively contain annotated customer reviews of 5 and 9 products from amazon.com. >>> from nltk.corpus import product_reviews_1 >>> camera_reviews = product_reviews_1.reviews('Canon_G3.txt') >>> review = camera_reviews[0] >>> review.sents()[0] ['i', 'recently', 'purchased', 'the', 'canon', 'powershot', 'g3', 'and', 'am', 'extremely', 'satisfied', 'with', 'the', 'purchase', '.'] >>> review.features() [('canon powershot g3', '+3'), ('use', '+2'), ('picture', '+2'), ('picture quality', '+1'), ('picture quality', '+1'), ('camera', '+2'), ('use', '+2'), ('feature', '+1'), ('picture quality', '+3'), ('use', '+1'), ('option', '+1')] It is also possible to reach the same information directly from the stream: >>> product_reviews_1.features('Canon_G3.txt') [('canon powershot g3', '+3'), ('use', '+2'), ...] 
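Building on the ``features()`` stream just shown, an added sketch that averages the score for every feature at once (assuming the scores are simple signed integer strings such as ``'+2'``, as in the examples above)::

    from collections import defaultdict

    from nltk.corpus import product_reviews_1

    totals = defaultdict(lambda: [0, 0])  # feature -> [score sum, mention count]
    for feature, score in product_reviews_1.features('Canon_G3.txt'):
        totals[feature][0] += int(score)  # int() accepts '+3', '-1', ...
        totals[feature][1] += 1

    means = {feature: s / n for feature, (s, n) in totals.items()}
    print(sorted(means.items(), key=lambda item: -item[1])[:5])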
We can compute stats for specific product features: >>> n_reviews = len([(feat,score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture']) >>> tot = sum([int(score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture']) >>> mean = tot / n_reviews >>> print(n_reviews, tot, mean) 15 24 1.6 pros_cons --------- A list of pros/cons sentences for determining context (aspect) dependent sentiment words, which are then applied to sentiment analysis of comparative sentences. >>> from nltk.corpus import pros_cons >>> pros_cons.sents(categories='Cons') [['East', 'batteries', '!', 'On', '-', 'off', 'switch', 'too', 'easy', 'to', 'maneuver', '.'], ['Eats', '...', 'no', ',', 'GULPS', 'batteries'], ...] >>> pros_cons.words('IntegratedPros.txt') ['Easy', 'to', 'use', ',', 'economical', '!', ...] semcor ------ The Brown Corpus, annotated with WordNet senses. >>> from nltk.corpus import semcor >>> semcor.words('brown2/tagfiles/br-n12.xml') ['When', 'several', 'minutes', 'had', 'passed', ...] senseval -------- The Senseval 2 corpus is a word sense disambiguation corpus. Each item in the corpus corresponds to a single ambiguous word. For each of these words, the corpus contains a list of instances, corresponding to occurrences of that word. Each instance provides the word; a list of word senses that apply to the word occurrence; and the word's context. >>> from nltk.corpus import senseval >>> senseval.fileids() ['hard.pos', 'interest.pos', 'line.pos', 'serve.pos'] >>> senseval.instances('hard.pos') ... [SensevalInstance(word='hard-a', position=20, context=[('``', '``'), ('he', 'PRP'), ...('hard', 'JJ'), ...], senses=('HARD1',)), SensevalInstance(word='hard-a', position=10, context=[('clever', 'NNP'), ...('hard', 'JJ'), ('time', 'NN'), ...], senses=('HARD1',)), ...] The following code looks at instances of the word 'interest', and displays their local context (2 words on each side) and word sense(s): >>> for inst in senseval.instances('interest.pos')[:10]: ... p = inst.position ... left = ' '.join(w for (w,t) in inst.context[p-2:p]) ... word = ' '.join(w for (w,t) in inst.context[p:p+1]) ... right = ' '.join(w for (w,t) in inst.context[p+1:p+3]) ... senses = ' '.join(inst.senses) ... print('%20s |%10s | %-15s -> %s' % (left, word, right, senses)) declines in | interest | rates . -> interest_6 indicate declining | interest | rates because -> interest_6 in short-term | interest | rates . -> interest_6 4 % | interest | in this -> interest_5 company with | interests | in the -> interest_5 , plus | interest | . -> interest_6 set the | interest | rate on -> interest_6 's own | interest | , prompted -> interest_4 principal and | interest | is the -> interest_6 increase its | interest | to 70 -> interest_5 sentence_polarity ----------------- The Sentence Polarity dataset contains 5331 positive and 5331 negative processed sentences. >>> from nltk.corpus import sentence_polarity >>> sentence_polarity.sents() [['simplistic', ',', 'silly', 'and', 'tedious', '.'], ["it's", 'so', 'laddish', 'and', 'juvenile', ',', 'only', 'teenage', 'boys', 'could', 'possibly', 'find', 'it', 'funny', '.'], ...] >>> sentence_polarity.categories() ['neg', 'pos'] >>> sentence_polarity.sents()[1] ["it's", 'so', 'laddish', 'and', 'juvenile', ',', 'only', 'teenage', 'boys', 'could', 'possibly', 'find', 'it', 'funny', '.'] shakespeare ----------- The Shakespeare corpus contains a set of Shakespeare plays, formatted as XML files. 
These corpora are returned as ElementTree objects: >>> from nltk.corpus import shakespeare >>> from xml.etree import ElementTree >>> shakespeare.fileids() ['a_and_c.xml', 'dream.xml', 'hamlet.xml', 'j_caesar.xml', ...] >>> play = shakespeare.xml('dream.xml') >>> print(play) >>> print('%s: %s' % (play[0].tag, play[0].text)) TITLE: A Midsummer Night's Dream >>> personae = [persona.text for persona in ... play.findall('PERSONAE/PERSONA')] >>> print(personae) ['THESEUS, Duke of Athens.', 'EGEUS, father to Hermia.', ...] >>> # Find and print speakers not listed as personae >>> names = [persona.split(',')[0] for persona in personae] >>> speakers = set(speaker.text for speaker in ... play.findall('*/*/*/SPEAKER')) >>> print(sorted(speakers.difference(names))) ['ALL', 'COBWEB', 'DEMETRIUS', 'Fairy', 'HERNIA', 'LYSANDER', 'Lion', 'MOTH', 'MUSTARDSEED', 'Moonshine', 'PEASEBLOSSOM', 'Prologue', 'Pyramus', 'Thisbe', 'Wall'] subjectivity ----------- The Subjectivity Dataset contains 5000 subjective and 5000 objective processed sentences. >>> from nltk.corpus import subjectivity >>> subjectivity.categories() ['obj', 'subj'] >>> subjectivity.sents()[23] ['television', 'made', 'him', 'famous', ',', 'but', 'his', 'biggest', 'hits', 'happened', 'off', 'screen', '.'] >>> subjectivity.words(categories='subj') ['smart', 'and', 'alert', ',', 'thirteen', ...] toolbox ------- The Toolbox corpus distributed with NLTK contains a sample lexicon and several sample texts from the Rotokas language. The Toolbox corpus reader returns Toolbox files as XML ElementTree objects. The following example loads the Rotokas dictionary, and figures out the distribution of part-of-speech tags for reduplicated words. .. doctest: +SKIP >>> from nltk.corpus import toolbox >>> from nltk.probability import FreqDist >>> from xml.etree import ElementTree >>> import re >>> rotokas = toolbox.xml('rotokas.dic') >>> redup_pos_freqdist = FreqDist() >>> # Note: we skip over the first record, which is actually >>> # the header. >>> for record in rotokas[1:]: ... lexeme = record.find('lx').text ... if re.match(r'(.*)\1$', lexeme): ... redup_pos_freqdist[record.find('ps').text] += 1 >>> for item, count in redup_pos_freqdist.most_common(): ... print(item, count) V 41 N 14 ??? 4 This example displays some records from a Rotokas text: .. doctest: +SKIP >>> river = toolbox.xml('rotokas/river.txt', key='ref') >>> for record in river.findall('record')[:3]: ... for piece in record: ... if len(piece.text) > 60: ... print('%-6s %s...' % (piece.tag, piece.text[:57])) ... else: ... print('%-6s %s' % (piece.tag, piece.text)) ref Paragraph 1 t ``Viapau oisio ra ovaupasi ... m viapau oisio ra ovau -pa -si ... g NEG this way/like this and forget -PROG -2/3.DL... p NEG ??? CONJ V.I -SUFF.V.3 -SUFF.V... f ``No ken lus tingting wanema samting papa i bin tok,'' Na... fe ``Don't forget what Dad said,'' yelled Naomi. ref 2 t Osa Ira ora Reviti viapau uvupasiva. m osa Ira ora Reviti viapau uvu -pa -si ... g as/like name and name NEG hear/smell -PROG -2/3... p CONJ N.PN CONJ N.PN NEG V.T -SUFF.V.3 -SUF... f Tasol Ila na David no bin harim toktok. fe But Ila and David took no notice. ref 3 t Ikaupaoro rokosiva ... m ikau -pa -oro roko -si -va ... g run/hurry -PROG -SIM go down -2/3.DL.M -RP ... p V.T -SUFF.V.3 -SUFF.V.4 ADV -SUFF.V.4 -SUFF.VT.... f Tupela i bin hariap i go long wara . fe They raced to the river. timit ----- The NLTK data package includes a fragment of the TIMIT Acoustic-Phonetic Continuous Speech Corpus. 
This corpus is broken down into small speech samples, each of which is available as a wave file, a phonetic transcription, and a tokenized word list. >>> from nltk.corpus import timit >>> print(timit.utteranceids()) ['dr1-fvmh0/sa1', 'dr1-fvmh0/sa2', 'dr1-fvmh0/si1466', 'dr1-fvmh0/si2096', 'dr1-fvmh0/si836', 'dr1-fvmh0/sx116', 'dr1-fvmh0/sx206', 'dr1-fvmh0/sx26', 'dr1-fvmh0/sx296', ...] >>> item = timit.utteranceids()[5] >>> print(timit.phones(item)) ['h#', 'k', 'l', 'ae', 's', 'pcl', 'p', 'dh', 'ax', 's', 'kcl', 'k', 'r', 'ux', 'ix', 'nx', 'y', 'ax', 'l', 'eh', 'f', 'tcl', 't', 'hh', 'ae', 'n', 'dcl', 'd', 'h#'] >>> print(timit.words(item)) ['clasp', 'the', 'screw', 'in', 'your', 'left', 'hand'] >>> timit.play(item) # doctest: +SKIP The corpus reader can combine the word segmentation information with the phonemes to produce a single tree structure: >>> for tree in timit.phone_trees(item): ... print(tree) (S h# (clasp k l ae s pcl p) (the dh ax) (screw s kcl k r ux) (in ix nx) (your y ax) (left l eh f tcl t) (hand hh ae n dcl d) h#) The start time and stop time of each phoneme, word, and sentence are also available: >>> print(timit.phone_times(item)) [('h#', 0, 2190), ('k', 2190, 3430), ('l', 3430, 4326), ...] >>> print(timit.word_times(item)) [('clasp', 2190, 8804), ('the', 8804, 9734), ...] >>> print(timit.sent_times(item)) [('Clasp the screw in your left hand.', 0, 32154)] We can use these times to play selected pieces of a speech sample: >>> timit.play(item, 2190, 8804) # 'clasp' # doctest: +SKIP The corpus reader can also be queried for information about the speaker and sentence identifier for a given speech sample: >>> print(timit.spkrid(item)) dr1-fvmh0 >>> print(timit.sentid(item)) sx116 >>> print(timit.spkrinfo(timit.spkrid(item))) SpeakerInfo(id='VMH0', sex='F', dr='1', use='TRN', recdate='03/11/86', birthdate='01/08/60', ht='5\'05"', race='WHT', edu='BS', comments='BEST NEW ENGLAND ACCENT SO FAR') >>> # List the speech samples from the same speaker: >>> timit.utteranceids(spkrid=timit.spkrid(item)) ['dr1-fvmh0/sa1', 'dr1-fvmh0/sa2', 'dr1-fvmh0/si1466', ...] twitter_samples --------------- Twitter is well-known microblog service that allows public data to be collected via APIs. NLTK's twitter corpus currently contains a sample of 20k Tweets retrieved from the Twitter Streaming API. >>> from nltk.corpus import twitter_samples >>> twitter_samples.fileids() ['negative_tweets.json', 'positive_tweets.json', 'tweets.20150430-223406.json'] We follow standard practice in storing full Tweets as line-separated JSON. These data structures can be accessed via `tweets.docs()`. However, in general it is more practical to focus just on the text field of the Tweets, which are accessed via the `strings()` method. >>> twitter_samples.strings('tweets.20150430-223406.json')[:5] ['RT @KirkKus: Indirect cost of the UK being in the EU is estimated to be costing Britain \xa3170 billion per year! #BetterOffOut #UKIP', ...] The default tokenizer for Tweets is specialised for 'casual' text, and the `tokenized()` method returns a list of lists of tokens. >>> twitter_samples.tokenized('tweets.20150430-223406.json')[:5] [['RT', '@KirkKus', ':', 'Indirect', 'cost', 'of', 'the', 'UK', 'being', 'in', ...], ['VIDEO', ':', 'Sturgeon', 'on', 'post-election', 'deals', 'http://t.co/BTJwrpbmOY'], ...] rte --- The RTE (Recognizing Textual Entailment) corpus was derived from the RTE1, RTE2 and RTE3 datasets (dev and test data), and consists of a list of XML-formatted 'text'/'hypothesis' pairs. 
>>> from nltk.corpus import rte >>> print(rte.fileids()) ['rte1_dev.xml', 'rte1_test.xml', 'rte2_dev.xml', ..., 'rte3_test.xml'] >>> rtepairs = rte.pairs(['rte2_test.xml', 'rte3_test.xml']) >>> print(rtepairs) [, , , ...] In the gold standard test sets, each pair is labeled according to whether or not the text 'entails' the hypothesis; the entailment value is mapped to an integer 1 (True) or 0 (False). >>> rtepairs[5] >>> rtepairs[5].text 'His wife Strida won a seat in parliament after forging an alliance with the main anti-Syrian coalition in the recent election.' >>> rtepairs[5].hyp 'Strida elected to parliament.' >>> rtepairs[5].value 1 The RTE corpus also supports an ``xml()`` method which produces ElementTrees. >>> xmltree = rte.xml('rte3_dev.xml') >>> xmltree # doctest: +SKIP >>> xmltree[7].findtext('t') "Mrs. Bush's approval ratings have remained very high, above 80%, even as her husband's have recently dropped below 50%." verbnet ------- The VerbNet corpus is a lexicon that divides verbs into classes, based on their syntax-semantics linking behavior. The basic elements in the lexicon are verb lemmas, such as 'abandon' and 'accept', and verb classes, which have identifiers such as 'remove-10.1' and 'admire-31.2-1'. These class identifiers consist of a representative verb selected from the class, followed by a numerical identifier. The list of verb lemmas, and the list of class identifiers, can be retrieved with the following methods: >>> from nltk.corpus import verbnet >>> verbnet.lemmas()[20:25] ['accelerate', 'accept', 'acclaim', 'accompany', 'accrue'] >>> verbnet.classids()[:5] ['accompany-51.7', 'admire-31.2', 'admire-31.2-1', 'admit-65', 'adopt-93'] The `classids()` method may also be used to retrieve the classes that a given lemma belongs to: >>> verbnet.classids('accept') ['approve-77', 'characterize-29.2-1-1', 'obtain-13.5.2'] The `classids()` method may additionally be used to retrieve all classes within verbnet if nothing is passed: >>> verbnet.classids() ['accompany-51.7', 'admire-31.2', 'admire-31.2-1', 'admit-65', 'adopt-93', 'advise-37.9', 'advise-37.9-1', 'allow-64', 'amalgamate-22.2', 'amalgamate-22.2-1', 'amalgamate-22.2-1-1', 'amalgamate-22.2-2', 'amalgamate-22.2-2-1', 'amalgamate-22.2-3', 'amalgamate-22.2-3-1', 'amalgamate-22.2-3-1-1', 'amalgamate-22.2-3-2', 'amuse-31.1', 'animal_sounds-38', 'appeal-31.4', 'appeal-31.4-1', 'appeal-31.4-2', 'appeal-31.4-3', 'appear-48.1.1', 'appoint-29.1', 'approve-77', 'assessment-34', 'assuming_position-50', 'avoid-52', 'banish-10.2', 'battle-36.4', 'battle-36.4-1', 'begin-55.1', 'begin-55.1-1', 'being_dressed-41.3.3', 'bend-45.2', 'berry-13.7', 'bill-54.5', 'body_internal_motion-49', 'body_internal_states-40.6', 'braid-41.2.2', 'break-45.1', 'breathe-40.1.2', 'breathe-40.1.2-1', 'bring-11.3', 'bring-11.3-1', 'build-26.1', 'build-26.1-1', 'bulge-47.5.3', 'bump-18.4', 'bump-18.4-1', 'butter-9.9', 'calibratable_cos-45.6', 'calibratable_cos-45.6-1', 'calve-28', 'captain-29.8', 'captain-29.8-1', 'captain-29.8-1-1', 'care-88', 'care-88-1', 'carry-11.4', 'carry-11.4-1', 'carry-11.4-1-1', 'carve-21.2', 'carve-21.2-1', 'carve-21.2-2', 'change_bodily_state-40.8.4', 'characterize-29.2', 'characterize-29.2-1', 'characterize-29.2-1-1', 'characterize-29.2-1-2', 'chase-51.6', 'cheat-10.6', 'cheat-10.6-1', 'cheat-10.6-1-1', 'chew-39.2', 'chew-39.2-1', 'chew-39.2-2', 'chit_chat-37.6', 'clear-10.3', 'clear-10.3-1', 'cling-22.5', 'coil-9.6', 'coil-9.6-1', 'coloring-24', 'complain-37.8', 'complete-55.2', 'concealment-16', 'concealment-16-1', 
'confess-37.10', 'confine-92', 'confine-92-1', 'conjecture-29.5', 'conjecture-29.5-1', 'conjecture-29.5-2', 'consider-29.9', 'consider-29.9-1', 'consider-29.9-1-1', 'consider-29.9-1-1-1', 'consider-29.9-2', 'conspire-71', 'consume-66', 'consume-66-1', 'contiguous_location-47.8', 'contiguous_location-47.8-1', 'contiguous_location-47.8-2', 'continue-55.3', 'contribute-13.2', 'contribute-13.2-1', 'contribute-13.2-1-1', 'contribute-13.2-1-1-1', 'contribute-13.2-2', 'contribute-13.2-2-1', 'convert-26.6.2', 'convert-26.6.2-1', 'cooking-45.3', 'cooperate-73', 'cooperate-73-1', 'cooperate-73-2', 'cooperate-73-3', 'cope-83', 'cope-83-1', 'cope-83-1-1', 'correlate-86', 'correspond-36.1', 'correspond-36.1-1', 'correspond-36.1-1-1', 'cost-54.2', 'crane-40.3.2', 'create-26.4', 'create-26.4-1', 'curtsey-40.3.3', 'cut-21.1', 'cut-21.1-1', 'debone-10.8', 'declare-29.4', 'declare-29.4-1', 'declare-29.4-1-1', 'declare-29.4-1-1-1', 'declare-29.4-1-1-2', 'declare-29.4-1-1-3', 'declare-29.4-2', 'dedicate-79', 'defend-85', 'destroy-44', 'devour-39.4', 'devour-39.4-1', 'devour-39.4-2', 'differ-23.4', 'dine-39.5', 'disappearance-48.2', 'disassemble-23.3', 'discover-84', 'discover-84-1', 'discover-84-1-1', 'dress-41.1.1', 'dressing_well-41.3.2', 'drive-11.5', 'drive-11.5-1', 'dub-29.3', 'dub-29.3-1', 'eat-39.1', 'eat-39.1-1', 'eat-39.1-2', 'enforce-63', 'engender-27', 'entity_specific_cos-45.5', 'entity_specific_modes_being-47.2', 'equip-13.4.2', 'equip-13.4.2-1', 'equip-13.4.2-1-1', 'escape-51.1', 'escape-51.1-1', 'escape-51.1-2', 'escape-51.1-2-1', 'exceed-90', 'exchange-13.6', 'exchange-13.6-1', 'exchange-13.6-1-1', 'exhale-40.1.3', 'exhale-40.1.3-1', 'exhale-40.1.3-2', 'exist-47.1', 'exist-47.1-1', 'exist-47.1-1-1', 'feeding-39.7', 'ferret-35.6', 'fill-9.8', 'fill-9.8-1', 'fit-54.3', 'flinch-40.5', 'floss-41.2.1', 'focus-87', 'forbid-67', 'force-59', 'force-59-1', 'free-80', 'free-80-1', 'fulfilling-13.4.1', 'fulfilling-13.4.1-1', 'fulfilling-13.4.1-2', 'funnel-9.3', 'funnel-9.3-1', 'funnel-9.3-2', 'funnel-9.3-2-1', 'future_having-13.3', 'get-13.5.1', 'get-13.5.1-1', 'give-13.1', 'give-13.1-1', 'gobble-39.3', 'gobble-39.3-1', 'gobble-39.3-2', 'gorge-39.6', 'groom-41.1.2', 'grow-26.2', 'help-72', 'help-72-1', 'herd-47.5.2', 'hiccup-40.1.1', 'hit-18.1', 'hit-18.1-1', 'hold-15.1', 'hold-15.1-1', 'hunt-35.1', 'hurt-40.8.3', 'hurt-40.8.3-1', 'hurt-40.8.3-1-1', 'hurt-40.8.3-2', 'illustrate-25.3', 'image_impression-25.1', 'indicate-78', 'indicate-78-1', 'indicate-78-1-1', 'inquire-37.1.2', 'instr_communication-37.4', 'investigate-35.4', 'judgement-33', 'keep-15.2', 'knead-26.5', 'learn-14', 'learn-14-1', 'learn-14-2', 'learn-14-2-1', 'leave-51.2', 'leave-51.2-1', 'lecture-37.11', 'lecture-37.11-1', 'lecture-37.11-1-1', 'lecture-37.11-2', 'light_emission-43.1', 'limit-76', 'linger-53.1', 'linger-53.1-1', 'lodge-46', 'long-32.2', 'long-32.2-1', 'long-32.2-2', 'manner_speaking-37.3', 'marry-36.2', 'marvel-31.3', 'marvel-31.3-1', 'marvel-31.3-2', 'marvel-31.3-3', 'marvel-31.3-4', 'marvel-31.3-5', 'marvel-31.3-6', 'marvel-31.3-7', 'marvel-31.3-8', 'marvel-31.3-9', 'masquerade-29.6', 'masquerade-29.6-1', 'masquerade-29.6-2', 'matter-91', 'meander-47.7', 'meet-36.3', 'meet-36.3-1', 'meet-36.3-2', 'mine-10.9', 'mix-22.1', 'mix-22.1-1', 'mix-22.1-1-1', 'mix-22.1-2', 'mix-22.1-2-1', 'modes_of_being_with_motion-47.3', 'murder-42.1', 'murder-42.1-1', 'neglect-75', 'neglect-75-1', 'neglect-75-1-1', 'neglect-75-2', 'nonvehicle-51.4.2', 'nonverbal_expression-40.2', 'obtain-13.5.2', 'obtain-13.5.2-1', 'occurrence-48.3', 'order-60', 
'order-60-1', 'orphan-29.7', 'other_cos-45.4', 'pain-40.8.1', 'pay-68', 'peer-30.3', 'pelt-17.2', 'performance-26.7', 'performance-26.7-1', 'performance-26.7-1-1', 'performance-26.7-2', 'performance-26.7-2-1', 'pit-10.7', 'pocket-9.10', 'pocket-9.10-1', 'poison-42.2', 'poke-19', 'pour-9.5', 'preparing-26.3', 'preparing-26.3-1', 'preparing-26.3-2', 'price-54.4', 'push-12', 'push-12-1', 'push-12-1-1', 'put-9.1', 'put-9.1-1', 'put-9.1-2', 'put_direction-9.4', 'put_spatial-9.2', 'put_spatial-9.2-1', 'reach-51.8', 'reflexive_appearance-48.1.2', 'refrain-69', 'register-54.1', 'rely-70', 'remove-10.1', 'risk-94', 'risk-94-1', 'roll-51.3.1', 'rummage-35.5', 'run-51.3.2', 'rush-53.2', 'say-37.7', 'say-37.7-1', 'say-37.7-1-1', 'say-37.7-2', 'scribble-25.2', 'search-35.2', 'see-30.1', 'see-30.1-1', 'see-30.1-1-1', 'send-11.1', 'send-11.1-1', 'separate-23.1', 'separate-23.1-1', 'separate-23.1-2', 'settle-89', 'shake-22.3', 'shake-22.3-1', 'shake-22.3-1-1', 'shake-22.3-2', 'shake-22.3-2-1', 'sight-30.2', 'simple_dressing-41.3.1', 'slide-11.2', 'slide-11.2-1-1', 'smell_emission-43.3', 'snooze-40.4', 'sound_emission-43.2', 'sound_existence-47.4', 'spank-18.3', 'spatial_configuration-47.6', 'split-23.2', 'spray-9.7', 'spray-9.7-1', 'spray-9.7-1-1', 'spray-9.7-2', 'stalk-35.3', 'steal-10.5', 'stimulus_subject-30.4', 'stop-55.4', 'stop-55.4-1', 'substance_emission-43.4', 'succeed-74', 'succeed-74-1', 'succeed-74-1-1', 'succeed-74-2', 'suffocate-40.7', 'suspect-81', 'swarm-47.5.1', 'swarm-47.5.1-1', 'swarm-47.5.1-2', 'swarm-47.5.1-2-1', 'swat-18.2', 'talk-37.5', 'tape-22.4', 'tape-22.4-1', 'tell-37.2', 'throw-17.1', 'throw-17.1-1', 'throw-17.1-1-1', 'tingle-40.8.2', 'touch-20', 'touch-20-1', 'transcribe-25.4', 'transfer_mesg-37.1.1', 'transfer_mesg-37.1.1-1', 'transfer_mesg-37.1.1-1-1', 'try-61', 'turn-26.6.1', 'turn-26.6.1-1', 'urge-58', 'vehicle-51.4.1', 'vehicle-51.4.1-1', 'waltz-51.5', 'want-32.1', 'want-32.1-1', 'want-32.1-1-1', 'weather-57', 'weekend-56', 'wink-40.3.1', 'wink-40.3.1-1', 'wipe_instr-10.4.2', 'wipe_instr-10.4.2-1', 'wipe_manner-10.4.1', 'wipe_manner-10.4.1-1', 'wish-62', 'withdraw-82', 'withdraw-82-1', 'withdraw-82-2', 'withdraw-82-3'] The primary object in the lexicon is a class record, which is stored as an ElementTree xml object. The class record for a given class identifier is returned by the `vnclass()` method: >>> verbnet.vnclass('remove-10.1') The `vnclass()` method also accepts "short" identifiers, such as '10.1': >>> verbnet.vnclass('10.1') See the Verbnet documentation, or the Verbnet files, for information about the structure of this xml. As an example, we can retrieve a list of thematic roles for a given Verbnet class: >>> vn_31_2 = verbnet.vnclass('admire-31.2') >>> for themrole in vn_31_2.findall('THEMROLES/THEMROLE'): ... print(themrole.attrib['type'], end=' ') ... for selrestr in themrole.findall('SELRESTRS/SELRESTR'): ... print('[%(Value)s%(type)s]' % selrestr.attrib, end=' ') ... print() Theme Experiencer [+animate] Predicate The Verbnet corpus also provides a variety of pretty printing functions that can be used to display the xml contents in a more concise form. The simplest such method is `pprint()`: >>> print(verbnet.pprint('57')) weather-57 Subclasses: (none) Members: blow clear drizzle fog freeze gust hail howl lightning mist mizzle pelt pour precipitate rain roar shower sleet snow spit spot sprinkle storm swelter teem thaw thunder Thematic roles: * Theme[+concrete +force] Frames: Intransitive (Expletive Subject) Example: It's raining. 
Syntax: LEX[it] LEX[[+be]] VERB Semantics: * weather(during(E), Weather_type, ?Theme) NP (Expletive Subject, Theme Object) Example: It's raining cats and dogs. Syntax: LEX[it] LEX[[+be]] VERB NP[Theme] Semantics: * weather(during(E), Weather_type, Theme) PP (Expletive Subject, Theme-PP) Example: It was pelting with rain. Syntax: LEX[it[+be]] VERB PREP[with] NP[Theme] Semantics: * weather(during(E), Weather_type, Theme) Verbnet gives us frames that link the syntax and semantics using an example. These frames are part of the corpus and we can use `frames()` to get a frame for a given verbnet class. >>> frame = verbnet.frames('57') >>> frame == [{'semantics': [{'arguments': [{'value': 'during(E)', 'type': 'Event'}, {'value': 'Weather_type', 'type': 'VerbSpecific'}, {'value': '?Theme', 'type': 'ThemRole'}], 'predicate_value': 'weather'}], 'example': "It's raining.", 'syntax': [{'pos_tag': 'LEX', 'modifiers': {'value': 'it', 'synrestrs': [], 'selrestrs': []}}, {'pos_tag': 'LEX', 'modifiers': {'value': '[+be]', 'synrestrs': [], 'selrestrs': []}}, {'pos_tag': 'VERB', 'modifiers': {'value': '', 'synrestrs': [], 'selrestrs': []}}], 'description': {'primary': 'Intransitive', 'secondary': 'Expletive Subject'}}, {'semantics': [{'arguments': [{'value': 'during(E)', 'type': 'Event'}, {'value': 'Weather_type', 'type': 'VerbSpecific'}, {'value': 'Theme', 'type': 'ThemRole'}], 'predicate_value': 'weather'}], 'example': "It's raining cats and dogs.", 'syntax': [{'pos_tag': 'LEX', 'modifiers': {'value': 'it', 'synrestrs': [], 'selrestrs': []}}, {'pos_tag': 'LEX', 'modifiers': {'value': '[+be]', 'synrestrs': [], 'selrestrs': []}}, {'pos_tag': 'VERB', 'modifiers': {'value': '', 'synrestrs': [], 'selrestrs': []}}, {'pos_tag': 'NP', 'modifiers': {'value': 'Theme', 'synrestrs': [], 'selrestrs': []}}], 'description': {'primary': 'NP', 'secondary': 'Expletive Subject, Theme Object'}}, {'semantics': [{'arguments': [{'value': 'during(E)', 'type': 'Event'}, {'value': 'Weather_type', 'type': 'VerbSpecific'}, {'value': 'Theme', 'type': 'ThemRole'}], 'predicate_value': 'weather'}], 'example': 'It was pelting with rain.', 'syntax': [{'pos_tag': 'LEX', 'modifiers': {'value': 'it[+be]', 'synrestrs': [], 'selrestrs': []}}, {'pos_tag': 'VERB', 'modifiers': {'value': '', 'synrestrs': [], 'selrestrs': []}}, {'pos_tag': 'PREP', 'modifiers': {'value': 'with', 'synrestrs': [], 'selrestrs': []}}, {'pos_tag': 'NP', 'modifiers': {'value': 'Theme', 'synrestrs': [], 'selrestrs': []}}], 'description': {'primary': 'PP', 'secondary': 'Expletive Subject, Theme-PP'}}] True Verbnet corpus lets us access thematic roles individually using `themroles()`. >>> themroles = verbnet.themroles('57') >>> themroles == [{'modifiers': [{'type': 'concrete', 'value': '+'}, {'type': 'force', 'value': '+'}], 'type': 'Theme'}] True Verbnet classes may also have subclasses sharing similar syntactic and semantic properties while having differences with the superclass. The Verbnet corpus allows us to access these subclasses using `subclasses()`. >>> print(verbnet.subclasses('9.1')) #Testing for 9.1 since '57' does not have subclasses ['put-9.1-1', 'put-9.1-2'] nps_chat -------- The NPS Chat Corpus, Release 1.0 consists of over 10,000 posts in age-specific chat rooms, which have been anonymized, POS-tagged and dialogue-act tagged. >>> print(nltk.corpus.nps_chat.words()) ['now', 'im', 'left', 'with', 'this', 'gay', ...] >>> print(nltk.corpus.nps_chat.tagged_words()) [('now', 'RB'), ('im', 'PRP'), ('left', 'VBD'), ...] 
>>> print(nltk.corpus.nps_chat.tagged_posts()) [[('now', 'RB'), ('im', 'PRP'), ('left', 'VBD'), ('with', 'IN'), ('this', 'DT'), ('gay', 'JJ'), ('name', 'NN')], [(':P', 'UH')], ...] We can access the XML elements corresponding to individual posts. These elements have ``class`` and ``user`` attributes that we can access using ``p.attrib['class']`` and ``p.attrib['user']``. They also have text content, accessed using ``p.text``. >>> print(nltk.corpus.nps_chat.xml_posts()) [, , ...] >>> posts = nltk.corpus.nps_chat.xml_posts() >>> sorted(nltk.FreqDist(p.attrib['class'] for p in posts).keys()) ['Accept', 'Bye', 'Clarify', 'Continuer', 'Emotion', 'Emphasis', 'Greet', 'Other', 'Reject', 'Statement', 'System', 'nAnswer', 'whQuestion', 'yAnswer', 'ynQuestion'] >>> posts[0].text 'now im left with this gay name' In addition to the above methods for accessing tagged text, we can navigate the XML structure directly, as follows: >>> tokens = posts[0].findall('terminals/t') >>> [t.attrib['pos'] + "/" + t.attrib['word'] for t in tokens] ['RB/now', 'PRP/im', 'VBD/left', 'IN/with', 'DT/this', 'JJ/gay', 'NN/name'] multext_east ------------ The Multext-East Corpus consists of POS-tagged versions of George Orwell's book 1984 in 12 languages: English, Czech, Hungarian, Macedonian, Slovenian, Serbian, Slovak, Romanian, Estonian, Farsi, Bulgarian and Polish. The corpus can be accessed using the usual methods for tagged corpora. The tagset can be transformed from the Multext-East specific MSD tags to the Universal tagset using the "tagset" parameter of all functions returning tagged parts of the corpus. >>> print(nltk.corpus.multext_east.words("oana-en.xml")) ['It', 'was', 'a', 'bright', ...] >>> print(nltk.corpus.multext_east.tagged_words("oana-en.xml")) [('It', '#Pp3ns'), ('was', '#Vmis3s'), ('a', '#Di'), ...] >>> print(nltk.corpus.multext_east.tagged_sents("oana-en.xml", "universal")) [[('It', 'PRON'), ('was', 'VERB'), ('a', 'DET'), ...] --------------------- Corpus Reader Classes --------------------- NLTK's *corpus reader* classes are used to access the contents of a diverse set of corpora. Each corpus reader class is specialized to handle a specific corpus format. Examples include the `PlaintextCorpusReader`, which handles corpora that consist of a set of unannotated text files, and the `BracketParseCorpusReader`, which handles corpora that consist of files containing parenthesis-delineated parse trees. Automatically Created Corpus Reader Instances ============================================= When the `nltk.corpus` module is imported, it automatically creates a set of corpus reader instances that can be used to access the corpora in the NLTK data distribution. Here is a small sample of those corpus reader instances: >>> import nltk >>> nltk.corpus.brown >>> nltk.corpus.treebank >>> nltk.corpus.names >>> nltk.corpus.genesis >>> nltk.corpus.inaugural This sample illustrates that different corpus reader classes are used to read different corpora; but that the same corpus reader class may be used for more than one corpus (e.g., ``genesis`` and ``inaugural``). Creating New Corpus Reader Instances ==================================== Although the `nltk.corpus` module automatically creates corpus reader instances for the corpora in the NLTK data distribution, you may sometimes need to create your own corpus reader. In particular, you would need to create your own corpus reader if you want... - To access a corpus that is not included in the NLTK data distribution. 
- To access a full copy of a corpus for which the NLTK data distribution only provides a sample. - To access a corpus using a customized corpus reader (e.g., with a customized tokenizer). To create a new corpus reader, you will first need to look up the signature for that corpus reader's constructor. Different corpus readers have different constructor signatures, but most of the constructor signatures have the basic form:: SomeCorpusReader(root, files, ...options...) Where ``root`` is an absolute path to the directory containing the corpus data files; ``files`` is either a list of file names (relative to ``root``) or a regexp specifying which files should be included; and ``options`` are additional reader-specific options. For example, we can create a customized corpus reader for the genesis corpus that uses a different sentence tokenizer as follows: >>> # Find the directory where the corpus lives. >>> genesis_dir = nltk.data.find('corpora/genesis') >>> # Create our custom sentence tokenizer. >>> my_sent_tokenizer = nltk.RegexpTokenizer('[^.!?]+') >>> # Create the new corpus reader object. >>> my_genesis = nltk.corpus.PlaintextCorpusReader( ... genesis_dir, r'.*\.txt', sent_tokenizer=my_sent_tokenizer) >>> # Use the new corpus reader object. >>> print(my_genesis.sents('english-kjv.txt')[0]) ['In', 'the', 'beginning', 'God', 'created', 'the', 'heaven', 'and', 'the', 'earth'] If you wish to read your own plaintext corpus, which is stored in the directory '/usr/share/some-corpus', then you can create a corpus reader for it with:: >>> my_corpus = nltk.corpus.PlaintextCorpusReader( ... '/usr/share/some-corpus', r'.*\.txt') # doctest: +SKIP For a complete list of corpus reader subclasses, see the API documentation for `nltk.corpus.reader`. Corpus Types ============ Corpora vary widely in the types of content they include. This is reflected in the fact that the base class `CorpusReader` only defines a few general-purpose methods for listing and accessing the files that make up a corpus. It is up to the subclasses to define *data access methods* that provide access to the information in the corpus. However, corpus reader subclasses should be consistent in their definitions of these data access methods wherever possible. At a high level, corpora can be divided into three basic types: - A *token corpus* contains information about specific occurrences of language use (or linguistic tokens), such as dialogues or written texts. Examples of token corpora are collections of written text and collections of speech. - A *type corpus*, or *lexicon*, contains information about a coherent set of lexical items (or linguistic types). Examples of lexicons are dictionaries and word lists. - A *language description corpus* contains information about a set of non-lexical linguistic constructs, such as grammar rules. However, many individual corpora blur the distinctions between these types. For example, corpora that are primarily lexicons may include token data in the form of example sentences; and corpora that are primarily token corpora may be accompanied by one or more word lists or other lexical data sets. Because corpora vary so widely in their information content, we have decided that it would not be wise to use separate corpus reader base classes for different corpus types. Instead, we simply try to make the corpus readers consistent wherever possible, but let them differ where the underlying data itself differs. 
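To make the token/type distinction concrete, here is a minimal sketch (assuming the ``brown`` token corpus and the ``words`` wordlist lexicon from the NLTK data distribution are installed); a token corpus yields occurrences of words in running text, while a lexicon yields one entry per word type:

>>> from nltk.corpus import brown, words
>>> brown.words()              # a token corpus: running text
['The', 'Fulton', 'County', 'Grand', 'Jury', ...]
>>> len(words.words()) > 0     # a lexicon: a plain list of word types
True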
Common Corpus Reader Methods ============================ As mentioned above, there are only a handful of methods that all corpus readers are guaranteed to implement. These methods provide access to the files that contain the corpus data. Every corpus is assumed to consist of one or more files, all located in a common root directory (or in subdirectories of that root directory). The absolute path to the root directory is stored in the ``root`` property: >>> import os >>> str(nltk.corpus.genesis.root).replace(os.path.sep,'/') '.../nltk_data/corpora/genesis' Each file within the corpus is identified by a platform-independent identifier, which is basically a path string that uses ``/`` as the path separator. I.e., this identifier can be converted to a relative path as follows: >>> some_corpus_file_id = nltk.corpus.reuters.fileids()[0] >>> import os.path >>> os.path.normpath(some_corpus_file_id).replace(os.path.sep,'/') 'test/14826' To get a list of all data files that make up a corpus, use the ``fileids()`` method. In some corpora, these files will not all contain the same type of data; for example, for the ``nltk.corpus.timit`` corpus, ``fileids()`` will return a list including text files, word segmentation files, phonetic transcription files, sound files, and metadata files. For corpora with diverse file types, the ``fileids()`` method will often take one or more optional arguments, which can be used to get a list of the files with a specific file type: >>> nltk.corpus.timit.fileids() ['dr1-fvmh0/sa1.phn', 'dr1-fvmh0/sa1.txt', 'dr1-fvmh0/sa1.wav', ...] >>> nltk.corpus.timit.fileids('phn') ['dr1-fvmh0/sa1.phn', 'dr1-fvmh0/sa2.phn', 'dr1-fvmh0/si1466.phn', ...] In some corpora, the files are divided into distinct categories. For these corpora, the ``fileids()`` method takes an optional argument, which can be used to get a list of the files within a specific category: >>> nltk.corpus.brown.fileids('hobbies') ['ce01', 'ce02', 'ce03', 'ce04', 'ce05', 'ce06', 'ce07', ...] The ``abspath()`` method can be used to find the absolute path to a corpus file, given its file identifier: >>> str(nltk.corpus.brown.abspath('ce06')).replace(os.path.sep,'/') '.../corpora/brown/ce06' The ``abspaths()`` method can be used to find the absolute paths for one corpus file, a list of corpus files, or (if no fileids are specified), all corpus files. This method is mainly useful as a helper method when defining corpus data access methods, since data access methods can usually be called with a string argument (to get a view for a specific file), with a list argument (to get a view for a specific list of files), or with no argument (to get a view for the whole corpus). Data Access Methods =================== Individual corpus reader subclasses typically extend this basic set of file-access methods with one or more *data access methods*, which provide easy access to the data contained in the corpus. The signatures for data access methods often have the basic form:: corpus_reader.some_data_access(fileids=None, ...options...) Where ``fileids`` can be a single file identifier string (to get a view for a specific file); a list of file identifier strings (to get a view for a specific list of files); or None (to get a view for the entire corpus).
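As a quick illustration of these three call forms (a minimal sketch using the Brown corpus from the NLTK data distribution), the same data access method can be given a string, a list, or no argument at all:

>>> from nltk.corpus import brown
>>> one_file = brown.words('ca01')               # a single file identifier
>>> two_files = brown.words(['ca01', 'ca02'])    # a list of file identifiers
>>> whole_corpus = brown.words()                 # None (the default): every file
>>> len(one_file) < len(two_files) < len(whole_corpus)
True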
Some of the common data access methods, and their return types, are: - ``corpus.words()``: list of str - ``corpus.sents()``: list of (list of str) - ``corpus.paras()``: list of (list of (list of str)) - ``corpus.tagged_words()``: list of (str,str) tuple - ``corpus.tagged_sents()``: list of (list of (str,str)) - ``corpus.tagged_paras()``: list of (list of (list of (str,str))) - ``corpus.chunked_sents()``: list of (Tree w/ (str,str) leaves) - ``corpus.parsed_sents()``: list of (Tree with str leaves) - ``corpus.parsed_paras()``: list of (list of (Tree with str leaves)) - ``corpus.xml()``: A single xml ElementTree - ``corpus.raw()``: str (unprocessed corpus contents) For example, the `words()` method is supported by many different corpora, and returns a flat list of word strings: >>> nltk.corpus.brown.words() ['The', 'Fulton', 'County', 'Grand', 'Jury', ...] >>> nltk.corpus.treebank.words() ['Pierre', 'Vinken', ',', '61', 'years', 'old', ...] >>> nltk.corpus.conll2002.words() ['Sao', 'Paulo', '(', 'Brasil', ')', ',', '23', ...] >>> nltk.corpus.genesis.words() ['In', 'the', 'beginning', 'God', 'created', ...] On the other hand, the `tagged_words()` method is only supported by corpora that include part-of-speech annotations: >>> nltk.corpus.brown.tagged_words() [('The', 'AT'), ('Fulton', 'NP-TL'), ...] >>> nltk.corpus.treebank.tagged_words() [('Pierre', 'NNP'), ('Vinken', 'NNP'), ...] >>> nltk.corpus.conll2002.tagged_words() [('Sao', 'NC'), ('Paulo', 'VMI'), ('(', 'Fpa'), ...] >>> nltk.corpus.genesis.tagged_words() Traceback (most recent call last): ... AttributeError: 'PlaintextCorpusReader' object has no attribute 'tagged_words' Although most corpus readers use file identifiers to index their content, some corpora use different identifiers instead. For example, the data access methods for the ``timit`` corpus use *utterance identifiers* to select which corpus items should be returned: >>> nltk.corpus.timit.utteranceids() ['dr1-fvmh0/sa1', 'dr1-fvmh0/sa2', 'dr1-fvmh0/si1466', ...] >>> nltk.corpus.timit.words('dr1-fvmh0/sa2') ["don't", 'ask', 'me', 'to', 'carry', 'an', 'oily', 'rag', 'like', 'that'] Attempting to call ``timit``\ 's data access methods with a file identifier will result in an exception: >>> nltk.corpus.timit.fileids() ['dr1-fvmh0/sa1.phn', 'dr1-fvmh0/sa1.txt', 'dr1-fvmh0/sa1.wav', ...] >>> nltk.corpus.timit.words('dr1-fvmh0/sa1.txt') # doctest: +SKIP Traceback (most recent call last): ... IOError: No such file or directory: '.../dr1-fvmh0/sa1.txt.wrd' As another example, the ``propbank`` corpus defines the ``roleset()`` method, which expects a roleset identifier, not a file identifier: >>> roleset = nltk.corpus.propbank.roleset('eat.01') >>> from xml.etree import ElementTree as ET >>> print(ET.tostring(roleset).decode('utf8')) ...... ... ... Stream Backed Corpus Views ========================== An important feature of NLTK's corpus readers is that many of them access the underlying data files using "corpus views." A *corpus view* is an object that acts like a simple data structure (such as a list), but does not store the data elements in memory; instead, data elements are read from the underlying data files on an as-needed basis. By only loading items from the file on an as-needed basis, corpus views maintain both memory efficiency and responsiveness. The memory efficiency of corpus readers is important because some corpora contain very large amounts of data, and storing the entire data set in memory could overwhelm many machines.
The responsiveness is important when experimenting with corpora in interactive sessions and in in-class demonstrations. The most common corpus view is the `StreamBackedCorpusView`, which acts as a read-only list of tokens. Two additional corpus view classes, `ConcatenatedCorpusView` and `LazySubsequence`, make it possible to create concatenations and take slices of `StreamBackedCorpusView` objects without actually storing the resulting list-like object's elements in memory. In the future, we may add additional corpus views that act like other basic data structures, such as dictionaries. Writing New Corpus Readers ========================== In order to add support for new corpus formats, it is necessary to define new corpus reader classes. For many corpus formats, writing new corpus readers is relatively straight-forward. In this section, we'll describe what's involved in creating a new corpus reader. If you do create a new corpus reader, we encourage you to contribute it back to the NLTK project. Don't Reinvent the Wheel ------------------------ Before you start writing a new corpus reader, you should check to be sure that the desired format can't be read using an existing corpus reader with appropriate constructor arguments. For example, although the `TaggedCorpusReader` assumes that words and tags are separated by ``/`` characters by default, an alternative tag-separation character can be specified via the ``sep`` constructor argument. You should also check whether the new corpus format can be handled by subclassing an existing corpus reader, and tweaking a few methods or variables. Design ------ If you decide to write a new corpus reader from scratch, then you should first decide which data access methods you want the reader to provide, and what their signatures should be. You should look at existing corpus readers that process corpora with similar data contents, and try to be consistent with those corpus readers whenever possible. You should also consider what sets of identifiers are appropriate for the corpus format. Where it's practical, file identifiers should be used. However, for some corpora, it may make sense to use additional sets of identifiers. Each set of identifiers should have a distinct name (e.g., fileids, utteranceids, rolesets); and you should be consistent in using that name to refer to that identifier. Do not use parameter names like ``id``, which leave it unclear what type of identifier is required. Once you've decided what data access methods and identifiers are appropriate for your corpus, you should decide if there are any customizable parameters that you'd like the corpus reader to handle. These parameters make it possible to use a single corpus reader to handle a wider variety of corpora. The ``sep`` argument for `TaggedCorpusReader`, mentioned above, is an example of a customizable corpus reader parameter. Implementation -------------- Constructor ~~~~~~~~~~~ If your corpus reader implements any customizable parameters, then you'll need to override the constructor. Typically, the new constructor will first call its base class's constructor, and then store the customizable parameters. For example, the `ConllChunkCorpusReader`\ 's constructor is defined as follows: >>> def __init__(self, root, fileids, chunk_types, encoding='utf8', ... tagset=None, separator=None): ... ConllCorpusReader.__init__( ... self, root, fileids, ('words', 'pos', 'chunk'), ... chunk_types=chunk_types, encoding=encoding, ... 
tagset=tagset, separator=separator) If your corpus reader does not implement any customization parameters, then you can often just inherit the base class's constructor. Data Access Methods ~~~~~~~~~~~~~~~~~~~ The most common type of data access method takes an argument identifying which files to access, and returns a view covering those files. This argument may be a single file identifier string (to get a view for a specific file); a list of file identifier strings (to get a view for a specific list of files); or None (to get a view for the entire corpus). The method's implementation converts this argument to a list of path names using the `abspaths()` method, which handles all three value types (string, list, and None): >>> print(str(nltk.corpus.brown.abspaths()).replace('\\\\','/')) [FileSystemPathPointer('.../corpora/brown/ca01'), FileSystemPathPointer('.../corpora/brown/ca02'), ...] >>> print(str(nltk.corpus.brown.abspaths('ce06')).replace('\\\\','/')) [FileSystemPathPointer('.../corpora/brown/ce06')] >>> print(str(nltk.corpus.brown.abspaths(['ce06', 'ce07'])).replace('\\\\','/')) [FileSystemPathPointer('.../corpora/brown/ce06'), FileSystemPathPointer('.../corpora/brown/ce07')] An example of this type of method is the `words()` method, defined by the `PlaintextCorpusReader` as follows: >>> def words(self, fileids=None): ... return concat([self.CorpusView(fileid, self._read_word_block) ... for fileid in self.abspaths(fileids)]) This method first uses `abspaths()` to convert ``fileids`` to a list of absolute paths. It then creates a corpus view for each file, using the `PlaintextCorpusReader._read_word_block()` method to read elements from the data file (see the discussion of corpus views below). Finally, it combines these corpus views using the `nltk.corpus.reader.util.concat()` function. When writing a corpus reader for a corpus that is never expected to be very large, it can sometimes be appropriate to read the files directly, rather than using a corpus view. For example, the `WordListCorpusReader` class defines its `words()` method as follows: >>> def words(self, fileids=None): ... return concat([[w for w in open(fileid).read().split('\n') if w] ... for fileid in self.abspaths(fileids)]) (This is usually more appropriate for lexicons than for token corpora.) If the type of data returned by a data access method is one for which NLTK has a conventional representation (e.g., words, tagged words, and parse trees), then you should use that representation. Otherwise, you may find it necessary to define your own representation. For data structures that are relatively corpus-specific, it's usually best to define new classes for these elements. For example, the ``propbank`` corpus defines the `PropbankInstance` class to store the semantic role labeling instances described by the corpus; and the ``ppattach`` corpus defines the `PPAttachment` class to store the prepositional attachment instances described by the corpus. Corpus Views ~~~~~~~~~~~~ .. (Much of the content for this section is taken from the StreamBackedCorpusView docstring.) The heart of a `StreamBackedCorpusView` is its *block reader* function, which reads zero or more tokens from a stream, and returns them as a list. A very simple example of a block reader is: >>> def simple_block_reader(stream): ... return stream.readline().split() This simple block reader reads a single line at a time, and returns a single token (consisting of a string) for each whitespace-separated substring on the line.
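As an illustrative sketch only (not part of any reader's API; it assumes the ``inaugural`` corpus from the NLTK data distribution is installed), this block reader can be passed straight to the `StreamBackedCorpusView` constructor, and concatenating its blocks is equivalent to whitespace-splitting the whole file:

>>> import nltk.data
>>> from nltk.corpus.reader.util import StreamBackedCorpusView
>>> path = nltk.data.find('corpora/inaugural/1793-Washington.txt')
>>> view = StreamBackedCorpusView(path, simple_block_reader, encoding='utf-8')
>>> list(view) == path.open(encoding='utf-8').read().split()
True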
A `StreamBackedCorpusView` built from this block reader will act like a read-only list of all the whitespace-separated tokens in an underlying file. When deciding how to define the block reader for a given corpus, careful consideration should be given to the size of blocks handled by the block reader. Smaller block sizes will increase the memory requirements of the corpus view's internal data structures (by 2 integers per block). On the other hand, larger block sizes may decrease performance for random access to the corpus. (But note that larger block sizes will *not* decrease performance for iteration.) Internally, the `StreamBackedCorpusView` class maintains a partial mapping from token index to file position, with one entry per block. When a token with a given index *i* is requested, the corpus view constructs it as follows: 1. First, it searches the toknum/filepos mapping for the token index closest to (but less than or equal to) *i*. 2. Then, starting at the file position corresponding to that index, it reads one block at a time using the block reader until it reaches the requested token. The toknum/filepos mapping is created lazily: it is initially empty, but every time a new block is read, the block's initial token is added to the mapping. (Thus, the toknum/filepos map has one entry per block.) You can create your own corpus view in one of two ways: 1. Call the `StreamBackedCorpusView` constructor, and provide your block reader function via the ``block_reader`` argument. 2. Subclass `StreamBackedCorpusView`, and override the `read_block()` method. The first option is usually easier, but the second option can allow you to write a single `read_block` method whose behavior can be customized by different parameters to the subclass's constructor. For an example of this design pattern, see the `TaggedCorpusView` class, which is used by `TaggedCorpusReader`. ---------------- Regression Tests ---------------- The following helper functions are used to create and then delete testing corpora that are stored in temporary directories. These testing corpora are used to make sure the readers work correctly. >>> import tempfile, os.path, textwrap >>> def make_testcorpus(ext='', **fileids): ... root = tempfile.mkdtemp() ... for fileid, contents in fileids.items(): ... fileid += ext ... f = open(os.path.join(root, fileid), 'w') ... f.write(textwrap.dedent(contents)) ... f.close() ... return root >>> def del_testcorpus(root): ... for fileid in os.listdir(root): ... os.remove(os.path.join(root, fileid)) ... os.rmdir(root) Plaintext Corpus Reader ======================= The plaintext corpus reader is used to access corpora that consist of unprocessed plaintext data. It assumes that paragraph breaks are indicated by blank lines. Sentences and words can be tokenized using the default tokenizers, or by custom tokenizers specified as parameters to the constructor. >>> root = make_testcorpus(ext='.txt', ... a="""\ ... This is the first sentence. Here is another ... sentence! And here's a third sentence. ... ... This is the second paragraph. Tokenization is currently ... fairly simple, so the period in Mr. gets tokenized. ... """, ... b="""This is the second file.""") >>> from nltk.corpus.reader.plaintext import PlaintextCorpusReader The list of documents can be specified explicitly, or implicitly (using a regexp). The ``ext`` argument specifies a file extension.
>>> corpus = PlaintextCorpusReader(root, ['a.txt', 'b.txt']) >>> corpus.fileids() ['a.txt', 'b.txt'] >>> corpus = PlaintextCorpusReader(root, r'.*\.txt') >>> corpus.fileids() ['a.txt', 'b.txt'] The directory containing the corpus is corpus.root: >>> str(corpus.root) == str(root) True We can get a list of words, or the raw string: >>> corpus.words() ['This', 'is', 'the', 'first', 'sentence', '.', ...] >>> corpus.raw()[:40] 'This is the first sentence. Here is ano' Check that reading individual documents works, and reading all documents at once works: >>> len(corpus.words()), [len(corpus.words(d)) for d in corpus.fileids()] (46, [40, 6]) >>> corpus.words('a.txt') ['This', 'is', 'the', 'first', 'sentence', '.', ...] >>> corpus.words('b.txt') ['This', 'is', 'the', 'second', 'file', '.'] >>> corpus.words()[:4], corpus.words()[-4:] (['This', 'is', 'the', 'first'], ['the', 'second', 'file', '.']) We're done with the test corpus: >>> del_testcorpus(root) Test the plaintext corpora that come with nltk: >>> from nltk.corpus import abc, genesis, inaugural >>> from nltk.corpus import state_union, webtext >>> for corpus in (abc, genesis, inaugural, state_union, ... webtext): ... print(str(corpus).replace('\\\\','/')) ... print(' ', repr(corpus.fileids())[:60]) ... print(' ', repr(corpus.words()[:10])[:60]) ['rural.txt', 'science.txt'] ['PM', 'denies', 'knowledge', 'of', 'AWB', ... ['english-kjv.txt', 'english-web.txt', 'finnish.txt', ... ['In', 'the', 'beginning', 'God', 'created', 'the', ... ['1789-Washington.txt', '1793-Washington.txt', ... ['Fellow', '-', 'Citizens', 'of', 'the', 'Senate', ... ['1945-Truman.txt', '1946-Truman.txt', ... ['PRESIDENT', 'HARRY', 'S', '.', 'TRUMAN', "'", ... ['firefox.txt', 'grail.txt', 'overheard.txt', ... ['Cookie', 'Manager', ':', '"', 'Don', "'", 't', ... Tagged Corpus Reader ==================== The Tagged Corpus reader can give us words, sentences, and paragraphs, each tagged or untagged. All of the read methods can take one item (in which case they return the contents of that file) or a list of documents (in which case they concatenate the contents of those files). By default, they apply to all documents in the corpus. >>> root = make_testcorpus( ... a="""\ ... This/det is/verb the/det first/adj sentence/noun ./punc ... Here/det is/verb another/adj sentence/noun ./punc ... Note/verb that/comp you/pron can/verb use/verb \ ... any/noun tag/noun set/noun ... ... This/det is/verb the/det second/adj paragraph/noun ./punc ... word/n without/adj a/det tag/noun :/: hello ./punc ... """, ... b="""\ ... This/det is/verb the/det second/adj file/noun ./punc ... """) >>> from nltk.corpus.reader.tagged import TaggedCorpusReader >>> corpus = TaggedCorpusReader(root, list('ab')) >>> corpus.fileids() ['a', 'b'] >>> str(corpus.root) == str(root) True >>> corpus.words() ['This', 'is', 'the', 'first', 'sentence', '.', ...] >>> corpus.sents() [['This', 'is', 'the', 'first', ...], ['Here', 'is', 'another'...], ...] >>> corpus.paras() [[['This', ...], ['Here', ...], ...], [['This', ...], ...], ...] >>> corpus.tagged_words() [('This', 'DET'), ('is', 'VERB'), ('the', 'DET'), ...] >>> corpus.tagged_sents() [[('This', 'DET'), ('is', 'VERB'), ...], [('Here', 'DET'), ...], ...] >>> corpus.tagged_paras() [[[('This', 'DET'), ...], ...], [[('This', 'DET'), ...], ...], ...] 
>>> corpus.raw()[:40] 'This/det is/verb the/det first/adj sente' >>> len(corpus.words()), [len(corpus.words(d)) for d in corpus.fileids()] (38, [32, 6]) >>> len(corpus.sents()), [len(corpus.sents(d)) for d in corpus.fileids()] (6, [5, 1]) >>> len(corpus.paras()), [len(corpus.paras(d)) for d in corpus.fileids()] (3, [2, 1]) >>> print(corpus.words('a')) ['This', 'is', 'the', 'first', 'sentence', '.', ...] >>> print(corpus.words('b')) ['This', 'is', 'the', 'second', 'file', '.'] >>> del_testcorpus(root) The Brown Corpus uses the tagged corpus reader: >>> from nltk.corpus import brown >>> brown.fileids() ['ca01', 'ca02', 'ca03', 'ca04', 'ca05', 'ca06', 'ca07', ...] >>> brown.categories() ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction'] >>> print(repr(brown.root).replace('\\\\','/')) FileSystemPathPointer('.../corpora/brown') >>> brown.words() ['The', 'Fulton', 'County', 'Grand', 'Jury', ...] >>> brown.sents() [['The', 'Fulton', 'County', 'Grand', ...], ...] >>> brown.paras() [[['The', 'Fulton', 'County', ...]], [['The', 'jury', ...]], ...] >>> brown.tagged_words() [('The', 'AT'), ('Fulton', 'NP-TL'), ...] >>> brown.tagged_sents() [[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ...], ...] >>> brown.tagged_paras() [[[('The', 'AT'), ...]], [[('The', 'AT'), ...]], ...] Verbnet Corpus Reader ===================== Make sure we're picking up the right number of elements: >>> from nltk.corpus import verbnet >>> len(verbnet.lemmas()) 3621 >>> len(verbnet.wordnetids()) 4953 >>> len(verbnet.classids()) 429 Selecting classids based on various selectors: >>> verbnet.classids(lemma='take') ['bring-11.3', 'characterize-29.2', 'convert-26.6.2', 'cost-54.2', 'fit-54.3', 'performance-26.7-2', 'steal-10.5'] >>> verbnet.classids(wordnetid='lead%2:38:01') ['accompany-51.7'] >>> verbnet.classids(fileid='approve-77.xml') ['approve-77'] >>> verbnet.classids(classid='admire-31.2') # subclasses ['admire-31.2-1'] vnclass() accepts filenames, long ids, and short ids: >>> a = ElementTree.tostring(verbnet.vnclass('admire-31.2.xml')) >>> b = ElementTree.tostring(verbnet.vnclass('admire-31.2')) >>> c = ElementTree.tostring(verbnet.vnclass('31.2')) >>> a == b == c True fileids() can be used to get files based on verbnet class ids: >>> verbnet.fileids('admire-31.2') ['admire-31.2.xml'] >>> verbnet.fileids(['admire-31.2', 'obtain-13.5.2']) ['admire-31.2.xml', 'obtain-13.5.2.xml'] >>> verbnet.fileids('badidentifier') Traceback (most recent call last): . . . ValueError: vnclass identifier 'badidentifier' not found longid() and shortid() can be used to convert identifiers: >>> verbnet.longid('31.2') 'admire-31.2' >>> verbnet.longid('admire-31.2') 'admire-31.2' >>> verbnet.shortid('31.2') '31.2' >>> verbnet.shortid('admire-31.2') '31.2' >>> verbnet.longid('badidentifier') Traceback (most recent call last): . . . ValueError: vnclass identifier 'badidentifier' not found >>> verbnet.shortid('badidentifier') Traceback (most recent call last): . . . 
ValueError: vnclass identifier 'badidentifier' not found Corpus View Regression Tests ============================ Select some corpus files to play with: >>> import nltk.data >>> # A very short file (160 chars): >>> f1 = nltk.data.find('corpora/inaugural/README') >>> # A relatively short file (791 chars): >>> f2 = nltk.data.find('corpora/inaugural/1793-Washington.txt') >>> # A longer file (32k chars): >>> f3 = nltk.data.find('corpora/inaugural/1909-Taft.txt') >>> fileids = [f1, f2, f3] Concatenation ------------- Check that concatenation works as intended. >>> from nltk.corpus.reader.util import * >>> c1 = StreamBackedCorpusView(f1, read_whitespace_block, encoding='utf-8') >>> c2 = StreamBackedCorpusView(f2, read_whitespace_block, encoding='utf-8') >>> c3 = StreamBackedCorpusView(f3, read_whitespace_block, encoding='utf-8') >>> c123 = c1+c2+c3 >>> print(c123) ['C-Span', 'Inaugural', 'Address', 'Corpus', 'US', ...] >>> l1 = f1.open(encoding='utf-8').read().split() >>> l2 = f2.open(encoding='utf-8').read().split() >>> l3 = f3.open(encoding='utf-8').read().split() >>> l123 = l1+l2+l3 >>> list(c123) == l123 True >>> (c1+c2+c3)[100] == l123[100] True Slicing ------- First, do some tests with fairly small slices. These will all generate tuple values. >>> from nltk.util import LazySubsequence >>> c1 = StreamBackedCorpusView(f1, read_whitespace_block, encoding='utf-8') >>> l1 = f1.open(encoding='utf-8').read().split() >>> print(len(c1)) 21 >>> len(c1) < LazySubsequence.MIN_SIZE True Choose a list of indices, based on the length, that covers the important corner cases: >>> indices = [-60, -30, -22, -21, -20, -1, ... 0, 1, 10, 20, 21, 22, 30, 60] Test slicing with explicit start & stop value: >>> for s in indices: ... for e in indices: ... assert list(c1[s:e]) == l1[s:e] Test slicing with stop=None: >>> for s in indices: ... assert list(c1[s:]) == l1[s:] Test slicing with start=None: >>> for e in indices: ... assert list(c1[:e]) == l1[:e] Test slicing with start=stop=None: >>> list(c1[:]) == list(l1[:]) True Next, we'll do some tests with much longer slices. These will generate LazySubsequence objects. >>> c3 = StreamBackedCorpusView(f3, read_whitespace_block, encoding='utf-8') >>> l3 = f3.open(encoding='utf-8').read().split() >>> print(len(c3)) 5430 >>> len(c3) > LazySubsequence.MIN_SIZE*2 True Choose a list of indices, based on the length, that covers the important corner cases: >>> indices = [-12000, -6000, -5431, -5430, -5429, -3000, -200, -1, ... 0, 1, 200, 3000, 5000, 5429, 5430, 5431, 6000, 12000] Test slicing with explicit start & stop value: >>> for s in indices: ... for e in indices: ... assert list(c3[s:e]) == l3[s:e] Test slicing with stop=None: >>> for s in indices: ... assert list(c3[s:]) == l3[s:] Test slicing with start=None: >>> for e in indices: ... assert list(c3[:e]) == l3[:e] Test slicing with start=stop=None: >>> list(c3[:]) == list(l3[:]) True Multiple Iterators ------------------ If multiple iterators are created for the same corpus view, their iteration can be interleaved: >>> c3 = StreamBackedCorpusView(f3, read_whitespace_block) >>> iterators = [c3.iterate_from(n) for n in [0,15,30,45]] >>> for i in range(15): ... for iterator in iterators: ... print('%-15s' % next(iterator), end=' ') ... print() My a duties in fellow heavy of a citizens: weight the proper Anyone of office sense who responsibility. upon of has If which the taken not, he obligation the he is which oath has about the I no to oath have conception enter, imposes. 
just of or The taken the he office must powers is of feel and lacking an SeekableUnicodeStreamReader =========================== The file-like objects provided by the ``codecs`` module unfortunately suffer from a bug that prevents them from working correctly with corpus view objects. In particular, although they expose ``seek()`` and ``tell()`` methods, those methods do not exhibit the expected behavior, because they are not synchronized with the internal buffers that are kept by the file-like objects. For example, the ``tell()`` method will return the file position at the end of the buffers (whose contents have not yet been returned by the stream); and therefore this file position cannot be used to return to the 'current' location in the stream (since ``seek()`` has no way to reconstruct the buffers). To get around these problems, we define a new class, `SeekableUnicodeStreamReader`, to act as a file-like interface to files containing encoded unicode data. This class is loosely based on the ``codecs.StreamReader`` class. To construct a new reader, we call the constructor with an underlying stream and an encoding name: >>> from io import StringIO, BytesIO >>> from nltk.data import SeekableUnicodeStreamReader >>> stream = BytesIO(b"""\ ... This is a test file. ... It is encoded in ascii. ... """.decode('ascii').encode('ascii')) >>> reader = SeekableUnicodeStreamReader(stream, 'ascii') `SeekableUnicodeStreamReader`\ s support all of the normal operations supplied by a read-only stream. Note that all of the read operations return unicode strings (``str`` objects), not byte strings (``bytes`` objects). >>> reader.read() # read the entire file. 'This is a test file.\nIt is encoded in ascii.\n' >>> reader.seek(0) # rewind to the start. >>> reader.read(5) # read at most 5 bytes. 'This ' >>> reader.readline() # read to the end of the line. 'is a test file.\n' >>> reader.seek(0) # rewind to the start. >>> for line in reader: ... print(repr(line)) # iterate over lines 'This is a test file.\n' 'It is encoded in ascii.\n' >>> reader.seek(0) # rewind to the start. >>> reader.readlines() # read a list of line strings ['This is a test file.\n', 'It is encoded in ascii.\n'] >>> reader.close() Size argument to ``read()`` --------------------------- The ``size`` argument to ``read()`` specifies the maximum number of *bytes* to read, not the maximum number of *characters*. Thus, for encodings that use multiple bytes per character, it may return fewer characters than the ``size`` argument: >>> stream = BytesIO(b"""\ ... This is a test file. ... It is encoded in utf-16. ... """.decode('ascii').encode('utf-16')) >>> reader = SeekableUnicodeStreamReader(stream, 'utf-16') >>> reader.read(10) 'This ' If a read block ends in the middle of the byte string encoding a single character, then that byte string is stored in an internal buffer, and re-used on the next call to ``read()``. However, if the size argument is too small to read even a single character, even though at least one character is available, then the ``read()`` method will read additional bytes until it can return a single character. This ensures that the ``read()`` method does not return an empty string, which could be mistaken for indicating the end of the file. >>> reader.seek(0) # rewind to the start. >>> reader.read(1) # we actually need to read 4 bytes 'T' >>> int(reader.tell()) 4 The ``readline()`` method may read more than a single line of text, in which case it stores the text that it does not return in a buffer.
If this buffer is not empty, then its contents will be included in the value returned by the next call to ``read()``, regardless of the ``size`` argument, since they are available without reading any new bytes from the stream: >>> reader.seek(0) # rewind to the start. >>> reader.readline() # stores extra text in a buffer 'This is a test file.\n' >>> print(reader.linebuffer) # examine the buffer contents ['It is encoded i'] >>> reader.read(0) # returns the contents of the buffer 'It is encoded i' >>> print(reader.linebuffer) # examine the buffer contents None Seek and Tell ------------- In addition to these basic read operations, `SeekableUnicodeStreamReader` also supports the ``seek()`` and ``tell()`` operations. However, some care must still be taken when using these operations. In particular, the only file offsets that should be passed to ``seek()`` are ``0`` and any offset that has been returned by ``tell()``. >>> stream = BytesIO(b"""\ ... This is a test file. ... It is encoded in utf-16. ... """.decode('ascii').encode('utf-16')) >>> reader = SeekableUnicodeStreamReader(stream, 'utf-16') >>> reader.read(20) 'This is a ' >>> pos = reader.tell(); print(pos) 22 >>> reader.read(20) 'test file.' >>> reader.seek(pos) # rewind to the position from tell. >>> reader.read(20) 'test file.' The ``seek()`` and ``tell()`` methods work properly even when ``readline()`` is used. >>> stream = BytesIO(b"""\ ... This is a test file. ... It is encoded in utf-16. ... """.decode('ascii').encode('utf-16')) >>> reader = SeekableUnicodeStreamReader(stream, 'utf-16') >>> reader.readline() 'This is a test file.\n' >>> pos = reader.tell(); print(pos) 44 >>> reader.readline() 'It is encoded in utf-16.\n' >>> reader.seek(pos) # rewind to the position from tell. >>> reader.readline() 'It is encoded in utf-16.\n' Squashed Bugs ============= svn 5276 fixed a bug in the comment-stripping behavior of parse_sexpr_block. >>> from io import StringIO >>> from nltk.corpus.reader.util import read_sexpr_block >>> f = StringIO(b""" ... (a b c) ... # This line is a comment. ... (d e f\ng h)""".decode('ascii')) >>> print(read_sexpr_block(f, block_size=38, comment_char='#')) ['(a b c)'] >>> print(read_sexpr_block(f, block_size=38, comment_char='#')) ['(d e f\ng h)'] svn 5277 fixed a bug in parse_sexpr_block, which would cause it to enter an infinite loop if a file ended mid-sexpr, or ended with a token that was not followed by whitespace. A related bug caused an infinite loop if the corpus ended in an unmatched close paren -- this was fixed in svn 5279. >>> f = StringIO(b""" ... This file ends mid-sexpr ... (hello (world""".decode('ascii')) >>> for i in range(3): print(read_sexpr_block(f)) ['This', 'file', 'ends', 'mid-sexpr'] ['(hello (world'] [] >>> f = StringIO(b"This file has no trailing whitespace.".decode('ascii')) >>> for i in range(3): print(read_sexpr_block(f)) ['This', 'file', 'has', 'no', 'trailing'] ['whitespace.'] [] >>> # Bug fixed in 5279: >>> f = StringIO(b"a b c)".decode('ascii')) >>> for i in range(3): print(read_sexpr_block(f)) ['a', 'b'] ['c)'] [] svn 5624 & 5265 fixed a bug in ConcatenatedCorpusView, which caused it to return the wrong items when indexed starting at any index beyond the first file. >>> import nltk >>> sents = nltk.corpus.brown.sents() >>> print(sents[6000]) ['Cholesterol', 'and', 'thyroid'] >>> print(sents[6000]) ['Cholesterol', 'and', 'thyroid'] svn 5728 fixed a bug in Categorized*CorpusReader, which caused them to return words from *all* files when just one file was specified.
>>> from nltk.corpus import reuters >>> reuters.words('training/13085') ['SNYDER', '&', 'lt', ';', 'SOI', '>', 'MAKES', ...] >>> reuters.words('training/5082') ['SHEPPARD', 'RESOURCES', 'TO', 'MERGE', 'WITH', ...] svn 7227 fixed a bug in the qc corpus reader, which prevented access to its tuples() method >>> from nltk.corpus import qc >>> qc.tuples('test.txt') [('NUM:dist', 'How far is it from Denver to Aspen ?'), ('LOC:city', 'What county is Modesto , California in ?'), ...] Ensure that KEYWORD from `comparative_sents.py` no longer contains a ReDoS vulnerability. >>> import re >>> import time >>> from nltk.corpus.reader.comparative_sents import KEYWORD >>> sizes = { ... "short": 4000, ... "long": 40000 ... } >>> exec_times = { ... "short": [], ... "long": [], ... } >>> for size_name, size in sizes.items(): ... for j in range(9): ... start_t = time.perf_counter() ... payload = "( " + "(" * size ... output = KEYWORD.findall(payload) ... exec_times[size_name].append(time.perf_counter() - start_t) ... exec_times[size_name] = sorted(exec_times[size_name])[4] # Get the median Ideally, the execution time of such a regular expression is linear in the length of the input. As such, we would expect exec_times["long"] to be roughly 10 times as big as exec_times["short"]. With the ReDoS in place, it took roughly 80 times as long. For now, we accept values below 30 (times as long), due to the potential for variance. This ensures that the ReDoS has certainly been reduced, if not removed. >>> exec_times["long"] / exec_times["short"] < 30 # doctest: +SKIP True nltk-3.7/nltk/test/crubadan.doctest000066400000000000000000000037131420073152400174320ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. For license information, see LICENSE.TXT Crubadan Corpus Reader ====================== Crubadan is an NLTK corpus reader for ngram files provided by the Crubadan project. It supports several languages. >>> from nltk.corpus import crubadan >>> crubadan.langs() ['abk', 'abn',..., 'zpa', 'zul'] ---------------------------------------- Language code mapping and helper methods ---------------------------------------- The web crawler that generates the 3-gram frequencies works at the level of "writing systems" rather than languages. Writing systems are assigned internal 2-3 letter codes that require mapping to the standard ISO 639-3 codes. For more information, please refer to the README in nltk_data/crubadan folder after installing it. To translate ISO 639-3 codes to "Crubadan Code": >>> crubadan.iso_to_crubadan('eng') 'en' >>> crubadan.iso_to_crubadan('fra') 'fr' >>> crubadan.iso_to_crubadan('aaa') In reverse, print ISO 639-3 code if we have the Crubadan Code: >>> crubadan.crubadan_to_iso('en') 'eng' >>> crubadan.crubadan_to_iso('fr') 'fra' >>> crubadan.crubadan_to_iso('aa') --------------------------- Accessing ngram frequencies --------------------------- On initialization the reader will create a dictionary of every language supported by the Crubadan project, mapping the ISO 639-3 language code to its corresponding ngram frequency. You can access individual language FreqDist and the ngrams within them as follows: >>> english_fd = crubadan.lang_freq('eng') >>> english_fd['the'] 728135 Above accesses the FreqDist of English and returns the frequency of the ngram 'the'. A ngram that isn't found within the language will return 0: >>> english_fd['sometest'] 0 A language that isn't supported will raise an exception: >>> crubadan.lang_freq('elvish') Traceback (most recent call last): ... 
RuntimeError: Unsupported language. nltk-3.7/nltk/test/data.doctest000066400000000000000000000324641420073152400165710ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. For license information, see LICENSE.TXT ========================================= Loading Resources From the Data Package ========================================= >>> import nltk.data Overview ~~~~~~~~ The `nltk.data` module contains functions that can be used to load NLTK resource files, such as corpora, grammars, and saved processing objects. Loading Data Files ~~~~~~~~~~~~~~~~~~ Resources are loaded using the function `nltk.data.load()`, which takes as its first argument a URL specifying what file should be loaded. The ``nltk:`` protocol loads files from the NLTK data distribution: >>> tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle') >>> tokenizer.tokenize('Hello. This is a test. It works!') ['Hello.', 'This is a test.', 'It works!'] It is important to note that there should be no space following the colon (':') in the URL; 'nltk: tokenizers/punkt/english.pickle' will not work! The ``nltk:`` protocol is used by default if no protocol is specified: >>> nltk.data.load('tokenizers/punkt/english.pickle') But it is also possible to load resources from ``http:``, ``ftp:``, and ``file:`` URLs, e.g. ``cfg = nltk.data.load('https://example.com/path/to/toy.cfg')`` >>> # Load a grammar using an absolute path. >>> url = 'file:%s' % nltk.data.find('grammars/sample_grammars/toy.cfg') >>> url.replace('\\', '/') 'file:...toy.cfg' >>> print(nltk.data.load(url)) Grammar with 14 productions (start state = S) S -> NP VP PP -> P NP ... P -> 'on' P -> 'in' The second argument to the `nltk.data.load()` function specifies the file format, which determines how the file's contents are processed before they are returned by ``load()``. The formats that are currently supported by the data module are described by the dictionary `nltk.data.FORMATS`: >>> for format, descr in sorted(nltk.data.FORMATS.items()): ... print('{0:<7} {1:}'.format(format, descr)) cfg A context free grammar. fcfg A feature CFG. fol A list of first order logic expressions, parsed with nltk.sem.logic.Expression.fromstring. json A serialized python object, stored using the json module. logic A list of first order logic expressions, parsed with nltk.sem.logic.LogicParser. Requires an additional logic_parser parameter pcfg A probabilistic CFG. pickle A serialized python object, stored using the pickle module. raw The raw (byte string) contents of a file. text The raw (unicode string) contents of a file. val A semantic valuation, parsed by nltk.sem.Valuation.fromstring. yaml A serialized python object, stored using the yaml module. `nltk.data.load()` will raise a ValueError if a bad format name is specified: >>> nltk.data.load('grammars/sample_grammars/toy.cfg', 'bar') Traceback (most recent call last): . . . ValueError: Unknown format type! By default, the ``"auto"`` format is used, which chooses a format based on the filename's extension. The mapping from file extensions to format names is specified by `nltk.data.AUTO_FORMATS`: >>> for ext, format in sorted(nltk.data.AUTO_FORMATS.items()): ... 
print('.%-7s -> %s' % (ext, format)) .cfg -> cfg .fcfg -> fcfg .fol -> fol .json -> json .logic -> logic .pcfg -> pcfg .pickle -> pickle .text -> text .txt -> text .val -> val .yaml -> yaml If `nltk.data.load()` is unable to determine the format based on the filename's extension, it will raise a ValueError: >>> nltk.data.load('foo.bar') Traceback (most recent call last): . . . ValueError: Could not determine format for foo.bar based on its file extension; use the "format" argument to specify the format explicitly. Note that by explicitly specifying the ``format`` argument, you can override the load method's default processing behavior. For example, to get the raw contents of any file, simply use ``format="raw"``: >>> s = nltk.data.load('grammars/sample_grammars/toy.cfg', 'text') >>> print(s) S -> NP VP PP -> P NP NP -> Det N | NP PP VP -> V NP | VP PP ... Making Local Copies ~~~~~~~~~~~~~~~~~~~ .. This will not be visible in the html output: create a tempdir to play in. >>> import tempfile, os >>> tempdir = tempfile.mkdtemp() >>> old_dir = os.path.abspath('.') >>> os.chdir(tempdir) The function `nltk.data.retrieve()` copies a given resource to a local file. This can be useful, for example, if you want to edit one of the sample grammars. >>> nltk.data.retrieve('grammars/sample_grammars/toy.cfg') Retrieving 'nltk:grammars/sample_grammars/toy.cfg', saving to 'toy.cfg' >>> # Simulate editing the grammar. >>> with open('toy.cfg') as inp: ... s = inp.read().replace('NP', 'DP') >>> with open('toy.cfg', 'w') as out: ... _bytes_written = out.write(s) >>> # Load the edited grammar, & display it. >>> cfg = nltk.data.load('file:///' + os.path.abspath('toy.cfg')) >>> print(cfg) Grammar with 14 productions (start state = S) S -> DP VP PP -> P DP ... P -> 'on' P -> 'in' The second argument to `nltk.data.retrieve()` specifies the filename for the new copy of the file. By default, the source file's filename is used. >>> nltk.data.retrieve('grammars/sample_grammars/toy.cfg', 'mytoy.cfg') Retrieving 'nltk:grammars/sample_grammars/toy.cfg', saving to 'mytoy.cfg' >>> os.path.isfile('./mytoy.cfg') True >>> nltk.data.retrieve('grammars/sample_grammars/np.fcfg') Retrieving 'nltk:grammars/sample_grammars/np.fcfg', saving to 'np.fcfg' >>> os.path.isfile('./np.fcfg') True If a file with the specified (or default) filename already exists in the current directory, then `nltk.data.retrieve()` will raise a ValueError exception. It will *not* overwrite the file: >>> os.path.isfile('./toy.cfg') True >>> nltk.data.retrieve('grammars/sample_grammars/toy.cfg') Traceback (most recent call last): . . . ValueError: File '...toy.cfg' already exists! .. This will not be visible in the html output: clean up the tempdir. >>> os.chdir(old_dir) >>> for f in os.listdir(tempdir): ... os.remove(os.path.join(tempdir, f)) >>> os.rmdir(tempdir) Finding Files in the NLTK Data Package ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The `nltk.data.find()` function searches the NLTK data package for a given file, and returns a pointer to that file. This pointer can either be a `FileSystemPathPointer` (whose `path` attribute gives the absolute path of the file); or a `ZipFilePathPointer`, specifying a zipfile and the name of an entry within that zipfile. Both pointer types define the `open()` method, which can be used to read the string contents of the file. 
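For a quick, non-doctest sketch of how the two pointer types might be told apart in application code, consider the following (the ``entry`` attribute of `ZipFilePathPointer` is an assumption of this sketch; the ``path`` attribute of `FileSystemPathPointer` is the one described above)::

    import nltk.data
    from nltk.data import FileSystemPathPointer, ZipFilePathPointer

    ptr = nltk.data.find('corpora/abc/rural.txt')
    if isinstance(ptr, ZipFilePathPointer):
        # The resource lives inside a zipfile in the data package.
        print('zip entry:', ptr.entry)
    elif isinstance(ptr, FileSystemPathPointer):
        # The resource is an ordinary file on disk.
        print('absolute path:', ptr.path)

    # Either way, open() returns a stream over the file's raw bytes.
    stream = ptr.open()
    print(stream.read(60))
    stream.close()

The doctests that follow exercise the same interface on the ``abc`` corpus.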
>>> path = nltk.data.find('corpora/abc/rural.txt') >>> str(path) '...rural.txt' >>> print(path.open().read(60).decode()) PM denies knowledge of AWB kickbacks The Prime Minister has Alternatively, the `nltk.data.load()` function can be used with the keyword argument ``format="raw"``: >>> s = nltk.data.load('corpora/abc/rural.txt', format='raw')[:60] >>> print(s.decode()) PM denies knowledge of AWB kickbacks The Prime Minister has You can also use the keyword argument ``format="text"``: >>> s = nltk.data.load('corpora/abc/rural.txt', format='text')[:60] >>> print(s) PM denies knowledge of AWB kickbacks The Prime Minister has Resource Caching ~~~~~~~~~~~~~~~~ NLTK maintains a cache of resources that have been loaded. If you load a resource that is already stored in the cache, then the cached copy will be returned. This behavior can be seen in the trace output generated when ``verbose=True``: >>> feat0 = nltk.data.load('grammars/book_grammars/feat0.fcfg', verbose=True) <> >>> feat0 = nltk.data.load('grammars/book_grammars/feat0.fcfg', verbose=True) <> If you wish to load a resource from its source, bypassing the cache, use the ``cache=False`` argument to `nltk.data.load()`. This can be useful, for example, if the resource is loaded from a local file, and you are actively editing that file: >>> feat0 = nltk.data.load('grammars/book_grammars/feat0.fcfg',cache=False,verbose=True) <> The cache *no longer* uses weak references. A resource will not be automatically expunged from the cache when no more objects are using it. In the following example, when we clear the variable ``feat0``, the reference count for the feature grammar object drops to zero. However, the object remains cached: >>> del feat0 >>> feat0 = nltk.data.load('grammars/book_grammars/feat0.fcfg', ... verbose=True) <> You can clear the entire contents of the cache using `nltk.data.clear_cache()`: >>> nltk.data.clear_cache() Retrieving other Data Sources ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ >>> formulas = nltk.data.load('grammars/book_grammars/background.fol') >>> for f in formulas: print(str(f)) all x.(boxerdog(x) -> dog(x)) all x.(boxer(x) -> person(x)) all x.-(dog(x) & person(x)) all x.(married(x) <-> exists y.marry(x,y)) all x.(bark(x) -> dog(x)) all x y.(marry(x,y) -> (person(x) & person(y))) -(Vincent = Mia) -(Vincent = Fido) -(Mia = Fido) Regression Tests ~~~~~~~~~~~~~~~~ Create a temp dir for tests that write files: >>> import tempfile, os >>> tempdir = tempfile.mkdtemp() >>> old_dir = os.path.abspath('.') >>> os.chdir(tempdir) The `retrieve()` function accepts all URL types: >>> urls = ['https://raw.githubusercontent.com/nltk/nltk/develop/nltk/test/toy.cfg', ... 'file:%s' % nltk.data.find('grammars/sample_grammars/toy.cfg'), ... 'nltk:grammars/sample_grammars/toy.cfg', ... 'grammars/sample_grammars/toy.cfg'] >>> for i, url in enumerate(urls): ... nltk.data.retrieve(url, 'toy-%d.cfg' % i) Retrieving 'https://raw.githubusercontent.com/nltk/nltk/develop/nltk/test/toy.cfg', saving to 'toy-0.cfg' Retrieving 'file:...toy.cfg', saving to 'toy-1.cfg' Retrieving 'nltk:grammars/sample_grammars/toy.cfg', saving to 'toy-2.cfg' Retrieving 'nltk:grammars/sample_grammars/toy.cfg', saving to 'toy-3.cfg' Clean up the temp dir: >>> os.chdir(old_dir) >>> for f in os.listdir(tempdir): ... os.remove(os.path.join(tempdir, f)) >>> os.rmdir(tempdir) Lazy Loader ----------- A lazy loader is a wrapper object that defers loading a resource until it is accessed or used in any way.
This is mainly intended for internal use by NLTK's corpus readers. >>> # Create a lazy loader for toy.cfg. >>> ll = nltk.data.LazyLoader('grammars/sample_grammars/toy.cfg') >>> # Show that it's not loaded yet: >>> object.__repr__(ll) '' >>> # printing it is enough to cause it to be loaded: >>> print(ll) >>> # Show that it's now been loaded: >>> object.__repr__(ll) '' >>> # Test that accessing an attribute also loads it: >>> ll = nltk.data.LazyLoader('grammars/sample_grammars/toy.cfg') >>> ll.start() S >>> object.__repr__(ll) '' Buffered Gzip Reading and Writing --------------------------------- Write performance to gzip-compressed is extremely poor when the files become large. File creation can become a bottleneck in those cases. Read performance from large gzipped pickle files was improved in data.py by buffering the reads. A similar fix can be applied to writes by buffering the writes to a StringIO object first. This is mainly intended for internal use. The test simply tests that reading and writing work as intended and does not test how much improvement buffering provides. >>> from io import StringIO >>> test = nltk.data.BufferedGzipFile('testbuf.gz', 'wb', size=2**10) >>> ans = [] >>> for i in range(10000): ... ans.append(str(i).encode('ascii')) ... test.write(str(i).encode('ascii')) >>> test.close() >>> test = nltk.data.BufferedGzipFile('testbuf.gz', 'rb') >>> test.read() == b''.join(ans) True >>> test.close() >>> import os >>> os.unlink('testbuf.gz') JSON Encoding and Decoding -------------------------- JSON serialization is used instead of pickle for some classes. >>> from nltk import jsontags >>> from nltk.jsontags import JSONTaggedEncoder, JSONTaggedDecoder, register_tag >>> @jsontags.register_tag ... class JSONSerializable: ... json_tag = 'JSONSerializable' ... ... def __init__(self, n): ... self.n = n ... ... def encode_json_obj(self): ... return self.n ... ... @classmethod ... def decode_json_obj(cls, obj): ... n = obj ... return cls(n) ... >>> JSONTaggedEncoder().encode(JSONSerializable(1)) '{"!JSONSerializable": 1}' >>> JSONTaggedDecoder().decode('{"!JSONSerializable": 1}').n 1 nltk-3.7/nltk/test/dependency.doctest000077500000000000000000000164041420073152400177750ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. For license information, see LICENSE.TXT =================== Dependency Grammars =================== >>> from nltk.grammar import DependencyGrammar >>> from nltk.parse import ( ... DependencyGraph, ... ProjectiveDependencyParser, ... NonprojectiveDependencyParser, ... ) CoNLL Data ---------- >>> treebank_data = """Pierre NNP 2 NMOD ... Vinken NNP 8 SUB ... , , 2 P ... 61 CD 5 NMOD ... years NNS 6 AMOD ... old JJ 2 NMOD ... , , 2 P ... will MD 0 ROOT ... join VB 8 VC ... the DT 11 NMOD ... board NN 9 OBJ ... as IN 9 VMOD ... a DT 15 NMOD ... nonexecutive JJ 15 NMOD ... director NN 12 PMOD ... Nov. NNP 9 VMOD ... 29 CD 16 NMOD ... . . 9 VMOD ... """ >>> dg = DependencyGraph(treebank_data) >>> dg.tree().pprint() (will (Vinken Pierre , (old (years 61)) ,) (join (board the) (as (director a nonexecutive)) (Nov. 29) .)) >>> for head, rel, dep in dg.triples(): ... print( ... '({h[0]}, {h[1]}), {r}, ({d[0]}, {d[1]})' ... .format(h=head, r=rel, d=dep) ... 
) (will, MD), SUB, (Vinken, NNP) (Vinken, NNP), NMOD, (Pierre, NNP) (Vinken, NNP), P, (,, ,) (Vinken, NNP), NMOD, (old, JJ) (old, JJ), AMOD, (years, NNS) (years, NNS), NMOD, (61, CD) (Vinken, NNP), P, (,, ,) (will, MD), VC, (join, VB) (join, VB), OBJ, (board, NN) (board, NN), NMOD, (the, DT) (join, VB), VMOD, (as, IN) (as, IN), PMOD, (director, NN) (director, NN), NMOD, (a, DT) (director, NN), NMOD, (nonexecutive, JJ) (join, VB), VMOD, (Nov., NNP) (Nov., NNP), NMOD, (29, CD) (join, VB), VMOD, (., .) Using a custom cell extractor. >>> def custom_extractor(cells): ... _, tag, head, rel = cells ... return 'spam', 'spam', tag, tag, '', head, rel >>> dg = DependencyGraph(treebank_data, cell_extractor=custom_extractor) >>> dg.tree().pprint() (spam (spam spam spam (spam (spam spam)) spam) (spam (spam spam) (spam (spam spam spam)) (spam spam) spam)) Custom cell extractors can take in and return an index. >>> def custom_extractor(cells, index): ... word, tag, head, rel = cells ... return (index, '{}-{}'.format(word, index), word, ... tag, tag, '', head, rel) >>> dg = DependencyGraph(treebank_data, cell_extractor=custom_extractor) >>> dg.tree().pprint() (will-8 (Vinken-2 Pierre-1 ,-3 (old-6 (years-5 61-4)) ,-7) (join-9 (board-11 the-10) (as-12 (director-15 a-13 nonexecutive-14)) (Nov.-16 29-17) .-18)) Using the dependency-parsed version of the Penn Treebank corpus sample. >>> from nltk.corpus import dependency_treebank >>> t = dependency_treebank.parsed_sents()[0] >>> print(t.to_conll(3)) Pierre NNP 2 Vinken NNP 8 , , 2 61 CD 5 years NNS 6 old JJ 2 , , 2 will MD 0 join VB 8 the DT 11 board NN 9 as IN 9 a DT 15 nonexecutive JJ 15 director NN 12 Nov. NNP 9 29 CD 16 . . 8 Using the output of zpar (like Malt-TAB but with zero-based indexing) >>> zpar_data = """ ... Pierre NNP 1 NMOD ... Vinken NNP 7 SUB ... , , 1 P ... 61 CD 4 NMOD ... years NNS 5 AMOD ... old JJ 1 NMOD ... , , 1 P ... will MD -1 ROOT ... join VB 7 VC ... the DT 10 NMOD ... board NN 8 OBJ ... as IN 8 VMOD ... a DT 14 NMOD ... nonexecutive JJ 14 NMOD ... director NN 11 PMOD ... Nov. NNP 8 VMOD ... 29 CD 15 NMOD ... . . 7 P ... """ >>> zdg = DependencyGraph(zpar_data, zero_based=True) >>> print(zdg.tree()) (will (Vinken Pierre , (old (years 61)) ,) (join (board the) (as (director a nonexecutive)) (Nov. 29)) .) Projective Dependency Parsing ----------------------------- >>> grammar = DependencyGrammar.fromstring(""" ... 'fell' -> 'price' | 'stock' ... 'price' -> 'of' 'the' ... 'of' -> 'stock' ... 'stock' -> 'the' ... """) >>> print(grammar) Dependency grammar with 5 productions 'fell' -> 'price' 'fell' -> 'stock' 'price' -> 'of' 'the' 'of' -> 'stock' 'stock' -> 'the' >>> dp = ProjectiveDependencyParser(grammar) >>> for t in sorted(dp.parse(['the', 'price', 'of', 'the', 'stock', 'fell'])): ... print(t) (fell (price the (of (stock the)))) (fell (price the of) (stock the)) (fell (price the of the) stock) Non-Projective Dependency Parsing --------------------------------- >>> grammar = DependencyGrammar.fromstring(""" ... 'taught' -> 'play' | 'man' ... 'man' -> 'the' ... 'play' -> 'golf' | 'dog' | 'to' ... 'dog' -> 'his' ... """) >>> print(grammar) Dependency grammar with 7 productions 'taught' -> 'play' 'taught' -> 'man' 'man' -> 'the' 'play' -> 'golf' 'play' -> 'dog' 'play' -> 'to' 'dog' -> 'his' >>> dp = NonprojectiveDependencyParser(grammar) >>> g, = dp.parse(['the', 'man', 'taught', 'his', 'dog', 'to', 'play', 'golf']) >>> print(g.root['word']) taught >>> for _, node in sorted(g.nodes.items()): ... if node['word'] is not None: ... 
print('{address} {word}: {d}'.format(d=node['deps'][''], **node)) 1 the: [] 2 man: [1] 3 taught: [2, 7] 4 his: [] 5 dog: [4] 6 to: [] 7 play: [5, 6, 8] 8 golf: [] >>> print(g.tree()) (taught (man the) (play (dog his) to golf)) Integration with MALT parser ============================ If the top relation is different from the default, we can set it explicitly. In the case of the MALT parser, it is set to `'null'`. >>> dg_str = """1 I _ NN NN _ 2 nn _ _ ... 2 shot _ NN NN _ 0 null _ _ ... 3 an _ AT AT _ 2 dep _ _ ... 4 elephant _ NN NN _ 7 nn _ _ ... 5 in _ NN NN _ 7 nn _ _ ... 6 my _ NN NN _ 7 nn _ _ ... 7 pajamas _ NNS NNS _ 3 dobj _ _ ... """ >>> dg = DependencyGraph(dg_str, top_relation_label='null') >>> len(dg.nodes) 8 >>> dg.root['word'], dg.root['address'] ('shot', 2) >>> print(dg.to_conll(10)) 1 I _ NN NN _ 2 nn _ _ 2 shot _ NN NN _ 0 null _ _ 3 an _ AT AT _ 2 dep _ _ 4 elephant _ NN NN _ 7 nn _ _ 5 in _ NN NN _ 7 nn _ _ 6 my _ NN NN _ 7 nn _ _ 7 pajamas _ NNS NNS _ 3 dobj _ _ nltk-3.7/nltk/test/discourse.doctest000066400000000000000000000417331420073152400176570ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. For license information, see LICENSE.TXT ================== Discourse Checking ================== >>> from nltk import * >>> from nltk.sem import logic >>> logic._counter._value = 0 Setup ===== >>> from nltk.test.childes_fixt import setup_module >>> setup_module() Introduction ============ The NLTK discourse module makes it possible to test consistency and redundancy of simple discourses, using theorem-proving and model-building from `nltk.inference`. The ``DiscourseTester`` constructor takes a list of sentences as a parameter. >>> dt = DiscourseTester(['a boxer walks', 'every boxer chases a girl']) The ``DiscourseTester`` parses each sentence into a list of logical forms. Once we have created a ``DiscourseTester`` object, we can inspect various properties of the discourse. First off, we might want to double-check what sentences are currently stored as the discourse.
However, the simple grammar we are using, namely `sem4.fcfg `_, doesn't support quantifier scope ambiguity. We can also investigate the readings of a specific sentence: >>> dt.readings('a boxer walks') The sentence 'a boxer walks' has these readings: exists x.(boxer(x) & walk(x)) exists x.(boxerdog(x) & walk(x)) Given that each sentence is two-ways ambiguous, we potentially have four different discourse 'threads', taking all combinations of readings. To see these, specify the ``threaded=True`` parameter on the ``readings()`` method. Again, each thread is assigned an identifier of the form `d`\ :sub:`i`. Following the identifier is a list of the readings that constitute that thread. >>> dt.readings(threaded=True) d0: ['s0-r0', 's1-r0'] d1: ['s0-r0', 's1-r1'] d2: ['s0-r1', 's1-r0'] d3: ['s0-r1', 's1-r1'] Of course, this simple-minded approach doesn't scale: a discourse with, say, three sentences, each of which has 3 readings, will generate 27 different threads. It is an interesting exercise to consider how to manage discourse ambiguity more efficiently. Checking Consistency ==================== Now, we can check whether some or all of the discourse threads are consistent, using the ``models()`` method. With no parameter, this method will try to find a model for every discourse thread in the current discourse. However, we can also specify just one thread, say ``d1``. >>> dt.models('d1') -------------------------------------------------------------------------------- Model for Discourse Thread d1 -------------------------------------------------------------------------------- % number = 1 % seconds = 0 % Interpretation of size 2 c1 = 0. f1(0) = 0. f1(1) = 0. boxer(0). - boxer(1). - boxerdog(0). - boxerdog(1). - girl(0). - girl(1). walk(0). - walk(1). - chase(0,0). - chase(0,1). - chase(1,0). - chase(1,1). Consistent discourse: d1 ['s0-r0', 's1-r1']: s0-r0: exists z1.(boxer(z1) & walk(z1)) s1-r1: all z1.(boxerdog(z1) -> exists z2.(girl(z2) & chase(z1,z2))) There are various formats for rendering **Mace4** models --- here, we have used the 'cooked' format (which is intended to be human-readable). There are a number of points to note. #. The entities in the domain are all treated as non-negative integers. In this case, there are only two entities, ``0`` and ``1``. #. The ``-`` symbol indicates negation. So ``0`` is the only ``boxerdog`` and the only thing that ``walk``\ s. Nothing is a ``boxer``, or a ``girl`` or in the ``chase`` relation. Thus the universal sentence is vacuously true. #. ``c1`` is an introduced constant that denotes ``0``. #. ``f1`` is a Skolem function, but it plays no significant role in this model. We might want to now add another sentence to the discourse, and there is method ``add_sentence()`` for doing just this. >>> dt.add_sentence('John is a boxer') >>> dt.sentences() s0: a boxer walks s1: every boxer chases a girl s2: John is a boxer We can now test all the properties as before; here, we just show a couple of them. 
>>> dt.readings() s0 readings: s0-r0: exists z1.(boxer(z1) & walk(z1)) s0-r1: exists z1.(boxerdog(z1) & walk(z1)) s1 readings: s1-r0: all z1.(boxer(z1) -> exists z2.(girl(z2) & chase(z1,z2))) s1-r1: all z1.(boxerdog(z1) -> exists z2.(girl(z2) & chase(z1,z2))) s2 readings: s2-r0: boxer(John) s2-r1: boxerdog(John) >>> dt.readings(threaded=True) d0: ['s0-r0', 's1-r0', 's2-r0'] d1: ['s0-r0', 's1-r0', 's2-r1'] d2: ['s0-r0', 's1-r1', 's2-r0'] d3: ['s0-r0', 's1-r1', 's2-r1'] d4: ['s0-r1', 's1-r0', 's2-r0'] d5: ['s0-r1', 's1-r0', 's2-r1'] d6: ['s0-r1', 's1-r1', 's2-r0'] d7: ['s0-r1', 's1-r1', 's2-r1'] If you are interested in a particular thread, the ``expand_threads()`` method will remind you of what readings it consists of: >>> thread = dt.expand_threads('d1') >>> for rid, reading in thread: ... print(rid, str(reading.normalize())) s0-r0 exists z1.(boxer(z1) & walk(z1)) s1-r0 all z1.(boxer(z1) -> exists z2.(girl(z2) & chase(z1,z2))) s2-r1 boxerdog(John) Suppose we have already defined a discourse, as follows: >>> dt = DiscourseTester(['A student dances', 'Every student is a person']) Now, when we add a new sentence, is it consistent with what we already have? The `` consistchk=True`` parameter of ``add_sentence()`` allows us to check: >>> dt.add_sentence('No person dances', consistchk=True) Inconsistent discourse: d0 ['s0-r0', 's1-r0', 's2-r0']: s0-r0: exists z1.(student(z1) & dance(z1)) s1-r0: all z1.(student(z1) -> person(z1)) s2-r0: -exists z1.(person(z1) & dance(z1)) >>> dt.readings() s0 readings: s0-r0: exists z1.(student(z1) & dance(z1)) s1 readings: s1-r0: all z1.(student(z1) -> person(z1)) s2 readings: s2-r0: -exists z1.(person(z1) & dance(z1)) So let's retract the inconsistent sentence: >>> dt.retract_sentence('No person dances', verbose=True) Current sentences are s0: A student dances s1: Every student is a person We can now verify that result is consistent. >>> dt.models() -------------------------------------------------------------------------------- Model for Discourse Thread d0 -------------------------------------------------------------------------------- % number = 1 % seconds = 0 % Interpretation of size 2 c1 = 0. dance(0). - dance(1). person(0). - person(1). student(0). - student(1). Consistent discourse: d0 ['s0-r0', 's1-r0']: s0-r0: exists z1.(student(z1) & dance(z1)) s1-r0: all z1.(student(z1) -> person(z1)) Checking Informativity ====================== Let's assume that we are still trying to extend the discourse *A student dances.* *Every student is a person.* We add a new sentence, but this time, we check whether it is informative with respect to what has gone before. >>> dt.add_sentence('A person dances', informchk=True) Sentence 'A person dances' under reading 'exists x.(person(x) & dance(x))': Not informative relative to thread 'd0' In fact, we are just checking whether the new sentence is entailed by the preceding discourse. >>> dt.models() -------------------------------------------------------------------------------- Model for Discourse Thread d0 -------------------------------------------------------------------------------- % number = 1 % seconds = 0 % Interpretation of size 2 c1 = 0. c2 = 0. dance(0). - dance(1). person(0). - person(1). student(0). - student(1). 
Consistent discourse: d0 ['s0-r0', 's1-r0', 's2-r0']: s0-r0: exists z1.(student(z1) & dance(z1)) s1-r0: all z1.(student(z1) -> person(z1)) s2-r0: exists z1.(person(z1) & dance(z1)) Adding Background Knowledge =========================== Let's build a new discourse, and look at the readings of the component sentences: >>> dt = DiscourseTester(['Vincent is a boxer', 'Fido is a boxer', 'Vincent is married', 'Fido barks']) >>> dt.readings() s0 readings: s0-r0: boxer(Vincent) s0-r1: boxerdog(Vincent) s1 readings: s1-r0: boxer(Fido) s1-r1: boxerdog(Fido) s2 readings: s2-r0: married(Vincent) s3 readings: s3-r0: bark(Fido) This gives us a lot of threads: >>> dt.readings(threaded=True) d0: ['s0-r0', 's1-r0', 's2-r0', 's3-r0'] d1: ['s0-r0', 's1-r1', 's2-r0', 's3-r0'] d2: ['s0-r1', 's1-r0', 's2-r0', 's3-r0'] d3: ['s0-r1', 's1-r1', 's2-r0', 's3-r0'] We can eliminate some of the readings, and hence some of the threads, by adding background information. >>> import nltk.data >>> bg = nltk.data.load('grammars/book_grammars/background.fol') >>> dt.add_background(bg) >>> dt.background() all x.(boxerdog(x) -> dog(x)) all x.(boxer(x) -> person(x)) all x.-(dog(x) & person(x)) all x.(married(x) <-> exists y.marry(x,y)) all x.(bark(x) -> dog(x)) all x y.(marry(x,y) -> (person(x) & person(y))) -(Vincent = Mia) -(Vincent = Fido) -(Mia = Fido) The background information allows us to reject three of the threads as inconsistent. To see what remains, use the ``filter=True`` parameter on ``readings()``. >>> dt.readings(filter=True) d1: ['s0-r0', 's1-r1', 's2-r0', 's3-r0'] The ``models()`` method gives us more information about the surviving thread. >>> dt.models() -------------------------------------------------------------------------------- Model for Discourse Thread d0 -------------------------------------------------------------------------------- No model found! -------------------------------------------------------------------------------- Model for Discourse Thread d1 -------------------------------------------------------------------------------- % number = 1 % seconds = 0 % Interpretation of size 3 Fido = 0. Mia = 1. Vincent = 2. f1(0) = 0. f1(1) = 0. f1(2) = 2. bark(0). - bark(1). - bark(2). - boxer(0). - boxer(1). boxer(2). boxerdog(0). - boxerdog(1). - boxerdog(2). dog(0). - dog(1). - dog(2). - married(0). - married(1). married(2). - person(0). - person(1). person(2). - marry(0,0). - marry(0,1). - marry(0,2). - marry(1,0). - marry(1,1). - marry(1,2). - marry(2,0). - marry(2,1). marry(2,2). -------------------------------------------------------------------------------- Model for Discourse Thread d2 -------------------------------------------------------------------------------- No model found! -------------------------------------------------------------------------------- Model for Discourse Thread d3 -------------------------------------------------------------------------------- No model found! Inconsistent discourse: d0 ['s0-r0', 's1-r0', 's2-r0', 's3-r0']: s0-r0: boxer(Vincent) s1-r0: boxer(Fido) s2-r0: married(Vincent) s3-r0: bark(Fido) Consistent discourse: d1 ['s0-r0', 's1-r1', 's2-r0', 's3-r0']: s0-r0: boxer(Vincent) s1-r1: boxerdog(Fido) s2-r0: married(Vincent) s3-r0: bark(Fido) Inconsistent discourse: d2 ['s0-r1', 's1-r0', 's2-r0', 's3-r0']: s0-r1: boxerdog(Vincent) s1-r0: boxer(Fido) s2-r0: married(Vincent) s3-r0: bark(Fido) Inconsistent discourse: d3 ['s0-r1', 's1-r1', 's2-r0', 's3-r0']: s0-r1: boxerdog(Vincent) s1-r1: boxerdog(Fido) s2-r0: married(Vincent) s3-r0: bark(Fido) .. 
This will not be visible in the html output: create a tempdir to play in. >>> import tempfile, os >>> tempdir = tempfile.mkdtemp() >>> old_dir = os.path.abspath('.') >>> os.chdir(tempdir) In order to play around with your own version of background knowledge, you might want to start off with a local copy of ``background.fol``: >>> nltk.data.retrieve('grammars/book_grammars/background.fol') Retrieving 'nltk:grammars/book_grammars/background.fol', saving to 'background.fol' After you have modified the file, the ``load_fol()`` function will parse the strings in the file into expressions of ``nltk.sem.logic``. >>> from nltk.inference.discourse import load_fol >>> mybg = load_fol(open('background.fol').read()) The result can be loaded as an argument of ``add_background()`` in the manner shown earlier. .. This will not be visible in the html output: clean up the tempdir. >>> os.chdir(old_dir) >>> for f in os.listdir(tempdir): ... os.remove(os.path.join(tempdir, f)) >>> os.rmdir(tempdir) >>> nltk.data.clear_cache() Regression Testing from book ============================ >>> logic._counter._value = 0 >>> from nltk.tag import RegexpTagger >>> tagger = RegexpTagger( ... [('^(chases|runs)$', 'VB'), ... ('^(a)$', 'ex_quant'), ... ('^(every)$', 'univ_quant'), ... ('^(dog|boy)$', 'NN'), ... ('^(He)$', 'PRP') ... ]) >>> rc = DrtGlueReadingCommand(depparser=MaltParser(tagger=tagger)) >>> dt = DiscourseTester(map(str.split, ['Every dog chases a boy', 'He runs']), rc) >>> dt.readings() s0 readings: s0-r0: ([z2],[boy(z2), (([z5],[dog(z5)]) -> ([],[chases(z5,z2)]))]) s0-r1: ([],[(([z1],[dog(z1)]) -> ([z2],[boy(z2), chases(z1,z2)]))]) s1 readings: s1-r0: ([z1],[PRO(z1), runs(z1)]) >>> dt.readings(show_thread_readings=True) d0: ['s0-r0', 's1-r0'] : ([z1,z2],[boy(z1), (([z3],[dog(z3)]) -> ([],[chases(z3,z1)])), (z2 = z1), runs(z2)]) d1: ['s0-r1', 's1-r0'] : INVALID: AnaphoraResolutionException >>> dt.readings(filter=True, show_thread_readings=True) d0: ['s0-r0', 's1-r0'] : ([z1,z3],[boy(z1), (([z2],[dog(z2)]) -> ([],[chases(z2,z1)])), (z3 = z1), runs(z3)]) >>> logic._counter._value = 0 >>> from nltk.parse import FeatureEarleyChartParser >>> from nltk.sem.drt import DrtParser >>> grammar = nltk.data.load('grammars/book_grammars/drt.fcfg', logic_parser=DrtParser()) >>> parser = FeatureEarleyChartParser(grammar, trace=0) >>> trees = parser.parse('Angus owns a dog'.split()) >>> print(list(trees)[0].label()['SEM'].simplify().normalize()) ([z1,z2],[Angus(z1), dog(z2), own(z1,z2)]) nltk-3.7/nltk/test/discourse_fixt.py000066400000000000000000000006171420073152400176700ustar00rootroot00000000000000# FIXME: the entire discourse.doctest is skipped if Prover9/Mace4 is # not installed, but there are pure-python parts that don't need Prover9. def setup_module(): import pytest from nltk.inference.mace import Mace try: m = Mace() m._find_binary("mace4") except LookupError as e: pytest.skip("Mace4/Prover9 is not available so discourse.doctest is skipped") nltk-3.7/nltk/test/drt.doctest000066400000000000000000000461511420073152400164470ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. For license information, see LICENSE.TXT ================================ Discourse Representation Theory ================================ >>> from nltk.sem import logic >>> from nltk.inference import TableauProver Overview ======== A DRS can be created with the ``DRS()`` constructor. This takes two arguments: a list of discourse referents and list of conditions. . 
>>> from nltk.sem.drt import * >>> dexpr = DrtExpression.fromstring >>> man_x = dexpr('man(x)') >>> walk_x = dexpr('walk(x)') >>> x = dexpr('x') >>> print(DRS([x], [man_x, walk_x])) ([x],[man(x), walk(x)]) The ``parse()`` method can also be applied directly to DRS expressions, which allows them to be specified more easily. >>> drs1 = dexpr('([x],[man(x),walk(x)])') >>> print(drs1) ([x],[man(x), walk(x)]) DRSs can be *merged* using the ``+`` operator. >>> drs2 = dexpr('([y],[woman(y),stop(y)])') >>> drs3 = drs1 + drs2 >>> print(drs3) (([x],[man(x), walk(x)]) + ([y],[woman(y), stop(y)])) >>> print(drs3.simplify()) ([x,y],[man(x), walk(x), woman(y), stop(y)]) We can embed DRSs as components of an ``implies`` condition. >>> s = '([], [(%s -> %s)])' % (drs1, drs2) >>> print(dexpr(s)) ([],[(([x],[man(x), walk(x)]) -> ([y],[woman(y), stop(y)]))]) The ``fol()`` method converts DRSs into FOL formulae. >>> print(dexpr(r'([x],[man(x), walks(x)])').fol()) exists x.(man(x) & walks(x)) >>> print(dexpr(r'([],[(([x],[man(x)]) -> ([],[walks(x)]))])').fol()) all x.(man(x) -> walks(x)) In order to visualize a DRS, the ``pretty_format()`` method can be used. >>> print(drs3.pretty_format()) _________ __________ | x | | y | (|---------| + |----------|) | man(x) | | woman(y) | | walk(x) | | stop(y) | |_________| |__________| Parse to semantics ------------------ .. >>> logic._counter._value = 0 DRSs can be used for building compositional semantics in a feature based grammar. To specify that we want to use DRSs, the appropriate logic parser needs be passed as a parameter to ``load_earley()`` >>> from nltk.parse import load_parser >>> from nltk.sem.drt import DrtParser >>> parser = load_parser('grammars/book_grammars/drt.fcfg', trace=0, logic_parser=DrtParser()) >>> for tree in parser.parse('a dog barks'.split()): ... print(tree.label()['SEM'].simplify()) ... ([x],[dog(x), bark(x)]) Alternatively, a ``FeatStructReader`` can be passed with the ``logic_parser`` set on it >>> from nltk.featstruct import FeatStructReader >>> from nltk.grammar import FeatStructNonterminal >>> parser = load_parser('grammars/book_grammars/drt.fcfg', trace=0, fstruct_reader=FeatStructReader(fdict_class=FeatStructNonterminal, logic_parser=DrtParser())) >>> for tree in parser.parse('every girl chases a dog'.split()): ... print(tree.label()['SEM'].simplify().normalize()) ... 
([],[(([z1],[girl(z1)]) -> ([z2],[dog(z2), chase(z1,z2)]))]) Unit Tests ========== Parser ------ >>> print(dexpr(r'([x,y],[sees(x,y)])')) ([x,y],[sees(x,y)]) >>> print(dexpr(r'([x],[man(x), walks(x)])')) ([x],[man(x), walks(x)]) >>> print(dexpr(r'\x.([],[man(x), walks(x)])')) \x.([],[man(x), walks(x)]) >>> print(dexpr(r'\x.\y.([],[sees(x,y)])')) \x y.([],[sees(x,y)]) >>> print(dexpr(r'([x,y],[(x = y)])')) ([x,y],[(x = y)]) >>> print(dexpr(r'([x,y],[(x != y)])')) ([x,y],[-(x = y)]) >>> print(dexpr(r'\x.([],[walks(x)])(john)')) (\x.([],[walks(x)]))(john) >>> print(dexpr(r'\R.\x.([],[big(x,R)])(\y.([],[mouse(y)]))')) (\R x.([],[big(x,R)]))(\y.([],[mouse(y)])) >>> print(dexpr(r'(([x],[walks(x)]) + ([y],[runs(y)]))')) (([x],[walks(x)]) + ([y],[runs(y)])) >>> print(dexpr(r'(([x,y],[walks(x), jumps(y)]) + (([z],[twos(z)]) + ([w],[runs(w)])))')) (([x,y],[walks(x), jumps(y)]) + ([z],[twos(z)]) + ([w],[runs(w)])) >>> print(dexpr(r'((([],[walks(x)]) + ([],[twos(x)])) + ([],[runs(x)]))')) (([],[walks(x)]) + ([],[twos(x)]) + ([],[runs(x)])) >>> print(dexpr(r'((([],[walks(x)]) + ([],[runs(x)])) + (([],[threes(x)]) + ([],[fours(x)])))')) (([],[walks(x)]) + ([],[runs(x)]) + ([],[threes(x)]) + ([],[fours(x)])) >>> print(dexpr(r'(([],[walks(x)]) -> ([],[runs(x)]))')) (([],[walks(x)]) -> ([],[runs(x)])) >>> print(dexpr(r'([x],[PRO(x), sees(John,x)])')) ([x],[PRO(x), sees(John,x)]) >>> print(dexpr(r'([x],[man(x), -([],[walks(x)])])')) ([x],[man(x), -([],[walks(x)])]) >>> print(dexpr(r'([],[(([x],[man(x)]) -> ([],[walks(x)]))])')) ([],[(([x],[man(x)]) -> ([],[walks(x)]))]) >>> print(dexpr(r'DRS([x],[walk(x)])')) ([x],[walk(x)]) >>> print(dexpr(r'DRS([x][walk(x)])')) ([x],[walk(x)]) >>> print(dexpr(r'([x][walk(x)])')) ([x],[walk(x)]) ``simplify()`` -------------- >>> print(dexpr(r'\x.([],[man(x), walks(x)])(john)').simplify()) ([],[man(john), walks(john)]) >>> print(dexpr(r'\x.\y.([z],[dog(z),sees(x,y)])(john)(mary)').simplify()) ([z],[dog(z), sees(john,mary)]) >>> print(dexpr(r'\R x.([],[big(x,R)])(\y.([],[mouse(y)]))').simplify()) \x.([],[big(x,\y.([],[mouse(y)]))]) >>> print(dexpr(r'(([x],[walks(x)]) + ([y],[runs(y)]))').simplify()) ([x,y],[walks(x), runs(y)]) >>> print(dexpr(r'(([x,y],[walks(x), jumps(y)]) + (([z],[twos(z)]) + ([w],[runs(w)])))').simplify()) ([w,x,y,z],[walks(x), jumps(y), twos(z), runs(w)]) >>> print(dexpr(r'((([],[walks(x)]) + ([],[runs(x)]) + ([],[threes(x)]) + ([],[fours(x)])))').simplify()) ([],[walks(x), runs(x), threes(x), fours(x)]) >>> dexpr(r'([x],[man(x)])+([x],[walks(x)])').simplify() == \ ... dexpr(r'([x,z1],[man(x), walks(z1)])') True >>> dexpr(r'([y],[boy(y), (([x],[dog(x)]) -> ([],[chase(x,y)]))])+([x],[run(x)])').simplify() == \ ... dexpr(r'([y,z1],[boy(y), (([x],[dog(x)]) -> ([],[chase(x,y)])), run(z1)])') True >>> dexpr(r'\Q.(([x],[john(x),walks(x)]) + Q)(([x],[PRO(x),leaves(x)]))').simplify() == \ ... 
dexpr(r'([x,z1],[john(x), walks(x), PRO(z1), leaves(z1)])') True >>> logic._counter._value = 0 >>> print(dexpr('([],[(([x],[dog(x)]) -> ([e,y],[boy(y), chase(e), subj(e,x), obj(e,y)]))])+([e,x],[PRO(x), run(e), subj(e,x)])').simplify().normalize().normalize()) ([e02,z5],[(([z3],[dog(z3)]) -> ([e01,z4],[boy(z4), chase(e01), subj(e01,z3), obj(e01,z4)])), PRO(z5), run(e02), subj(e02,z5)]) ``fol()`` ----------- >>> print(dexpr(r'([x,y],[sees(x,y)])').fol()) exists x y.sees(x,y) >>> print(dexpr(r'([x],[man(x), walks(x)])').fol()) exists x.(man(x) & walks(x)) >>> print(dexpr(r'\x.([],[man(x), walks(x)])').fol()) \x.(man(x) & walks(x)) >>> print(dexpr(r'\x y.([],[sees(x,y)])').fol()) \x y.sees(x,y) >>> print(dexpr(r'\x.([],[walks(x)])(john)').fol()) \x.walks(x)(john) >>> print(dexpr(r'\R x.([],[big(x,R)])(\y.([],[mouse(y)]))').fol()) (\R x.big(x,R))(\y.mouse(y)) >>> print(dexpr(r'(([x],[walks(x)]) + ([y],[runs(y)]))').fol()) (exists x.walks(x) & exists y.runs(y)) >>> print(dexpr(r'(([],[walks(x)]) -> ([],[runs(x)]))').fol()) (walks(x) -> runs(x)) >>> print(dexpr(r'([x],[PRO(x), sees(John,x)])').fol()) exists x.(PRO(x) & sees(John,x)) >>> print(dexpr(r'([x],[man(x), -([],[walks(x)])])').fol()) exists x.(man(x) & -walks(x)) >>> print(dexpr(r'([],[(([x],[man(x)]) -> ([],[walks(x)]))])').fol()) all x.(man(x) -> walks(x)) >>> print(dexpr(r'([x],[man(x) | walks(x)])').fol()) exists x.(man(x) | walks(x)) >>> print(dexpr(r'P(x) + ([x],[walks(x)])').fol()) (P(x) & exists x.walks(x)) ``resolve_anaphora()`` ---------------------- >>> from nltk.sem.drt import AnaphoraResolutionException >>> print(resolve_anaphora(dexpr(r'([x,y,z],[dog(x), cat(y), walks(z), PRO(z)])'))) ([x,y,z],[dog(x), cat(y), walks(z), (z = [x,y])]) >>> print(resolve_anaphora(dexpr(r'([],[(([x],[dog(x)]) -> ([y],[walks(y), PRO(y)]))])'))) ([],[(([x],[dog(x)]) -> ([y],[walks(y), (y = x)]))]) >>> print(resolve_anaphora(dexpr(r'(([x,y],[]) + ([],[PRO(x)]))')).simplify()) ([x,y],[(x = y)]) >>> try: print(resolve_anaphora(dexpr(r'([x],[walks(x), PRO(x)])'))) ... except AnaphoraResolutionException as e: print(e) Variable 'x' does not resolve to anything. >>> print(resolve_anaphora(dexpr('([e01,z6,z7],[boy(z6), PRO(z7), run(e01), subj(e01,z7)])'))) ([e01,z6,z7],[boy(z6), (z7 = z6), run(e01), subj(e01,z7)]) ``equiv()``: ---------------- >>> a = dexpr(r'([x],[man(x), walks(x)])') >>> b = dexpr(r'([x],[walks(x), man(x)])') >>> print(a.equiv(b, TableauProver())) True ``replace()``: -------------- >>> a = dexpr(r'a') >>> w = dexpr(r'w') >>> x = dexpr(r'x') >>> y = dexpr(r'y') >>> z = dexpr(r'z') replace bound ------------- >>> print(dexpr(r'([x],[give(x,y,z)])').replace(x.variable, a, False)) ([x],[give(x,y,z)]) >>> print(dexpr(r'([x],[give(x,y,z)])').replace(x.variable, a, True)) ([a],[give(a,y,z)]) replace unbound --------------- >>> print(dexpr(r'([x],[give(x,y,z)])').replace(y.variable, a, False)) ([x],[give(x,a,z)]) >>> print(dexpr(r'([x],[give(x,y,z)])').replace(y.variable, a, True)) ([x],[give(x,a,z)]) replace unbound with bound -------------------------- >>> dexpr(r'([x],[give(x,y,z)])').replace(y.variable, x, False) == \ ... dexpr('([z1],[give(z1,x,z)])') True >>> dexpr(r'([x],[give(x,y,z)])').replace(y.variable, x, True) == \ ... 
dexpr('([z1],[give(z1,x,z)])') True replace unbound with unbound ---------------------------- >>> print(dexpr(r'([x],[give(x,y,z)])').replace(y.variable, z, False)) ([x],[give(x,z,z)]) >>> print(dexpr(r'([x],[give(x,y,z)])').replace(y.variable, z, True)) ([x],[give(x,z,z)]) replace unbound --------------- >>> print(dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,z)])').replace(z.variable, a, False)) (([x],[P(x,y,a)]) + ([y],[Q(x,y,a)])) >>> print(dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,z)])').replace(z.variable, a, True)) (([x],[P(x,y,a)]) + ([y],[Q(x,y,a)])) replace bound ------------- >>> print(dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,z)])').replace(x.variable, a, False)) (([x],[P(x,y,z)]) + ([y],[Q(x,y,z)])) >>> print(dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,z)])').replace(x.variable, a, True)) (([a],[P(a,y,z)]) + ([y],[Q(a,y,z)])) replace unbound with unbound ---------------------------- >>> print(dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,z)])').replace(z.variable, a, False)) (([x],[P(x,y,a)]) + ([y],[Q(x,y,a)])) >>> print(dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,z)])').replace(z.variable, a, True)) (([x],[P(x,y,a)]) + ([y],[Q(x,y,a)])) replace unbound with bound on same side --------------------------------------- >>> dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,w)])').replace(z.variable, x, False) == \ ... dexpr(r'(([z1],[P(z1,y,x)]) + ([y],[Q(z1,y,w)]))') True >>> dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,w)])').replace(z.variable, x, True) == \ ... dexpr(r'(([z1],[P(z1,y,x)]) + ([y],[Q(z1,y,w)]))') True replace unbound with bound on other side ---------------------------------------- >>> dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,w)])').replace(w.variable, x, False) == \ ... dexpr(r'(([z1],[P(z1,y,z)]) + ([y],[Q(z1,y,x)]))') True >>> dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,w)])').replace(w.variable, x, True) == \ ... dexpr(r'(([z1],[P(z1,y,z)]) + ([y],[Q(z1,y,x)]))') True replace unbound with double bound --------------------------------- >>> dexpr(r'([x],[P(x,y,z)])+([x],[Q(x,y,w)])').replace(z.variable, x, False) == \ ... dexpr(r'(([z1],[P(z1,y,x)]) + ([z1],[Q(z1,y,w)]))') True >>> dexpr(r'([x],[P(x,y,z)])+([x],[Q(x,y,w)])').replace(z.variable, x, True) == \ ... 
dexpr(r'(([z1],[P(z1,y,x)]) + ([z1],[Q(z1,y,w)]))') True regression tests ---------------- >>> d = dexpr('([x],[A(c), ([y],[B(x,y,z,a)])->([z],[C(x,y,z,a)])])') >>> print(d) ([x],[A(c), (([y],[B(x,y,z,a)]) -> ([z],[C(x,y,z,a)]))]) >>> print(d.pretty_format()) ____________________________________ | x | |------------------------------------| | A(c) | | ____________ ____________ | | | y | | z | | | (|------------| -> |------------|) | | | B(x,y,z,a) | | C(x,y,z,a) | | | |____________| |____________| | |____________________________________| >>> print(str(d)) ([x],[A(c), (([y],[B(x,y,z,a)]) -> ([z],[C(x,y,z,a)]))]) >>> print(d.fol()) exists x.(A(c) & all y.(B(x,y,z,a) -> exists z.C(x,y,z,a))) >>> print(d.replace(Variable('a'), DrtVariableExpression(Variable('r')))) ([x],[A(c), (([y],[B(x,y,z,r)]) -> ([z],[C(x,y,z,r)]))]) >>> print(d.replace(Variable('x'), DrtVariableExpression(Variable('r')))) ([x],[A(c), (([y],[B(x,y,z,a)]) -> ([z],[C(x,y,z,a)]))]) >>> print(d.replace(Variable('y'), DrtVariableExpression(Variable('r')))) ([x],[A(c), (([y],[B(x,y,z,a)]) -> ([z],[C(x,y,z,a)]))]) >>> print(d.replace(Variable('z'), DrtVariableExpression(Variable('r')))) ([x],[A(c), (([y],[B(x,y,r,a)]) -> ([z],[C(x,y,z,a)]))]) >>> print(d.replace(Variable('x'), DrtVariableExpression(Variable('r')), True)) ([r],[A(c), (([y],[B(r,y,z,a)]) -> ([z],[C(r,y,z,a)]))]) >>> print(d.replace(Variable('y'), DrtVariableExpression(Variable('r')), True)) ([x],[A(c), (([r],[B(x,r,z,a)]) -> ([z],[C(x,r,z,a)]))]) >>> print(d.replace(Variable('z'), DrtVariableExpression(Variable('r')), True)) ([x],[A(c), (([y],[B(x,y,r,a)]) -> ([r],[C(x,y,r,a)]))]) >>> print(d == dexpr('([l],[A(c), ([m],[B(l,m,z,a)])->([n],[C(l,m,n,a)])])')) True >>> d = dexpr('([],[([x,y],[B(x,y,h), ([a,b],[dee(x,a,g)])])->([z,w],[cee(x,y,f), ([c,d],[E(x,c,d,e)])])])') >>> sorted(d.free()) [Variable('B'), Variable('E'), Variable('e'), Variable('f'), Variable('g'), Variable('h')] >>> sorted(d.variables()) [Variable('B'), Variable('E'), Variable('e'), Variable('f'), Variable('g'), Variable('h')] >>> sorted(d.get_refs(True)) [Variable('a'), Variable('b'), Variable('c'), Variable('d'), Variable('w'), Variable('x'), Variable('y'), Variable('z')] >>> sorted(d.conds[0].get_refs(False)) [Variable('x'), Variable('y')] >>> print(dexpr('([x,y],[A(x,y), (x=y), ([],[B(x,y)])->([],[C(x,y)]), ([x,y],[D(x,y)])->([],[E(x,y)]), ([],[F(x,y)])->([x,y],[G(x,y)])])').eliminate_equality()) ([x],[A(x,x), (([],[B(x,x)]) -> ([],[C(x,x)])), (([x,y],[D(x,y)]) -> ([],[E(x,y)])), (([],[F(x,x)]) -> ([x,y],[G(x,y)]))]) >>> print(dexpr('([x,y],[A(x,y), (x=y)]) -> ([],[B(x,y)])').eliminate_equality()) (([x],[A(x,x)]) -> ([],[B(x,x)])) >>> print(dexpr('([x,y],[A(x,y)]) -> ([],[B(x,y), (x=y)])').eliminate_equality()) (([x,y],[A(x,y)]) -> ([],[B(x,x)])) >>> print(dexpr('([x,y],[A(x,y), (x=y), ([],[B(x,y)])])').eliminate_equality()) ([x],[A(x,x), ([],[B(x,x)])]) >>> print(dexpr('([x,y],[A(x,y), ([],[B(x,y), (x=y)])])').eliminate_equality()) ([x,y],[A(x,y), ([],[B(x,x)])]) >>> print(dexpr('([z8 z9 z10],[A(z8), z8=z10, z9=z10, B(z9), C(z10), D(z10)])').eliminate_equality()) ([z9],[A(z9), B(z9), C(z9), D(z9)]) >>> print(dexpr('([x,y],[A(x,y), (x=y), ([],[B(x,y)]), ([x,y],[C(x,y)])])').eliminate_equality()) ([x],[A(x,x), ([],[B(x,x)]), ([x,y],[C(x,y)])]) >>> print(dexpr('([x,y],[A(x,y)]) + ([],[B(x,y), (x=y)]) + ([],[C(x,y)])').eliminate_equality()) ([x],[A(x,x), B(x,x), C(x,x)]) >>> print(dexpr('([x,y],[B(x,y)])+([x,y],[C(x,y)])').replace(Variable('y'), DrtVariableExpression(Variable('x')))) 
(([x,y],[B(x,y)]) + ([x,y],[C(x,y)])) >>> print(dexpr('(([x,y],[B(x,y)])+([],[C(x,y)]))+([],[D(x,y)])').replace(Variable('y'), DrtVariableExpression(Variable('x')))) (([x,y],[B(x,y)]) + ([],[C(x,y)]) + ([],[D(x,y)])) >>> print(dexpr('(([],[B(x,y)])+([],[C(x,y)]))+([],[D(x,y)])').replace(Variable('y'), DrtVariableExpression(Variable('x')))) (([],[B(x,x)]) + ([],[C(x,x)]) + ([],[D(x,x)])) >>> print(dexpr('(([],[B(x,y), ([x,y],[A(x,y)])])+([],[C(x,y)]))+([],[D(x,y)])').replace(Variable('y'), DrtVariableExpression(Variable('x'))).normalize()) (([],[B(z3,z1), ([z2,z3],[A(z3,z2)])]) + ([],[C(z3,z1)]) + ([],[D(z3,z1)])) Parse errors ============ >>> def parse_error(drtstring): ... try: dexpr(drtstring) ... except logic.LogicalExpressionException as e: print(e) >>> parse_error(r'') End of input found. Expression expected. ^ >>> parse_error(r'(') End of input found. Expression expected. ( ^ >>> parse_error(r'()') Unexpected token: ')'. Expression expected. () ^ >>> parse_error(r'([') End of input found. Expected token ']'. ([ ^ >>> parse_error(r'([,') ',' is an illegal variable name. Constants may not be quantified. ([, ^ >>> parse_error(r'([x,') End of input found. Variable expected. ([x, ^ >>> parse_error(r'([]') End of input found. Expected token '['. ([] ^ >>> parse_error(r'([][') End of input found. Expected token ']'. ([][ ^ >>> parse_error(r'([][,') Unexpected token: ','. Expression expected. ([][, ^ >>> parse_error(r'([][]') End of input found. Expected token ')'. ([][] ^ >>> parse_error(r'([x][man(x)]) |') End of input found. Expression expected. ([x][man(x)]) | ^ Pretty Printing =============== >>> dexpr(r"([],[])").pretty_print() __ | | |--| |__| >>> dexpr(r"([],[([x],[big(x), dog(x)]) -> ([],[bark(x)]) -([x],[walk(x)])])").pretty_print() _____________________________ | | |-----------------------------| | ________ _________ | | | x | | | | | (|--------| -> |---------|) | | | big(x) | | bark(x) | | | | dog(x) | |_________| | | |________| | | _________ | | | x | | | __ |---------| | | | | walk(x) | | | |_________| | |_____________________________| >>> dexpr(r"([x,y],[x=y]) + ([z],[dog(z), walk(z)])").pretty_print() _________ _________ | x y | | z | (|---------| + |---------|) | (x = y) | | dog(z) | |_________| | walk(z) | |_________| >>> dexpr(r"([],[([x],[]) | ([y],[]) | ([z],[dog(z), walk(z)])])").pretty_print() _______________________________ | | |-------------------------------| | ___ ___ _________ | | | x | | y | | z | | | (|---| | |---| | |---------|) | | |___| |___| | dog(z) | | | | walk(z) | | | |_________| | |_______________________________| >>> dexpr(r"\P.\Q.(([x],[]) + P(x) + Q(x))(\x.([],[dog(x)]))").pretty_print() ___ ________ \ | x | \ | | /\ P Q.(|---| + P(x) + Q(x))( /\ x.|--------|) |___| | dog(x) | |________| nltk-3.7/nltk/test/featgram.doctest000066400000000000000000000667601420073152400174540ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. For license information, see LICENSE.TXT ========================= Feature Grammar Parsing ========================= .. include:: ../../../nltk_book/definitions.rst Grammars can be parsed from strings. >>> import nltk >>> from nltk import grammar, parse >>> g = """ ... % start DP ... DP[AGR=?a] -> D[AGR=?a] N[AGR=?a] ... D[AGR=[NUM='sg', PERS=3]] -> 'this' | 'that' ... D[AGR=[NUM='pl', PERS=3]] -> 'these' | 'those' ... D[AGR=[NUM='pl', PERS=1]] -> 'we' ... D[AGR=[PERS=2]] -> 'you' ... N[AGR=[NUM='sg', GND='m']] -> 'boy' ... N[AGR=[NUM='pl', GND='m']] -> 'boys' ... N[AGR=[NUM='sg', GND='f']] -> 'girl' ... 
N[AGR=[NUM='pl', GND='f']] -> 'girls' ... N[AGR=[NUM='sg']] -> 'student' ... N[AGR=[NUM='pl']] -> 'students' ... """ >>> grammar = grammar.FeatureGrammar.fromstring(g) >>> tokens = 'these girls'.split() >>> parser = parse.FeatureEarleyChartParser(grammar) >>> trees = parser.parse(tokens) >>> for tree in trees: print(tree) (DP[AGR=[GND='f', NUM='pl', PERS=3]] (D[AGR=[NUM='pl', PERS=3]] these) (N[AGR=[GND='f', NUM='pl']] girls)) In general, when we are trying to develop even a very small grammar, it is convenient to put the rules in a file where they can be edited, tested and revised. Let's assume that we have saved feat0cfg as a file named ``'feat0.fcfg'`` and placed it in the NLTK ``data`` directory. We can inspect it as follows: >>> nltk.data.show_cfg('grammars/book_grammars/feat0.fcfg') % start S # ################### # Grammar Productions # ################### # S expansion productions S -> NP[NUM=?n] VP[NUM=?n] # NP expansion productions NP[NUM=?n] -> N[NUM=?n] NP[NUM=?n] -> PropN[NUM=?n] NP[NUM=?n] -> Det[NUM=?n] N[NUM=?n] NP[NUM=pl] -> N[NUM=pl] # VP expansion productions VP[TENSE=?t, NUM=?n] -> IV[TENSE=?t, NUM=?n] VP[TENSE=?t, NUM=?n] -> TV[TENSE=?t, NUM=?n] NP # ################### # Lexical Productions # ################### Det[NUM=sg] -> 'this' | 'every' Det[NUM=pl] -> 'these' | 'all' Det -> 'the' | 'some' | 'several' PropN[NUM=sg]-> 'Kim' | 'Jody' N[NUM=sg] -> 'dog' | 'girl' | 'car' | 'child' N[NUM=pl] -> 'dogs' | 'girls' | 'cars' | 'children' IV[TENSE=pres, NUM=sg] -> 'disappears' | 'walks' TV[TENSE=pres, NUM=sg] -> 'sees' | 'likes' IV[TENSE=pres, NUM=pl] -> 'disappear' | 'walk' TV[TENSE=pres, NUM=pl] -> 'see' | 'like' IV[TENSE=past] -> 'disappeared' | 'walked' TV[TENSE=past] -> 'saw' | 'liked' Assuming we have saved feat0cfg as a file named ``'feat0.fcfg'``, the function ``parse.load_parser`` allows us to read the grammar into NLTK, ready for use in parsing. >>> cp = parse.load_parser('grammars/book_grammars/feat0.fcfg', trace=1) >>> sent = 'Kim likes children' >>> tokens = sent.split() >>> tokens ['Kim', 'likes', 'children'] >>> trees = cp.parse(tokens) |.Kim .like.chil.| |[----] . .| [0:1] 'Kim' |. [----] .| [1:2] 'likes' |. . [----]| [2:3] 'children' |[----] . .| [0:1] PropN[NUM='sg'] -> 'Kim' * |[----] . .| [0:1] NP[NUM='sg'] -> PropN[NUM='sg'] * |[----> . .| [0:1] S[] -> NP[NUM=?n] * VP[NUM=?n] {?n: 'sg'} |. [----] .| [1:2] TV[NUM='sg', TENSE='pres'] -> 'likes' * |. [----> .| [1:2] VP[NUM=?n, TENSE=?t] -> TV[NUM=?n, TENSE=?t] * NP[] {?n: 'sg', ?t: 'pres'} |. . [----]| [2:3] N[NUM='pl'] -> 'children' * |. . [----]| [2:3] NP[NUM='pl'] -> N[NUM='pl'] * |. . [---->| [2:3] S[] -> NP[NUM=?n] * VP[NUM=?n] {?n: 'pl'} |. [---------]| [1:3] VP[NUM='sg', TENSE='pres'] -> TV[NUM='sg', TENSE='pres'] NP[] * |[==============]| [0:3] S[] -> NP[NUM='sg'] VP[NUM='sg'] * >>> for tree in trees: print(tree) (S[] (NP[NUM='sg'] (PropN[NUM='sg'] Kim)) (VP[NUM='sg', TENSE='pres'] (TV[NUM='sg', TENSE='pres'] likes) (NP[NUM='pl'] (N[NUM='pl'] children)))) The parser works directly with the underspecified productions given by the grammar. That is, the Predictor rule does not attempt to compile out all admissible feature combinations before trying to expand the non-terminals on the left hand side of a production. However, when the Scanner matches an input word against a lexical production that has been predicted, the new edge will typically contain fully specified features; e.g., the edge [PropN[`num`:feat: = `sg`:fval:] |rarr| 'Kim', (0, 1)]. 
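The binding ``{?n: 'sg'}`` recorded in the trace can be previewed in isolation with ``nltk.unify``, which the following paragraphs discuss in more detail. This is only an illustrative sketch; the feature structures below are ours, not part of the grammar::

    import nltk

    # An underspecified feature value (the variable ?n), as used in a
    # predicted edge, and the fully specified value contributed by a
    # lexical edge.
    underspecified = nltk.FeatStruct("[NUM=?n]")
    lexical = nltk.FeatStruct("[NUM='sg']")

    # Unifying the two instantiates ?n, just as happens when the parser
    # combines a predicted edge with a completed one.
    print(nltk.unify(underspecified, lexical))   # [ NUM = 'sg' ]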
Recall from Chapter 8 that the Fundamental (or Completer) Rule in standard CFGs is used to combine an incomplete edge that's expecting a nonterminal *B* with a following, complete edge whose left hand side matches *B*. In our current setting, rather than checking for a complete match, we test whether the expected category *B* will `unify`:dt: with the left hand side *B'* of a following complete edge. We will explain in more detail in Section 9.2 how unification works; for the moment, it is enough to know that as a result of unification, any variable values of features in *B* will be instantiated by constant values in the corresponding feature structure in *B'*, and these instantiated values will be used in the new edge added by the Completer. This instantiation can be seen, for example, in the edge [NP [`num`:feat:\ =\ `sg`:fval:] |rarr| PropN[`num`:feat:\ =\ `sg`:fval:] |dot|, (0, 1)] in Example 9.2, where the feature `num`:feat: has been assigned the value `sg`:fval:. Feature structures in NLTK are ... Atomic feature values can be strings or integers. >>> fs1 = nltk.FeatStruct(TENSE='past', NUM='sg') >>> print(fs1) [ NUM = 'sg' ] [ TENSE = 'past' ] We can think of a feature structure as being like a Python dictionary, and access its values by indexing in the usual way. >>> fs1 = nltk.FeatStruct(PER=3, NUM='pl', GND='fem') >>> print(fs1['GND']) fem We can also define feature structures which have complex values, as discussed earlier. >>> fs2 = nltk.FeatStruct(POS='N', AGR=fs1) >>> print(fs2) [ [ GND = 'fem' ] ] [ AGR = [ NUM = 'pl' ] ] [ [ PER = 3 ] ] [ ] [ POS = 'N' ] >>> print(fs2['AGR']) [ GND = 'fem' ] [ NUM = 'pl' ] [ PER = 3 ] >>> print(fs2['AGR']['PER']) 3 Feature structures can also be constructed using the ``parse()`` method of the ``nltk.FeatStruct`` class. Note that in this case, atomic feature values do not need to be enclosed in quotes. >>> f1 = nltk.FeatStruct("[NUMBER = sg]") >>> f2 = nltk.FeatStruct("[PERSON = 3]") >>> print(nltk.unify(f1, f2)) [ NUMBER = 'sg' ] [ PERSON = 3 ] >>> f1 = nltk.FeatStruct("[A = [B = b, D = d]]") >>> f2 = nltk.FeatStruct("[A = [C = c, D = d]]") >>> print(nltk.unify(f1, f2)) [ [ B = 'b' ] ] [ A = [ C = 'c' ] ] [ [ D = 'd' ] ] Feature Structures as Graphs ---------------------------- Feature structures are not inherently tied to linguistic objects; they are general purpose structures for representing knowledge. For example, we could encode information about a person in a feature structure: >>> person01 = nltk.FeatStruct("[NAME=Lee, TELNO='01 27 86 42 96',AGE=33]") >>> print(person01) [ AGE = 33 ] [ NAME = 'Lee' ] [ TELNO = '01 27 86 42 96' ] There are a number of notations for representing reentrancy in matrix-style representations of feature structures. In NLTK, we adopt the following convention: the first occurrence of a shared feature structure is prefixed with an integer in parentheses, such as ``(1)``, and any subsequent reference to that structure uses the notation ``->(1)``, as shown below. >>> fs = nltk.FeatStruct("""[NAME=Lee, ADDRESS=(1)[NUMBER=74, STREET='rue Pascal'], ... SPOUSE=[NAME=Kim, ADDRESS->(1)]]""") >>> print(fs) [ ADDRESS = (1) [ NUMBER = 74 ] ] [ [ STREET = 'rue Pascal' ] ] [ ] [ NAME = 'Lee' ] [ ] [ SPOUSE = [ ADDRESS -> (1) ] ] [ [ NAME = 'Kim' ] ] There can be any number of tags within a single feature structure. 
>>> fs3 = nltk.FeatStruct("[A=(1)[B=b], C=(2)[], D->(1), E->(2)]") >>> print(fs3) [ A = (1) [ B = 'b' ] ] [ ] [ C = (2) [] ] [ ] [ D -> (1) ] [ E -> (2) ] >>> fs1 = nltk.FeatStruct(NUMBER=74, STREET='rue Pascal') >>> fs2 = nltk.FeatStruct(CITY='Paris') >>> print(nltk.unify(fs1, fs2)) [ CITY = 'Paris' ] [ NUMBER = 74 ] [ STREET = 'rue Pascal' ] Unification is symmetric: >>> nltk.unify(fs1, fs2) == nltk.unify(fs2, fs1) True Unification is commutative: >>> fs3 = nltk.FeatStruct(TELNO='01 27 86 42 96') >>> nltk.unify(nltk.unify(fs1, fs2), fs3) == nltk.unify(fs1, nltk.unify(fs2, fs3)) True Unification between `FS`:math:\ :subscript:`0` and `FS`:math:\ :subscript:`1` will fail if the two feature structures share a path |pi|, but the value of |pi| in `FS`:math:\ :subscript:`0` is a distinct atom from the value of |pi| in `FS`:math:\ :subscript:`1`. In NLTK, this is implemented by setting the result of unification to be ``None``. >>> fs0 = nltk.FeatStruct(A='a') >>> fs1 = nltk.FeatStruct(A='b') >>> print(nltk.unify(fs0, fs1)) None Now, if we look at how unification interacts with structure-sharing, things become really interesting. >>> fs0 = nltk.FeatStruct("""[NAME=Lee, ... ADDRESS=[NUMBER=74, ... STREET='rue Pascal'], ... SPOUSE= [NAME=Kim, ... ADDRESS=[NUMBER=74, ... STREET='rue Pascal']]]""") >>> print(fs0) [ ADDRESS = [ NUMBER = 74 ] ] [ [ STREET = 'rue Pascal' ] ] [ ] [ NAME = 'Lee' ] [ ] [ [ ADDRESS = [ NUMBER = 74 ] ] ] [ SPOUSE = [ [ STREET = 'rue Pascal' ] ] ] [ [ ] ] [ [ NAME = 'Kim' ] ] >>> fs1 = nltk.FeatStruct("[SPOUSE=[ADDRESS=[CITY=Paris]]]") >>> print(nltk.unify(fs0, fs1)) [ ADDRESS = [ NUMBER = 74 ] ] [ [ STREET = 'rue Pascal' ] ] [ ] [ NAME = 'Lee' ] [ ] [ [ [ CITY = 'Paris' ] ] ] [ [ ADDRESS = [ NUMBER = 74 ] ] ] [ SPOUSE = [ [ STREET = 'rue Pascal' ] ] ] [ [ ] ] [ [ NAME = 'Kim' ] ] >>> fs2 = nltk.FeatStruct("""[NAME=Lee, ADDRESS=(1)[NUMBER=74, STREET='rue Pascal'], ... SPOUSE=[NAME=Kim, ADDRESS->(1)]]""") >>> print(fs2) [ ADDRESS = (1) [ NUMBER = 74 ] ] [ [ STREET = 'rue Pascal' ] ] [ ] [ NAME = 'Lee' ] [ ] [ SPOUSE = [ ADDRESS -> (1) ] ] [ [ NAME = 'Kim' ] ] >>> print(nltk.unify(fs2, fs1)) [ [ CITY = 'Paris' ] ] [ ADDRESS = (1) [ NUMBER = 74 ] ] [ [ STREET = 'rue Pascal' ] ] [ ] [ NAME = 'Lee' ] [ ] [ SPOUSE = [ ADDRESS -> (1) ] ] [ [ NAME = 'Kim' ] ] >>> fs1 = nltk.FeatStruct("[ADDRESS1=[NUMBER=74, STREET='rue Pascal']]") >>> fs2 = nltk.FeatStruct("[ADDRESS1=?x, ADDRESS2=?x]") >>> print(fs2) [ ADDRESS1 = ?x ] [ ADDRESS2 = ?x ] >>> print(nltk.unify(fs1, fs2)) [ ADDRESS1 = (1) [ NUMBER = 74 ] ] [ [ STREET = 'rue Pascal' ] ] [ ] [ ADDRESS2 -> (1) ] >>> sent = 'who do you claim that you like' >>> tokens = sent.split() >>> cp = parse.load_parser('grammars/book_grammars/feat1.fcfg', trace=1) >>> trees = cp.parse(tokens) |.w.d.y.c.t.y.l.| |[-] . . . . . .| [0:1] 'who' |. [-] . . . . .| [1:2] 'do' |. . [-] . . . .| [2:3] 'you' |. . . [-] . . .| [3:4] 'claim' |. . . . [-] . .| [4:5] 'that' |. . . . . [-] .| [5:6] 'you' |. . . . . . [-]| [6:7] 'like' |# . . . . . . .| [0:0] NP[]/NP[] -> * |. # . . . . . .| [1:1] NP[]/NP[] -> * |. . # . . . . .| [2:2] NP[]/NP[] -> * |. . . # . . . .| [3:3] NP[]/NP[] -> * |. . . . # . . .| [4:4] NP[]/NP[] -> * |. . . . . # . .| [5:5] NP[]/NP[] -> * |. . . . . . # .| [6:6] NP[]/NP[] -> * |. . . . . . . #| [7:7] NP[]/NP[] -> * |[-] . . . . . .| [0:1] NP[+WH] -> 'who' * |[-> . . . . . .| [0:1] S[-INV] -> NP[] * VP[] {} |[-> . . . . . .| [0:1] S[-INV]/?x[] -> NP[] * VP[]/?x[] {} |[-> . . . . . .| [0:1] S[-INV] -> NP[] * S[]/NP[] {} |. [-] . . . . 
.| [1:2] V[+AUX] -> 'do' * |. [-> . . . . .| [1:2] S[+INV] -> V[+AUX] * NP[] VP[] {} |. [-> . . . . .| [1:2] S[+INV]/?x[] -> V[+AUX] * NP[] VP[]/?x[] {} |. [-> . . . . .| [1:2] VP[] -> V[+AUX] * VP[] {} |. [-> . . . . .| [1:2] VP[]/?x[] -> V[+AUX] * VP[]/?x[] {} |. . [-] . . . .| [2:3] NP[-WH] -> 'you' * |. . [-> . . . .| [2:3] S[-INV] -> NP[] * VP[] {} |. . [-> . . . .| [2:3] S[-INV]/?x[] -> NP[] * VP[]/?x[] {} |. . [-> . . . .| [2:3] S[-INV] -> NP[] * S[]/NP[] {} |. [---> . . . .| [1:3] S[+INV] -> V[+AUX] NP[] * VP[] {} |. [---> . . . .| [1:3] S[+INV]/?x[] -> V[+AUX] NP[] * VP[]/?x[] {} |. . . [-] . . .| [3:4] V[-AUX, SUBCAT='clause'] -> 'claim' * |. . . [-> . . .| [3:4] VP[] -> V[-AUX, SUBCAT='clause'] * SBar[] {} |. . . [-> . . .| [3:4] VP[]/?x[] -> V[-AUX, SUBCAT='clause'] * SBar[]/?x[] {} |. . . . [-] . .| [4:5] Comp[] -> 'that' * |. . . . [-> . .| [4:5] SBar[] -> Comp[] * S[-INV] {} |. . . . [-> . .| [4:5] SBar[]/?x[] -> Comp[] * S[-INV]/?x[] {} |. . . . . [-] .| [5:6] NP[-WH] -> 'you' * |. . . . . [-> .| [5:6] S[-INV] -> NP[] * VP[] {} |. . . . . [-> .| [5:6] S[-INV]/?x[] -> NP[] * VP[]/?x[] {} |. . . . . [-> .| [5:6] S[-INV] -> NP[] * S[]/NP[] {} |. . . . . . [-]| [6:7] V[-AUX, SUBCAT='trans'] -> 'like' * |. . . . . . [->| [6:7] VP[] -> V[-AUX, SUBCAT='trans'] * NP[] {} |. . . . . . [->| [6:7] VP[]/?x[] -> V[-AUX, SUBCAT='trans'] * NP[]/?x[] {} |. . . . . . [-]| [6:7] VP[]/NP[] -> V[-AUX, SUBCAT='trans'] NP[]/NP[] * |. . . . . [---]| [5:7] S[-INV]/NP[] -> NP[] VP[]/NP[] * |. . . . [-----]| [4:7] SBar[]/NP[] -> Comp[] S[-INV]/NP[] * |. . . [-------]| [3:7] VP[]/NP[] -> V[-AUX, SUBCAT='clause'] SBar[]/NP[] * |. . [---------]| [2:7] S[-INV]/NP[] -> NP[] VP[]/NP[] * |. [-----------]| [1:7] S[+INV]/NP[] -> V[+AUX] NP[] VP[]/NP[] * |[=============]| [0:7] S[-INV] -> NP[] S[]/NP[] * >>> trees = list(trees) >>> for tree in trees: print(tree) (S[-INV] (NP[+WH] who) (S[+INV]/NP[] (V[+AUX] do) (NP[-WH] you) (VP[]/NP[] (V[-AUX, SUBCAT='clause'] claim) (SBar[]/NP[] (Comp[] that) (S[-INV]/NP[] (NP[-WH] you) (VP[]/NP[] (V[-AUX, SUBCAT='trans'] like) (NP[]/NP[] ))))))) A different parser should give the same parse trees, but perhaps in a different order: >>> cp2 = parse.load_parser('grammars/book_grammars/feat1.fcfg', trace=1, ... parser=parse.FeatureEarleyChartParser) >>> trees2 = cp2.parse(tokens) |.w.d.y.c.t.y.l.| |[-] . . . . . .| [0:1] 'who' |. [-] . . . . .| [1:2] 'do' |. . [-] . . . .| [2:3] 'you' |. . . [-] . . .| [3:4] 'claim' |. . . . [-] . .| [4:5] 'that' |. . . . . [-] .| [5:6] 'you' |. . . . . . [-]| [6:7] 'like' |> . . . . . . .| [0:0] S[-INV] -> * NP[] VP[] {} |> . . . . . . .| [0:0] S[-INV]/?x[] -> * NP[] VP[]/?x[] {} |> . . . . . . .| [0:0] S[-INV] -> * NP[] S[]/NP[] {} |> . . . . . . .| [0:0] S[-INV] -> * Adv[+NEG] S[+INV] {} |> . . . . . . .| [0:0] S[+INV] -> * V[+AUX] NP[] VP[] {} |> . . . . . . .| [0:0] S[+INV]/?x[] -> * V[+AUX] NP[] VP[]/?x[] {} |> . . . . . . .| [0:0] NP[+WH] -> * 'who' {} |[-] . . . . . .| [0:1] NP[+WH] -> 'who' * |[-> . . . . . .| [0:1] S[-INV] -> NP[] * VP[] {} |[-> . . . . . .| [0:1] S[-INV]/?x[] -> NP[] * VP[]/?x[] {} |[-> . . . . . .| [0:1] S[-INV] -> NP[] * S[]/NP[] {} |. > . . . . . .| [1:1] S[-INV]/?x[] -> * NP[] VP[]/?x[] {} |. > . . . . . .| [1:1] S[+INV]/?x[] -> * V[+AUX] NP[] VP[]/?x[] {} |. > . . . . . .| [1:1] V[+AUX] -> * 'do' {} |. > . . . . . .| [1:1] VP[]/?x[] -> * V[-AUX, SUBCAT='trans'] NP[]/?x[] {} |. > . . . . . .| [1:1] VP[]/?x[] -> * V[-AUX, SUBCAT='clause'] SBar[]/?x[] {} |. > . . . . . 
.| [1:1] VP[]/?x[] -> * V[+AUX] VP[]/?x[] {} |. > . . . . . .| [1:1] VP[] -> * V[-AUX, SUBCAT='intrans'] {} |. > . . . . . .| [1:1] VP[] -> * V[-AUX, SUBCAT='trans'] NP[] {} |. > . . . . . .| [1:1] VP[] -> * V[-AUX, SUBCAT='clause'] SBar[] {} |. > . . . . . .| [1:1] VP[] -> * V[+AUX] VP[] {} |. [-] . . . . .| [1:2] V[+AUX] -> 'do' * |. [-> . . . . .| [1:2] S[+INV]/?x[] -> V[+AUX] * NP[] VP[]/?x[] {} |. [-> . . . . .| [1:2] VP[]/?x[] -> V[+AUX] * VP[]/?x[] {} |. [-> . . . . .| [1:2] VP[] -> V[+AUX] * VP[] {} |. . > . . . . .| [2:2] VP[] -> * V[-AUX, SUBCAT='intrans'] {} |. . > . . . . .| [2:2] VP[] -> * V[-AUX, SUBCAT='trans'] NP[] {} |. . > . . . . .| [2:2] VP[] -> * V[-AUX, SUBCAT='clause'] SBar[] {} |. . > . . . . .| [2:2] VP[] -> * V[+AUX] VP[] {} |. . > . . . . .| [2:2] VP[]/?x[] -> * V[-AUX, SUBCAT='trans'] NP[]/?x[] {} |. . > . . . . .| [2:2] VP[]/?x[] -> * V[-AUX, SUBCAT='clause'] SBar[]/?x[] {} |. . > . . . . .| [2:2] VP[]/?x[] -> * V[+AUX] VP[]/?x[] {} |. . > . . . . .| [2:2] NP[-WH] -> * 'you' {} |. . [-] . . . .| [2:3] NP[-WH] -> 'you' * |. [---> . . . .| [1:3] S[+INV]/?x[] -> V[+AUX] NP[] * VP[]/?x[] {} |. . . > . . . .| [3:3] VP[]/?x[] -> * V[-AUX, SUBCAT='trans'] NP[]/?x[] {} |. . . > . . . .| [3:3] VP[]/?x[] -> * V[-AUX, SUBCAT='clause'] SBar[]/?x[] {} |. . . > . . . .| [3:3] VP[]/?x[] -> * V[+AUX] VP[]/?x[] {} |. . . > . . . .| [3:3] V[-AUX, SUBCAT='clause'] -> * 'claim' {} |. . . [-] . . .| [3:4] V[-AUX, SUBCAT='clause'] -> 'claim' * |. . . [-> . . .| [3:4] VP[]/?x[] -> V[-AUX, SUBCAT='clause'] * SBar[]/?x[] {} |. . . . > . . .| [4:4] SBar[]/?x[] -> * Comp[] S[-INV]/?x[] {} |. . . . > . . .| [4:4] Comp[] -> * 'that' {} |. . . . [-] . .| [4:5] Comp[] -> 'that' * |. . . . [-> . .| [4:5] SBar[]/?x[] -> Comp[] * S[-INV]/?x[] {} |. . . . . > . .| [5:5] S[-INV]/?x[] -> * NP[] VP[]/?x[] {} |. . . . . > . .| [5:5] NP[-WH] -> * 'you' {} |. . . . . [-] .| [5:6] NP[-WH] -> 'you' * |. . . . . [-> .| [5:6] S[-INV]/?x[] -> NP[] * VP[]/?x[] {} |. . . . . . > .| [6:6] VP[]/?x[] -> * V[-AUX, SUBCAT='trans'] NP[]/?x[] {} |. . . . . . > .| [6:6] VP[]/?x[] -> * V[-AUX, SUBCAT='clause'] SBar[]/?x[] {} |. . . . . . > .| [6:6] VP[]/?x[] -> * V[+AUX] VP[]/?x[] {} |. . . . . . > .| [6:6] V[-AUX, SUBCAT='trans'] -> * 'like' {} |. . . . . . [-]| [6:7] V[-AUX, SUBCAT='trans'] -> 'like' * |. . . . . . [->| [6:7] VP[]/?x[] -> V[-AUX, SUBCAT='trans'] * NP[]/?x[] {} |. . . . . . . #| [7:7] NP[]/NP[] -> * |. . . . . . [-]| [6:7] VP[]/NP[] -> V[-AUX, SUBCAT='trans'] NP[]/NP[] * |. . . . . [---]| [5:7] S[-INV]/NP[] -> NP[] VP[]/NP[] * |. . . . [-----]| [4:7] SBar[]/NP[] -> Comp[] S[-INV]/NP[] * |. . . [-------]| [3:7] VP[]/NP[] -> V[-AUX, SUBCAT='clause'] SBar[]/NP[] * |. 
[-----------]| [1:7] S[+INV]/NP[] -> V[+AUX] NP[] VP[]/NP[] * |[=============]| [0:7] S[-INV] -> NP[] S[]/NP[] * >>> sorted(trees) == sorted(trees2) True Let's load a German grammar: >>> cp = parse.load_parser('grammars/book_grammars/german.fcfg', trace=0) >>> sent = 'die Katze sieht den Hund' >>> tokens = sent.split() >>> trees = cp.parse(tokens) >>> for tree in trees: print(tree) (S[] (NP[AGR=[GND='fem', NUM='sg', PER=3], CASE='nom'] (Det[AGR=[GND='fem', NUM='sg', PER=3], CASE='nom'] die) (N[AGR=[GND='fem', NUM='sg', PER=3]] Katze)) (VP[AGR=[NUM='sg', PER=3]] (TV[AGR=[NUM='sg', PER=3], OBJCASE='acc'] sieht) (NP[AGR=[GND='masc', NUM='sg', PER=3], CASE='acc'] (Det[AGR=[GND='masc', NUM='sg', PER=3], CASE='acc'] den) (N[AGR=[GND='masc', NUM='sg', PER=3]] Hund)))) Grammar with Binding Operators ------------------------------ The bindop.fcfg grammar is a semantic grammar that uses lambda calculus. Each element has a core semantics, which is a single lambda calculus expression; and a set of binding operators, which bind variables. In order to make the binding operators work right, they need to instantiate their bound variable every time they are added to the chart. To do this, we use a special subclass of `Chart`, called `InstantiateVarsChart`. >>> from nltk.parse.featurechart import InstantiateVarsChart >>> cp = parse.load_parser('grammars/sample_grammars/bindop.fcfg', trace=1, ... chart_class=InstantiateVarsChart) >>> print(cp.grammar()) Grammar with 15 productions (start state = S[]) S[SEM=[BO={?b1+?b2}, CORE=]] -> NP[SEM=[BO=?b1, CORE=?subj]] VP[SEM=[BO=?b2, CORE=?vp]] VP[SEM=[BO={?b1+?b2}, CORE=]] -> TV[SEM=[BO=?b1, CORE=?v]] NP[SEM=[BO=?b2, CORE=?obj]] VP[SEM=?s] -> IV[SEM=?s] NP[SEM=[BO={?b1+?b2+{bo(?det(?n),@x)}}, CORE=<@x>]] -> Det[SEM=[BO=?b1, CORE=?det]] N[SEM=[BO=?b2, CORE=?n]] Det[SEM=[BO={/}, CORE=<\Q P.exists x.(Q(x) & P(x))>]] -> 'a' N[SEM=[BO={/}, CORE=]] -> 'dog' N[SEM=[BO={/}, CORE=]] -> 'cat' N[SEM=[BO={/}, CORE=]] -> 'mouse' IV[SEM=[BO={/}, CORE=<\x.bark(x)>]] -> 'barks' IV[SEM=[BO={/}, CORE=<\x.bark(x)>]] -> 'eats' IV[SEM=[BO={/}, CORE=<\x.bark(x)>]] -> 'walks' TV[SEM=[BO={/}, CORE=<\x y.feed(y,x)>]] -> 'feeds' TV[SEM=[BO={/}, CORE=<\x y.feed(y,x)>]] -> 'walks' NP[SEM=[BO={bo(\P.P(John),@x)}, CORE=<@x>]] -> 'john' NP[SEM=[BO={bo(\P.P(John),@x)}, CORE=<@x>]] -> 'alex' A simple intransitive sentence: >>> from nltk.sem import logic >>> logic._counter._value = 100 >>> trees = cp.parse('john barks'.split()) |. john.barks.| |[-----] .| [0:1] 'john' |. [-----]| [1:2] 'barks' |[-----] .| [0:1] NP[SEM=[BO={bo(\P.P(John),z101)}, CORE=]] -> 'john' * |[-----> .| [0:1] S[SEM=[BO={?b1+?b2}, CORE=]] -> NP[SEM=[BO=?b1, CORE=?subj]] * VP[SEM=[BO=?b2, CORE=?vp]] {?b1: {bo(\P.P(John),z2)}, ?subj: } |. [-----]| [1:2] IV[SEM=[BO={/}, CORE=<\x.bark(x)>]] -> 'barks' * |. [-----]| [1:2] VP[SEM=[BO={/}, CORE=<\x.bark(x)>]] -> IV[SEM=[BO={/}, CORE=<\x.bark(x)>]] * |[===========]| [0:2] S[SEM=[BO={bo(\P.P(John),z2)}, CORE=]] -> NP[SEM=[BO={bo(\P.P(John),z2)}, CORE=]] VP[SEM=[BO={/}, CORE=<\x.bark(x)>]] * >>> for tree in trees: print(tree) (S[SEM=[BO={bo(\P.P(John),z2)}, CORE=]] (NP[SEM=[BO={bo(\P.P(John),z101)}, CORE=]] john) (VP[SEM=[BO={/}, CORE=<\x.bark(x)>]] (IV[SEM=[BO={/}, CORE=<\x.bark(x)>]] barks))) A transitive sentence: >>> trees = cp.parse('john feeds a dog'.split()) |.joh.fee. a .dog.| |[---] . . .| [0:1] 'john' |. [---] . .| [1:2] 'feeds' |. . [---] .| [2:3] 'a' |. . . [---]| [3:4] 'dog' |[---] . . .| [0:1] NP[SEM=[BO={bo(\P.P(John),z102)}, CORE=]] -> 'john' * |[---> . . 
.| [0:1] S[SEM=[BO={?b1+?b2}, CORE=]] -> NP[SEM=[BO=?b1, CORE=?subj]] * VP[SEM=[BO=?b2, CORE=?vp]] {?b1: {bo(\P.P(John),z2)}, ?subj: } |. [---] . .| [1:2] TV[SEM=[BO={/}, CORE=<\x y.feed(y,x)>]] -> 'feeds' * |. [---> . .| [1:2] VP[SEM=[BO={?b1+?b2}, CORE=]] -> TV[SEM=[BO=?b1, CORE=?v]] * NP[SEM=[BO=?b2, CORE=?obj]] {?b1: {/}, ?v: } |. . [---] .| [2:3] Det[SEM=[BO={/}, CORE=<\Q P.exists x.(Q(x) & P(x))>]] -> 'a' * |. . [---> .| [2:3] NP[SEM=[BO={?b1+?b2+{bo(?det(?n),@x)}}, CORE=<@x>]] -> Det[SEM=[BO=?b1, CORE=?det]] * N[SEM=[BO=?b2, CORE=?n]] {?b1: {/}, ?det: } |. . . [---]| [3:4] N[SEM=[BO={/}, CORE=]] -> 'dog' * |. . [-------]| [2:4] NP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z103)}, CORE=]] -> Det[SEM=[BO={/}, CORE=<\Q P.exists x.(Q(x) & P(x))>]] N[SEM=[BO={/}, CORE=]] * |. . [------->| [2:4] S[SEM=[BO={?b1+?b2}, CORE=]] -> NP[SEM=[BO=?b1, CORE=?subj]] * VP[SEM=[BO=?b2, CORE=?vp]] {?b1: {bo(\P.exists x.(dog(x) & P(x)),z2)}, ?subj: } |. [-----------]| [1:4] VP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z2)}, CORE=<\y.feed(y,z2)>]] -> TV[SEM=[BO={/}, CORE=<\x y.feed(y,x)>]] NP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z2)}, CORE=]] * |[===============]| [0:4] S[SEM=[BO={bo(\P.P(John),z2), bo(\P.exists x.(dog(x) & P(x)),z3)}, CORE=]] -> NP[SEM=[BO={bo(\P.P(John),z2)}, CORE=]] VP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z3)}, CORE=<\y.feed(y,z3)>]] * >>> for tree in trees: print(tree) (S[SEM=[BO={bo(\P.P(John),z2), bo(\P.exists x.(dog(x) & P(x)),z3)}, CORE=]] (NP[SEM=[BO={bo(\P.P(John),z102)}, CORE=]] john) (VP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z2)}, CORE=<\y.feed(y,z2)>]] (TV[SEM=[BO={/}, CORE=<\x y.feed(y,x)>]] feeds) (NP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z103)}, CORE=]] (Det[SEM=[BO={/}, CORE=<\Q P.exists x.(Q(x) & P(x))>]] a) (N[SEM=[BO={/}, CORE=]] dog)))) Turn down the verbosity: >>> cp = parse.load_parser('grammars/sample_grammars/bindop.fcfg', trace=0, ... chart_class=InstantiateVarsChart) Reuse the same lexical item twice: >>> trees = cp.parse('john feeds john'.split()) >>> for tree in trees: print(tree) (S[SEM=[BO={bo(\P.P(John),z2), bo(\P.P(John),z3)}, CORE=]] (NP[SEM=[BO={bo(\P.P(John),z104)}, CORE=]] john) (VP[SEM=[BO={bo(\P.P(John),z2)}, CORE=<\y.feed(y,z2)>]] (TV[SEM=[BO={/}, CORE=<\x y.feed(y,x)>]] feeds) (NP[SEM=[BO={bo(\P.P(John),z105)}, CORE=]] john))) >>> trees = cp.parse('a dog feeds a dog'.split()) >>> for tree in trees: print(tree) (S[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z2), bo(\P.exists x.(dog(x) & P(x)),z3)}, CORE=]] (NP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z106)}, CORE=]] (Det[SEM=[BO={/}, CORE=<\Q P.exists x.(Q(x) & P(x))>]] a) (N[SEM=[BO={/}, CORE=]] dog)) (VP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z2)}, CORE=<\y.feed(y,z2)>]] (TV[SEM=[BO={/}, CORE=<\x y.feed(y,x)>]] feeds) (NP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z107)}, CORE=]] (Det[SEM=[BO={/}, CORE=<\Q P.exists x.(Q(x) & P(x))>]] a) (N[SEM=[BO={/}, CORE=]] dog)))) nltk-3.7/nltk/test/featstruct.doctest000066400000000000000000001114411420073152400200350ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. For license information, see LICENSE.TXT ================================== Feature Structures & Unification ================================== >>> from nltk.featstruct import FeatStruct >>> from nltk.sem.logic import Variable, VariableExpression, Expression .. note:: For now, featstruct uses the older lambdalogic semantics module. Eventually, it should be updated to use the new first order predicate logic module. 
Overview ~~~~~~~~ A feature structure is a mapping from feature identifiers to feature values, where feature values can be simple values (like strings or ints), nested feature structures, or variables: >>> fs1 = FeatStruct(number='singular', person=3) >>> print(fs1) [ number = 'singular' ] [ person = 3 ] Feature structure may be nested: >>> fs2 = FeatStruct(type='NP', agr=fs1) >>> print(fs2) [ agr = [ number = 'singular' ] ] [ [ person = 3 ] ] [ ] [ type = 'NP' ] Variables are used to indicate that two features should be assigned the same value. For example, the following feature structure requires that the feature fs3['agr']['number'] be bound to the same value as the feature fs3['subj']['number']. >>> fs3 = FeatStruct(agr=FeatStruct(number=Variable('?n')), ... subj=FeatStruct(number=Variable('?n'))) >>> print(fs3) [ agr = [ number = ?n ] ] [ ] [ subj = [ number = ?n ] ] Feature structures are typically used to represent partial information about objects. A feature name that is not mapped to a value stands for a feature whose value is unknown (*not* a feature without a value). Two feature structures that represent (potentially overlapping) information about the same object can be combined by *unification*. >>> print(fs2.unify(fs3)) [ agr = [ number = 'singular' ] ] [ [ person = 3 ] ] [ ] [ subj = [ number = 'singular' ] ] [ ] [ type = 'NP' ] When two inconsistent feature structures are unified, the unification fails and returns ``None``. >>> fs4 = FeatStruct(agr=FeatStruct(person=1)) >>> print(fs4.unify(fs2)) None >>> print(fs2.unify(fs4)) None .. >>> del fs1, fs2, fs3, fs4 # clean-up Feature Structure Types ----------------------- There are actually two types of feature structure: - *feature dictionaries*, implemented by `FeatDict`, act like Python dictionaries. Feature identifiers may be strings or instances of the `Feature` class. - *feature lists*, implemented by `FeatList`, act like Python lists. Feature identifiers are integers. When you construct a feature structure using the `FeatStruct` constructor, it will automatically decide which type is appropriate: >>> type(FeatStruct(number='singular')) >>> type(FeatStruct([1,2,3])) Usually, we will just use feature dictionaries; but sometimes feature lists can be useful too. Two feature lists will unify with each other only if they have equal lengths, and all of their feature values match. If you wish to write a feature list that contains 'unknown' values, you must use variables: >>> fs1 = FeatStruct([1,2,Variable('?y')]) >>> fs2 = FeatStruct([1,Variable('?x'),3]) >>> fs1.unify(fs2) [1, 2, 3] .. >>> del fs1, fs2 # clean-up Parsing Feature Structure Strings --------------------------------- Feature structures can be constructed directly from strings. Often, this is more convenient than constructing them directly. NLTK can parse most feature strings to produce the corresponding feature structures. (But you must restrict your base feature values to strings, ints, logic expressions (`nltk.sem.logic.Expression`), and a few other types discussed below). 
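As a small illustrative sketch (the feature names ``pos``, ``arity`` and ``sem`` are invented for this example; the angle-bracket notation for logic expression values is the same one used for ``sem`` features later in this file), a single string may mix string, integer and expression values:

    >>> fs = FeatStruct("[pos='V', arity=1, sem=<walk(john)>]")  # illustrative feature names only
    >>> fs['arity'], fs['pos']
    (1, 'V')
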
Feature dictionaries are written like Python dictionaries, except that keys are not put in quotes; and square brackets (``[]``) are used instead of braces (``{}``): >>> FeatStruct('[tense="past", agr=[number="sing", person=3]]') [agr=[number='sing', person=3], tense='past'] If a feature value is a single alphanumeric word, then it does not need to be quoted -- it will be automatically treated as a string: >>> FeatStruct('[tense=past, agr=[number=sing, person=3]]') [agr=[number='sing', person=3], tense='past'] Feature lists are written like python lists: >>> FeatStruct('[1, 2, 3]') [1, 2, 3] The expression ``[]`` is treated as an empty feature dictionary, not an empty feature list: >>> type(FeatStruct('[]')) Feature Paths ------------- Features can be specified using *feature paths*, or tuples of feature identifiers that specify path through the nested feature structures to a value. >>> fs1 = FeatStruct('[x=1, y=[1,2,[z=3]]]') >>> fs1['y'] [1, 2, [z=3]] >>> fs1['y', 2] [z=3] >>> fs1['y', 2, 'z'] 3 .. >>> del fs1 # clean-up Reentrance ---------- Feature structures may contain reentrant feature values. A *reentrant feature value* is a single feature structure that can be accessed via multiple feature paths. >>> fs1 = FeatStruct(x='val') >>> fs2 = FeatStruct(a=fs1, b=fs1) >>> print(fs2) [ a = (1) [ x = 'val' ] ] [ ] [ b -> (1) ] >>> fs2 [a=(1)[x='val'], b->(1)] As you can see, reentrane is displayed by marking a feature structure with a unique identifier, in this case ``(1)``, the first time it is encountered; and then using the special form ``var -> id`` whenever it is encountered again. You can use the same notation to directly create reentrant feature structures from strings. >>> FeatStruct('[a=(1)[], b->(1), c=[d->(1)]]') [a=(1)[], b->(1), c=[d->(1)]] Reentrant feature structures may contain cycles: >>> fs3 = FeatStruct('(1)[a->(1)]') >>> fs3['a', 'a', 'a', 'a'] (1)[a->(1)] >>> fs3['a', 'a', 'a', 'a'] is fs3 True Unification preserves the reentrance relations imposed by both of the unified feature structures. In the feature structure resulting from unification, any modifications to a reentrant feature value will be visible using any of its feature paths. >>> fs3.unify(FeatStruct('[a=[b=12], c=33]')) (1)[a->(1), b=12, c=33] .. >>> del fs1, fs2, fs3 # clean-up Feature Structure Equality -------------------------- Two feature structures are considered equal if they assign the same values to all features, *and* they contain the same reentrances. >>> fs1 = FeatStruct('[a=(1)[x=1], b->(1)]') >>> fs2 = FeatStruct('[a=(1)[x=1], b->(1)]') >>> fs3 = FeatStruct('[a=[x=1], b=[x=1]]') >>> fs1 == fs1, fs1 is fs1 (True, True) >>> fs1 == fs2, fs1 is fs2 (True, False) >>> fs1 == fs3, fs1 is fs3 (False, False) Note that this differs from how Python dictionaries and lists define equality -- in particular, Python dictionaries and lists ignore reentrance relations. To test two feature structures for equality while ignoring reentrance relations, use the `equal_values()` method: >>> fs1.equal_values(fs1) True >>> fs1.equal_values(fs2) True >>> fs1.equal_values(fs3) True .. >>> del fs1, fs2, fs3 # clean-up Feature Value Sets & Feature Value Tuples ----------------------------------------- `nltk.featstruct` defines two new data types that are intended to be used as feature values: `FeatureValueTuple` and `FeatureValueSet`. Both of these types are considered base values -- i.e., unification does *not* apply to them. However, variable binding *does* apply to any values that they contain. 
Feature value tuples are written with parentheses: >>> fs1 = FeatStruct('[x=(?x, ?y)]') >>> fs1 [x=(?x, ?y)] >>> fs1.substitute_bindings({Variable('?x'): 1, Variable('?y'): 2}) [x=(1, 2)] Feature sets are written with braces: >>> fs1 = FeatStruct('[x={?x, ?y}]') >>> fs1 [x={?x, ?y}] >>> fs1.substitute_bindings({Variable('?x'): 1, Variable('?y'): 2}) [x={1, 2}] In addition to the basic feature value tuple & set classes, nltk defines feature value unions (for sets) and feature value concatenations (for tuples). These are written using '+', and can be used to combine sets & tuples: >>> fs1 = FeatStruct('[x=((1, 2)+?z), z=?z]') >>> fs1 [x=((1, 2)+?z), z=?z] >>> fs1.unify(FeatStruct('[z=(3, 4, 5)]')) [x=(1, 2, 3, 4, 5), z=(3, 4, 5)] Thus, feature value tuples and sets can be used to build up tuples and sets of values over the course of unification. For example, when parsing sentences using a semantic feature grammar, feature sets or feature tuples can be used to build a list of semantic predicates as the sentence is parsed. As was mentioned above, unification does not apply to feature value tuples and sets. One reason for this that it's impossible to define a single correct answer for unification when concatenation is used. Consider the following example: >>> fs1 = FeatStruct('[x=(1, 2, 3, 4)]') >>> fs2 = FeatStruct('[x=(?a+?b), a=?a, b=?b]') If unification applied to feature tuples, then the unification algorithm would have to arbitrarily choose how to divide the tuple (1,2,3,4) into two parts. Instead, the unification algorithm refuses to make this decision, and simply unifies based on value. Because (1,2,3,4) is not equal to (?a+?b), fs1 and fs2 will not unify: >>> print(fs1.unify(fs2)) None If you need a list-like structure that unification does apply to, use `FeatList`. .. >>> del fs1, fs2 # clean-up Light-weight Feature Structures ------------------------------- Many of the functions defined by `nltk.featstruct` can be applied directly to simple Python dictionaries and lists, rather than to full-fledged `FeatDict` and `FeatList` objects. In other words, Python ``dicts`` and ``lists`` can be used as "light-weight" feature structures. >>> # Note: pprint prints dicts sorted >>> from pprint import pprint >>> from nltk.featstruct import unify >>> pprint(unify(dict(x=1, y=dict()), dict(a='a', y=dict(b='b')))) {'a': 'a', 'x': 1, 'y': {'b': 'b'}} However, you should keep in mind the following caveats: - Python dictionaries & lists ignore reentrance when checking for equality between values. But two FeatStructs with different reentrances are considered nonequal, even if all their base values are equal. - FeatStructs can be easily frozen, allowing them to be used as keys in hash tables. Python dictionaries and lists can not. - FeatStructs display reentrance in their string representations; Python dictionaries and lists do not. - FeatStructs may *not* be mixed with Python dictionaries and lists (e.g., when performing unification). - FeatStructs provide a number of useful methods, such as `walk()` and `cyclic()`, which are not available for Python dicts & lists. In general, if your feature structures will contain any reentrances, or if you plan to use them as dictionary keys, it is strongly recommended that you use full-fledged `FeatStruct` objects. Custom Feature Values --------------------- The abstract base class `CustomFeatureValue` can be used to define new base value types that have custom unification methods. 
For example, the following feature value type encodes a range, and defines unification as taking the intersection on the ranges: >>> from functools import total_ordering >>> from nltk.featstruct import CustomFeatureValue, UnificationFailure >>> @total_ordering ... class Range(CustomFeatureValue): ... def __init__(self, low, high): ... assert low <= high ... self.low = low ... self.high = high ... def unify(self, other): ... if not isinstance(other, Range): ... return UnificationFailure ... low = max(self.low, other.low) ... high = min(self.high, other.high) ... if low <= high: return Range(low, high) ... else: return UnificationFailure ... def __repr__(self): ... return '(%s>> fs1 = FeatStruct(x=Range(5,8), y=FeatStruct(z=Range(7,22))) >>> print(fs1.unify(FeatStruct(x=Range(6, 22)))) [ x = (6>> print(fs1.unify(FeatStruct(x=Range(9, 12)))) None >>> print(fs1.unify(FeatStruct(x=12))) None >>> print(fs1.unify(FeatStruct('[x=?x, y=[z=?x]]'))) [ x = (7>> fs1 = FeatStruct(a=1, b=2, c=3) >>> fs2 = FeatStruct(x=fs1, y='x') Feature structures support all dictionary methods (excluding the class method `dict.fromkeys()`). Non-mutating methods: >>> sorted(fs2.keys()) # keys() ['x', 'y'] >>> sorted(fs2.values()) # values() [[a=1, b=2, c=3], 'x'] >>> sorted(fs2.items()) # items() [('x', [a=1, b=2, c=3]), ('y', 'x')] >>> sorted(fs2) # __iter__() ['x', 'y'] >>> 'a' in fs2, 'x' in fs2 # __contains__() (False, True) >>> fs2.has_key('a'), fs2.has_key('x') # has_key() (False, True) >>> fs2['x'], fs2['y'] # __getitem__() ([a=1, b=2, c=3], 'x') >>> fs2['a'] # __getitem__() Traceback (most recent call last): . . . KeyError: 'a' >>> fs2.get('x'), fs2.get('y'), fs2.get('a') # get() ([a=1, b=2, c=3], 'x', None) >>> fs2.get('x', 'hello'), fs2.get('a', 'hello') # get() ([a=1, b=2, c=3], 'hello') >>> len(fs1), len(fs2) # __len__ (3, 2) >>> fs2.copy() # copy() [x=[a=1, b=2, c=3], y='x'] >>> fs2.copy() is fs2 # copy() False Note: by default, `FeatStruct.copy()` does a deep copy. Use `FeatStruct.copy(deep=False)` for a shallow copy. .. >>> del fs1, fs2 # clean-up. Dictionary access methods (mutating) ------------------------------------ >>> fs1 = FeatStruct(a=1, b=2, c=3) >>> fs2 = FeatStruct(x=fs1, y='x') Setting features (`__setitem__()`) >>> fs1['c'] = 5 >>> fs1 [a=1, b=2, c=5] >>> fs1['x'] = 12 >>> fs1 [a=1, b=2, c=5, x=12] >>> fs2['x', 'a'] = 2 >>> fs2 [x=[a=2, b=2, c=5, x=12], y='x'] >>> fs1 [a=2, b=2, c=5, x=12] Deleting features (`__delitem__()`) >>> del fs1['x'] >>> fs1 [a=2, b=2, c=5] >>> del fs2['x', 'a'] >>> fs1 [b=2, c=5] `setdefault()`: >>> fs1.setdefault('b', 99) 2 >>> fs1 [b=2, c=5] >>> fs1.setdefault('x', 99) 99 >>> fs1 [b=2, c=5, x=99] `update()`: >>> fs2.update({'a':'A', 'b':'B'}, c='C') >>> fs2 [a='A', b='B', c='C', x=[b=2, c=5, x=99], y='x'] `pop()`: >>> fs2.pop('a') 'A' >>> fs2 [b='B', c='C', x=[b=2, c=5, x=99], y='x'] >>> fs2.pop('a') Traceback (most recent call last): . . . KeyError: 'a' >>> fs2.pop('a', 'foo') 'foo' >>> fs2 [b='B', c='C', x=[b=2, c=5, x=99], y='x'] `clear()`: >>> fs1.clear() >>> fs1 [] >>> fs2 [b='B', c='C', x=[], y='x'] `popitem()`: >>> sorted([fs2.popitem() for i in range(len(fs2))]) [('b', 'B'), ('c', 'C'), ('x', []), ('y', 'x')] >>> fs2 [] Once a feature structure has been frozen, it may not be mutated. >>> fs1 = FeatStruct('[x=1, y=2, z=[a=3]]') >>> fs1.freeze() >>> fs1.frozen() True >>> fs1['z'].frozen() True >>> fs1['x'] = 5 Traceback (most recent call last): . . . ValueError: Frozen FeatStructs may not be modified. >>> del fs1['x'] Traceback (most recent call last): . . . 
ValueError: Frozen FeatStructs may not be modified. >>> fs1.clear() Traceback (most recent call last): . . . ValueError: Frozen FeatStructs may not be modified. >>> fs1.pop('x') Traceback (most recent call last): . . . ValueError: Frozen FeatStructs may not be modified. >>> fs1.popitem() Traceback (most recent call last): . . . ValueError: Frozen FeatStructs may not be modified. >>> fs1.setdefault('x') Traceback (most recent call last): . . . ValueError: Frozen FeatStructs may not be modified. >>> fs1.update(z=22) Traceback (most recent call last): . . . ValueError: Frozen FeatStructs may not be modified. .. >>> del fs1, fs2 # clean-up. Feature Paths ------------- Make sure that __getitem__ with feature paths works as intended: >>> fs1 = FeatStruct(a=1, b=2, ... c=FeatStruct( ... d=FeatStruct(e=12), ... f=FeatStruct(g=55, h='hello'))) >>> fs1[()] [a=1, b=2, c=[d=[e=12], f=[g=55, h='hello']]] >>> fs1['a'], fs1[('a',)] (1, 1) >>> fs1['c','d','e'] 12 >>> fs1['c','f','g'] 55 Feature paths that select unknown features raise KeyError: >>> fs1['c', 'f', 'e'] Traceback (most recent call last): . . . KeyError: ('c', 'f', 'e') >>> fs1['q', 'p'] Traceback (most recent call last): . . . KeyError: ('q', 'p') Feature paths that try to go 'through' a feature that's not a feature structure raise KeyError: >>> fs1['a', 'b'] Traceback (most recent call last): . . . KeyError: ('a', 'b') Feature paths can go through reentrant structures: >>> fs2 = FeatStruct('(1)[a=[b=[c->(1), d=5], e=11]]') >>> fs2['a', 'b', 'c', 'a', 'e'] 11 >>> fs2['a', 'b', 'c', 'a', 'b', 'd'] 5 >>> fs2[tuple('abcabcabcabcabcabcabcabcabcabca')] (1)[b=[c=[a->(1)], d=5], e=11] Indexing requires strings, `Feature`\s, or tuples; other types raise a TypeError: >>> fs2[12] Traceback (most recent call last): . . . TypeError: Expected feature name or path. Got 12. >>> fs2[list('abc')] Traceback (most recent call last): . . . TypeError: Expected feature name or path. Got ['a', 'b', 'c']. Feature paths can also be used with `get()`, `has_key()`, and `__contains__()`. >>> fpath1 = tuple('abcabc') >>> fpath2 = tuple('abcabz') >>> fs2.get(fpath1), fs2.get(fpath2) ((1)[a=[b=[c->(1), d=5], e=11]], None) >>> fpath1 in fs2, fpath2 in fs2 (True, False) >>> fs2.has_key(fpath1), fs2.has_key(fpath2) (True, False) .. >>> del fs1, fs2 # clean-up Reading Feature Structures -------------------------- Empty feature struct: >>> FeatStruct('[]') [] Test features with integer values: >>> FeatStruct('[a=12, b=-33, c=0]') [a=12, b=-33, c=0] Test features with string values. Either single or double quotes may be used. Strings are evaluated just like python strings -- in particular, you can use escape sequences and 'u' and 'r' prefixes, and triple-quoted strings. >>> FeatStruct('[a="", b="hello", c="\'", d=\'\', e=\'"\']') [a='', b='hello', c="'", d='', e='"'] >>> FeatStruct(r'[a="\\", b="\"", c="\x6f\\y", d="12"]') [a='\\', b='"', c='o\\y', d='12'] >>> FeatStruct(r'[b=r"a\b\c"]') [b='a\\b\\c'] >>> FeatStruct('[x="""a"""]') [x='a'] Test parsing of reentrant feature structures. >>> FeatStruct('[a=(1)[], b->(1)]') [a=(1)[], b->(1)] >>> FeatStruct('[a=(1)[x=1, y=2], b->(1)]') [a=(1)[x=1, y=2], b->(1)] Test parsing of cyclic feature structures. >>> FeatStruct('[a=(1)[b->(1)]]') [a=(1)[b->(1)]] >>> FeatStruct('(1)[a=[b=[c->(1)]]]') (1)[a=[b=[c->(1)]]] Strings of the form "+name" and "-name" may be used to specify boolean values. 
>>> FeatStruct('[-bar, +baz, +foo]') [-bar, +baz, +foo] None, True, and False are recognized as values: >>> FeatStruct('[bar=True, baz=False, foo=None]') [+bar, -baz, foo=None] Special features: >>> FeatStruct('NP/VP') NP[]/VP[] >>> FeatStruct('?x/?x') ?x[]/?x[] >>> print(FeatStruct('VP[+fin, agr=?x, tense=past]/NP[+pl, agr=?x]')) [ *type* = 'VP' ] [ ] [ [ *type* = 'NP' ] ] [ *slash* = [ agr = ?x ] ] [ [ pl = True ] ] [ ] [ agr = ?x ] [ fin = True ] [ tense = 'past' ] Here the slash feature gets coerced: >>> FeatStruct('[*slash*=a, x=b, *type*="NP"]') NP[x='b']/a[] >>> FeatStruct('NP[sem=]/NP') NP[sem=]/NP[] >>> FeatStruct('S[sem=]') S[sem=] >>> print(FeatStruct('NP[sem=]/NP')) [ *type* = 'NP' ] [ ] [ *slash* = [ *type* = 'NP' ] ] [ ] [ sem = ] Playing with ranges: >>> from nltk.featstruct import RangeFeature, FeatStructReader >>> width = RangeFeature('width') >>> reader = FeatStructReader([width]) >>> fs1 = reader.fromstring('[*width*=-5:12]') >>> fs2 = reader.fromstring('[*width*=2:123]') >>> fs3 = reader.fromstring('[*width*=-7:-2]') >>> fs1.unify(fs2) [*width*=(2, 12)] >>> fs1.unify(fs3) [*width*=(-5, -2)] >>> print(fs2.unify(fs3)) # no overlap in width. None The slash feature has a default value of 'False': >>> print(FeatStruct('NP[]/VP').unify(FeatStruct('NP[]'), trace=1)) Unification trace: / NP[]/VP[] |\ NP[] | | Unify feature: *type* | / 'NP' | |\ 'NP' | | | +-->'NP' | | Unify feature: *slash* | / VP[] | |\ False | | X X <-- FAIL None The demo structures from category.py. They all parse, but they don't do quite the right thing, -- ?x vs x. >>> FeatStruct(pos='n', agr=FeatStruct(number='pl', gender='f')) [agr=[gender='f', number='pl'], pos='n'] >>> FeatStruct(r'NP[sem=]/NP') NP[sem=]/NP[] >>> FeatStruct(r'S[sem=]') S[sem=] >>> FeatStruct('?x/?x') ?x[]/?x[] >>> FeatStruct('VP[+fin, agr=?x, tense=past]/NP[+pl, agr=?x]') VP[agr=?x, +fin, tense='past']/NP[agr=?x, +pl] >>> FeatStruct('S[sem = ]') S[sem=] >>> FeatStruct('S') S[] The parser also includes support for reading sets and tuples. >>> FeatStruct('[x={1,2,2,2}, y={/}]') [x={1, 2}, y={/}] >>> FeatStruct('[x=(1,2,2,2), y=()]') [x=(1, 2, 2, 2), y=()] >>> print(FeatStruct('[x=(1,[z=(1,2,?x)],?z,{/})]')) [ x = (1, [ z = (1, 2, ?x) ], ?z, {/}) ] Note that we can't put a featstruct inside a tuple, because doing so would hash it, and it's not frozen yet: >>> print(FeatStruct('[x={[]}]')) Traceback (most recent call last): . . . TypeError: FeatStructs must be frozen before they can be hashed. There's a special syntax for taking the union of sets: "{...+...}". The elements should only be variables or sets. >>> FeatStruct('[x={?a+?b+{1,2,3}}]') [x={?a+?b+{1, 2, 3}}] There's a special syntax for taking the concatenation of tuples: "(...+...)". The elements should only be variables or tuples. >>> FeatStruct('[x=(?a+?b+(1,2,3))]') [x=(?a+?b+(1, 2, 3))] Parsing gives helpful messages if your string contains an error. >>> FeatStruct('[a=, b=5]]') Traceback (most recent call last): . . . ValueError: Error parsing feature structure [a=, b=5]] ^ Expected value >>> FeatStruct('[a=12 22, b=33]') Traceback (most recent call last): . . . ValueError: Error parsing feature structure [a=12 22, b=33] ^ Expected comma >>> FeatStruct('[a=5] [b=6]') Traceback (most recent call last): . . . ValueError: Error parsing feature structure [a=5] [b=6] ^ Expected end of string >>> FeatStruct(' *++*') Traceback (most recent call last): . . . 
ValueError: Error parsing feature structure *++* ^ Expected open bracket or identifier >>> FeatStruct('[x->(1)]') Traceback (most recent call last): . . . ValueError: Error parsing feature structure [x->(1)] ^ Expected bound identifier >>> FeatStruct('[x->y]') Traceback (most recent call last): . . . ValueError: Error parsing feature structure [x->y] ^ Expected identifier >>> FeatStruct('') Traceback (most recent call last): . . . ValueError: Error parsing feature structure ^ Expected open bracket or identifier Unification ----------- Very simple unifications give the expected results: >>> FeatStruct().unify(FeatStruct()) [] >>> FeatStruct(number='singular').unify(FeatStruct()) [number='singular'] >>> FeatStruct().unify(FeatStruct(number='singular')) [number='singular'] >>> FeatStruct(number='singular').unify(FeatStruct(person=3)) [number='singular', person=3] Merging nested structures: >>> fs1 = FeatStruct('[A=[B=b]]') >>> fs2 = FeatStruct('[A=[C=c]]') >>> fs1.unify(fs2) [A=[B='b', C='c']] >>> fs2.unify(fs1) [A=[B='b', C='c']] A basic case of reentrant unification >>> fs4 = FeatStruct('[A=(1)[B=b], E=[F->(1)]]') >>> fs5 = FeatStruct("[A=[C='c'], E=[F=[D='d']]]") >>> fs4.unify(fs5) [A=(1)[B='b', C='c', D='d'], E=[F->(1)]] >>> fs5.unify(fs4) [A=(1)[B='b', C='c', D='d'], E=[F->(1)]] More than 2 paths to a value >>> fs1 = FeatStruct("[a=[],b=[],c=[],d=[]]") >>> fs2 = FeatStruct('[a=(1)[], b->(1), c->(1), d->(1)]') >>> fs1.unify(fs2) [a=(1)[], b->(1), c->(1), d->(1)] fs1[a] gets unified with itself >>> fs1 = FeatStruct('[x=(1)[], y->(1)]') >>> fs2 = FeatStruct('[x=(1)[], y->(1)]') >>> fs1.unify(fs2) [x=(1)[], y->(1)] Bound variables should get forwarded appropriately >>> fs1 = FeatStruct('[A=(1)[X=x], B->(1), C=?cvar, D=?dvar]') >>> fs2 = FeatStruct('[A=(1)[Y=y], B=(2)[Z=z], C->(1), D->(2)]') >>> fs1.unify(fs2) [A=(1)[X='x', Y='y', Z='z'], B->(1), C->(1), D->(1)] >>> fs2.unify(fs1) [A=(1)[X='x', Y='y', Z='z'], B->(1), C->(1), D->(1)] Cyclic structure created by unification. >>> fs1 = FeatStruct('[F=(1)[], G->(1)]') >>> fs2 = FeatStruct('[F=[H=(2)[]], G->(2)]') >>> fs3 = fs1.unify(fs2) >>> fs3 [F=(1)[H->(1)], G->(1)] >>> fs3['F'] is fs3['G'] True >>> fs3['F'] is fs3['G']['H'] True >>> fs3['F'] is fs3['G']['H']['H'] True >>> fs3['F'] is fs3['F']['H']['H']['H']['H']['H']['H']['H']['H'] True Cyclic structure created w/ variables. >>> fs1 = FeatStruct('[F=[H=?x]]') >>> fs2 = FeatStruct('[F=?x]') >>> fs3 = fs1.unify(fs2, rename_vars=False) >>> fs3 [F=(1)[H->(1)]] >>> fs3['F'] is fs3['F']['H'] True >>> fs3['F'] is fs3['F']['H']['H'] True >>> fs3['F'] is fs3['F']['H']['H']['H']['H']['H']['H']['H']['H'] True Unifying w/ a cyclic feature structure. >>> fs4 = FeatStruct('[F=[H=[H=[H=(1)[]]]], K->(1)]') >>> fs3.unify(fs4) [F=(1)[H->(1)], K->(1)] >>> fs4.unify(fs3) [F=(1)[H->(1)], K->(1)] Variable bindings should preserve reentrance. >>> bindings = {} >>> fs1 = FeatStruct("[a=?x]") >>> fs2 = fs1.unify(FeatStruct("[a=[]]"), bindings) >>> fs2['a'] is bindings[Variable('?x')] True >>> fs2.unify(FeatStruct("[b=?x]"), bindings) [a=(1)[], b->(1)] Aliased variable tests >>> fs1 = FeatStruct("[a=?x, b=?x]") >>> fs2 = FeatStruct("[b=?y, c=?y]") >>> bindings = {} >>> fs3 = fs1.unify(fs2, bindings) >>> fs3 [a=?x, b=?x, c=?x] >>> bindings {Variable('?y'): Variable('?x')} >>> fs3.unify(FeatStruct("[a=1]")) [a=1, b=1, c=1] If we keep track of the bindings, then we can use the same variable over multiple calls to unify. 
>>> bindings = {} >>> fs1 = FeatStruct('[a=?x]') >>> fs2 = fs1.unify(FeatStruct('[a=[]]'), bindings) >>> fs2.unify(FeatStruct('[b=?x]'), bindings) [a=(1)[], b->(1)] >>> bindings {Variable('?x'): []} .. >>> del fs1, fs2, fs3, fs4, fs5 # clean-up Unification Bindings -------------------- >>> bindings = {} >>> fs1 = FeatStruct('[a=?x]') >>> fs2 = FeatStruct('[a=12]') >>> fs3 = FeatStruct('[b=?x]') >>> fs1.unify(fs2, bindings) [a=12] >>> bindings {Variable('?x'): 12} >>> fs3.substitute_bindings(bindings) [b=12] >>> fs3 # substitute_bindings didn't mutate fs3. [b=?x] >>> fs2.unify(fs3, bindings) [a=12, b=12] >>> bindings = {} >>> fs1 = FeatStruct('[a=?x, b=1]') >>> fs2 = FeatStruct('[a=5, b=?x]') >>> fs1.unify(fs2, bindings) [a=5, b=1] >>> sorted(bindings.items()) [(Variable('?x'), 5), (Variable('?x2'), 1)] .. >>> del fs1, fs2, fs3 # clean-up Expressions ----------- >>> e = Expression.fromstring('\\P y.P(z,y)') >>> fs1 = FeatStruct(x=e, y=Variable('z')) >>> fs2 = FeatStruct(y=VariableExpression(Variable('John'))) >>> fs1.unify(fs2) [x=<\P y.P(John,y)>, y=] Remove Variables ---------------- >>> FeatStruct('[a=?x, b=12, c=[d=?y]]').remove_variables() [b=12, c=[]] >>> FeatStruct('(1)[a=[b=?x,c->(1)]]').remove_variables() (1)[a=[c->(1)]] Equality & Hashing ------------------ The `equal_values` method checks whether two feature structures assign the same value to every feature. If the optional argument ``check_reentrances`` is supplied, then it also returns false if there is any difference in the reentrances. >>> a = FeatStruct('(1)[x->(1)]') >>> b = FeatStruct('(1)[x->(1)]') >>> c = FeatStruct('(1)[x=[x->(1)]]') >>> d = FeatStruct('[x=(1)[x->(1)]]') >>> e = FeatStruct('(1)[x=[x->(1), y=1], y=1]') >>> def compare(x,y): ... assert x.equal_values(y, True) == y.equal_values(x, True) ... assert x.equal_values(y, False) == y.equal_values(x, False) ... if x.equal_values(y, True): ... assert x.equal_values(y, False) ... print('equal values, same reentrance') ... elif x.equal_values(y, False): ... print('equal values, different reentrance') ... else: ... print('different values') >>> compare(a, a) equal values, same reentrance >>> compare(a, b) equal values, same reentrance >>> compare(a, c) equal values, different reentrance >>> compare(a, d) equal values, different reentrance >>> compare(c, d) equal values, different reentrance >>> compare(a, e) different values >>> compare(c, e) different values >>> compare(d, e) different values >>> compare(e, e) equal values, same reentrance Feature structures may not be hashed until they are frozen: >>> hash(a) Traceback (most recent call last): . . . TypeError: FeatStructs must be frozen before they can be hashed. >>> a.freeze() >>> v = hash(a) Feature structures define hash consistently. The following example looks at the hash value for each (fs1,fs2) pair; if their hash values are not equal, then they must not be equal. If their hash values are equal, then display a message, and indicate whether their values are indeed equal. Note that c and d currently have the same hash value, even though they are not equal. That is not a bug, strictly speaking, but it wouldn't be a bad thing if it changed. >>> for fstruct in (a, b, c, d, e): ... fstruct.freeze() >>> for fs1_name in 'abcde': ... for fs2_name in 'abcde': ... fs1 = locals()[fs1_name] ... fs2 = locals()[fs2_name] ... if hash(fs1) != hash(fs2): ... assert fs1 != fs2 ... else: ... print('%s and %s have the same hash value,' % ... (fs1_name, fs2_name)) ... if fs1 == fs2: print('and are equal') ... 
else: print('and are not equal') a and a have the same hash value, and are equal a and b have the same hash value, and are equal b and a have the same hash value, and are equal b and b have the same hash value, and are equal c and c have the same hash value, and are equal c and d have the same hash value, and are not equal d and c have the same hash value, and are not equal d and d have the same hash value, and are equal e and e have the same hash value, and are equal .. >>> del a, b, c, d, e, v # clean-up Tracing ------- >>> fs1 = FeatStruct('[a=[b=(1)[], c=?x], d->(1), e=[f=?x]]') >>> fs2 = FeatStruct('[a=(1)[c="C"], e=[g->(1)]]') >>> fs1.unify(fs2, trace=True) Unification trace: / [a=[b=(1)[], c=?x], d->(1), e=[f=?x]] |\ [a=(1)[c='C'], e=[g->(1)]] | | Unify feature: a | / [b=[], c=?x] | |\ [c='C'] | | | | Unify feature: a.c | | / ?x | | |\ 'C' | | | | | +-->Variable('?x') | | | +-->[b=[], c=?x] | Bindings: {?x: 'C'} | | Unify feature: e | / [f=?x] | |\ [g=[c='C']] | | | +-->[f=?x, g=[b=[], c=?x]] | Bindings: {?x: 'C'} | +-->[a=(1)[b=(2)[], c='C'], d->(2), e=[f='C', g->(1)]] Bindings: {?x: 'C'} [a=(1)[b=(2)[], c='C'], d->(2), e=[f='C', g->(1)]] >>> >>> fs1 = FeatStruct('[a=?x, b=?z, c=?z]') >>> fs2 = FeatStruct('[a=?y, b=?y, c=?q]') >>> #fs1.unify(fs2, trace=True) >>> .. >>> del fs1, fs2 # clean-up Unification on Dicts & Lists ---------------------------- It's possible to do unification on dictionaries: >>> from nltk.featstruct import unify >>> pprint(unify(dict(x=1, y=dict(z=2)), dict(x=1, q=5)), width=1) {'q': 5, 'x': 1, 'y': {'z': 2}} It's possible to do unification on lists as well: >>> unify([1, 2, 3], [1, Variable('x'), 3]) [1, 2, 3] Mixing dicts and lists is fine: >>> pprint(unify([dict(x=1, y=dict(z=2)),3], [dict(x=1, q=5),3]), ... width=1) [{'q': 5, 'x': 1, 'y': {'z': 2}}, 3] Mixing dicts and FeatStructs is discouraged: >>> unify(dict(x=1), FeatStruct(x=1)) Traceback (most recent call last): . . . ValueError: Mixing FeatStruct objects with Python dicts and lists is not supported. But you can do it if you really want, by explicitly stating that both dictionaries and FeatStructs should be treated as feature structures: >>> unify(dict(x=1), FeatStruct(x=1), fs_class=(dict, FeatStruct)) {'x': 1} Finding Conflicts ----------------- >>> from nltk.featstruct import conflicts >>> fs1 = FeatStruct('[a=[b=(1)[c=2], d->(1), e=[f->(1)]]]') >>> fs2 = FeatStruct('[a=[b=[c=[x=5]], d=[c=2], e=[f=[c=3]]]]') >>> for path in conflicts(fs1, fs2): ... print('%-8s: %r vs %r' % ('.'.join(path), fs1[path], fs2[path])) a.b.c : 2 vs [x=5] a.e.f.c : 2 vs 3 .. >>> del fs1, fs2 # clean-up Retracting Bindings ------------------- >>> from nltk.featstruct import retract_bindings >>> bindings = {} >>> fs1 = FeatStruct('[a=?x, b=[c=?y]]') >>> fs2 = FeatStruct('[a=(1)[c=[d=1]], b->(1)]') >>> fs3 = fs1.unify(fs2, bindings) >>> print(fs3) [ a = (1) [ c = [ d = 1 ] ] ] [ ] [ b -> (1) ] >>> pprint(bindings) {Variable('?x'): [c=[d=1]], Variable('?y'): [d=1]} >>> retract_bindings(fs3, bindings) [a=?x, b=?x] >>> pprint(bindings) {Variable('?x'): [c=?y], Variable('?y'): [d=1]} Squashed Bugs ~~~~~~~~~~~~~ In svn rev 5167, unifying two feature structures that used the same variable would cause those variables to become aliased in the output. >>> fs1 = FeatStruct('[a=?x]') >>> fs2 = FeatStruct('[b=?x]') >>> fs1.unify(fs2) [a=?x, b=?x2] There was a bug in svn revision 5172 that caused `rename_variables` to rename variables to names that are already used. >>> FeatStruct('[a=?x, b=?x2]').rename_variables( ... 
vars=[Variable('?x')]) [a=?x3, b=?x2] >>> fs1 = FeatStruct('[a=?x]') >>> fs2 = FeatStruct('[a=?x, b=?x2]') >>> fs1.unify(fs2) [a=?x, b=?x2] There was a bug in svn rev 5167 that caused us to get the following example wrong. Basically the problem was that we only followed 'forward' pointers for other, not self, when unifying two feature structures. (nb: this test assumes that features are unified in alphabetical order -- if they are not, it might pass even if the bug is present.) >>> fs1 = FeatStruct('[a=[x=1], b=?x, c=?x]') >>> fs2 = FeatStruct('[a=(1)[], b->(1), c=[x=2]]') >>> print(fs1.unify(fs2)) None .. >>> del fs1, fs2 # clean-up nltk-3.7/nltk/test/floresta.txt000066400000000000000000037423101420073152400166520ustar00rootroot00000000000000O 7 e Meio um ex-libris da noite algarvia. uma das mais antigas discotecas do Algarve, situada em Albufeira, que continua a manter os traos decorativos e as clientelas de sempre. um pouco a verso de uma espcie de outro lado da noite, a meio caminho entre os devaneios de uma fauna perifrica, seja de Lisboa, Londres, Dublin ou Faro e Portimo, e a postura circunspecta dos fiis da casa, que dela esperam a msica geracionista dos 60 ou dos 70. No deixa de ser, nos tempos que correm, um certo very typical algarvio, cabea de cartaz para os que querem fugir a algumas movimentaes nocturnas j a caminho da ritualizao de massas, do gnero vamos todos ao Calypso e encontramo-nos na Locomia. E assim, aos 2,5 milhes que o Ministrio do Planeamento e Administrao do Territrio j gasta no pagamento do pessoal afecto a estes organismos, vm juntar-se os montantes das obras propriamente ditas, que os municpios, j com projectos na mo, vm reivindicar junto do Executivo, como salienta aquele membro do Governo. E o dinheiro no falta s s cmaras, lembra o secretrio de Estado, que considera que a soluo para as autarquias especializarem-se em fundos comunitrios. Mas como, se muitas no dispem, nos seus quadros, dos tcnicos necessrios? Encomendem-nos a projectistas de fora porque, se as obras vierem a ser financiadas, eles at saem de graa, j que, nesse caso, os fundos comunitrios pagam os projectos, o mesmo no acontecendo quando eles so feitos pelos GAT, dado serem organismos do Estado. Essa poder vir a ser uma hiptese, at porque, no terreno, a capacidade dos GAT est cada vez mais enfraquecida. Alguns at j desapareceram, como o de Castro Verde, e outros tm vindo a perder quadros. O primeiro fabricante mundial de ratos para computador, a empresa sua Logitech, apresentou esta semana numa feira especializada que teve lugar em Basileia (Sua) um equipamento perifrico denominado Audioman que permitir dotar os computadores de orelhas. Segundo a empresa, o aparelho permite que o aparelho grave e transcreva a voz humana. Estamos a dotar os computadores de um novo sentido disse Steve d'Averio, director de marketing para a Europa da Logitech. O Audioman foi desenvolvido na Sua em apenas sete meses e compe-se de um microfone e de um altifalante que se podem acoplar facilmente a um computador, devendo ser comercializado ao preo de 290 francos suos (28 contos). Junqueiro foi ainda confrontado com o facto de no ter falado com o ministro antes de avanar com a proposta. Joo Cravinho, que integra a comitiva de Jorge Sampaio na visita de Estado a Moambique, ainda no reagiu carta que o dirigente socialista lhe enviou. No estou a ver que, para emitir uma opinio, ns tivssemos de informar previamente o ministro, afirmou. O senhor ministro interpretar esta sugesto como entender. 
Junqueiro recordou ainda que, nas ltimas autrquicas, o IGAT suspendeu as suas actividades um ms antes das eleies. Alm do Museu do Ar, o projecto gira em torno do parque temtico propriamente dito. A zona ldica, com os divertimentos, reas comerciais de souvenirs e de restaurao, compreende espaos distintos para os vrios temas, ainda em anlise, tais como Portugal, Japo, Brasil, frica e Far-West. Os quatro primeiros temas destinam-se a mostrar o papel de Portugal no mundo e o quinto, o nico sem relao com a histria nacional, justificado pela experincia de Barcelona (Port Aventura), que regista assinalvel sucesso. Uma zona petting anexa, em cooperao com o Jardim Zoolgico de Lisboa, destina-se a permitir o contacto das crianas com animais. Tudo, claro est, muito arborizado. O museu, a desenvolver sob orientao de uma comisso de notveis, dirigida pelo Presidente da Repblica, est orado em seis milhes de contos, valor incomportvel para a Fora Area. Da que a Cameron Hall tenha cado do cu. A proposta muito bem vista, porque ser mais vantajosa do que se houver s um plo de interesse no local, afirmou o major Carlos Barbosa, das relaes pblicas da Fora Area, admitindo que, com o parque temtico, se o interesse for diversificado, toda a gente fica a ganhar. A pouco mais de um ms do lanamento nacional do Rendimento Mnimo Garantido (RMG), o nmero de famlias j abrangidas pelos projectos-piloto deste programa de apoio aos agregados mais desfavorecidos no pra de aumentar. 7.777 famlias, totalizando 26.668 pessoas, esto j a usufruir do rendimento destinado a garantir condies consideradas mnimas de sobrevivncia e reinsero de cidados excludos socialmente. O balano -- a que o PBLICO teve acesso -- traado pela comisso revela que o nmero de pessoas abrangidas pelo RMG aumentou 36 por cento relativamente ao ltimo balano de 30 de Maro. Este crescimento resulta da opo de alargar o nmero de projectos-piloto, de modo a cobrir uma parte do territrio nacional at ao dia 1 de Julho, referiu ao PBLICO o presidente da Comisso Nacional do RMG, Paulo Pedroso. Para tal, referiu aquele responsvel, foram montadas mais estruturas, mais zonas esto abrangidas e, por isso, mais pessoas se podem candidatar. E tantos foram os candidatos que o perodo destinado a testar a aplicao do RMG acabaria por ceder lugar a um processo efectivo de financiamento. A instituio deste direito s ser, contudo, efectivado depois do lanamento nacional do projecto dentro de pouco mais de um ms. Mais metafrico foi o secretrio de Estado do Desenvolvimento Regional, Adriano Pimpo, que comparou o acordo a uma embraiagem. Porque a embraiagem que pe o motor em contacto com as rodas que geram o movimento, que para ns o desenvolvimento. Para que no surjam avarias, Pimpo pediu aos presentes que se empenhem na execuo dos termos do acordo, sob pena de a embraiagem se transformar em travo. A propsito, no Museu da Segunda Guerra Mundial, que a foi aberto, a histria da maior guerra no continente europeu comea com a fotografia de Estaline a cumprimentar o ministro dos Negcios Estrangeiros da Alemanha nazi, ou seja, a guerra comea com a assinatura do Pacto Molotov-Ribbentrop. Na cerimnia de inaugurao do edifcio, Ieltsin declarou perante mais de cem mil pessoas que a Rssia est perto da estabilidade poltica e que todos os problemas podem ser resolvidos mesa das conversaes. Talvez entusiasmado pela festa da vitria, o Presidente russo afirmou que chegar o dia em que a Rssia ajudar o Ocidente. 
As cerimnias oficiais terminaram com desfiles militares e recriaes de combates areos entre os aviadores soviticos e alemes. Co-produo franco-egpcia, O Emigrante inspira-se na histria de Jos, filho de Jacob, contando o percurso de Ram que, h 3000 anos, decide abandonar a sua terra rida para se instalar no Egipto dos faras, centro da civilizao. Tomando a defesa do filme de Chahine, numa sala atulhada, o bastonrio dos advogados, Ahmed al-Khawaga, replicou que o realizador egpcio se inspirou na histria de Jos, mas no se afastou das palavras do Coro que evoca, em termos claros, as propostas feitas ao profeta pela esposa do mestre que o comprou chegada ao Egipto. Eu no sou francs para falar da cultura francesa, mas sei que ela deu uma contribuio importante cultura egpcia. E uma vez que um dos nossos artistas conclui um acordo com um artista francs, isso no nos desonra. O ltimo romance de Paul Auster -- que ainda no est traduzido. Nashe encontra Pozzi, um jogador, com quem inicia um pquer extravagante. No imenso desacerto que foi a defesa do Penafiel, o capito Vasco foi o homem que ainda segurou as pontas. Seguro, eficiente, decidido -- tivesse o Penafiel outro Vasco e provavelmente o resultado teria sido outro. VALTINHO -- Foi dos melhores jogadores do Penafiel, este brasileiro de pernas altas. Dos seus ps ainda nasceu alguma coisa, embora o resto da equipa no ajudasse grande coisa. Lutou como lhe vemos poucas vezes e ainda teve nos ps uma boa oportunidade de golo, mas teve que rematar em jeito e no em fora, como gosta mais. (H quem defenda, no entanto, que se trata de um fax apcrifo, realmente escrito pelo deputado Jos Magalhes, o qual teria, alis, imitado a letra do ex-deputado Antnio Barreto. Fontes fidedignas -- que o mesmo dizer, no jornalsticas -- garantiram entretanto que, entre as frases atribudas ao senhor primeiro-ministro que, de facto, lhe no pertencem esto o comentrio ao congresso Portugal e o futuro, a resposta ao deputado Adriano Moreira quando este o interrogou sobre a sua concepo de federalismo e -- at! -- a contestada frase sobre a fidedignidade dos jornalistas. O Benfica voltou ontem a vencer o Lokomotiv de Moscovo pela diferena mnima (3-2), passando aos quartos-de-final da Taa das Taas em futebol. O jogo chegou a estar complicado para a equipa de Paulo Autuori, essencialmente porque o Lokomotiv marcou muito cedo (8 ') e os jogadores portugueses no conseguiam adaptar-se ao estado do relvado. Mas a expulso prematura e estpida de Gurenko, quando ainda no estava decorrida meia hora de jogo, seguida, pouco depois, da entrada de Panduru, foi o suficiente para dar a volta ao resultado. O Benfica apresentou o seu esquema habitual, com Jamir e Tahar mais recuados e insistindo com a utilizao de Valdo na esquerda. Em contrapartida, o Lokomotiv actuava com trs centrais, com o lbero Chugainov, como habitualmente, a comandar o jogo e o lateral-esquerdo Solomatin muito activo. Muito bem estavam tambm os trs mdios -- Maminov, Drozdov e Kharlachev --, desempenhando Kosolapov as funes de pivot do ataque. Por isso, e tambm porque os russos estavam mais bem adaptados ao muito enlameado e escorregadio relvado, o Lokomotiv era mais rpido sobre a bola, ganhando a maior parte dos duelos na zona fulcral do terreno, situada entre o meio-campo e a defesa do Benfica. O cocktail que Jupp condena bem real. 
Mas -o apenas porque este Governo e esta maioria, escolheram, desde Agosto do ano passado, a imigrao e a poltica da imigrao como ponto de clivagem poltica para as prximas legislativas, ontem mesmo marcadas para Maro de 1998, em simultneo com as regionais. Um endurecimento ntido existe desde ento neste terreno altamente perigoso. E razo desta escolha , obviamente, a progresso demente da Frente Nacional, que prospera sempre a apontar o imigrante como bode expiatrio e simultaneamente como a fonte de todos males do povo francs. Foi com estupefaco e surpresa que li, na edio do passado fim-de-semana, os comunicados da direco e da administrao deste jornal. Sou um leitor assduo e atento do PBLICO, desde o primeiro nmero, e no poucas vezes tenho manifestado opinies nas suas pginas, o que me leva agora a emitir o meu pensamento sobre o jornal, sobre quem o faz e sobre os ditos comunicados. 1. O PBLICO veio dar imprensa diria portuguesa uma nova dimenso e, pelo seu aparecimento, obrigou os grandes (JN e DN) a reformular a sua postura e tambm o seu grafismo. Tal como na ltima final do Campeonato do Mundo, disputada em Nova Iorque e Lion, Kasparov tenta apresentar-se como o bom reformador contra o mau conservador (Karpov). A FIDE, que organiza a competio desde 1948, declarou estar pronta para defender os seus direitos em tribunal, e acusou Kasparov e Short de privilegiarem os seus interesses econmicos ao tentarem conseguir uma verba superior oferecida pela cidade de Manchester para organizar a final e que rondava os 250 mil contos. Este golpe de Estado deixa cptica a maior parte grandes mestres de xadrez (cerca de 300 em todo o mundo), que esperam ver a situao clarificada, independentemente da parte que acabe vencedora. Por outro lado, a expulso de Kasparov, detentor do ttulo mundial desde 1985, criar dificuldades FIDE para convencer Manchester a acolher uma falsa final entre Karpov e Timman. Isto no caso de os dois xadrezistas aceitarem o convite. A bem dizer, todos tm razo. O rbitro porque o terreno de jogo estava quase impraticvel. Henrique Calisto porque o terreno estava quase impraticvel, o Leixes defendera o adiamento da partida antes do seu incio e tinha menos um homem em campo que o adversrio, por expulso de Srgio. Eurico e Milton Areal porque o terreno estava quase impraticvel, mas nem melhor nem pior do que na primeira metade do encontro e o Tirsense parecia mais fresco e estava em situao de vantagem numrica. Quanto ao jogo, ele dividiu-se em dois perodos distintos: antes e depois da expulso de Srgio. Antes, o Leixes marcou um golo, por intermdio de Maurcio, na sequncia de um canto apontado por Barriga, e foi a equipa que melhor se adaptou s condies do terreno. Depois, o Tirsense tomou conta da partida, criou vrias situaes de golo e conseguiu a igualdade atravs de uma grande penalidade marcada por Rui Manuel, a castigar carga de Correia sobre Batista. Com o dilvio como pano de fundo, o empate traduz de forma feliz um jogo que ficou no meio. Durante a anlise do relatrio de actividades passadas, foram identificadas como principais insuficincias a ausncia de uma orientao nacional junto dos quadros tcnicos, e o fraco recrutamento e pouca contribuio na rea da Cincia e Tecnologia. O Partido Popular (PP), vencedor das eleies de 3 de Maro, quer a plena integrao da Espanha na Aliana Atlntica, organizao a que Madrid aderiu em 1982, sem, no entanto, englobar as suas foras militares nas da NATO, de acordo com os resultados do referendo de 1986. 
Um processo que ocupar os prximos ministros da Defesa e das Relaes Exteriores e que no ter a oposio dos socialistas de Felipe Gonzlez, a segunda maior fora poltica do pas. O russo ser uma das seis lnguas principais usadas por Joo Paulo II amanh e depois, no Encontro Mundial da Juventude, no Santurio Mariano de Czestokowa, onde se prev a presena de um milho de jovens, o dobro dos que h dois anos se congregaram em Santiago de Compostela. que, alm de a proximidade geogrfica da URSS e dos outros pases do Leste, muita coisa aconteceu nos ltimos 24 meses na grande Casa Comum Europeia. Ao chegar, s 9 horas (TMG) de hoje, ao aeroporto de Cracvia, Joo Paulo II ser recebido com um mnimo de formalidades. a quinta vez que Karol Wojtyla pisa, como Pontfice, o solo da sua ptria. Mas convencionou-se que esta sua deslocao de trs dias, para presidir ao Encontro dos Jovens em Czestochawa, seria considerado um complemento ainda da sua recente viagem Polnia, de 1 a 9 de Junho passado. Carrington fez sempre questo de salientar que as hipteses de sucesso do cessar-fogo dependem sobretudo dos beligerantes. Se for firmado, ningum ficar mais contente do que ns. Eu tentei, o senhor Vance tentou, se for respeitado, urrah!, comentou. Mas se falhar? Ningum fez mais at agora do que o secretrio Vance. Devemos tentar. A situao pode ser potencialmente horrvel. Pela primeira vez no Haiti um padre foi assassinado por motivos polticos. Uma mensagem dos militares no poder para mostrarem quem ainda manda no pas, interpretam meios eclesisticos, que reafirmam a disposio de continuar a luta pela libertao do povo haitiano. A Conferncia Haitiana de Religiosos, cuja direco tida por moderada, vem respondendo ao crime com jejum, oraes, missas ao ar livre e homilias em que o engajamento do padre Jean-Marie ao lado dos pobres e oprimidos apontado como exemplo. Pode vir a ser o ponto de viragem na Igreja, cuja hierarquia, com a excepo de um nico bispo, prefere os militares golpistas a Aristide, o Presidente eleito democraticamente, ele prprio um padre que os salesianos expulsaram da ordem. H, no ar, uma certa ideia de invaso. Cavaco Silva no assinou o habitual despacho que d tolerncia de ponto na tera-feira de Carnaval- o que significa que, ao contrrio do que tradicional, este ano no h o feriado do Entrudo. Ontem, comeou a chegar s direces-gerais (atravs de uma circular) a informao de que o dia 23 ser um dia normal para os funcionrios pblicos. Registe-se que a tera-feira de Carnaval no um feriado legal, mas to-s tradicional: ou seja, todos os anos o primeiro-ministro tem que produzir um despacho, publicado em Dirio da Repblica, em que decreta a tolerncia de ponto. Este ano, eventualmente condicionado pela polmica que o ope ao Presidente da Repblica em torno dos feriados (recorde-se que Soares enviou a lei para o Tribunal Constitucional) Cavaco decidiu, pura e simplesmente, acabar com a tera-feira de Carnaval. Alis, era contra as teras-feiras, propiciadoras de sugestivas pontes, que a lei governamental mais se batia ... Desde 1990 que estava na mesa a reformulao das secretas. A primeira inteno do Governo era ver legislado algo que, ignorando a lei, era um facto desde o incio: a inexistncia efectiva do Servio de Informaes Estratgicas de Defesa (SIED), que nunca foi mais que uma alnea da lei-quadro nunca levada prtica. 
Com argumentos economicistas e de operacionalidade, o Executivo de Cavaco Silva sempre se escusou a concretizar o SIED, cujas competncias foram, entretanto, transferidas para o SIM (Servios de Informaes Militares), por via de um polmico acto administrativo do Governo, que assim chamava a si matrias da exclusiva competncia da AR. Ou seja, em dez anos, nunca a Lei dos Servios de Informaes foi integralmente cumprida, com uma estrutura que estava no papel sem existncia prtica (o SIED) e outra que assegurava as funes da primeira (a Dinfo). Facto que, ao longo do tempo, foi repetidamente denunciado, tanto pelos partidos da oposio (onde se destacaram o PCP e o PS), como pelo Conselho de Fiscalizao dos Servios de Informaes nomeado pela AR. O caso ocorreu numa noite de 1978, na ilha de Carvalo, ao largo da Crsega. O prncipe jantava com amigos num restaurante deste paraso para milionrios, quando um grupo barulhento de jovens da alta sociedade italiana acostou na enseada de Palma, ao lado do seu iate, o L'Aniram. Os advogados da defesa sublinharam no processo que este facto perturbou altamente o senhor de Sabia. Naquele ano, as Brigadas Vermelhas (BR) estavam no auge da actividade terrorista, o lder cristo-democrata Aldo Moro acabara de ser raptado, e o prncipe -- proibido de entrar em Itlia desde o exlio do pai em 1946 -- teria mesmo recebido ameaas das BR. O certo que, pouco depois, Vtor-Emanuel apercebeu-se que um barco pneumtico fora deslocado do seu iate e atracado ao Cocke, o navio dos jovens italianos. Irritado com este acto de apropriao, foi buscar uma espingarda US 30 semiautomtica, utilizada em safaris, e 31 cartuchos, e dirigiu-se para o Cocke. Pouco depois, o prncipe aponta-lhe a arma ao ventre. Na confuso que se segue, parte um primeiro tiro, depois um segundo, e os dois homens caem ao mar. A bordo do veleiro polaco Dar Mlodziezy, patrocinado pela Cutty Sark, os tripulantes e passageiros tiveram que esperar at s 13 horas de ontem para, finalmente, pisarem o cais. Ancorado fora do porto de Cdis durante toda a noite, o grande veleiro foi tomado de assalto pelos 18 passageiros portugueses a cantar fados at de madrugada. Desde as 10 horas da manh, as manobras de entrada do porto eram aguardadas ansiosamente e por trs horas toda a guarnio permaneceu no convs, pronta para receber ordens do piloto a bordo do rebocador que coordenou as manobras de atracao ao lado do veleiro Esmeralda, da armada chilena. Os organizadores da regata ainda davam os ltimos retoques nas instalaes volta do porto quando a frota desembarcou. Vrias tendas ofereciam os mais variados servios aos tripulantes, como restaurantes, cabines telefnicas, servios bancrios e de correios enquanto na sala de imprensa os empregados da Telefnica ainda instalavam os telefones e mquinas de fax para o batalho de jornalistas que chegou cidade. Mesmo com a confuso administrativa da escala, o show nutico continua a todo o pano por quatro dias. A Netscape Communicationns decidiu adquirir a Collabra Software por 108,7 milhes de contos (16,3 milhes de contos). A Collabra edita o groupware e o software que permitem compatibilizar diversas redes de computadores, nomeadamente as que existem no interior de uma mesma empresa. Os termos concretos da transaco no foram tornados pblicos mas os analistas coincidem na interpretao deste negcio como mais um passo da Netscape para transformar a Internet num meio privilegiado de comunicao e informao escala mundial. A Netscape o mais importante fabricante de software de navegao para a Internet. 
A LG Electronics, o terceiro maior fabricante sul-coreano de informtica, decidiu reduzir os preos dos seus computadores pessoais. A deciso visa tornar mais competitivos os produtos da empresa, que est a sofrer uma forte concorrncia dos seus principais concorrentes. As descidas oscilam, consoante os produtos, entre 9,8 e 26,9 por cento. Durante o primeiro semestre do corrente ano, o mercado de computadores pessoais da Coreia do Sul registou um aumento de 42 por cento relativamente a idntico perodo do ano transacto, num montante de 1300 milhes de dlares (195 milhes de contos). O anncio divulgado na sexta-feira pela OMS dizia que uma srie de testes iniciais, realizados pelo Instituto Pasteur de Paris, em amostras de sangue das primeiras nove pessoas a morrerem eram consistentes com o diagnstico de febre de Ebola. Esse mesmo diagnstico j tinha sido feito a ttulo provisrio apenas com base nos sintomas dos doentes. Uma equipa daquela organizao encontra-se desde sexta-feira passada na regio para examinar as vtimas e colher amostras. O Governo gabons, num comunicado em que d conta da existncia de uma epidemia na regio, pede aos habitantes que no evacuem os doentes nem para a capital da provncia, Makokou, nem para a capital nacional, Libreville, e que alertem para qualquer novo caso as autoridades sanitrias, para que estas possam providenciar o tratamento dos doentes in loco. Recomenda-se ainda s pessoas que no toquem com as mos nuas nem nos doentes nem nos mortos e que evitem o contacto com o seu sangue, vmitos e excrementos. Sempre que surge um problema, chamam-na. L vai Dolores Fasca ver o que se passa porta do Nacofino. s vezes acontece, mas normalmente nunca h problemas graves. C dentro ningum se apercebe de que um teimoso brio quer impor a sua presena. O homem insiste, meio zonzo, a patroa no cede e ele acaba por ir-se em grande dificuldade. Tambm mal atina com as manobras precisas para sair do parque de estacionamento privativo do dancing. Acaba por se lanar estrada dos Quatro Caminhos rumo a Quarteira, sem temer o balo que agora transforma sopros em cadeia. Faz frio. So 2h25, j s resta um casal mesa. s 0h00 eram quatro. As relaes de Hong Kong com a China esto a condicionar a evoluo do mercado accionista local. So as trocas comerciais, o novo aeroporto internacional, entre outros aspectos. Os investidores, cada vez mais sensveis, esto a reagir prontamente no tomando posies. Assim, est a registar-se um abrandamento na procura com a consequente queda das cotaes. O ndice Hang Seng caiu 2,47 por cento, fechando nos 5481,61 pontos. A Comisso Europeia considerou ontem politicamente inoportuno avanar com uma proposta de harmonizao dos impostos sobre os produtos energticos que, a concretizar-se, poderia provocar um aumento do gasleo em Portugal de quase 25 escudos em 2002. Os moradores so convidados a fazer desde logo a separao dos lixos -- condio necessria para o xito do projecto --, que depois sero recolhidos por viaturas equipadas a preceito. Neste sistema, a recolha far-se- uma vez por semana para os materiais orgnicos e trs por semana para os restantes resduos domsticos. Em Gondomar, a experincia comear com sacos de plstico em vez de cestos, por vontade da prpria autarquia. Ao mesmo tempo, ir manter-se a recolha indiferenciada tal como hoje a conhecemos. Tendencialmente, o caminho ser para aprofundar a recolha selectiva, acentuou o mesmo responsvel da Lipor. Este sistema de recolha adequa-se a edifcios baixos, com poucos pisos. 
Em os prdios com muitos andares, haver um ecoponto para todos os moradores. O nico trabalho das pessoas separar o lixo e colocar nos dias certos os contentores e cestos para serem recolhidos. O resto connosco, garante a Lipor. Na Maia, cada morador receber esses recipientes pessoalmente, da mo de funcionrios municipais, enquanto alunos dos cursos de Relaes Pblicas e Psicologia do Instituto Superior da Maia explicaro como funciona o sistema. Esta aco poder comear j em Julho, com abordagens porta a porta. Para muitos analistas o verdadeiro problema o facto de no se poder falar do desenvolvimento da economia palestiniana como um facto isolado, porque ele s faz sentido integrado no desenvolvimento de todo o Mdio Oriente. que tipo de relao econmica se poder estabelecer entre Israel e os seus vizinhos rabes? Ser prematuro falar numa comunidade econmica entre a Jordnia, os territrios palestinianos autnomos e Israel? outros defendem precisamente o contrrio. Fundamental para os palestinianos a abertura de novos mercados, tanto a Ocidente como a Oriente -- e em Israel -- para os produtos que, apesar dos bvios problemas da sua agricultura e da indstria, venham a produzir. Outra das apostas o turismo, esperando-se dois milhes de visitantes por ano e a criao de 30 mil a 50 mil postos de emprego. Quanto s anunciadas sadas de alguns jogadores, Donner afirmou que essas notcias no foram feitas por jornalistas, mas por pataratas. Dizia-se que o Carlos Resende e o Filipe Cruz podiam ir para a Alemanha e o Carlos Galambas e o lvaro Martins para o Benfica, mas mentira. J todos renovaram os seus contratos. Acho que o FC Porto s tem de se preocupar com o Sporting, no pensamos em mais nada. A fartura de pensamento pode dar maus resultados e ns no queremos ter um enfarte. Esta uma prova de regularidade e s pode beneficiar quem for mais regular. R. -- No sou capaz. Sou formada em Direito, mas no conseguiria ensinar algum a ler e a escrever. Nunca dei uma lio na vida, mas tenho pena, porque penso que se aprende muito a ensinar. P. -- Como a sua relao com o piano? No sei. Talvez morta. Para estes haitianos, s h uma maneira de regressar a casa: revoluo. Voltando rua que durante dcadas evocou o talentoso marido de D. Maria II, citemos mais um passo do j referido relatrio da Cmara do Porto, que nos permite avaliar at que ponto a concretizao da Rua de D. Fernando ficou aqum das intenes que lhe estiveram na origem. A convenincia desta rua palpvel; uma cmoda estrada desobstruda de tortuosidades e declives, desde a Foz ao corao da cidade, especialmente para seges e carros, o que at aqui mal se consegue antes de chegar ao stio do banco de S. Domingos [ leia-se o Banco Comercial do Porto, que fora fundado poucos anos antes e detinha autorizao para emitir notas ]. Comea a dar resultados a poltica da Unio Europeia de bloquear todas as tentativas suas de gozar as vantagens da UE, sem as responsabilidades de um pas membro. Os seus governantes j perceberam que o isolamento j no d lucro e pode levar rejeio. Quando o povo suo recusou, em 92, a adeso ao Espao Econmico Europeu, como j fizera com a ONU, cometeu um grave engano. Foi essa voluntria e pretensiosa rejeio dos vizinhos, que deixou a Sua sem a cobertura europeia, na crise que destruiu a sua imagem. O fim da guerra fria, com a imploso da URSS acabou com a importncia helvtica no tabuleiro europeu, enquanto os recentes escndalos de cumplicidade com os nazis, mais o roubo das economias dos judeus pelos seus bancos lhe tiraram a simpatia americana. 
No entanto, alguns dos analistas contactados pelo PBLICO, consideram que a Sumolis tem sido esquecida pelo mercado e que existem boas perspectivas quanto aos resultados de 1997. Para alm destes dois aspectos, surgem os habituais rumores sobre um eventual interesse comprador por parte de outros grupos empresariais do sector das bebidas, como, por exemplo, a Jernimo Martins. Uma possibilidade entretanto desmentida pelo grupo presidido por Soares dos Santos. totalmente falso que a Jernimo Martins esteja interessada na compra da Sumolis, garantiu ao PBLICO um porta-voz da empresa. No lado das subidas, destaca-se ainda o comportamento do Banco Totta & Aores que, ao contrrio dos restantes ttulos do sector bancrio, encerrou a ganhar 3,41 por cento. Movimentaram-se cerca de 285 mil ttulos, com a cotao de fecho a situar-se nos 4359 escudos. Alguma coisa se passa volta deste papel. Existe um forte interesse dos internacionais, salientou outro responsvel. Inez Teixeira uma jovem pintora que tem exposto regularmente desde h uns dois anos. Agora, num espao de exposies tambm recente, mostra uma srie de obras de pequenssimo formato feitas a grafite sobre tela. E julgamos ainda estar longe de casos como o do banco britnico NatWest que guardava as opinies religiosas e polticas e mesmo os hbitos alimentares de alguns dos seus 6,5 milhes de titulares de contas. Em paralelo, h sempre o perigo de estas BD irem cair nas mos de pessoas menos escrupulosas. Em Janeiro de 1994, noticiava-se a introduo na Alemanha de um supercomputador que regista as impresses digitais dos candidatos a asilo poltico, para tentar detectar fraudes nos subsdios da segurana social. Na mesma altura, comeou a funcionar um sistema informtico que permite distribuir mais rapidamente os estrangeiros pelos campos de refugiados. Nada fez parar esta compilao de dados, nem o receio de que os endereos pudessem cair nas mos de grupos nazis que ficariam assim a conhecer onde moram os seus alvos. Agora, a polcia inclina-se para que o assassinato tenha a ver com a promoo de John Gotti Junior ao cargo de chefe da famlia durante o encarceramento do pai. Segundo fontes policiais citadas pelo New York Times, o atentado poderia ter partido da velha guarda do cl Gambino. Desde que Gotti foi preso, em Dezembro passado, o seu filho, de 26 anos, est a substitu-lo, sobretudo na colecta de fundos resultantes de diversas actividades ilegais. De acordo com o testemunho de informadores, Junior estaria a manter uma arrogncia excessiva em relao aos velhos membros do cl, indo ao ponto de reclamar somas superiores ao que era habitual. O motorista acompanhava frequentes vezes John Junior e a sua morte pode ser considerada como um aviso da velha guarda. Dezenas de timorenses e portugueses ocupam pacificamente o pavilho indonsio da Expo-92, em Sevilha. Bush s poder ganhar as eleies presidenciais se fizer cair Saddam Hussein, vaticinam os analistas polticos. O Presidente cancela todos os compromissos e fecha-se na Casa Branca. Na mesma ocasio iniciaram-se investigaes que incidiram sobre o rbitro madeirense Marques da Silva, tambm ele suspeito de se ter deixado corromper. A Judiciria aproveitou ainda o balano para passar buscas s casas de Reinaldo Teles (dirigente), Jorge Gomes (funcionrio) e Antnio Garrido (colaborador), todos ligados ao FC Porto, com a curiosidade de o ltimo ser um ex-rbitro de futebol. se Gumaro era corrupto, quem eram os corruptores e porque motivo no foram igualmente presentes ao juiz do Tribunal de Instruo Criminal? 
Essa uma pergunta que ainda hoje permanece sem resposta. que, apesar de todas as tentativas feitas pelos agentes da Direco Central de Investigao de Corrupo, Fraudes e Infraces Econmico-Financeiras, o rbitro, sujeito a diversos interrogatrios, nunca fez qualquer revelao que pudesse incriminar outras pessoas. Por outras palavras: nunca quis beneficiar do estatuto de arrependido. Alice no sabia o que era um fato-macaco, mas no teve coragem de perguntar. No podamos. Afinal, os direitos dos trabalhadores esto garantidos na Constituio. Os temas escolhidos so cinco: Patrimnio Virtual (explorao dos monumentos portugueses usando as tecnologias da realidade virtual); Portugal Global (Sagres como antena de expanso e comunicao da expanso global e local dos portugueses); Sacra Saturni (Sagres como lugar de mistrio e simbologia, tema ideal para um jogo de aventura grfica); Um Milho de Navegadores (Sagres como centro privilegiado de turismo cultural, com um milho de visitantes por ano) e Terrvista (Sagres como ponto de partida para uma abordagem pedaggica, pondo os alunos no papel de produtores de informao). Numa primeira fase o trabalho ser concretizado num ' site ' na Internet, com acesso universal e que ter sede em Sagres. Essas pginas na Internet tero uma componente de referncia a Sagres e regio costeira do Algarve, num regime de divulgao e promoo do patrimnio, com um registo diferente do clssico. Depois, poder haver exploraes em etapas sucessivas para outras linguagens. Poder-se- tocar na realidade virtual, embora seja um objectivo a longo prazo. As primeiras semanas foram dedicadas ao estudo dos textos propostos pelos EUA. Nos ltimos dias o ritmo tornou-se frentico, manipularam-se mais de uma centena de documentos e mapas, garante de la Pea. Segundo a Newsweek, as duas salas dos mapas foram o palco principal das negociaes. Estavam repletas de lixo, copos de plstico sujos de caf. Os negociadores usaram canetas de ponta de feltro para traar linhas de fronteira nos cartes plastificados com o territrio da ex-Jugoslvia impresso. Bastava um pano hmido para fazer desaparecer as linhas, e recomear tudo de novo. rbitros: Miguel Castro (Argentina) e Alfonso Bove (Itlia). Portugal -- Guilherme Silva, Paulo Almeida, Vtor Fortunato, Pedro Alves, T Neves; Rui Lopes (2), Paulo Alves e Antnio Ramalho (2). H muito tempo que tenho uma estranha relao afectuosa com esta ilha. No quiosque vendem-se dessas revistas de viagens que agora proliferam e que perpetuam as fantasias sobre ilhas exticas. Sempre me pareceu estranho nunca ter lido um artigo sobre a ilha de Santos. Estava convencido que s eu a via, s eu a imaginava vista de cima naufragando no meio dos horrveis autocarros lisboetas. Protegida pelas correntes anti-estacionamento selvagem, todos os dias suspirava de alvio por ainda ver a ilha no seu stio, com as fronteiras bem definidas -- o que em si uma das razes que faz das ilhas um dos nossos arqutipos mais resistentes. Outro dia reparei numa bandeira hasteada no passeio em frente. Era a bandeira duma organizao que eu desconhecia: Amigos da Ilha de Santos. Como algum que descobre no estar s no mundo, aquela bandeira foi alimento espiritual. Eis que a minha ilha tinha bandeira e tudo. Eis que, afinal, existem mais habitantes virtuais daquele pas que me d gosto imaginar como um principado independente. Sobretudo foi bom descobrir que h mais gente a fazer parte do equipamento imaginrio da ilha. 
Isolada dos passeios que bordejam os quarteires, frequentada por uma populao mvel, provisria e em constante renovao, a ilha serve de plataforma de comunicaes: apanhar um transporte, comprar um jornal com notcias, enviar uma carta, fazer um telefonema. Os seus habitantes so do mais cosmopolita que h. A cimeira sindical ibrica j no dever realizar-se este ano, segundo apurou o PBLICO. Um reunio ao mais alto nvel a realizar na segunda-feira entre a CGTP e a UGT (na sede desta ltima central) poder desbloquear anteriores dificuldades e levar marcao de uma data. Na Europa Ocidental tem-se assistido a uma queda acentuada da fecundidade, o que fez surgir o problema da no substituio das geraes. Cada vez nascem menos bebs, com o inevitvel envelhecimento progressivo da populao. entre 1960 e 1991, os valores da taxa de fecundidade passaram de 94,9 por cento para 47 por cento. Quer isto dizer que a percentagem da populao nacional que, em determinado perodo, procriou baixou para metade em quatro dcadas. Os Padres Recentes da Fecundidade em Portugal, estudo que ser lanado na prxima semana, foi elaborado pelas socilogas Ana Nunes de Almeida e Cristina Ferreira e pelas gegrafas Filipa Ferro e Isabel Margarida Andr. Editado pela Comisso para a Igualdade e para os Direitos da Mulher, este trabalho pretende contextualizar a queda recente e vertiginosa da fecundidade em Portugal. Com a acareao entre Paradela de Abreu e o cnego Melo, terminaram anteontem as diligncias previstas no acordo que determinou a reabertura do processo do padre Max. Para a acusao, o balano positivo, mas para a defesa um novo arquivamento do caso est mais prximo. At porque o juiz acaba de indeferir uma acareao entre os sete suspeitos. 0 juiz titular do processo do padre Max, Artur Oliveira, indeferiu um pedido do procurador-geral adjunto nomeado pela Procuradoria-Geral da Repblica para acompanhar a investigao deste caso, no sentido de promover uma acareao entre os sete indivduos indiciados na acusao provisria como responsveis pelo crime-- trs como autores morais e os restantes como autores materiais. Paulo S pedia ainda uma acareao entre o industrial portuense Manuel Macedo, Ramiro Moreira e o tenente da Marinha Pedro Menezes, todos testemunhas neste caso. Todos os nomes citados derivam desta cena, que rapidamente foi superada por outras modas. Qualquer deles prosseguiu na linha da pop electrnica, todavia, s Marc Almond e os Erasure seguem hoje uma atitude camp, embora ambos com algumas nuances que de algum modo tendem a atenuar-lhes a envolvncia escandalosa. Almond o nico que continua a cantar teatral e amaneirado, fazendo das suas interpretaes casos de incandescncia incontrolvel, assente no culto do personagem instantneo, boa maneira tradicional do camp de Oscar Wilde. Se isso fica uma vez mais reiterado em Memoribilia, a compilao dos seus xitos a solo e nos Soft Cell, que parcialmente regravou para o efeito, tambm visvel que nos respectivos novos vdeos colados s antigas canes qualquer coisa mudou em Almond. O exemplo paradigmtico Say hello and Wave Goodbye, cujo primeiro clip consistia numa verdadeira orgia de excessos e que agora substitudo por um teledisco de um romantismo asseado repleto de modelos em cmara lenta, que mais se tende a ligar aos dessexuados Black ou Don Henley. Em sntese, aquele dinheiro seria a ltima prestao do pagamento do terreno. Em troca, assinava-se o contrato-promessa de compra e venda. 
Deolinda, com o seu advogado ao lado, deu-lhe a procurao e Manuel foi ter com Constantino para ele lhe dar tambm os papis assinados. E Constantino perguntou-lhe se ele estava a brincar, pois no lhe ia dar procurao nenhuma enquanto no recebesse a sua parte, ao que Manuel, (se ningum no tribunal mentiu neste aspecto), lhe replicou que j tinha dado os 1095 contos a Deolinda e eles que dividissem a soma entre os dois. E viu ento pela cara de Constantino que tinha feito mal, mas muito mal, pois marido e mulher agora s partilhavam uma filha, um barraco e um dio horrvel, azedo e mtuo. De facto, dava a impresso que marido e mulher, nesta ltima fase do casamento, s j comunicavam atravs dos respectivos advogados, um luxo estranho para pessoas quase sem dinheiro. -- E ela disse para passar um cheque ... A Assembleia aprovou ainda moes que reclamavam a divulgao dos resultados provisrios da avaliao das universidades, a fiscalizao da constitucionalidade da Lei de Financiamento do Ensino Superior Pblico, e insistiam nas campanhas de divulgao das queixa dos estudantes, aproveitando, por exemplo, a presena da comunicao social no prximo jogo Acadmica-Benfica. Entre as propostas mais ousadas, decidiu-se pedir ao Presidente da Repblica que proponha um referendo sobre a Lei do Financiamento, desafiar as televises a promoverem um debate entre o ministro da Educao e todos os dirigentes associativos, pintar de negro a sede da Direco Regional de Educao do Centro e remeter envelopes com folhas de papel higinico ao ministrio. E no seria mais til que o dr. Soares que, enquanto primeiro-ministro, deixou a educao no caos que se conhece, dirigisse mensagens a lanar o debate sobre o que h a reformar no ensino, em vez de passar a vida a exigir uma televiso to livre que Eduardo Moniz tenha liberdade para criticar toda a gente excepto .. o dr. Soares? ... E, se o dr. Soares tivesse praticado desporto na escola, ser que, hoje, pensaria da mesma maneira? A VW ainda no tomou qualquer deciso, porque esto a analisar as vrias hipteses. Mira Amaral confirmou, no entanto, que Portugal uma das possibilidades que esto a ser estudadas pelos alemes, tendo-lhe sido colocadas vrias perguntas sobre as condies de investimento, quando da recente visita a uma das fbricas da VW, na Alemanha. Para o ministro, a deciso dever ser tomada em meados de 1993 e, se a escolha recair sobre Portugal, essa ser uma boa altura em termos de incentivos. Em 1993 j estaro esgotadas as verbas do actual Quadro Comunitrio de Apoio [ QCA ] e ainda no estaro disponveis as do QCA de 1994. V.C. -- Mas quase disco de prata [ faltam seis mil, segundo a editora, a BMG ]! Comprar um CD no est ao alcance de todas as pessoas que compravam os nossos discos, mas acreditamos nos jovens, porque temos a certeza que aderem a isto. lgico que estamos a aproveitar o facto de os portugueses aderirem outra vez msica popular. P. -- E qual a vossa opinio sobre a msica portuguesa actual? O que que acham que mudou desde os anos de apogeu do conjunto? A RSSIA anunciou ontem ter assinado um contrato para o fornecimento de mais dois reactores nucleares ao Iro, mas negou ter recebido uma encomenda para vender 4000 blindados ao Iraque. Quanto ao contrato com Teero, cujo montante no foi especificado, o ministro russo da Energia Atmica explicou que ele inclui a venda de dois reactores de tipo VVER-440, com uma potncia de 440 megawatts, para serem instalados em Bouchehr, no sul do Iro. 
A Rssia, insensvel aos protestos dos EUA e de Israel, j se tinha comprometido, no incio do ano, a instalar naquele mesmo local um reactor de 1000 megawatts, num negcio avaliado em mil milhes de dlares. Em relao ao Iraque, Valeri Progrebenkov, porta-voz da sociedade de Estado Rosvooroujenie, responsvel pelas exportaes militares, desmentiu a existncia de uma encomenda de 4000 carros de combate russos, como afirmara o genro de Saddam Hussein que desertou para a Jordnia. Segundo este, os blindados seriam entregues ao longo de vrios anos e pagos em petrleo, depois do levantamento das sanes impostas ao Iraque. Os New York Knicks venceram tera-feira no seu reduto os Chicago Bulls, por 96-91, passando a liderar, por 2-0, a final da Conferncia Leste da Liga Norte-Americana de Basquetebol Profissional (NBA), que se disputa melhor de sete encontros. O Governo inaugurou pontes e estradas, mas no foi capaz de inaugurar uma Lei de Bases do Ordenamento do Territrio. A denncia foi feita ontem, em Faro, pelo lder do Partido da Terra (PT). Ribeiro Teles defendeu, no Jardim Alameda, a necessidade de uma poltica que tenha os ps bem assentes na terra, sem estar subordinada aos nmeros da macroeconomia, construda custa da degradao dos nossos recursos. O grupo de fotgrafos participantes encontra-se para um confronto pela diversidade no s de estilos, como de opes relativamente construo de uma ideia da cidade. Assim, Nuno Flix da Costa, que traa um percurso cronolgico -- com uma sequncia que comea ao amanhecer e acaba na noite escura lisboeta, das ruas e e dos bares -- integra constantemente o elemento inslito (ou tornado inslito pelo olhar), da existncia humana e urbana, na paisagem quotidiana: a rotina dos gestos e dos comportamentos. A sina do seu destino, abre a sequncia de imagens do autor, que fecha com a trmula, brilhante e opaca, potica e tensa, vida da noite. Antnio Pedro Ferreira, fotgrafo que re-afirma a sua originalidade na busca do acontecimento humano. No h fotografias sem pessoas, ou sem os seus vestgios, o que no deixa de nunca de remeter o trabalho deste fotgrafo para um dos domnios mais fascinantes, ou encantatrios, mgicos da fotografia, o retrato, ou a fotografia como lente humanstica ... permanece o registo dos negros, das sombras e da aluso. Para 1995, a administrao da PT calcula que os lucros atinjam valores prximos dos 25 milhes de contos e pretende manter os nveis de investimento, verificados em 1994. Dentro da poltica de racionalizao de estruturas, a PT pretende ainda baixar para quatro milhes de contos o valor das existncias em armazm (encontra-se em sete milhes, contra os 11 milhes que se atingiam em Maio) e alienar 40 imveis espalhados pelo pas. Botelho da Costa garantiu tambm que o oramento para 98/99 ser entrega ao conselho fiscal a tempo de ser analisado por este, provavelmente no final desta semana, incio da prxima. O Sporting de Braga-Guimares est a suscitar grande interesse na regio minhota e a procura de bilhetes tem decorrido em bom ritmo. Embora o Guimares no tenha organizado as habituais excurses, praticamente garantido que o Estdio 1 de Maio registar uma das melhores assistncias da poca. Na hora de defrontar o Sporting, o Famalico pode finalmente respirar aliviado com a notcia do regresso dos seus dois jogadores argelinos, Medane e Menad, que se ausentaram para disputar a Taa de Africa. 
Os argelinos, que desempenham um papel preponderante na equipa de Josip Skoblar, chegaram anteontem noite a Famalico, quase uma semana depois de a sua equipa ter sido afastada da luta pelo ceptro mximo do futebol africano, tendo j ontem treinado com o resto do plantel famalicense. Com o regresso de Medane e Menad, o treinador jugoslavo do Famalico tem sua disposio todos os jogadores, com excepo do jovem Manuel Jos, que se encontra lesionado. O que a verdade, o que a fico? Isso no podemos saber. O que um escritor? um topgrafo, um investigador, um reprter, ou um fotgrafo, como perguntou um dia Herv Guibert a Peter Handke, numa entrevista? Quando morre algum, e depois ns ficamos sempre a pensar que dessa pessoa no sabamos quase nada, escolhemos uma pequena parte, e, como dessa pessoa j no conseguimos ver nem os olhos nem as mos, ficamos com essa pequena parte para ocuparmos o espao todo que resta no stio da pessoa morta, ficou-nos s essa parte, muito pouco, por isso depois tem de ser aumentada. Uma parte de Herv Guibert. Um livro. O ttulo Des Aveugles, e foi publicado na Minuit em 1985. Depois de L'Image Fantme, de Les Aventures Singulires, de Les Chiens, de Voyage Avec Deux Enfants, de Les Lubies d'Arthur. Antes de Mes Parents, e muito antes dos livros-Sida. Des Aveugles to nitidamente cruel e generoso como o cinismo quando absoluto e coincide com a ingenuidade. O lugar do livro uma instituio para cegos. As personagens so cegos, atrozes e desconcertantes como nas fotografias de Sander, porque os cegos so pessoas que se dirigem incessantemente para um lugar interminvel de onde no podem sair e onde ns nos sentimos gratos por no podermos entrar. Herv Guibert na poca era jornalista do Le Monde, e pediu autorizao ao Instituto Nacional dos Cegos para fazer uma reportagem sobre Os Cegos e a Cultura, h-de ter sido o que de melhor lhe ocorreu como desculpa para passar l uma semana. Diz que imaginou que o narrador seria uma espcie de perverso que entra no estabelecimento graas a uma cumplicidade que lhe permite manter relaes amorosas ilcitas com as crianas. O livro depois mudou, e os cegos tornaram-se personagens passionais, e obscenas na sua violncia, esse livro depois de escrito continha tudo, o infinito, a transparncia, o medo, as imagens do escuro, continha uma criana cega que no admitia no ver e inventava, e Josette que torturava ratos, uma mulher casada, um marido e um amante, e depois havia uma vingana. Herv Guibert gostava dos cegos. Depois dessa semana, ficou durante um ano como leitor, ia duas horas por semana. Apaixonou-se por um cego que no era aluno dele. Tambm se tornou amigo do empregado da loja onde os cegos iam buscar vdeos. O vdeo mais procurado pelos cegos era Texas Chainsaw Massacre, de Tobe Hooper. Foi anos mais tarde que Herv Guibert escreveu o livro. neste contexto que entre a indstria e a Direco-Geral da Qualidade do Ambiente [DGQA] nunca se interromperam os contactos tcnicos necessrios atempada informao sobre a evoluo verificada no cumprimento do contrato-programa e que permitiro a continuao de uma abordagem responsvel da questo para futuro. Apraz-nos referir a disponibilidade da DGQA para manter com a indstria um dilogo srio e objectivo. No conhecendo outra forma de abordar um tema to importante como o da efectiva proteco do ambiente de que tambm somos parte. Mas alm das afinidades culturais, dos quais os japoneses esto cientes, mas que muitos portugueses desconhecem, h outros pontos de aproximao. 
O tema da Expo-98, os Oceanos, ser tambm abordado na Expo-2001, em Yamaguchi, cujo lema geral ser ' O Futuro e o sculo XXI ', ou, dito de uma forma mais potica e numa traduo mais livre do japons, Rumo a um futuro onde brilhe a vida. Se na primeira metada da dcada de 80 o balano dos confrontos entre os dois eternos rivais era francamente equilibrado, na segunda metade da mesma dcada- excepo feita aos fabulosos 7-1 com que o Sporting venceu o Benfica na tarde de 14 de Dezembro de 1986, em Alvalade- os encarnados foram ganhando vantagem neste muito especial campeonato entre as equipas da Luz e de Alvalade. A maior evidncia para esta recente superioridade encarnada vai para o facto do Benfica, nas suas trs ltimas deslocaes a Alvalade, ter vencido sempre os lees. P. -- ... e mais: j depois da derrota continuou a tentar nos jornalistas, e na comunicao social, os bodes expiatrios da derrota ... R. -- ... no generalizo mas houve aspectos na comunicao social que tiveram que ver com os resultados das eleies. Mas, verdade, estava completamente convencido de que o PSD iria ganhar e, inclusive, poderia ter maioria absoluta. Enganei-me: no serie nem o primeiro nem o ltimo, espero .. Observador privilegiado, o francs Michel Platini, ex-jogador, ex-seleccionador nacional e actual coordenador do Comit Organizador do Campeonato do Mundo de futebol de 1998 em Frana, deita um olhar sobre aquilo que foi o Mundial dos Estados Unidos. Por entre crticas falta de vontade dos polticos franceses, faz algumas comparaes e previses. No que se refere s regras do jogo, considera que o futebol deve continuar a ser futebol. Da que seja avesso a profundas alteraes das suas regras. L'QUIPE -- De que maneira viu o recente Campeonato do Mundo de futebol? No pondo em causa o direito dos clnicos de exigirem melhores condies remuneratrias e de trabalho, Maria Belm lembrou que, para se pedir mais dinheiro, temos de mostrar que fazemos mais e melhor. O recado no tinha apenas como destinatrios os mdicos algarvios, mas toda a classe. A este propsito, argumentou que as dificuldades e os constrangimentos do pas exigem um grande controlo dos gastos. No seu entender, o seu Ministrio j fez um esforo para que os trabalhadores da Sade beneficiassem de um aumento superior ao resto da funo pblica. Por conseguinte, um novo aumento salarial, de acordo com a ministra, ter necessariamente de ser acompanhado de um aumento de produtividade. Nesse sentido, adiantou que o Ministrio est disponvel para negociar a alterao dos regimes de trabalho, em sede dos centros de responsabilidade integrados, tendo j convidado os sindicatos para para iniciar esse processo. No caso do Hospital de Faro, onde existe um conflito entre os cardiologistas e a administrao, renovou a confiana no rgo de gesto. Maria de Belm aproveitou esta deslocao ao Algarve para inaugurar o centro de sade de Lagoa, que j estava a funcionar, mas encontrou a maioria dos mdicos em greve. Portugal encontra-se hoje mergulhado numa crise que parece ter-se instalado predominantemente na Europa, mas afecta toda a humanidade. No h por isso, para alm da prossecuo das polticas que privilegiam o modelo de sociedade escolhido, solues especficas para o nosso pas. Dez canes para lembrar a carreira de Cassandra Wilson no catlogo JMT, a casa que lhe abriu o mundo. Uma das mais prometedoras cantoras dos ltimos anos, Cassandra usa um gro de voz e um jeito de desenhar e se apropriar do verso que tem razes em Carmen McRae. 
Nascida no seio da esttica M-Base, de que se tornou a nica porta-voz vocal, medida que foi avanando mar adentro, Cassandra soube libertar-se do lastro que lhe ajudou a voz a crescer mas que ameaava paralis-la. Despojada da preocupao de fazer novo e diferente a cada passo, o canto virou rvore, ganhou espao, trepou ao cu. No dia em que Wilson inventar o tempo & o modo de combinar a tradio (de que se tem aproximado progressivamente, como o mostra, de forma exemplar, o lbum Blue Skies) com os novos sons que lhe adubaram a voz (do rap e hip-hop inquietao experimental vivida ao lado de Steve Coleman), uma nova porta se abrir ao jazz vocal. Parte-se depois para o debate. Paquete de Oliveira, socilogo, fala do drama social. E Carlos Narciso junta a sua voz daqueles que se surpreendem com o facto de Domingos Pereira, condenado a 15 anos pela morte da mulher, ter cumprido apenas seis. Fala na opinio pblica. Era a tirada mais infeliz da noite. Com programas destes, que correm o risco de valorizar at exausto alguns dos aspectos mais srdidos da histria de criminosos, arriscamo-nos a ter, em breve, uma opinio pblica a pedir agravamentos sem fim das penas, da represso e mesmo o ressurgimento da pena de morte. Quanto mais os EUA forem capazes de deixar claro que ftil competir com o poder americano, menos chances haver de que outros alimentem ideias de perturbar a actual ordem mundial. Um outro poder, aliado mas diferente, como pode vir a ser a Unio Europeia, s lhes pode ser til. E apelava ao idealismo e ao pioneirismo da Amrica como o antdoto capaz de dar sentido ao seu enorme poder. Mariano Gago falava na sesso de encerramento de uma conferncia sobre A criao da sociedade de informao euromediterrnea, um projecto que surge na sequncia da conferncia de Barcelona (Novembro de 1995), em que foram lanadas as bases de uma cooperao mais estreita -- a nvel econmico e poltico, mas tambm cultural -- entre os pases das duas margens do Mediterrneo. As novas tecnologias da informao permitiro construir rapidamente este smbolo moderno da nossa vontade comum de criar, entre as elites estudiosas dos nossos povos, laos de solidariedade, compreenso mtua e trabalhos em comum, declarou o ministro. Atentados suicidas do grupo Hamas em Israel: 25 mortos e 80 feridos. a vingana pela morte do seu lder Ayyash. Jos Eduardo dos Santos e Jonas Savimbi renem-se em Libreville, concordando que um Governo de Unidade e Reconciliao esteja formado at Junho ou Julho. Antnio Guterres possui uma boa relao com o teatro. Tem pena de ter pouco tempo para assistir a espectculos, mas est atento. Nos ltimos meses ainda conseguiu escapar-se e ver Eu, Feuerbach, pelo Cendrev, em vora, e A pera do Malandro, pelo Seiva Troupe, no Porto. E gostou. As declaraes prestou-as ontem durante o almoo que ofereceu, em S. Bento, s personalidades do teatro. No dia em que ficou constitudo o jri de apoio ao teatro no governamental. Um vizinho do Po de Acar de Cascais j no aguenta ouvir as descargas do supermercado. Queixas atrs de queixas, foi conseguindo umas vitrias. Agora, porm, tudo esbarrou num muro que a empresa diz ser a soluo mas que o vizinho no aceita. Afinal, parece que consumir fura os tmpanos. A Gr-Bretanha pediu na segunda-feira Comunidade Europeia que fosse banido um acar artificial usado na produo de refrigerantes, revelaram fontes diplomticas comunitrias e industriais citadas pela Reuter. 
Os britnicos advogam que a substnca em questo, o ciclamato -- 30 vezes mais doce do que o acar -- no seguro para o ser humano e ameaam vetar uma lei sobre a segurana alimentar, que os ministros do comrcio da CEE esto a discutir, se o produto no for proibido. A Frana e a Grcia apoiam a Gr-Bretanha. Este tipo de acar foi banido nos Estados Unidos e na Gr-Bretanha depois de uma srie de testes ter demonstrado que o produto provocava cancro nos ratos. A Gr-Bretanha -- o nico pas da CEE que proibiu o produto -- quer agora banir os ciclamatos da CEE baseados no argumento de que o produto leva, nos animais, atrofia e degenerao testicular. Mas a comisso da CEE para a segurana alimentar, baseada em peritos dos Doze pases, aprovou os ciclamatos depois de analisar o consumo mdio em relao aos humanos durante um perodo de vida. Domingos, peito hirto e joelhos levantados, cerra os dentes e vence Buckner (medalha de bronze) e Dlze. Domingos no se tem em si de contentamento, no quer acreditar no que se est a passar. Abraa-se ao irmo, que ficou em 8 lugar. Ganhei a medalha de prata, diz-lhe ofegante, mas logo se recusa a acreditar na realidade. S quando Moniz Pereira lhe surge na frente, se compenetra de que era mesmo verdade. O treinador, de sorriso rasgado, abraa-o. Lisboa, 18 de Fevereiro de 1992. So 9h 25m e Moniz Pereira arruma o automvel junto ao estdio do Sporting Clube de Portugal, incrustado entre o Campo Grande e o bairro residencial lisboeta do Lumiar. Metade do Oramento destina-se a despesas correntes, suscitando criticas da oposio, que as consideram exageradas, nomeadamente as despesas com pessoal (mais de um milho de contos). O oramento prev obras como a construo do pavilho gimno-desportivo de Pinhal Novo e as piscinas do Pinhal Novo, entre outras. Para o presidente Carlos de Sousa, trata-se de um oramento tpico de um concelho rural e pobre com poucas receitas. Segundo elementos recolhidos na nica junta de freguesia do concelho, o Entroncamento tem, aps o recenseamento eleitoral do ano passado, 12 480 eleitores, apenas mais 372 do que no ano anterior. Os responsveis desta junta, bem como os da autarquia, garantem que este nmero est muito aqum da realidade. se Maom no vai montanha, a montanha vai a Maom. Alteraes ao Cdigo Civil que incidem sobre o direito da famlia. Uma proposta de lei a aprovar por a Assembleia da Repblica que permita a alterao do Cdigo de Procedimento Administrativo. O CONGRESSO Nacional Africano (ANC) reiterou ontem, em comunicado, como insuficiente a remodelao ministerial realizada pelo Presidente De Klerk, na sequncia do escndalo sobre o financiamento secreto de Pretria ao partido zulu Inkhata, anunciando que vai intensificar a sua campanha para obter a mudana do Governo sul-africano por um Executivo de transio. UM GRUPO de alegados assaltantes, acusados do roubo de diamantes no valor de um milho de contos da central de escolha de uma operadora portuguesa em Angola, est a ser submetido a interrogatrios que forneam pistas sobre uma presumvel rede de trfico. Cerca de um milho de contos em diamantes, em gemas e para uso industrial, foi roubado na noite de 14 para 15 de Setembro da estao de escolha da Sociedade Portuguesa de Empreendimentos (SPE), no Ocapa, Lunda Norte. O que consta que os militares tm bastante cuidado para evitar esse tipo de situaes. Sabendo antecipadamente o grau hierrquico da personalidade que vai actuar, fazem-se representar em conformidade. Para um secretrio de Estado, um vice-chefe. 
No entanto, no Ministrio da Defesa no h grandes preocupaes quanto funcionalidade da gesto se houver vacatura com a provvel sada de Nogueira. Se os chefes militares, actualmente, despacham com o ministro de 15 em 15 dias, faro o mesmo com o primeiro-ministro e o secretrio de Estado governa o Ministrio. tudo uma questo de tempo e ainda falta conhecer os resultados do congresso do PSD e saber a altura exacta em que o Presidente da Repblica tomar deciso relativamente ao Governo. Entretanto, a Comisso Nacional de Eleies (CNE) notificou ontem a SIC pela no cobertura da campanha eleitoral do candidato Jernimo de Sousa, na sequncia de uma queixa por este apresentada. A estao de Carnaxide tem um prazo de 48 horas para responder, devendo a CNE na sua prxima reunio tomar uma deciso, que, se for desfavorvel SIC, lhe poder custar uma coima entre os mil e 10 mil escudos. Entre um e dez contos, exactamente. A Kapiti forneceu ao Banco de Comrcio e Indstria (BCI) os sistemas Equation, back-office para operaes internacionais, e FS-Dealer, front-office para operaes cambiais. Mas o primeiro grande atleta de Moniz Pereira veio de Viseu, no interior nortenho, com muito frio no Inverno e muito calor no Vero. Numa freguesia rural s portas da cidade, Vildemoinhos, nasceu e fez as primeiras corridas aquele que viria a ganhar a maratona de Los Angeles em 1984: Carlos Lopes. Lopes corria na infncia pelas hortas, competia com amigos quando iam a festas na aldeia vizinha, atravessava velozmente vinhas e castanhais. E era sempre ele quem ganhava. Na terra natal foi torneiro-mecnico, teve vida dura, at que um dia deu nas vistas nos crosses nortenhos. O Sporting contratou-o, trouxe-o para Lisboa. E Moniz Pereira comeou a trein-lo. O ministro canadiano das Pescas, Brian Tobin, tinha dito, no domingo passado, estar pronto a tomar todas as medidas necessrias para impedir 49 barcos europeus -- 38 espanhis e 11 portugueses -- de continuarem a pescar nos grandes bancos, ao largo da Terra Nova. Tobin sublinhou que os pesqueiros europeus foram todos prevenidos, via rdio, de que o Canad proteger os seus stocks de solha e palmeta, mesmo para alm do limite das 200 milhas nuticas. As informaes disponveis do conta da presena de quatro fragatas, um porta-helicpteros e avies canadianos a vigiar os barcos ibricos. Em causa neste diferendo esto dois problemas: a inteno canadiana de, pretensamente, preservar os recursos de pesca da zona e, em segundo plano, a inteno de alargar a sua jurisdio a guas internacionais para alm das 200 milhas da zona econmica exclusiva. Os canadianos dizem querer diminuir fortemente a pesca da palmeta para evitar o seu desaparecimento, como aconteceu com o bacalhau e o ' red fish ', observa Ernest Loignon. A dupla Jorge Bica / Joaquim Capelo regressou ao comando da 42 Volta Galp a Portugal, prova pontuvel para o Europeu de ralis, aps a disputa da segunda etapa. Uma vantagem de 1m34s sobre os segundos, Jos Carlos Macedo / Miguel Borges, em Renault Clio 16V, garante ao piloto do Lancia HF Integrale uma forte dose de tranquilidade para conseguir a vitria na prova, o que o colocaria em excelente posio para a conquista do ttulo. O livro foi inicialmente publicado pela Cadernos Tempo, em Moambique. Recentemente, uma editora uma italiana descobriu-o. Maria Teresa Pazzolo traduziu-o, prefaciou-o e ilustrou-o, com fotografias de sua autoria. A AIE-Guaraldi, da Reppublica di San Marino chancelou este livro que se deixa inquietar com o desaparecimento da Ilha de Moambique. Em Portugal, esta novela no foi ainda editada. 
Com a sua publicao em Itlia, talvez O Barco Encalhado desenterre as amarras e aporte no Tejo. O que seria importante pois, de alguma forma, este livro combate a viso preconceituosa e lacunar que, ao longo dos tempos, Ilha de Moambique tem sido consagrada. Os ministros europeus do Trabalho e dos Assuntos Sociais aprovaram ontem no Luxemburgo leis comunitrias instituindo uma licena mnima de trs meses para assistncia famlia e a aplicao de regras do pas de acolhimento para os trabalhadores deslocados no estrangeiro, anunciou o ministro alemo Norbert Blum. Estes dois projectos tinham sido objecto de um acordo poltico durante a ltima reunio dos Quinze, a 29 de Maro em Bruxelas, mas faltava-lhes a adopo formal. O fogo consumiu inmeros fardos de bacalhau, dois dos trs tneis de secagem artificial, alguns compressores, duas carrinhas utilizadas no transporte dos trabalhadores da fbrica e um carro de marca Triumph, propriedade do comendador. Os armazns e uma habitao contguas empresa foram tambm danificados pelo sinistro. Os prejuizos, disse Gonalves Gomes, esto cobertos por o seguro, mas ascendem a centenas de milhar de contos. O bacalhau queimado estava j embalado em fardos ou disposto nos tabuleiros de secagem e pertencia a lotes de produo da empresa, sendo o peixe importado da Noruega e Dinamarca. Apesar da quantidade de bacalhau consumida pelas chamas ter sido elevada, Gonalves Gomes afirmou que no ir acontecer qualquer desestabilizao do abastecimento do mercado. Quanto Empresa de Pesca de Viana estar garantida a continuidade da laborao, embora a nveis de produo mais reduzidos, j que s poder ser utilizado o nico tnel de secagem artificial poupado no incndio. A comparao perversa e d para os dois lados. O deputado social-democrata Carlos Coelho, por exemplo, at j se deu ao trabalho de comparar os dois documentos, apenas para provar que h inmeras citaes literais do diploma no articulado do pacto, que assim se transforma num repositrio de ideias globais, que no acrescenta nada ao que a lei de bases e a Constituio portuguesa preconizam. A forma como todo o processo tem vindo a ser encaminhado , alis, susceptvel de gerar expectativas perversas, porque na forma como as coisas iro ser levadas prtica que as pessoas se dividem. Escola Secundria de Paos de Ferreira -- Est aberto pelo prazo de trs dias, a contar da publicao do presente aviso, concurso para preenchimento de um horrio incompleto de 6 (seis) horas na disciplina de Socorrismo (11 ano). Na Gr-Bretanha so mortas, anualmente, cerca de 70 crianas. As causas so os maus tratos. Os culpados so quase sempre os pais. O nmero de crianas menores de cinco anos, mortas por estranhos, tem sido em mdia de uma por ano, pelo menos desde 1982. Se o assassnio de crianas por estranhos um tipo de crime raro, estranho que isso acontea pela mo dos prprios pais. Mais inesperado ainda encontrar uma criana assassina. O Declogo, de Krystof Kieslowski . Qual foi, para si, o principal acontecimento mundial da ltima dcada? Trs jovens portugueses que fazem msica portuguesa. Assim se definem os Requiem, ex-Requiem pelos Vivos, que aps quase cinco anos de ausncia dos estdios regressam em Outubro com um novo lbum e uma mo-cheia de espectculos. Pelas rdios, entretanto, roda j Entre o Cu e o Medo, o single promocional. 
Duas Viagens, uma exposio de fotografias a preto-e-branco de Francisco Villa-Lobos, feitas em 1995 em Tquio, Nagasaqui e Quioto, nos intervalos de rodagem do filme Os Olhos da sia, de Joo Mrio Grilo, pode ser vista at 7 de Novembro na sala Laman do Centro Cultural de Belm. Impermanncia -- Um Caminho para o Auto-Conhecimento o ttulo de uma exposio / instalao de Regina Chulam, que joga com vrios auto-retratos pintados pela artista e que pode ser apreciada, a partir das 18h30, na Casa Fernando Pessoa (R . Coelho da Rocha, 16), em Lisboa. Outros dissidentes conhecidos ainda presos, envolvidos na Primavera de Pequim so Wang Juntao e Chen Zimin, jornalistas, 35 e 40 anos, respectivamente, condenados em 1990 a 13 anos de priso, Liu Gang, estudante, 32 anos, condenado a seis anos, Reng Wanding, operrio, 48 anos, condenado a sete anos, e Bao Tong, 60 anos, antigo brao-direito do secretrio geral Zhao Ziyang, condenado a sete anos. No havia cruzamento, no se tratou de uma tentativa de ultrapassagem, nem sequer de uma travagem brusca. Segundo o relato da agncia Lusa, o Volvo deslizou sozinho de uma garagem particular onde estava estacionado. E s o inesperado encontro com o Mercedes o fez parar. Sabe-se l at onde poderia ter ido, deslizando pelo piso molhado da Rua das Flores. O incrdulo automobilista, por no ter interlocutor, achou por bem chamar a polcia e relatar-lhe como tudo se passou. Afinal, h que apurar responsabilidades em matria de seguros, mesmo em situaes inslitas com esta. As tropas ficariam fora dessa espcie de estrada, que na altura no foi sequer definida, concentradas em determinados pontos, que tambm no foram definidos. vo definir esse corredor e determinar os pontos de acantonamento dos soldados. Se no se incomodarem um ao outro porque h condies para partir para a fase seguinte. Que tambm no se sabe ainda qual . Os Fados d'Aqum e d'Alm-Mar que Joo Braga concebeu para o Centro Cultural de Belm traduziram-se numa exibio confrangedora de equvocos e falta de preparao onde couberam amigos, muito Fernando Pessoa, uma boa voz, de Rita Guerra e uma anedota brasileira de mau-gosto. O fado, esse, ficou aqum. O defesa portugus Hlder estreou-se no campeonato espanhol e encontrou logo pela frente um seu velho conhecido: o ex-sportinguista Amunike. Bateu-se bem, ganhou-lhes muitos lances e anulou o flanco esquerdo do ataque dos visitantes, mas acabou por ver o remate decisivo da partida tabelar no seu corpo antes de a bola chegar s redes. Fui infeliz, a bola bateu-me nas costas e entrou. Mas, pelo que fez na segunda parte, o Barcelona mereceu ganhar. Hlder cumpriu bem na sua estreia. Com a particularidade de ter jogado fora do seu posto habitual. Penso que foi por o John Toshack saber que eu conheo bem o Amunike. Pediram-me para desenrascar, mas prefiro jogar a central, explicou o portugus. O Hlder tem razes para estar contente. Pelo caminho, tinha repudiado uma insinuao de Damsio, que, no seu discurso, se referiu sua comisso como sendo constituda por crticos ou opositores da actual direco. E o mesmo fez o orador seguinte, Jos Diogo, este representante do grupo responsvel pela terceira proposta. A reunio prosseguia ainda hora de fecho desta edio. Sim, porque as pessoas acabam por passar por todas, apesar das diferenas. J.S.R. -- Uma das condies fundamentais para se criar outros gostos nas pessoas so os concertos. C no h apresentao ao vivo de quase nenhuma msica. E isso muito importante. 
Mas nos demais casos em que a lei permite a cobrana coerciva pelos tribunais fiscais, pode ser discutida a legalidade da dvida, o que quer dizer que no h ttulo executivo. A no ser que estejamos perante um dos documentos do art.46 do Cdigo do Processo Civil, aqui sim, plenamente aplicvel, ou haja lei especial que tal disponha. A norte de Porto Dinheiro, na Lourinh, elementos da Guarda Fiscal apreenderam, cerca das 4h30 de ontem, uma traineira (presumivelmente registada no porto de Peniche) que transportava 138 caixas de tabaco americano, com valor da ordem dos 22.500 contos. Na sequncia da operao foram detidos quatro indivduos, entre os quais o mestre da embarcao, a qual foi, tambm, apreendida. A GF confiscou ainda uma viatura ligeira de marca Bedford, envolvida na rede de contrabando, ontem descoberta. Quantidade idntica de tabaco -- seis caixas de marca Camel e 124 de Winston -- foi aprendida, na madrugada de tera-feira, na zona de Vila do Bispo, por elementos do comando da Guarda Fiscal do Algarve. A mercadoria, proveniente de um desembarque, foi encontrada numa carrinha suspeita. Alverca -- Sem meia dzia de jogadores (os emprestados pelo Benfica), o recm-promovido Alverca ganhou o seu primeiro ponto da poca e logo fora de casa. A perder, o treinador Mrio Wilson arriscou com a entrada de trs jogadores e saiu de Campo Maior com mais moral para enfrentar a difcil tarefa da manuteno. Por este andar, Portugal vai ter o campeonato com o ciclo mais alargado de jogos em cada semana, com jogos de sexta a segunda-feira. Em Itlia os treinadores recusaram os jogos segunda-feira, que as televises queriam impr, em Espanha tambm haver jogos s de sexta a domingo, e em lado nenhum se foi to longe como em Portugal dividindo as jornadas por quatro dias. H razes tcnicas e tambm de marketing, porque o pblico nunca sabe a que dia joga a sua equipa e a percepo do que se vai passando, para o pblico em geral, tambm no a mesma. A televiso pode ter assim tanto poder? No que diz respeito s 200 toneladas de farinha de carne e ossos, a respectiva incinerao fica para mais tarde, em data a anunciar oportunamente pelo Ministrio da Agricultura. Segundo este organismo, a operao s ter lugar quando for disponibilizado o equipamento que est a ser propositadamente construdo para proceder introduo escalonada nos fornos da farinha obtida a partir da transformao das carcaas dos animais abatidos. Tratando-se de matrias muito inflamveis -- no possvel retirar integralmente a gordura no processo de transformao --, h que evitar o risco de combusto no momento de entrada no forno (a laborar a alta temperatura), que poderia atingir o prprio operador. Por isso, as embalagens sero incineradas de forma diferente do habitual, recorrendo-se a um dispositivo que permitir puxar a farinha e lan-la por cima, em completa segurana, para dentro do forno. Os palestinianos, que querem reivindicar a independncia da Cisjordnia e de Gaza numa fase posterior das conversaes, insistem em ser reconhecidos como parceiros de plenos direitos, mas os israelitas, interessados apenas numa frgil autonomia palestiniana, querem manter-los na delegao conjunta com a Jordnia. Fontes em Washington disseram France Presse que, para desbloquear o impasse, Israel vai propor a realizao simultnea de conversaes com uma delegao jordano-palestiniana e com uma delegao s de palestinianos. Pedro Almodovar j filma Kika, onde, com a ajuda do costureiro francs Jean-Paul Gaultier, criou uma espcie de cmara humana. 
Nesse filme, uma criada lsbica tem o rosto de Rossy de Palma, o perfil cubista que irrompeu em A Lei do Desejo. A actriz esteve no Festival Internacional de Teatro de Almada. Chica Almodovar o que ? Uma inveno. Pertence quele grupo de rostos femininos a que chamam as chicas Almodovar, todos eles inventados pelo cineasta de mulheres, Pedro Almodovar. No caso de Rossy de Palma, a primeira coisa que mostrou no cinema foi mesmo o perfil, ameaador e desregrado como uma pintura cubista. Foi n' A Lei do Desejo, e depois aconteceu Mulheres Beira de um Ataque de Nervos -- dormiu durante o filme todo, aps um jarro de sumo de tomate e soporferos --, Ata-me! e agora, em rodagem em Madrid, Kika. O problema do desemprego dos engenheiros e tcnicos do Minseredmash, o antigo e gigantesco complexo da URSS dirigido para a construo do escudo nuclear da ptria, actualmente tema de acalorada discusso na Rssia e constitui uma preocupao para os seus dirigentes. Os custos desta pesada estrutura tornaram-se insuportveis para o pas. e um estabelecimento muito chique da capital oferecia aos melhores corredores um par de relgios em ouro. Era o Maxime ... Quando me convenci que todos os esforos para tomar o aquartelamento se haviam tornado perfeitamente fteis, comecei a retirar os nossos homens em grupos de oito e dez. (...) As nossas baixas no combate haviam sido insignificantes, 95 por cento dos nossos mortos resultou da desumanidade do Exrcito aps a luta. Durante uma semana mantivemo-nos nos cumes da cordilheira da Gran Piedra, enquanto o exrcito ocupava as bases. No podamos descer, e eles no se atreviam a subir. No foi a fora das armas, mas sim a fome e a sede, que acabaram por vencer a nossa resistncia. Tive de dividir os homens em grupos mais pequenos. outros foram escoltados por Monsenhor Prez Serantes [ arcebispo de Santiago de Cuba ] a fim de se renderem. Entre os episdios mais importantes a transmitir, alm da entrevista com Giacometti, destaque-se aquele em que se fala da arte de Modigliani, mais ou menos a meio da exibio (nem sempre a ordem respeitada pela RTP). Fala-se da sua vida: do alcoolismo, do desespero; tambm do charme, da sua gentileza. E do drama da morte. Outro episdio importante, Um dia na vida de Man Ray, entrevista filmada no comeo de 1961. Ou uma memria do surrealismo dita na primeira pessoa. Montparnasse Revisited assim. Est cheia destes tesouros. Os parlamentos da federao croato-muulmana e dos srvios bsnios vo hoje pronunciar-se sobre o novo plano de paz internacional para a repblica. Os dirigentes croatas e muulmanos j manifestaram o seu apoio ao novo projecto, mas os lderes srvios da Bsnia continuam a manifestar profundas reservas face ao novo mapa territorial proposto pelas grandes potncias. O jogador representou na poca passada a Associao Desportiva Ovarense, mas foi dispensado em virtude do seu alegado comportamento antidesportivo, que, na opinio da direco do clube vareiro, ter contribudo para a derrota sofrida perante o Benfica na fase decisiva dos play-off. Sprewer ir ter como colegas na sua nova equipa os seus compatriotas Terrence Radford- que transita da poca passada- e Ruben Cotton, um veterano de 32 anos, naturalizado portugus, que na poca passada esteve ao servio da Oliveirense. Alexandre Dias, Caetano, Moutinho, Nunes, Paulo Duarte, Rui Santos, Jos Ruivo e Henrique Silva completam o plantel, que ser dirigido pelo tcnico Orlando Simes. O oramento para a nova poca ronda os 25 mil contos e o principal objectivo garantir a permanncia do Esgueira no escalo principal. 
O Beira Mar, a outra equipa da cidade de Aveiro, procedeu tambm no incio desta semana substituio do norte-americano Mike Smith pelo seu compatriota Deshon Washington. Segundo responsveis do clube aveirense, Mike Smith apresentou-se em deficientes condies fsicas e sem ritmo competitivo. Deshon Washington, que joga na posio de extremo-poste, tem 23 anos de idade, mede 2,02 metros, e vai fazer dupla com o seu compatriota Kip Jones. O tcnico do Beira Mar, Aniceto Carmo, tem ainda ao seu dispor os seguintes atletas: Catarino (ex-Esgueira), Paulo Sousa (ex-Salesianos), Rebelo, Traylor, Moreira, Mourinho, Alex Pires, Renato, Joo Miguel e Pinto. Tendo como objectivo intrometer-se na luta pelo ttulo, o Beira Mar dispor, para a nova temporada, de um oramento a rondar os 40 mil contos, verba substancialmente inferior da poca passada. Na sua globalidade, as respostas alimentam, apesar de tudo, um certo optimismo entre os que se interessam pela sorte dos bichos. Para Jorge Rocha, o inqurito demonstrou que h muitas cmaras do pas que tm uma atitude de grande dignidade pelos animais, apesar de serem letra morta as parcas disposies da legislao que regulamenta esta matria e do imenso atraso no nosso pas em relao a outros pases europeus, onde h inclusivamente seguros e assistncia doena. De um modo geral, os animais domsticos que vivem no campo tm uma existncia mais feliz do que os citadinos. Nas grandes cidades, os animais domsticos so normalmente mais maltratados, pois, em certa medida, entram em concorrncia com as pessoas, ocupando as ruas, fazendo rudo nos prdios, observa o veterinrio municipal da Cmara do Porto, Vtor Aires. Nos concelhos rurais, essa concorrncia no existe e os bichos mantm, at, determinadas funes teis, como a guarda ou a caa. Atrs de Kerrigan na lista das grandes favoritas est a japonesa Yuka Stao, quarta nos Mundiais e que nos ltimos tempos bateu patinadoras de grande nome como a alem Tanja Szewczenko e a canadiana Jose Chouinard. A China aposta tudo em Chen Lu, terceira classificada nos ltimos dois Mundiais, apontada com a atleta que melhor alia a tcnica e a beleza. S depois surge Tonya Harding, campe dos Estados Unidos. E a grande dvida se as suspeitas da agresso sua rival sero um handicap ou uma motivao. Homens ensopados da cabea aos ps por uma chuva de petrleo, com as chamas e o fumo deixados pelas tropas iraquianas como cenrio de fundo. As imagens que Sebastio Salgado registou da guerra que ops os Estados Unidos ao Iraque e que comeou pela invaso do Kuwait no mostram msseis Scud, nem a ferocidade dos exrcitos dos dois inimigos. Revelam a misso perigosa dos homens das 27 equipas de bombeiros enviadas para o inferno que os iraquianos atearam em 788 poos de petrleo quando se retiraram do Kuwait e que permitiram ao fotgrafo brasileiro concluir o seu portfolio sobre o homem e o trabalho. A fotografia foi publicada na imprensa de todo o mundo h cerca de um ano, quando de o massacre de Bentalha, na Arglia, onde morreram cerca de duas centenas de pessoas, na noite de 22 para 23 de Setembro. Cinco meses mais tarde, a imagem de uma mulher chorando a perda dos seus oito filhos, baptizada como piet argelina, voltava a ser impressa, desta feita porque o seu autor, escondido atrs do pseudnimo Hocine por razes de segurana, recebia o Prmio World Press Photo 1997. Tem um projecto pessoal: encenar e interpretar, com a actriz Margarida Tavares, O Amante, de Harold Pinter, em regime de co-produo no Teatro da Trindade, com estreia prevista para Julho. 
Tive um professor, Joo Brites, que nos dizia sempre que tnhamos que ter os nossos prprios projectos. Achei que era importante assumir uma postura pessoal. Ida e volta do domiclio ao trabalho. Parti de Cabo Verde h oito anos. Estava sem trabalho, fazia uns biscates de vez em quando, mas nada de srio. Se parti, foi porque para mim, como para todos os outros cabo-verdianos, s havia uma escolha: ficar e morrer fome ou partir, no importa para onde, para onde pudesse encontrar trabalho e boas condies para sustentar a minha famlia. Futebol -- Transmisso directa da cerimnia de abertura do Mundial de Juniores, que se realiza no Estdio das Antas, no Porto. Futebol -- Transmisso em directo do jogo inaugural do Campeonato do Mundo de Juniores, Portugal-Repblica da Irlanda, a contar para o Grupo A, a partir de as 21h00, no Estdio das Antas. Porque a imprensa est feita com a direita, como responderia o dr. Cunhal? Explicao demasiado fcil e desculpabilizadora. Se assim , porque morreu a imprensa comunista, como o Dirio, porque se afastaram do Partido todos os jornalistas comunistas que trabalhavam em outros rgos de informao? A resposta outra. A resposta que o PP, com razo ou sem ela, demagogicamente ou no, para o caso tanto faz, tem tido a capacidade de impor os debates que suscitam a separao de guas, as fracturas polticas de que nascem as opes do eleitorado. Enquanto que o PCP um deserto de ideias, de discusso e at de coragem ideolgica. Nada mais previsvel, nada mais desesperadamente igual e repetitivo do que o discurso do PCP. Existe apenas como uma espcie de museu de ideias. O PP s tem sete por cento dos votos, mas no difcil imaginar que possa crescer. O PCP tem os mesmos sete por cento, mas quem imagina que possa subir? Alm disso, a nova lei orgnica do LNIV veio atribuir-lhe mais responsabilidades (ver Controlo da sade animal ganha terreno, PBLICO de 9-6-97) no mbito das provas laboratoriais necessrias ao controlo sanitrio dos animais e seus produtos -- a par das normais competncias no mbito da Investigao e Desenvolvimento (I&D) e das que decorrem do seu estatuto de autoridade nacional de referncia para todas as questes referentes sade animal. Por tudo isso, o ministro da Agricultura entendeu que no era possvel adiar por mais tempo a deciso relativamente ao futuro do laboratrio. O despacho de Gomes da Silva aprova uma metodologia de realizao faseada no tempo. A primeira iniciativa consiste na abertura de um concurso limitado para a elaborao, por um gabinete de projectistas, do programa preliminar e do caderno de encargos que ter de ser apresentado no concurso pblico (segunda fase) para a concepo e execuo das novas instalaes. Este ciclo, a iniciar muito brevemente, dever ser cumprido at ao final do prximo ano. A terceira fase do processo consistir na construo das instalaes propriamente ditas, cujo arranque ocorrer em 1999 e no dever estar concluda antes de passados trs anos. Mais de mil casais tinham-se oferecido, at quinta-feira noite, para recolher provisoriamente refugiados, sobretudo crianas. Mas hoje [ ontem ], quando perceberam que no era para adoptar, comearam tambm a aparecer pessoas interessadas em receber tambm as mes, explicou uma fonte do Frum Estudante, que est a organizar a Misso Crescer em Esperana. Nas instalaes do Instituto da Juventude, o telefone no pra de tocar. Tambm na Madeira, a delegao da Critas no Funchal tem recebido inmeras solicitaes de famlias do arquiplago dispostas a receber crianas da Bsnia. 
Apesar dos muitos pedidos de adopo, as pessoas esto a ser informadas que as crianas apenas sero recohidas temporariamente, entre nove a doze meses, segundo o Jornal da Madeira. Os lobos no tm fronteiras, para alm das que eles prprios delimitam. Mas, um simples salto sobre a linha que divide o Parque Natural de Montesinho da Reserva Regional de Caa da Serra da Culebra, na regio espanhola de Castilla y Lon, pode significar um passo da vida para a morte. Protegido em Portugal, o lobo considerado uma espcie cinegtica no lado de l da fronteira, onde, desde h dois anos, tem vindo a ser leiloado o seu abate e a prosperar um mercado negro em torno dos seus restos. Curiosamente, na conferncia de imprensa em que anunciou o adeus, o primeiro-ministro no apresentou justificaes de peso para a deciso. Decidi demitir-me hoje. Pensei ser chegado o tempo de renovar as pessoas do Governo e comear de fresco no ano novo -- afirmou, revelando que tomara a deciso dia 1. A justificao, para muitos analistas, pode procurar-se na frase de alvio de Murayama aps o seu copo de sak. A impopularidade pessoal e do seu executivo tornara-se demasiada. Eles fizeram da nossa aldeia um cemitrio, contou uma velha. at agora, as pessoas morriam de doena ou de velhice. Homens e ps cavavam ontem freneticamente, que j tinha passado um dia, e camies descarregavam cimento em quantidades que a aldeia, de casas de argila e telhados de chapa, nunca tinha visto. A sportinguista Teresa Machado estabeleceu no domingo, numa pequena reunio realizada em So Jacinto, perto de Aveiro, um novo recorde nacional do disco, alcanando a marca de 65,40m. O anterior mximo fora conseguido no mesmo local a 8 de Agosto de 1993, com 63,70m. A sportinguista abriu a sua srie com 62,16m, subiu a seguir para 63 metros exactos e o novo mximo chegou na terceira ronda. A acabar confirmou a sua regularidade com 61,87m, 62,84m e 63,09m. O Sporting j ganhou (a Taa de Portugal) e j voltou a perder (com o Benfica no jogo de repetio, na quarta-feira, no Restelo, com meio plantel ainda sob os festejos do fim do jejum de 13 anos). Mas pelas bandas de Alvalade continua ainda por deslindar o mistrio do irlands Niall Quinn, contratado num dia pelo novo Sporting de Santana Lopes e descontratado no dia seguinte com o mesmo espalhafato meditico. S porque, de facto, o actual clube do irlands, o Manchester City, tratou o Sporting como um clube terceiro-mundista, como alegou Santana Lopes? Ou a histria tem mais que ver com os nmeros do negcio (mais de 400 mil contos por um jogador beira dos 30 anos, indesejado pelos tcnicos de Alvalade e chumbado pelo respectivo departamento mdico)? A ser verdade, o Sporting continua a fazer negcios Cintra. E a portar-se Cintra, no dando cavaco, sequer, ao jogador ,que at j vendera a casa l na Old England. Ser que est a nascer o Sporting Cavaco Lopes? De um modo aparentemente displicente, mas evidentemente estudado nos seus efeitos, surgem referncias a Nitsch Hermann e a Rodolf Schwarzkogler, por exemplo, personagens que, com Arnulf Rainer e Gnter Brus, compuseram um grupo de artistas vienenses capaz de, nos anos 60, provocar escndalo no s na cena pantanosa e hipcrita da arte austraca do ps-guerra, como no conjunto da arte ocidental. Radicalizando a ideia das vanguardas em reduzir cada linguagem artstica sua especificidade, eles acabaram por inverter a lgica de purificao de meios do modernismo. O corpo humano (na sua realidade fsica) tornou-se suporte -- ou, se quisermos, a prpria matria-prima da obra de arte. 
Chamaram-se a si mesmos actionistas, embora o movimento, que se espalhou largamente at aos anos 70, seja designado globalmente por body art, ou surja nos textos de Bowie como arte ritual. Por mim, achei que foi infeliz semelhante comparao (...) e, muito menos, em relao quele grande estadista, que penso ser respeitado em todo o mundo, sempre a rigor fardado, de franciscanas e respeitveis barbas e de linguagem to simples que todo o explorado da terra entende e o povo cubano venera e ama. Debate-se agora se ter sido oportuno ou nefasto o reconhecimento internacional da Crocia e houve quem recordasse que a guerra servo-croata j estava em curso em 1991, antes desse reconhecimento. Mas este no o cerne da questo, o cerne da questo a Bsnia. a sua dignidade enquanto naes, e logo enquanto Estados, estava firmada. O mesmo no se passa na Bsnia, onde as etnias convivem ao simples nvel da coabitao. Por isso, constituiu um erro reconhecer a Crocia sem antes ter preparado uma soluo vivel para a Bsnia, e no h paixo ideolgica capaz de o apagar, se bem que seja intil dramatizar um erro de natureza diplomtica. A urbanizao de Arcena passou por um complicado processo, com origem numa venda entre scios da empresa proprietria, a Eurocapital. Perante a falta de pagamento dos compradores, os scios vendedores reclamaram em tribunal a restituio dos seus direitos sobre a urbanizao, o que viriam a conseguir no incio deste ano. Durante o perodo em que estava indefinida a propriedade dos edifcios, grande parte deles foram ocupados por imigrantes africanos. Em 9 de Abril ltimo, o tribunal de Vila Franca iniciava aces de despejo, para a restituio da totalidade da urbanizao Eurocapital. A maior parte das cerca de 260 pessoas desalojadas instalou ento abrigos improvisados numa das ruas da urbanizao. A polcia alem localizou, ontem de manh, os dois criminosos que fizeram cinco refns durante uma fuga de mais de 27 horas pela Alemanha, aps terem assaltado um banco, na segunda-feira, em Stuttgart. Quando tentavam controlar a identidade de um homem perto de uma cabina telefnica em Driedorf, em Hessen, os polcias foram alvo de tiros, tendo respondido. O homem acabou por fugir a p. Importantes reforos foram j enviados para a zona. Os dois criminosos tinham libertado, ontem, por volta das 4h50 locais, os trs refns que detinham, um dos quais ferido, segundo a polcia de Wiesbaden. Os refns, um homem e um casal, encontravam-se a cerca de 60 quilmetros a norte de Frankfurt. Aps uma perseguio de mil quilmetros atravs do pas, a polcia a distribuu-se pela autoestrada entre Hofheim e Wiesbade. Os agentes decidiram dar vantagem aos raptores por estes terem ameaado suicidar-se com os refns fazendo explodir uma granada, caso a polcia os seguisse a curta distncia. Os delinquentes comearam por sequestrar dois polcias no seguimento do assalto ao banco que lhes rendeu 250 mil marcos, tendo feito depois trs novos refns. Os dois criminosos, um antigo soldado de elite do exrcito da Alemanha de Leste de 32 anos, e um suo de 35, evadiram-se, h trs semanas, de uma priso de Hamburgo, acusados de homicdio e assalto mo armada. Segundo as autoridades, no est excluda a possibilidade de que os raptores se dirijam para Frana. O prmio literrio Donna -Cidade de Roma, 1992, foi atribudo a Raisa Gorbatchov, pela sua biografia-testemunho, Io Spero, editada no Vero passado, anunciou ontem, na capital italiana, a presidente do jri, a escritora Gabriella Sobrino. 
O galardo ser entregue numa cerimnia a decorrer a 28 de Maro, mas ainda no est confirmada a deslocao de Raisa a Itlia. Telas atribudas a Picasso, Mir e Martinez foram descobertas ontem, pela polcia Judiciria de Paris, na casa de um galerista de origem americana, Theodor Robertson. A descoberta aconteceu na sequncia de uma busca efectuada sua residncia, em Frana. Robertson era alvo de um mandato internacional de captura, por acusaes de fraude e falsificao. Nove emprstimos por obrigaes e as aces da Luzostela foram ontem retirados da cotao na Bolsa de Lisboa. Dificuldades financeiras so o motivo invocado para a medida. Os responsveis do mercado lisboeta decidiram tambm extinguir a direco de operaes. A reduzida actividade do floor j no justificava a sua existncia. O escndalo Augusta explodiu no seguimento das investigaes sobre o assassinato, em 1991, de um antigo ministro socialista francfono, Andr Cools. De acordo com Philippe Moureaux, ex-vice-primeiro-ministro do mesmo partido, a sua morte ocorreu no momento em que Cools estaria prestes a revelar o pagamento das luvas pela firma italiana. em Janeiro de 1994, os trs Guys -- Guy Spitaels, ministro-presidente da Valnia, Guy Coeme, ministro da Defesa, e Guy Mathot, ministro regional -- foram forados a demitir-se para assegurar a sua defesa depois de a sua imunidade parlamentar ter sido suspensa. Neste caso, h duas coisas que custam sobremaneira a Fernando Neves. Primeiro: o processo coloca-o na ingrata posio de delator dos seus camaradas. Segundo: a queixa foi apresentada pelo governador civil, um homem que Neves chegou a louvar pela sua disponibilidade para o dilogo, que contrastou com o silncio comprometido do presidente da Cmara de Castelo de Paiva, Antero Gaspar. No estava nada espera disto, diz o mineiro. Entretanto, por causa deste processo, a agitao ameaa voltar ao Pejo. Os mineiros esto dispostos a manifestar-se na sede do concelho em solidariedade com o seu camarada, ao mesmo tempo que est a ser preparado um abaixo-assinado. Se querem prender algum, tm que nos prender a todos, porque fomos ns todos juntos que cortmos a estrada e barricmos a empresa. Sempre quero ver se tm prises que cheguem para l meter dentro 500 mineiros!, diz Manuel Vasconcelos. O mdico Manuel Almeida Marta, principal suspeito do crime de Armamar, continua a ser procurado pelas autoridades. As buscas duram h 72 horas, sem que tenha sido descoberto o paradeiro do mdico que por pouco escapou a um linchamento popular. Muito mais no diz, nem vale a pena, apenas um sussurro curto de remorso. Estou arrependido, mas no tinha comido nada. O Tribunal de Polcia entende as palavras de Agostinho como pesar sincero, e esta uma pequena condio muito prezada pela letra e forma da lei. Mas uma atenuante no um perdo, e os cdigos esticam as penas em caso de reincidncia. Ainda se considera, pela ltima vez, ser desnecessria a priso, dita o juiz para o escrivo, convertendo os cinco meses na penitenciria em 45 mil escudos de multa, ou, em alternativa, em cem dias de deteno. Atende-se, tambm, situao econmica do arguido e autoriza-se o pagamento da multa em dez prestaes. Quanto ao carvo, que j regressara ao restaurante quando Agostinho saiu do tribunal, foi grelhar carne para quem a come. Esta foi a argumentao apresentada na reunio de Cmara, que mereceu, invariavelmente, alguma diplomacia na resposta dada pelo presidente. Disse vezes sem conta Manuel Machado que, no que for vlido e legtimo, a edilidade est disposta a encontrar uma alternativa possvel. 
E no ajuda muito tcnicas de bloqueios a mquinas. Se for a bem, tudo bem; se for a mal, arranja-se a um '31' que s o tribunal resolve. prefervel concertao de solues, advertiu. Mas se a alternativa for a estrada, no possvel. por falar em alcatro, o presidente relembrou aos moradores que, por altura das campanha para as eleies autquicas de 1993, lhe tinha sido pedido alcatro para o local. Constata-se que nem sempre, na altura prpria, os muncipes do ateno aos editais. Porque nenhum loteamento avana em Coimbra sem publicao de editais. No falem em falta de dilogo. Nenhuma audincia dos moradores foi pedida Cmara. Foi apenas recebido um fax da CHEM a mandar parar as obras, mas no se pode retomar o PREC numa situao desta natureza. Os dois principais partidos austracos chegaram ontem a um acordo para fazer redues oramentais no valor de 100 mil milhes de xelins (perto de 1500 milhes de contos) em 1996 e 1997. Dois teros dessas economias sero obtidas atravs de cortes nas despesas do Estado Federal, provncias e autoridades comunais, estando o resto dependente de um aumento de receitas. As medidas, anunciadas pelos ministros da Economia, Johannes Ditz (conservador), e das Finanas, Viktor Klima (social-democrata), numa conferncia de imprensa comum, destinam-se a assegurar o cumprimento dos critrios de convergncia, assegurando que a Astria possa entrar no primeiro comboio da Unio Econmica e Monetria. Sem estas redues, o dfice pblico representaria 6,1 por cento do Produto Interno Bruto (PIB) este ano e 6,5 em 1997, valores muito acima do limite de trs por cento imposto por Maastricht. PIB desce em Frana ... O PP decidiu ontem em reunio do grupo parlamentar adoptar a liberdade de voto e Helena Santo ficou -vontade perante o debate parlamentar da prxima quinta-feira em que sero votados os projectos de alterao e liberalizao do aborto. A deputada no decidiu ainda, contudo, se se vai abster ou votar a favor do texto de Strecht Monteiro, mas assume claramente que considera o projecto pr-natalista e vem corrigir a lei em vigor, pelo que merece simpatia. Este afirmou que o seu objectivo devolver CIA a credibilidade perdida. Precisou que ia realizar mudanas profundas na gesto da CIA, designadamente em o que respeita aos critrios aplicados no recrutamento de agentes. Os EUA tm de ter a melhor capacidade do mundo para recolher informaes, concluiu. Aldricht Ames, aps ter sido descoberto, confessou que espiava a favor de Moscovo havia nove anos, desde Abril de 1985. Devido aco de Ames, explicou o actual director da CIA, foi muito mais difcil para os EUA compreender o que se passava na URSS durante aquele perodo crtico, porque ele denunciou aos soviticos muitos agentes que trabalhavam para os servios americanos. Depois de Cantanhede, a formao apresenta-se hoje em Vila Real, no sbado no Porto e na prxima quinta-feira em Tomar (ver pgs. 10/11 deste Fim de Semana). At 24 de Outubro, a Nova Filarmonia -- que actualmente composta por 35 msicos -- estar em Covilh, Leiria, Paio Pires, Lisboa, Coimbra, Valena, volta a Vila Real e, finalmente, toca em Matosinhos. Entre os variadssimos patrocinadores destes concertos, encontram-se, entre outras empresas e instituies financeiras, a Rodoviria Nacional, Portucel, Shell Portuguesa, Sonae, Banco Totta & Aores e Montepio Geral. Entre os concertos previstos para Novembro, destaca-se o de dia 3, no Palcio Nacional de Queluz, quando a orquestra acompanhar o pianista Sequeira Costa, que interpretar o concerto n 1, opus 11, de Chopin, com o patrocnio da GALP. 
O rio Grande nasce no concelho de Torres Vedras e atravessa todo o concelho da Lourinh sendo as suas guas consideradas excessivamente poludas e imprprias para qualquer uso, segundo dados de 1987 da Direco Geral da Qualidade do Ambiente. Neste estudo so apontados como principais fontes poluidores as pecurias, matadouros e esgotos domsticos sendo observados vrios casos de contaminao de guas de fontanrios e poos. sua poluio deve-se ainda o desaparecimento dos famosos bivalves negros da Mexelhoeira, uma zona de rochas entre as praias da Areia Branca e do Peralta. As histrias da poluio do rio Grande correm toda a regio, desde o aparecimento de cadveres de animais na sua foz at ao boato de um surto de hepatite B que no ano passado afastou centenas de veraneantes. Na perspectiva da autarquia este quadro est em vias de melhorar pois existem j projectos para cinco estaes de tratamento domsticos das principais localidades do concelho e duas centrais de tratamento de dejectos das suiniculturas para posterior transformao em fertilizante, j candidatas a fundos comunitrios, isto para alm das vrias pequenas estaes de tratamento que esto previstas em todo o concelho. A Cmara Municipal aponta os complicados processos burocrticos como os grandes entraves para que tudo se concretize uma vez que a maioria dos apoios financeiros j estar garantida. A ajuda humanitria, que finalmente recomeou a mover-se em direco Bsnia central, pode revelar-se uma arma de dois gumes. Adia o sofrimento de uma populao esgotada e esfomeada, mas no o resolve de vez. Crescem os receios de que antes contribua para prolongar uma guerra devastadora e alimentar os florescentes circuitos do mercado negro. Quanto ao projecto paisagstico, as dvidas persistem. Armindo Cordeiro, da Cmara Municipal de Lisboa, afirma ainda no ter sido aprovado nenhum, mas que tal vir a acontecer a breve trecho, cabendo a responsabilidade ao Departamento de Espaos Verves, j que o projecto apresentado pela Junta de Freguesia foi recusado poe no reunir os requisitos de qualidade necessrios. Tambm Armindo Cordeiro peremptrio ao afirmar que o logradouro ajardinado no ser destrudo, mas pelo contrrio ampliado, assim como melhorar a situao dos moradores e lojistas da zona. Acabar com o desleixo e a degradao do parque actualmente existente, que no tem cuidados de jardinagem h muito tempo e que poder transformar-se em poiso de marginais e toxicodependentes. Ablio Loureno, da Comisso de Luta dos Professores de Educao Tecnolgica (Colpet), lembra que a anterior tutela se limitou a reconverter os professores de Trabalhos Oficinais em professores de Educao Tecnolgica, sem lhes dar qualquer tipo de formao. as professoras de Txteis s ensinam Txteis, os professores de Electrotecnia s do a sua matria e por a adiante. Nas aces de formao que a Colpet tem vindo a organizar, ficou provado que os docentes no cumprem os novos programas porque no lhes foi dada a necessria reciclagem. Como s dominam uma das matrias do programa, quando so confrontados com o enunciado de uma Prova Global de Educao Tecnolgica, os professores, na generalidade, no sabem resolv-la. A rapariga franzina faz o que pode, mas a voz to titubeante que o apresentador tem logo que ir buscar outra cantora multido. Percebe-se que a grande massa de gente est ali mais ou menos como o Padro dos Descobrimentos: para proporcionar SIC o seu show de iluso, a ideia de que uma multido entusistica rodeia o palco, o que serve s mil-maravilhas os primeiros planos das cmaras. 
As coisas animam-se decididamente com Crocodile rock de Elton John. Os cantores sucedem-se, Miguel ngelo j pula, por momentos dana Elvis e pe o concorrente a danar. Quantas Laura Pausini esto aqui esta noite?, pergunta. As duas primeiras jovens fraquejam e Miguel vai logo buscar outra. O novo parque urbano de Ermesinde, que comea a ganhar forma junto s runas da antiga Fbrica da Cermica (vulgo Fbrica da Telha), ficar pronto em finais deste ms. esta a expectativa assumida pelo presidente da Cmara de Valongo, Fernando Melo, empenhado em abrir aos ermesindenses a primeira fase deste novo equipamento de passeio e lazer antes que se escoe o presente mandato autrquico. Como o PBLICO pde constatar no local, a ideia transformar, nesse curto espao de tempo, um terreno que ainda se encontra revolvido e enlameado numa rea relvada cruzada por passeios e estadias. J podem ser vistas algumas das estruturas que vo integrar o parque, despontando no meio de um piso em rebulio. So os casos de alguns dos caminhos e do pequeno anfiteatro ao ar livre destinado a espectculos estivais. Hoje e amanh, decorrem na Sala Polivalente do Acarte as ltimas de Josa com o Violino Mgico dos London Puppet Players. Porto: no Carlos Alberto, continua Luzes de Palco, a ltima produo da Seiva Trupe . Que futuro para o livro na Europa o mote que preside a um frum internacional que a Associao Portuguesa de Editores e Livreiros (APEL) organiza, em 16 e 17 de Fevereiro, no Centro Cultural de Belm, em Lisboa. Para alm de representaes de vrias estruturas associativas europeias de editores e livreiros, est prevista a presena do comissrio europeu com a pasta da Cultura, Martin Bangemann, bem como de vrios membros do Parlamento Europeu. A discusso centrar-se- em trs grandes painis: a edio na sociedade europeia de informao, a dimenso cultural no Tratado da Unio Europeia e o mercado do livro na Europa: problemas de comercializao (preo fixo). O ministro da Cultura portugus, Manuel Maria Carrilho, encerra o debate. P. -- Qual a diferena entre trabalhar em teatro e em cinema? R. -- No teatro tenta-se encontrar a unidade do espectculo, cria-se um tempo, e no um momento -- cria-se um tempo fora do tempo. Em o cinema, essencialmente o que se apanha o momento de qualquer coisa. um trabalho sobre o detalhe. Foi em 1913 que a aventura teve incio. Jesse Lasky, a conselho de Sam Goldfish (depois Goldowyn) d a primeira oportunidade a DeMille para fazer um filme: The Squaw Man. O palco das filmagens seria New Jersey. Os sicrios do fara Edison, cujo trust controlava as patentes das mquinas de filmar foraram o grande xodo. Ei-lo de abalada para Flagstad, Arizona, com mais quatro cmplices, entre eles o actor Dustin Farnum. Desiludido com o local, DeMille segue mais para Oeste (seguindo, como os pioneiros, as palavras de Horace Greeley: Go west young man, and grew up with the country ) e assenta arraiais num velho celeiro comprado numa zona quase deserta: Hollywood. Se outros j por l andavam, fugidos ira faranica, a partir de ento que a zona comea o seu boom, tornando-se, em poucos anos, a capital dos sonhos. Nascia a nova Terra Prometida. O filme foi um xito e DeMille at ao fim da primeira grande guerra dirigiu e supervisou mais de metade dos filmes da sua carreira. Desde logo se manifestou a habilidade do regisseur controlando mais de um filme em simultneo. 
paradigmtico o caso de A Marca de Fogo (The Cheat), esse filme que marcou a crtica francesa e o cinema de vanguarda dos anos 20, que fez dele a sua bandeira e deu ao cinema a categoria de arte (a stima). A Marca de Fogo foi filmado em 1914 em simultneo com The Golden Chance, o primeiro de dia e o segundo de noite. Estreado um ano antes de Nascimento de Uma Nao, A Marca de Fogo uma obra surpreendente por tudo o que trouxe de novo em termos de linguagem cinematogrfica, nos enquadramentos e na iluminao de interiores. Mas o programa cultural abre logo s 22h00 do dia 3 de Julho, uma sexta-feira, com a actuao da Orquestra Ligeira do Conservatrio de Coimbra, na Praa do Comrcio, e encerra no mesmo local e mesma hora no domingo, dia 12, com os Cool Hipnoise. Este mesmo espao, no corao da cidade, acolhe ao longo da semana e sempre mesma hora um espectculo de msica e dana sevilhana (dia 6), o grupo Negros de Luz (dia 7), Fausto (dia 8), Cesria vora (dia 10) e Carlos do Carmo (dia 11). Muito perto, nas principais ruas da Baixa de Coimbra, a animao ser constante, garante a organizao. Aos saltimbancos, bombos e gigantones juntar-se-o, ao longo dos vrios dias de festa, a msica, o malabarismo, animaes de fogo, teatro e fantoches. P. -- H algo de verdadeiramente diferente em Where In The World, o seu ltimo trabalho? R. -- Penso que o fundamental que Where In The World o primeiro lbum mesmo da banda, s isso. O lbum funciona mais como uma afirmao, feito como um todo, no soa como trs ou quatro discos diferentes. No se construiu um mundo diferente para cada um dos seus temas, em estdio. quando, finalmente, conseguimos um disco realmente da banda, ficamos sem um dos msicos. A directoria de Faro da Polcia Judiciria encerrou na passada quarta-feira, aps denncia, e na sequncia de investigaes que estavam a ser feitas no Algarve sobre o aliciamento de mulheres para a prostituio, um bordel na zona de Ferreiras. Na operao foram detectadas, entre outras, quatro jovens com idades compreendidas entre os 16 e os 18 anos, que eram foradas, segundo a PJ, a exercer a prostituio. O PSD decidiu deixar hoje pronta a Lei da Greve. Para isso, depois do debate na generalidade agendado para hoje, pedir a avocao dos artigos do seu projecto-lei, de maneira a deixar feita a discusso na especialidade, despachando assim esta questo antes de frias. Em protesto contra esta pressa social-democrata, que deitar por terra o projecto do CDS, a oposio decidiu no dar acordo para que a Lei Orgnica do Ministrio Pblico, que regressou ao Parlamento ferida de inconstitucionalidades, seja expurgada antes do prazo regimental. Assim, a aprovao desta lei, prevista para amanh, obrigar realizao de uma sesso extraordinria, j marcada para a prxima quarta-feira. Um lder sempre tem os seus fiis. Mas, com a normal tendncia para a reduo da realidade, passou-se a falar no apenas na Escola do Porto, conceito com origem numa falcia regionalista j muito afastada, mas numa j mesquinha e inexistente Escola do Siza. A arquitectura de Siza Vieira no o permite, talvez porque em todo o lado seja prprio da liderana cultural, o no se submeter s regras que ela prpria cria. Como estilo que nunca se chega a definir, como caminho que est sempre interminado, a obra do mestre mais a permanente inflexo que a coerncia inabalvel. Apaixonei-me imediatamente pela Casa dos Espritos. Telefonei mil vezes para a agente de Isabel Allende, e finalmente consegui, isso foi nos Estados Unidos, onde forcei Isabel a ver Pelle, o Conquistador antes da estreia na Dinamarca. 
Isabel respondeu-me que ia pensar na proposta. No dia seguinte ligou para mim, e disse que estava interessada. este filme mesmo internacional. Mas a adaptao ao cinema foi fcil. A minha ambio era a de contar exactamente a mesma histria da de Isabel. Depois tive que encontrar os melhores actores que h neste mundo, o que tambm foi muito complicado e fascinante, e eu acho, sinceramente, que conseguimos criar uma famlia credvel para a Casa dos Espritos. Optvamos por filmar em Portugal porque era demasiado difcil no Chile, onde as autoridades no vem com bons olhos, ainda hoje, uma recriao cinematogrfica do golpe de estado contra Salvador Allende. mas sairia muito caro. E devo dizer que a parte portuguesa da nossa grande equipa tem sido eficaz e prestvel. As autoridades portuguesas, entre elas os militares, tm sido muito abertas, concluiu Bille August que ainda revelou que A Casa dos Espritos ter a sua primeira estreia na Alemanha em Outubro deste ano. Calmamente, sem grandes foguetes de marketing, acaba de acontecer um daqueles pequenos passos na informtica que podem significar uma grande revoluo para o mundo da comunicao tal como o conhecemos. Trata-se da aliana entre a Adobe, a mais importante empresa no ramo da concepo de produtos para edio electrnica e a Netscape, que em seis meses capturou 75 por cento do mercado de programas de navegao na Internet com o produto que lhe deu o nome. Msica irlandesa em vora: os Wingers tocam na Feira de So Joo, naquela cidade. s 21 h. No mbito do Ciclo Jovens Compositores, organizado pelo Acarte da Fundao Gulbenkian, pode escutar-se a obra ...H Dois Ou ..., de Antnio de Sousa Dias. Os intrpretes so Joo Natividade (movimento), Lus Madureira (voz), Olga Pratts (piano), Pedro Wallenstein (contrabaixo), Antnio de Sousa Dias (percusso) e Clemente Cuba (desenho de luzes). s 21h30. A atleta e o seu tcnico, Joo Campos, elegeram a corrida de 10.000m como um meio de ela se sagrar campe olmpica. Apesar de ser a recordista mundial dos 5000m, nos 10.000m que a Fernanda se exprime melhor, considera Campos. Fernanda Ribeiro no se aterroriza sequer com as condies climatricas que, decerto, ir encontrar -- muito calor e humidade. O clima igual para todos e no me preocupa muito. Iremos com antecedncia para nos adaptarmos. As mais temveis devero ser as do costume, em especial a etope Tulu. No Word, o salto da verso 2.0c para a 6.0 significou uma alterao profunda na orientao do programa. Em vez de acrescentar uma infinidade de caractersticas novas, o novo Word privilegiou a consistncia de funcionamento e a facilidade de acesso e de aprendizagem. Um dos objectivos do novo Word foi tambm assegurar a mxima coerncia entre as verses para Windows e para Macintosh. Desde o manual (que o mesmo para os dois produtos) ao formato de ficheiros, tudo dever contribuir para que a mudana entre estas duas plataformas no oferea qualquer problema. por exemplo, o excelente search and replace tem caractersticas que s existiam na verso 5.1 para Mac. O WordPerfect, por seu lado, ao passar de 5.2 para 6 tornou-se praticamente num novo produto. Foi, dos trs, o que teve a alterao mais radical e, por isso, o que apresenta a maior quota-parte de problemas. Quando as imagens do Cobe foram publicadas, elas emocionaram toda a gente. Afinal elas representavam o embrio do nosso Universo. Um embrio to primitivo que quase parecia impossvel recuar mais no tempo. No entanto, Smoot pensa que vai ser possvel ir ainda mais longe. Penso que possvel recuar ainda mais no tempo, diz o astrofsico. 
Acho que vai ser possvel aprender algo mais sobre a inflao do Universo e sobre o seu princpio. Acho que mesmo possvel chegar at ao Big Bang. Em Coimbra, os estudantes marcaram um golo defesa menos batida da prova, marca suficiente para alcanar o triunfo e justificar a recuperao das ltimas jornadas, que j levou a equipa a ascender ao quarto lugar na competio aps um mau incio de poca. A liderana agora repartida entre Tirsense e Rio Ave, ambos com onze pontos, contra dez do Campomaiorense, que foi empatar (2-2), no sbado, Madeira, frente ao Nacional, e da Acadmica. Logo atrs, surge o surpreendente Desportivo das Aves, que foi golear fora o Sporting de Espinho, por 3-0, e a Ovarense -- empate a duas bolas no Algarve com o Louletano --, ambos com nove pontos. A diferena mnima entre os primeiros reflecte uma vez mais o equilbrio da prova, tanto mais que o primeiro e o dcimo quarto da classificao geral esto separados somente por quatro pontos. Referncia maior para o Espinho, que, depois de, na temporada passada, ter disputado a I Diviso, est a ter um comportamento desastroso, pois o lanterna vermelha. Cinco pontos em oito jogos muito pouco para um clube que procura o regresso prova maior do futebol portugus. Entretanto, os espinhenses disfrutam da companhia do Penafiel, que foi perder a Viseu por 3-1 e est a repetir a m campanha da temporada anterior. Nos restantes encontros, o Torrense foi garantir o empate (0-0) em Matosinhos perante o Leixes, enquanto as vitrias tangenciais (1-0) do Unio de Leiria sobre o Lea e do Chaves contra o Felgueiras, vieram elevar para 22 os golos marcados nesta 8 jornada, que forneceu quatro vitrias dos visitados, quatro empates e apenas uma vitria fora. O escudo esteve bastante pressionado, devido sobretudo a vendas especulativas por parte de bancos estrangeiros, e as frequentes intervenes do Banco de Portugal apresentaram-se eficazes, quebrando a tendncia de queda, por vezes acentuada, da divisa nacional. R. -- Era o meu ano de agregao em Filosofia; conclui-a nesse ms na Sorbonne comentando uma frase de Einstein sobre a compreensibilidade do mundo. Mas nunca fui um dirigente do movimento, apenas um simples peo. Tenho 46 anos, ou seja, perteno exactamente a essa gerao, facto de que me orgulho muito. Penso que em relao s minhas filhas, que tm hoje 23 e 20 anos, tive muito mais sorte, apesar de ter crescido numa sociedade que era certamente mais autoritria que a delas. As relaes entre pais e filhos so hoje de melhor qualidade. P. -- As suas filhas compreendem o que se passou em Maio 68? Compreendem o que que vocs queriam? No momento em que a Unio Europeia decidiu abandonar a explorao do carvo de pedra, existem cinco mil mineiros portugueses no Norte de Espanha, nas provncias de Len e das Astrias, condenados a assistir ao encerramento das minas onde trabalham. Partiram para um El Dorado, deparam-se agora com a perspectiva de uma reforma antecipada ou do desemprego forado. A UE o principal parceiro comercial da Rssia, representando 37 por cento do total das trocas comerciais contra 24 por cento no caso das restantes ex-repblicas soviticas, ou quatro por cento com os Estados Unidos. Com exportaes para a UE de 15,5 mil milhes de ecus e importando o equivalente a 11,5 mil milhes, a Rssia mantinha em 1993 um excedente comercial, face aos europeus, de quatro mil milhes de ecus. 
No de estranhar, porque entre os comensais reunidos no dia 29 de Agosto em casa do romancista norte-americano William Styron, prmio Pulitzer, em Martha's Vineyard, uma ilha ao sul de Boston, Massachusetts, no se encontrava apenas o Presidente dos Estados Unidos, mas sobretudo esses dois grandes obcecados dos livros que so o Nobel colombiano Gabriel Garca Mrquez e o mexicano Carlos Fuentes, ltimo prmio Prncipe das Astrias de Literatura. Toda a gente acreditou que essa reunio, realizada em plena crise dos balseros cubanos, tinha sido programada para falar da nova crise aberta entre Cuba e os Estados Unidos e, dada a personalidade dos convivas -- Garca Mrquez tem uma relao estreita com Castro e Fuentes defende o fim do embargo norte-americano para que se inicie uma nova etapa no longo contencioso da ilha caraba com o seu poderoso vizinho --, deu-se como ponto assente que Cuba tinha de ter sido o assunto. O carro est bem equilibrado e estamos esperanados para a corrida. Confesso que entrei depressa demais. O carro atravessou-se, primeiro para a direita e depois para a esquerda, o que me levou a tirar o p do acelerador. Quanto a Pedro Matos Chaves, o piloto portugus esteve uns furos abaixo do habitual, no conseguindo com o seu BMW Srie 3 melhor que o oitavo tempo, com 1m14,667s, o que o deixou fora da super pole. O carro alemo teimou em no se adaptar ao traado do circuito, o que deixou Chaves algo desalentado, at porque experimentava em Barcelona algumas novidades aerodinmicas que deveriam melhorar as performances do seu carro. So jovens, de facto. Tambm por isso, na FPJ pensa-se j em Sydney. E espera-se o mesmo de sempre. Que para o prximo ciclo olmpico o projecto no seja para dois, mas para quatro anos. P. -- Qual foi a opinio que j exprimiu aos parceiros (e ao Governo) sobre a concertao estratgica proposta pelo Plano Cavaco? R. -- H muito que defendo que os acordos de concertao social devem ultrapassar as dimenses temporal e de contedo que tm tido. Os acordos de concertao social tm vigorado sempre por um ano. Agora temos uma proposta, inovadora, de mdio prazo, ajustada a todo este ciclo que o pas vai atravessar de 1995 a 1999. Parece-me lgico ligar um acordo de concertao social ao Quadro Comunitrio de Apoio (QCA) e ao PDR, na vertente especfica de um acordo de concertao social e, mais do que isso, porque o primeiro-ministro afirmou aqui, na reunio, que est disposto a que o prprio acordo contemple medidas de acompanhamento e de avaliao do QCA e do PDR, visto que pode acontecer que os cenrios macro-econmicos e macro-sociais se alterem face ao previsto no PDR e no QCA. E essas alteraes devero ser discutidas com os parceiros sociais e no apenas decididas pelo Governo. Dois empresrios, um chins de Hong Kong e outro portugus, encontram-se para falar de negcios. O primeiro comea por explicar que est ligado construo de um prdio para habitao e que tem tido dificuldades em encontrar uma fornecedora de material sanitrio nas medidas exigidas. Porqu? Porque a exiguidade de espao em Hong Kong tal que at um metro quadrado de um apartamento para habitao social custa ouro, como quem diz, entre 600 e 800 contos. Para que se aproveite o espao ao mximo, h zonas que tm de ser encolhidas, o que acontece com a casa de banho. Por isso, em Hong Kong as medidas-padro ficam muitos centmetros abaixo. Ao princpio, quando vi ali o carro, sempre pensei que fosse o engenheiro que vinha vistoriar a obra. Depois, como ele nunca mais saa, comecei a espreitar. 
Foi ento que o vi tirar a boina e umas barbas pretas, contou Jos Lopes. O pedreiro adiantou que ainda esteve tentado a dirigir-se ao carro e dizer ao fulano que o Carnaval j acabou. S no o fez porque o empreiteiro da obra o demoveu. Mais tarde, o Ford Fiesta voltou a parar no mesmo local -- esquina da rua da Penso Beira Baixa, onde Carlos Miguel almoou, e tambm muito prximo de um eucaliptal onde o rapto se consumou. S que, desta feita, o homem, em vez de a barba, tinha bigode, a boina desaparecera e trazia culos escuros. Jos Lopes garante ser o mesmo que vira durante a manh e que estava a fazer uma chamada atravs de um telemvel. O estudante nunca falsifica uma assinatura, deixa descansar o seu encarregado de educao, que v em ele um grande futuro. O estudante no bebe, saboreia. Brennan e Laurent Filipe revelam uma maior segurana instrumental, a bateria de Accio Salero (visiense apostado em desmentir a macrocefalia nacional que no jazz pouco menos do que ditatorial) mantm um grau de ateno ao que se passa sua volta absolutamente indispensvel afirmao de qualquer bom jazzman, e o contrabaixo de David Gausden, embora fraquejando quando lhe compete marcar o sentido da marcha, cumpre razoavelmente a sua funo colectiva. Uma palavra final para sublinhar trs elementos essenciais, clara e inteligente -mente valorizados por Patrick Brennan: o prazer e a alegria de tocar (sem os quais, alis, a msica de Monk inacessvel), relembrando que, mais do que um emprego, o palco um local de paixo; o peso do humor no desenvolvimento das notas e, acima de tudo, a sua presena peripattica, mesmo quando instrumentalmente afastado do discurso, danando os silncios e coreografando os tempos. esse um dos grandes segredos da msica do pianista (cujos bailados foram, quase sempre, interpretados como uma mera excentricidade exibicionista). que a danar que Monk se entende. O croata Goran Ivanisevic, o norte-americano John McEnroe, o checoslovaco Petr Korda, o francs Henri Leconte e o holands Richard Krajicek foram os primeiros a assegurar a passagem segunda ronda da Taa do Grand Slam em tnis, competio que est a ser disputada em Munique (Alemanha) e que est dotada com seis milhes de dlares (cerca de 840 mil contos) em prmios. Ivanisevic, n 4 do ranking mundial, encontrou algumas dificuldades para bater o francs Guy Forget (7-5 e 6-4), acabando por se impr na batalha do servio. O francs conseguiu dez ases no encontro, contra 17 do croata, que est perto de atingir a incrvel marca de mil ases este ano. Ivanisevic vai defrontar na segunda ronda o norte-americano John McEnroe, que eliminou o sueco Niklas Kulti (6-1 e 6-4). McEnroe, que nas ltimas semanas tem evitado os jornalistas, depois de ter admitido dificuldades no seu casamento com a actriz Tatum O'Neal, voltou a jogar o seu melhor tnis, empolgando a assistncia com todo o seu repertrio de pancadas espectaculares. Henri Leconte, que venceu o sul-africano Wayne Ferreira (3-6, 6-3 e 6-0), vai defrontar na prxima ronda o vencedor do encontro entre Pete Sampras (EUA) e Alexander Volkov (Rssia). Tambm Petr Korda no encontrou facilidades para derrotar o australiano Wally Mansur (2-6, 7-5 e 6-4), esperando agora pelo norte-americano Michael Chang, que bateu o seu compatriota Andr Agassi (6-4 e 6-2). Por fim, o jovem holands Richard Krajicek teve o encontro mais fcil desta primeira ronda, batendo o espanhol Emlio Sanchez (especialista em pisos mais lentos) por 6-3 e 6-2. Na prxima jornada, Krajicek defrontar o vencedor do encontro entre Stefan Edberg (Sucia) e Michael Stich (Alemanha). 
Eles representam mais um aborrecimento do que uma ameaa para o Governo, comentou Sergio Ramirez, o sandinista que preside ao Parlamento. O perigo a internacionalizao do conflito, por causa da instabilidade que os grupos armados causam nas zonas onde operam. Tony Bryant, lder dos Comandos L, uma organizao hostil ao regime de Fidel Castro e sediada em Miami, admitiu ao Washington Post que est a ajudar os Recontras na luta contra Violeta Chamorro e os sandinistas. O corpo do escritor francs Alain Fournier, autor de O Grande Meaulnes, morto no comeo da I Guerra Mundial, foi formalmente identificado, anunciou ontem a Direco Regional de Assuntos Culturais Franceses, na cidade de Metz. Os despojos do escritor francs era um dos 19 cadveres descobertos em Novembro, na regio de Verdun, local da batalha contra o exrcito alemo onde se sabia que Alain Fournier tinha sucumbido. O autor de O Grande Meaulnes (edio portuguesa na Relgio d'gua) pertencia ao 288 Regimento de Infantaria que a combateu. O seu corpo foi identificado por antrometria, sem qualquer margem para dvidas, pela equipa chefiada pelo antroplogo Fredric Adam, que comprovou a estatura do esqueleto, a sua idade data da morte e a compleio fsica, por comparao com documentos da poca. Contra a lenda que falava da morte por tortura do escritor, s mos de soldados alemes, os exames efectuados por Adam demonstram que a morte de Henri Alban Fournier, dito Alain Fournier, se deveu a impactos de balas. Foi em 22 de Setembro de 1914. R. -- No, de todo. O que acontece que, na altura em que foram feitas as estimativas das receitas no oramento, era j sabido que o cenrio internacional adiantado pelo Fundo Monetrio Internacional poderia ser optimista. Isso implicou uma maior incerteza e estimativas acrescidamente prudentes. A confirmao de que as perspectivas de crescimento no so, afinal, to favorveis s veio validar essa abordagem. P. -- Quer dizer que nunca se acreditou num crescimento de trs por cento e que a estimativa da receita foi feita, desde o incio, a pensar em dois por cento ou menos? Os outros operadores so os membros negociadores (intervm na negociao em bolsa, introduzindo directamente no sistema ofertas de compra ou de venda) e os membros compensadores, que alm de as funes dos anteriores, participam, designadamente, nos procedimentos de liquidao de contratos nas datas de vencimento e no processo de constituio, substituio, reduo e libertao de garantias. O futuro sobre OT-10 (uma taxa de juro de longo prazo) conta, partida, com nove market makers, significativamente mais do que os cinco que cumpriro indntica funo para o futuro sobre o PSI-20 (um ndice bolsista calculado para uma carteira de 20 ttulos). O contrato sobre a Lisbor a trs meses (taxa de juro de curto prazo, correspondente mdia das taxas praticadas no mercado monetrio interbancrio) s comear a ser negociado no incio de Julho mas j tem garantidos seis market makers. Segundo o Instituto Nacional de Estatstica (INE), Portugal exportou 265,2 milhes de contos e importou 521 milhes de contos, durante os primeiros cinco meses de 1995, o que resultou num dfice comercial de 255,8 milhes de contos. A taxa de cobertura das importaes pelas exportaes melhorou, situando-se em 50,9 por cento, contra 45,1 por cento nos cinco primeiros meses de 1994. Em Maio verificou-se um aumento de 19,3 por cento nas exportaes e uma queda de 4,3 por cento nas importaes, o que permitiu uma diminuio de 19,2 por cento no dfice comercial mensal, em relao ao mesmo ms de 1994. 
As compras aos PALOP (Pases Africanos de Lngua Oficial Portuguesa) aumentaram 45,5 por cento de Janeiro a Maio deste ano, face a idntico perodo de 1994, enquanto as vendas ao Japo subiram 34,6 por cento no mesmo perodo. O caso tem sido absolutamente devastador para a agncia .. tanto em termos de pblico como internamente, disse um funcionrio da Administrao. Temos de o ultrapassar ...a fim de restaurar a confiana do Congresso e do pblico. Creio que era tempo de Woolsey sair. E deu como motivos ele no haver despedido quadros que no conseguiram detectar a tempo o trabalho de sapa que Aldrich Ames andava a realizar. A banda sonora de Thirtysomething a de uma srie televisiva, domnio onde a msica vem sendo tratada pior que no cinema. Claro que tambm faz os usuais recursos a fundo de catlogo, mas logo a esse nvel h uma selectividade fora do comum, e os temas repescados de Ray Charles como de Rickie Lee Jones so no apenas canes clssicas, como concordam absolutamente com o esprito da srie. Mais importante que isso, a msica composta por Snuffy Walden no um mero adereo, mas surge como outro personagem de Thirtysomething, de algum modo participando do seu enredo e da sua lgica, mas com a diferena de ser de carcter musical. Predominantes no trabalho de Walden so guitarra e piano, mas os instrumentos que usa so relquias do incio do sculo passado, porque estes tm em seu entender mais carcter, ou, se se preferir, um som mais distintivo. Empregou, para alm disso, tons ressonantes e profundos, de forma a aumentar a densidade emotiva. Da resulta uma msica com uma individualidade peculiar, um misto de pureza e de permeabilidade, mas ao mesmo tempo com um sentido de passado e de amadurecimento. O ditador ter sabido evitar, assim, as trs caractersticas que marcaram os regimes totalitrios de Hitler e Estaline: modernizao, mobilizao e expanso. Salazar quis reinserir Portugal na sua tradio, no queria a modernizao mas a secularizao, disse Braga de a Cruz. Tambm no tinha veleidades de mobilizao, pois o seu grande intuito foi despolitizar, baixar a febre poltica. Ele herdou um imprio colonial e o seu objectivo foi nacionaliz-lo, conserv-lo. No h nada nela que obrigue o regime a ser ditatorial. Nem sequer havia proibio de partidos. Como que, ento, aconteceu a ditadura? Pela via administrativa. Atravs da policializao do Estado e do esvaziamento do sistema representativo. Com o tempo, o poder legislativo foi transferido para o Governo, esvaziando os poderes da Assembleia Nacional, que funcionou apenas trs vezes por ano, disse Braga da Cruz, que, apesar disso, encontra algum pluralismo no salazarismo. A Unio Nacional era uma grande frente onde Salazar tolerava algum pluralismo orgnico, nomeadamente a Causa Monrquica, que, em 1961, se no fosse o incio da guerra em frica, teria mesmo aparecido como alternativa Unio Nacional. Kadhafi, da Lbia, far o mesmo. A Bolsa cai e o preo do petrleo sobe. As pessoas que assistem a tais sesses saiem loucas, histricas, falam alto, as crianas choram, observa a administrao daquele condomnio. Todas estas prticas deveriam ter locais especficos. Segundo se julga, o cinema foi licenciado para exibio de filmes e no para culto, defende. A finalizar, o protesto solicita uma interveno do presidente da Junta de Freguesia, a fim de esclarecer esta situao ou, se possvel, acabar com as sesses. O autarca remeteu cpias do ofcio delegao de sade responsvel pela rea de Alverca e administrao da Lusomundo, empresa proprietria da sala de cinema em causa. Afirmaes extraordinrias exigem provas extraordinrias. 
Drosnin e muitos dos seus seguidores aceitam acriticamente o episdio Rabin como prova extraordinria do cdigo da Bblia. Mais grave, a afirmao dos media de que foi demonstrada cientificamente a realidade dos cdigos. no s as tcnicas utilizadas no demonstram nada, como as concluses se baseiam em falcias, como todo o processo nada tem a ver com Cincia, ao contrrio do que as mquinas de propaganda pretendem fazer crer. A tcnica utilizada por Drosnin muito simples. Na descrio que se segue realizo algumas simplificaes, bem como uma adaptao do hebraico para o portugus. Passo 1: pegue num texto qualquer, de preferncia grande. Passo 2: elimine todas as vogais, espaos e pontuao, ficando apenas com uma longa cadeia de consoantes. Passo 3: pense num nmero inteiro, digamos 7. Passo 4: faa uma bola em torno de uma consoante qualquer da sua cadeia, e a partir dessa em torno de todas as consoantes contadas de 7 em 7 (7, 14, etc.), construindo uma nova cadeia de consoantes. Passo 5: pegue na nova cadeia e tente construir uma ou vrias palavras com sentido introduzindo vogais onde quiser. Se sim, BINGO! Acertou. Se no, repita os passos 3 a 5 variando o que quiser at conseguir alguma coisa interessante. Pode, por exemplo, comear a nova cadeia em qualquer das milhares de consoantes sua escolha. Ou, em vez de contar de 7 em 7, pode contar de 33 em 33. Ou acrescentar vogais diferentes. Tem milhares de milhes de variaes ao seu dispor. o nome de Rabin, na previso de Drosnin, aparece contando cada 4772 letras a partir da 4333. A abertura da nova Ponte Vasco da Gama parece estar a influenciar de uma forma positiva o trnsito na cidade de Vila Franca de Xira. A Polcia de Segurana Pblica vila-franquense efectuou, durante o ms de Abril, um conjunto de recolhas de dados sobre a circulao na cidade e detectou uma reduo de trfego da ordem dos 20 por cento comparativamente com as mdias registadas em 97. A moo apresentada pela bancada socialista defendeu com rigor o projecto do Governo para a criao de regies administrativas, considerando-o uma boa base de trabalho, e acrescentou-lhe argumentos para impor a necessidade da sua concretizao. Orlando Magalhes recorreu a dados estatsticos para provar que o Norte do pas, apesar do seu peso econmico, tem sido a regio menos beneficiada pelos fundos comunitrios, em relao s zonas do Sul e, por isso, menos desenvolvida. As assimetrias regionais foram, de resto, os argumentos base para defender uma moo que considera a regionalizao uma reforma essencial ao Estado. A bancada da CDU concordou e reforou politicamente o teor do documento socialista. A regionalizao, alm de promover o desenvolvimento, favorece a democracia participativa atravs do voto popular, disse o comunista Antnio Graa. Eu nasci e vivi em Vila Real at aos 17 anos e por isso senti na carne o isolamento do interior. No entanto, o deputado popular acusou receios de bairrismos exacerbados e acabou por seguir a linha do lder do partido, ao rejeitar a criao de regies em favor de mais poder para as autarquias. O presidente da Cmara de Esposende, Alberto Figueiredo, anunciou ontem, que vai pedir ao ministro das Finanas um inqurito a todos os fundos comunitrios que as suas empresas receberam nos ltimos anos. O pedido, feito durante uma conferncia de imprensa, surge na sequncia das insinuaes que o candidato do PP, Franklim Torres, lhe fez de ter utilizado o seu cargo de presidente da autarquia para obter este tipo de apoios do Governo. 
Figueiredo, que se recandidata a um terceiro mandato, considera que o seu adversrio est a insinuar corrupo e que a sua dignidade foi posta em causa, por isso no tem outro caminho seno esclarecer estes processos para que no fiquem dvidas. Alguns estudos em que foram usadas definies de depresso mais restritas do que as internacionais apuraram taxas mais baixas -- de trs a cinco por cento. No entanto, este especialista, psiquiatra no Hospital de Santa Maria, considera que no h razes mdicas, culturais ou sociais que nos permitam pensar que a frequncia da depresso seja diferente no nosso pas, pelo que os dados internacionas podem ser extrapolados com alguma segurana. As principais preocupaes mdicas da actualidade so, segundo Filipe Arriaga, a elevada morbilidade associada depresso e ao suicdio. A depresso o principal factor de risco do suicdio, est em primeiro lugar nas causas que levam ao acto suicida, acrescenta. O PRIMEIRO-MINISTRO israelita, Yitzhak Rabin, e o chefe da Autoridade palestiniana, Yasser Arafat, conferenciam amanh em Erez, ponto de passagem entre a Faixa de Gaza autnoma e o Estado de Israel, afirmou ontem agncia noticiosa France Presse um conselheiro do chefe da OLP. A pea do jovem dramaturgo Sergi Belbel (Prmio Nacional da Literatura Catal em 91, 92 e 93) tem muito de futurista. Mas o futuro de que fala mais ou menos prximo. Represso, discriminao, transgresso so as molas da comdia posta em cena por Ins Cmara Pestana. O repressor apela para o transgressor e vice-versa. A represso do tabagismo, por exemplo, atrai a transgresso. E dessa transgresso que nasce o espectculo cruel do Teatro do Sculo. Uma verdadeira dana da morte. Ou da vida tomada como um desporto de altssima competio. proibido fumar. Mas toda a gente fuma (s escondidas) incluindo os proibicionistas. Se, espectador distrado, os nomes de actores como Teresa Roby, Ins Cmara Pestana, Antnio Filipe, Elisabete Piecho, Manuela Pedroso, Marina Albuquerque, Orlando Sgio e Rui David no te dizem nada, aproveita a noite de hoje (s 22h) para ir Rua do Sculo ver o prodgio de uma companhia em que os actores parecem ligados corrente. Porque corrente magntica no falta encenadora, que, em espectculos como Crimes e Interldios, Carcias, Kvetch e Zucco, fez do Teatro do Sculo uma referncia incontornvel no panorama do teatro portugus nos anos 90. O SECRETRIO-GERAL da Organizao das Naes Unidas (ONU), Butros Butros-Ghali, deixou ontem ao princpio da tarde Luanda com a promessa de que at ao fim de Agosto devero chegar a Angola as unidades de infantaria que faltam para se completar a Misso de Verificao (Unavem III). Ele pintor, 26 anos, chamado Johnson, ela professora, 28 anos, Edel de seu nome, entraram no carro de um taxista de apelido So Pedro, 40 anos. A troca de palavras menos agradveis comeou logo, antes de arrancarem, por causa da curta distncia da viagem. No sinal vermelho seguinte, a coisa ficou to feia que o casal abriu as portas e saiu do carro. O taxista no se ficou e, com a segurana que a dimenso do seu porte lhe d, foi atrs deles, exigindo o seu dinheiro. A actual administrao da TVI, presidida por Miguel Paes do Amaral, lder do grupo SOCI, e a Antena 3 espanhola assinaram ontem um acordo de parceria estratgica, em que se comprometem a duplicar o actual capital social do quarto canal portugus. A Media Capital (do grupo SOCI), que gere a TVI, apresenta assim mais um argumento para a assembleia de credores que se realiza no prximo dia 14. 
Nessa assembleia devem ser apresentados, pelo menos, dois projectos de recuperao da TVI: o da Media Capital e da Antena 3 (a principal televiso privada espanhola) e o do grupo Lusomundo, associado ao empresrio macaense Stanley Ho. De momento, ambos os grupos esto em contactos com os vrios credores para assegurarem o respectivo apoio, pois necessrio que os votos favorveis a uma das propostas venham de um conjunto de credores que detenham pelo menos 75 por cento da dvida da TVI, que ascende a 17 milhes de contos. Entre as mais belas fotografias de cinema do mundo as da agncia Magnum ocupam o primeiro plano. so as que procuras os actores, mas tambm os maridos e mulheres e amigos; as cenas de repouso, o cinema no trabalho. Estas imagensque publicamos de Cartier-Bresson, Robert Capa, Eve Arnold ou Dennis Stock, entre outros, demonstra, uma vez mais, a vocao totalizadora da Magnum, a capacidade em acompanhar de perto fenmenos mais importantes da histria da humanidade. Uma exposio de fotografias, concebida em conjunto com o livro Magnum-Cinema, poder ser vista em Fevereiro, em Lisboa, na Culturgest. Ter como amigos pessoas sbias, bem sucedidas e eventualmente belas um privilgio. Digamos que a qualidade dos amigos, para alm da qualidade da sua amizade, constitui para mim um factor importante de qualidade de vida. Que o meu querido amigo Joo Carlos Espada tenha acesso aos sabores duma das tradicionais bolsas da erudio ocidental constitui um contributo, no despiciendo, para a melhoria da minha qualidade de vida. Mais tarde ou mais cedo, quando nos encontrarmos, usufruirei, eu tambm, das experincias de que ele agora desfruta. Entretanto, eu -- e os milhares de pessoas que o leram no PBLICO (da passada segunda-feira) -- vou-me contentando com as suas Cartas de Inglaterra. Um quarteto formado por Bob Mover (sax alto e voz), Carlo Morena (piano), Pedro Gonalves (contrabaixo) e Joo Silvestre (bateria) actua a partir das 23h, na catedral lisboeta do jazz: no Hot Clube de Portugal. Uma autntica big band, constituda por 16 msicos, com alguns dos temas mais conhecidos do reportrio standard liderada pelo trombonista Claus Nymark e com a voz de Ana Paula Oliveira: no Speakeasy (Cais da Rocha Conde d'bidos -- Armazm 115), s 23h e 01h. Alguns milhares de trabalhadores afectos CGTP desfilaram ontem pela baixa de Lisboa em protesto contra a poltica econmica e social do Governo. Um boneco cabeudo baptizado de Santo Cavaco foi a estrela da manifestao, que partiu dos Restauradores e foi at Praa da Ribeira, depois de interromper o trnsito das Ruas do Ouro e da Prata. No podemos permitir que a contratao colectiva continue bloqueada, que o desemprego continue a aumentar, que a segurana social, a sade e a educao continuem a degradar-se. Somos obrigados a tornar os nossos programas o mais interessantes possvel, num ambiente onde existem mltiplas escolhas, refere Victor Neufeld, o produtor executivo do programa 20/20, da ABC, citado pela Associated Press. Conseguir um bom exclusivo pode significar a entrada de milhes de dlares em publicidade. Da que Shapiro tenha ficado to orgulhoso com o exclusivo dos pais dos sete irmos gmeos. E, para que fique registado, o produtor afirma que a NBC no pagou ao casal McCaughey. Shapiro acredita que eles concordaram em aparecer no programa Dateline, da NBC, pura e simplesmente porque gostam de o ver. Quanto presena dos seguranas porta de casa deles, foi a NBC que os contratou para manterem os paparazzi distncia durante a entrevista. 
Seriam 9h30 quando os trs homens, de quem no se conhecem dados identificativos, surgiram de armas na mo e rosto coberto porta do BNU de Massam, uma dependncia recentemente inaugurada e localizada na denominada Sexta Fase da Urbanizao Pimenta e Rendeiro (rea que na sua maioria ainda est em construo). Os assaltantes atraram a ateno de uma das funcionrias, que deu de imediato o alarme, pelo que o gerente conseguiu bloquear a porta de entrada ainda antes desta poder ser aberta pelos suspeitos. Aps verificarem que a entrada havia sido bloqueada, os trs homens fugiram de imediato para um automvel Citren AX, de cor branca, cujas letras da matrcula so AJ, e tomaram a estrada que conduz a Queluz. Segundo o comandante da GNR do Cacm, entidade a quem primeiro foi comunicada a ocorrncia, no foi consumado qualquer acto violento. Quanto a saber donde que vem a nossa preferncia quase unnime pela mo direita, a questo permanece em aberto. Tanto mais quanto, hoje em dia, o hemisfrio esquerdo (dos dextrmanos) j no considerado como o hemisfrio cerebral dominante, nem como o nico capaz de desempenhar altas funes mentais e de controlar movimentos precisos e subtis. Sabe-se que o hemisfrio direito tambm essencial para funes mentais de altssimo nvel, tais como o raciocnio, a memria, o reconhecimento dos rostos, das melodias musicais, etc. Uma explicao possvel da hegemonia dextrmana poder ser o facto de, nos dextrmanos, o hemisfrio cerebral esquerdo possuir ligaes nervosas sensoriais e motoras para os dois lados do corpo, enquanto o hemisfrio direito est quase s ligado a um nico lado do corpo. Isto poder significar, simplesmente, que o controlo voluntrio da mo direita mais fcil do que o da mo esquerda. As delegaes do Governo de Luanda e da Unita rubricam o protocolo das conversaes de Lusaca para que haja paz em Angola. O acordo definitivo dever ser assinado dentro de quinze dias. Marques Mendes, ministro-adjunto da presidncia de Conselho de Ministros com a tutela da comunicao social, ameaa proceder a alteraes legislativas caso as televises no cheguem a acordo sobre a passagem de imagens de alegada violncia. O provedor recomenda, assim, a imediata transferncia do agente da PSP para um estabelecimento prisional tutelado pelo Ministrio da Justia e que permita compatibilizar as exigncias de segurana com o exerccio dos direitos conferidos a qualquer recluso em ambiente prisional normal. A falta de conscincia do sentido de ridculo ainda recentemente foi dada, em declaraes ao PBLICO, por um douto professor universitrio que classificou liminarmente de mau plano um PDM elaborado dentro de uma cmara municipal com uma equipa tcnica de qualidade e fortemente assessoriada por urbanistas experientes. Fazer uma gesto urbanstica inteligente, dialogante, eficaz, informada e com bons resultados prticos, exige uma tenacidade e um talento que no esto ao alcance de tcnicos cinzentos e submissos que o sistema inevitavelmente prefere, produz e atrai. O incndio que deflagrou no final da semana passada nos concelhos de Mao, Abrantes e Sardoal, foi dado como extinto pelas 12h00 de ontem. O jogo dos campees escoceses, apesar das mltiplas tentativas dos futebolistas do Rangers de maior nome, como Hateley e Durrant, no s perdia mpeto mas morria em qualidade, quando enfrentava o meio-campo do Celtic, onde McStay se exibia a grande altura. Por outro lado, os defesas-centrais do Celtic, Mowbray e Grant, sempre coadjuvados por McGinlay, chegaram para reduzir a pouco o ataque do Rangers, onde Hateley no dispunha do apoio habitual. 
O primeiro tempo terminou sem golos. Na segunda parte, apesar de o Rangers ter tentado o tudo por tudo para rapidamente resolver a partida, o Celtic depressa recuperou o comando das operaes. Mas o jogador russo do Rangers Mikhailichenko comeou a evidenciar-se devido frescura e rapidez do seu jogo. Era de facto por ali que comeava a surgir perigo para o Celtic, e no surpreendeu que, com pouco mais de uma hora de jogo, fosse esse mesmo Mikhailichenko a colocar a bola frente de McCoist, de modo a permitir-lhe a marcao do primeiro golo da partida. Cerca de trs centenas de trabalhadores municipais manifestaram-se ontem diante da Cmara Municipal de Sintra, exigindo da presidente socialista Edite Estrela o pagamento das horas extraordinrias e das ajudas de custo e a reposio do subsdio de insalubridade, penosidade e risco. No fim, saram com algumas promessas e um nim para o subsdio. Face a o marco, a nota verde no apresentou uma tendncia definida, oscilando entre os 1,79 e os 1,80 marcos por dlar, enquanto no mbito do SME se mantinha a tradicional apatia. Em o fecho dos mercados europeus,o marco/escudo transaccionava-se a 102,35/ 102,36 escudos por marco,tal como tinha j acontecido tera e quarta-feira. Em o que respeita s taxas de juro,o Bundesbank confirmou a expectativa do mercado ao anunciar uma manuteno de todas as suas taxas directoras. Os bancos centrais francs,holands,austraco e suo tomaram idnticas decises. ALVO: Com este afluxo -- recorde de votao, a A.B.P. saiu vencedora? E.G.: A A.B.P.ganhou bastante e acrescento que nunca houve em Delegao nenhuma,uma to grande afluncia de scios para a votao,como se verificou nestas Eleies. Isto significa que os Barmens no esto esquecidos. Continuo a dizer que acho bem que os Barmens saibam que a Delegao est aberta para todos eles,e gostaria imenso que eles aparecessem em vez de uma vez por ms, dez vezes porque temos todas as condies criadas para os receber. Podemos beber um copo,jogar gamo,damas,snooker,ver televiso,etc. Judo (52.500 contos)Nelson Brs,Slvia Henriques,Pedro Caravana,Michel Almeida,Guilherme Bentes,Pedro Soares e Filipa Cavalleri. Lutas Amadoras (7 mil contos)-- David Maia. um documento de meados do sculo XVII refere j uma Rua de o Reimo. da que surja muitas vezes referida como Campo do Cirne. Rua onde ainda hoje se concentram algumas das mais antigas casas de mrmores e cantaria da cidade,a Avenida de Rodrigues de Freitas conta ainda com duas instituies de especial relevo para a histria da cidade,descontado o j referido recolhimento para rfs: a Biblioteca Pblica Municipal do Porto -- cuja fachada lateral est voltada para a Rua de D.Joo IV e para o Jardim de Marques de Oliveira (vulgo de S.Lzaro)-- e a Escola Superior de Belas-artes do Porto. Passados seis meses sobre o anncio oficial da existncia da arte paleoltica do Ca,continuamos pois espera de iniciativas vlidas e de boa f para que,independentemente da deciso final que venha a ser tomada sobre a barragem,o monumento seja estudado com os meios que a sua grandeza requer. At quando permitiro os cidados portugueses,cujos impostos sustentam a hierarquia do IPPAR,e cujas contas de electricidade alimentam os oramentos sumpturios da EDP,que esta vergonha continue? O segundo dia ter a economia como tema comum s trs sesses. Incluir,naturalmente,o debate sobre a UEM,numa perspectiva europeia e portuguesa,a posio europeia na economia mundial e,ainda,o papel da UE no processo de reintegrao da Rssia na economia internacional ou a dimenso econmica e poltica das relaes transatlnticas. 
Entre os participantes,estaro Alan Milward,Jacques Attali,Vtor Constncio,Leonardo Ferraz de Carvalho,Vtor Martins,Artur Santos Silva,Jorge Jardim Gonalves e Pedro Ferraz da Costa. Finalmente,o terceiro dia ser dedicado a uma reflexo mais profunda sobre essa entidade a que se chama Europa -- do ponto de vista da sua cultura e da sua histria. Para responder questo primordial -- o que a Europa?-- intervm na conferncia personalidades como Alain Finkielkraut,Edgar Morin,Susan Sontag ou Marcelo Rebelo de Sousa. Andr Glucksman,Peter Schneider e Victor Cunha Rego falaro de cultura europeia,e o americano Simon Shama abordar a procura de uma histria europeia. Em os mintos iniciais da segunda parte os vitorianos conseguiram de novo adormecer os seus adversrios,nomeadamente graas a o trabalho laborioso de Paneira e Tonio e a excelente marcao que Marco exerceu sobre Zola. O portugus secou de tal forma o gnio parmeso que Ancelotti acabou por substitui-lo por Melli. O Parma voltou a acordar aos 66',quando Melli rematou barra uma bola passada por Dino Baggio. Porm,quando parecia que os italianos iam embalar para uma exibio arrasadora,aconteceu o golpe de teatro. Acabadinho de entrar a substituir Riva,o brasileiro Gilmar surgiu isolado na rea italiana e deu o melhor seguimento a um centro de Quim Berto. Os adeptos italianos gelaram,enquanto a pequena comitiva vitoriana festejava ruidosamente na bancada central. de novo a polmica ribalta. Um investimento de 27 milhes de contos (mais seis milhes para aquisio de uma coleco prpria)suportado por as arcas bascas apesar de a existncia de 47 partenaires comerciais, sinnimo do interesse que o governo de Euskadi,do Partido Nacionalista Basco (PNV),atribui ao evento. Implantado em terrenos antes ocupados por indstrias desarticuladas por a reconverso industrial dos anos 80 e cujo ferro velho permanecia como legado da crise econmica da outrora laboriosa cidade de Bilbau,o Museu pretende ser um smbolo da regenerao basca: de uma sociedade que quer vencer as dificuldades presentes e,em vsperas do sculo XXI,dar novos argumentos -- os da esperana e da paz -- aos seus cidados. Por isso,atentos ao perder da sua influncia,os etarras,atravs de o at agora desconhecido comando Katu,tentaram destruir o smbolo de um novo tempo,menos rural -- nessas zonas e entre os deserdados das cidades que a Eta faz o seu recrutamento --,mais dinmico e,sobretudo,aberto influncia exterior,algo que o nacionalismo admite com dificuldade. Desvalorizando,por outro lado,a posio assumida por o deputado independente Jos Magalhes,que considera errada a proposta de Manuel Carrilho -- Se ele disse isso,o problema de ele,foram as suas palavras --,Jorge Coelho reafirmou a sua confiana no ministro da Cultura. Se ele apresentou esta soluo para resolver aquelas dvidas, porque deve ser uma boa soluo,rematou. poca de Ouro abre, com chave d'ouro, o ciclo. Fundado por o mestre Jacob do Bandolim em 1964,o grupo tem mantido viva,ao longo de os anos,a tradio no domnio do chorinho,valorizando-a com sabedoria. Compem o grupo seis msicos de reconhecida craveira: Toni (violo),Csar Faria (violino),Jorge Filho (cavaquinho),Ronaldo do Bandolim (bandolim),Cristvo Bastos (piano)e Jorginho do Pandeiro (flauta). Hoje e sbado,s 21h30,uma oportunidade nica para confirmar,em palco,toda a alma e virtuosismo de um gnero musical tambm nico. Com o apoio da Fundao luso-brasileira para o Desenvolvimento da Lngua Portuguesa. 
s 17 horas,pode ser visto Adamubies (ttulo de um poema de Guimares Rosa),projecto de msica cnica onde textos de Miguel Torga,Clarice Lispector,Joo Guimares Rosa e Agustina Bessa-Lus surgiro vestidos com msicas reinventadas a partir de matrizes populares do Brasil,frica e Portugal. A direco musical de Tilike Coelho e a encenao e cenografia de Jos Caldas Neto. A estes juntaro-se-,em palco,os cantores/msicos Marta Silva,Marcelo Lafontana e Pedro Ribeiro. O espectculo repete domingo mesma hora e no fim-de-semana seguinte,11 e 12,tambm s 17h. O inter-Fiorentina era o jogo grande da terceira jornada do campeonato italiano. A partida at comeou bem para os homens da casa. Ronaldo, beira de o intervalo,inaugurou o marcador para o Inter,justificando o forte investimento que a equipa milanesa fez na sua aquisio. Mas os homens de Gigi Simone tiveram que sofrer muito para levar de vencida a Fiorentina que,com golos de Serena e Batistuta,deu a volta ao resultado. Batistuta viu ainda dois remates seus baterem nos postes da baliza contrria. Um golo de Moriero recolocou as duas formaes em p de igualdade e s a nove minutos do fim o Inter chegou ao golo da vitria,por intermdio do francs Djorkaeff. Em os outros jogos do calcio,destaque para mais uma goleada da Juventus de Dimas (4-0 frente a o Brescia),enquanto a Lazio de Eriksson parece ter perdido a veia goleadora demonstrada em Guimares e foi derrotada por o recm-promovido Empoli,por 1-0. Realce tambm para mais uma derrota do Milan,desta feita por 2-1,frente a a Udinese. Agora, perante os olhos da velha Elvira, sentada no centro da sala com os xailes e o casaco que a protegem do frio, encontra-se o vestido verde com os botes com pequenos sabres dourados idealizado pela costureira Danilina. Rev-se no Mercedes de Goebbels, apreendido na Alemanha e entregue ao marechal Oslikovsky, sentado com a filha pequena, Lora, no assento posterior. Uma corrida para Moscovo a chupar as bolinhas de Kliukva v sacarnoi pudre, feitas de acar em p, com uma baga siberiana de sabor agridoce no meio. Nessa manh, usava pela primeira vez os sapatos de pele macia criados por Goldin, o surdo sapateiro judeu. Tinha calado um par de pantufas inchadas de plo velho. De sbito, ergue os olhos para voltar a mergulhar no passado que ressumava daqueles trajes velhos. Eis o casaco de linho claro e a saia que lhe havia dado a filha casada em Itlia. Uma medida grande para o seu corpo pesado, agora que tinha setenta e dois anos. Usou-o no navio de cruzeiro Taras Scevcenko, que partia de Odessa. Estava calor e ela enxugava o suor com um leno bordado apertado entre os dedos brancos e resplandecentes dos anis, ainda que agora de pouco valor. Quando o navio estava a passar no estreito dos Dardanelos e as cpulas das mesquitas de Constantinopla pareciam pombas brancas numa atmosfera de palhetas douradas, apresenta-se-lhe uma velha senhora magra, elegante e pripudrennaia (empoada) que a submerge numa admirao comovida, quase como se se tratasse de uma irm ou de um familiar regressado do tmulo. Esta senhora senta-se junto dela e, enquanto lhe acaricia as mos e procura lev-las at s faces magras, comea a recordar-lhe o tempo passado, a sua beleza extraordinria, que se tornara uma lenda em toda a Rssia e nas repblicas mais quentes. Recordou-lhe os invernos em Bakuriani, quando percorria os caminhos da aldeia a comer tangerinas e a macular, com as cascas, os montes de neve branca ao longo das estradas, por entre as casas de madeira. 
E os admiradores seguiam-na de longe, recolhendo aquelas cascas para ficarem qualquer coisa dela. Recordou-lhe quando, no fim da guerra, passava devagar na Avenida Gorki com o carro que pertencera a Goebbels. Falou da praia de Soci e do Castelo d'Aria, no cimo da colina de Dorzacos, quando o chefe da orquestra bebia copos de gua gaseificada, apontou o pequeno restaurante do Ermitage em Moscovo, onde se comiam trutas. Os beijos que todos procuravam dar-lhe na Praa Vermelha no dia da vitria. O seu vestido escuro durante o funeral de Estaline, quando, apesar da multido que se apinhava ao longo dos muros, para ela havia espao, para que ningum sequer lhe tocasse. Depois, os longos anos de ausncia dos teatros, dos restaurantes e das praias da moda. Um aviso fugaz no grande mercado das aves, quando a indiferena da multido lhe permitia observar com tranquilidade um peixinho circunscrito gua de uma garrafa e j ningum era atrado pelo seu perfume, que ainda ento se evaporava da carne rosada e abundante. Finalmente, eis que podia rev-la e abra-la. Subitamente, a mulher empoada recolhe todo o seu afecto nos olhos claros e avermelhados pela comoo. Mas a senhora lembra-se de quando era nova? -- pergunta-lhe de sbito. Nesta onda de apertada concorrncia em que os bancos tm vivido nos ltimos anos, parece que no so apenas os clientes que por vezes ficam baralhados com tantos produtos novos, taxas de juro irrecusveis e remuneraes estonteantes. O turbilho de tal forma irresistvel que at as prprias instituies ficam abananadas e entram, olimpicamente, pela via do disparate a toda a prova. Vem isto a propsito da recente experincia do depositante de um conceituado banco. Segundo o relato do prprio, houve um dia em que foi caixa do correio e l encontrou o familiar envelope que regularmente o informa acerca do movimento da sua conta bancria. Parecia um pouco volumoso para to pouca coisa, mas logo se colocou a hiptese de o extracto ir acompanhado de algum folheto de publicidade a um novo produto verdadeiramente arrasador. Aberto o envelope, a surpresa no podia ser maior. L dentro, alm do que era suposto l estar, havia mais treze papelinhos. Nada menos que os extractos de conta de outros tantos clientes que a esta hora devero estranhar o facto de o seu banco no lhes passar carto acerca dos seus saldos. se tinha sido nomeado gestor de conta de toda aquela gente ou se era o banco que estava a desenvolver alguma aco de luta contra o sigilo bancrio ... A eurodeputada portuguesa Maria Santos uma das personalidades convidadas pela Confederao dos Sindicatos Agrcolas Bascos para as jornadas Agricultura Viva para o Meio Rural, que decorrem em Bilbau. Em debate esto a relao entre os problemas dos agricultores europeus e as questes relacionadas com a defesa do meio ambiente, o desenvolvimento regional e os direitos dos consumidores. em Barcelos, o quei recebe o Benfica, partida que poder servir aos minhotos para recuperarem de uma campanha fraca (trs derrotas e um empate), enquanto os benfiquistas tentaro manter o percurso vitorioso. Na outra partida, a Oliveirense joga em casa com o sensacional Pao d'Arcos, actualmente em quarto lugar, a trs pontos do lder. O FC Porto no ter especiais dificuldades, pois joga nas Antas com o HC Sintra. primeira vista, Mousa, Mohammed, Abu Wahed e Khalid, o cabo, tinham a expresso deprimida e encurralada dos recrutas derrotados. Os uniformes estavam amarrotados e sujos, as botas imundas e rotas. Tinham a barba por fazer, estavam exaustos e famintos. 
Em o saco havia nacos de po, uma lata pequena de sumo e uma garrafa de gua. Em Frana, a jornada deste fim-de-semana provocou grandes alteraes no posicionamento dos primeiros classificados, com vantagem para o Auxerre, que chegou ao primeiro lugar, e para o Nantes, que, apesar de derrotado, beneficiou dos maus resultados do Paris Saint-Germain e do Mnaco e no abandonou o comando. Auxerre e Nantes dividem agora o primeiro lugar, ambos com 26 pontos, 35 golos marcados e 18 sofridos, seguidos pelo Mnaco, ainda com 26 pontos, pelo Marselha, com 25 (menos um jogo), e pelo Paris Saint-Germain, com 24. A 19 jornada comeou na sexta-feira, com a vitria do Marselha no Parque dos Princpes (0-1), perante a equipa treinada por Artur Jorge. No sbado, o Nantes foi surpreendido no terreno do Toulouse (2-0), cedendo a terceira derrota da poca, enquanto o Auxerre recebeu e venceu o Le Havre (4-1). No campo do Lyon, o Mnaco no foi alm de um empate sem golos, o que impediu a equipa de Rui Barros de se isolar no primeiro lugar. A prova de que a exibio de O Convento deixou muita gente baralhada podemos encontr-la, por exemplo, naquele jornalista do New York Post que fala sempre em scoops -- do estilo quem est a filmar com quem -- e que na conferncia de imprensa de apresentao do filme declarou: Sr. Oliveira, o seu filme o meu favorito de todo o festival. Diga-se, tambm, que h quem pense o contrrio. Mas ontem, Catherine Deneuve e John Malkovich foram, de facto, as atraces da imprensa. Ao lado de Lus Miguel Cintra e de Leonor Silveira, apresentados como os guardies da casa de Oliveira, Deneuve e Malkovich eram os estranhos seduzidos pela obscuridade -- palavra de Deneuve, lembrando-se de Buuel. E como disse a actriz, h certas coisas que no se devem desvendar. Malkovich, por seu lado falou em primitive dream paintings. Silenciosos, os guardies observavam. Foi com eles que o PBLICO falou. basta-nos procurar compreender o contedo do acordo agora assinado. E tendo integrado o ncleo fundador da Plataforma -- sem estarmos sequer ligados ciso que, circunstancialmente, no PCP lhe deu origem --, naturalmente sem satisfao que vemos a Plataforma dar razo a posteriori a lvaro Cunhal. Acreditando na possibilidade e na necessidade da construo de uma terceira fora na esquerda portuguesa, com uma lgica nova de interveno e funcionamento, dar razo a Cunhal tambm dar razo a todos os que ainda no tiveram coragem de arriscar este projecto. O texto do Protocolo de Cooperao suficientemente vago e impreciso para ser incuo e, ao mesmo tempo, ter um significado poltico claro. Embora reafirmem a cada dois passos o carcter estritamente autrquico do acordo, os dirigentes da PE subscrevem um texto que vai desde a reforma do sistema poltico em geral at ao modelo de desenvolvimento para o pas. No ltimo pargrafo das seis longas pginas, conclui-se por uma viso convergente sobre a necessidade de reformar positivamente o sistema poltico no sentido de assegurar melhor democracia e melhor desenvolvimento, na concretizao de uma alternativa poltica ao PSD e ao Governo. A 45 anos de governao do Partido Nacional, quatro dos quais sob a batuta de Frederik Willem de Klerk, dever seguir-se agora um perodo indeterminvel de preponderncia poltica do Congresso Nacional Africano (ANC), criado em 1912 para conseguir o fim do racismo. As sondagens opinio pblica indicam que o ANC, sob a liderana de Nelson Mandela, poder conseguir de 58 a 60 por cento dos votos nas eleies de Abril para a Constituinte. 
E alguns observadores vem em semelhante vantagem o perigo de uma tentao hegemnica, se bem que a Constituio interina a aprovar durante as prximas semanas preveja a representao no governo de todos os partidos que consigam pelo menos cinco por cento dos votos. O lanamento de estgios para jovens desempregados ou procura do primeiro emprego, o reforo de incentivos j existentes ao emprego por conta de outrm ou criao do prprio posto de trabalho bem como a sistematizao de informao sobre alternativas de formao e de profisses so as trs principais medidas do Programa para a Integrao dos Jovens na Vida Activa. menos de um ms aps ter vencido o Campeonato Nacional de Clubes em golfe, disputado no Oporto Golf Club, coloca na final do Individual dois elementos que contribuiram para a conquista desse ttulo, Stephane Castro Ferreira e Jos Sousa e Melo. O FC Porto conquista a sua oitava Taa de Portugal ao vencer o Sporting, por 2-1, no jogo da finalssima disputado no Estdio Nacional. No final do jogo, adeptos do Sporting lanam garras e pedras para a tribuna de honra, onde estavam Manuela Ferreira Leite, ministra da Educao, e Vtor Vasques, presidente da FPF. Palma Incio, ex-comandante operacional da LUAR, numa entrevista ao Expresso, afirma que no reconhece, aos que contra ele se colocam, envergadura moral para o ofender e lembra que o ELP de Spnola foi a organizao mais terrorista de Portugal. O templo, de configurao quadrangular, com cerca de 15 metros de largo, foi descoberto quando Srgio Coutinho, proprietrio de um terreno, ali quis fazer um estabelecimento de turismo rural. Os arquelogos chamados ao local avaliaram o achado como sendo da poca de Jlio Csar, mas s agora, aps diversas investigaes das arquelogas Ana Arruda e Catarina Viegas, acompanhadas por tcnicos do Instituto Portugus do Patrimnio Arquitectnico e Arqueolgico, se chegou a um consenso pleno. Na mesma zona em que foi encontrado o templo, a Alcova, a caminho das Portas do Sol, foram ainda descobertas cisternas romanas que esto tambm a ser objecto de escavaes e estudos arqueolgicos. Para o autarca Jos Noras, em declaraes agncia Lusa, o achado arqueolgico permite descodificar a presena romana na velha Scalabis e explicar a importncia estratgica de Santarm no contexto da Pennsula dessa poca. O municpio vai agora preservar o monumento e promover o seu conhecimento por arquelogos e estudantes. Ontem, at os futebolistas brasileiros Ronaldo, Roberto Carlos e Denilson ajudaram festa (O nosso favorito era Kuerten; agora Rios), aplaudindo o vencedor de p. O prximo adversrio ser o artista Hicham Arazi que garantiu nova presena nos quartos-de-final do Grand Slam francs. Vindo do anonimato em 1997 graas a quatro vitrias consecutivas (a ltima das quais sobre Marcelo Rios) em Roland Garros, Arazi (47 ATP) parece disposto a fazer melhor este ano depois de ter eliminado Alberto Berasategui (cabea de srie n16), um dos tenistas em melhor forma esta poca sobre a terra batida (vitrias-derrotas: 18-4). No tenho feito grandes resultados ultimamente, mas quando comeo a bater umas bolas aqui sinto-me logo melhor, reconheceu o marroquino aps o triunfo por 6-2, 6-4, 3-6 e 6-3. A economia muito, mas no tudo para aferir da fora vital que sai das entranhas de um povo. O sistema encontra-se disponvel no mercado portugus e faz parte do pacote proposto aos franchisados. Miguns Cardoso, da Triunfo / Il Fornaio, considera que, em termos de software, uma soluo integrada e o apoio do consultor informtico a soluo mais adequada gesto de uma rede comercial deste gnero. 
O Grupo apostou nesta soluo desde 1990 e as perspectivas so para a abertura de uma nova loja por ms. Uma associao de consumidores de Coimbra exigiu de novo a extino do Instituto do Consumidor, que acusa de ter gasto em 1994 dinheiro sem proveito visvel para os consumidores. A Deco tem uma posio oposta e o organismo visado limita-se a propor uma leitura atenta do relatrio de actividades desenvolvidas. O Instituto do Consumidor (IC) gastou em 1994 mais de meio milho de contos sem resultados visveis, acusa a Associao Portuguesa de Direito do Consumo (APDC), que endereou uma carta ao primeiro-ministro propondo a extino, ainda antes das eleies, daquela estrutura do Ministrio do Ambiente. Cada dia que passa torna mais evidente que o voto dos eleitores do Entre Douro e Minho vai ser decisivo para o resultado final do referendo. para esta regio que se viram os olhos dos partidrios do sim, ansiosos por uma votao que compense a quebra anunciada pelas sondagens noutras zonas do pas. Mas tambm neste espao que cabem as esperanas dos que querem travar o modelo das oito regies. se o no na primeira pergunta do referendo for maioritrio na mais populosa das oito regies propostas, isso significar, quase de certeza, a certido de bito do modelo nascido do acordo entre o PS e o PCP. Se isso vier a acontecer, a regionalizao ficar adiada por 10 ou 15 anos, reconhece Narciso Miranda, lder da Federao do Porto do PS, que acredita numa vitria esmagadora do sim no Entre Douro e Minho. Pense-se em Kingsley Amis, Malcolm Bradbury e Albert Finney. Dois escritores, um actor. Pense-se no romance que o primeiro escreveu -- The Green Man --, que o segundo adaptou e o terceiro interpretou. Pense-se num enredo mirabolante, centrado num heri desfeito, o anti-heri Maurice Allington, e na maneira como o impensvel -- o fantasma de Thomas Underhill -- o cerca. No fim, obtm-se uma srie de televiso. Uma boa srie de televiso: a que a RTP estreou ontem noite, na TV2, e qual nada ligou -- pouco mais do que primeiro episdio escreveu na apresentao. Motor: Econmico, com grande elasticidade e bom nvel de potncia. Caixa: Mais um Toledo salvo por uma caixa bem escalonada, com destaque para as trs primeiras velocidades. -- o nico candidato que me parece capaz de promover uma mudana de paradigma na vida poltica americana. Clinton mais novo na idade e nas ideias. Bush endureceu na atitude de conservao a todo o preo de uma ordem histrica condenada. A Amrica tem de preparar-se para o futuro. Com Bush, no conseguir faz-lo. -- George Bush. -- Em primeiro lugar porque conhece os problemas europeus; em segundo lugar pelo papel que desempenhou na poltica mundial nos ltimos quatro anos e, em terceiro lugar, porque o CDS sempre mais ligado aos republicanos do que aos democratas. Para Eurico de Melo s faz sentido convocar um referendo se se chegar concluso da sua necessidade para cumprir um formalismo constitucional. No desvalorizou, porm, a convocao do Conselho de Estado por entender que no deve subsistir a menor dvida sobre os formalismos constitucionais a cumprir nem sobre a vontade poltica de adeso ao Tratado. Eurico de Melo defende que as decises de Maastricht so de grande importncia para o pas na medida em que reflectem uma linha de mais compromisso poltico com a CE. Segundo a Comisso de Afectados pela Barragem do Lindoso, entidade promotora da concentrao, o objectivo perseguido mantm-se: prosseguir no seu protesto contra o que entendem ser uma atitude de chantagem da EDP. 
Em declaraes ao PBLICO, um dos elementos da Comiso dos Afectados manifestou esperanas na possibilidade de reatamento do dilogo interrompido a 16 de Dezembro ltimo, remetendo para os resultados de uma reunio que ir juntar no Porto o governador civil de Ourense, o presidenta da Cmara do concelho de Lovios (Ourense) e Marques Seabra, responsvel da EDP, e que est prevista para sexta-feira, embora sujeita ainda a confirmao. A mesma fonte afirmou que os afectados pela barragem estaro dispostos a permitir que prossigam os trabalhos de remoo da igreja de Aceredo, que ir ficar submersa pela albufeira, logo que a arbitragem acordada entre as partes seja assumida em documento assinado perante um notrio. Em vsperas de Carnaval, a Associao Portuguesa para a Defesa do Consumidor (DECO) mostra-se preocupada com a utilizao de explosivos nas brincadeiras carnavalescas, responsvel todos os anos por inmeros acidentes, sobretudo com crianas em idade escolar. Em comunicado, a DECO considera essencial a informao dos consumidores sobre este assunto, causador tambm de rudo e perturbao da ordem pblica, especialmente nas escolas. As bombas de Carnaval pertencem ao conjunto de explosivos tecnicamente designado como bombas de arremesso, cuja venda -- regulamentada por lei -- s pode ser feita a pessoas, com mais de 18 anos, que tenham autorizao das autoridades competentes para as comprar. Mas de faltas de correspondncia entre a lei e a realidade dos factos est o pas cheio. Por exemplo, lembra tambm a DECO, entre os brinquedos preferidos pelas crianas nos festejos carnavalescos encontram-se os estalinhos, considerados brinquedos ou artifcios pirotcnicos que podem rebentar por choque ou atravs de um detonador. Encontram-se em todo o lado e, no entanto, o seu fabrico proibido por lei. Hoje, Bush tem na agenda as cidades de Baidoa e Bali-Dogle e mais visitas a soldados, orfanatos e organizaes humanitrias. O nico incidente registado ontem em Mogadscio ocorreu na embaixada francesa quando um somali tentou entrar nas instalaes e no obedeceu ordem de parar de um sentinela, anunciou o comando francs da operao Oryx. O soldado disparou para o ar, mas o indivduo continuou a avanar e foi atingido mortalmente. Lawrence Summers evitou atacar os pases do G7, mas no pde deixar de falar no Japo, sublinhando que os EUA no manipulam artificialmente as taxas de cmbio e que seu desejo e de todo o mundo que este pas volte a ter um crescimento rpido. O Japo respondeu que concorda com as sugestes norte-americanas e que apoia uma aco concertada para travar a valorizao do iene, considerada como brutal a ambgua -- j que se pode conduzir diminuio do excedente comercial japons pode tambm pr em causa o crescimento mundial, na opinio do secretrio do Tesouro dos EUA, Lloyd Bentsen. As multides no acorreram abertura, domingo, da exposio A Idade do Barroco em Portugal, organizada pela National Gallery de Washington e a Secretaria de Estado da Cultura portuguesa, atravs do Instituto Portugus de Museus (IPM). Mas a exposio estar patente at 6 de Fevereiro e provvel que, nas suas visitas de rotina aos museus, grande parte dos washingtonianos venham a visit-la. Isto maravilhoso, um verdadeiro tesouro, que no imaginei que existisse. Mary Sue, 55 anos, entrou com a amiga na exposio, por acaso. Na realidade, vieram ao museu por causa da exposio sobre os Pssaros da Amrica, que est nas salas ao lado. Mas no se arrependeram. um povo e uma poca fascinantes. Estamos mortas por visitar Portugal. Estas novas atitudes correspondem clebre frase de Robert Fillion . 
a arte o que faz a vida parecer mais interessante que a arte. Esta exposio nasceu, ainda no dizer de Jean de Loisy, da leitura do clebre artigo de Allan Kaprow: A herana de Jackson Pollock. Kaprow pretende que Pollock no teve tempo de levar as suas obras at s ltimas consequncias, as quais teriam sido, no limitar o quadro tela posta no cho, mas de nele integrar o prprio cho do atelier, os objectos, os rudos da rua, em suma, a vida. Este artigo, escrito em 1957, posiciona a arte numa nova direco, da qual a exposio tenta ser o reflexo. Accionismo, novo realismo, happening, poesia sonora, Fluxus, performance, arte corporal, environnements, so alguns dos nomes que, segundo os perodos e os pases, foram dados a estas novas formas de arte. O Japo quer continuar a negociar com os EUA tendo em vista resolver o diferendo comercial que ope os dois pases, apesar de a ameaa norte-americana de aplicao de sanes, caso as autoridades npnicas no abram os seus mercados pblicos num prazo de 60 dias. O ministro das Finanas de Tquio, Masayoshi Takemura, afirmou lamentar a atitude de Washington, mas adiantou que o Japo continuar a tentar ultrapassar as diferenas entre as duas partes, em cada um dos sectores em negociao: automvel, seguros e mercados pblicos nas reas das telecomunicaes, equipamento mdico, computadores e satlites. O ultimato para a abertura dos mercados pblicos nipnicos foi imposto por Washington no ltimo fim-de-semana, depois do fracasso das conversaes bilaterais. Em Portugal desde 23 de Maio, Manuel Antnio viveu em Lisboa, cerca de trs meses, deslocando-se depois para o Porto. Pelas 3h55 da passada quinta-feira foi surpreendido nas escadas interiores da 15 Esquadra da PSP, na Foz do Douro, sem ter usado a porta de entrada. Fonte policial admite que o jovem tenha saltado o muro das traseiras das instalaes. Na altura, Manuel Antnio alegou sede e que queria beber gua, justificando desta forma a sua entrada na esquadra atravs do muro. Indocumentado, deu um nome trocado -- Manuel Carneiro --, disse ser moambicano e que tinha os documentos numas obras. Esta informao no foi confirmada pelos agentes da PSP que o acompanharam a estaleiros sitos na Foz, Boavista e no Carvalhido. As diligncias da polcia terminaram com a sua entrega ao Servio de Estrangeiros e Fronteiras (SEF). Junto do Consulado de Moambique, o SEF apurou ser falsa a nacionalidade indicada pelo Manuel Antnio e descobriu os documentos no interior de uma pasta na posse de um tal Albertino. As autoridades verificaram que o Manuel Antnio tinha dois passaportes: um angolano, verdadeiro; e outro portugus, falsificado. Este ltimo documento ter sido adquirido no Centro Comercial Dallas, no Porto, por 15 contos e destinava--se a ser usado numa viagem a Frana. Acusado de falsificao de documentos, Manuel Antnio viu a deteno confirmada por um juiz de instruo, que no atendeu ao facto de o jovem ter menos de 16 anos, em funo do passaporte angolano de que era portador. Quatro dias mais tarde, o Ministrio Pblico detectou o erro e ordenou a transferncia do processo para o foro do Tribunal de Menores, a quem foi entregue ontem. A Frana alcanou, em 1995, um excedente comercial recorde de 104,5 mil milhes de francos (3,1 mil milhes de contos), o que representa um acrscimo de 27 por cento em relao aos resultados de 1994. 
O ministro do Comrcio Externo, Yves Galland, anunciou ainda que, no ano passado, as exportaes francesas atingiram o montante recorde de 1427 mil milhes de francos (cerca de 43 mil milhes de contos), reflectindo uma alta de 9,2 por cento relativamente ao ano anterior. As importaes apresentaram uma progresso de 7,9 por cento, o que equivale a 108,6 mil milhes de francos (3,25 mil milhes de contos). O ltimo saldo recorde do comrcio externo havia sido atingido em 1993, com um excedente de 87 mil milhes de francos (2,6 mil milhes de contos). A Comisso Europeia aguarda uma notificao das autoridades italianas sobre o plano de reestruturao da companhia area Alitalia. O principal accionista, o grupo pblico italiano Iri, adoptou na ltima quinta-feira as grandes linhas do plano, que prev uma injeco de capital na ordem dos 937 milhes de dlares (cerca de 142 milhes de contos) e o reforo da estratgia de alianas. A Comisso Europeia dever em breve tomar uma deciso sobre a segunda parte da ajuda pblica acordada com a companhia area grega, Olympic Airways. Espera-se ainda a aprovao, por parte da Comisso, da terceira parte do plano de ajuda companhia francesa Air France, cujo montante global de cerca de quatro milhes de dlares (cerca de 605 mil contos). Raramente as instituies de planeamento falam claro. Mas, s vezes, acontece. em segundo lugar, e trata-se de uma concluso menos taxativa, de admitir que tenha diminudo a posio relativa da Regio Norte no contexto do espao scio-econmico portugus. Desde que o ministro do Planeamento e Administrao do Territrio, Valente de Oliveira, se reuniu em Setembro e Outubro, em Vila Real e no Porto, com as cmaras do interior e do litoral da Regio Norte, que a CCRN se tem desdobrado em contactos para discutir a forma de aplicao do prximo quadro de apoio. Deste esforo, um contra-relgio que se destina a ouvir tanto os rgos de poder local como os agentes de desenvolvimento e essa entidade genrica a que se chama sociedade civil, dever nascer a proposta nortenha para o novo Plano Regional de Desenvolvimento, espreita dos fundos acrescidos proporcionados pela aprovao em Edimburgo do Pacote Delors II. A equipa masculina concorre Primeira Liga, no decatlo, em Helmond, na Holanda, contra as formaes da Alemanha, Bielorssia, Estnia, Rssia, Holanda, Polnia e Suia. Sobem Superliga (para o ano) as duas primeiras equipas e descem as duas ltimas e Portugal dificilmente escapar despromoo. que, de facto, s por puro acaso equipas como a alem e a russa esto nesta diviso. Portugal alinha com Mrio Anibal Ramos (Benfica), que recentemente deixou o recorde nacional em 7614 pontos, F. Nuno Fernandes (FC Porto- 7381 pontos), Jos Duro (CN Rio Maior- 6614 pontos) e Lus Herdio Costa (Sporting- 6585 pontos). Ao mesmo tempo decorre a Superliga feminina (heptatlo), envolvendo a Bielorssia, Frana, Gr-Bretanha, Alemanha, Holanda, Polnia, Rssia e Ucrnia, com as russas favoritas para manterem o trofu. A seleco feminina portuguesa, por seu lado, compete no heptatlo da segunda liga em Dilbeek, na Blgica, com belgas e suias. A equipa vencedora subir primeira liga, mas para essa posio favorita a Suia. Portugal alinha com Sandra Turpin (Benfica- 5218 pontos), Snia Machado (individual- 5061 p), Mnica Sousa (GD Cavadas- 5122 pontos) e Catarina Rafael (Bairro Anjos- 4602 pontos). A maior perspectiva da parte portuguesa a hiptese do recorde nacional ( de 5228 pontos) por Sandra Turpin. Desejava que tivessem visto a expresso no rosto do dr. 
Steve, disse ao centro de controlo da misso Ken Bowersox, comandante do vaivm, escreve a agncia Reuter. ali no poro do Discovery que o Hubble vai ser reparado no decurso de quatro passeios no espao, com seis horas de durao cada um, que sero feitos pelos astronautas Mark Lee, Steve Smith, Greg Harbaugh e Joe Tanner. O primeiro, o mais importante desta misso, estava previsto precisamente para a noite passada. Se tudo correu como estava previsto, fez-se a substituio de dois instrumentos de observao do Hubble -- o Goddard High Resolution Spectrometer e o Faint Object Spectrograph -- e a instalao de dois novos instrumentos destinados a ampliar significativamente as suas capacidades de observao: o Space Telescope Imaging Spectrograph (STIS) e o Near Infrared Camera and Multi-Object Spectrometer (NICMOS), cada um do tamanho de uma cabine telefnica. A perspectiva de as duas empresas competirem entre si sem se terem em conta os incmodos resultantes para os aveirenses acabou por ditar uma soluo de consenso, tendo a autarquia funcionado como intermediria para o bom entendimento entre as partes. E a possibilidade de as ruas de Aveiro se transformarem num estaleiro permanente, com um operador a abrir valas onde o seu concorrente as tinha fechado uns dias antes, no seria de todo inadmissvel, at porque a legislao permite a livre concorrncia entre os concessionrios da TV por cabo. Segundo o acordo estabelecido, a TV Cabo Mondego, que entrou j em fora no mercado local de assinantes, utiliza as fibras pticas da rede telefnica j instaladas pela Portugal Telecom, enquanto a Pluricanal se servir, mediante uma comparticipao financeira, das condutas do gs natural. Para o presidente da autarquia, Celso Santos, este acordo, que ser formalizado atravs de um protocolo alargado EDP e Lusitniags, permitiu Cmara controlar o processo de instalao das novas tubagens e evitou que as ruas permanecessem intransitveis durante muito tempo. O vice-presidente Andriessen participa na Reunio de Coordenao da Ajuda Internacional CEI, no Centro Cultural de Belm. A reunio termina amanh. Reunio informal dos ministros da Agricultura, na Curia. Termina no dia 25 deste ms. Ora existem muitas funes (ordenamento territorial, incentivos ao desenvolvimento, ambiente, turismo, cultura, vias de comunicao, educao, etc.), para as quais os municpios so demasiado pequenos e o Estado demasiado distante. para isso que em todos os pases, grandes ou pequenos, existe uma autarquia territorial (ou mesmo duas) entre o Estado e os municpios. Ns prprios, desde a revoluo liberal at 1974, sempre tivemos acima do municpio ora o distrito, ora a provncia. As regies administrativas no so mais do que a restaurao da figura das provncias, com atribuies mais centradas no ordenamento territorial e no desenvolvimento. evidente que as regies administrativas tero tambm condies para reivindicar uma mais equilibrada repartio dos recursos oramentais, mesmo nas funes que ho-de continuar a ser do foro da administrao central. Nas minhas deslocaes de comboio a Lisboa no posso evitar um quase sentimento de revolta, quando comparo a escandalosa indigncia da estao de caminhos-de-ferro de Coimbra, mais prpria de um apeadeiro terceiro-mundista, com a sumptuosidade megalmana da nova Gare do Oriente, que pelos vistos corre o risco de vir a ser o mais oneroso dos apeadeiros de luxo do mundo. E sou levado a pensar que a existncia de regies poderia contribuir tambm para evitar estas gritantes disparidades de tratamento regional ... 
Que a posio do Vaticano possa ser entendida deste modo por um intelectual desta craveira -- sobretudo a ideia de um malvado voluntarismo de Deus que lhe est subjacente -- deveria fazer reflectir os argumentadores oficiais da doutrina da Igreja. Parece-me que nela se continuam a misturar alhos com bugalhos e a no hierarquizar adequadamente nem as convices de f nem as razes. Neste sentido, espero que o interessante documento de trabalho do Conselho Pontifcio da Famlia, Evolues demogrficas: dimenses ticas e pastorais, Lisboa, 1994) leve uma grande volta. Se, como nele se diz, a Igreja deseja encetar um dilogo construtivo com os que continuam convencidos da necessidade de realizar um controle imperativo da populao e com os governos e as instituies que se ocupam de polticas da populao, j que existem problemas demogrficos reais, apesar de frequentemente serem vistos a partir de uma perspectiva errada e de se proporem solues depravadas para os resolver (n24), no pode favorecer os mal-entendidos. Esta uma posio fundamental que no pode ser trocada por nada, sejam quais forem as chantagens dos patres deste mundo na Conferncia do Cairo ou fora dela. Numa reunio com comerciantes e moradores da vila velha, realizada tera-feira tarde, os vereadores da CDU refutaram as acusaes feitas pela presidente do municpio de estarem a atrasar obras importantes, garantindo no estar contra ou a favor da construo do silo na Volta do Duche -- que se encontra em banho-maria --, mas fizeram depender uma posio favorvel da realizao de um estudo mais vasto para toda a vila que prove a necessidade daquele investimento. Todos os partidos polticos com assento na Assembleia Municipal de Leiria contestam o Plano Estratgico do Eixo de Leiria-Marinha Grande e do Sistema Urbano da Alta Estremadura. Contactados pelo PBLICO, os lderes das concelhias do PP, do PSD- com a maioria da Assembleia Municipal-, do PS e o deputado municipal da CDU so unnimes em considerar que o documento mandado elaborar pela Associao de Municpios da Alta Estremadura (AMAE) carece de um importanle debate pblico, havendo mesmo quem no Ihe reconhea o estatuto de plano estratgico. Apesar disso, as diferentes foras partidrias esto dispostas, na prxima reunio da Assembleia Municipal, sexta-feira, a criarem as condies para que sejam aprovados os projectos a candidatar pela Cmara de Leiria e pela associao de municpios ao Prosiurb (Programa de Consolidao do Sistema Urbano Nacional e Apoio execuo dos PDM). Antnio Guterres foi o primeiro convidado de uma srie de debates com lderes polticos que o Inesc est a promover. Ao falar no Porto, mas ligado a outros pontos do pas graas s inovaes tecnolgicas, o dirigente socialista defendeu ontem as suas ideias na matria: ideias orientadas para uma cincia e uma investigao com qualidade e relevncia para os problemas do pas e contra a aliana entre a mediocridade e a inveja. Ainda no me tinha lembrado da convenincia de levar uma sandesitas e uma garrafa de gua, quando fui confrontado com a notcia de que tal ousadia era proibida. Antevi-me, com ou sem sandes, a ser interpelado -- ou mesmo revistado -- por um qualquer segurana imbudo de uma autoridade disparatada. Imaginei a quantidade (e a qualidade!) de conflitos que essa rasqueirice iria provocar. E no gostei. Depois, contaram-me que, em horas normalssimas, o acesso a certos locais -- como o passeio martimo, que me dizem ser dos mais aprazveis -- passava a ser proibido, e guardado pelos correspondentes seguranas, sem que os prprios soubessem explicar porqu. 
Conheo o gnero, e no aprecio. Esta medida foi conseguida com a colaborao activa da Associao dos Operadores Porturios do Sul (AOPS) e da Associao dos Agentes de Navegao e Empresas Operadoras Porturias (Anesul), e insere-se na poltica de modernizao do sistema de informaes da comunidade porturia em Setbal. Sporting-Vitria de Setbal e Martimo ou Ovarense-FC Porto. Ontem, em Lisboa, foi este o resultado do sorteio das meias-finais da Taa de Portugal, a jogar dia 9 de Maio. Uma tiragem de sortes que afastou os dois grandes ainda em prova, mas que leva o Sporting a receber uma equipa que lhe roubou, em Alvalade, um ponto para o Nacional, enquanto o FC Porto poder ir Madeira defrontar a nica equipa com quem sofreu uma derrota para o Nacional nesta temporada. Ainda assim, no final todos ficaram contentes. J antes do sculo XIV se havia iniciado o captulo da histria que une Portugal famosa Flandres, cuja referncia povoa os nossos livros de Histria do secundrio, marcando a memria com nomes quase mticos, como o de Anturpia. Muito mais fantstico do que essas menes soltas da realidade actual saber que as cidades mgicas ainda existem, mesmo que por detrs de arquitecturas e modas no trajar que j nada tm a ver com os tempos em que se bailava a compassos diferentes e em que as ruas se pavimentavam com pressas diversas das nossas. O plano tem vindo a ser executado gradualmente, atravs de intervenes que integram ou reintegram funes antigas. exemplo disso a inaugurao de uma nova residncia paroquial em Novembro de 1996, o primeiro passo no sentido da reafectao de espaos do monumento. Fundado no sculo VI, em pleno domnio suevo, o Mosteiro de Santa Maria de Tibes foi posteriormente arrasado, datando a sua reconstruo do ltimo tero do sculo XI. A partir da, foi sucessivamente remodelado e aumentado e, tal como hoje existe, o resultado de campanhas de reconstruo e ampliao levadas a cabo nos sculos XVII e XVIII, respondendo actualmente a uma linguagem mista entre o primeiro barroco e o rocaille, explica o Ippar. P.S. -- PROJECTO DE DIPLOMA e Portaria vm em maisculas no texto, mas escola vem em minsculas! A ESCOLA de Excelncia escreve-se em maisculas. O vosso subconsciente anda a trair-vos! Os gestores de topo das empresas japonesas esto a aceitar o corte dos seus salrios, em face da sbita recesso que se abateu sobre a economia do pas, impondo-se um teste prtico quanto solidariedade que caracteriza a imagem das empresas japonesas no exterior, e tentando contribuir desta forma, para que as firmas que administram possam suportar a crise actual. Uma vez mais o pblico primou pela ausncia nas bancadas do Autdromo do Estoril, tornando quase secretas as quatro corridas ontem disputadas. Como que a sentirem a falta de testemunhas, as provas tambm no foram particularmente emocionantes, com as vitrias decididas muito cedo. No Trofu Renault Clio 16V Vtor Lopes comandou de princpio a fim, construindo uma slida liderana. Lopes terminaria com 10,017'' de avano sobre Jos Joo Magalhes e 12,378'' sobre Celestino Arajo. Na Frmula Ford, Carlos Azevedo (Swift) regressou s vitrias, terminando com uma vantagem de 2,328'' sobre Rui guas (Van Diemen) e 3,040'' sobre Frederico Viegas. Quanto ao Trofu BMW 320iS, Jorge Petiz ultrapassou o seu irmo Alcides a meio da corrida para obter uma vitria fcil, com 2,382'' de avano, deixando o 3., Antnio Barros, a 4,788''. O portugus Pedro Chaves (Lola / Cosworth) terminou ontem a corrida de Frmula 3000 disputada em Hockenheim no 13. lugar, parando logo a seguir meta com o motor partido. 
O piloto portugus fez um bom arranque do 15. posto da grelha, ganhando quatro lugares. Depois comecei a ter problemas com a embraiagem e com o motor que aquecia muito. A partir de meio da corrida, o motor comeou a perder potncia e partiu na ltima volta, j perto da meta, contou Chaves. Cado, entretanto, no 12. lugar, o portugus acabou mesmo por perder uma posio na derradeira volta. A vitria foi para o italiano Luca Badoer (Reynard / Cosworth) que dominou a corrida de princpio a fim, assumindo tambm o comando do campeonato. ltimo teorema de Fermat: agora de vez? Andrew Wiles, o investigador britnico que anunciou prematuramente, no ano passado, ter demonstrado o ltimo teorema do matemtico francs do sculo XVII Pierre de Fermat, talvez tenha finalmente levado este trabalho a bom termo -- noticiou o dirio New York Times na semana passada. Rui Vilar, ex-comissrio da Europlia 91 e ex-presidente do conselho de administrao da Caixa Geral de Depsitos (CGD), vai ter uma tarefa difcil. O patrimnio da Gulbenkian valiosssimo, est estimado em cerca de 260 milhes de contos, a valores de 1994, mas a desvalorizao do dlar nos ltimos trs anos tem afectado significativamente os rendimentos que permitem a sua sobrevivncia. A administrao sempre foi bastante conservadora nas suas aplicaes de capital, que se traduzem principalmente por investimentos no sector do petrleo e pela gesto de uma carteira de ttulos no valor de dois bilies de dlares nos Estados Unidos, composta por aces e obrigaes. S no ano de 1994, o patrimnio da Fundao diminuu, em valor, cerca de 30 milhes de contos por causa das perdas cambiais do dlar contra o escudo. Recorde-se que as receitas so obtidas na sua quase totalidade em dlares, que depois necessrio converter para a moeda nacional. Com O Fim, as rplicas cruzadas formam uma rede ainda mais inextricvel, um verdadeiro concerto desconcertante, mas afinado, em que, de vez em quando, as palavras parecem disparos, exploses, rajadas de metralhadora. Parecem. Mas quem no se fiar nestas aparncias descobrir que, nesta amlgama de sonoridades informes -- ou nesta acumulao de nuvens (cinzentas) com formas caprichosas -- toma forma um discurso feito de premonies, ameaas, troves e relmpagos, mortes anunciadas. Um clima abafado, prenncio de catstrofes. Quando, na primeira cena, Tom -- amigo e cmplice de Mateus no assassnio da moa -- fala do macaquinho que um amigo lhe vai trazer de Angola, est a falar da arma do crime, do macaco com que Mateus matar Sandra. E, a propsito do macaco, Mateus sugere que, em vez disso, tragam um tigre. A, comeam a instalar-se nos dilogos as alegorias da caa, do caador e da presa, do predador, dos carnvoras e dos omnvoros, da comida que se devora -- alegorias que dominam toda a pea, evocando a priori ou a posteriori o crime que o ponto de chegada do espectculo. Assim, todas as palavras com que Mateus (Manuel Wiborg) contribui para as conversas de caserna anunciam ou recordam o estupro e o assassnio de Sandra, bem como o suicdio com que ele se vai autopunir no final do espectculo. Apesar dos esforos desenvolvidos pela presidncia luxemburguesa para conseguir o acordo da totalidade dos estados membros, a Itlia manteve a sua oposio ao acordo final em sinal de protesto pela reduo da quota do leite. 
No possui hoje Vendas Novas praa de toiros, razo por que a corrida aconteceu numa desmontvel, mas Antnio Morais, no seu livro A Praa de Toiros do Campo Pequeno, ao referir-se s outras praas do pas, diz que Jos Valrio construiu uma praa em Vendas Novas em 1862, tendo nela havido festas de toiros at 1875, ano em que soldados da Escola Prtica de Artilharia, por descuido, provocaram o seu incndio. Regista ainda na terra a existncia de uma praa nos anos 20 com capacidade para 3400 espectadores, que julgamos ter sido a que existiu at h cerca de 20 anos. Desta feita, redimiu-se a Escola Prtica, ajudando montagem da praa instalada no campo da feira e nela fizeram as cortesias Jos Maldonado Cortes, Nuno Pardal, o praticante Jos Francisco Cortes e o amador Jos Soudo, a quem saudamos o regresso aps convalescer do gravssimo percalo que lhe aconteceu na praa da Malveira. Lidou-se um curro de toiros de Jos Lus Sommer de Andrade, que o Grupo de Forcados amadores da Moita pegou. Acompanhando o comportamento dos principais mercados asiticos, a Bolsa de Sidney terminou a sesso de ontem em alta pronunciada. Para os operadores, tambm aqui as valorizaes se ficaram a dever reentrada de novos investidores, nomeadamente fundos de investimento internacionais. O ndice AOI encerrou nos 2034 pontos. Nesta classificao, destaque para nova subida do sueco Stefan Edberg, que surge na 20 posio, graas presena na final de Queen's (700 mil dlares). Recorde-se que Edberg, antes de Roland Garros, estava no 45 lugar, mas as excelentes exibies na terra batida parisiense e na relva londrina permitiram-lhe o salto. O alemo Boris Becker, vencedor do Queen's, ao derrotar Edberg por 6-4, 7-6 (7-3) 11 anos depois de ter conquistado no mesmo local o seu primeiro ttulo em relva, manteve a quarta posio mas diminuiu a diferena pontual para Andre Agassi. Em Rosmalen (500 mil dlares), na Holanda e tambm em relva, o vencedor foi o norte-americano Richey Reneberg, que bateu na final o francs Stphane Simian por 6-4, 6-0. Espectculo de um jovem grupo italiano, integrado no esprito das mais novas tendncias teatrais europeias. Cerimnia e combate em que a mstica grotowski-barbiana, as artes marciais orientais, a msica e a poesia italiana e um projecto intercultural se cruzam em boa harmonia. Depois de Montemor, vora, Beja, Coimbra e Braga, a vez de a Grande Lisboa o ver na Damaia. Ocasio nica. SALA D. JOO V Largo da Igreja. Dom., 15, s 21h45. O actor e realizador norte-americano Clint Eastwood, 63 anos, presidente do jri do Festival de Cannes (ver p. 28), defendeu ontem a entrada de uma maior variedade de filmes estrangeiros no mercado dos Estados Unidos. Em conferncia de imprensa dada conjuntamente com a vice-presidente do jri, a actriz francesa Catherine Deneuve, 50 anos, Eastwood confessou que, em vez de sanes de proteccionismo, preferiria que uma maior variedade de filmes franceses, asiticos, russos entrasse nos Estados Unidos. Para alargar o horizonte do pbico norte-americano, explicou. O actor disse estar a produzir The Stars Fell on Henrietta, um filme realizado por James Keach, com Robert Duvall, e projecta rodar Golf in the Kingdom, que tem por tema o golfe, um desporto de que se confessa adepto. Como actor dever desempenhar o papel de um fotgrafo em The Bridge of Madison County, uma pelcula a realizar pelo australiano Bruce Beresford com base no best seller homnimo de Robert James Weller. 
Os Utah Jazz voltaram a provar que so os visitantes mais antipticos da Liga Norte-Americana de Basquetebol profissional (NBA), ao averbarem na jornada de sexta-feira o seu 11 triunfo fora de casa, melhorando o seu registo da Liga na presente temporada para 11v/4d. Desde Maio deste ano que Joaquim Correia, escultor natural do concelho da Marinha Grande- que doou os seus trabalhos para integrar o ncleo do museu no Convento de Santo Agostinho- chegou mesmo a ameaar desvincular-se do compromisso com a autarquia, caso o processo no fosse desbloqueado at ao final do ano. Em declaraes ao PBLICO, Joaquim Correia afirma manter a sua posio, apesar de considerar positiva a iniciativa agora tomada pelo IPPAR, que lhe ter sido comunicada pela prpria Cmara de Leiria na passada sexta-feira. Joaquim Correia justificou a sua posio cautelosa sobre o assunto, adiantando que o processo de reconverso do Convento de Santo Agostinho em museu se arrasta h oito anos. Quem estraga velho paga novo, disse Vtor Melcias, afirmando pretender que sejam repensadas as situaes onde j houve entregas apressadas de antigos edifcios das Misericrdias, ocupados at h bem pouco tempo por unidades hospitalares do Estado. Segundo Vtor Melcias, at aos anos 70, as Misericrdias dedicavam cerca de 90 por cento da sua actividade a aces na rea da Sade. A ocupao dos edifcios das Misericrdias por unidades hospitalares do Estado conduziu, nas ltimas dcadas, reconverso das reas de interveno das Misericrdias portuguesas, que tm actualmente na assistncia social o seu principal campo de aco. Ainda se podem queixar se quiserem ..., gracejou Raposo, aps entregar o documento. Foi por entre elogios unnimes dos deputados sua actuao e a manifesta disponibilidade para dela retirar os necessrios ensinamentos que Mrio Raposo deixou ontem o Parlamento onde foi formalizar e explicar as razes da sua renncia ao cargo de Provedor de Justia. Poucos minutos depois do final da reunio que, durante cerca de trs horas, juntou o Provedor demissionrio e os deputados da Comisso Parlamentar de Direitos, Liberdades e Garantias, era divulgado um comunicado onde se reala a actuao de elevado mrito do dr. Mrio Raposo, marcada por critrios de independncia na defesa dos direitos dos cidados perante a Administrao, dela se devendo retirar os necessrios ensinamentos para que sejam criadas as adequadas condies institucionais que permitam a normalidade e plenitude do exerccio das suas funes. Jean Alesi obteve ontem a pole position provisria para o Grande Prmio de Espanha, que se disputa amanh no circuito da Catalunha (Barcelona). O piloto francs foi o mais rpido na primeira sesso de treinos de qualificao, superando por 35 centsimos de segundo o seu companheiro de equipa Gerhard Berger. A Ferrari conseguiu assim colocar os seus dois carros na primeira linha da grelha provisria, um resultado que motiva toda a equipa. Depois do segundo e terceiro lugares no GP de San Marino, h duas semanas, e dos bons resultados conseguidos nos testes realizados nos dias seguintes, tambm no circuito de mola (Berger foi o mais rpido dos 16 pilotos que estiveram em pista, batendo Schumacher e Hill), a Ferrari mostra em Barcelona que a sua competitividade est a aumentar, reforando a ideia de que se poder bater em igualdade de circunstncias com a Benetton e a Williams, as duas equipas motorizadas pela Renault. O Gerhard tem razo, ele tem os ps bem assentes no cho. preciso no esquecer que estes so os treinos de sexta-feira, no os de sbado. 
Entretanto, Bernardo Vasconcelos, mdico do Benfica, confirmou ontem que, na sequncia do ocorrido no ltimo Benfica-Sporting, Joo Pinto ser operado, embora ainda no se conhea exactamente a extenso da leso. Como h dvidas se existem roturas totais ou parciais dos ligamentos, o futebolista far agora uma artroscopia e s depois se saber se h necessidade de fazer uma ligamentoplastia. Tambm no se sabe quando ou onde ocorrer a interveno cirrgica. informao reservada, a pedido do jogador. Um guineense e uma portuguesa que se dedicavam a angariar trabalhadores para o Kuwait, cobrando uma taxa de dez contos, foram detidos pela Polcia Judiciria. Assim, a Liga nem sequer pode comprovar que de facto est a ser feito um inqurito, sublinhou Pedro Vieira. Quanto aos 75 mil contos que a Secretaria de Estado diz agora ter disponibilizado para a operao de salvamento do peixe, ningum sabe onde, como e por quem foram utilizados, at porque das centenas de toneladas ali existentes apenas se salvaram cerca de duas dezenas. Cumprindo a promessa de se deslocar ao Porto na primeira tera-feira de cada ms, para reunies de trabalho com os responsveis autrquicos e os agentes culturais da cidade, o ministro recebeu ontem, entre outros, o director do Teatro Nacional S. Joo, Ricardo Pais, a responsvel do Teatro Rivoli, Isabel Alves Costa, e ainda representantes do Teatro Art'Imagem e do Museu de Imprensa, cuja primeira fase ser inaugurada pelo Presidente da Repblica na prxima sexta-feira. s j habituais crticas poltica cavaquista no domnio da cultura, que considerou economicista e frvola, Carrilho acrescentou alguns ataques ao actual lder do PSD. Neste ltimo ano, o que que o PSD props em matria cultural?, perguntou o ministro, aludindo ao primeiro aniversrio da liderana de Marcelo Rebelo de Sousa, que se celebrava mesma hora em Santa Maria da Feira. A Food and Drug Administration (FDA) -- a agncia americana que fiscaliza os medicamentos e os alimentos -- pode passar a regulamentar o tabaco, em virtude dos efeitos biolgicos desta substncia. Quem o decidiu foi o juiz William Osteen, na sexta-feira passada, naquela que foi considerada a mais devastadora derrota da indstria tabaqueira nos Estados Unidos. A deciso significa que vai vingar a inteno da FDA de proibir a venda de tabaco a menores, obrigando as lojas a pedir os bilhetes de identidade para o efeito. Ser tambm FDA que caber regulamentar, a partir de agora, as mquinas de venda de tabaco. Num pas em que -- ainda segundo estatsticas das Naes Unidas -- mais de metade da populao tem menos de 15 anos, foi estranho para muitos observadores que o nmero de eleitores potenciais se estimasse em 5,8 milhes, para uma populao total calculada em dez. Vrios ajustamentos tm sido feitos ao longo desta maratona, como no Bi, em que o nmero potencial de eleitores passou de 577 para 700 mil. Tambm em Benguela e no Cunene houve alteraes. O que se sabe, concretamente, que neste momento o nmero de portadores do carto de eleitor em Angola j ultrapassou os 4,3 milhes e o alargamento do prazo de registo at 10 de Agosto permitir recensear as pessoas que vivem nos lugares mais remotos do territrio. Esta situao demonstra que o tema ambiente est na moda. Quando a obra foi feita pela primeira vez, h uns cinco anos, no aconteceu nada ao responsvel. 
Agora que s se veio agravar um mal que j estava feito que as pessoas se interessaram, afirmou Joo Mendona, adiantando que o Posto 7 no tem nada que ver com a destruio da duna e esclarecendo que o bar no est a ocupar ilegalmente a praia -- como sups um funcionrio da Junta de Freguesia do Castelo, a que a praia do Moinho Baixo pertence. A licena de ocupao de domnio pblico martimo foi emitida pela DRARN de Lisboa e Vale do Tejo em Maio passado. O Governo Civil tambm autorizou o funcionamento do bar. Neste momento, a DRARN no est a conceder novas licenas de ocupao de domnio pblico, mas tem renovado licenas antigas, segundo Vitria Bruno da Costa. Isto porque vo ser aplicados Planos de Ordenamento da Orla Costeira (POOC), pelo que no vale a pena estar a autorizar investimentos que podem vir a estar em desconformidade com os POOC, explicou a directora. No cantinho da bancada central que estava habitvel havia algumas caras conhecidas. Todas do Benfica, claro, que divide muitos jogadores com o seu clube-satlite. Mrio Wilson foi o primeiro a chegar. Depois, sentaram-se o guarda-redes Veiga e o belga Lucien Huth, treinador dos guardies encarnados. O Alverca foi feliz e conseguiu o golo logo aos 4'. Foi na sequncia de uma jogada bonita de Ral, que subiu bem e deu para Ramirez. O mais novo reforo do Benfica -- do Alverca, no caso -- centrou bem e, na rea, Akw cabeceou para dentro da baliza. Se a primeira volta das eleies foi marcada por um inenarrvel caos organizativo, a segunda correu relativamente bem. As chuvas atrasaram as operaes em algumas regies. Mas o grande motivo do atraso no anncio dos resultados parecia ser a proximidade dos nmeros. Na segunda-feira noite, segundo os dados parciais da CNE, Nino tinha uma vantagem tangencial. a vontade de mudana dividiu ao meio o eleitorado, algo de impensvel meses atrs. Um tunnelier uma autntica fbrica debaixo do solo. composto por uma cabea escavadora, parecida com uma roda dentada, com pontas de carboneto de tungstnio. este material super-resistente que permite roer a rocha mdia de duas a trs voltas por minuto e avanar velocidade vertiginosa de 200 metros por semana. Em cada volta desta roda avana-se 10 centmetros e so trucidadas cinco toneladas de rocha. Depois da cabea vem uma autntica fbrica ambulante que tritura as pedras e as envia para o exterior, ao mesmo tempo que vai enfiando uns aros em beto armado, impermeveis e resistentes a uma presso de 200 bars, que revestem o tnel. O tempo da dinamite, p e picareta j acabou h muito. Estes monstros so comandados e controlados por meios electrnicos e informticos. Tinha que ser bonito, invocaram os responsveis pelo projecto Y, a nova arma compacta da Lancia para o fim do sculo, mais um dos refinados produtos do design italiano, ainda que sado dos esboos de um engenheiro. Maior que o seu antecessor, o Y10, o Y assume formas tridimensionais e curvilneas, sempre exclusivas e eternamente femininas. Esta cumplicidade de conversa constante entre vtimas da mesma sociedade -- na definio de Liberto Cruz -- doou ao teatro de marionetas o infortnio e o orgulho da marginalidade. Bonecos e bonecreiros foram desde sempre perseguidos e interditados, e sintomtico que o dramaturgo Antnio Jos da Silva, O Judeu, tivesse morrido na sacra fogueira da Inquisio -- condenado pelas peas para marionetas que escrevia e representava no seu Teatro do Bairro Alto. Os esbirros da Inquisio apareciam sempre aos pares, perturbando o Sculo das Luzes, a julgar pela Sala Antnio Jos da Silva do Museu da Marioneta, em Lisboa. 
Um par de presenas enlutadas fazem aqui as vezes de personagens das trevas, no meio de uma vitrina de bordados barrocos, azuis debruados e rendas de brilho transparente: os personagens das peas Vida do Glorioso D. Quixote de La Mancha e do Gordo Sancho Pana e Guerras do Alecrim e da Manjerona. Ambos os textos foram recuperados para a cena pela companhia Marionetas de So Loureno (seguindo textos do Judeu), cujos fundadores so tambm os responsveis pela constituio do Museu -- Helena Vaz e Jos Gil. Foi anteontem empossado o grupo de trabalho interministerial que vai preparar a articulao entre as bibliotecas escolares e as da rede de leitura pblica. Chefiada pela escritora Isabel Alada, a equipa, constituda por representantes dos ministrios da Cultura e da Educao, tem trs meses para analisar a situao e fazer propostas. A administrao pblica no tem capacidade do ponto de vista veterinrio para fiscalizar e punir os prevaricadores. Nos termos do novo regulamento, devem ainda ser destacadas algumas inovaes. Estabelece-se que no h lugar suspenso ou demisso do mdico veterinrio coordenador ou dos executores no decurso do programa sanitrio anual, a no ser por motivo de fora maior, devidamente justificado e aceite pela direco regional de agricultura, com posterior homologao pela Direco Geral de Veterinria. Consagra-se tambm o direito desta ltima efectuar visitas de inspeco e auditoria tcnica s OPP e exploraes nelas integradas, bem como aos produtores individuais, impor as correces tidas como necessrias ou propor medidas sancionatrias. Antes do relator-geral, falaram o cardeal Angelo Sodano, prefeito da Congregao para os Religiosos e os Institutos Seculares, e o arcebispo Jan Schotte, secretrio-geral do Snodo, que fizeram a sntese dos trabalhos preparatrios. Durante esta semana e parte da prxima, os cerca de 350 participantes -- bispos, superiores de congregaes religiosas masculinas e femininas, peritos e auditores -- estaro reunidos na aula sinodal, ou sesso plenria. As intervenes so feitas com base nos temas e contedos do documento de trabalho, elaborado precisamente em resultado do processo preparatrio. Depois desta fase, os participantes dividem-se em crculos menores, por grupos lingusticos, de modo a produzir sugestes para os documentos finais. Perdoa-nos, Erich Honecker era o ttulo de primeira pgina do antigo jornal do Partido Comunista sovitico, Pravda, segundo o qual os actuais dirigentes da Rssia teriam aberto um precedente para si prprios ao entregar o seu antigo amigo e camarada. A mulher de Honecker, Margot, abandonou ontem a embaixada chilena em Moscovo, tendo seguido directamente para Santiago do Chile, onde vive a sua filha Sonya, e no para Berlim como chegou a ser anunciado. Margot, a quem os alemes de leste chamavam de bruxa, foi ministra da Educao da ex-RDA e actualmente est a ser investigada por suspeita de ter forado vrios dissidentes polticos a entregarem os seus filhos para adopo. O professor Marcelo Rebelo de Sousa, ao exigir o referendo nacional, est a facilitar imenso a resoluo de to momentoso problema. No estamos esquecidos de que o eng Guterres fez da sua promessa da criao das regies administrativas uma das principais bandeiras da sua campanha eleitoral. Queremos que cumpra a promessa, mas de maneira digna e sensata. 
O tempo, quente e hmido, impediu a obteno de grandes marcas, mas no foi obstculo ao domnio fcil de Pinheiro sobre os seus mais directos adversrios -- Pedro Pessoa, do Bairro Santiago (31m31s), Lus Vieira, da AA Peniche-ptica 2000 (31m55s) e Carlos Almeida, da UR Dafundo (31m57s). Em senhoras, Paula Laneiro tambm no teve dificuldades em bater Umbelina Nunes, do Casal Privilgio (267 da geral e primeira veterana), e Lusa Almeida, do Cruz Quebradense (295). inicialmente, apareceu na lista de classificao como sendo Octvio Sousa, do Super Estrelas. Isto deve-se possivelmente ao facto de Pedro Pessoa ter feito a prova com dorsal trocado. Casos semelhantes a este so frequentes nas corridas de estrada que se efectuam em Portugal e dificultam e falseiam a elaborao das classificaes individuais e colectivas. Magalhes Mota, que se confessou muito perturbado ao tomar conhecimento da morte do seu grande amigo, evocou um encontro de ambos, h cerca de um ms, na Buchholz, livraria lisboeta que o professor Miller Guerra frequentava assiduamente. R. -- No. Peo-lhes para desligar e tentar nova ligao porque esto a invadir a privacidade do meu auscultador. P. -- Um amigo precisa de emprego e concorre sua empresa. Outra pessoa com melhor currculo candidata-se ao mesmo emprego. D o emprego ao seu amigo? P. -- O pay-tv chega tarde a Portugal? R. -- Chegou um pouco atrasado porque Portugal comeou mais tarde com a rede de cabo. Mas se tivermos em conta a nossa dimenso, ningum em to pouco tempo cobriu o pas como ns. Se estamos atrasados, um ano ou dois. O incidente de ontem grave mas no o primeiro. Vrias vezes, nos ltimos dois meses, afegos e iranianos trocaram acusaes sobre violaes do territrio ou do espao areo, alegaes nunca confirmadas por fontes independentes. Em Agosto, por exemplo, Teero anunciou uma troca de tiros, que ter sido efmera e com um nmero indeterminado de vtimas, sem conseguir no entanto provar uma agresso. Lus de Matos apresenta 45 minutos de magia com Pequenas Grandes Iluses. A msica do Chile pelo grupo Alturas . A nossa participada A. DE MATOS atingiu uma facturao de cerca de 1.150.000 contos, mais 48% do que no ano anterior. Na sua actividade tomou um peso importante a venda de produtos transformados, na linha de estratgia que fixmos. O mau tempo atrasou consideravelmente as operaes, disse Ajit Vardi, um dos responsveis da coordenao das operaes em Bombaim, capital do estado de Maharashtra. Entretanto, a ndia rompeu com a sua tradio, aceitando com gratido a ajuda material estrangeira. Recusou, contudo, as propostas dos governos estrangeiros de enviar para o pas equipas de especialistas. O Governo indiano considera que dispe de meios suficientes para realizar com sucesso as operaes de socorro, mostrando-se reticente, segundo um responsvel da distribuio da ajuda internacional, no que respeita a receber pessoal estrangeiro. Equipas propostas pela Rssia, ustria, Itlia, Alemanha e Sua estavam a postos para partir para a ndia. Mas o estado de alerta foi suspenso no sbado com a justificao de que j era demasiado tarde para encontrar sobreviventes e que a ndia no fizera um apelo oficial, considerando suficientes os seus prprios recursos. Em relao situao da Fonte da Telha e s demolies que ali j se efectuaram, Maral Pina afirma no saber explicar por que que parte do entulho se mantm no local, remetendo a explicao para o Ministrio do Ambiente. Ministrio que, alis, acusa de ser culpado de parte dos problemas que se vivem na Costa da Caparica. 
O vereador do Ambiente da Cmara Municipal de Almada, CMA, Jos Lus Leito, por seu turno, refere que a limpeza das matas que envolvem a zona da Fonte da Telha, que feita, dentro do possvel, pela autarquia, deveria ser competncia do Instituto de Conservao da Natureza, ICN, pois a rea est sob a sua alada, atravs da tutela do Ministrio do Ambiente. -- The Edge, realizado por um neozelands em Hollywood, Lee Tamahori, e baseado numa pea de David Mamet, acabou de estrear. Junta Anthony Hopkins e Alec Baldwin no Alaska, depois de um desastre de avio. Hopkins um bilionrio, e pensa que Baldwin, um fotgrafo de moda, est a dormir com a sua mulher, Elle MacPherson, top model a tentar que a sua carreira cinematogrfica pegue. Hopkins diz que, ao contrrio dos seus filmes anteriores, que eram to divertidos quanto ver tinta a secar e bons exerccios de disciplina e economia e tudo isso, este um filme do tipo que lhe apetece fazer agora -- aco. -- U-Turn, de Oliver Stone, com Sean Penn, tambm acabou de ser distribudo nos ecrs americanos. Penn tem um problema no seu carro que o obriga a parar numa pequena cidade de loucos no meio do Arizona. De l no vai conseguir sair. No vai ser um filme que obrigar Stone a pedir desculpa a dezenas de pessoas nas prximas dezenas de anos, como ele ainda hoje anda a fazer depois de J.F. K . e de Assassinos Natos. Os procedimentos de registo, complicados e morosos, devem s por si afugentar muitos ciganos, j de si com baixos nveis de alfabetizao. E os interessados tm de apresentar s autoridades checas documentos obtidos em trs reparties eslovacas diferentes. O objectivo da lei livrarmo-nos de habitantes inconvenientes, explica Michal Pulo, presidente da maior organizao de ciganos da Repblica Checa. Jorge Sampaio formaliza a sua candidatura Presidncia da Repblica, na Reitoria da Universidade Clssica de Lisboa. Terceiro aniversrio da assinatura do Tratado de Maastricht pelos ministros dos Negcios Estrangeiros e das Finanas da Comunidade Europeia. A soluo encontrada parece ser a do encaminhamento da populao escolar para os centros de sade, entregue aos cuidados do respectivo mdico de famlia, mas capacitando estes servios do seu papel crescente na sade escolar. Se no houver, de facto, uma substancial alterao na conduta dos centros de sade nesta rea, a sade escolar tender a ser o que j fora dos concelhos de Lisboa, Coimbra e Porto: a medicina curativa das urgncias hospitalares, a que os pais recorrem quando o filho est doente. Representantes das 60 famlias que ocupam ilegalmente casas na urbanizao do Vale de Arcena, delegados do Centro Regional de Segurana Social, responsveis da Cmara Municipal de Vila Franca de Xira e da empresa proprietria dos fogos, a Eurocapital, reuniram-se, na sexta-feira, naquela cidade, para discutirem a forma como ser feito o levantamento individual que determinar quais os moradores que ali podero continuar a residir no caso de terem disponveis as verbas pedidas por cada casa. Nesta avenida de luxo, cortada longitudinalmente por um canal, perfilam-se, de um lado, os bancos (alm dos alemes, cerca de cem bancos estrangeiros operam na capital da Vesteflia); do outro, os estabelecimentos comerciais -- Cartier, Armani, Gucci, Van Cleef, etc. Ourivesarias que expem nas suas montras conjuntos de pulseiras e anis com preos superiores a meia dzia de milhares de contos, casas de moda com casacos de homem a custar mais de duas centenas de contos.
Cheira a dinheiro em Dsseldorf, uma cidade em que, mesmo nas horas de ponta (e apesar do trfico intenso), as nicas bichas so as dos imigrantes turcos junto ao seu consulado. Uma cidade onde at h carros de lixo com monitores de televiso em vez de espelhos retrovisores. Jos Gregrio (Grego) o novo campeo nacional de surf, ttulo conquistado durante a II Semana Radical da Beira Litoral, que terminou no ltimo sbado na Figueira da Foz. O surfista da APSSOC ganhou a finalssima, onde estiveram os oito primeiros no conjunto das duas mangas. A APSSOC, com trs atletas nos quatro primeiro lugares, a campe nacional de clubes. Entretanto, est a disputar-se a Taa de Portugal, competio por equipas, na praia Grande, at o dia 12 de Agosto. Paulo Silva (Fluorescente) foi o vencedor do Campeonato Nacional de bodyboard, que tambm se realizou na Figueira da Foz. Apesar do quinto lugar nas duas mangas, Paulo Silva, da Quinta dos Lombos, venceu a finalssima. O Surfing Clube de Portugal, de So Pedro do Estoril, venceu por equipas. Na praia Grande, j terminou a Taa de Portugal de bodyboard, disputada por equipas e a vitria coube ao Surfing Clube da Caparica. Subir ao monte Garizim, no primeiro dia de neve do ano, quando as pedras e os abetos se escondem sob um manto de neve, ascender ao paraso, to grande a beleza celestial da paisagem. Mas Shalom Cohen quer descer do cume da montanha dos bem-aventurados para um lugar mais terreno. O seu desejo sentar-se no Conselho Palestiniano, onde Yasser Arafat reservou uma cadeira para um dos trs candidatos do mais pequeno e mais antigo povo do mundo -- os samaritanos. So cerca de 600 almas, 284 a viver em Nablus, a maior cidade da Cisjordnia, e 297 a residir em Holon, prximo de Telavive. Arafat sempre defendeu um Estado multitnico e, ao apoiar os descendentes de uma das tribos de Israel, a de Levi, filho de Jacob, ele mostra que tambm os judeus sero bem-vindos numa futura Palestina. Alm disso, ao proteger os samaritanos de Nablus, mostra que vai mais longe do que o Estado hebraico, que no d aos de Holon a oportunidade de estarem representados no Knesset (parlamento). A Espanha vai investir at ao fim do ano 67,6 mil milhes de pesetas em infra-estruturas ferrovirias. Mas, aps o esforo destinado ao Sul, com a linha de alta velocidade Madrid-Sevilha, em 1992, as prioridades de 1995 apontam para o Leste e Norte. O chamado corredor mediterrneo (Valencia-Barcelona) e a linha de alta velocidade Barcelona-Narbonne representam metade do investimento total em infra-estruturas de transporte ferrovirio. A UNITA declarou ontem que o seu lder, Jonas Savimbi, no aceitar o cargo de vice-presidente que lhe proposto nos acordos de paz de Angola, preferindo viabilizar um Governo de unidade nacional desempenhando o papel de lder de uma oposio leal, com direito a ser consultado pelo Presidente sobre todas as questes nacionais. As pessoas no MPLA temem que a UNITA queira romper o processo, estando deliberadamente a provocar atrasos at Novembro, data em que expira o mandato do Governo, disse Reuter o chefe dos negociadores da UNITA em Luanda, Isaas Samakuva, explicando que tinha entregue ao Governo e ONU um documento com propostas concretas. Mas os atrasos correspondem s dificuldades que temos enfrentado, acrescentou Samakuva. O Governo tem capitalizado sobre o que ns no fizemos, o que s contribuiu para enfraquecer a liderana da UNITA, dificultando-lhe o cumprimento das tarefas restantes. 
A segurana a antecipao a todos os nveis para ver, ao longe, e ser-se capaz de tudo prever -- um mal-estar, uma sacanice dos outros. No conheo um s piloto que no conduza sua velocidade de segurana. Segundo o artigo 1 do Cdigo da Estrada, aquela que permite ter o domnio da viatura e de parar ao mnimo impondervel. Baseado na sua experincia e em testes realizados sob controlo mdico, Jean-Pierre Beltoise afirma que a maior parte dos condutores no tem a menor necessidade de parar de duas em duas horas. Antes pelo contrrio: uma paragem maadora pode fazer baixar a vigilncia. Em contrapartida, se se sentir com dores nas costas ou se um enorme desejo de ir apanhar malmequeres o estiverem a massacrar, uma pausa relanar a energia. Uma cochilada? Tudo bem, se for capaz de passar em poucos minutos da viglia ao sono e vice-versa. Especialistas em luta anti-terrorista e no complexo puzzle de grupos e grupsculos afegos identificam partida duas organizaes como as mais susceptveis de fornecerem treino a jovens idos de outros pontos do mundo. E isto porque possuem um conceito internacionalista da guerra santa contra o Ocidente. o outro o Markaz Al-Dawat, organizao extremista wahabita. Os lees de Faro e de Lisboa lutaram ontem muito no Estdio de So Luiz, mas no conseguiram marcar. Os da casa esto ainda muito longe de uma boa forma fsica e a partir dos 25 minutos do primeiro tempo refugiou-se quase sempre na defesa. O Sporting dominou, alguns jogadores voltaram a mostrar pormenores interessantes, mas faltou-lhes inteligncia no futebol ofensivo. Um empate que acaba por ser justo, para duas equipas que precisam ainda de ser muito trabalhadas. O Sporting iniciou o jogo com o habitual 4x4x2 de Octvio, mas com algumas novidades em relao temporada passada. Saber ocupou a direita da defesa, enquanto l mais na frente Yordanov jogou na esquerda do ataque, cabendo a Hadjy alinhar no meio no apoio ao avanado Leandro, ficando a direita a cargo de Pedro Barbosa. Lang subiu muito no terreno, ficando Oceano sozinho no apoio defesa, facto que valorizou o futebol ofensivo dos lees. Alberto Joo Jardim foi a Porto Santo para conferir posse ao delegado do Governo Regional na ilha, Jos Rosado. O dirigente do PSD da Madeira aproveitou a ocasio para dizer que, at 1985, no houve qualquer desenvolvimento empenhado (no arquiplago) por iniciativa de Lisboa. Para Alberto Joo Jardim, s a autonomia poltica da Madeira e de Porto Santo juntos permitiu grandes saltos nos ltimos anos2 e seria suicdio para Porto Santo se essa solidariedade e unidade fossem quebradas. O chefe do Governo regional fez apelo ao investimento privado para assinalar que ao sector pblico no cabe resolver todos os problemas da ilha. Joo Alberto da Rocha Pris, um dos dezanove diplomatas despromovidos pelo recente Acordo do Supremo Tribunal Administrativo, que ps em causa as promoes efectuadas em 1987, apresenta hoje as credenciais ao Presidente angolano Jos Eduardo dos Santos. O novo embaixador de Portugal em Luanda tem 46 anos, iniciou a carreira diplomtica em 1969 e desempenhava as funes de director-geral para a Cooperao. No que o sbio das matrizes encontrou, no PSD, entusiastas seguidores? A morgue do Hospital de Beja foi privatizada! J s falta a Paz eterna ... Ainda por cima em Beja, um dos beros do colectivismo em Portugal! aposto que ele no partilharia os mtodos seguidos pelos seus discpulos do Baixo Alentejo e era bem capaz de organizar concursos pblicos abertos ... O aval de 600 mil contos do Governo UGT foi posteriormente abordado por Cunha Rodrigues. 
O procurador adiantou que j lhe foi entregue um parecer sobre o caso, mas s dever pronunciar-se dentro de 15 dias. Foi peremptrio, no entanto, ao afirmar que, se o Supremo Tribunal Administrativo declarar a anulao do emprstimo, a central sindical tem de devolver o dinheiro, mesmo que este j tenha sido gasto. A seleco nacional de futebol e as eventuais agresses a prostitutas durante o estgio efectuado antes do jogo com a Irlanda, no final de 1995, tambm foi abordada. Cunha Rodrigues diz que tudo est nas mos do procurador do Tribunal de Cascais, de quem aguarda um relatrio para breve. A extrema direita fez uma campanha eleitoral moderna, servindo-se do processo direct mailing americano enviando os seus panfletos xenfobos, do tipo Hamburgo tem de permanecer um estado alemo, pelo correio, em especial a milhares de jovens eleitores. E, por ironia do destino, fazem-no a coberto da lei eleitoral hamburguesa que autoriza os servios municipalizados a fornecerem as moradas de qualquer habitante, por cinco marcos cada. Esta legislao, altamente contestada pelos servios de proteco de dados, no foi alterada a tempo pelas autoridades da cidade. Uma das coisas que mais me impressionaram foi a desertificao do pas no interior. E todos os esforos que tm sido feitos no tm dado resultado, disse o Presidente, que desculpou a actuao dos seus governos ao afirmar que hoje temos melhores condies e melhores tcnicos do que tnhamos. Valente de Oliveira, que falou antes do Presidente, foi hbil ao fazer uma interveno que quase esvaziava o discurso de Soares. mas estes problemas esto a ser resolvidos. Quando Borges morreu, a criada apareceu com outro testamento, que era o que desaparecera e que ele tinha procurado antes de partir para Genebra. Por isso, Borges fez novo testamento. Um e outro dizem a mesma coisa, no h qualquer problema jurdico quanto a isso, a nica diferena que, no primeiro, ele deixa uma determinada soma a Fanny e, no segundo, essa soma um pouco menor. Voc fala constantemente daqueles que se dizem amigos de Borges, dos que lhe vinham pedir que escrevesse os seus prlogos, e garante que a esto a acossar. Quem so? Os primeiros judeus srios autorizados a emigrar pelo Presidente Assad j chegaram aos EUA. Em Damasco, outros querem seguir o exemplo, aproveitando este novo privilgio que amanh lhes pode ser negado. Tchernomirdin est a cozinhar um Governo com o Parlamento. Sabe que se no chegar a acordo, chumbado. A Rssia caminha para uma coligao. A maioria comunista ter uma palavra a dizer. A nata dos reformistas foi excluda ou auto-excluiu-se. Os mercados inquietam-se. Ieltsin garantiu a Clinton que as reformas sero uma prioridade. Mas o Presidente est cercado por pedidos de renncia. O Parlamento Europeu condenou ontem a deciso anunciada pela multinacional Seagate de encerrar a fbrica de Palmela, que ir provocar o despedimento de 850 trabalhadores. A resoluo aprovada pelos deputados europeus foi apresentada pelo grupo da coligao de esquerda e subscrita pelos trs deputados comunistas, Barros Moura, Miranda da Silva e Srgio Ribeiro. Considerando que a fbrica est tecnologicamente bem apretechada, com elevados nveis de produtividade e com uma excelente situao financeira, os parlamentares insistiram na necessidade da Comunidade e dos estados-membros exigirem aos parceiros comerciais, respeito das condies sociais mnimas, tendo por base as convenes e recomendaes da Organizao Internacional do Trabalho (OIT). 
A Frana est em vias de tomar medidas para a concretizao de um embargo comercial ao Haiti, tendo j pedido aos seus parceiros comunitrios que acompanhem Paris na tomada destas medidas. Um porta-voz do ministrio francs precisou ainda que o Governo est em vias de congelar todos os bens pblicos haitianos. O ministro dos Negcios Estrangeiros, Roland Dumas anunciou ontem, que a Frana vai aplicar as decises tomadas pela Organizao dos Estados Americanos (OEA). Decises estas referentes imposio de um embargo comercial, at que o presidente do Haiti, Jean-Bertand Aristide volte ao poder. a msica verbal parece pedir a correspondncia com a msica do violoncelo, e uma e a outra so associadas a ideias e sentimentos de runa e ruptura, embora a sua produo e perfeio impliquem, talvez, um alvio e uma catarse. e a sua forma, ou a sonoridade que produz, expressa por imagens bem visveis ou concretas: pontes, arcos, arcadas, balastres; rio, caudais, sorvedouro; barcos, lemes, mastros; urnas, blocos de gelo. 6 Alis, h no poema uma aluso explcita cor (brancos os arcos, para no falar na implcita dos alabastros e dos blocos de gelos) e vrias imagens cinticas: de que esvoaam, por baixo passam, se despedaam, caudais de choro, trmulos astros ... O rond, ou rondel (como lhe chamou o poeta), Viola chinesa imita e fala de um som lento, montono, fastidioso, idntico ao de uma parlenda ou de uma lengalenga, que todavia repercute subitamente na conscincia do enunciador, ou no seu corao, tornando visvel uma sua cicatriz melindrosa e permitindo a distenso das suas asitas. A relao cambial entre o escudo e a peseta no tem constitudo matria pacfica nos ltimos meses. Por um lado esto os empresrios, defensores de uma maior estabilidade cambial entre as duas divisas. Opinio diferente tm por vezes as autoridades monetrias nacionais, empenhadas em fazer passar a ideia de que o escudo no tem sempre de seguir a evoluo da moeda espanhola. Mas os agentes econmicos e o mercado consideram que h uma ligao inevitvel entre as duas moedas, enquanto o Banco de Portugal e o Ministrio das Finanas vo reafirmando a autonomia cambial do escudo, embora na prtica sigam parcialmente a peseta. Este tema alvo de uma anlise no ltimo relatrio da SAER -- Sociedade de Avaliao de Empresas e Risco, entidade ligada ao ex-ministro das Finanas Ernni Lopes. Para a SAER, a recente apreciao do escudo face peseta poder no ter sido a deciso mais correcta. Isto porque, nestas condies, e atendendo ainda a que Portugal e Espanha produzem e vendem, grosso modo, a mesma gama de bens e servios, deixar o escudo apreciar-se relativamente peseta reduz a capacidade concorrencial das nossas empresas no pas, em Espanha e em terceiros mercados, ao mesmo tempo que facilita a penetrao das exportaes espanholas no mercado nacional e em mercados nossos clientes. Os principais opositores do Presidente russo, Boris Ieltsin, vo reunir-se hoje em Moscovo para definirem uma estratgia que os leve ao poder. Foi pelo menos essa a inteno do ex-vice-Presidente Aleksandr Rutskoi e do lder do Partido Comunista, Guennadi Ziuganov, quando convocaram o encontro que vai juntar, no centro parlamentar em Moscovo, a oposio nacional-comunista e as foras conservadoras que recusaram assinar, em Abril, o pacto de paz civil de Ieltsin. A peseta continuou no topo do mecanismo de taxas de cmbio do SME onde a divisa portuguesa ocupa agora o terceiro lugar. O Banco de Portugal no interveio no mercado interbancrio. A taxa mdia do overnight desceu para 4,949 por cento. 
No mercado da dvida pblica foram colocados 15 milhes de contos de Bilhetes do Tesouro a 182 dias, taxa mdia ponderada de 4,4593 por cento, contra 4,4887 conseguidos no ltimo leilo da mesma maturidade, realizado a 15 de Janeiro. A criao da comisso de inqurito da AMS foi motivada por uma denncia de moradores de So Marcos, freguesia de Agualva-Cacm, de alteraes efectuadas nas cartas de ordenamento do PDM, aps a sua aprovao pelos deputados municipais, em que espaos antes urbanos passaram a zonas industriais. Funcionrios da fiscalizao municipal foram tambm acusados de aconselharem residentes a venderem as suas propriedades. Na proposta de relatrio elaborada pelo relator, Silvino Teixeira (CDU), constata-se que desde 1982, incio da elaborao do PDM, at Maro de 1993, a autarquia esteve totalmente alheada da gnese, definio, estudo e acompanhamento do plano. Mas a situao no melhorou aps a aprovao pela AMS, em Abril de 1994, pois as linhas mestras do ordenamento do concelho no foram enviadas para ratificao governamental, como a presidente anunciara, mas tero ficado nas gavetas ou prateleiras da Cmara. Mas para alm dos temas desse lbum, e em particular da verso minimalista de All apologies, um original de Kurt Cobain que Sinad adoptou j depois da morte do lder dos Nirvana, espera-se ainda a interpretao de algumas das outras canes que a tornaram notada, em particular as do lbum I Do Not Want What I Haven't Got (que continha Nothing Compares 2 U), j que as do trabalho que se seguiu, Am I Not Your Girl?, constitudo apenas por verses de velhos clssicos da msica popular acompanhados por orquestra, devero ficar de fora, por razes bvias. Num pas com mais de sete mil ilhas e onde os golpes de Estado se foram sucedendo quase mensalmente nos anos 80 e onde a guerrilha fundamentalista islmica avana na zona meridional do arquiplago, os incidentes de ontem permitiram comisso de eleies dizer que o dia foi relativamente calmo. Os cinco mortos e 26 feridos foram vtimas aparentes da guerrilha independentista muulmana, que atacou com fogo de morteiro vrias localidades nas ilhas de Mindanao, Sulu e Jolo, de maioria muulmana. Nas eleies locais de 1988, ocorreram 149 mortos, e nas presidenciais de 1992 o balano foi de 63 mortos. No Porto, a gare do aeroporto de Pedras Rubras era ontem pequena demais para albergar a confuso de gente que a inundou. Havia bichas para tudo, desde a tabacaria at aos bares, onde as provises mostravam uma perigosa tendncia para se esgotar. As pessoas viam-se obrigadas a improvisar lugares para esperar os avies que se iam atrasando. Atrasos que, segundo as informaes recolhidas junto dos balces da TAP e da ANA, no se deviam greve dos pilotos, mas ao elevado volume de trfego e a um elemento que tem mais poder sobre os cus que os pilotos: o nevoeiro. Os passageiros que se preparavam para partir, na sua esmagadora maioria, para frias no se mostravam muito preocupados com os atrasos que nalguns casos ultrapassavam as trs horas. E quanto to falada questo de segurana, pelo menos Andreia Silva, de partida para Dublin, garantia que no havia meio de transporte mais seguro e aludia ao acidente da madrugada na portagem dos Carvalhos para exemplificar como viajar por terra mais perigoso do que pelo ar. Mesmo o Gil, a mascote da EXPO 98, de braos abertos junto zona das partidas, parecia concordar com ela. 
Sem os necessrios e competentes requisitos tcnicos e a adequada experincia para o exerccio daquele complexo e difcil cargo, dadas as suas exguas habilitaes, e sem um mnimo de conhecimentos de gesto, este mesmo senhor tem vindo a gerir a dita instituio de maneira desastrada, discricionria e inbil, transformando mesmo aquela jia do patrimnio duriense numa casa arruinada! confrangedor olharmos hoje em dia para os meios de comunicao social e verificarmos o caos financeiro em que se encontra atolada aquela casa de to nobres tradies. So notcias de penhoras, pelos tribunais, execues fiscais, montes de dvidas banca e Segurana Social, esbanjamento de vrios milhes em negcios pouco claros com a Real Companhia Velha, etc. etc. Uma gesto calamitosa e incrivelmente ruinosa que os vinicultores durienses deveriam responsabilizar criminalmente! Mas o que mais espantoso no meio de tudo isto que toda esta baguna provocada pelo senhor Mesquita Montes tem contado com a complacncia dos sucessivos governos, nomeadamente os seus vrios ministros e secretrios de Estado da tutela! Tem contado tambm com o completo alheamento do Parlamento, que to pressuroso se mostra, por vezes, a discutir assuntos de lana caprina e nada faz para esclarecer este monstruoso escndalo! E o mau exemplo dos Estados Unidos impede que a maior potncia militar possa herdar a tarefa de controlar os conflitos que persistem ou estejam a caminho. A partir de agora, as guerras como a de Angola ou da Bsnia so praticamente impossveis de travar de fora e qualquer interveno -- cirrgica ou no, em nome da ONU ou no -- acarreta um grau de incerteza to grande que, ao ser accionada, no s pode eternizar os fogos que se pretendiam extinguir como ainda desencadear incndios em zonas dantes poupadas. Foi j em tom de festa que a Comisso Central da Queima das Fitas de Coimbra anunciou o pr-programa da grande celebrao dos estudantes deste ano, que ter pela primeira vez uma mascote: um morcego de ar matreiro, que em sete poses diferentes retrata o estudante ao longo dos sete longos dias da Queima. a Comisso Central continua a contar com a participao da Seco de Fado, como exige o regulamento, afirmando desconhecer oficialmente a deciso daquela de no participar no programa da Queima das Fitas/96. Tudo por causa da atribuio a outro grupo acadmico, a Fan-Farra, de uma actividade que os Fados consideram sua: o Festival de Tunas (ver PBLICO de 23/2). Nuno Guerra, presidente da Comisso Central, minimizou o diferendo entre as duas estruturas e afirmou que a linha orientadora trabalhar com a Seco de Fado, pelo que o convite vai ser feito, como hbito. S se o Fado no aceitar que a Comisso assume a realizao dos dois eventos. Problema ser conseguir fazer uma Serenata Monumental sem os fadistas da Academia, que entretanto prometem fazer uma serenata paralela, no mesmo dia e mesma hora, e em local onde os estudantes no podero ignorar. No entanto, aps esta corrida ao dlar durante a primeira metade da semana, o mercado optou por uma posio mais defensiva que provocou um ligeiro recuo do dlar face generalidade das restantes divisas. Os factores que contriburam fortemente para este movimento de correco foram, por um lado, os receios de intervenes por parte da Reserva Federal travando a rpida apreciao da paridade dlar / iene e, por outro lado, o facto de o Bundesbank ter deixado uma vez mais as taxas de desconto e lombarda inalteradas respectivamente nos 5.75 por cento . e 6.75 por cento. 
LLoyd Bentsen, secretrio do Tesouro norte-americano, afirmou, num discurso proferido na quinta-feira, que um iene fraco no soluo aceitvel para os problemas econmicos no Japo e insistiu na necessidade urgente de um estmulo efectivo que atenue o enorme excedente comercial japons. Foi com duras crticas ao presidente da Cmara de Vila Real, Manuel Martins, que o ex-nmero dois desta autarquia, Caseiro Marques (PSD), renunciou esta semana ao seu mandato. Na sua opinio, Manuel Martins, que tambm lder da Concelhia do PSD local e que agora se apresenta novamente como candidato Cmara, afunilou o PSD a nvel local e no est altura de conduzir este partido para o futuro. At concluso deste complicado processo, a CGD dever continuar a suspender a execuo das dvidas, excepto para as situaes de abandono. Este acordo surge aps anos de presses polticas e sociais para resolver a situao irregular das 234 famlias que podiam ficar na rua, uma vez que a Caixa Geral de Depsitos tinha iniciado autos de penhora. A Urbanizao da Bela Vista foi construda no incio dos anos 80, no mbito de um contrato de desenvolvimento de habitao celebrado entre o antigo Fundo de Fomento da Habitao (actual IGAPHE), a Caixa Geral de Depsitos e a empresa construtora Amadeu Gaudncio, tendo na altura a Cmara Municipal do Montijo favorecido a aquisio de um terreno e procedido iseno de taxas por ser um contrato de habitao social com custos limitados. O processo de execuo fiscal vai passar a ser exclusivamente aplicado na cobrana de dvidas ao Estado e a outras pessoas de direito pblico, de acordo com o decreto-lei 241/93 de 8/7/93. O fim desta norma evitar a execuo fiscal de dvidas de organismos sujeitos a um regime de gesto privada, como a Caixa Geral de Depsitos. Mas no ser aplicvel aos processos pendentes. Relativamente ao FEF do presente ano (um bolo global de 253 milhes de contos), as autarquias portuguesas devero, assim, receber no prximo ano mais quase 18 milhes de contos. Um crescimento que, todavia, no deixa a ANMP satisfeita. Lamentamos profundamente que as verbas para as autarquias voltem a ser atribudas em funo de uma lei desadequada e injusta, mas infelizmente os partidos no se entenderam quanto promulgao de uma lei que cumpra a Constituio da Repblica e que promova uma distribuio equitativa dos dinheiros do Estado, criticou Mrio de Almeida. Pelo seu lado, Jos Augusto de Carvalho preferiu sublinhar que o aumento supera em muito o valor previsto para a inflao no prximo ano (dois por cento), acrescentando que o actual Governo tem cumprido a Lei das Finanas Locais em vigor. A Associao Nacional de Municpios Portugueses pode ter outra leitura, mas esta realidade inequvoca, acrescentou. Das vsperas da institucionalizao do Estado Novo queda da cadeira que afastou Salazar, meio milhar de cartas trocadas entre o ditador e o seu delfim, que ilumina zonas vitais de um longo e complexo relacionamento que foi feito de solidariedade e de conflitualidade, de intensidade e de distanciao, de sintonia e de divergncia. Com prvio enquadramento histrico. Nos pavilhes da FIL, os visitantes podero apreciar a variedade da oferta de embarcaes a motor -- a lancha Campion Marine, de 18 ps, por 3200 contos; a Status 180, um arrojado exemplar de 18 ps, com atrelado, a 4730 contos; ou a sofisticada linha italiana Cranchi, especialmente a de 36 ps e a Azimute, o mais luxuoso fly bridge do mundo, que custa 50 mil contos. 
Entre os extremos, v-se tambm uma coleco de embarcaes da tradicional Bayliner, a refinada Caravelle, Celebrity, Searay, Argus, Chaparral, Rinker, Fairline, Gobbi, Sealine, Nimbus e Cobalt, a ltima considerada o Rolls Royce dos mares. Como atraces extra, apresenta-se o Balt, um pequeno barco de 5,30 metros, com motor de 50 cavalos, vocacionado para o uso em rios, por 3679 contos, e a Cobia, uma linha de barcos para pesca com consola central. Nos veleiros, as opes no so tantas, sendo a principal novidade representada pelo Jeanneau 24, de 24 ps, que se pretende organizar em classe nacional para futuros campeonatos, alm dos populares e prticos veleiros McGregor e dos conhecidos GibSea, barcos de cruzeiro seguros e confortveis. Em termos de navegao, h uma oferta vasta e actual de equipamentos de convs e electrnicos, especialmente sondas (com um novo modelo que permite a visualizao do fundo, alm da proa, o Echopilot Incastec) e GPS (Global Positioning System). Desde o Vero de 1993 mais de sete mil xiitas iraquianos fugiram para o Iro, atravessando imensos lamaais, para escapar s perseguies e ataques do Exrcito do presidente Saddam Hussein. Calcula-se que mais de 20 mil pessoas tenham abandonado as suas vilas e aldeias e se dirijam para reas prximas da fronteira. J em territrio iraquiano contam -- em relatos que tm surgido nos ltimos meses, sobretudo na imprensa britnica -- o pesadelo que deixaram para trs: quilmetros e quilmetros de terrenos lamacentos onde anteriormente existiam pntanos, rios, lagos, culturas agrcolas; aldeias destrudas, populaes massacradas, animais e plantas mortos em consequncia do que se suspeita serem ataques com armas qumicas. As bases de dados piratas existentes em Portugal dedicam-se afixao e difuso de cpias ilegais de programas sujeitos a proteco, de resultados de experincias de penetrao em sistemas informticos, de informao acerca de proteces, de portas de acesso e de passwords de entrada nas redes Telenet, Telepac, Itapac e, em alguns casos, nmeros de cartes de crdito. Algumas das chamadas portas de entrada vendidas eram da Lusa, do Laboratrio Nacional de Engenharia Civil (LNEC) e da empresa Aeroportos e Navegao Area (ANA). A heterogeneidade dos alunos que nos dias de hoje chegam escola determina diferentes atitudes e, obviamente, diferentes tipos de indisciplinas. Durante o debate foram claramente distinguidos dois deles: aquele que tpico do ensino preparatrio e das classes baixas e aquele que se verifica mais no secundrio e que protagonizado em conjunto por jovens das classes mdias-altas, que se sentem protegidos pela sua origem. Momentos antes desta verificao, Albino Esteves, professor da Secundria de Clara de Resende, enunciava medidas preventivas para a indisciplina na escola: o olhar reprovador do professor, o silncio tctico, a sobrevalorizao dos pequenos sucessos dos alunos mais fracos e o acompanhamento discreto das crises dos alunos. De um memorando de Jorge Ferreira sobre a eleio anual dos lderes dos grupos parlamentares. Poder dar alguns exemplos concretos do programa que tinha apresentado? No foi verdade, por exemplo, que os prprios livreiros se manifestaram contra a realizao de concertos e debates, considerando que isso era desmobilizador dos potenciais compradores? Isso no pode ser dito fora de contexto. fizemos um palco suspenso e as pessoas que convidmos tambm eram um chamariz para o imenso pblico que acorreu ao local.
They protested, with some reason, because the concerts were planned late, given that the APEL elections came very close to the date of the fair. But people ended up admitting that we had done our best. At a meeting at the end of the fair we apologised for what had happened and were forgiven and applauded, because it was considered an interesting initiative. At the time I made a commitment that this year I would be open to dialogue with people, prepare the fair in good time and take note of a whole set of criticisms. But I thought that model should be kept in 1996. For example, one of my proposals included a concert with Mário Laginha and Pedro Burmester, and also bringing in Eunice Muñoz ... The European Commissioner for the audiovisual sector appealed the day before yesterday for a change of attitude in the European industry, so that it can adapt to the worldwide globalisation of distribution networks. Preparing a Green Paper on European policy for the sector, João de Deus Pinheiro, speaking at a seminar on the media, argued that the European Union should encourage the development of new and more effective trans-European communication networks. TVI-Televisão Independente and TDF-Telediffusion de France have founded a broadcasting company to operate in Portugal, RETI, which will have its registered office in Lisbon. The agreement was signed at the beginning of the month; in the new company TVI holds 55% of the capital and TDF 45%, and the capital may later take in further Portuguese shareholdings. Coverage of 80% of Portuguese territory is planned, through 19 repeater stations. Associations exist to provide services to the business community and to serve the aims of entrepreneurs. But if you ask me whether Lisbon's exhibition park will be more profitable than Porto's, I personally will tell you that I am convinced it will. First of all because of the great influx of the public ... The name says it all. The name of the record, and the name of the group that signs it. The complete collection of the Pet Shop Boys' singles is one of those releases that the avid pop music consumer cannot help but welcome with open arms. Just imagine: 18 songs of pure pop juice, with no staleness and no preservatives. If the format is synthetic -- both in the song formula the group uses and in the sound that wraps around it -- in this case the insult carries only positive connotations. Because Neil Tennant and Chris Lowe do indeed have an extraordinary ability to surprise and to please. Because they know how to be concise without false simplicity, because they are intelligent without making a great fuss, because they manage to be ironic and to parody themselves without cutting a sorry figure. "The struggle belongs to everyone; what harms my colleagues harms me," said Maria Teresa Feio, a nursery teacher from Albergaria, as she prepared to take part in the demonstration organised by the Federação Nacional de Professores (Fenprof) yesterday in Lisbon. The 1992 America's Cup is over; long live the 1995 edition. The Americans managed to keep hold of the trophy, but the challengers promise to return stronger, three years from now, to San Diego Bay. The Cubans may simply not believe that the United States will hold them at Guantanamo forever, or send them on to other countries, and in that case the exodus will continue. The immediate spur for the change of policy was the fear that Florida would be swamped by a wave of refugees -- a possibility that would carry serious political dangers. Jimmy Carter's inability to contain the wave of boats coming from Mariel in 1980 contributed to his defeat by Ronald Reagan. Clinton, whose foreign policy has a poor reputation, would certainly be left more vulnerable to criticism.
It is clear that, from this perspective, the Spanish authorities will do everything they can to join the leading group, a scenario undoubtedly more favourable to Portugal. But things in Spain are not at all easy. The economy's slide is deepening: one in five Spaniards is out of work. The Socialist Government expected that threshold to be reached only at the end of this year, after a sharp squeeze on the economy, but, surprisingly, it has already been hit. With things in this state, strong market pressure on the peseta set in. The Bank of Spain was called on to intervene, and rumours began that the peseta might follow the path of the pound and the lira and leave the EMS. One can imagine what might then happen to the escudo, knowing, as we do, that the experts of the Monetary Committee are advising a substantial devaluation of the national currency. Since the climate in France is also one of uncertainty, aggravated by the elections scheduled for the middle of next month, Europe faces a precarious period in the coming weeks. So precarious that, if another currency storm blows up, the EMS may quite simply come to an end and give way to that much-discussed EMU, accelerated for the strongest economies and slower for those that still have a great deal of converging to do. Each unit is a unit. The process of desire and its impossible mission. By handing a post with ministerial rank to a representative of a party historically averse to political compromise, Itamar Franco opened an unprecedented crisis in the Partido dos Trabalhadores (PT). "I bowed to the President's reasons," said Luiza Erundina, who defied both Lula's public veto on the PT taking part in the Government and the party itself, which had already officially declared its opposition to Itamar Franco. Elected in 1988 to succeed former president Jânio Quadros at the head of the São Paulo city government, Luiza Erundina, 58, managed Brazil's third-largest budget until the end of last year. "I have been in opposition all my life. At times I was even radical and sectarian, but I learned that you only make a difference if you are inside the boat," the former mayor told the São Paulo press, justifying her present decision. The world is becoming friendlier to women, but boardrooms are not. Fewer than three per cent of Western companies are run by women, according to a study by the International Labour Organisation (ILO) released yesterday in Geneva. Craveiro Lopes, a reporter for Rádio Elmo in Pinhel, was sentenced in court to 14 months in prison and a 60-day fine for having criticised the conduct of the PSP, the Lusa agency reported yesterday. The sentence was suspended for 28 months, in view of the defendant's conduct prior to the events and the fact that he showed some remorse for having committed them. The reporter was prosecuted because, in commentaries broadcast in July 1992, he had criticised the commander of the PSP post, sub-chief Amadeu Eiras, and officer António Soares, criticism the Public Prosecutor's Office deemed defamatory. According to the case file, Craveiro Lopes had been detained months earlier by those officers over acts potentially amounting to a criminal offence, after which he is reported to have said on air that it was inconceivable that power should be placed in the hands of men without any scruples. In court, the reporter stated that he had had no intention of defaming, only of informing, and had not sought to attack the honour and reputation of those concerned.
The court held, however, that Craveiro Lopes's statements are objectively and subjectively defamatory and liable to harm, if not the very essence of these men's personality, at least the store of good name, credit and trust they had built up as individuals, citizens and officers of the law in a small community where everyone knows everyone. Speaking to Lusa, Craveiro Lopes said that he had merely made public situations of persecution that still persist. "If I had to do and say today what I said then, I would not hesitate." The president of the Associação Portuguesa de Surdos warned yesterday that the development of the majority of the deaf population leaves much to be desired. Speaking to Lusa about the 1st National Congress of the Deaf, to be held over the weekend in Coimbra, José Bettencourt argued that teaching methods have done everything backwards and the results are there for all to see. He drew attention to the fact that, since sign language is pure and rich from a grammatical and linguistic standpoint, yet constantly ignored by society, it is important to stress its role in education, training and full integration into working life. Without rejecting the oralist method, José Bettencourt argues that the educational system for deaf children should encompass sign, written and spoken language. This, he explained, is what is called total communication. The question of stereo television sets is now somewhat topical, since many of the consumers who bought sets with those features before stereo broadcasts began are now confronted with the fact that their televisions do not reproduce in stereo the signal they receive from the broadcasters. Coming straight from the Algarve by plane, he arrived at the Church's premises at the wheel of a Mercedes, wearing a T-shirt. He then put on a dark blue suit, with a red tie and a white shirt with grey stripes. And dark glasses -- because he was very tired -- which he took off for the photographer. You once described yourself as God's errand boy. What was the first errand you were given? Among the films (shorts and medium-length works from 17 countries) on current themes, the presence of Black Harvest by Bob Connoly (Australia), Aspen by Frederick Wiseman (USA), Face Value by Johan Van Der Keuken (Netherlands) and Das Ungehobelte Pack by Nana Swiczinsky (Austria) stands out. The retrospective/competition of Asian documentary cinema also includes a tribute to the Swedish filmmaker Stefan Jarl, which the Cinemateca Portuguesa is joining by presenting two of his films. Four other works by Jarl will be shown at the festival, symbolising the methodical approach of one of the most experimental and most radical Swedish filmmakers, always tracing new paths in documentary, as Manuel Costa e Silva points out. Portugal is also represented, with the world premiere of Crenças e Lendas by João Soares Tavares, a work included in the retrospective devoted to RTP, which is being honoured at this 3rd edition of the Encontros. In the 1960s and 1970s, documentary production in Portugal had a different rhythm and different production roots, owing to the structure of cinema programmes, which allowed a block of short films and newsreels to be screened before the fictional works on the bill. Companies therefore invested in small prestige films, far removed from the efforts to channel advertising that television mobilises today, to be shown in cinemas, and those documentaries became, for that generation, the great initiatory school for anyone who wanted to pursue the adventure of cinema, recalls Manuel Costa e Silva.
Phase II of the multipurpose terminal, the new fishing port with its fish market, the command and control system for the oil and petrochemical terminals, various road and rail access links, and the landscaping and improvement of the Sines waterfront, investments amounting to around 4.7 million contos, were the infrastructure that Azevedo Soares inaugurated. The minister -- who noted that the revenues of the port of Sines represent 12 per cent of the Alentejo's gross domestic product and support some 4,200 jobs -- praised the good cooperation between the local authority, Sines Town Council, and the Port of Sines Administration, APS, saying that in Sines little was said and much was done, a dig at the problems between the Port of Lisbon Administration and the capital's council. Why did this happen? Not because every driver is suddenly seized by Fittipaldi fever when they feel as though surrounded by candy floss. What happens, explains Robert Snowden, who has been studying the visual perception of motion for more than ten years, is that when it is foggy we see the world with less contrast. The greater the contrast detected by our eyes, the faster things seem to move, says the researcher. This is because our eyes easily confuse a change in speed with a change in contrast. The cells located in a region of the brain called visual area 5 are as sensitive to one as to the other, and the boundary between the two situations is a fine one. People with damage to this area of the brain are unable to see the world in motion. For them everything is always standing still, in a very blurred image, Snowden explained. -- in any case, as coordinator, for Antena 1, of the programmes in question, I do not accept your charges of academicism, of editing that leaves much to be desired, of being soporific, nor the reference to libraries as a place to be avoided by anyone working in radio -- a curious reference, to say the least. With the participation of members of the group, in this case Tó Pereira, Tribal will release a CD made up solely of Kaos material (the tracks included should be split between the earliest releases and the most recent ones), mixed into a megamix by Tó Pereira. From the same artist, here under the name DJ Vibe, a maxi-single already announced by Kaos will be released as early as January, to be joined by those of Urban Dreams and Tó Ricciardi. After that, Kaos has planned the debut of the LL Project, by DJ Luís Leite, of a new Porto project called Algo Rítmico, a new maxi by Ozone, and albums from the latter two projects. At the end of February or the beginning of March, a second Kaos compilation will be released, including all the releases not contained on Totally Kaos, plus three releases from outside the label -- a project called Duplex by a Portuguese artist based in Germany, and two more under negotiation -- as well as a remix by USL of Danny Tegnalia's Bottom heavy. The running of the aparthotel, which will have 134 rooms -- all with kitchenettes and spread over eight floors (from the second to the ninth) -- will fall to the Orion chain, owned by the French partners in the Amorim group, who hold the majority stake in Inogi. As for the commercial areas, which will be spread over three floors -- the ground floor, the first floor and a mezzanine -- it has yet to be decided who will run them. "We are in negotiations with several companies, among them FNAC, but no deal has yet been closed with any of them," Almeida Guerra told PÚBLICO.
O Decreto Regulamentar 19/91 (B) descobriu que a factura e a guia de remessa permitem controlar o efectivo carcter, pblico ou particular, do transporte, pelo que acabou com a guia de transporte. Menos papelada sempre bom, ainda que saiba a pouco. Se quiser telefonar -- servio manual -- para o Transkei prepare-se para pagar 51000 por minuto, o mesmo que, por exemplo, para o Afeganisto, Qatar, Gmbia, Brunei ou Honk-Kong. J se quiser falar, ainda por servio manual, para Trindade e Tobago, Tortola, Estados Unidos, Jamaica ou Austrlia, o custo por minuto ser de 34000. Um Hrcules C 130 da Fora Area Portuguesa seguiu ontem para Moambique e poder vir a servir, no sbado ou no domingo, para o transporte de So Tom para Lisboa de mais um grupo de estrangeiros que entretanto sejam retirados do Huambo. Na sua viagem para Maputo, o avio militar portugus transportou 27 militares moambicanos, dois cabo-verdianos e seis so-tomenses, que participaram em aces de formao em Portugal, para alm de algum material destinado aos portugueses do batalho de comunicaes (BT4) que integra a misso das Naes Unidas em Moambique (Onumoz). O gabinete do primeiro-ministro vai apresentar queixa na Alta Autoridade para a Comunicao Social contra o jornal Semanrio, que ontem titulou em primeira pgina que Cavaco deu 2,5 milhes de contos ao PSD. De acordo com a nota oficiosa emitida pelo gabinete de Cavaco Silva, o referido ttulo completamente falso e abusivo porque o primeiro-ministro no d dinheiro aos partidos, e associa o teor da notcia a um procedimento legal luz da lei de financiamento dos partidos. Em termos de unidades vendidas, se se venderam cerca de 145 mil LP de preo mximo, em CD, na mesma escala de preo, venderam-se quase 405 mil -- uma discrepncia que se acentua na facturao (176 mil contos em LP de preo top contra cerca de 817 mil contos em CD tambm de top), devido ao segundo formato ser vendido sensivelmente pelo dobro do primeiro. A cassete do mesmo escalo continua a no ir muito bem. Vendendo menos que o vinil: pouco mais de 103 mil, o que corresponde a uma facturao de cerca de 124 mil contos. Os discos mais vendidos so os que chegaram ao fim do ano passado j com maior nmero de galardes. o caso por excelncia de Waking Up The Neighbours, de Bryan Adams, que agora chegou a sextuplo de platina (cada disco de platina equivale venda de 40 mil unidades). As colectneas de xitos dos Bee Gees, Queen e Tina Turner tambm recolheram mais um galardo de platina. O nico grupo portugus que alcanou este estatuto foram os Onda Choc com Ela S Quer, S Pensa Em Namorar. O lbum de estreia dos Resistncia tambm j disco de platina, mas ainda no consta nas contas do trimestre, porque s o alcanou em Abril. P. -- Face aos dados de que dispe, continua a acreditar em poder alcanar a maioria absoluta? R. -- Continuo. P. -- J percebemos a sua cautela. Define a maioria absoluta como um objectivo, mas, se no a atingir, isso tambm no ser para si uma derrota ... O Comit Central do PCP est reunido para analisar a situao na Unio Sovitica. Em cima da mesa, ainda a situao interna do partido. Mas os dirigentes iro tambm poder ler um documento sado de uma reunio em que o indito aconteceu. Militantes comunistas encontraram-se em pblico e exigiram a antecipao do XIV Congresso. Querem novos dirigentes, nova ideologia, novo programa, novos estatutos. Em suma, um novo partido. Esto dispostos a lutar por isso, mas vo fazer uma pausa at 6 de Outubro. Com uma carteira de ttulos obrigatria para o estudo da obra nemesiana -- Vitorino Nemsio.A Obra e o Homem, ed. 
Arcdia, 1978, Temas Nemesianos, ed. Angra, 1981, e Vitorino Nemsio -- Luz do Verbo, ed. Vega, 1988, alm de ter prefaciado vrios livros para a Imprensa Nacional-Casa da Moeda, que est a publicar as Obras Completas --, Martins Garcia encontrou-s pela primeira vez com Vitorino Nemsio num exame. Foi em 1960, na Faculdade de Letras de Lisboa. Actualmente a leccionar Teoria da Literatura e Literatura Portuguesa na Universidade dos Aores (doutorou-se, em 1985, com a tese Fernando Pessoa: ' Corao Despedaado ' ), tem sido -- a par de David Mouro-Ferreira, Antnio Manuel Machado Pires, Ftima Morna e Maria Margarida Maia Gouveia -- um dos investigadores que mais tem contribudo para que Nemsio no caia no esquecimento. Contactado pelo PBLICO, o mdico confirma ter entregue os medicamentos a um senhor que se calhar facilitou e despejou [ o lixo ] de qualquer maneira. Esta , para David Paiva, uma justificao suficiente para no se considerar directamente responsvel pela situao criada. Mesmo depois de ter sido informado que, ao incio da tarde de ontem, o PBLICO observou trs crianas -- o Antnio, de 8 anos, o Tiago, de 12 anos e o Lus, de 10 anos --, todos residentes no bairro camarrio da Pasteleira a brincar no meio do lixo. Estvamos s a mexer nas latas. No tocamos nos remdios afirmou Lus. Mas o Bruno, de 10 anos, e tambm residente no Bairro da Pasteleira, revelou que o lixo tinha l umas seringas que depois pusemos a arder. Apesar da evidncia do perigo, David Paiva, limita-se a notar que no se tratam de lixos do tipo hospitalar. So coisas secas que j esto fora da validade. Mas para o responsvel pelos Servios de Fiscalizao da Cmara do Porto, Jos Adriano, os produtos encontrados representam sempre perigo, mesmo que no estejam no prazo de validade. So coisas que j esto de tal maneira fora de validade que j no tm aco farmacolgica. Ou seja, podem no cumprir o fim para o qual so indicadas mas no deixam, por isso, de constituir perigo se forem ingeridos. Penso que no porque o tempo tambm desactiva os medicamentos, remata, sem certezas, o mdico. Lus Afonso -- Quando comecei a fazer o Bartoon, em 1993, tinha essa angstia. Mais do que isso, vivia aterrorizado com o facto de ter de arranjar uma ideia todos os dias. Hoje vivo em paz com esta relao. P. -- Criaste condies para isso, j que vives em Serpa. Rodeaste-te de algum material especial, como foi? O cineasta americano Steven Spielberg apresentou na quinta-feira, em estreia mundial, num ptio de Liceu Sophie Charlotte, em Berlim, na presena de uma plateia de jovens alunos impressionados e atentos, um documentrio bastante emotivo sobre os sobreviventes do holocausto, um CD-Rom, intitulado O Futuro da Educao. uma maravilha estar aqui convosco disse o autor da Lista de Schindler, que foi agraciado com a Grande Cruz de Mrito, a mais alta distino alem. Porque o debate foi um pantanal de lugares-comuns e de manobras de diverso, no qual os oradores, em vez de encararem objectivamente a realidade dos factos, procuraram, antes de mais, a minimizao do papel dos outros. At o moderador do debate contribuiu para o arraial, dando ares de perdio que sugeriam a imagem dos negociadores comunitrios face impossibilidade de um cessar-fogo na Bsnia. E a quota do tomate, perguntou, a dada altura do caos em que se transformou a discusso sobre uma reforma que apenas contempla os sectores das culturas arvenses, do leite, da carne bovina e do tabaco. mostrou que, apesar das boas palavras, a capitalizao de protagonismo poltico est acima da discusso sria e profcua. 
e o homem da CNA, repetiu a confrangedora ideia que o documento inicial da reforma protegia as exploraes extensivas e que o documento final as penalizava, quando, afinal, poucas linhas se alteraram nesta questo. A conquista de mercados parece ser, no entanto, a preocupao imediata da Soporcel para os prximos tempos. At data, a rede de distribuio da Wiggins Teape demonstrou que a nossa opo estratgica tem sido a mais correcta, considerou lvaro Barreto, realando a necessidade de um parceiro externo que opere nos mercados mundiais do papel. Alm disso, a Soporcel est em vias de adquirir uma importante posio numa distribuidora espanhola, divulgou o presidente do Conselho de Administrao da Soporcel. Os mercados da Europa do Sul so para j os alvos estratgicos para a distribuio do papel produzido pela empresa, principalmente os dos pases Ibricos que so os que mostram mais elevada taxa de crescimento, cerca de 4,5 por cento ao ano. Os produtos fabricados so, fundamentalmente, o papel de cpia, o papel de impresso offset e os papis de computador, que representam os mercados mais promissores. Lusa Senos, chefe de Diviso de Sismologia do Instituto de Meteorologia (IM), relatou os ltimos progressos da rede nacional de sismgrafos. Gerida pelo IM, a rede funcionava desde os anos 70 com nove estaes analgicas em Portugal continental, 11 nos Aores, uma na Madeira e outra em Macau. Em cerca de 600 sismos no continente, s em dez a 12 por cento se conseguia calcular os parmetros ssmicos. Razo que levou o IM a decidir a instalao de duas redes digitais. Uma das redes digitais, em fase de instalao entre 1994 e 1997, custar 150 mil contos, ter 14 estaes (12 no continente e duas na Madeira). Neste momento, conta j com seis estaes no continente. Os registos falam por si, pois as estaes analgicas no detectavam sismos de magnitude inferior a trs na escala de Richter. Em 1995, duplicou o nmero de sismos detectados. Com a rede digital detectaram-se sismos de magnitude inferior a dois. A segunda rede digital, que ser instalada nos Aores entre 1997 e 99, ter 12 estaes e custar cerca de 200 mil contos. O Trifsica um bar da 24 de Julho, isto , longe das minuscularias do Bairro Alto. muito engraado. As portas e janelas so no nmero das letras do bar, ostentando, uma a uma, cada letra da palavra T-R-I-F--S-I-C-A, em torno do gaveto das Escadinhas da Praia com a 24 de Julho, num vidro fosco, em que s transparece o corpo da letra. Nove corpinhos bem feitos. A decorao tambm divertida. Detrs do balco, as bebidas protegidas pela barreira visvel de um vidro esto ligadas a mangueiras, que entornam os preciosos lquidos computadorizadamente para os nossos copos. Isto equivale a no poder servir melhor o whisky de um amigo, a no ser que lhe mangueire dois para dentro do copo. E a tem de se haver com o computador, que lhe cobra os dois! Enfim, este convvio com computadores custa-nos um bocado. que so uns antipticos que nunca oferecem copos. A no ser que os homens, seus donos, lhes ordenem! Boa! Mas, para alm de no nos oferecer copos, a no ser que a obriguem, esta mquina muito bonita. to gira que a empresa que a comercializa se chama Coisas Giras. R. -- Vamos l a ver. nos primeiros tempos, a minha viso era a de um homem que vivia numa boa casa, com ar condicionado, boa comida, comodidades. Nesses tempos at olhvamos com alguma sobranceria para as populaes, eu ainda no me tinha apercebido de que os negros talvez no fossem afinal uns selvagens, tinham era uma cultura diferente ... 
Depois, foi na guerra, quando tive de penetrar no mato e andar pelas bolanhas da Guin ou pelas savanas de Angola, quando descobri povoaes isoladas, etc., que passei de facto a conhecer frica. P. -- Onde estava quando comearam os massacres em Angola? A anomalia criou dificuldades, nomeadamente, em fbricas e restaurantes locais. Tambm, ontem, em entrevista RDP-Madeira, Alberto Joo Jardim, presidente do governo regional madeirense, surgiu em defesa de Jorge Sampaio, numa autntica trgua, face aproximao da presidncia aberta de Sampaio no arquiplago. Mostrando dvidas quanto interpretao da lei que revogou a obrigatoriedade do PR ouvir o CEMGFA na reconduo de um chefe de ramo, Sampaio, disse o lder do PSD-Madeira, ter agido bem, j que, sublinhou, uma carta daquelas no se escreve ao Presidente da Repblica. At porque Fuzeta da Ponte, nas palavras de Alberto Joo, no era grande espingarda e antiptico, devendo por isso ter sido demitido logo. Depois, Jardim aproximou-se da tese defendida pelo PCP, da exigncia da demisso de Veiga Simo, ao defender que o actual ministro da Defesa, com Marcelo Caetano j dera cabo da Educao. TEATRO NACIONAL DE D. MARIA II. Hoje, s 21h30; amanh, s 16h00 e s 21h30; dom., s 16h00; 2, s 21h30. TEATRO NACIONAL. Dias 5 e 7, s 18h00. A Gr-Bretanha estar preparada para impor o seu domnio directo sobre Gibraltar, para controlar a banca e o sistema legal respectivos, numa tentativa para acabar com as alegaes de que a colnia se transformou num centro de lavagem de dinheiro, informou ontem o jornal Sunday Telegraph. Funcionrios do Governo britnico confirmaram que a Gr-Bretanha foi aos arames com as posies das autoridades gibraltinas sobre a concretizao de directivas da Unio Europeia, bem como sobre o tema da lavagem de dinheiro. O jornal acrescenta que os gibraltinos receiam que a medida seja um primeiro passo para os britnicos desistirem da soberania sobre o rochedo, situado num promontrio no sul de Espanha, devolvendo-o aos espanhis. O jornal garante, embora um porta-voz oficial do Ministrio dos Negcios Estrangeiros no o tenha confirmado, que o Governo britnico est pronto para, antes do Vero, tomar medidas para que Gibraltar deixe o seu estatuto de colnia com governo prprio, passando a depender directamente da Coroa britnica. Um mido de trs anos cujas pernas foram cortadas por uma ceifeira-debulhadora foi operado para a reimplantao dos dois membros e recupera no Hospital Rei Eduardo VIII em Durban, frica do Sul, informou ontem fonte hospitalar. Amos Mosea brincava no meio de um milheiral, sexta-feira, numa quinta perto de Underberg, a 200 quilmetros da cidade porturia de Durban, no Oceano ndico, quando foi atropelado pela ceifeira-debulhadora e lhe cortou as pernas abaixo dos joelhos. Os gritos da criana alertaram um vizinho que a conduziu a uma clnica local, em estado de coma e sangrando abundantemente. Mas o mido saiu de coma para dizer: Estou doente. Conduzido de helicptero para Durban com os cotos das pernas mergulhados em gelo, foi imediatamente operado durante oito horas. Pouco a pouco, foi-se esquecendo em grande parte o objectivo primordial que para ali levara norte-americanos, paquistaneses, italianos e outros soldados de meio mundo: distribuir comida a muitos africanos que estavam perigosamente beira da morte. E a situao passou a ser, em primeiro lugar, a de um confronto cada vez mais agudo entre a fora expedicionria estrangeira e as milcias locais, muito em particular a de Aidid, que se tem feito passar por um nacionalista e um paladino da cultura islmica. 
O ataque areo de ontem, em que participaram nove helicpteros, e a operao terrestre que se lhe seguiu fizeram com que somalis furiosos tivessem morto um fotgrafo da agncia Reuter, o anglo-americano Dan Eldon, de 22 anos, e outro da Associated Press, o alemo Hansi Krauss, de 30, estando ainda por confirmar a morte de mais um fotgrafo, Hos Maina, e de um operador de som da Reuters Television, Anthony Macharia, ambos quenianos. Para medir a largura da regio emissora da radiao, os astrnomos utilizaram, como se se tratasse de uma autntica lente de aumentar, a bolha de gs em expanso que envolve o pulsar. O gs tambm constitui um dos restos da supernova e encontra-se em expanso sob o efeito da onda de choque gerada pela exploso. Visto que o pulsar era demasiado pequeno para a sua imagem ser medida desde a Terra, mesmo pelos telescpios mais potentes, a ideia consistia em captar imagens do gs para, a partir da, extrair a imagem do pulsar. A bolha de gs fornece uma resoluo equivalente de uma ' lente ' do tamanho da rbita da Terra, explica um comunicado ontem emitido pela Associao Astronmica Americana. Para o presidente da CML, o objectivo da UCCLA no ter dinheiro a prazo nos bancos, mas servir as populaes das cidades que so membros. E disse tambm que a transformao da UCCLA em fundao, proposta por Pinto Machado, uma novidade absoluta para todos os membros da Assembleia Geral. O autarca classificou as suas relaes pessoais com Pinto Machado como sendo as melhores e considerou no haver qualquer ruptura no plano institucional, porque o secretrio-geral acaba de apresentar a sua demisso. Negou ainda que as suas divergncias com Pinto Machado resultem de um conflito poltico-partidrio no municpio lisboeta (onde o PP est na oposio) que tenha sido transposto para a UCCLA, o que foi confirmado pelo secretrio-geral demissionrio. Os desafios da globalizao e as respostas para o desenvolvimento de frica na viragem do sculo so o tema de um frum de dois dias, a comear amanh, em Lisboa. Organizado pela Sociedade de Geografia de Lisboa, em colaborao com o ISCSP-Instituto Superior de Cincias Sociais e Polticas, o encontro conta com a participao de representantes das empresas portuguesas com investimentos em frica e dos respectivos beneficirios. A Cmara de Palmela vai submeter apreciao da Assembleia Municipal uma proposta que reduz em 0,1 por cento a contribuio autrquica, fazendo com que, em 1995, esta taxa, em vez de representar 1,3 por cento sobre o valor dos prdios urbanos, passe a ser de 1,2 por cento. Outra declarao, desta vez vinda do Governo indiano, voltou a deixar preocupadas as cinco potncias nucleares (EUA, Frana, Rssia, China e Gr-Bretanha). Um responsvel indiano pelo projecto de fabrico de msseis, Abdul Kalam, disse que o Agni (Fogo) -- um engenho com capacidade para transportar uma carga nuclear a uma distncia de 1500 metros -- est pronto a ser produzido em srie. E outro, com maior alcance, vir a caminho. Segundo o assessor cientfico do primeiro-ministro indiano, Atal Behari Valpayee, trata-se de um Agni melhorado, com um alcance de 2500 metros, num estado avanado de desenvolvimento e cuja produo j recebeu a aprovao do Governo. As sanes no nos afectaro neste domnio, disse ainda Kalam, referindo-se s penalizaes econmicas impostas contra a ndia pelos EUA, Japo e Canad. O nosso programa nuclear 100 por cento indiano. Um incndio de grandes propores deflagrou ontem, cerca das 20h00, na escarpa da Serra do Pilar, em Vila Nova de Gaia, numa fbrica de estatuetas localizada na margem esquerda do rio Douro. 
Consciente da debilidade partidria no relacionamento com a sociedade, Gonzlez apostou forte em independentes de prestgio. O efeito Garzn, a incluso na lista de Madrid do magistrado que mais reconciliou os espanhis com a Justia, no teve, no entanto, o efeito duradouro que o lder socialista pretendia. To-pouco o tiveram as promessas de um novo impulso democrtico, necessariamente regenerador, uma das chaves da sua campanha, porque no foram completadas com um lavar da roupa suja a nvel partidrio. O crculo vicioso, entre o enunciado da mudana e a prudncia aconselhada pelo momento eleitoral, deixou pouco sabor na boca. O mau momento seria agravado pelos resultados do primeiro debate televisivo, com um Felipe Gonzlez acabrunhado pela avalanche das crticas dos conservadores. O programa televisivo Falar Claro viveu, na segunda-feira, um dos seus mais acalorados momentos. Isto porque o jornalista Joaquim Furtado, sem peias nem concesses como seu timbre, decidiu esta semana tomar o pulso ao estado do futebol portugus. Espetou duas lminas na cara da boneca. Deitou-lhe para cima sangue ou tinta vermelha, no se percebia bem. Comeou a bater-lhe com um martelo. Cortou-lhe a cabea com uma navalha. Depenou-a. Deve ter passado meia -hora. Vera apagou a luz. Vestiu o casaco. Bebeu gua. Pegou no papel e na caneta e foi-se embora. Mais tarde disse que teve a sensao que as pessoas estavam espera do climax que no aconteceu. De facto, estavam. Vera quis mostrar a usurpao da carne e chamou sua performance Foda de Morte porque num ensaio de Angela Carter sobre Sade, a foda levava morte das mulheres. a death fuck. Os automobilistas, esses, continuam espera ... Trs centenas e meia de pessoas a ver, ouvir e aplaudir o espectculo montado pela Associao Timorense Lafaek. Ontem noite, num clube desportivo de Darwin, muitos australianos sentaram-se ao lado dos estudantes e levantaram-se para gritar Viva a misso de paz!, Viva Timor-Leste, Viva Xanana, quando est prxima a chegada do Lusitnia. Crianas e adultos da Associao danaram, vestidas com os trajes e panos coloridos que h sculos os timorenses fabricam. Leu-se o poeta Borja da Costa e o cntico que Xanana Gusmo escreveu na montanha para a Mulher Timor. Rui Marques, coordenador da misso que quer ir a Dli, foi chamado para ler a carta enviada clandestinamente pelos estudantes timorenses presos na Indonsia. Obrigado pela vossa visita a Timor-Leste. Depois, num cmulo de msica e emoo, cantou-se Peace Mission, o hino composto e ensaiado para o primeiro dos dois dias de festa. Venham e juntem-se misso de paz. (...) Duzentos mil j morreram. Acreditam? Acreditam? Num local da enorme sala, um jovem timorense vestido de guerreiro soluou e chorou abraado s pessoas. O movimento ecologista blgaro Ecoglasnost pediu aos pases da CEE para fornecer gratuitamente energia Bulgria com o objectivo de possibilitar o encerramento da central nuclear de Kozlodoui, situada no Danbio e considerada perigosa pelos peritos blgaros e internacionais. Os quatro reactores ainda em funcionamento da central, de 440 megawatts cada um, foram construdos entre 1974 e 1975 e inspeccionados recentemente por uma misso da Agncia Internacional da Energia Atmica. Dois deles esto num estado extremamente envelhecido. Como que explicamos todas estas medidas israelitas? Chamamos a isto terrorismo de Estado organizado, acrescentou Arafat. 
Enquanto a violncia de palavras entre Israel e a OLP aquece, a violncia continua a marcar a cena no Lbano, onde caas israelitas lanaram ontem mais dois raides contra foras hostis ao processo de paz. No bombardeamento da base da Frente Popular de Libertao da Palestina- Comando Geral, na fronteira lbano-sria, ficou ferido um guerrilheiro. No ataque anterior contra uma base do Hezbollah, no vale de Bekaa, Lbano, morreu um guerrilheiro fundamentalista e outros trs ficaram feridos. Ramos Horta no veio a Portugal s para receber os parabns pelo Prmio Nobel da Paz com que foi distinguido juntamente com o bispo Belo. 13 anos depois da ltima votao sobre Timor-Leste na ONU, chegou o momento de voltar a levar o tema ao julgamento da Assembleia Geral daquela organizao. A defesa desta estratgia de guerra total no plenrio das Naes Unidas no assenta exclusivamente no impulso que a questo timorense ganhou com a atribuio do Nobel a Horta e a Belo -- Ramos Horta j tinha sugerido esse passo num memorando que fez chegar ao Palcio das Necessidades meses antes de se conhecer a escolha de Oslo --, mas o certo que o Nobel da Paz timorense constitui um reforo importante. para o aproveitar que Ramos Horta vem agora pessoalmente insistir na sua proposta. Os que querem passar uma esponja sobre o passado impedem a reconciliao. Ele sada a esperada vinda do Presidente israelita, Ezer Weizman, s cerimnias do 8 de Maio, como um gesto de reconciliao por parte da Alemanha. Mas, acrescenta, o ritmo e a forma desta reconciliao no devem ser ditados do ponto de vista dos culpados, mas pelo das vtimas. Em vez da esperada barracada musical, em Alvalade houve outro tipo de cenas bem menos divertidas. Por causa do perigo de morte representado pela pala que ameaa que cai mas no cai, alguns milhares de pessoas pagaram para assistir ao concerto num local e acabaram noutro, por falta de espao. Est tudo cheio, diziam elementos da organizao, tente na bancada do lado. O mais conhecido dos ardinas de Lisboa, Carlos Francisco dos Santos, foi ontem a enterrar no Cemitrio do Alto de So Joo. Carlos dos jornais, como todos lhe chamavam, tinha 77 anos e faleceu no domingo, num hospital da capital, em consequncia de uma prolongada doena pulmonar. Com 57 anos de profisso, Carlos dos Santos, tornou-se popular pelos preges que usava na venda de jornais e de lotaria, na zona da Baixa e Bairro Alto, e pela maneira afvel com que se relacionava com toda a gente. Fernando foi a figura do jogo de ontem entre o Rio Ave e o Sporting ao marcar trs golos ao clube lisboeta. Peixe acabou por ser o heri sportinguista, ao defender, no ltimo minuto e sobre a linha de golo, um remate que daria o empate ao clube de Vila do Conde. No ano passado, os organizadores pensaram em acabar com a corrida, devido insuficincia de apoios. os Sinos uma das provas de estrada com maiores pergaminhos no nosso pas. Falta agora que os corredores contribuam, com a sua presena. No Roubars. O roubo neste episdio no de bens materiais. Tudo circula volta de uma criana, abanadonada pela me natural e criada por outra mulher que reconhece como me. A primeira vem roub-la. A criana ir depois escolher entre elas. A Comunidade Econmica, por outro lado, dever preparar as condies para a transio gradual para a livre circulao de mercadorias e servios em todos os Estados que a ela aderirem. Mais tarde, quando estiverem criadas condies econmicas, proceder-se- criao de um mercado comum de mo-de-obra. 
No captulo das bases econmicas, os membros devero acordar aces nos domnios da poltica monetria, financeira, alfandegria e de regulamentao de impostos. Tambm se prev a criao de uma moeda nica, embora, quem o deseje, possa ter a sua prpria divisa. Ao sair da penumbra, arrastando uma coisa parecida com nvoa cinzenta e oleosa, suspensa a dois palmos da cabea, Joo Carlos andou trs ou quatro passos e parou com um suspiro, como se as suas pernas utilizassem um sistema de suspenso a ar. Via-se que, por vrias causas, Joo Carlos se cansa com facilidade, pois tambm baixou os ombros e inclinou o queixo para o peito, sem nunca olhar para os lados das grandes janelas do tribunal. E depois a juza chamou-o, vendo um homem com uma gabardina enorme, ainda maior que ele, um homem com cabelos grossos e brilhantes como crina de cavalo que tem andado chuva, sem acompanhamento veterinrio e com pouca forragem. Os olhos dele saam para fora da cara e quase chegavam ao nvel da cana do nariz, no caso de se apanhar de perfil a figura perdida e magrssima de Joo Carlos, que tanto podia ter vinte e poucos anos (estragados), como muitos mais, e de facto tinha. Com 37 anos, Joo Carlos acabara de ser preso mais uma vez por furto num supermercado. passou as caixas registadores trazendo escondidas nos bolsos 12 embalagens de 10 lminas de barbear. Mas poderia vende-las. alis certo que as venderia num reles mercado negro, com tanto orgulho como, h umas dcadas, uns senhores vendiam numa ruela da Baixa de Lisboa os esticadores de colarinho, e parece que disso viviam (disso e de preservativos clandestinos). Madrid tambm fechou em alta de 0,55 por cento (mais 2,05 por cento), com o seu ndice geral a atingir 376,82 pontos. A alta madrilena foi causada pela abertura em alta de Wall Street e pela subida dos mercados obrigacionistas, que compensaram os sustos com o anncio da demisso do general russo Alexander Lebed. O volume de negcios da sesso de ontem 31,2 milhes de contos. Em Frankfurt, registou-se uma subida de 0,05 por cento, com o ndice DAX a fechar a 2.176,26 pontos, devido baixa que atingiu Wall Street na quarta-feira. Nos mercados orientais, Tquio foi a excepo e, ao meio da manh, a bolsa tendia para uma alta marginal, com o ndice Nikkei a marcar 12,07 pontos no fim da sesso da manh. As baixas dos demais mercados marcaram uma tendncia. Em Hong-Kong, no entanto, houve uma quebra e o ndice Hang-Seng atingiu 61 pontos negativos, com uma subida posterior para 46 pontos. Singapura tambm fechou com uma ligeira baixa de 2,54 pontos. Vance destacou a aceitao pelos beligerantes do envio de uma fora de manuteno da paz e sustentou que o novo cessar-fogo indica, ao contrrio dos anteriores, os procedimentos necessrios para um acordo especfico. O ponto mais crtico reside agora nas possibilidades em manter o cessar-fogo. Agora vou para Nova Iorque, onde relatarei a Butros-Ghali [ secretrio-geral da ONU ] o que aconteceu nos ltimos dias, afirmou antes de entrar apressadamente no Mercedes preto, j com o motor a trabalhar. Antes, tinha-se recusado a comentar quais as suas prximas iniciativas caso o acordo no resulte. Vincent Askew, com 20 pontos, liderou os Seattle SuperSonics na sua vitria sobre os Los Angeles Clippers no nico jogo da Liga Norte-Americana de Basquetebol profissional (NBA) realizado no domingo. Terry Dehere, dos Clippers, foi o melhor marcador do encontro, com 24 pontos. Os SuperSonics alcanaram o seu terceiro triunfo na poca e esto em quarto lugar na Diviso Pacfico. 
Quanto aos Clippers, ainda no conheceram o sabor da vitria e seguem com cinco derrotas. A China quer organizar em 1997 um Grande Prmio de Frmula 1 no circuito de Zhuhai, que tem 4,83km de permetro e est situado perto de Hong Kong. Nas obras previstas para o complexo, que incluir um campo de golfe e seria terminado em 1996, as autoridades pretendem gastar cerca de 30 milhes de contos. Se a Federao Internacional do Automvel, que j aprovou os planos de alterao ao circuito, aceder aos sonhos dos chineses, estes garantem a presena de 200 mil espectadores. Folha marcou em Chaves o seu segundo golo do campeonato, que foi considerado pelo PBLICO como o melhor desta jornada. Mais um golo conseguido em jogada individual. O portista pegou na bola na esquerda, progrediu pelo centro do terreno, entrou na rea e, com dois toques preciosos, sentou os dois centrais flavienses, Manuel Correia e Amarildo. Depois, perante a sada de Orlando rematou de p esquerdo j a curta distncia da baliza. Depois desta jornada, que rendeu 18 golos, Juskowiak e Marcelo continuam a liderar a lista dos melhores marcadores, ambos com oito remates certeiros. Os dois jogadores voltaram a marcar no fim-de-semana, curiosamente ambos em bonitos remates de cabea. Diatriba de amor contra un hombre sentado assinala amanh a estreia na escrita teatral do colombiano Gabriel Garcia Marquez. Romancista, contista, cronista e guionista, faltava a Gabo experimentar, em livro, o teatro, um meio para o qual outros, por ele, verteram muito dos seus contos. A tiragem inicial da obra, um monlogo, ser de 50 mil exemplares. Segundo o jornal de Bogot El Tiempo, este texto de Gabo estar nos palcos em breve, numa encenao a cargo de Ricardo Garcia. A obra, redigida em 1987, esteve sete anos na gaveta antes de ser editada. Sete bias de sinalizao da barra do Douro vo ser hoje recolocadas, marcando o percurso da Cantareira at Vila Nova de Gaia. Cinco desses aparelhos esto equipados com um sistema de iluminao, para que a entrada nocturna de embarcaes se possa fazer com toda a segurana. As bias tinham sido arrastadas durante as cheias de Dezembro e Janeiro passados e deviam ter sido repostas logo no incio do ms de Maro. Uma fonte da Capitania do Porto do Douro disse ao PBLICO que o sistema vai entrar imediatamente em funcionamento, colmatando assim o processo de recolocao das bias destrudas pelas cheias, que ocorre desde Fevereiro. Fica assim totalmente sinalizado o percurso entre a barra do Douro e a Ponte de D. Lus I, para o qual foi necessria a substituio das bias originais por outras de maior dimenso, enviadas pela Direco de Faris. As novas unidades ficaro seguras por blocos de beto. Iliescu tinha anunciado que se deslocaria amanh ao vale de Jiu, mas ontem os servios presidenciais disseram que a viagem fora cancelada. Os trs atletas portugueses em aco no quarto dia de provas no foram felizes. A sportinguista Teresa Machado foi a primeira a entrar em aco, lanando no segundo lugar do grupo A qualificativo do disco. a portuguesa, muito nervosa, falhou por completo o primeiro lanamento. No segundo ficou a mais de dez metros do seu recorde nacional (63,70m) com 53,60m, e s no derradeiro intento (a qualificao s tem trs) se aproximou um pouco mais do que vale, com 56,02m, mas no chegou. Foram precisos 61,22m para ir final. Quem brilhou de manh foi Lucrcia Jardim (Benfica), na eliminatria de 200m. Correu a srie 4 e ganhou russa Natalya Voronova (23,45s) e americana Gwen Torrence, a campe olmpica (23,46s), com 23m45s. De tarde, porm, tudo acabaria em frustrao. 
No terceiro quarto-de-final precisava de uma posio nas quatro primeiras para ir s meias-finais e perdeu esse precioso quarto posto para a jamaicana Dhalia Duhaney mesmo nos metros finais, com 23,11s contra 23,16s. Ganhou a francesa Marie-Jos Prec com 22,73s. Resta a pequenssima consolao de ter feito o melhor tempo nacional de 1993. De que tem medo o IPACA? O Instituto Portugus das Artes Cinematogrfica e do Audiovisual (IPACA, ex-IPC) tambm no respondeu s perguntas formuladas pelo PBLICO. Responsveis seus comearam por pedir que as perguntas fossem enviadas por fax, para serem respondidas por escrito -- para no haver mal entendidos. O fax foi enviado, tal como pedido, presidente do Instituto, Zita Seabra. Quarenta por cento da gua potvel usada em sua casa vai pela sanita abaixo. O Presidente da Repblica, Mrio Soares, enviou no incio da semana um telegrama Coreia do Norte, atravs do embaixador de Portugal em Pequim, apresentando condolncias pela morte de Kim Il Sung, soube o PBLICO junto de fontes coreanas em Lisboa. Portugal tem relaes diplomticas com a Coreia do Norte desde 1975 e representado em Pyongyang pelo chefe da sua misso diplomtica na capital chinesa. O telegrama dirigido a Kim Jong Il, filho do falecido Presidente da Coreia do Norte e seu presumvel herdeiro. O primeiro-ministro, Cavaco Silva, fez apresentar na embaixada norte-coreana em Lisboa, atravs do seu adjunto diplomtico, sentidos psames pela morte de Kim Il Sung, falecido h uma semana, com 82 anos. Entretanto, ontem, a Coreia do Sul, encorajada pelos sinais tornados at agora pblicos sobre o que se vai passando na semana de luto no vizinho Norte, indicando uma aparente passagem calma de poderes de pai para filho, decidiu levantar parte do alerta especial em que tinha colocado as suas tropas, na sequncia do falecimento do Grande Lder. P -- Mas as Assembleias Municipais s tm parecer vinculativo numa segunda fase ... R -- Sem dvida. Mas no faria muito sentido, se as Assembleias Municipais se pronunciarem neste ou naquele sentido j nesta fase, vir a ser aprovada uma lei de criao das regies contrariando uma vontade que se sabe que vai ser manifestada ulteriormente. Mas o sistema de inscrio foi alterado h cerca de um ano. Deixou de ser foroso as pessoas deslocarem-se sede para se inscrever em turnos de 15 dias num dos centros de frias. Agora podem tratar directamente, em contacto telefnico ou por fax, com o centro onde pretendem passar frias. O sistema antigo s se mantm para os Centros de Entre-os-Rios, So Pedro do Sul e Foz do Arelho. Ns, dentro de Espanha, sempre manifestmos que deveria haver interesse por Portugal, que sempre teve um certo receio, porque a histria foi o que foi, comenta ao PBLICO Jordi Pujol, se temos uma identidade prpria e no temos medo, muito menos os portugueses, que tm um Estado prprio. desta forma directa que o presidente da Generalitat, sem dvida um dos dirigentes polticos mais influentes de Espanha, equaciona as relaes entre os dois pases peninsulares. Pujol recebe oficialmente pela primeira vez o Presidente da Repblica de Portugal -- uma anterior visita de Mrio Soares, no Outono de 1987, foi privada --, mas no esquece o ocorrido h quase nove anos. A visita de Soares foi privada mas muito positiva para ns, assegura, relembrando tempos de busca de protagonismo que, nos ltimos trs anos, com socialistas e conservadores no poder em Madrid, deram lugar condio de parceiro indispensvel. 
Ainda quanto a alteraes na Liga de Clubes, a direco ser reduzida de quatro para trs membros, passando apenas a haver um presidente (eleito em AG), um director-executivo e outro elemento eleito pela direco da Liga, onde tm assento onze clubes. Este ltimo ser o nico que no coincide com o OA. De resto, decidiu-se ainda aumentar os elementos da Comisso Arbitral da Liga, que sero nove em vez de sete. Todas estas decises, disse Damsio, foram tomadas por unanimidade. A FPF confirmou ontem a repetio do Benfica-Sporting para amanh, no estdio do Restelo, negando assim provimento contestao do Sporting. A hora do jogo, referente 30 jornada do campeonato nacional e repetido por alegado erro do rbitro na expulso do benfiquista Caniggia, ficou dependente do clube de Alvalade. A dvida entre as 18h30 ou as 20h30, esta ltima proposta pelo Benfica, para permitir a transmisso televisiva. Confundir identidade e fisionomia: um sonho tenaz. Uma vez em interaco com os outros, a gesto das nossas expresses faciais representa um capital simblico que implica um domnio e um controle socialmente construdos. Os presentes autores, seguindo de perto as coordenadas tericas e os parmetros metodolgicos e cronolgicos de Foucault, propem-nos uma histria do modo como a emergncia da racionalidade moderna instituiu uma tenso constante entre o que orgnico, e como tal objecto de um saber exacto, e o que expressivo, logo passvel de uma hermenutica que valida processos contnuos de reclassificao social. Trata-se de uma separao politicamente profcua, j que, ao longo de os scs. XVI a XIX, o homem ter interiorizado a necessidade de vigiar a maior ou menor conformidade entre rosto, preceitos ticos e poder social, ao mesmo tempo que lhe pedem que seja singular e autntico. Uma verdade subjectiva incorporada atravs de normas sociais e, inversa e complementarmente, prticas sociais que avaliam do grau de integrao de cada um. O cuidado de si como inscrio do poder. No termo de um Vero em que a febre dos divrcios e escndalos delapidou o patrimnio de prestgio da coroa britnica, foi a vez de a libra cair em descrdito. Tal como a rainha Isabel II, John Major limitou-se a acompanhar os factos. Mas se o silncio pode favorecer a recuperao da imagem da coroa, a indeciso mortal, tanto em poltica como em economia. O primeiro-ministro britnico no pode querer assinar Maastricht e manter a libra fora do SME e da sua correspondente disciplina. Como no pode continuar a afirmar a prioridade da luta contra a inflao deixando a libra desvalorizar-se. Menos pode ainda permitir que o seu ministro das Finanas escreva aos colegas de Gabinete notas prevendo que a libra ir continuar fora do SME por meses, talvez anos. O presumvel autor de 15 assaltos ocorridos recentemente em residncias de Setbal e do Pinhal Novo, atravs da extrao do canho das fechaduras, foi detido pela PJ daquela cidade e viu a priso confirmada pelo tribunal local. Trata-se de um jovem de 20 anos, que ter confessado os crimes e a venda dos objectos roubados para a compra de herona. O detido saiu pouco antes do incio da srie de assaltos de uma instituio de recuperao de toxicodependentes, aparentemente reabilitado, e comeara a frequentar um curso de formao profissional financiado pela Unio Europeia. O facto de os assaltos ocorrerem, por norma, hora do almoo ou em perodos a que o jovem faltava s aulas ps a PJ na sua pista. No foi divulgado o valor dos objectos em ouro e electrodomsticos roubados. 
Os dois autores de um assalto a uma bomba de gasolina da Cepsa em Vila Franca de Xira, ocorrido na madrugada de ontem, foram detidos cerca de uma hora depois na Praa de Espanha, s 4h25, em Lisboa. Os mesmos indivduos so acusados de outros trs assaltos ocorridos em Coimbra entre o dia 14 de Agosto e a tarde da ltima quinta-feira. A PSP sabia que os assaltantes se deslocavam numa carrinha Renault Express, facto que a levou a mandar parar o veculo onde seguiam. Levados de volta a Vila Franca, os assaltantes, de 28 e 31 anos, foram identificados. Nos quatro assaltos, preferencialmente feitos a postos de gasolina, foram roubados mais de 300 mil escudos. A viatura em que seguiam era igualmente roubada. Cerca de metade dos resultados apurados, concretamente sete milhes de contos, vai ser distribuda ao accionista (Estado). Quanto ao cash flow da EDP, atingiu os 128,3 milhes de contos em 1991. Estes resultados foram positivamente influenciados pelo acrscimo da procura de electricidade, pela estabilidade dos preos dos combustveis importados para produo de electricidade e pela conteno dos restantes custos de explorao, refere a mesma nota. Como dividir os ministrios? Os dirigentes turcos Mesut Yilmaz (ANAP, direita) e Necmettin Erbakan (Refah, islamista) iniciaram ontem o processo de formao de um Governo, decidindo criar uma comisso encarregue de repartir os ministrios entre os dois partidos. Esta questo esteve na origem das divergncias que surgiram durante as negociaes do fim-de-semana e que quase puseram em causa os esforos para um acordo. A comisso mista dever iniciar amanh os seus trabalhos que, espera-se, estejam terminados no fim da semana. Chegmos a acordo sobre a maior parte das questes, s restam algumas divergncias menores, explicaram Yilmaz e Erbakan na conferncia de imprensa conjunta que deram ontem. Segundo a agncia Anatlia, o Refah aceitou que Yilmaz seja primeiro-ministro at ao incio do prximo ano. Em seguida, se for aplicado como previsto o princpio da rotatividade, a Turquia dever passar a ter o seu primeiro chefe de Governo islamista. O nico empate de ontem faconteceu no jogo que ops o Friburgo ao Bayern Leverkusen (1-1). Anteontem jogaram Werder Bremen-Nuremberg (2-3) e Hansa Rostock-Wolfsburg (3-3). Os marginalizados do sistema de ensino: que soluo? Os desfavorecidos: que a soluo para os marginalizados do sistema de ensino? O Governo admite criar uma linha de crdito bonificado para ajudar a atenuar os prejuzos do temporal que nos ltimos dias afectou as regies do interior Norte e Centro. Quem o diz o secretrio de Estado da Administrao Interna, Armando Vara, que passou o dia de ontem a inteirar-se dos efeitos da intemprie em Bragana, o distrito mais atingido, mas onde j no h aldeias isoladas. Um Vestido para cinco mulheres a pea encenada por Diogo Infante que transformou Margarida Pinto Correia numa loura. Um texto em torno do sagrado matrimnio no qual se discute as possibilidades de encontrar um homem que no seja casado, nem maricas e tenha emprego. Um teatro assumidamente comercial. O Hospital de Santo Antnio, no Porto, dever pedir ainda esta semana Lipor para ali depositar as vrias dezenas de toenaladas de cinzas resultantes da queima de lixos hospitalares que, h um ano e meio, se acumulam em contentores de plstico. o resultado de um ofcio enviado na semana finda pelo Ministrio do Ambiente, o qual atesta a inocuidade daquelas cinzas e aconselha a sua deposio do aterro de Ermesinde. 
As escrias do Santo Antnio vo, assim, passar a conhecer um fim semelhante generalidade das cinzas produzidas pelas incineradoras hospitalares. Mais complicada parece a situao do Hospital de Guimares. Desde o encerramento, em Julho, da incineradora do Hospital de S. Marcos, de Braga, que ali so tratados os resduos de todas as unidades de sade do distrito, estando a laborar perto do seu limite. Toxicodependncia que o levara a contrair o vrus da sida. Cauteleiro de profisso, era, portanto, seropositivo, embora a doena estivesse ainda numa fase controlada. Nascera na freguesia de Miragaia, no Porto, e estava em Santa Cruz do Bispo. Na cadeia era bem-comportado e ocupava-se em trabalhos de faxina. Suicidou-se a 7 de Agosto. Sem deixar qualquer sinal que exteriorizasse o seu estado de esprito, uma conversa com algum. Sara da cela s 8h30, pedindo ao guarda para voltar pouco depois, para descansar. s 9h30, encontravam-no enforcado, com um lenol, nas grades da cela. O caso est a ser analisado pela Direco Regional de Educao do Centro e o Coordenador da rea Educativa (CAE) j admitiu no haver condies objectivas para que os professores executem as suas tarefas e continuem as actividades lectivas, sobretudo enquanto o director do Colgio do Mondego se encontrar ausente no estrangeiro. Mas o Sindicato dos Professores da Regio Centro (SPRC) reagiu, pedindo a demisso da direco do instituto de reinsero. A situao dos professores deste reformatrio complicada e complexa, o que motiva a maior parte dos protestos do SPRC. que os professores colocados no Colgio do Mondego pertencem escola bsica do 2 e 3 ciclo de Santa Clara, na Guarda, instituio para a qual concorrem. S que esta EB 2,3 tem um protocolo com o Colgio para a disponibilizao do seu corpo docente. Sendo assim, vo parar ao instituto de reinsero social professores sem experincia profissional e sem formao adequada situao. No outro dia, apareceram l dois guerrilheiros do PAIGC. Um deles olhou para mim e reconheceu-me logo. Era Oto, um ex-controlador de trfego areo em Bissalanca que se juntara guerrilha. Deu-me um mao de cigarros e sossegou-me. No dia seguinte, um domingo, chegmos a Conakry. Levaram-me ao Ministrio da Defesa para encontrar algum a quem me entregar, mas o edifcio estava vazio. Demos imensas voltas pelos corredores at os gendarmes decidirem levar-me para uma esquadra de polcia. No dia seguinte, abriram-me a porta da cela, fizeram-me subir umas escadinhas e entrar para uma sala. Sentados a uma mesa trapezoidal encontravam-se cinco indivduos de grand boubou [ vestimenta muulmana ] at aos ps, com os seus gorros tpicos, todos muito grandes e com ar srio. Transportado para os Estados Unidos, foi mantido nove meses numa cela sem luz, vigiado 24 horas por dia por uma cmara de televiso e por guardas prisionais. Lavagem ao crebro, presso do advogado na altura que lhe deu conta da inevitvel cadeira elctrica na falta de uma confisso, alega o presumvel homicida de Luther King. James Earl Ray, reza a publicidade da Thames, reclamou a sua inocncia e pediu julgamento. Ningum ouviu. At agora. Os bastidores do negcio que levou entrada do Banco Central Hispano (BCH) no capital do Banco Comercial Portugus foram a gota de gua. Alis, muito possivelmente vo ser expostos em tribunal. Tudo porque Neto da Silva, ex-secretrio de Estado do Comrcio Externo e depois lder da Socifa, exige receber uma comisso alegadamente prometida por Amrico Amorim caso conseguisse vender o lote de aces do empresrio no banco portugus. 
O que Amrico Amorim no confirma, recusando-se a pagar qualquer prestao de servios. Tudo comeou quando o rei da cortia concedeu a Neto da Silva um mandato para procurar um comprador para as aces que detinha no BCP. no havia limite de tempo para a concretizao do negcio, a comisso a receber seria de trs por cento do montante total envolvido, incluindo impostos, e deveria ser observado o mais rigoroso sigilo sobre a transaco. Marius Weiers, alto funcionrio do ministrio sul-africano do Comrcio e Indstria, disse ontem em Joanesburgo que Portugal rene condies para vir a constituir a porta da frica do Sul na Europa. apenas 4,6 por cento dos patres so licenciados e s cerca de 30 por cento tm habilitaes equivalentes ao secundrio. Uma das preocupaes claras do manifesto esclarecer que o congresso no concorre no plano dos partidos polticos. Como ali afirmado, no cabe exclusivamente classe poltica dar respostas sobre as interrogaes que o futuro levanta, antes exige a participao activa da sociedade civil, que tem igualmente responsabilidades e deveres de que no pode nem deve demitir-se. Gomes Motta, respondendo s perguntas dos jornalistas, iria mais longe ao afirmar que os partidos no esgotam a actividade cvica do pas e, afastando qualquer mal-estar que o congresso possa provocar nas hostes socialistas por eventualmente ofuscar alguns dos seus projectos, esclareceria ainda que esta iniciativa e as do PS em certa medida completam-se. Este manifesto ser publicado na ntegra nos jornais, sendo acompanhado por uma ficha de inscrio para participar nos trabalhos do congresso que decorrero no pavilho da FIL, de 8 a 10 de Maio prximo. At l, sero organizadas sesses em vrias cidades publicitando a iniciativa e recolhendo sugestes sobre a incorporao de temas regionais. No esquema apresentado, so seis os painis em debate, que abrangem temas que vo desde a cultura, o ensino e a estruturas econmicas at ao Estado democrtico, solidariedade e o papel de Portugal na Europa e no mundo. Estes so alguns dos dados da avaliao externa feita pelo Instituto de Inovao Educacional (IIE). O estudo representa um dos mais importantes indicadores das aprendizagens dos alunos, tendo inaugurado a era das provas aferidas no sistema educativo portugus. recentemente (ver PBLICO de 27/03/97), o ministro da Educao anunciou que seriam lanados testes de aferio de mbito nacional nos 4, 6 e 9 anos. A inteno figura no documento estratgico para a Educao entregue na Assembleia da Repblica, no qual, curiosamente, se retira a matria da alada do IIE e se entrega ao Gabinete de Avaliao Educacional (Gave). O relatrio adianta que todos os relatos de tortura e maus tratos se referiam a pessoas que tinham sido detidas sob suspeita de terem cometido delitos criminais, tendo em muitos dos casos sido libertadas sem serem inculpadas. R. -- lgico que os directos s podem existir se houver regras claras. E tm de existir porque, neste momento, impossvel produzir um filme sem ser em co-produo e sem dar contrapartidas a realizadores estrangeiros. Mas acredito que as preocupaes que os realizadores tm, tambm a direco do IPACA as tem. P. -- Est a falar pelo IPACA? O espanhol Jesus Montoya (Amaya) venceu ontem a 16 etapa da Volta Espanha em bicicleta, disputada entre Santander e o Alto de Campoo, na distncia de 173,4km e subiu ao 4 lugar da classificao geral individual. O Banco Nacional Ultramarino (BNU) tinha aprovado at Dezembro de 1991 cerca de seis milhes de contos de financiamentos para linhas dedicadas a pequenas e mdias empresas. 
A dotao global das linhas especficas para as PME de 12,5 milhes de contos, oriundos do Banco Europeu de Investimentos, do Fundo de Regularizao da Dvida Pblica e da Caixa Geral de Depsitos. Este apoio foi decidido em 1990 e inclui o reforo de capitais prprios, linhas de crdito em colaborao com a CGD, tal como os fundos de apoio s iniciativas de jovens empresrios. Por outro lado, at ao final de 1991 o BNU tinha prestado 292 garantias a favor do Instituto de Apoio s Pequenas e Mdias Empresas, para a liberalizao prvia de incentivos concedidos por este instituto. Toxicodependncia: falar claro ou confundir? A toxicodependncia, pelo desespero que provoca, um terreno propcio a todo o tipo de manipulaes e os mass media no tm fugido a esta tentao. As notcias revelam, salvo raras excepes, um cariz sensacionalista, que alimenta sobretudo a ideia de cura mgica ou o desejo de encontrar bodes expiatrios para a explicao do fenmeno, permitindo, por exemplo, que se fale de eficcia do(s) tratamento(s) de formas pouco honestas, por ignorncia ou por manipulao deliberada. bvio que quem lucra com esta situao no so os toxicodependentes e as suas famlias. Um exemplo recente de como se pode confundir a opinio pblica ocorreu no programa de TV Falar Claro, no passado dia 22 de Junho. O relatrio foi elaborado por trs dos mais respeitados peritos da Europa: John Wilesmith, director de epidemiologia no Laboratrio Central do Governo britnico, Bram Schreuder, do Instituto de Cincia e Sade Animal da Holanda, e C. Straub, do Centro Federal Alemo de Pesquisas sobre Vrus e Doenas de Animais. O que espanta, porm, no que se pretenda fazer marcha atrs relativamente a um referendo que devia ter sido realizado h j muito tempo. Bem vistas as coisas, os portugueses dificilmente compreenderiam o facto de serem agora chamados a pronunciar-se sobre algo que h muito foi decidido sua revelia. Pelo que, muito provavelmente, voltariam a abster-se de ir s urnas. Talvez prefiram ir comer castanhas para qualquer lado. O que realmente impressiona que ningum parea inclinado a defender -- por simples analogia -- que uma absteno em massa no referendo das regies por tambm em causa, e de modo irremedivel, o processo de regionalizao. Exceptuando o PCP e alguns perigosos radicais socialistas, o referendo das regies parece ter sido definitivamente aceite como algo de imprescindvel. Circunstncia que, tendo em conta a argumentao agora utilizada para tirar a castanha da Europa do braseiro da indiferena popular, s pode ser lida de uma forma: a tendncia dominante da classe poltica j traou o destino velha quimera da descentralizao. Na melhor das hipteses, o povo, devidamente aterrorizado pelo fantasma do separatismo, chumba o processo. No pior dos cenrios, o pas volta a abster-se e a regionalizao tem assegurados mais 25 anos de permanncia no fundo falso da gaveta da democracia. H, mas todas elas esto j transpostas para o direito nacional. Referem-se a normas de segurana e de sade e higiene, regras de iluminao, etc. As questes mais complicadas esto ainda sobre a mesa. o caso da organizao do tempo de trabalho e disposies sobre mulheres grvidas, cuja aprovao no se espera para a presidncia britnica, no entrando, assim, em vigor a 1 de Janeiro de 1993. s vezes, pensa que isto de imagem de empresa passa tambm por uma boa auditoria e que as empresas de auditoria estrangeiras do melhor nome. Poder recorrer a elas, mesmo que no estejam sediadas em Portugal? Os partidos pr-governamentais voltam atrs quanto prometida reviso constitucional. 
13 de Maio -- A oposio apela de novo populao para que se manifeste contra o Governo a partir do dia 17. A formao espanhola do Chapela venceu ontem o Torneio Internacional Feira de S. Mateus, que decorreu em Viseu, ao derrotar na final o Valladolid, por 28-27 (13-13 ao intervalo), na quinta e ltima jornada da prova, na qual participaram ainda FC Porto, Benfica, Sporting e Madeira SAD. O Feira de S. Mateus foi uma excelente oportunidade de ver em aco estas quatro equipas portuguesas que sero, certamente, uma ameaa hegemonia do tetracampeo ABC. Segundo um alto responsvel da instituio, que solicitou o anonimato, o ritmo de recuperao da economia mundial dever acentuar-se j no segundo semestre de 1992, opondo-se, deste modo, s teses mais pessimistas que prognosticam fortes probabilidades de uma recesso mundial. Embora se constatem riscos em algumas zonas, as nossas expectativas so de uma recuperao global, disse a mesma fonte. Apesar de no ser previsvel que a Europa abrande ainda mais a sua actividade, a retoma do seu crescimento dever ser mais fraca do que em outros pases industrializados, devido insuficincia das reformas estruturais, acrescentou. Tomando como caso concreto a Frana, com um mercado de trabalho considerado pouco flexvel, o mesmo responsvel do FMI antev que a Europa dever continuar a registar um crescimento lento, caso no concretize rapidamente as necessrias reformas estruturais. Esta situao faz prever que a Cimeira de Lisboa acabar por aprovar apenas um acordo poltico sobre o Pacote Delors II que constitua um compromisso formal dos Doze relativamente sua futura aprovao detalhada. A demisso de Hans-Dietrich Genscher e a crise poltica que atravessa o Governo de Bona (tendo em pano de fundo uma derrapagem econmica, resultado da unificao, que est a afectar a Alemanha e a ser inevitavelmente exportada para os outros pases europeus) , talvez, o maior revs que a presidncia portuguesa tem de enfrentar no s quanto aprovao do Pacote Delors (Genscher era um sincero apoiante das novas perspectivas financeiras para a Comunidade) como quanto generalidade dos dossiers que esto em cima da mesa. A Administrao da Casa Branca decidiu ontem manter algumas distncias face s declaraes do antigo Presidente democrata norte-americano Jimmy Carter aps a sua visita a Pale -- o bastio da liderana srvia da Bsnia nos arredores de Sarajevo --, ao recordar que os srvios bsnios continuam a ser considerados os agressores no conflito que se prolonga h mais de dois anos e meio nesta repblica balcnica. O Concurso de Dana de Salo para a Terceira Idade outro projecto destinado aos idosos sintrenses, que, previsto para o incio de Fevereiro, levar a diversas colectividades locais todos os que queiram concorrer ou simplesmente trocar uns passos de dana. Distribudos por vrios escales segundo a idade, aos concorrentes basta pertencer ao universo de cerca de 30 mil reformados do municpio, podendo inscrever-se na altura do baile, a divulgar brevemente por todos as associaes e grupos de idosos. Na sequncia de anteriores projectos, como as visitas ao Jardim Zoolgico, os espectculos de teatro e um passeio de cacilheiro no Tejo, a aco deste ano, como explica Jaime da Mata, pretende continuar o trabalho j iniciado: Impulsionar uma vivncia que salutar e necessria. Ainda segundo Travessa de Matos, as cmaras da regio querem que a estrada passe a ter caractersticas de itinerrio complementar e os anteriores responsveis apenas queriam proceder pavimentao do piso a partir de Pinheiro, Pvoa de Lanhoso. 
A beneficiao da EN 103 chegou a estar prevista no PIDDAC (Plano de Investimentos e Despesas para Desenvolvimento da Administrao Central) para 1995, tendo as reclamaes apresentadas pelos municpios Junta Autnoma de Estradas impedido a sua concretizao. Agora vo exigir ao ministro da tutela a urgente rectificao da EN 103, entre Braga e Chaves. As tcnicas de terapia gentica -- tambm chamada geneterapia -- consistem, em termos genricos, em inserir um gene teraputico nas clulas de um doente. O gene agora introduzido, de nome p53, considerado como um dos mais importantes genes supressores de tumores, pois comanda a produo de uma protena-chave da regulao da diviso celular. Recorde-se que a cancerizao se produz quando, por alguma razo, as clulas do organismo comeam a dividir-se de forma descontrolada sem chegarem maturao. Da que, quando o p53 defeituoso, a protena no fabricada e surge o cancro. Os cientistas estimam que cerca de metade dos casos de cancros esteja associada a mutaes do gene p53. No caso do doente agora tratado, que sofre de uma forma comum do cancro do pulmo, o tratamento consiste em administrar vrias injeces do gene p53 normal -- ou seja, dotado da sua aco supressora de cancros -- dentro do prprio tumor, a bordo de um vrus. Como o vrus tem uma propenso para infectar as clulas humanas, consegue em princpio fazer penetrar o gene dentro das clulas cancerosas, servindo de cavalo de Tria aos genes p53. Os Estados Unidos reexaminaro as bases do acordo de cooperao econmica concludo com o Japo, se no se alcanarem acordos credveis entre os dois pases durante a cimeira Clinton-Hosokawa, prevista para 11 de Fevereiro em Washington. A advertncia foi feita ontem em Tquio pelo secretrio norte-americano do Tesouro, Lloyd Bentsen, depois de se ter encontrado a seu pedido com o primeiro-ministro Morihiro Hosokawa e o ministro das Finanas Hirohisa Fujii. Os investimentos estrangeiros vo continuar a afluir este ano China, mas a um ritmo menos elevado do que em 1993, e o governo vai reforar o controlo sobre as joint-ventures, segundo o jornal China Daily. A baixa ir dever-se principalmente ao arrefecimento da actividade imobiliria, cujo crescimento exponencial no ano de 1993 se deveu em grande parte a capitais estrangeiros. Estes investigadores vacinaram quatro macacos com uma estirpe pouco virulenta do vrus HIV2 humano (o vrus da sida mais vulgar na frica), que no provoca a doena nestes animais e que desaparece rapidamente do seu organismo. A seguir, infectaram-nos com o vrus da sida dos macacos, o SIV. Quase quatro anos depois, trs dos animais ainda se encontram em boa sade, tendo o quarto morrido h uns meses. Todos os elementos de um grupo de animais que no tinha sido vacinado com o HIV2 morreram da sida dos macacos, nos meses que se seguiram sua infeco pelo SIV. Com os meus colegas, tencionamos agora tentar desenvolver vacinas destinadas ao ser humano, e em particular contra o vrus HIV2. Tratar-se- de vacinas preventivas que podero ser eficazes contra qualquer estirpe do HIV2. Por outro lado, j estamos a colaborar com colegas do Instituto Nacional de Sade da Guin-Bissau, na seleco de populaes que possam vir a participar em ensaios clnicos, quando tivermos uma vacina potencial. Reais jias do Norte de Portugal o ttulo de uma exposio que ir decorrer entre os prximos dias 10 e 26 de Novembro no Palcio da Bolsa, no Porto. 
Comissariado conjuntamente por J Tvora e Manuel Adlio Valle Gomes, o certame conta com o alto patrocnio dos duques de Bragana, que cederam para a ocasio a tiara em brilhantes do sculo XIX usada por Isabel Herdia no casamento com Duarte Pio. Segundo Virglio Folhadela, presidente da Associao Comercial do Porto, entidade que promove a iniciativa, a mostra tem como um dos seus objectivos principais revelar a forte tradio nortenha nos campos da ourivesaria e joalharia. A exposio -- formada sobretudo por objectos provenientes de coleces particulares -- inicia-se cronologicamente no sculo XVII e vem at aos nossos dias. Entre as jias apresentadas contam-se colares de brilhantes e esmeraldas (sc. XVIII e XIX), peas de ouro popular, trenedeiras, crislitas, laas, condecoraes das ordens de Malta e de Cristo em minas e brilhantes e uma caixa de rap do Rei Carlos Alberto. Assinale-se ainda a presena de um stand da Christie's no espao da mostra, bem como de um avaliador oficial daquela leiloeira inglesa. Mais de um milho de contos dever, segundo a Lusa, custar o seguro das obras expostas. Os independentistas tchetchenos prosseguiram o cerco a diversas posies militares russas, que responderam ao ataque abrindo fogo sobre concentraes de combatentes tchetchenos, segundo o termo utilizado pelo centro de imprensa instalado em Mosdok (Osstia do Norte), quartel-general das foras de interveno russas. No foram fornecidas informaes sobre o balano destes confrontos. Segundo a agncia Interfax, a artilharia russa bombardeou na noite de sexta para sbado as localidades de Samachki e Zakan-Iurt (situadas respectivamente a 30 km e 15 km a oeste de Grozni). Aps o fim do cerco capital tchetchena pelas foras russas, tm decorrido violentos e incessantes combates a sudoeste da cidade. A artilharia e a aviao russas tm vindo a atacar regularmente povoaes situadas nestas zonas, para onde recuaram os combatentes tchetchenos aps a queda de Grozni. Os confrontos tambm esto a atingir Argun, 15 km a leste da cidade. O mar. Tudo o que tenha a ver com o mar, no mar, ao p do mar. Quem que espera no encontrar nestas frias? O meu porteiro. A stima jornada do campeonato ingls de futebol no trouxe alteraes ao topo da classificao, j que os trs primeiros venceram os seus jogos e mantm as posies relativas. O Norwich bateu em casa o Southampton por 1-0 e soma agora 16 pontos, o Coventry foi ganhar ao terreno do Oldham por 1-0 e est com 15 e o Blackburn Rovers goleou no seu estdio o Nottingham Forest por 4-1, somando 14 pontos, mas com menos um jogo. Belmiro de Azevedo, presidente da Sonae, acredita que nos prximos anos os negcios do grupo no Brasil vo crescer at chegarem dimenso que actualmente tm em Portugal. Vamos ser to grandes no Brasil como em Portugal, disse ontem, no Porto, o patro da Sonae, no decorrer de uma videoconferncia que colocou em dilogo oito personalidades do mundo dos negcios dos dois pases. So muitos os registos, ao longo de sculos, de inmeras e variadas expresses de pnico e supersties provocadas pela apario sbita de um objecto brilhante, projectado no escuro da esfera celeste. A sua forma estranha, a mudana de posio relativamente s estrelas e a alterao de forma sugeriam interpretaes de almas vagabundas de grandes homens desaparecidos, ou sinais dos deuses anunciando prxima a vingana de algum comportamento menos ajuizado dos terrestres. Ora, os ricos no esto para isto. Como dizia o eng. lvaro Barreto, ser ministro ganhar mal, perder negcios, empobrecer alegremente. 
O prprio professor Cavaco, de modesta fortuna, farto de nos aturar, mandou tudo quele stio -- aplicando uma bofetada sem mo aos que vivem da politiquice -- e retomar a carreira profissional ganhando o dobro, chateando-se pela metade. Resta, por excluso, o governo dos pobrezinhos, humildes mas honrados. Modelo muito querido ao dr. lvaro Cunhal, mas completamente ultrapassado. L'Incoronazione di Poppea foi a ltima a ser reeditada e este regresso s discotecas no suporte j irreversvel conquistador do mercado ter de ser saudado com uma efusividade guardada para, e s para, as ocasies muito, muito, muito especiais. Nunca mais se poder falar, no futuro, da Poppea de Harnoncourt. Porque, depois deste registo de 1972, ficaria imortalizada em som e imagem -- na realizao visual de Jean-Pierre Ponnelle -- uma aproximao dos finais dos anos 70 deliberadamente expressionista e, agora, em 1993, em Salzburgo, Harnoncourt surpreende (e abre uma inflamada polmica) ao explorar at aos limites do sustentvel uma viso quase grotesca, comportando a contaminao por uma componente burlesca surpreendentemente suportada pelo cinismo da narrativa, com inesperada opulncia de meios instrumentais e com um estranho elenco reunindo intrpretes familiarizados com escolas de canto bem distintas e distantes das requeridas por uma especializao na msica seiscentista. Os investidores estrangeiros esto de volta ao Brasil. No centro das atenes esto as privatizaes que h pouco mais de um ms tiveram incio. Para concretizar os seus investimentos, os estrangeiros aguardam apenas a concluso do acordo entre o Brasil e o Fundo Monetrio Internacional (FMI) a realizar ainda este ms, que trar ao Brasil mais trs mil milhes de dlares. Alguns investidores externos aguardam apenas pela luz verde para a concretizao do emprstimo, interpretando de forma positiva o facto de o FMI estar disposto a aceitar a carta de intenes do governo de Collor de Mello, e a disponibilidade deste para aceitar o tratamento de choque que ser imposto economia brasileira. Quantos melhores jogos j aconteceram neste Mundial? O Nigria-Espanha e o Inglaterra-Romnia, da primeira fase? O Holanda-Jugoslvia e o Argentina-Inglaterra dos oitavos-de-final? O Brasil-Dinamarca? Ou, o at aqui melhor de todos, o Holanda-Argentina que, por si s, justifica a existncia de um campeonato e a paixo que todos temos? Este futebol que apazigua os desejos e termina com as nostalgias foi lanado desde o primeiro segundo. O tempo de estudo, esses aborrecidos, inteis e interminveis minutos iniciais, deram lugar ao jogo claro. Uma densidade construtiva, um vocabulrio variado e extremo, uma intensidade magnfica, difcil de atingir. Poder-se- ir mais longe? Quando, aos 38', Veron decidiu uma pequena pausa em dois passes laterais sem progresso, levou uma monumental assobiadela. Ningum queria que aquilo parasse. O teatro de Beaumarchais. A msica de Mozart. O cinema de Renoir. A Regra do Jogo a sntese perfeita do esprito dos dois primeiros na arte do ltimo. Toda a comdia humana numa dana beira do precipcio, a guerra que se avizinha. Um filme premonitrio, que ao tempo foi proibido e mutilado pela censura. Assim, de acordo com informao da CML, na Rua Edison inverte-se o sentido, passando a circular-se da Av. de Roma para a Av. de Madrid. Na scar Monteiro Torres, o troo compreendido entre a Rua Oliveira Martins e a Av. de Roma volta a ter sentido nico, circulando-se da Oliveira Martins para a Av. de Roma. 
No outro lado da guerra, em Belgrado, a eleio do escritor Dobrica Cosic, um ex-prximo de Tito, como Presidente da nova Jugoslvia foi cumprida, como previsto, pelo parlamento. Cosic era o nico candidato e os diplomatas ocidentais notam que o seu poder , na prtica, fictcio, pois o homem que mexe os cordelinhos em Belgrado continua a ser o Presidente srvio, Slobodan Milosevic. A contestao a este continua a subir de tom e ontem, pelo terceiro dia consecutivo, as ruas de Belgrado foram o palco de manifestaes exigindo a sua demisso. Mais de dez mil estudantes exigiram pacificamente o afastamento de Milosevic, a formao de um governo de salvao nacional e a realizao de eleies. Decretaram uma greve, ocupam trs faculdades e prometem resistir at satisfao das suas exigncias. Hoje de manh, o dia do encontro de Fernando Nogueira com o Presidente da Repblica e o ministro j manifestou grande curiosidade quanto ao que Ben Ali ter para lhe dizer. Segue-se a reunio com o secretrio-geral do partido no poder (RCD, Rassemblement Constitutionnel Democratique, Liga Constitucional Democrtica), e a entrevista com o ministro de Estado e do Interior, Abdallah Kallel, tido como brao direito do Presidente e o nmero dois do Governo. Um dos resultados prticos desta fuso que Michael Eisner, presidente da Walt Disney, se tornou da noite para o dia o homem mais poderoso do sector. Nada mau para quem era acusado de ter demasiadas cautelas na conduo do seu grupo e de estar tolhido por uma notvel falta de viso global. o seu amigo e colega de direco Frank Wells morreu num desastre areo e, logo depois, o director dos estdios, Jeffrey Katzenberg (responsvel pelo renascimento da animao na Disney), bateu com a porta, insatisfeito por no chegar ao topo da empresa, fundando a produtora SKG-Dreamworks com Steven Spielberg e David Geffen. Di Matteo recebeu a bola na sua intermediria, progrediu pela zona central e, a 25 metros da baliza, arrancou um tiro fulminante que s parou nas redes. Estavam decorridos apenas 42 segundos do jogo da final da Taa de Inglaterra em futebol, que opunha o Chelsea ao Middlesbrough. No fim, o Chelsea juntou a este golo mais um, de Newton (83'), e conquistou o trofu pela segunda vez na histria. A circular justifica a iniciativa por terem surgido alguns problemas no tratamento de Testemunhas de Jeov, quando imperativo ou h a eventualidade de se administrar sangue, como medida indispensvel manuteno da vida. Em declaraes ao PBLICO, Francisco Costa, membro da comisso de ligao do Hospital, um rgo de contacto entre os hospitais e as Testemunhas de Jeov, disse conhecer o documento, que, no essencial, reitera uma prtica utilizada anteriormente pela direco do hospital, excepo do que se passava com o tratamento de crianas. Enquanto at aqui o HDL tinha em ateno o parecer dos pais no tratamento de crianas a necessitar de sangue, de agora em diante os mdicos esto livres de o fazer. Francisco Costa, h trs anos na comisso de ligao hospitalar criada pelas Testemunhas, refere que a posio do HDL no tem semelhanas com os procedimentos de outros hospitais dos distritos volta, citando os casos de Coimbra e Santarm. P. -- Como economista, acredita que isso possa acontecer? R. -- No, no acredito. Nunca quis assinar uma carta de intenes com o FMI que colocasse metas que no pudessem ser cumpridas. Se o acordo for cumprido, tal com ele foi escrito, a consequncia vai ser mais recesso, maior aperto. Os indicadores apontam para uma quebra na actividade econmica, para um aumento do desemprego e da inflao. 
O acordo inclui um maior aperto fiscal e logo a diminuio dos gastos e da actividade e da oferta dos servios de infra-estruturas. O acordo indica que vamos continuar num processo recessivo. As metas quanto inflao conseguir que esta, at ao final de 1992, se situe nos 12 por cento ao ms. P. -- Porqu? A confirmarem-se os confrontos, sero o primeiro incidente fronteirio grave desde a morte de oito diplomatas e um jornalista iranianos no Afeganisto, no princpio de Agosto, o acontecimento que agudizou a crise entre os dois pases. Desde ento, Teero, que viu derrotados os seus ltimos aliados na guerra civil afeg, e Cabul, com a confiana renovada e o apoio do Paquisto, envolveram-se numa escalada de ameaas verbais, concentrando ao mesmo tempo frente a frente poderosos efectivos militares -- os maiores desde o fim da guerra entre o Iro e o Iraque, em 1988. As famlias ali residentes, que actualmente sobem a p cinco andares por escadas de madeira que ameaam ruir, vo passar a ter elevador e vero o interior das suas residncias modernizado, com a instalao de casas de banho e cozinhas devidamente equipadas. O lanamento do projecto de recuperao, por concurso pblico entre arquitectos, ter lugar em Julho, estando previsto o incio das obras, oradas em 75 mil contos, para o Vero de 1997. Joo Pinto -- Antnio Oliveira no lhe dever fazer a mesma surpresa de Carlos Queiroz que, para admirao de todos, em Junho deste ano o remeteu para o banco dos suplentes no encontro com a Estnia, de apuramento para o Mundial dos Estados Unidos, trocando-o por Abel Xavier. O capito portista continua numa forma excelente e, apesar dos anos, no perdeu a velocidade, percorrendo o seu flanco com grande -vontade. O jovem sportinguista Nlson ter de esperar. Hlder -- O central benfiquista ter mais dificuldades em conseguir uma cadeira no onze do seu clube -- onde os lugares parecem estar destinados dupla brasileira Mozer / Paulo -- do que na equipa nacional. Uma situao complicada porque Hlder se tem mostrado, neste incio do campeonato, como um dos melhores jogadores do Benfica. Se, como se prev, a Irlanda do Norte jogar com dois pontas-de-lana, Hlder ter funes de marcao e a deve jogar mais em antecipao para evitar descuidos por alguma lentido. Quem culpabiliza o exterior pelas suas prprias falhas est na verdade procurando desculpas para o seu prprio insucesso. As etapas boas e ms por que cada indivduo tem de passar fazem parte do sentido delineado para a sua prpria vida. Isto , necessrio para essa pessoa ter de enfrentar essas circunstncias e prosseguir sem apego. (Atitude basicamente budista em relao a todas as coisas da vida, porque as desagradveis causam repulsa, e as agradveis tristeza quando j se as no tem). O amor deve servir como pano de fundo vida de cada pessoa. Claro que se evoca aqui um amor justo e totalmente abrangente, com base em regras ticas universais. As coisas que ocorrem ao longo da vida consideradas como desagradveis devem ento ser encaradas como um novo passo na aprendizagem existencial, algo pelo qual temos de passar para evoluir. No h que sentir revolta, desnimo nem tristeza, devendo procurar-se ver sempre o outro lado da moeda. At l, no entanto, o imenso pas precisa da ajuda do Ocidente e, se bem que esteja convencido da irreversibilidade das reformas em curso, o Presidente russo no deixaria de argumentar que, se elas falharem, a haveria que pagar dez vezes mais do que os investimentos que agora so necessrios para sustentar a reforma. 
No encontro com a imprensa, Kohl afirmaria que foi a primeira vez que houve um debate com total sinceridade e boa vontade, num respeito total por cada um dos parceiros, até porque a Rússia entrou na via da democracia, de um Estado de direito e do respeito pelos direitos humanos. Em Castelo de Vide, no Centro Municipal de Cultura, está patente a exposição Diversidades ... com Mestre, com obras de pintura de Martins Correia. As motos BMW fazem 75 anos de existência e o Centro Comercial Colombo comemora esse aniversário com uma exposição histórica, na Praça Trópico de Câncer (a praça central do Centro). Ocasião para ver 22 modelos antigos e oito recentes de motos BMW. Em verdade vos digo que Indiana Jones e os seus émulos ainda estão na fase do desmame se os compararmos com alguns heróis do passado, do serial ou do filme de aventuras. Neste último caso, o destaque vai para um filme de guerra, Jornada Trágica, que terá menos a ver com o conflito a que se referia do que com o western ou a floresta de Sherwood. Errol Flynn pôs de lado o arco e as flechas, mas levou o mesmo espírito a bordo do bombardeiro. Ao vermos filmes como Jornada Trágica, Objectivo Burma e outros Sargentos Imortais, podemos interrogar-nos como foi possível que a Alemanha e o Japão tivessem resistido tanto tempo a estes pelotões de indomáveis patifes. Vamos para a Austrália despachar os japoneses! Promessa cumprida apenas com um desvio na rota, porque a segunda incursão de Flynn na guerra foi na Birmânia, onde faz uma razia entre os filhos do Sol Nascente. O filme chamou-se Objectivo Burma e, embora mais sério, resultava no mesmo. Realce ainda para o mau dia de Marco Pantani, que apenas veio a Espanha para se preparar para o Mundial da Colômbia. O italiano chegou no 66º posto, a 13m45s do primeiro, e é agora 27º na geral, quatro lugares abaixo de Zuelle. Hoje, o pelotão tem mais uma etapa no difícil traçado dos Pirenéus, com início em Naut Aran e final em Luz Ardiden, na parte francesa, após 179km. Destaque para a subida do Tourmalet, a 2115m de altitude, que é uma das montanhas míticas da Volta à França. Para a Quercus, as 40 incineradoras de resíduos hospitalares existentes no país são ilegais. Na Procuradoria Geral da República deu já anteontem entrada um processo criminal contra o Hospital Júlio de Matos, de Lisboa. Um momento histórico é como pode classificar-se a presença do Castelo da Maia na final-four da Liga das Taças em voleibol, que entre hoje e amanhã decorre na cidade de Cuneo, no Norte de Itália. Depois de terem terminado a fase regular só com uma derrota nos sete jogos realizados, os maiatos não deverão ir muito mais longe. Com adversários como os gregos do Olympiakos, os espanhóis do Gran Canaria e os italianos do Alpitour Cuneo (actuais detentores do troféu), ao Castelo da Maia pouco mais resta do que tentar ter uma participação digna. A criação de um Conselho Nacional do Ambiente seria um passo decisivo para criar um fórum onde se cruzassem todas as forças que atravessam horizontalmente as questões de ambiente e desenvolvimento. Contribuiria, a meu ver decisivamente, para um distender de tensões e limar de arestas entre os planos de desenvolvimento e a correcta gestão e preservação de bens alimentares, e seguramente para o indispensável diálogo entre forças que dele têm andado arredias. As experiências do Conselho Económico e Social ou do Conselho Nacional de Educação, com todos os seus eventuais defeitos, são sem dúvida um indicador da possibilidade desta inovação. É tempo de desdramatizar o ambiente e de o encarar de uma forma racional e corajosa, não basta o bom senso. A ver vamos.
Em 1990, existiam 800 supercomputadores em todo o mundo, distribudos por grandes empresas ou organismos pblicos que investem nestes equipamentos para os disponibilizar a instituies cientficas que pagam pelo tempo de utilizao -- como a FCCN em Portugal (ver Universidades nacionais sem supercomputador). Para o presidente da FCCN, o nmero destas mquinas mais ou menos constante, porque, conforme se vo desenvolvendo novos equipamentos, outros deixam de se enquadrar na definio de supercomputador. As duas grandes famlias da supercomputao so as chamadas mquinas de multiprocessamento vectorial (como alguns Cray) e as de processamento paralelo -- de que a mais conhecida a Connection Machine, da Thinking Machines. Ao contrrio do processamento vectorial, em que vrios processadores utilizam a mesma memria -- e onde, na opinio de Heitor Pina, apenas podero existir progressos marginais --, as mquinas de processamento paralelo, surgidas nos anos 60 e generalizadas no final da dcada de 80, tm uma memria dedicada para cada processador, acabando com os engarrafamentos na partilha da memria. Falamos, obviamente, de Nuno Gomes, que marcou quatro golos num jogo do campeonato, selando a vitria do Benfica sobre o Varzim. O jovem de Amarante, 21 anos feitos em Julho, custou ao Benfica cerca de 600 mil contos e demorou a mostrar servio. com Souness e o seu futebol de cruzamentos, Nuno Gomes no era a primeira opo mas, provavelmente, acabar at por ter mais facilidade em marcar golos. O treinador ingls liberta mais a equipa, joga com mais unidades na rea e favorece o ponta-de-lana. Nuno Gomes marcou quinze golos no Boavista da poca passada, sobretudo atravs de uma segunda parte da poca -- com Mrio Reis -- muito forte, numa equipa que tinha ainda Jimmy. Era uma dupla terrvel, que se completava pela potncia do holands e o jogo mais tcnico do jovem portugus. Funcionou s mil maravilhas e o Boavista acabou por ganhar a Taa de Portugal com dois golos de Sanchez e um de Nuno Gomes. Deparamos ento com uma situao caricata. Os israelitas dizem que no abandonaro a sua zona de segurana enquanto os srios continuarem a ocupar o Lbano. S que, ao mesmo tempo, pedem aos srios que dominem o Hezbollah, reconhecendo implicitamente que s Damasco pode pacificar a sua fronteira mais vulnervel. possvel que os srios tenham permitido, ou at mesmo fomentado, esta subida de tenso para obrigar os israelitas a admitir que o Lbano um protectorado de Damasco. Numa negociao sempre bom ter dois ferros no fogo, observou o jornalista francs Patrice Claude. Como corolrio inevitvel do processo, e na impossibilidade de injectar mais capitais prprios, a administrao da Jotocar decidiu solicitar em tribunal um processo de recuperao, lamentando os inconvenientes que esta situao no deixar de trazer aos credores. Quanto data da reabertura da empresa, como disse ao PBLICO um responsvel da Cuf-Txteis, ainda imprevisvel. Teixeira da Mota, porta-voz do BFE, limitou-se a dizer, sobre este assunto, que, aps seis meses de efectiva gesto da actual administrao, concludos em Agosto de 1992, o banco considerou o processo encerrado e que o comprador confirmou ao BFE, nesse mesmo ms, a plena viabilidade e eficcia do contrato celebrado entre as partes. E mais no disse. E as variaes dos papis no foram mais expressivas. O vector accionista atravessa, de facto, um mau momento, apresentando permanentes hesitaes. Com muitos dos papis com a cotao interrompida em consequncia do perodo de pagamento de dividendos, o negcio continuou pouco expressivo, sem profundidade. 
No se prev, alis, que at ao final do ms se registem alteraes com significado. O ndice BVL Geral encerrou em baixa, cotando-se nos 973,03 pontos, menos 0,23 por cento, enquanto o BPA Contnuo cedeu 0,18 por cento ao fixar-se nos 154,72 pontos. Em termos de totais, na Bolsa de Lisboa intermediaram-se 10,592 milhes de contos, mais 258,62 por cento. A Bolsa do Porto encerrou com 5,094 milhes de contos, mais 215,44 por cento que na segunda-feira. Mesmo antes de jogar, o FC Porto j estava a ganhar com a viagem Crocia. Os responsveis pelo futebol croata gostaram de rever Ivic e, principalmente, da coragem revelada pelos portistas ao aceitarem jogar numa zona de conflito militar premente, e j garantiram o direito de preferncia ao FC Porto na escolha de futuros talentos. Uma bom investimento na terra de onde saram Boksic, Prosinecki, Boban ou Suker ... Um lugar na primeira linha da grelha de partida na contratao de futuros talentos croatas foi, para j, o que o FC Porto conseguiu com a visita a esta ex-repblica jugoslava. Afastado das competies internacionais desde 1990 -- a seleco jugoslava apurada para a fase final do Europeu da Sucia ficou em casa ltima hora e foi substituda pela surpreendente Dinamarca --, o futebol croata tenta agora voltar ao circuito europeu. E a presena do FC Porto faz parte dessa tentativa, com a particularidade dos drages no cobrarem cachet para serem a primeira equipa de nomeada a visitar o pas. Se for aprovada a verso final do anteprojecto de lei de liberdade religiosa -- que hoje ser apresentada publicamente --, os crentes que o desejarem podem passar a indicar que destino pretendem para uma quota equivalente a 0,5 por cento do imposto sobre o rendimento das pessoas singulares (IRS). De acordo com o texto proposto no ponto 3 do artigo 31, os contribuintes podem indicar qual a igreja ou comunidade religiosa radicada no pas, a inscrever na declarao de rendimentos, que desejam que receba aquela dotao para fins religiosos ou de beneficncia. O valor de 0,5 por cento no aleatrio. Ele corresponde sensivelmente ao valor actual da devoluo do IVA, que o Estado faz Igreja Catlica, de acordo com a interpretao que tem sido feita da Concordata. foi essa a opo em Espanha e Itlia, pases com acordos de Concordata semelhantes a Portugal. Desde h poucos anos, em ambos os pases optou-se por a possibilidade de cada pessoa indicar, na declarao de rendimentos, qual o fim que pretende dar aquela percentagem -- 0,8 em Espanha, 0,523 em Itlia. Destaque: Grande parte da imaginao da infncia passa-se na descoberta e na contestao da mentira das regras que o mundo lhe impe. noutras encaram-nas como um jogo e respondem-lhes jogando tambm, mas sua maneira. O viaduto da Infante Santo ter uma faixa em cada sentido, dispondo tambm de um passeio para pees. A sua abertura ao trfego estava inicialmente prevista para o ms de Novembro. A medida preconizada prende-se com o objectivo de se criaram mais empregos e visa, sobretudo os pases da Comunidade Europeia que, entre 1972 e 1992, viram aumentar o nmero de postos de trabalho em cinco por cento, contra uma taxa de 37 por cento nos Estados Unidos, Canad e Japo. Para combater o mal -- que s nos 24 pases da Organizao para a Cooperao e Desenvolvimento Econmico (OCDE) abrange 36 milhes de pessoas --, o FMI sugere a promoo da flexibilidade no mercado de emprego, cuja aplicao dever levar diminuio de regalias sociais, mas, em alguns pases, preciso retirar as restries aos horrios de trabalho e sobre os empregos em ' part-time '. 
Defende o Fundo que h uma necessidade urgente em quase todos os pases de reexaminar o financiamento e a generosidade global dos regimes de segurana social, com o objectivo de eliminar os elementos que desencorajam a criao de novos empregos. No relatrio citam-se os elevados encargos sociais das empresas, a generosidade dos subsdios de desemprego, o muito elevado salrio mnimo e as regras de proteco de emprego, consideradas muito rgidas, como factores desmotivadores da criao de postos de trabalho. Entretanto, a Junta de Freguesia de Riba de Ave foi recebida anteontem por Agostinho Fernandes, a quem se manifestou preocupada com a situao e props o encerramento da ETRSU at que seja reparada a avaria. O senhor presidente disse-nos que o que se est a passar inadmissvel e que a populao tem razo para estar preocupada, mas adiantou-nos que est convencido de que se trata de um problema tcnico, declarou-nos Miguel Lopes, cujo pedido para suspender a laborao da estao at reparao definitiva da avaria no recebeu uma resposta concreta. O presidente da Cmara, alis (na linha do que dissera ao PBLICO o director regional do Ambiente do Norte, Guedes Marques), afirmou Lusa que uma avaria pontual nunca poder justificar o encerramento de uma estrutura deste tipo [ ETRSU ]. O Conselho de Arbitragem da Associao de Futebol de Lisboa apoiou ontem Jorge Coroado e contestou a Comisso de Arbitragem da Liga Portuguesa de Futebol Profissional, que excluiu aquele rbitro do jogo FC Porto-V.Guimares depois de ter montado um esquema para culpar Coroado de fugas de informao. O conselho lisboeta considera censurvel o comportamento de Coroado, mas recorda que no est prevista nas normas qualquer pena para a infraco que este ter cometido, pelo que a comisso da Liga no podia t-lo excludo do jogo. Por isso, condena as interpretaes de convenincia da lei e pe os seus servios ao dispr de Coroado, para apoiar o rbitro em qualquer aco disciplinar que porventura lhe seja injustamente instaurada. O volume financeiro em entradas de cinema atinge 5000 milhes de dlares, enquanto o mercado dos videojogos vale 13 mil milhes de dlares anuais (7000 milhes gastos em salas de jogos e mais 6000 milhes em videojogos domsticos). Por outro lado, segundo dados da editora Capcom citados na revista Business Week, foram vendidas 12 milhes de cpias de Street Fighter em todo o mundo, havendo 25 milhes de norte-americanos que j o jogaram. A Capcom cr que, com estes nmeros, no ser difcil recuperar os 40 milhes de dlares investidos no referido filme. Segundo o presidente da empresa, Kenzo Tsujimoto, o que se ganha na produo do filme o conhecimento e a experincia de que a Capcom necessita para fazer videojogos no futuro. Nem de propsito, na Primavera sair Street Fighter III: The Movie Game. Mas a dificuldade desta opo assumidamente militante da organizao est nessa nova vocao dos portugueses ter de nascer durante o curto perodo em que a exposio estiver aberta ao pblico. Na verdade, a organizao da exposio tem de juntar duas realidades que sempre andaram afastadas: arquitectura e grande pblico. O desafio posto organizao da exposio ento levar os muncipes de uma cidade a apaixonarem-se perdidamente por um edifcio ao ponto de sarem para a rua em sua defesa. por isso que a exposio Cassiano Branco e o den -- Lisboa 1991 resultou na maior operao de mediatizao da arquitectura jamais vista em Portugal. 
A grande dvida que circula entre a comunidade timorense a razo que teria levado esta mulher doente -- Wendi tem esclerose mltipla -- a escolher o jornal Kompas para prestar tais declaraes, pois se, em tudo isto, existe um fundo de verdade, um peridico indonsio seria a ltima escolha para algum que quer ter o mnimo de credibilidade, comentam. [ Jos Ramos-Horta reagiu j, acusando Wendi Holland de estar a ser utilizada pela inteligncia indonsia. Num depoimento telefnico prestado ao CMR -- o primeiro rgo da informao estrangeira a divulgar o contedo da entrevista publicada pelo jornal indonsio --, Horta disse tratar-se de uma campanha que se arrasta h vrios meses e foi lanada pelo jornalista Petrus Suriadi, que esteve recentemente em Portugal. A Comisso Europeia divulgou na semana passada as novas previses de crescimento para as economias dos Doze em 1994. Para o crescimento mdio apontada agora uma estimativa de 1,6 por cento, superior em 0,3 pontos previso adiantada anteriormente, no Outono de 1993. Mas enquanto a evoluo mdia foi revista em alta, os valores para Portugal registaram um recuo ligeiro: dos anteriores 1,4 por cento para 1,25 por cento. O trabalho dirio de Peter Williams construir mundos e imaginar o modo como as pessoas vo interagir neles (ou como pensa que isso vai acontecer). Deuses em ascenso, os criadores de realidades virtuais vo influenciar as sociedades futuras com as imagens que criarem. Se j se do conta disso, algo que no deixam transparecer ... Numa casa georgiana com vista para o Tamisa, fora de Londres, coexistem dois mundos. Para os diferenciar, tabuletas indicam o Real World e, outra, apontada ao estdio, o Virtual World. Carlos Cidade, Linhares de Castro, Lus Janurio, Leal Amado e Moura e S -- o nico que no ex-militante do PCP -- so os nomes escolhidos e j ratificados por o Ncleo de Coimbra da Plataforma de Esquerda para integrar as listas do PS cmara e assembleia municipais da capital do distrito. A Plataforma de Esquerda dever ainda ficar representada nos concelhos de Montemor-o-velho, Lous, Condeixa e Cantanhede, Arganil, Figueira da Foz, Miranda do Corvo e Soure. Essencialmente no concelho de Coimbra e em vrios casos, a PE poder mesmo encabear a candidatura presidncia das juntas de freguesia. a primeira vez que feita uma perseguio federal a organizadores de apostas na Internet. Nos sites das empresas explicava-se como se podia apostar sobre os resultados de jogos profissionais e universitrios de futebol, basquetebol, hquei e basebol. Os apostadores teriam de abrir uma conta e depositar a entre 1000 e 5000 dlares (185 a 925 contos). As apostas sobre os resultados do jogo, feitas por telefone ou pela Internet, custavam entre dez e 50 dlares (1.85000 a 9.25000) -- valores sobre os quais as empresas retinham dez por cento. Os eventuais ganhos seriam depositados nas contas bancrias dos apostadores ou enviados por correio. Cada um dos acusados arrisca-se agora a uma pena mxima de cinco anos de priso e a uma multa que pode ir at aos 250 mil dlares (45 mil contos). Mas Benjamin Brafman -- advogado de Jay Cohen, presidente e proprietrio de uma das empresas (a World Sports Exchange) -- no tem a certeza de que o Governo norte-americano tenha jurisdio neste caso. Mary Jo White considerou no entanto que as empresas no estavam protegidas pelo facto de a sua sede se situar fora dos Estados Unidos. Ao longo da semana o Banco de Portugal cedeu liquidez num montante superior ao da semana anterior, tendo mantido as taxas. 
No que concerne à Dívida Pública corrente, realizaram-se três leilões de Bilhetes do Tesouro. e no terceiro, a 182 dias, foram colocados 25 milhões de contos, à taxa média de 10,48 por cento, o que acabou por reflectir uma subida das taxas em relação aos leilões anteriores para idênticos períodos. O abortamento tem sempre uma justificação. Bem basta sofrê-la, bem basta ter de o realizar por causa dela. Diga-se, de uma vez e claramente, o que se quer ou o que se quer mais. As reformas a levar a cabo podem conduzir ao desaparecimento do ENDA e à sua substituição por outro órgão, que englobe as diversas tendências do movimento associativo universitário. Quando as coisas não correm bem, é preciso mudar alguma coisa, justificou um dos participantes num encontro de dirigentes associativos que decorreu durante o fim de semana nas instalações da Universidade de Aveiro. A realização de um inquérito destinado a apurar o encargo médio mensal de cada estudante universitário foi outra das medidas saídas do encontro de Aveiro. Pretendemos definir com o máximo rigor os custos médios de cada estudante, conforme a região onde se encontra, pois actualmente não existem dados concretos sobre o assunto, esclareceu Miguel Rodrigues, que é também presidente da Associação Académica da Universidade de Aveiro. As conclusões do inquérito -- que será realizado com o apoio do Conselho de Reitores -- servirão de base a uma proposta a apresentar ao Ministério da Educação relativa ao montante das bolsas de estudo e propinas. Já no próximo dia 25, os dirigentes associativos irão reunir-se com o ministro Couto dos Santos para discutir, mais uma vez, o problema do pagamento das propinas universitárias. De facto, os indícios acumulam-se nesse sentido. Não só por aquilo que se viu ontem nas fortalezas tradicionalmente inexpugnáveis do PCP, mas também pela receptividade que a campanha de Torres Couto está a encontrar e, sobretudo, pelo que as sondagens começam a indicar. Segundo o estudo publicado ontem pelo Expresso, a CDU perderia em Setúbal quatro dos sete deputados de que dispõe, passando o PS exactamente para a situação inversa. Em 6 de Outubro, ver-se-á se estamos mesmo perante uma débâcle comunista em Setúbal. Não é óbvio à primeira vista, mas uma observação mais cuidada não deixa dúvidas. O portal da Quinta do Castro, o único que resta dos dois que já existiram, está acentuadamente inclinado e pode cair a qualquer momento. Um aterro feito durante a construção da nova estrada Valença-Monção está a pressionar a estrutura para trás e irá, inevitavelmente, provocar o seu desabamento. O muro contíguo ao portal, parte da estrutura original, foi desmantelado durante a implantação da nova via, obra que esteve a cargo das empresas Soares da Costa e Monte & Monte. No seu lugar, surgiu um muro feito de blocos de cimento. Os pedaços da estrutura original foram escondidos pela zeladora nas proximidades, cobertos por mato, prevenindo eventuais furtos da pedra trabalhada. De acordo com Manuel Cunha, o actual presidente da Junta de Freguesia, durante as obras, a empresa responsável necessitou de retirar saibro da zona onde o muro se encontrava para efectuar um aterro. Escavou quanto quis, imediatamente atrás do muro que deitou abaixo para as máquinas poderem passar -- diz o autarca que com a promessa de, depois, reconstruir o muro. Em 1990, a estrada foi inaugurada e, quatro anos depois, tudo está na mesma. alguns países, mas sobretudo a Alemanha, pretendem evitar que a moeda única assuma logo desde o início o mesmo estatuto legal que as divisas nacionais, neste caso, o marco.
Se os Quinze conseguirem resolver estes trs problemas, resta apenas cimeira de Madrid escolher o nome da moeda nica, tendo em conta que a denominao ecu, expressa no Tratado de Maastricht, rejeitada pela Alemanha, que prefere euro. R. -- Porque o simbolismo das escadas est em todo o lado: servem para descer ao inferno ou subir ao cu. Sobem-se os degraus para chegar ao sucesso ou descem-se, em caso de fracasso. P. -- Quais as suas influncias no campo da pintura? R. -- O PC serviu-se da extrema-esquerda, como ponta-de-lana, mas depois foi vtima da sua impreparao e do seu espontanesmo. Quando, noite, os Comandos atacam a Polcia Militar, outra vez o Costa Gomes que, de madrugada, consegue convencer o Partido Comunista a desistir. P. -- Com que contrapartidas? Esta uma das afirmaes principais da posio da Igreja Adventista do Stimo Dia sobre o aborto, que considerou ter chegado a altura conveniente de divulgar o ponto de vista daquela instituio religiosa. essa expectativa que poder esboroar-se totalmente se Jernimo e Matos desistirem boca das urnas, j que nesse caso o sucessor de Mrio Soares ser inevitavelmente eleito no dia 14. Isto porque, de acordo com a lei, nas presidenciais eleito o candidato que obtiver mais de metade dos votos expressos -- ou seja, pelo menos 50 por cento mais um --, excluindo-se desta contabilidade os votos em branco e os nulos. Nem sempre foi assim. A questo do peso dos votos em branco no escrutnio final s ficou resolvida com uma alterao da lei eleitoral, de 26 de Novembro de 1985, que veio pr ponto final ao diferendo que ops nessa matria o Supremo Tribunal de Justia (STJ) e o Secretariado Tcnico dos Assuntos para o Processo Eleitoral (STAPE) Comisso Nacional de Eleies (CNE). Separados por linhas subterrneas, o Metropolitano de Lisboa e a Cmara Municipal andam s avessas. As solues adoptadas para a expanso da rede esto a gerar posies contrrias e desta vez a empresa que escreve a Sampaio para refutar as afirmaes do Municpio. 07.00-09.00 Cafena. Os ouvintes da Radical acordam com Pedro Marques. Nono ms do calendrio lunar islmico, durante o qual mil milhes de pessoas se abstm de comer, fumar, ouvir msica ou de ter relaes sexuais, o Ramado o ms sagrado muulmano. Mas para alguns tambm sinnimo de jihad (guerra santa). H onze meses, exploses na cidade e massacres em zonas rurais causaram pelo menos 400 mortos e traumatizaram a cidade, onde o grupo islmico armado (GIA) enfrentou, com sucesso, a apertada malha das foras de segurana argelinas. o que se teme agora, segundo os panfletos, aparecidos sobretudo em mesquitas no controladas pelo Estado, dentro das quais as oraes tm sido acompanhadas por murmrios pouco comuns. Traumatizado, refugio-me no Canal 1, onde se estreia Tudo pelos Outros, com Vtor Norte a mostrar que estaria muito melhor num palco. Aguento, com um estoicismo que eu prprio admiro, a inundao de lugares-comuns, comparvel das guas do Tejo, porm menos benfica para a agricultura. Mas, quando chega o momento de homenagem s mezinhas, no resisto a zapar e a saltar novamente para a SIC. Perplexidade e desorientao: a SIC tambm transmite homenagens s mezinhas e aos paizinhos, a nica diferena est em que, em vez de flores, do-lhes msica pimba. Zapo e rezapo e os momentos de sincronismo repetem-se com enlouquecedora constncia: as mesmas lgrimas, as mesmas fungadelas, o mesmo bem-fazer, os mesmos familiares abraados uns aos outros. J no sei onde estou, talvez j nem saiba quem sou. 
Gramm ser o primeiro a declarar formalmente a candidatura, na prxima semana, na sua cidade natal de College Station, Texas. Na verdade, Gramm planeia a sua campanha h anos e anda activamente na estrada h meses. O ex-governador do Tennessee Lamar Alexander ser o seguinte a declarar-se, dias depois. Pouco conhecido fora do Tennessee e de Washington, Alexander conseguiu ganhar a ateno do seu partido ao apresentar sua volta alguns nomes de peso. Um documento confidencial do Painel Intergovernamental Sobre Mudanas Climticas (IPCC), citado na ltima edio do semanrio britnico Independent on Sunday, afirma que o globo est de facto a aquecer e que a poluio de origem humana um dos factores responsveis pelo fenmeno. O documento do IPCC vai ser apresentado numa conferncia internacional que ter lugar em Roma em Dezembro. O IPCC rene nesse documento a opinio de 2.000 meteorologistas que prevem no futuro uma maior incidncia de doenas tropicais, o aumento de secas e cheias, a morte de florestas e a diminuio de colheitas nos pases pobres. O anterior relatrio do IPCC, elaborado em 1990, considerava que a temperatura do globo estava a aumentar mas no considerava provado que a causa fosse a aco humana. Agora, dizem os investigadores, j no restam dvidas. O IPCC um grupo de peritos que foi estabelecido em 1988 pela Organizao Meteorolgica Mundial e pelo Programa das Naes Unidas para o Ambiente. Mas o Itamaraty tem vindo a sofrer a aco corrosiva dos ltimos governos, que lhe impuseram ministros sem preparao, alm da reduo de mais de 50 por cento do seu oramento. Os diplomatas brasileiros queixam-se da falta de recursos, j que so obrigados a cobrir muitas despesas com dinheiro do prprio bolso. Contudo, o Itamaraty apresenta alguns excessos que surpreendem. o caso das embaixadas situadas nos Estados Unidos e em Londres, Paris e Roma, onde as mordomias saltam aos olhos dos brasileiros. o diplomata brasileiro vive num apartamento de quase mil metros quadrados na Avenue Foch, junto ao Arco do Triunfo, em Paris. Ultrapassadas as consideraes gerais, o documento mergulha quase ponto por ponto nas propostas do ministrio sugerindo caminhos, apresentando alternativas e recomendaes. Desde logo no que toca ao conceito de propina. Neste ponto, o CNE avana com dois cenrios sobre a controvrsia, revelando como o consenso no seio dos conselheiros no tem sido fcil. Todos temos de fazer sacrifcios e ns estamos dispostos a colaborar. Mas -- acrescentou -- receamos que o sacrifcio no esteja a ser repartido de uma forma justa pela populao. Est a ser pedido aos idosos que se sacrifiquem duas ou trs vezes mais do que o resto das pessoas. Primeiro, porque vai aumentar os impostos dos beneficirios da Segurana Social. Segundo porque introduzir mudanas nos programas de assistncia mdica- Medicare e Medicaid- que, ao baixarem os honorrios dos mdicos, far com que estes no queiram assistir tantos doentes, e, terceiro, porque o aumento sobre a energia aumentar os preos do combustvel dos sistemas de aquecimento caseiro, que uma parte considervel dos oramentos dos idosos de baixos rendimentos. O segmento accionista da Bolsa de Tquio fechou com reduzida alteraes face sesso anterior. Operadores disseram que o mercado dever manter a mesma tendncia durante as prximas sesses devido forte procura dos investidores domsticos e estrangeiros, tal como vem sucedendo nas ltimas semanas. O ndice Nikkei perdeu 0,16 por cento. 
As audincias mdias de Fera Ferida e Mulheres de Areia permitem tambm analisar o percurso dos noticirios da noite, que lhe esto colados na lgica do programador e nos hbitos do espectador. o mrito do Telejornal maior se atendermos ao facto de que esse quinto equivale a primeiro ... entre os programas que podem considerar-se fora do pacote de propostas indigentes que continuam a liderar o top). Os dados de audincias da semana fornecem outras surpresas, de vrio tipo. Na TV2, a transmisso, em horrio domingueiro de almoo, dos Campeonatos Europeus de Atletismo, hora a que a portuguesa Manuela Machado vencia a maratona feminina, no alcanou mais do que uns mseros 3,0 por cento da populao. a SIC, que investiu uma nota negra na compra dos direitos de cobertura, e outra nota ainda mais negra nos meios necessrios cobertura, no consegue mais do que um mximo de 5,5 por cento de audincia mdia para as imagens recolhidas por esses meios nunca vistos. Tanto barulho para nada, parece. Talvez por, nestes dias, canalizar outros clientes, atrados pelos nomes de David Lynch e Mark Frost, que tambm perderam os seus crditos pela manso de Hugh Heffner. Uma ltima constatao: a TVI est pulverizada no Top 20 Nacional, sem qualquer proposta acima ou igual aos 10,9 por cento de Szinhos em Casa do Canal 1. O que, nos tempos que correm, talvez no seja defeito mas feitio. Ao longo de uma obra original em que tem procurado figurar a gnese e os momentos mais marcantes da nossa modernidade filosfica e esttica, o filsofo Manfred Frank (ainda desconhecido do leitor portugus, mas j muito traduzido em Frana e na Itlia) serve-se, entre outros, do mito do judeu errante, nestas duas vertentes, para seguir um percurso que, na literatura, conduz da viragem para a Idade Moderna e das primeiras viagens pelo desconhecido at contempornea deriva pelos mares da interioridade e condio niilista deste nosso sculo, preo da emancipao crtica em relao s promessas de estabilidade da Razo e de uma modernidade que um dia viria a falar com lngua bfide, afirmando no discurso cientfico o que negava no potico. As duas vertentes -- Aasvero e o Holands -- cruzam-se e fundem-se a partir do sculo XVI, com um predomnio, no plano da matria diegtica, da deriva martima sobre a errncia terrestre, desde Os Lusadas e as narrativas de viagens inglesas e holandesas, at ao Ancient Mariner de Coleridge, Lgende des Sicles de Victor Hugo, ao Bateau Ivre de Rimbaud e ao Navio Fantasma de Wagner, culminando num episdio-chave da anti-Odisseia que o Ulysses de Joyce: o dos lotfagos (com um antecedente importante no sculo XIX, o longo poema de Alfred Lord Tennyson The Lotos-Eaters), onde Leopold Bloom surge como simbiose do Ulisses polytropos (muito viajado e de muitas manhas), do Judeu errante e do marinheiro Sindbad, comendo do fruto do esquecimento que o levar deriva infinita, para alimentar uma insacivel sede de experincia, sem Itaca que possibilite o reencontro de si. A Polcia Judiciria de Coimbra deteve um casal suspeito da autoria de vrios crimes de burla, dos quais tero sido vtimas centenas de pessoas. Ao que tudo indica, o modo de subsistncia do casal consistia na publicao de anncios -- no Jornal de Notcias e no semanrio Expresso -- de emprego fictcios, relacionados com actividades agrcolas no estrangeiro, nomeadamente em Inglaterra. Era um pneu que s testmos durante trs voltas e corremos o risco de o usar, pensando que era o mais indicado. 
Na categoria de 250cc o japons Tetsuya Harada, da Yamaha, alcanou a sua terceira vitria no Mundial -- s perdeu na Malsia --, batendo por mais de quatro segundos o italiano Massimiliano Biaggi (Honda) e o francs Jean-Philippe Ruggia (Aprilia), respectivamente, segundo e terceiro classificados. A corrida foi completamente dominada por Harada, estreante nesta edio do Campeonato do Mundo, que assumiu o comando partida e nunca mais o largou. A partir da Zambujeira, as praias favoritas so acessveis atravs dos montes, que se atingem atravessando o ribeiro nas traseiras do caf Fresco. O Carvalhal a mais conhecida, mas outras h com menos gente e, por isso mesmo, mais escolhidas pelos nudistas. J quase no limite do concelho de Odemira, aparece a praia da Amlia, mesmo ao lado da polmica Odefrutas de Thierry Russel. Mais a sul, Odeceixe. E na Azenha imprescindvel comer um arroz de marisco. A data de 4 de Novembro tem sido apontada, geralmente, como o limite para o incio da deslocao. No incio desta semana, no entanto, um diplomata indonsio admitiu a possibilidade da visita comear em finais de Outubro, o que, a confirmar-se, apanhar a Comisso Eventual praticamente no grau zero da preparao da visita. Na ltima reunio [ quarta-feira passada ], entrmos, tommos uma bica, demos umas palmadas nas costas e viemos embora, porque no havia informaes, revelou ao PBLICO um membro da Comisso. No s desconhecemos os critrios para os convites imprensa, como ignoramos quem vai e de que partidos, pois o PRD desapareceu da actual AR [ tinha um deputado na Comisso ] e o PCP e o CDS diminuram a representao. A data aproxima-se, h pequenas coisas a fazer, como por exemplo vacinas, e no podemos estar a vacinar as Cortes inteiras .., acrescentou o deputado, manifestando-se preocupado com o atraso na preparao de uma visita que funciona como um jogo na corda bamba, pois tanto pode ser favorvel a Portugal como Indonsia. Uma reaco semelhante teve o PCP, por intermdio de Vitor Dias, da sua Comisso Poltica, que salientou que a base da gesto camarria a coligao entre socialistas e comunistas, continuando esta a manter a sua capacidade, lembrando que o que existira com o CDS fora um acordo pontual entre os vereadores centristas e o PS. Mas, na conferncia de imprensa -- onde estava ladeado pelo seu assessor de imprensa, Antnio Metello --, Jorge Sampaio deixou tambm entender estar para muito prximo uma deciso importante. A relativa sua recandidatura. Portugal, com 6,7 por cento dos pobres da CEE. Um em cada 3 portugueses pobre, uma em cada 3 casas no tem casa de banho e s um em cada 5 jovens sabe utilizar um processador de texto. Qual ser o pas mais jovem da CEE no sculo XXI? P. -- Ainda no que respeita ao nvel dos rendimentos, h uma queixa que a oposio e as organizaes socioprofissionais fazem ao Governo e que tem a ver com os custos dos factores de produo, principalmente no que respeita s taxas de juro. No acha que a actual situao, inserida num contexto fortemente concorrencial, est a penalizar os agricultores portugueses? R. -- Considero que temos alguns custos de produo mais altos, isso ningum poder negar. Mas temos outros mais baratos, como os alimentos para os animais e o preo da mo-de-obra. Sampaio visitou, durante todo o dia, as zonas do Pico e Faial mais atingidas pelo sismo do passado dia 9. 
Optou pela pedagogia nos contactos que teve com os sinistrados e nem sequer esqueceu a sua experincia como autarca, recordando inmeras vezes os tempos em que foi presidente da Cmara de Lisboa para explicar as suas teorias -- mais propriamente alguma insatisfao natural -- sobre realojamentos. Ou pelo menos a forma de evitar alguns conflitos entre a populao. Foi o prprio Presidente a pedir para reunir com todas as pessoas atingidas pela catstrofe. Na localidade de Flamengos, onde ainda esto mais de 200 pessoas a dormir em tendas, Sampaio apelou ao bom senso e solidariedade tpica do povo aoreano para compreender que no se podem construir todas as casas ao mesmo tempo. Quando fui autarca e fazia realojamentos era normal que quem no conseguia logo uma habitao ficasse insatisfeito. Aqui temos de compreender que no se pode ter tudo ao mesmo tempo. A soluo, para uma melhor harmonia, que seja a populao a indicar quais so os casos prioritrios, explicou. Com caixotes do lixo, os alunos finalistas do Instituto Superior de Economia e Gesto de Lisboa (ISEG) fecharam ontem o anexo de Buenos Aires em protesto contra a alterao dos planos curriculares. Sob o mote 40 cadeiras chegam, os estudantes impediram a entrada no edifcio a partir das sete da manh, tentando assim resolver o problema depois de esgotadas todas as vias diplomticas. Com os olhos na Rssia, tecnicamente em violao do texto assinado em 1990, os pases-membros do tratado sobre as armas convencionais na Europa voltam a reunir-se a partir de hoje, em Viena, para tentar limpar definitivamente o continente de armas convencionais. Mas ningum aposta num sucesso fcil. -- Contraditria. Ao regime no interessava o desenvolvimento cultural do pas. O que interessava era beber vinho, beber vinho era de comer a um milho de portugueses, era dar s crianas sopas de cavalo cansado, era ir a Ftima . -- Ao futebol -- Bom, s para ver o Benfica! Outra ideia exposta por Soltwedel foi a de que o mercado de trabalho, apesar de envolver pessoas, deve funcionar como qualquer outro mercado, acrescentando que os europeus tm de se livrar da ideia que os salrios no tm nada a ver com o trabalho realizado. Se no se mudar este curso, os asiticos vo esfregar as mos. Nesta reflexo sobre o futuro da economia europeia ps-GATT, o professor alemo peremptrio em afirmar que os salrios vo descer, em funo do aumento da competio das empresas, ao mesmo tempo que o desemprego provavelmente crescer ou estabilizar. Na sua ptica, isto deve-se a todo o tempo em que a indstria europeia esteve protegida. Depois de tanto tempo debaixo do guarda-chuva, evidente que a Europa se vai molhar. O Ministrio da Sade vai lanar at ao final de Maio 15 novas experincias na rea dos centros locais de sade, de modo a melhorar a acessibilidade aos cuidados de sade primrios. O projecto Alfa, assim designado, foi ontem apresentado na extenso de Ferno Ferro do Centro de Sade do Seixal, e passa pela criao de grupos interdisciplinares e pelo alargamento do horrio de funcionamento destes plos de sade. A importncia do projecto foi realada pela presena em Ferno Ferro do primeiro-ministro e da ministra da Sade. A titular da pasta da Sade disse que o futuro atendimento nos centros de sade ser mais humanizado e personalizado. preciso reganhar a confiana das pessoas e incentivar os profissionais, afirmou. Os bombeiros no conseguiram remover de imediato o corpo, que ficou preso entre as ferragens do conjunto dianteiro de rodas. 
Apenas ao meio-dia -- trs horas aps o acidente -- chegou estao Socorro uma carruagem com ferramentas, macacos hidrulicos e cerca de vinte funcionrios da Diviso de Manuteno do Metro. Os mecnicos levaram ainda uma hora para remover algumas peas do conjunto de rodas, antes de levantar a automotora A-78 e, finalmente, retirar o corpo mutilado de Pedro Alexandre. Os pais do jovem, depois de ouvirem a notcia do acidente no Metro, j haviam ido s urgncias do hospital So Jos e depois seguiram at entrada do Metro no Martim Moniz. Aps reconhecerem os documentos do filho, foram conduzidos para a esquadra da PSP da Mouraria, acompanhados da mesma multido de curiosos, aparentemente insatisfeita com a simples apario do corpo numa maca coberta. Isto parece uma manifestao, comentou um oficial da PSP, antes de ordenar a disperso da aglomerao. O segundo lugar do torneio repartido pelo ingls D. J. Russel e pelo irlands Ronan Rafferty, ambos com 67 pancadas. Este, vencedor da ordem de Mrito de 89 e 17 do ranking mundial, posiciona-se agora como o principal favorito vitria no Open portugus. Embora ainda faltem trs dias de prova, a verdade que o seu momento de forma excelente, alis como o comprovam os resultados alcanados na presente temporada. O irlands foi primeiro em Palm Meadows e segundo nos Asian Classic, Dubai Desert Classic e Hong Kong Open. O seu score no final do primeiro dia do Open, surge na sequncia de 16 voltas abaixo do par. Um total de 57 pancadas, para uma mdia de trs por volta. Actualmente quinto classificado na Ordem de Mrito Europeia, o eventual triunfo no evento portugus permitir-lhe-ia ultrapassar Severiano Ballesteros no terceiro lugar da lista europeia dos ganhos monetrios. Aqui reside a principal curiosidade da prova. David Silva, o profissional de Vila Sol, por seu lado, foi o melhor portugus em competio, terminando os 18 buracos em Par do campo. No entanto, comeou mal, fazendo trs bogeys nos trs buracos iniciais. Seguir-se-ia uma recuperao de grande categoria, na qual alcanou cinco birdies, o que lhe permitiu manter as aspiraes a um lugar entre os 65 finalistas dos dois derradeiros dias da competio. Se jogar bem amanh [ hoje ], um lugar entre os finalistas inevitvel. R. -- No tenho sentimentos religiosos, no acredito na vida eterna. Sei que no h vida do lado de l. le nant, dizem os franceses. Tenho dito aos meus que devem preocupar-se com esse dia, no por mim, mas por eles. uma bonita palavra esta, saudade. Eu deixarei de existir. O que ficar de mim a conscincia do que fui e que os outros recordaro -- ou no. mas se a lembrana for de saudade, ento acho que pode ser uma boa coisa. Ferreira de Almeida disse ao PBLICO estar a par da situao e que vai solucionar o problema brevemente, colocando uma conduta de cimento ligada ao colector. Por ora, os moradores desta zona da Ajuda, vivem rodeados de esgotos por todos os lados. Mas, afirma Vtor Castelinho, a Junta de Freguesia tem um projecto para aproveitar o espao livre que d para a Rua Eduardo Bairrada construndo um pavilho polidesportivo e dois campos de tnis, para servir a populao da Ajuda, espera de luz verde na Cmara. Vtor Mendes, de 40 anos, licenciado em direito e antigo recordista nacional (2,15m) do salto em altura, desde o ano passado o director da seco de atletismo do Sporting e foi confirmado nas suas funes j pela nova direco. 
A sua equipa, encabeada pelos coordenadores tcnicos Bernardo Manuel e Abreu Matos, tambm se mantm, e a tnica antes a do seu reforo, com a entrada de Fernando Mamede para o quadro tcnico, com funes ainda para definir, e de um incondicional do atletismo do clube, Antnio Frade, como seccionista. um perodo de euforia, em Alvalade, que tem um significado extraordinrio, pois foi o corolrio de um trabalho desenvolvido desde h um ano, como explicou Vtor Mendes. Quando vim para o Sporting nem havia praticamente equipa de pista formada, houve uma reduo drstica do oramento e estava quase tudo por reconstruir. Agora os dois ttulos quebraram um jejum de sete e oito anos e foram conquistados, para mais, quando o Benfica era dado como favorito. S com essa condio aceitei ficar frente da seco. Ex-primeiro-ministro, de 1974 a 1978, e ex-ministro dos Negcios Estrangeiros da Blgica, no incio da dcada de 80, Leo Tindemans , hoje, o presidente do Grupo Democrata Cristo do Parlamento Europeu, o segundo maior agrupamento parlamentar de Estrasburgo, a seguir aos socialistas. Europesta convicto, como de resto o so todos os governos deste pequeno pas, dividido ao meio pela lngua e pela cultura, entre um Norte flamengo e um Sul francfono, Tindemans defende que o grau de unio poltica conseguido em Maastricht insuficiente. Seria necessria uma nova conferncia intergovernamental sobre a Unio Poltica Europeia -- mesmo antes da reviso do Tratado, agendada para 1996 -- e, em seu entender, o alargamento da Comunidade aconselharia a que esta se dotasse de uma verdadeira constituio com metas e com princpios. Por fim j no conseguia remar mais. Tinha as mos cobertas de bolhas, as costas queimadas, o corpo doa. Com um suspiro, mal agitando a gua, deslizei para o mar. Suficientemente refrescante e estival. O segundo livro traduzido nesta coleco, deste autor sul-africano. um primeiro livro e recria a Amrica dos fins dos anos 40 renovando a grande tradio de Hammet e Chandler. Criou a personagem de um detective negro, Easy Rawling que, no entanto, no procura fazer o contraponto de Marlow e Spade mas, sim, erguer uma figura particularmente convincente num livro contado num ritmo imparvel e com um sabor pungente e autntico. Segundo o escritor e realizador de cinema Nicholas Meyer era nesta percentagem que Sherlock Holmes dilua a sua cocana, antes de ter sido curado do seu mau hbito pelos amistosos cuidados do dr. Watson. Mas o optimismo continuou a dominar nas sesses seguintes, apesar das indefinies quanto evoluo das taxas de juro. O Dow Jones passou novamente a barreira dos 2.900 pontos para se situar pouco acima dos 2906 pontos na sesso de quarta-feira. Ainda com tendncia positiva, a Bolsa de Nova Iorque evoluiu de forma hesitante at aos 2910 pontos, para no dia seguinte voltar a descer. Isto apesar de os indicadores econmicos at agora divulgados indicarem um final rpido para a recesso no pas. Na quinta-feira foi o indicador das encomendas de bens duradouros, que em Abril cresceu cerca 2,9 por cento, melhorando as perspectivas de crescimento industrial. Na sesso de sexta-feira, ao subir para os 2913,91 pontos (mais 0,45 por cento do que na sesso anterior) o ndice Dow Jones permitiu ganhos da ordem dos 0,95 por cento na semana. Boris Ieltsin continua a dar sinais contraditrios sobre a forma como vai utilizar a sua vitria poltica no referendo de domingo passado para acelerar o ritmo das reformas e imprimir-lhes uma orientao mais clara. 
O cabelo ter ficado grisalho mas o seu bom aspecto e o ar agaiatado no deixam transparecer os seus 50 anos e, muito provavelmente, milhes de fs continuaro a am-lo quando ele tiver 64. O ex-Beatle Paul McCartney iniciar em breve mais uma extenuante tourne de um ano pelo mundo, para promoo do seu novo lbum. A campanha para as eleies de dia 19 no Punjab comeou sob o signo das armas e do terror dos separatistas sikhs, que tendem a controlar o curso dos acontecimentos naquele estado indiano. Ontem foram assassinados cinco militantes do partido Bharatiya Janata, da direita hind. Ontem, o FC Porto voltou a mostrar fibra de campeo e deu a volta ao jogo com o Martimo, mais uma vez graas ao olho do treinador, que tambm erra, mas acerta mais vezes do que os outros, e a uma equipa que nunca desiste. Os campees so assim e este, ao cabo de 17 jornadas, leva 11 pontos de avano sobre o segundo, o Guimares, que ontem empatou no Bonfim e j sonha com a Liga dos Campees. O Rio Ave, outra surpresa, foi derrotado em casa pelo Farense e tem j muitos candidatos nas suas costas dispostos a roubar-lhe o terceiro lugar. O Estrela da Amadora continua a fazer um campeonato Fernando Santos, ou seja, um campeonato certinho e sem crises, e at venceu o Salgueiros na Reboleira. O Boavista est a melhorar e empatou no que comea a ser muito complicado estdio do Campomaiorense, numa partida que tambm terminou empatada nas crticas dos treinadores ao rbitro. O Chaves, com lvaro a treinador, comprou um balo de ar com a vitria em Coimbra. O Belenenses foi perder ao campo do Varzim e continua a sua marcha a passo acelerado para a Diviso de Honra. H mais de uma semana que o cessar-fogo entre srvios e muulmanos est a ser mais ou menos respeitado e, semelhana das restantes frentes de combate, a situao nesta regio tem-se mantido calma. Apesar de em Brcko, 54 quilmetros a noroeste, se terem registado violentos confrontos no fim-de-semana. Uma espessa sopa de feijo com pedaos de carne, po e beterraba o almoo destribudo no sbado aos soldados srvios. A refeio subitamente interrompida pela apario de um jipe NIVA 600 de onde saem dois soldados armados -- andar armado nesta regio, seja-se civil ou militar, medida obrigatria -- com os boletins de voto para o referendo dentro de um grande envelope. Na Videoteca de Lisboa, s 22h30, realiza-se o II Encontro da Associao de Vdeo, Arte e Novas Tecnologias Interactivas PT. No Centro Cultural da Malaposta inaugurada, tambm no mbito das comemoraes da Revoluo dos Cravos, a mostra colectiva Artistas de Abril, com obras de Joo Vieira, Jos Santa Brbara, Lus Ralha, Maria Keil, Vespeira e Rogrio Ribeiro. s 21h. A entidade patronal da Guial, a empresa txtil de Barcelos cujos trabalhadores se mantm em greve desde a passada quinta-feira, requereu ao Tribunal Judicial de Barcelos a ilegalidade da medida tomada recentemente pelos trabalhadores e que consistiu no impedimento da sada de quatro carrinhas da companhia carregadas de mercadoria. Na sesso de ontem do Mercado Monetrio Interbancrio, que deu incio a um novo perodo de constituio de reservas de caixa, o Banco de Portugal voltou a no anunciar as taxas directoras de interveno e a manter suspensa a facilidade diria, anunciando, no entanto, uma cedncia de fundos at ao montante de 300 milhes de contos, a seis dias, em sistema de leilo de taxa de juro contra a recompra de Bilhetes de Tesouro. JOS CALADA -- As relaes entre o sindicato e o ministrio no existem. Esto, em bom rigor, como sempre estiveram. 
Desde 22 de Dezembro de 1993 que vimos solicitando senhora ministra uma audincia. Isto j foi feito por variadssimos faxes, atravs de contactos pessoais em Janeiro de 94, mas no tivemos, at agora, resposta. Os contactos -- de natureza informal, nem sequer oficiosos -- tm sido com o sr. subsecretrio de Estado-adjunto da ministra. Em termos estritamente oficiais, este sindicato ainda no conseguiu ser recebido pela ministra, o que no deixa de ser espantoso tendo em conta o tempo decorrido desde a primeira audincia. P. -- E porqu? Quanto a Portugal, Ferreira do Amaral reafirmou ontem que as vias prioritrias de ligao Galiza so a concluso da auto-estrada Porto-Valena- e bem assim a concluso do IP1, entre a ponte que ontem inaugurou com Jos Borrell e a ponte que ambos tambm inauguraram h dois anos no Guadiana-, a auto-estrada Famalico-Guimares e o restante trajecto do IC5 at Vila Pouca de Aguiar e da at Chaves e Verin (IP3), e ainda a concluso do IC1 (Porto-Valena) ligando alguns troos dispersos j construdos, como a variante da Pvoa de Varzim e a nova ponte de Viana. Tudo terminou na Pousada de S. Teotnio, na zona histrica de Valena, entre um matraquear de perguntas dos jornalistas. Mas teve a compensao de ver, ao lado do seu homlogo, na larga varanda da pousada, os primeiros veculos no oficiais a atravessarem a nova ponte. E avistou, do lado contrrio, a construo metlica de Eiffel, sucumbida a mais um engarrafamento. Talvez o ltimo. Sem entrar em pormenores, devo recordar que as empresas portuguesas a trabalhar na Guin-Bissau rondam a meia centena, das quais seis ultrapassam, em investimentos e responsabilidades, o milho de contos. Por si ss, aos numerosos pequenos investidores (que no esto includos nos 50) cabe uma fatia de 2,5 milhes de contos. O Grupo Champalimaud, a Petrogal, a TAP-Air Portugal, a Marconi, a Salvador Caetano, a Tertir, a Somec e a Soares da Costa so algumas das que mais investiram e, presume-se, maiores interesses tm a defender. Se a isto acrescentarmos o que est a ser perdido -- passageiros e mercadorias na TAP, mercadorias na Portline e na Transinsular, dormidas nos hotis (Hotti e 24 de Setembro), operaes na banca (Banco Internacional da Guin-Bissau e Totta & Aores), turistas nos clubes de caa (como o de Cap) --, as verbas perdidas causaro vertigens. Quando iam a caminho do Pas Basco, jornalista e operador de cmara receberam a notcia da morte, em Bilbao, de Luiz Andrs Sampiero, um tenente da Guardia Civil encarregue sobretudo de assuntos ligados toxicodependncia. Esperavam, pois, encontrar uma cidade em estado de stio, mas depararam-se com uma estranha normalidade. Ainda chegaram a tempo de registar a sada do funeral da Capela do Rei, que no o dos bascos, acompanhado por muitas flores e palmas. Quando se trata de falar que j mais difcil. Os bascos no gostam de falar da ETA ou das suas aces. No falo, choro, declara um transeunte perante as cmaras. um povo apaixonado, o povo basco. Mas tambm um povo adormecido pelas atrocidades cometidas pela Espanha federalista contra a individualidade de um povo. E a maior dessas atrocidades passou-se h 60 anos, em Guernica, para sempre imortalizada no famoso quadro de Pablo Picasso. Frasto da Silva, ex-ministro da Educao e actual presidente do Instituto Nacional de Administrao, admitiu ter sido sondado para o lugar de Antero Ferreira nos conselhos directivo e de administrao da Fundao das Descobertas, tal como noticiou a edio de ontem do PBLICO. No falei com o primeiro-ministro, mas fui sondado nesse sentido, disse. Claro, o tom j outro. 
Jacques Chirac fala de terrorismo. Chama brbaros aos srvios. Finalmente utiliza a nica linguagem que eles conseguem compreender: a da firmeza. Fale, senhor presidente! Procure falar j! Talvez baste uma palavra mais forte, e que cale fundo, para fazer recuar os fora-da-lei --, trata-se de um progresso que se impe saudar. Contudo, uma nica pergunta: o novo Presidente ficar-se-por aqui? Ou ir ao ponto de dizer: mais intolervel ainda que a humilhao de um soldado a humilhao de 300.000 homens, mulheres e crianas, bombardeados quotidianamente, desde h trs anos? Ele sabe que no lhe dei o meu voto. Mas tambm sabe que goza de um estado de graa que confere a todas as suas palavras uma imensa repercusso. Oxal aproveite esta oportunidade. Oxal seja o primeiro chefe de Estado a tomar finalmente, e de forma clara, o partido da democracia e do direito. Como gaullista que , seria esta a ocasio histrica para se mostrar fiel ideia da Frana de que se diz herdeiro. Milo. Conferncia de imprensa de Jean-Pierre Elkabbach comentando os resultados da cimeira das televises pblicas europeias que reuniu na vspera em Paris. O que ao certo uma televiso pblica? E o que a distingue de facto das televises ditas comerciais? No fundo, e de acordo com Elkabbach, trs apostas: o primado da produo sobre a difuso; a recusa em opor divertimento a cultura; e por fim a vontade de conciliar o imperativo do mercado (o famoso nvel de audincias ao qual definitivamente absurdo pensar que qualquer televiso possa escapar) e a exigncia de qualidade ( qual perfeitamente escandaloso acreditar que uma srie, mesmo popular, deva ser estranha, por natureza). Trata-se de facto de apostas. Ou desafios. Quer sejam formulados com nitidez, quer sejam formulados, sobretudo, medida de essa rede de televises tecida escala do continente -- esta a boa nova da semana. Parte significativa, mas no integral, do seu reportrio gravado para essa companhia foi reeditado numa compilao temtica de seis LP, primeiro editada pela CBS (actual Sony) em 1986. Essa mesma colectnea deu mais recentemente origem a uma reedio em dois CD duplos, que se encontram esgotados no nosso mercado. RESPONSVEIS polticos do Imen e da Arbia Saudita tentavam ontem fazer diminuir a tenso entre os dois pases devido a uma velha disputa fronteiria que provocou trs mortos em combates perto de uma ilha contestada do Mar Vermelho. O vice-primeiro-ministro e ministro dos Negcios Estrangeiros iemenita, Abdel Kader Bajamal, dever deslocar-se hoje Arbia Saudita para tentar desbloquear a querela, disse AFP um diplomata em Sanaa, capital iemenita. Foi ministro de Jacques Chirac (1986-1988) e agora considerado o seu principal rival, embora ainda no se tenha candidatado oficialmente s presidenciais. douard Balladur avisou que no tomaria nenhuma deciso antes de Janeiro e acusou Chirac de usar o RPR como uma fortaleza. As diferenas polticas entre os dois so menos bvias do que o contraste nos estilos. Chirac, o extrovertido; Balladur, o taciturno. Balladur nasceu em Esmirna (Turquia), em 1929, e formou-se na Escola Nacional de Administrao, de onde saiu a elite da funo pblica francesa. Foi colaborador de Pompidou, de 1969 at 1974, depois administrou trs empresas e, em 1986, entrou no parlamento como deputado do RPR. No Governo, apesar da subida do desemprego, continua a ser popular. A ltima sondagem d-lhe o apoio de 53 por cento dos franceses, contra 47 para Chirac. 
O presidente cessante da Comisso Europeia, Jacques Delors, no se apresentou ainda como o candidato da esquerda sucesso do seu correlegionrio Mitterrand. Mas ningum duvida de que tem a inteno. Se no incio as sondagens lhe atribuiam o ltimo lugar, agora mais popular do que Balladur. Delors, 69 anos, aderiu ao Partido Socialista em 1974. Antes foi militante da Juventude Operria Crist, lder sindical, funcionrio do Banco de Frana e professor de Gesto na Universidade de Paris-Dauphine. Foi tambm ministro da Economia e maire de Clichy. Na CE, quis ser um pioneiro, defendendo o ideal de uma verdadeira federao europeia at ao fim do milnio. A equipa do Sporting apresentou-se no Municipal de Chaves com Capucho a ocupar a posio de Cadete. Da substituio no resultou, entretanto, qualquer ganho ofensivo para os lees. A falta de profundidade atacante, a azelhice no remate -- aspecto em que apenas Juskoviak esteve diferente para melhor -- foram razes para o nulo que se verificava na primeira parte. Os onze remates do Sporting contra apenas um do Chaves, durante a metade inicial, do bem a ideia da superioridade atacante dos homens de Alvalade e no abonam em nada a capacidade concretizadora dos seus avanados. Na segunda parte, a tnica do jogo continuou a ser a mesma. Henrique Calisto decidiu-se, por isso, a mexer na equipa trocando Saavedra por Omer. Face passividade de Saavedra, o novo treinador do Chaves procurou injectar sangue novo na dianteira, esperando que a sua equipa se tornasse mais agressiva na frente de ataque. apenas o finlands Tarkki (outra estreia) incomodava, aqui e ali, a defesa sportinguista. A comercializao e instalao do servio de televiso por cabo em Valongo e Ermesinde da responsabilidade da TV Cabo Porto, uma empresa do grupo TV Cabo Portugal, que prev, at ao fim de 1995, estender este servio a 75 mil habitaes na rea do Grande Porto. A caravela Boa Esperana chegou doca do Jardim do Tabaco tera-feira, ao entardecer. Terminava uma rota ocenica de 10 mil milhas nuticas atravs dos portos de Lisboa (Portugal), Cdiz (Espanha), Las Palmas (Canrias), San Juan (Porto Rico), Nova Iorque e Boston (EUA) e Liverpool (Inglaterra) na Regata Colombo 92, uma viagem comemorativa dos 500 anos de descoberta do Novo Mundo. Foram quase 4 meses de navegao em que 60 velejadores portugueses filiados Aporvela fizeram as vezes de marinheiros sob o comando dos skippers e irmos Joo Lucio Costa Lopes e Jos Incio Costa Lopes. O estado dos sectores produtivos -- agricultura, indstria --, dos servios e da complexidade da mquina administrativa, bem como a sua articulao com os problemas da pobreza e da modernidade outra das reas. A pobreza e a Solidariedade assim o tema de um colquio a que Soares assiste em Setbal, no dia 6, onde intrevm o presidente da Cmara, o bispo de Setbal e Bruto da Costa. Por outro lado, Soares encontra-se com agricultores em A-dos-Ces, no domingo, 31, em Manique do Intendente, na tera-feira seguinte, e ao fim da tarde do mesmo dia, em Vila Franca. A 5, no Barreiro, visita a Sociedade Agrcola Lavradiense, a 9, na Azueira, Mafra, encontra-se de novo com agricultores, na Central Fruteira e a 10 visita a Adega Regional de Colares. J no domnio da indstria, inaugura as instalaes da Dan&Cake, em Alverca, e visita uma empresa de flores e o Centro Tecnolgico da Cortia, a 6, no Montijo. 
Prefiro no fazer prognsticos pois, como disse ao Presidente [ Felipe Gonzalez ] e ao meu colega, at que recebamos respostas definitivas aos convites de participao j enviados, no gostaria de fazer qualquer anteviso, sublinhou James Baker. O senhor Baker est quase a conseguir a paz, sentenciou, por seu lado, Gonzalez. Como o PBLICO noticiou, a Unicre remeteu para a DGCP a fundamentao da nova taxa, onde se refere que os custos do servio, por operao, de 15850. Um valor bastante superior soma da taxa de cliente, de 100 escudos, com a nova taxa de 30 escudos. Isto , a Unicre sustenta que os 130 escudos a cobrar diminuem, mas no eliminam os prejuzos do servio. Pais Antunes, director-geral da Concorrncia e Preos, no ficou inteiramente convencido com os argumentos avanados pela empresa gestora da rede Visa. E por isso pediu a suspenso preventiva da taxa. So vrias as dvidas suscitadas DGCP pelo processo da taxa. Pais Antunes entende, antes de mais, que o modo como a Unicre quer taxar as gasolineiras discriminatrio, na medida em que, em toda a rede Unicre, as bombas so as nicas sujeitas a um valor fixo, e no a uma percentagem sobre o valor das transaces. Num sector como o dos combustveis, em que as margens de comercializao so esmagadas e sujeitas a preos fixados administrativamente, uma taxa fixa poder, na opinio de Pais Antunes, contribuir para que as companhias se encostem sistematicamente aos preos mximos. Por outro lado, o director-geral destaca o facto de Portugal ser o nico pas que conhece em que os portadores de Visa esto obrigados a pagar uma taxa de 100 escudos. Quanto ao peso dos encargos financeiros no valor das vendas, 57 por cento das empresas declara que so inferiores a cinco por cento, ao passo que trs por cento revela encargos superiores a 20 por cento. Analisando estes dados, a AIP constata que, se um facto que em todos os subconjuntos (empresas industriais, de construo, comrcio e servios, exportadoras e no exportadoras) se verifica uma maioria de empresas em que os encargos financeiros so inferiores a cinco por cento do volume de vendas, bastante significativo o facto de um quarto das empresas apresentar uma relao encargos financeiros-vendas entre os cinco e os dez por cento (29 por cento no caso das industriais) e 18 por cento apresentarem valores superiores a dez por cento. Com a partida, para terras de outras gentes, de Buddy Guy, o blues ficou mais pobre. Com o regresso, nossa terra, de Buddy Guy, o blues est mais rico. Se o seu ltimo disco, Feels Like Rain, j garantia a proximidade da ressurreio, Slippin' In confirma-a. E, mesmo ao cair da folha, 1994 tornou-se um ano de alegria. Mestre da guitarra de doze cordas e emblema do blues do Piedmont, Blind Willie McTell um dos guardadores da memria da genuna msica popular da Amrica. A comear em Outubro de 1929 e acabar em Setembro de 1933, totalizando 41 peas, esta compilao um disco de cabeceira. A Cmara Municipal de Gaia vai demolir todas as construes de raiz do acampamento cigano de Francelos. A deciso foi tomada ontem, depois de um encontro com uma representao dos moradores que se queixam do trfico de droga na zona, atribuindo-o comunidade cigana ali instalada h mais de 20 anos. Esta uma das quatro reivindicaes que a autarquia contemplou para agrado dos moradores que afirmam agora ir aguardar pacificamente a actuao das autoridades. Paralelamente, circulou tambm na localidade um panfleto apelando ao boicote s aulas nas escolas da zona. 
Recorde-se que as mesmas pessoas ameaaram j no votar nas prximas eleies se o problema no for resolvido. A Polcia Judiciria est a investigar a morte de Jos Antnio Tom, de 39 anos, cujo corpo foi encontrado, ontem de madrugada, em Troves, S. Joo da Pesqueira. Ramos Lopes, um histrico do PSD local, que foi presidente da primeira concelhia e actualmente director da Fundao Gulbenkian, encabea uma lista sem nomes sonantes da actual gesto autrquica, que integra, entre administradores de empresas, o escultor scar Guimares e Maria Joo Vieira, assessora do ex-ministro Diamantino Duro. Aos 15 anos, Pedro, Meloman, Sapito para o people da Arrentela e do Monte da Caparica, j fora acusado e absolvido de assalto a um estabelecimento. J passara muitas noites nas esquadras da PSP e nos postos da GNR e j apanhara muita porrada da bfia. Mais qualquer discurso fez que no recordo. Mas mais espantoso ainda que nos esperou depois porta de sada, no para cumprimentar os clientes -- como de bom tom! --, mas de novo agredir verbalmente o meu anfitrio, dizendo-lhe No volte c mais! ... e atirando-lhe com a porta na cara. No pretendo fazer ironia nem obter resultados. mamem a pastilha elstica caladinhos ... seno rua !!! no precisa de se preocupar. Mas no lhe ensinaram aquela antiga regra do negcio que diz que um cliente descontente um cliente perdido? E at pode no saber quem ... s estava presente o aoreano. Nos vinhos a coisa melhorava um pouco: uns borbas e uns reguengos razoveis, um Paulo da Silva (Colares) e alguns Bairrada, vinho e espumante. Azeite, creme de castanha, frutos secos e algumas ovelhas concluam o que Portugal tinha para mostrar. Foi pouco. Depois de trs dias a visitar a feira as opinies dividiam-se um pouco, sobretudo entre professores e alunos. Os primeiros estavam desencantados com o facto do Salo deste ano no ostentar o brilho, nem as dimenses, de edies anteriores. Foi um dos primeiros sintomas do efeito que as decises de Maastricht tero na ultra-subsidiada agricultura francesa -- e o primeiro sinal de retrao. Vitimado por um ataque cardaco, morreu no passado dia 3 de Julho, em Nova Iorque, o guitarrista Johnny Copeland, um dos ltimos nomes carismticos do blues texano. A notcia da sua morte s ontem foi conhecida, atravs da revista americana Variety. A maior rea queimada verificou-se no distrito de Santarm, com oito mil hectares, enquanto em Bragana arderam apenas 64 hectares. No perodo de 1 de Junho a 7 de Julho os bombeiros foram chamados a 7249 fogos florestais. Entretanto, o violento incndio que deflagrou tera-feira no concelho do Fundo, entre as povoaes de Barco e Lavacolhos, voltou a reacender-se na madrugada de ontem, aps ter sido considerado extinto na quarta-feira, segundo informao dos bombeiros locais. Ontem, discursando no Washington Institute for Near East Policy, no mbito da sua terceira visita oficial aos EUA, Netanyahu voltou a propor o reatamento do dilogo com a Sria sem condies prvias. Isto , ignorando tudo o que foi negociado nos ltimos cinco anos, o que Damasco considera inaceitvel. Netanyahu garantiu ao Presidente srio, Hafez Assad, que encontrar nos israelitas parceiros razoveis e cooperantes se escolher o caminho da paz. O problema que, para Netanyahu, esse caminho passa primeiro pelo fim dos ataques do Hezbollah no Lbano, enquanto para Assad passa primeiro pela devoluo dos Gol. Ns dizemos que queremos retirar-nos do Lbano, mas na imprensa sria respondem-me: no se retirem, queixou-se Netanyahu. A Sria procura ostensivamente a nossa retirada mas, na prtica, impede-a. 
Para os actores uma teraputica, para os espectadores uma aprendizagem e uma lio. Os oito actores do Grupo Teatro Teraputico do Hospital Jlio de Matos entregam-se com tal verdade e liberdade, falam to desassombradamente de temas-tabus, que os espectadores se sentem em desvantagem. Quem so os doentes: eles ou ns? Rua do Ouro: reparado at hoje. Rua da Prata: reparado at hoje. ... ela atende a Academia Sueca ... Ele, em Frankfurt, ela em casa. Foi 15 minutos antes do anncio. Eu estava na cozinha, onde quase tudo de importante acontece naquela casa, e telefonaram-me. Atendi e creio que falou uma senhora em ingls -- eu no sei ingls -- mas quando ouvi ' Academia Sueca ' no ouvi mais nada, fez-se um vazio na cabea, no sei se agradeci, se fui corts, se dei um grito. Porque toda a gente ia para o aeroporto de Madrid espera dele, era o que estava previsto. A minha preocupao era essa, ter mandado toda a gente para o aeroporto ... Quando lhe perguntamos quais so as possveis razes desta ascenso aparentemente irresistvel das mulheres, Whipp responde-nos que acha que existem dois factores principais. Talvez no seja de estranhar, alis, que as novas campes da corrida sejam originrias da China, o pas mais populoso do mundo -- onde o universo da escolha potencialmente maior do que em qualquer outro pas do Mundo. O ritmo de treino das novas campes chinesas, por exemplo, absolutamente espectacular. Entre outras coisas, elas correm uma maratona por dia em terreno acidentado -- mais de mil quilmetros por ms! -- e durante cinco a seis vezes por ano o seu treino decorre na meseta tibetana, a cerca de cinco mil metros de altitude. Os etopes esto este ano ausentes da pista das Aoteias, o que retira algum fulgor prova masculina. Na corrida feminina, o nvel superior, pese embora a falta da americana Lynn Jennings, a tricampe mundial de crosse. ... e municipais na Tunsia. Quanto Federao Nacional de Professores, que ser recebida na prxima tera-feira pelo Presidente da Repblica, a quem vai falar sobre a situao no sector da educao, mantm um dos dois dias de greve marcados durante o mandato de Diamantino Duro. Os dirigentes da Fenprof avisam no entanto desde j que se Couto dos Santos insistir nalgumas das directrizes dos seus antecessores arranjar lenha para se queimar. A mudana de ministro no basta, diz a Fenprof, disposta a mostrar o descontentamento dos professores na manifestao do prximo dia 27, data da greve. Depois de uma anterior fase de conversaes, terminada h 11 dias, a Itlia conseguiu persuadir a comunidade armnia do Nagorno-Karabakh a deixar de boicotar as negociaes de Roma. Mas depois da nova ofensiva azerbaijana, com centenas de tanques, helicpteros e avies de ataque, os dirigentes armnios do enclave disseram que no conseguiam sair de l. As conversaes de ontem, presididas por Mario Raffaelli, o mesmo que dirige o processo de paz moambicano, comearam sem eles, embora a prpria Armnia tenha estado presente. Mantendo uma relao evidente com a estrutura histrica dos encontros, Albano da Silva Pereira, organizador principal, articula as escolhas entre o nacional e o internacional, a consagrao e a revelao, o pedaggico e o experimental. por outro, corre riscos estticos. Sendo a fotografia um lugar de registo dos corpos, parece que, este ano, do conjunto de exposies se desprende mais uma atitude de registo, a participao do fotgrafo na construo e seleco da imagem do que a afirmao das prprias imagens, do seu corpo. 
estabelecer uma ordem na multiplicidade catica das imagens pode ser uma tarefa exterior e desligada desse real mas tambm pode, ao procurar entender as intenes dos fotgrafos, enriquecer o conjunto fornecer-lhe um novo sentido. Foi este homem, marginalizado at em decises que diziam respeito ao seu pelouro, que em Janeiro decidiu partir a loia. No queria ver o seu nome ligado a uma gesto altamente duvidosa e pediu ao secretrio de Estado dos Mercados Agrcolas que mandasse investigar as irregularidades de que tinha conhecimento. No essencial, tratava-se da adjudicao de servios de promoo de imagem e de obras sem concurso pblico nem visto do tribunal de contas (ver PBLICO de 21 de Abril). Chegadas Comisso Parlamentar de Agricultura atravs de uma notcia do PBLICO, as acusaes em questo levaram os deputados a ouvir Pedro Rodrigues, agora na prateleira do Iroma, e Branco Rodrigues, agora presidente do Conselho de Administrao da PEC-Alimentao, a empresa que controla as quatro PEC regionais entretanto criadas. Concluda a audio parlamentar, o deputado comunista Lino de Carvalho, relator do processo, no teve dvidas em propor que a Comisso Parlamentar assumisse a iniciativa de inqurito s irregularidades e ilegalidades detectadas no captulo da promoo da imagem das PEC e da publicidade da sua privatizao. O PS anuu de imediato e o PSD, atravs do seu coordenador para a rea da Agricultura, Carlos Duarte, admitiu claramente a hiptese de vir a subscrever a proposta de Lino de Carvalho. Depois de ouvirmos o depoimento do eng. Pedro Rodrigues ficmos com algumas dvidas relativas aos concursos pblicos e penso que h aqui algumas questes que o prprio Governo no conhecia, afirmou Carlos Duarte ao PBLICO h duas semanas. Anteontem, porm, o PSD rejeitou a proposta de inqurito pretextando com o facto de estar em curso uma inspeco da iniciativa do prprio Ministrio da Agricultura. Escrevemos ao PS pedindo um envio rpido da contra-proposta, passadas trs semanas ainda no conhecemos oficialmente a posio do PS. Procurei na ltima semana contactar o presidente da concelhia socialista, mas, encontrava-se no estrangeiro, espero vir a concretizar esse contacto ainda esta semana -- conclui Carlos Arrojado. As celebraes do tricentenrio da morte do (assim convencionado) primeiro heri nacional do Brasil, Zombi dos Palmares, tem destaque especial na edio de hoje do Acontece, na TV2. mais de 300 mil cpias vendidas desde o Natal, shows continuamente esgotados no Caneco, por onde passaram j cerca de 50 mil pessoas, recepo calorosa por parte do pblico. O regresso s memrias do passado coloca nos olhos de Joaquim Afonso um brilho de indisfarvel saudade. Dantes, a vida era cheia de felicidade. A aldeia tinha mais liberdade, garante. A agricultura foi, desde pequeno, a sua nica profisso. Mas fiz o exame da quarta classe com distino, assegura, com evidente orgulho. Tem razes para isso, j que as aulas exigiam-lhe diariamente um sacrifcio de mais de 22 quilmetros, feitos a p, qualquer que fosse a poca do ano. medida que vai desfiando parte do seu passado, Joaquim Afonso olha fugazmente o cu, abre mais os olhos, esboa um sorriso e l vai continuando a sua conversa. Desta vez, para lembrar uma prenda do seu pai, que, num dos invernos rigorosos de Busteliberne, resolveu fazer-lhe uma surpresa, oferecendo-lhe umas botas. Eu gostava tanto delas que, quando passava em stios onde sabia que no encontrava ningum, tirava-as e ia descalo. Para as poupar, justifica. A cotao de Lenine nunca foi to baixa. 
As filas gigantescas entrada do imponente mausolu na Praa Vermelha desapareceram. Os russos preferem agora passar horas numa fila para um smbolo do capitalismo: o McDonald's e os seus hamburgers. O prximo czar da Rssia vai ser colocado no trono pelo dinheiro, proclama Alexandre Melnik, um diplomata russo. As crianas que abandonam a escola na ilha de So Jorge, Aores, para ajudar os pais e a economia familiar so ainda em grande nmero, segundo informou o presidente da Comisso de Proteco de Menores de Velas. Carlos Noysan acrescentou que o absentismo escolar constitui o principal motivo de queixas apresentadas comisso. Entre as razes apontadas pelo mesmo responsvel para a fuga escolaridade obrigatria, em So Jorge, esto dificuldades no pagamento do almoo das crianas nas cantinas das escolas, a falta de dinheiro para a compra de livros e a carncia de transportes. Problemas que, de acordo com Carlos Noysan, o Instituto de Aco Social e o Centro de Prestaes Pecunirias da Segurana Social tm procurado solucionar. O Governo Regional dos Aores comprometeu-se a pagar os juros de um emprstimo a contrair pela Cmara de Santa Cruz da Graciosa e empresrios locais, visando a compra de um barco de passageiros orado em 54 mil contos. A embarcao, de 30 lugares e 14 metros de comprimento, destina-se ao trfego entre as ilhas do grupo central do arquiplago, viajando prioritariamente entre a Graciosa e a Terceira. O secretrio regional da Economia, Duarte Ponte, justificou o apoio governamental com o envolvimento camarrio no projecto, desafiando outras autarquias das ilhas a iniciativas semelhantes. A partir de Maio, outra embarcao com capacidade para 150 passageiros ser alugada pelo governo regional para ligar as ilhas dos grupos central e oriental. Para o desempenho das suas atribuies especficas no campo social, a Santa Casa da Misericrdia de Lisboa (SCML) assegura a obteno de meios financeiros prprios atravs, sobretudo, da organizao e gesto, a nvel nacional, da lotaria nacional, das apostas mtuas (totobola, totoloto e joker) e, agora, tambm, da lotaria instantnea, em cujos lucros comparticipa. Foi D. Maria I que, por real decreto assinado a 18 de Novembro de 1783, concedeu a lotaria Santa Casa da Misericrdia, com o objectivo primordial de sustentar os hospitais reais de enfermos e expostos. A medida acabaria por imprimir ao jogo uma credibilidade que, de outro modo, talvez no tivesse. Actualmente, de acordo com a lei vigente, a SCML recebe um tero dos lucros lquidos da lotaria e o Estado fica com dois teros. Em Monte-O-Novo feriado municipal. s 15h00 sai rua a procisso em honra de S. Joo de Deus. Uma hora mais tarde o Movimento Democrtico de Mulheres promove um convvio no ginsio municipal. No Cine-Teatro Curvo Semedo, Mafalda Veiga, Francisco Fanhais, entre outros, participam, s 21h00, num espectculo produzido pelo grupo Porta Aberta. O grupo de teatro Maizum apresenta a pea Florbela. Silvina Pereira interpreta o papel da poetisa Florbela Espanca. s 17h00, no Museu Nacional de Arte Antiga, s Janelas Verdes. No lado oposto desta realidade esto o Barcelona e o Real Madrid. Os catales, em baixa de forma que no poupa mesmo Figo, suavizam as suas prestaes com o ataque mais realizador -- 37 golos -- e o terceiro posto. Lugar de pdio, mas a sete pontos do comandante e a dois do rival local, o Espanyol, outra das revelaes. Est longe a constituio no Nou Camp de um novo dream team, como o de Romrio e Stoichkov, o que j enerva a direco. 
Consequncia imediata: as diatribes do tcnico Johan Cruyff, consentidas com os xitos, apresentadas mesmo como sinais de genialidade, passam agora a ser criticadas. J no Real Madrid, apesar de a equipa continuar mal -- foi humilhada pelo Deportivo da Corunha na quinta-feira com trs golos de velocidade de Bebeto --, de no reencontrar a frescura que a levou na poca passada ao ttulo, o treinador Jorge Valdano continua com o apoio da direco. Adivinha-se que a prazo, enquanto as dvidas do clube concentrarem a ateno dos dirigentes e as contas no possibilitarem o pagamento da resciso do contrato. No de literatura do que aqui se trata, embora a histria, densa e trgica, nos envolva desde a primeira pgina. Uma histria bem real, to real e violenta que o livro foi j posto fora de circulao oficial no Brasil, tal as ondas de choque que provocou o seu testemunho-denncia, vinte anos j passados sobre o desaparecimento sem rasto de Snia Maria de Moraes Angel Jones. Os Estados Unidos devem subir substancialmente as taxa de juro durante o prximo ano, de forma a manter a inflao sob controlo e manter um ritmo de expanso sustentada da economia. O conselho foi ontem dado pelos tcnicos da OCDE, no seu mais recente relatrio sobre o estado da economia norte-americana, em que se avisa as autoridades do pas a reduzirem os gastos com a Segurana Social com o objectivo de assegurar, a longo prazo, a sade financeira federal. A organizao prev que os Estados Unidos cresam 2,9 por cento em 1995, contra 3,8 por cento este ano, enquanto a inflao passar de 2,1 por cento em 1994 para 2,8 por cento no prximo ano. A OCDE considera que nos dois ltimos anos a economia norte-americana teve um bom desempenho, mas receia o aumento das tenses inflacionistas e novas quedas do dlar, caso as taxas de juro no venham a ser aumentadas. Ao que tudo indica, a deciso de uma nova reunio com o ministro foi j tomada, embora a data ainda no tenha sido estabelecida. Enquanto isso, os universitrios do Norte tm estado a reunir-se e a estudar cada um dos princpios apresentados pelo titular da pasta da Educao. o caso do ncleo do Porto do Movimento Nacional Contra o Aumento de Propinas e da Associao de Estudantes da Universidade de Aveiro, que tm estado a discutir as mesmas propostas com vista a elaborar um documento onde seja patente a sua tomada de posio sobre o assunto. Para Miguel Dias, dirigente estudantil da Universidade de Aveiro, as palavras de Couto dos Santos no Porto deixaram muitas dvidas em relao justia social. Concretizando, aquele universitrio referiu a convico de que no devem ser os estudantes a pagar pelos erros de certos reitores que no tm feito bem a gesto dos dinheiros. A inteno dos estudantes de Aveiro , agora, saber como feita a gesto das universidades e, por outro lado, saber qual o fim do aumento das propinas. Para os habitantes do Sul, observou a Reuter, a discusso da guerra civil nas Naes Unidas d credibilidade ao seu Estado, embora este seja apenas reconhecido pela Somalilndia, outro territrio secessionista, da vizinha Somlia, que tambm ningum reconheceu. H dois tipos de reconhecimento na lei: um claro e outro implcito, comentou Abdul-Moneim Abdullah, professor de Direito na Universidade de den. Por enquanto, estamos felizes s com o reconhecimento implcito. 
Enquadrada no local mais terciarizado da capital, a exposio Habitao Lisboa/92, organizada pela autarquia no Terreiro do Pao, pretende, durante a semana de 16 a 23, no s lanar o debate sobre os problemas no sector como relembrar a fuga de habitantes para fora da cidade, substitudos pelos servios e o comrcio. Com excepo para o Instituto Nacional da Habitao e do Instituto de Gesto e Alienao do Patrimnio Habitacional do Estado (os dois organismos estatais ligados ao sector) -- que declinaram o convite feito pela Cmara -- estaro presentes no certame diversas entidades com responsabilidades na Habitao. Assim, entre os 28 expositores contam-se as autarquias, as empresas construtoras, os bancos, as imobilirias, as seguradoras e as associaes como a dos inquilinos e a dos proprietrios, entre outras. 11 de Setembro -- O Bundesbank intervm em defesa da lira, sendo seguido pelo banco central da Blgica. 13 de Setembro -- O Comit Monetrio da Comunidade opta por um realinhamento do SME, com a desvalorizao da lira italiana em 3,5 por cento e a valorizao das restantes divisas do Sistema em 3,5 por cento. Carlos do Carmo interpreta temas capella, outros em que acompanhado por apenas um instrumento e apresenta uma cano indita. o primeiro concerto do programa Vozes. Espectculo do Ballet Folclrico Mexicano, no mbito da programao oficial do pas. Um ms aps a Declarao anglo-irlandesa sobre o futuro do Ulster, continuam longnquas as hipteses do Sinn Fein, o brao poltico do IRA vir a aprovar os seus termos. O documento pouco favorvel aos republicanos que s na Pscoa se pronunciaram oficialmente sobre ele. Interessa antes de mais decidir quem preferimos para representar a Repblica, a que somos e a que queremos vir a ser. A representao depende muito da qualidade do ser do representante. A este nvel o estilo o homem. Nenhuma dvida me sobra ao comparar Cavaco, smbolo de um economicismo rasteiro e de uma extrema escassez cultural e espiritual, com Sampaio, pessoa de outra espessura humana e promessa de que a poltica continuar a predominar em Belm. E quem no sentir que o vnculo entre Sampaio e as liberdades polticas no s mais antigo mas mais visceral? A agncia Nova China informou que para redigir este dicionrio de 34 470 entradas em lngua chinesa, foi necessrio o trabalho de 300 especialistas durante trs anos. A enciclopdia considerada o primeiro grande instrumento de trabalho exaustivo e sistemtico para o estudo do marxismo-leninismo a ser publicado depois do nascimento da doutrina marxista, explicou a agncia. A primeira edio, de 11 mil exemplares, est j reservada na sua totalidade. A FRENTE Polisrio acusou ontem Marrocos de violar pela terceira vez o cessar-fogo no Sara Ocidental ao enviar avies para sobrevoar a povoao de Mijek, no sudeste do territrio. Para que possa no responder s violaes marroquinas, a parte sarau exige que a comunidade internacional lance um alerta a Marrocos para que cesse as provocaes e para que se comporte de forma responsvel, respeitando os seus compromissos, declarou a Polisrio, organizao que luta pela independncia do territrio do Sara Ocidental, num comunicado divulgado em Argel. O norte-americano Pete Sampras foi afastado pelo seu compatriota Jim Courier (24 ATP) pelos parciais de 7-6 (7-5), 6-4, o que significa que o nmero um do mundo vai chegar catedral da terra batida, Roland Garros, com duas derrotas em outros tantos encontros disputados sobre o p de tijolo. Michael Chang, nmero dois do ranking, foi eliminado pelo argentino Hernan Gumy (54 ATP) com os parciais de 6-3, 6-2. 
Surpreendentes foram tambm as derrotas dos finalistas do torneio de Hamburgo, na semana passada, ambos batidos por australianos. Andrei Medvedev, vencedor do torneio alemo, foi afastado por Scott Draper por 7-5, 6-3, e Flix Mantilla (cabea de srie n13) foi eliminado por Patrick Rafter por 6-1, 3-6 e 6-4. Quando a advogada lhe perguntou porque que o carro dos portugueses no teria sido interceptado, Cardiell afirmou que pensava estarem combinados com a polcia, tendo em conta a forma como as coisas se passaram. Se nos tinham localizado, podiam ter actuado quando ainda estvamos com os portugueses, afirmou. A polcia no quer que eu preste declaraes. No lhes interessa saber mais dados. V-se que este trabalho foi preparado pela polcia, que ter facilitado a entrega da droga para nos deter, rematou o arguido, sugerindo que pode perfeitamente identificar os portugueses envolvidos se as autoridades mostrarem interesse nisso. Para um utilizador experiente ou para um principiante, uma ligao rpida, eficaz e econmica Internet em Portugal s assegurada atravs da Teleweb ou da Esotrica, dois dos cinco fornecedores que operam em Portugal. Esta a concluso a que chegou a revista de consumidores Pro Teste, que publicou na sua edio de Setembro um estudo sobre esta matria. Fora das fronteiras do Imprio, consegue pouco sucesso no Oriente, mas seduz os germanos atravs da heresia do arianismo, que considerava Jesus apenas como um homem, excluindo a sua dimenso divina. Mais tarde, os povos que aderiram e esta derivao sero duramente reprimidos. Inicialmente religio dos pobres, a nova mensagem depressa se estende a todas as camadas sociais e ser vigorosamente divulgada pelos pequenos grupos iniciais, que depois do origem a novas comunidades. A devoo ou o arrependimento de pessoas ricas implica, em simultneo, a concesso de doaes s autoridades crists, que as enriquecem. Trezentos anos aps a morte de Jesus, o cristianismo iria tornar-se a religio oficial do Imprio. Um grupo autodenominado Combatentes pela Liberdade do Lbano reivindicou ontem o rapto de um oficial da Fora Area norte-americana e do seu filho, que desapareceram na Turquia, e ameaou execut-los caso no seja libertado um dirigente do Hezbollah, revelou ontem em Ancara a agncia Anatolia. A agncia referiu que um indivduo no identificado e que se exprimia mal em turco telefonou de um pas estrangeiro para afirmar que o grupo tinha em seu poder o tenente-coronel Mike Couillard, 37 anos, e o seu filho Matthew, dez anos. Os dois norte-americanos desapareceram h trs dias no Ocidente da Turquia. Segundo apurou o PBLICO, o ministro Ferreira do Amaral nomeou o presidente do Conselho Superior de Obras Pblicas, Armnio Faria, para representar o MOPTC nas negociaes tendentes a um acordo. Um gesto que revela disponibilidade do Governo para um acordo de cavalheiros que o presidente da Cmara de Fafe, Parcdio Summavielle, classifica de francamente positivo. A histria remonta a 1983, altura em que o Governo do Bloco Central, atravs do Ministrio do Equipamento Social, titulado pelo socialista Rosado Correia, assinou com a Cmara de Fafe um protocolo no qual se comprometia a custear um centro coordenador de transportes at ao montante de cem mil contos e a financiar em 90 por cento a construo de uma via circular cidade, como contrapartidas desactivao da linha frrea entre Guimares e Fafe. 
O XVIII Festival Internacional de Msica da Pvoa de Varzim abre hoje a sua programao com um concerto por o Coral de Letras da Universidade do Porto e da Orquestra Esproarte, s 21h30, na Igreja Matriz da cidade. O espectculo conta com a participao de Rui Taveira (tenor), Oliveira Lopes (bartono), Thomas Gal (piano), Isabel S (harpa) e Helena Sofia Pereira (tmpanos). Os msicos iro interpretar, sob a direco do professor Jos Lus Borges Coelho, a cantata BWV 4 de J. S. Bach, bem como a Sinfonia Simples e a cantata Misericordium, ambas de Benjamin Britten. Em Paos de Brando inicia-se tambm o XIX Festival de Msica de Vero. Na sede do Crculo de Recreio, Arte e Cultura actua, pelas 21h45, o Quarteto Lrico do Real Teatro de Queluz, que formado por Elsa Saque, Carlos Guilherme, Ana Ferraz e Wagner Dinis. Os cantores sero acompanhados ao piano por Armando Vidal. Hoje, a ltima oportunidade de assistir ao espectculo de dana clssica indiana, com coreografia e interpretao de Mallika Sarabhai, uma das mais eminentes bailarinas indianas, especializada nos estilos Bharata Natyam e Kuchipudi, e cuja participao na obra de Peter Brook, Mahabharata, no papel de Draupadi, lhe concedeu fama internacional. O treinador portista optou por uma marcao individual e muito atenta sobre os homens do Corunha. E logo aos 4', as cerca de 500 pessoas que se deslocaram ao Pavilho das Antas assistiram ao primeiro golo apontado por Pedro Alves. Durante a primeira parte, a constante rotao defensiva operada pelos portistas conseguiu anular a rapidez dos espanhis. A concentrao defensiva, e consequente nmero de bolas recuperadas, viria a originar o segundo e terceiro golos portistas, ainda na primeira parte. Primeiro por Pedro Alves, num inesperado remate meia volta, e a dois minutos do intervalo por T Neves que, de costas, conclui uma bonita jogada de envolvimento. Para terminar: Francisco Paula de Oliveira merece hoje o protagonismo e a importncia que lhe foi negado. O reconhecimento que tarda. O respeito que se impe. Os New Jersey Nets perderam a invencibilidade no campeonato, pois foram derrotados pelos Chicago Bulls, por 99-86. Os Bulls, com 21 pontos de Steve Kerr, regressaram s vitrias e somam agora quatro triunfos -- todos eles conquistados em casa -- e duas derrotas, que lhe do o terceiro lugar na Diviso Central. Apesar da derrota, a equipa de New Jersey pode orgulhar-se de ter obtido, nesta poca, a melhor srie vitoriosa num incio de campeonato (quatro triunfos consecutivos) desde 1976, altura em que se estreou na NBA. A privatizao do BFE processar-se- por concurso pblico de 65 por cento do capital do banco. O preo mnimo por aco de 1980 escudos, valor que se encontra prximo da oferta pblica de aquisio final feita pelo Banco Portugus de Investimento, em Fevereiro, o que atribui ao BF um valor global de 158,5 milhes de contos. A operao de venda, que incide sobre 52 milhes de aces, permitir ao Estado um encaixe mnimo de 103 milhes de contos, destinado ao Fundo de Regularizao da Dvida Pblica. O concurso pblico para a alienao do BFE aberto a investidores, individualmente ou em grupo, que observem os requisitos de dimenso de activos, fundos prprios e capitais, estipulados no caderno de encargos, a aprovar por Resoluo do Governo. Stuart Eizenstat, sub-secretrio de Estado do Comercio norte-americano, lanou um apelo s autoridades do Estado da Califrnia para que reveja a sua deciso de boicote aos bancos suos, aparentemente sem efeito. 
O Governo suo reagiu criticamente a este boicote, ameaando recorrer Organizao Mundial do Comrcio (OMC). A tenso entre os EUA e a Sua vai seguramente crescer com a entrevista dada ao L'Hebdo, de Genebra, pelo venervel universitrio Jean-Franois Bergier e publicada quarta-feira. A Oliva informa os obrigacionistas da emisso Oliva/89 que o valor lquido do juro por obrigao de 34 escudos e est disponvel a partir de 20 de Junho na sede da empresa. A Indelma-Indstrias Electro-Mecnicas SA estar cotada durante 30 dias no mercado sem cotaes. A permanncia no mercado comea a ter efeito a partir do dia 31 de Maio. O ANTIGO chefe de Estado do Uruguai, Julio Sanguinetti, foi eleito para a Presidncia da Repblica nas eleies gerais de domingo, segundo a projeco de um instituto privado aps o escrutnio de 15 por cento dos votos. Segundo o instituto Cifra, o Partido do Colorado (liberal), de Sanguinetti, ter vencido a corrida eleitoral com 33,5 por cento dos sufrgios, contra 31 por cento do Partido Nacional (conservador), do Presidente cessante Lus Alberto Lacalle, e 30 por cento da Frente Ampla (esquerda). Sanguinetti, um advogado de 58 anos, que dirigiu pela primeira vez o pas entre 1985 e 1990, ficar no poder por um mandato de cinco anos. Arrigo Sacchi, seleccionador nacional italiano de futebol, indicou ontem os 22 jogadores que vo representar o pas no Mundial de Futebol dos Estados Unidos. Os atacantes Gianluca Vialli e Gianluigi Lentini ficaram de fora. O Milan -- finalista, com o Barcelona, da Liga dos Campees Europeus -- o clube que mais jogadores fornece seleco transalpina, com sete futebolistas, seguindo-se o Parma, com cinco, a Juventus e o Lzio de Roma, ambos com trs. Na manh seguinte, com surpresa minha, o telefone tocou e falei tudo o que era necessrio. Espantada com o facto, resolvi aproveitar e fazer algumas chamadas que se encontravam pendentes h dois dias. S consegui fazer duas, terceira tudo voltou ao princpio. Chegada ao meu local de trabalho, tentei desta vez falar com a assistente do meu nmero de telefone. Finalmente algum me entendeu, pois de imediato me disse que deveria tratar-se de ... uma avaria na central! Como fiquei contente por, ao fim de dois dias, ter conseguido fazer-me entender pela Portugal Telecom. E confesso que a minha lngua-me o portugus ... Frei Bento Domingues, O.P. Diz-se que um documento do Vaticano ou provoca uma grande polmica -- e torna-se um sucesso editorial -- ou vai dormir tranquilamente para as bibliotecas eclesisticas. Joo XXIII teria sido a santssima excepo. Frederico Cunha celebrar o seu 48 aniversrio no prximo domingo, na companhia dos seus familiares no Brasil. Distante, com a pena de 15 meses de priso suspensa, pela infraco de favorecimento pessoal, o seu afilhado Jos Miguel Noite completar 28 anos no prximo dia 24, num pas da Unio Europeia onde estuda como bolseiro de uma instituio madeirense. Os mercados de aces fecharam ontem em alta, com as bolsas de Londres e Paris (Frankfurt esteve encerrada) a beneficiarem de uma onda de compras coincidente com o comeo da cimeira do Grupo dos Sete (G7) pases mais ricos do mundo, em Halifax, Canad. O dlar subiu para valores acima dos 1,41 marcos, aproximando-se cautelosamente dos 85 ienes e mostrando grande firmeza. Um dlar estvel ajudou os investidores activos nos mercados de aces, na medida em que serve de suporte aos exportadores europeus, disseram operadores. O ndice FTSE-100 da Bolsa de Londres subiu 0,92 por cento, enquanto o CAC-40, de Paris, valorizou-se 1,43 por cento. 
A Bolsa de Tóquio, por seu lado, encerrou nos 14867,26 pontos, mais 206,77 pontos, ao contrário de Hong Kong, que terminou em baixa ligeira. Wall Street, a meio da sessão encontrava-se em alta, com o índice Dow Jones a cotar-se nos 4493,85 pontos, mais 2,77 pontos da véspera e muito próximo da barreira psicológica dos 4.500 pontos. No seu todo o dia foi muito positivo, afirmou um trader, que salientou estarem os mercados bastante activos. Analistas disseram que a reunião do G7 pode terminar com o diferendo entre Washington e Tóquio relativo ao sector automóvel. Dos vários tipos de fundos existentes, os que registaram maior procura, proporcionalmente à sua quota de mercado, foram os internacionais. Os pouco mais de 3,3 milhões de contos que geriam no final do ano passado passaram no final de Junho a 21 milhões de contos, ou seja, um crescimento de 536 por cento. A este aumento não é alheio o facto destes produtos serem os que actualmente oferecem as maiores taxas de rentabilidade, a par dos fundos de acções. A especialização do mercado foi, segundo Álvaro Peixoto, a nota mais importante do semestre. Para o secretário geral da ASGFIM torna-se, no entanto, necessário proceder a certas alterações dentro do sector, por forma a torná-lo mais competitivo. Uma das reivindicações é a urgente alteração da actual lei que baliza o comportamento dos fundos. A lei é de 1988 e obriga a que as sociedades gestoras tenham pelo menos 25 por cento das suas aplicações em dívida pública nacional e 75 por cento das aplicações têm que ser em títulos cotados em Bolsa. A vitória, por 3-0, do Estrela da Amadora frente ao Desportivo de Chaves é indiscutível, como também é indesmentível que aquilo que se viu ontem na Reboleira pouco teve a ver com futebol. Os visitados, que fizeram uma exibição sofrível, só depois de os transmontanos ficarem com menos um jogador conseguiram criar jogadas com princípio, meio e fim. O jogo dos visitantes descreve-se com uma palavra: paupérrimo. Tudo é possível encontrar no IX Salão de Antiguidades e Coleccionismo, desde objectos de ouro e prata, moedas, mobílias clássicas, armas, tapeçarias, pinturas, livros e postais antigos, onde marcam presença de destaque as peças de Art Deco, a mais nova das antiguidades. Estes três items foram considerados pelos alunos -- e até por professores do ensino secundário -- como os mais polémicos da primeira chamada. No que diz respeito à analogia fluido está para vítreo assim como viscoso está para translúcido, o júri considerou que a validade do conteúdo da questão é muito reduzida, dado que o grau de dificuldade da relação estabelecida horizontalmente, acrescida do facto de os termos terem um significado em Física não coincidente com o da linguagem corrente. Assim, todos os candidatos recebem os dois pontos atribuídos a este item. O investimento directo português no estrangeiro elevou-se no período em análise a 482 milhões de dólares, o que é mais 48,31 por cento do que o verificado no período homólogo do ano de 1991, e que se explica em grande parte pelo aumento do investimento industrial em Espanha. Quase metade dos valores registados destinam-se ao país vizinho e, desse montante, cerca de 95 por cento terá sido aplicado na indústria. O investimento líquido nacional em títulos estrangeiros, por seu turno, elevou-se a 298 milhões de dólares quando no ano anterior praticamente não tinha expressão. Um incêndio destruiu, ontem à tarde, quatro habitações de madeira do bairro de pescadores avieiros de Vila Franca de Xira, desalojando um total de dez moradores, que não sofreram quaisquer danos pessoais. Travões -- Discos à frente, tambores atrás. 
Rodas -- Pneus 185/60 R 14. Entre 1 e 4 de Setembro decorreu em Düsseldorf, a edição Primavera-Verão 1992 da Igedo, certame de vestuário, no qual Portugal participou como país-parceiro. A Igedo começou em 1949 com apenas sete exibidores e hoje cresceu até aos dois milhares e meio, em representação de 47 países; e a sua relevância é tanto maior quanto o seu calendário a torna um barómetro da estação que antecipa. Os países mais representados em termos de exibidores são a França, a Itália e, obviamente a Alemanha, enquanto os maiores compradores vêm de Bélgica, Escandinávia e Holanda. Coordenadas que fazem com que a Igedo seja especialmente atractiva para as empresas portuguesas viradas para a exportação, sobretudo quando o processo de adesão à CEE torna o momento decisivo para o que é um dos sectores produtivos de maior peso na economia do nosso país. Uma senhora, porém, que não resistiu a abraçar com algum orgulho o vereador Oliveira Dias, chamando-lhe ex-camarada (recorde-se que Oliveira Dias foi, durante largos anos, o rosto da CDU na autarquia portuense), abeirou-se de Gomes para lhe dizer que os armários que a Câmara instalou na avenida são exíguos para guardar a mercadoria. O presidente desculpou-se, lembrando-lhe, mais uma vez, que tudo aquilo é provisório. Quem não colocou nenhum obstáculo foram as vendedoras de peixe, que saudaram efusivamente o autarca. Ó filho, dá-me um abraço, que eu faço sempre campanha por ti e eu hoje não cheiro a peixe como no outro dia, afirmava, emotiva, uma senhora de meia-idade. Para o autarca vila-franquense, a hesitação do PSD na discussão desta matéria estará relacionada com a posição assumida pelo ministro das Obras Públicas, Transportes e Comunicações, em reunião recente com a Câmara e a Assembleia Municipal. Transpareceu a tentativa de evitar discutir este problema e, depois, uma certa vontade de que o assunto fosse dado como definitivamente arrumado. Finalmente, o ministro acabou por admitir que esta questão terá que ser analisada futuramente, não deixando de considerar que a abolição terá que se verificar, mas sem definir qualquer horizonte temporal, sustenta o presidente da Assembleia Municipal. Carlos Arrojado aguarda que, depois das férias parlamentares, o projecto-lei e a petição pela abolição das portagens sejam discutidos, em plenário da Assembleia da República, até ao final deste ano ou princípio de 93. Os interesses da Brisa e do Governo, ao nível do Orçamento de Estado, não podem continuar a contrariar a necessidade de criar melhores condições de vida à população, acrescenta o autarca, que salienta que por haver portagens não pode ser decidida a proibição da circulação de pesados no interior de Vila Franca e Alverca. A única justificação para não haver a abolição de portagens é de que, ali, se recebe muito dinheiro, conclui o presidente da Assembleia Municipal. O Braga conseguiu ontem em Chaves a sua primeira vitória no campeonato nacional, por 2-1. Tal como aconteceu na partida frente ao Farense, Wosniak voltou a ser a principal figura do encontro, mas, desta feita, pela positiva. Bem pior está o Chaves, que continua sem ver a cor dos pontos. E, ao fim de três jornadas, a formação de José Romão soma outras tantas derrotas. O Presidente da Assembleia da República solicitou a vários juristas ligados ao seu gabinete que se pronunciem sobre a legalidade da forma como tem sido descontada no vencimento de deputados do PSD a quantia correspondente às multas aplicadas por faltas de comparência. 
Barbosa de Melo foi sensvel s reclamaes de alguns parlamentares da maioria que a ele se dirigiram pondo em causa tais descontos sem deles terem sido previamente informados e, segundo o PBLICO apurou, admitiu mesmo vir a solicitar um parecer Procuradoria Geral da Repblica. Na reunio de ontem, o lder dos sociais-democratas portuenses no deixava qualquer indcio sobre a forma como ir conduzir todo o processo face s candidaturas que se desenham. Lembrava apenas que tudo est ainda em aberto e, por isso, insistiu na necessidade de no haver declaraes precipitadas de apoio at porque poderiam fragilizar eventuais candidaturas que possam vir a aparecer. E contrariando as teses de que o prximo presidente do PSD ser fatalmente um lder de transio, defendeu a aposta numa soluo para quatro anos. se o PSD interiorizasse a ideia de que seria uma liderana a prazo estaria a admitir desde j que os resultados das prximas eleies autrquicas iriam redundar num novo desaire eleitoral. Pelo contrrio, Menezes considera que os resultados obtidos nas eleies presidenciais so potenciadores de uma vitria nas autrquicas. Sem iludir a derrota, defendeu que em causa esteve ainda o julgamento do cavaquismo conjugado com um estado de graa do governo socialista, circunstncias que lhe permitem concluir que a margem de 46% dos votos obtida por Cavaco Silva , apesar de tudo, animadora. Para alm do mais, diz Menezes, o PP saiu destas presidenciais fragilizado, e comeam a ser notrias as fracturas internas. o PSD tem de se afirmar como uma oposio credvel, com uma liderana forte, at porque nas previses de Menezes tambm entram as medidas impopulares do governo socialista que estaro a a chegar ... Ora a est ... talvez chegue mesmo auto-estrada. Perigosamente. O citado desvio, pela escarpa do Observatrio, aproxima-se dos acessos Ponte do Freixo e auto-estrada. Nada mais nada menos que o Itinerrio Principal n.1, o eixo virio mais importante e carregado do nosso pas. Logo que a V.C.I. do Porto fr dada por concluida, um autntico formigueiro de carros em andamento. Sempre na nsia de, primeira oportunidade, cortarem pelo caminho mais recto. As autoridades paquistanesas criticam a ndia pelos seus planos de desenvolver msseis balsticos, sobretudo o Agni, que tem um alcance de 2500 quilmetros, e o Prithvi, de mdio alcance. O ano passado, o Paquisto enviou uma carta ao secretrio-geral da ONU depois de o Washington Post, citando os servios secretos americanos, ter noticiado que a ndia instalara msseis Prithvis prximo da cidade de Jullundur, no estado do Punjab, no noroeste. Nova Deli desmentiu a notcia. ndia e ao Paquisto reconhecida a capacidade de fabricar -- e de j possuir -- armamento nuclear. Ambos se recusaram a assinar, no final de 1996, o tratado de interdio total de testes nucleares (CTBT, segundo o acrnimo ingls). Helena Vaz da Silva, uma ex-jornalista, manifestou a opinio de que a qualidade vende e que os ' media ' ganhadores so aqueles que conseguem conjugar qualidade, lucro e interveno cvica. Menos optimista sobre a relao qualidade / venda est o analista econmico Francisco Sarsfield Cabral, que apontou o caso do mercado britnico, onde os tablides vendem dez vezes mais do que os jornais de qualidade. O tambm ex-jornalista da RTP lembrou que se est a voltar ideia de que o accionista deve interferir na vida das empresas, mas referiu que o jornal para ser credvel, no pode ser portador de recados do seu proprietrio. 
Para defesa da credibilidade do prprio jornal e para permitir aos leitores um julgamento, disse, a estrutura dos accionistas deve ser conhecida como, alis, acontece na maior parte dos casos em Portugal. Sarsfield Cabral recordou que os ganhos dos investidores mediticos devem ser financeiros como acontece noutras reas empresariais, mas no devem ser esquecidos os outros tipos de proventos que eles recolhem: A influncia poltica e a influncia social. Freneticamente, o Governo tem-se empenhado em gastar centenas de milhares de contos em propaganda para nos convencer das vantagens do aumento em 50 por cento da capacidade (seis faixas) da ponte em termos rodovirios e noutra inquestionvel vantagem que a futura implantao nela do caminho-de-ferro. sero 120.000 passageiros luxuosamente transportados todos os dias e que, como bvio, deixaro de circular de automvel. Depois, exagera desmedidamente com os paternais cuidados com os utentes da Ponte 25 de Abril, ao ponto de, aps tantos benefcios, ainda lhes querer dar mais uma ponte. (...) Depois de tantas melhorias, qual a necessidade da nova ponte? No se estar a comportar o Governo como aqueles babados paizinhos que estragam os filhos com guloseimas? Estaro os esforados ministros a prever uma exploso demogrfica no pas? Os responsveis pelas condies de trabalho da imprensa, do Ministrio dos Negcios Estrangeiros, j montaram, entretanto, no CCB, 630 telefones directos, 25 faxes, cinco aparelhos de telex e 350 mquinas de escrever para os, certamente poucos, jornalistas que no usem o respectivo computador. primeira vista, se compararmos os nmeros -- de jornalistas e de telefones, por exemplo --, no parece estar garantido de antemo que, nas horas de ponta, se evitem os engarrafamentos. Confrontado com o pedido da mudana toponmica, o municpio viseense, atravs do seu lder, Fernando Ruas, oficiou a Junta no sentido de dar o nome de Lus Martins a uma outra avenida que atravessa a freguesia e que se situa num troo da Entrada Nacional 2 gerido pela Cmara -- o que no agradou aos elementos da autarquia repesense. Jos Ferro, presidente da Junta, referiu que, mesmo assim, a proposta da edilidade vai ser analisada numa prxima sesso da Assembleia de Freguesia, a realizar em Abril. Um americano de 43 anos, John Esposito, que se entretinha a sequestrar crianas numa cela metlica enterrada no jardim da sua casa em Nova Iorque, que vigiava atravs de circuitos internos de vdeo, entrega-se polcia. O custo de vida no Funchal superior ao de Lisboa, revelam estudos que apontam as taxas porturias praticadas no arquiplago como causa do desequilbrio observado. O aparecimento de filtros e de certo tipo de tabaco mais suave em cigarros, destinados a evitar os cancros das vias respiratrias, esto a multiplicar os casos de uma certa forma de cancro do pulmo, adianta um estudo realizado pela Sociedade Americana de Cancro (ACS). At aos anos 50, o fumo do tabaco usado nos cigarros era extremamente irritante e, em consequncia disso, era difcil inal-lo profundamente. A maior parte dos fumadores realizava, assim, apenas uma inalao parcial e a maior parte do fumo ficava-se pelas vias respiratrias superiores e depositava-se na garganta e na boca. Era a que se fixava a maior parte das substncias cancergenas e a maior parte dos cancros dos fumadores surgiam nestas regies. 
Foi quando apareceram os filtros e os tabacos mais suaves que os fumadores comearam a inspirar o fumo mais profundamente, levando-o mesmo at aos alvolos pulmonares, permitindo a criao de depsitos de substncias cancergenas nas vias mais finas, na periferia dos pulmes -- o que deu origem ao desenvolvimento de uma forma de cancro chamada adenocarcinoma. Pinochet passeou-se por Portugal, tudo numa visita oficialmente privada. Governo e Exrcito fizeram vista grossa passagem do general. Eis quando seno um jipe dos Comandos deu o no dito por dito. Certo que Pinochet fez do pas uma casa portuguesa, com certeza. Secreta foi uma reunio nos arredores de Lisboa e uma saltada-relmpago a Londres. O chefe do grupo conservador Rossia no Parlamento russo, Serguei Babourine, concordou que no existe nenhuma base legal para julgar as aces de Erich Honecker enquanto chefe de Estado de um pas internacionalmente reconhecido. O prprio Gorbatchov considerara imoral a expulso de Honecker mas a posio poltica em que se encontra actualmente o Presidente sovitico no a mais propcia para fazer fazer as suas opinies. No dia dedicado a debater as questes do emprego, o antigo presidente da Comisso Europeia brindou-os com as suas quatro condies para o xito da moeda nica (uma questo que divide profundamente os alemes), entre as quais esto um pacto de confiana entre os pases que ambicionam participar na terceira fase da UEM e um novo sistema monetrio europeu que crie o quadro das relaes entre os que aderem numa primeira vaga e os que ficam de fora. Falando ao corao dos sociais-democratas, numa crtica velada ao ministro das Finanas Theo Waigel, Delors afirmou que a poltica econmica no s uma questo de oramento e de moeda a cheirar a thatcherismo defunto, mas deve ter tambm em conta a dimenso humana, que envolve os salrios e o emprego. Mas no deixou de advertir para o facto de o pleno emprego no ser possvel com dfices. Estas so algumas das novas linhas directivas do TPC anunciadas recentemente pelo Ministro da Educao britnico, David Blunkett, para as escolas pblicas. Estudos recentes revelaram que metade dos alunos da quarta classe no estavam a receber TPC compulsivo. A GNR de Barcelos deteve esta semana um indivduo de 28 anos, solteiro, desempregado, residente na freguesia de Carvalhas, acusado de ter agredido e tentado violar uma viva de 48 anos, residente na mesma localidade. O alegado violador foi detido em sua casa, depois de a vtima ter denunciado o caso s autoridades policiais quando se encontrava no Hospital Distrital de Barcelos para receber tratamento. Os factos, segundo a queixa apresentada, ocorreram ao incio da noite, num caminho ermo, sem qualquer casa prxima, uma passagem pouco utilizada na aldeia. No momento em que regressava a casa, o indivduo lanou-se sobre ela, tapou-lhe a boca, agrediu-a e atirou-a ao cho. Depois rasgou-lhe a roupa e tentou consumar a violao. A mulher ofereceu resistncia e, alguns minutos mais tarde, apareceu um filho da vtima, de 16 anos, que pegou num pau e agrediu o alegado violador. Na Escola Superior de Educao Jean Piaget, de Macedo de Cavaleiros, est instalado o clima de medo. Nesta vila transmontana tm ocorrido algumas situaes de confronto entre estudantes e grupos de jovens locais, a ltima das quais teve lugar na madrugada de sbado passado e resultou em ferimentos com alguma gravidade num aluno. Para resolver o problema, o director da instituio de ensino universitrio vai hoje reunir com o presidente da Cmara e o comandante da GNR locais. 
Centros urbanos e candidatos independentes so os denominadores comuns dos pequenos partidos na disputa autrquica. Tudo por causa dos poucos meios financeiros e das restries da lei candidatura de listas de cidados. Mais uma semana que passou mais, mais uns mximos histricos estabelecidos pela Bolsa de Nova Iorque. Apesar de a economia norte-americana estar a dar mostras de um arrefecimento, as aces cotadas em Wall Street continuam a valorizar-se de uma forma progressiva e continuada. Na semana que passou os fundos de investimento foram os investidores mais activos. O ndice Dow Jones fechou nos 4585,84 pontos, mais 1,66 por cento face semana anterior. O director responsabilizou ainda a Redaco pela degradao da situao de O Primeiro de Janeiro, exigindo maior empenho por parte dos jornalistas. , porm, perceptvel que s aparentemente os dois mundos tm vivido completamente separados. Por um lado -- salientando os aspectos positivos --, as produes estrangeiras conduziram a um apetrechamento tcnico dos prestadores de servios nacionais, constituio de excelentes equipas, rodagem de actores que beneficiaram as produes nacionais e, nalguns casos, promoveram internacionalmente a nossa imagem. No entanto, ao mesmo tempo, elas muitas vezes vampirizam a imagem do pas, determinam um aumento nos custos de produo em Portugal -- vide, de novo, as declaraes de Cunha Telles --, o que, por sua vez, conduziu a que o IPC deixasse de ter possibilidade de suportar, para o mesmo nmero anual de produes, a totalidade do seu custo e marginalizaram os nossos criadores -- realizadores, guionistas e msicos -- que s muito excepcionalmente foram chamados a intervir nessas produes. A necessidade de obter financiamentos fora dos apoios do IPC tem levado a uma alterao estrutural das relaes produtores / realizadores no mbito do cinema portugus, com o reforo do papel dos primeiros, que deixaram de ser meros gestores dos subsdios obtidos pelos segundos para passarem a ser portadores do valor acrescentado que o financiamento, normalmente externo, que complementa os apoios locais -- IPC e RTP -- e permite a montagem financeira do projecto. Por sua vez, o reforo do papel dos produtores e a presena de produtores estrangeiros trazem alteraes decisivas na prpria natureza dos projectos a montar, passando naturalmente a existir uma ateno diferente em relao ao espectador e, consequentemente, s perspectivas comerciais da obra. Do alto de uma varanda, microfone na mo, ele atordoa a praa, vibra e faz vibrar todos os que o escutam. S Sampaio parece no ser muito admirador do estilo, cultivando uma sobriedade pouco compatvel com o espectculo que lhe foi preparado. Mas a verdade que foi em Abrantes que o lder socialista teve a melhor entrada e apresentao da campanha. Quando chega ao palco a assistncia j est rendida. Resta apenas dar substrato poltico a uma adeso emocional garantida. O responsvel por este ambiente um jovem de 28 anos, lder da JS de Abrantes e apontador de profisso. Trabalha na Cmara local, mas a sua paixo mesmo o espectculo, a rdio, o jornalismo. Isto comeou na escola. Logo naqueles espectculos de escola, mostrei que tinha um certo jeito para aquilo, explica Manuel Maurcio. Mais tarde inscreveu-se numa escola de msica e no perdia tudo quanto era festinha. Solicitado para apresentar, ler poemas, animar a malta. Um entertainer ribatejano que foi ganhando notoriedade. 
Nestes ltimos dias, os bracarenses apenas tm estudado pormenores do jogo, pois, quando duas equipas de nvel semelhante se encontram, os pormenores que decidem, reforou Donner, que vai iniciar o jogo com uma defesa 3x2x1, agressiva quanto baste. Depois, na transio para o ataque, Mrio Costa e Paulo Faria (ainda em dvida devido a uma leso) saem para dar lugar aos atacantes Bolotskih e Dobrescu. tem o ttulo praticamente assegurado. Jos Saldanha mostra-se ainda muito surpreendido com a suspenso de que foi alvo a ETEM por parte da Direco-Geral de Armamento (DGA) e acusa este organismo de no assumir as suas responsabilidades. Alegando ter tomado conhecimento da suspenso atravs do PBLICO, Jos Saldanha diz que as responsabilidades tm que ser assumidas pela DGA que recebeu todos os documentos da operao e autorizou a exportao. quem assinou os documentos de exportao no fui eu. Jos Saldanha acusa a DGA de ter um comportamento ambguo e vago nesta histria, dando cobertura a investigaes sem credibilidade nenhuma que foram utilizadas na Bolvia num contexto de luta poltica pelo poder nas ltimas eleies presidenciais. Saldanha no quer ser o bode expiatrio deste caso e admite meter um processo judicial contra o Estado portugus, com pedido de indemnizao. A situao dos professores de Portugus a leccionar no estrangeiro dominou a reunio, na passada quinta feira, da Federao Nacional dos Sindicatos da Educao (FNE) com as secretarias de Estado da Administrao Escolar e da Inovao e Educao. Uma das vitrias reivindicadas pela FNE a integrao destes professores no Estatuto da Carreira Docente dos educadores de infncia e dos professores do ensino bsico e secundrio. Uma medida que, no entender da entidade sindical, s peca por tardia e vai alterar o decreto-lei de 1979 que regulamentava a actividades desses professores, estabelecendo-se agora novas condies e regras para o exerccio do ensino da lngua portuguesa no estrangeiro. A exigncia de clareza nos concursos para o recrutamento dos professores a leccionar no estrangeiro foi outra das reclamaes da FNE, que rejeita a prtica actual, sujeita a despachos pontuais. Defende, por isso, que os concursos para recrutamento de professores passem a ser regulados por decreto. Outra exigncias da estrutura sindical foram que o portugus como segunda lngua faa parte integrante dos currculos das comunidades emigrantes no estrangeiro, que o sistema de segurana social dos professores assegure a proteco na sade e que o vencimento-base seja acrescido de um subsdio de custo de vida. A guerra das pedras deu credibilidade internacional luta dos palestinianos e obrigou Arafat a renunciar ao terrorismo e a reconhecer Israel. Mas a frustrao pela degradao das suas condies de vida e, sobretudo, a m orientao da liderana palestiniana, fizeram com que os palestinianos perdessem quase tudo o que tinham conquistado, quando decidiram apoiar Saddam Hussein na crise do Golfo. Os Cocteau Twins fecharam a 18 edio do Printemps de Bourges, infelizmente sem aquilo a que se pode chamar uma chave de ouro. Valeu uma compensao chamada Morphine, desconhecida de muitos, mas que s veio confirmar a excelncia do programa deste ano do festival francs. Os comentrios de Miguel Sousa Tavares (MST) inseridos no artigo do PBLICO de 4 de setembro sob o ttulo em epgrafe so deveras esclarecedores no que concerne aos tributos dos cidados. tem que ver com a contribuio autrquica no que respeita a bens no geradores de rendimento, como o caso de terrenos classificados de urbanos, a partir de 1989. 
Depois, verifica-se uma enorme discrepncia de valores na sua avaliao para efeitos fiscais, sempre gravosa, sem atentar sequer nas flutuaes do mercado e das prprias leis. mas, se o objectivo foi libertar os terrenos, evitando a estagnao, seria necessrio que as autarquias respondessem s solicitaes, o que no acontece. Assim, encorajado pela total disponibilidade do meu interlocutor, que igualmente admito, tentaria que entendesse, pelo menos, as vantagens econmicas, em termos de desenvolvimento local (turstico, por exemplo), da salvaguarda da invulgarmente importante, mesmo escala mundial, jazida com pegadas de dinossurios da serra d'Aire, cujo valor cientfico j trouxe aqui cientistas de todo o mundo (dos EUA China), e da transformao do stio num local pblico com todos os equipamentos necessrios sua fruio como bem cultural, cientfico e pedaggico. Quanto a este assunto, tentaria convencer o professor a visitar o local e procurar saber as opinies dos responsveis locais e das respectivas populaes e, ainda, a auscultar o sentir dos vrios sectores da vida portuguesa sobre este caso, incluindo o do cidado annimo. A propsito da exposio Dinossurios da China, em curso no Museu Nacional de Histria Natural, comearia por insistir no convite a Sua Excelncia a visit-la. Aproveitaria para lhe mostrar o estado lamentvel de um enorme casaro, esventrado e em tosco, na sequncia do incndio da Faculdade de Cincias (h 18 anos!), de reinstalao sempre adiada e pomposamente referido como Museu Nacional, que pouco deve tutela, mas ao qual se reconhece uma obra cientfica, cultural e pedaggica notvel. Dir-lhe-ia que reunir aqui esta magnfica coleco de verdadeiros fsseis, alguns gigantescos, dos terrenos mesozicos da velha China foi mrito exclusivo deste Museu, que no contou com quaisquer apoios diplomticos, numa realizao que vai fazer mais pelo estreitamento das relaes entre os dois pases do que quaisquer outras j realizadas. As centenas de milhares de visitantes esperados assim o permitem concluir. Dir-lhe-ia, ainda, como lamentvel neste caso a falta de apoio do Ministrio da Educao. De resto, os comerciantes espanhis parecem estar mais atentos ao que se passa no Algarve do que os colegas portugueses. Alm do po, tambm a pastelaria espanhola passou a ocupar as prateleiras dos supermercados, a preos concorrenciais. Quanto s discotecas, parecem ser o mealheiro dos jovens -- nelas que investem a principal fatia do oramento de frias --, apesar de tambm neste ramo os espanhis estarem a descobrir um filo. Fiis aos seus pubs em Albufeira, continuam os ingleses, mas longe vo os tempos em que com um libra -- em 1988 valia 285 escudos, hoje fica-se pelos 230 -- faziam figura de reis e senhores. A primeira das crticas ontem apresentadas foi para o novo sistema retributivo, que os enfermeiros consideram nunca se ter adaptado carreira de enfermagem. Segundo explicou o dirigente Jos Azevedo, a entrada em vigor dos novos diplomas trouxe consigo inmeras anomalias, geradoras de injustias gritantes, como o caso dos profissionais que, depois de subirem na carreira, ficam a ganhar menos do que antes da progresso. Quanto a horas extraordinrias, os enfermeiros salientam no terem acesso s vultuosas verbas que nelas se consomem e que para alguns profissionais da sade funcionam mais como complementos disfarados de vencimento do que como resposta a necessidades efectivas. 
Outros processos a contribuir para o descontentamento da FENSE so os que dizem respeito ao descongelamento de vagas, criao de uma Ordem dos Enfermeiros e s equivalncias de ttulos acadmicos. Em relao ao primeiro, a desactualizao dos quadros das instituies cria um desajustamento em relao s necessidades que faz com que, na prtica -- diz Jos Azevedo -- nem um tero das vagas disponibilizadas seja aproveitado para colocar novos enfermeiros. Resultado: h, ao mesmo tempo, instituies com falta de pessoal e enfermeiros no desemprego. O que a FENSE pede um regime de excepo na contratao de novos enfermeiros, como o exigem as circunstncias. Professor. Por que razes escolheu fazer frias em Portugal? Foi uma escolha acertada? R. -- uma constatao, uma diferena de estilo. Sempre fui muito claro nas minhas convices. Acho que a militncia partidria toda a gente percebeu que no vou ter, nem terei, nem tenho tido. Mas no faz sentido, em meu entender, no quadro actual da democracia portuguesa, fazer isso hoje. Acho que a avaliao que as pessoas tm que fazer sobre as personalidades, sobre o perfil e a sua adequao funo tem muito a ver com as pessoas e no com ter ou no ter um carto. P. -- Mesmo que seja eleito, no se desvincular do partido? Muito provavelmente, a resposta oficial ser positiva. Apesar de parecer paradoxal, Milosevic e o seu partido tm agora boas razes para alguma esperana, pelo menos para a sua prpria sobrevivncia poltica no poder, se no mesmo para um futuro que poder revelar-se brilhante e colorido. Os antigos (e agora reformados) comunistas conseguiram regressar ao poder em diversos pases (Hungria, Polnia, Bulgria) e uma opo poltica similar tem tambm agora boas possibilidades nas prximas eleies presidenciais na Rssia, marcadas para Junho. Por saber que desempenha um papel decisivo na sua nova funo de construtor da paz na Bsnia -- sob os desgnios da Pax Americana delineada em Dayton, Ohio -- Milosevic espera que ele prprio e o seu regime na Srvia possam estar a salvo. Entre as promessas, a mais importante foi a de que tenciona retirar as tropas russas da Tchetchnia, embora no tenha dito quando, mas apenas que no ser na totalidade, e no seja a primeira vez que anncios semelhantes so feitos nas vsperas de novas ofensivas militares. Queremos retirar as tropas at s fronteiras da Tchetchnia, disse Ieltsin durante uma visita a Tcheliabinsk, nos Urais. Se retiramos totalmente, os bandidos degolaro imediatamente toda a populao civil. Eles no se ficaro por a. Iro at ao Daguesto, ao Karachaevo-Tcherkssia e a outras repblicas do Cucaso do Norte, para as ocupar, e haver terrorismo e banditismo internacional. Programa Quadro de Cincia e Tecnologia da Unio Europeia. Neste captulo, a principal vantagem da nossa adeso foi a abertura de novos horizontes, o lanamento de relaes com grupos de investigao e empresas -- a gerao de um fluxo de informao que permitiu abrir novas perspectivas no sistema de investigao cientifico e tecnolgico nacional. Jos Antnio relembra que Max era levado da breca para contar histrias e para pregar partidas. Um dia, o Humberto Madeira, o Raul Solnado e o meu pai iam em viagem e passaram pelo pinhal de Leiria. O Humberto Madeira resolveu contar que esta era uma zona onde, em tempos, tinha havido muitos assaltos. No regresso, o meu pai j vinha a dormir. Rebentou um pneu e ouviu-se um grande estrondo. A sua distraco ia ao ponto de telefonar para casa a perguntar se o filho se chamava Jos Antnio ou Antnio Jos. Embora no soubesse uma nota de msica, trauteava e compunha por intuio. 
Tinha um espectacular ouvido musical e era capaz de distinguir os sons de cada instrumento na mais complicada melodia. Sou assduo na melhor esplanada da cidade, no Molhe. H vinte anos, era arriscado l ir, por causa dos marginais. Depois, tornou-se local de convvio da mais alta qualidade. Mas receio que esteja outra vez a entrar num mau perodo, com a explorao excessiva e de mau gosto. No prximo ano, devido ao princpio da gratuitidade dos transportes escolares preconizado na lei de bases do sistema educativo, os alunos do oitavo ano tero tambm transporte gratuito, o que agravar em 17 por cento o custo comparticipado pela Cmara. Aos alunos de Alqueiro, Casal das Pimenteiras, Casal da Fonte e Beselga a autarquia assegura a utilizao de carreiras pblicas ou at de txi por considerar que o percurso a p oferece riscos s crianas. De Wilde, Costinha, Lus Miguel, Gil Baiano, Marco Aurlio, Beto, Pedrosa, Oceano, Vidigal, Pedro Martins, Peixe, Pedro Barbosa, Afonso Martins, Dominguez, S Pinto, Yordanov, Ouattara e Paulo Alves foram os futebolistas escolhidos por Octvio. Ficaram de fora, alm dos chamados s seleces, Vujacic, Tiago e Balajic. A cena tirada de uma pgina asfixiante de Joseph Conrad: um cadver em pijama, deitado numa cama com uma manta verde nos ps, dentro de uma pequena cabana de madeira a tresandar a luto, flores e formol, velado por a viva, por a filha e com uma guarda de honra de cinco jovens soldados Khmer. No restam dvidas que o corpo dele e que est morto, garantiu um dos jornalistas nesse grupo, o americano Nate Thayer, correspondente da Far Eastern Economic Review e o estrangeiro que melhor conheceu a histria do fundador dos Khmer Vermelhos. Pol Pot morreu oficialmente com um enfarte cardaco, quarta-feira noite, mas inevitvel pensar em causas menos naturais para o seu desaparecimento. As reaces foram cautelosas e os EUA, porque nos maus filmes de terror os mortos-vivos conseguem sempre levantar-se, exigiram uma autpsia. Na passada Primavera, a Assembleia de Freguesia do Carregado solicitou Cmara de Alenquer, de maioria socialista, o processo de candidatura da elevao a vila, mas j em Setembro, Vasco Miguel, do PSD, entregava no Parlamento um projecto de lei nesse sentido, atitude que o presidente da Cmara classificou de oportunismo poltico, alegando que o diploma caducaria com o final da legislatura e no chegaria a ser discutido, o que de facto aconteceu. Agora, o PSD voltou a pegar no assunto, pelas mos de Duarte Pacheco, que sustenta a retomada desta iniciativa na vontade das pessoas que nasceram e vivem no Carregado e no concelho de Alenquer. Duarte Pacheco alega que o Carregado a zona do concelho de Alenquer com maior ndice de crescimento industrial e tambm um dos maiores centros populacionais da regio, pelo facto de se situar num dos maiores ns rodovirios do pas, acrescido das ligaes fluviais e ferrovirias. O ministro portugus da Administrao Interna avistou-se ontem com o ministro holands da Justia para receber os dossiers sobre a imigrao e do Grupo de Trevi que transitam da presidncia holandesa da Comunidade para a portuguesa. A questo da criao da Europol (uma polcia comunitria) foi um dos temas em anlise no encontro. A passagem do testemunho, entre Haia e Lisboa, em matria de poltica interna comunitria apenas se fez agora por indisponibilidade da agenda do ministro holands da Justia. O Holanda-Inglaterra decide quase tudo neste grupo. 
Quem ganhar fica praticamente com o visto no passaporte para os EUA, enquanto Noruega basta um ponto na Polnia para pela primeira vez participar numa fase final do Mundial. claro que as contas podem todas complicar-se, caso holandeses e ingleses empatem e polacos venam noruegueses. Nesse caso, a Polnia permaneceria na corrida. A deciso do Grupo ficaria ento muito complicada, com trs equipas com hipteses de chegar ao fim com 14 pontos e a ser preciso recorrer ao foto-finish para saber quem se apuraria. o maior grupo, com sete equipas. Para j Repblica da Irlanda e Dinamarca ocupam os dois primeiros lugares, mas a Espanha ainda est na corrida. O Repblica da Irlanda- Espanha de hoje j ajudar a definir posies, mas dever ser o Espanha- Dinamarca de 17 de Novembro que tudo esclarecer. A no ser que o Eire vena hoje a Espanha e a Dinamarca derrote a Irlanda do Norte, ficando Eire e Dinamarca de imediato com a presena nos EUA garantida. O objectivo genrico destas iniciativas, patrocinadas pela Unio Europeia, o desenvolvimento de uma poltica global de reduo da insegurana, volta de quatro eixos fundamentais: respeito pelos direitos humanos, participao activa dos cidados na definio e aplicao da poltica de segurana requerida, aposta em solues de partenariado envolvendo os sectores pblico e privado e preocupao com a segurana do cidado, para evitar que ele se torne autor ou vtima de crimes. As actividades amadoras vo ser desinflaccionadas negociando o ordenado dos atletas. No o marketing e a sponsorizao que resolvem os problemas. Esta nova equipa prope-se fazer um trabalho honesto e claro. O Benfica j conta, neste momento, com setenta mil scios e sete mil praticantes. Os scios vo poder criar lugares para os seus filhos praticarem desporto. Manuel Monteiro mentiu aos portugueses. O que j escrevemos esta semana excitou as conscincias do Partido Popular, ciosas em mostrar que o processo da lista de candidatos por Lisboa correu na maior das lisuras, incluindo o folhetim Lus Nobre Guedes. no dia 9 de Junho o lder dos populares disse que j tinha a resposta de Nobre Guedes ao convite para encabear a lista da capital e torn-la-ia pblica na semana seguinte, depois de a comunicar estrutura distrital. At ontem, ningum conhecia, pela boca de Monteiro, a resposta ao convite. So estes os factos e de nada vale que Nobre Guedes venha desagravar o comportamento do seu lder -- a crer na ltima edio do Independente. Diz o jornal, referindo-se comisso poltica de ontem, que Nobre Guedes sublinhar que o presidente do PP nunca mentiu no decurso do processo de constituio das listas. assim a transparncia do Partido Popular, feita de recados. Teresa Vasconcelos -- Digo isto no livro e digo com convico: h muitas Anas neste pas. Mas tambm h muitas outras que poderiam ser como a Ana e se desmobilizaram. P. -- Por falta de condies, falta de formao, falta de acompanhamento? P. -- Foi um colega seu ... R. -- Faam o favor de tomar nota do que vou dizer, e agradecia que fosse transmitido, porque de vez em quando as coisas no saem de acordo com a verdade dos factos. A EDP tem tido um comportamento impecvel em todo este processo. Quando comearam os estudos da barragem de Foz Ca, fez um estudo de impacto ambiental. Quando se descobriram os vestgios arqueolgicos, fez um protocolo com o Ippar [ Instituto Portugus do Patrimnio Arquitectnico e Arqueolgico ], porque no percebia da matria. Depois foi a EDP que vedou e policiou toda a rea onde foram descobertos os achados arqueolgicos. Por isso, se aquilo est recolhido, deve-se EDP. 
Quando os protestos eclodiram, os media internacionais divulgaram para todo o mundo informaes sobre uma carnificina em larga escala da populao revoltada da cidade pelas foras fiis a Ceausescu. Estas notcias alarmistas -- num relatrio recente, os servios secretos romenos acusaram espies hngaros e de outras nacionalidades de terem incitado revolta -- contriburam decisivamente para o derrube do regime, mas mancharam a revolta. Afinal, os cadveres nus e alinhados descobertos no eram vtimas da represso, mas antes de pessoas pobres, doentes e invlidas, exumadas de fossas comuns e colocadas no local. Hoje, o monumento aos heris desconhecidos de 1989 no faz qualquer aluso a este episdio e no enorme talho continua a ser enterrado quem no pode pagar uma sepultura decente. Estvamos em estado de choque, vtimas de uma alucinao colectiva. Vi o cadver de uma mulher com o seu filho no ventre. Eram talvez cinco corpos, mas eu via vinte, contou recentemente, ao enviado do Le Monde, Cornel Balint, presidente de uma das seis associaes de revolucionrios de Timisoara, que agrupam cerca de 800 pessoas recompensadas por terem sido feridas, presas ou molestadas durante os combates. Um dos objectivos dos Encontros -- tal como os definiu h seis anos Madalena Perdigo, fundadora do Acarte -- era permitir o acesso dos coregrafos portugueses aos circuitos europeus, dos quais estavam arredados por desconhecimento mtuo. De um lado, pela situao geogrfica de periferia e, do outro, pelo desconhecimento das tendncias em vigor nos grandes festivais internacionais. Os Encontros Acarte propunham-se responder a esse duplo anseio: informar o pblico e estimular os autores portugueses, na perspectiva de os despertar para as estticas do ps e neo-modernismo. A linha fundamental [ dos Encontros ] servir o pblico, mant-lo dentro da actualidade coreogrfica. Mas a afirmao da sua identidade nunca se fez pela adopo de uma linha esttica nica. No h uma direco nica [ no conceito de modernidade ]. O pblico tem o direito a escolher o que lhe interessa e no creio que seja necessrio conduzi-lo pela mo como se fosse um indigente. Smidovitch, com um fomos aonde planemos ir, declinou confirmar, no entanto, se a fiscalizao abrangera sedes de ministrios, quinta-feira vedadas categoricamente ao pessoal da ONU pelo regime de Bagdad. Sobre se haviam encontrado algo suspeito tambm no adiantou pormenores. A misso de 22 peritos a primeira a chegar capital iraquiana desde que, tambm sem resultados, foi passada revista ao Ministrio da Agricultura, a 28 e 29 do ms passado, misso que ps termo a um impasse de trs semanas entre o Governo de Saddam Hussein e a comisso da ONU. As inspeces, recorde-se, comearam em Abril do ano passado, pouco depois do fim da Guerra do Golfo e resultaram dos acordos de cessar-fogo ento assinados. Aps negociaes com o principal patrocinador da equipa, a Shell, Hardwick entregou um dos wild cards (convites) a Felisberto Teixeira, que conta tambm com o apoio da Federao Nacional de Motociclismo (FNM) e est convicto que o portugus far bom trabalho, reconhecendo, porm, que qualquer estreia, e muito especialmente em 500cc, sempre difcil. No pas prosseguem os incidentes: de percursos para uns, perigosos para outros (Noticirio das 13 horas da Rdio Nacional). ... 
Os laboratrios do Departamento de Estado e do Pentgono ainda no conseguiram dar uma resposta precisa quanto imprevisibilidade da reaco do seu ex aliado (Jonas Savimbi), que parece ter ingerido demasiadamente esterides anabolizantes nos tempos de ajuda encoberta e ter crescido demais (Correio da Semana, 22.9). mesmo de supr que se trata de uma das mais importantes exposies fotogrficas dos ltimos anos, a nvel internacional. O tema da gua, ligado aos oceanos como proposta genrica da Expo-98, permite abordar esse elemento em todos os seus estados, liqudo, fsico e gasoso, e obviamente relaciona-se com o prprio processo fotogrfico. Sendo, alm de crtico de pera e fotografia, professor de qumica, trabalhando no domnio da termodinmica dos objectos moleculares e das relaes entre artes e cincias, Calado estava habilitado como ningum a comissariar esta exposio. O Instituto de Investimentos e Privatizaes dos Aores (IIPA) est a negociar um emprstimo de 1,5 milhes de contos, para poder fazer face aos compromissos assumidos na rea dos incentivos de base regional, disse ao PBLICO o presidente do IIPA, Joo Bernardo Rodrigues. De acordo com aquele gestor, o IIPA contactou o Banco Comercial dos Aores, a sociedade Esprito Santo- sociedade de investimentos e o Banco Pinto & Sotto Mayor. O emprstimo ser negociado por tranches, j que o valor em causa para o binio 1991-92. O valor total dos compromissos assumidos de 4,2 milhes de contos, dos quais 1,5 milhes so da responsabilidade do Governo Regional dos Aores via IIPA e o restante das comunidades. Segundo Joo Bernardo Rodrigues foram aprovados 60 projectos, cujo valor total de investimento da ordem dos nove milhes de contos, distribuidos, entre outros, pela agro-pecuria, construo civil, basaltos, cimentos e para a realiazao de uma nova unidade fabril de cerveja. Indignados com a ausncia de negros em lugares elegveis, os representantes da comunidade imigrante africana aconselharam o voto em branco. Fernando K, 41 anos, presidente da Associao dos Guineenses desde 1987, no compreende como num pas com uma presena multissecular africana, no h um poltico, um deputado, um presidente de cmara, sequer um vereador negro. Deputado suplente do PS com entradas fugazes no Parlamento, acusa Antnio Guterres de trair promessas feitas em Julho a representantes da comunidade e prepara uma campanha internacional contra uma situao que torna os negros portugueses em cidados de segunda classe. As cidades candidatas organizao dos Jogos Olmpicos de 2004 vo conhecer hoje o regulamento a que devero obedecer, durante a sua campanha de preparao e promoo. Ser durante um encontro a realizar no Museu Olmpico, em Lausana, na Sua, onde os representantes das 11 cidades se iro reunir pela primeira vez. Jogo grande foi o que juntou, em Phoenix, os Suns e os Portland Trail Blazers, e que terminou com a vitria dos donos da casa, por 118-109. A.C. Green, ex-jogador dos Lakers, marcou 31 pontos para liderar os Suns, num jogo em que Sir Charles Barkley averbou apenas 21 pontos. Discreto no campo, o temperamental Barkley voltou a ser notcia fora dele. Agora, vai ter de responder em tribunal a uma queixa de agresso interposta por um f. Barkley acusado de ter agredido Edward Durham num bar, depois de este o ter criticado por empurrar uma senhora que apenas pedia um autgrafo. Barkley no comenta o incidente, e o mesmo faz a sua equipa. Os Suns, finalistas da poca passada, esto mais concentrados no campeonato, onde ocupam o segundo lugar da Diviso Pacfico, liderada pelos invictos Seattle Supersonics. 
Um mtodo muito comum de movimentar ficheiros na Internet. O FTP uma forma especial de fazer login noutro local da Internet com o objectivo de carregar ou enviar ficheiros. H muitos servidores da Internet que contm imenso material que pode ser acedido por qualquer pessoa atravs de um acesso annimo, pelo que so chamados servidores FTP annimos. Um mtodo bastante utilizado para realizar menus de material disponvel na Internet. Gopher um programa de utilizao cliente-servidor, pelo que requere que o utilizador disponha de uma verso cliente do Gopher. Os Gopher difundiram-se muito rapidamente atravs da Internet nos ltimos anos, mas esto a ser suplantados pelo hipertexto suportado atravs da World Wide Web. Contudo, existem ainda milhares de Servidores Gopher na Internet, que devero subsistir por mais algum tempo. semelhana de muitas outras igrejas evanglicas, o pagamento do dzimo na Universal no mais do que a adopo de uma prtica bblica. Ainda hoje o cdigo catlico de Direito Cannico refere que a igreja tem o direito, independentemente do poder civil, de exigir aos fiis o necessrio para o culto divino, para a honesta sustentao dos seus ministros e para satisfazer outros fins prprios ... o que faz todos os meses a Igreja do Reino de Deus. Em Portugal, o dinheiro dos crentes colocado num envelope, que os obreiros da IURD, sempre atentos, recolhem e controlam. No Brasil, existe uma caderneta individual onde cada seguidor regista, religiosamente, as suas contribuies para a igreja de Macedo. Se a escrita sobre arquitectura no abunda entre ns, Manuel da Graa Dias tem dado um excelente contributo para alterar essa situao. No s as memrias descritivas dos seus projectos so, s por si, criativos textos sugerindo, com especial eficcia, uma multiplicidade de imagens a partir dos desenhos que apresenta, como a sua produo para a imprensa revela idntica qualidade. Pelo uso de uma linguagem despojada de excessivas conotaes tcnicas inibidoras da aproximao do leigo, pela originalidade dos sentidos que ajuda a desvendar nos temas do quotidiano.. Os portugueses esto em forma no surf. Pelo menos os jovens que esto a participar no Campeonato Europeu de juniores, a decorrer at domingo em Newquay, Inglaterra. no surf, Joo Macedo, Andr Pedroso (em sub-18), Ruben Gonzlez e Pedro Monteiro (em sub-16) esto j nas meias-finais. No haver greve nem bloqueio poca da NBA, a Liga Norte-Americana de Basquetebol profissional, independentemente de ainda no existir acordo sobre o tecto salarial, decidiram os proprietrios e jogadores. Este anncio vem sossegar aqueles que temiam que sucedesse no basquetebol o que j aconteceu no basebol e no hquei no gelo -- a interrupo ou adiamento dos campeonatos. Sendo assim, e uma vez que o acordo para o contrato colectivo de trabalho, cujas negociaes se arrastam h meses, est previsto para breve, tudo indica que a NBA comear, como planeado, a 4 de Novembro, ou seja, na prxima sexta-feira. Ainda no foyer, pde-se desde logo perceber que no Servio de Msica da Fundao Calouste Gulbenkian se est em momento de reflexo, j que foi distribudo ao pblico um questionrio onde, entre outros pontos, se pede uma opinio sobre a periodicidade de apresentao da msica contempornea. Precisamente no concerto dessa noite se propunha um percurso pela msica mais recente intersectado de forma exemplar por sonoridades do passado. A enquadrar as obras de Wolfgang Rihm La lugubre Gondola / Das Eismeer e de Luigi Nono No hay caminos, hay que caminar .. 
Andrei Tarkovsky, tivemos a msica de Giovanni Gabrieli (Canzone a tre cori e In Ecclesiis) em orquestraes da autoria de Bruno Maderna. A relao entre Luigi Nono e Giovanni Gabrieli ultrapassa a simples passagem e vivncia na cidade de Veneza, situando-se principalmente ao nvel da explorao das potencialidades musicais inerentes espacializao sonora. Apesar de no se ter procedido a qualquer distribuio de msicos no espao do Grande Auditrio Gulbenkian, imagem do que se fazia com as obras do msico renascentista na Catedral de So Marcos, a orquestrao de Maderna reflecte um total respeito pelas estruturas de dilogo entre coros com uma constituio tmbrica diferenciada. Desde o primeiro dia do ano, entre os palestinianos procurados, 13 foram mortos pelas foras de segurana e 15 feridos mas 40 foram libertados pelo Exrcito, disse Yatom durante uma visita de inspeco regio de Belm (sul da Cisjordnia). Quatrocentos outros so ainda procurados pelas foras de segurana. Informaes do Exrcito, em termos globais, do conta de 23 palestinianos mortos e 211 feridos por soldados, desde o dia 1 de Janeiro, na Cisjordnia. O nmero confirmado pelos palestinianos. Danny Yatom disse que a fraca proporo de palestinianos mortos em relao ao nmero de palestinianos presos prova que o Exrcito no dispara seno em casos excepcionais, contrariamente a certas alegaes. R. -- No h uma circulao do pensamento. H muitos poucos debates, as ideias que so objecto de discusso internacional -- as questes relacionadas com a realidade virtual, a com o ps-humano, ou com as auto-estradas informticas -- no se discutem em Portugal. Por outro lado, no h radicalismo. Sem radicalismo no h qualquer evoluo da sociedade. P. -- Uma questo bvia: em ano de eleies, e embora o debate sobre a situao portuguesa seja feito em Fevereiro, a iniciativa no se colar demasiado s foras polticas -- socialistas-Plataforma de Esquerda -- que esto frente da Cmara de Cascais? Apesar dos elevados montantes que o Banco de Portugal tem vindo a injectar no sistema ao longo do actual perodo de constituio de reservas de caixa, a sesso de ontem do mercado monetrio interbancrio voltou a apresentar-se bastante pressionada pela procura de fundos, com os primeiros nveis de oferta no curto prazo a serem superiores ao fecho da sesso de ontem. H medo no Sul dos Estados Unidos. As igrejas frequentadas pela populao negra esto a ser incendiadas e os lderes das comunidades acreditam que so manobras de intimidao dos que sonham com o regresso da supremacia branca. Casos como o da Sopete, Lusotur ou Aliana Seguradora ilustram de forma pouco favorvel a segurana que os investidores podem ter nas previses adiantadas por algumas empresas ... Na Sopete, os resultados alcanados em 1991 no chegam sequer para cobrir os dividendos de 125 mil contos estimados no primeiro semestre, isto apesar de no comentrio ao balano do primeiro semestre de 1991 ser referido que se aponta para o segundo semestre um real crescimento da actividade do todo da empresa, e em especial da rea do jogo e da rea hoteleira. Enfim, e esse ter sido o ponto decisivo da sua interveno, Antnio Tanger sublinhou que a ocorrncia de um novo problema poderia criar a Portugal algum embarao diplomtico. No quadro poltico actual, em que Portugal ocupa a presidncia das Comunidades, era de todo indesejvel que a diplomacia portuguesa tivesse de intervir para protestar junto de qualquer pas dessa regio. 
Isso poderia revestir-se de particular gravidade se o ataque ocorresse dentro de Angola, onde h poucos dias dois padres e uma freira foram mortos num ataque de bandidos. Aps esta interveno, os membros da expedio reuniram-se com a organizao da viagem para deliberar sobre o que fazer. Foi na sequncia desse encontro que Pedro Villas Boas decidiu dar por finda a expedio, inviabilizando desta forma o prosseguimento da viagem anteriormente definido por uma pequena coluna expedicionria. Wayne Barker, cineasta arborgene da Austrlia, fez a apologia duma cultura viva, que conserve a herana dos nossos pais mas com os olhos postos no futuro. Walter Saunders, outro realizador e produtor arborgene, sublinhou a pilhagem cultural que a sua comunidade continua a sofrer, por exemplo no domnio das imagens: dos motivos arborgenes utilizados para ornamentar loua e tecidos, os smbolos que ornam os avies australianos, os boomerangs fabricados e vendidos por australianos brancos, sem que os arborigenes recebam quaisquer direitos .. um disco que estava j meio feito no Canad com canes compostas antes do 25 de Abril. Acabou por ter uma vertente mais poltica em consequncia dos acontecimentos imediatos, por uma questo de sensibilidade linguagem da poca e da vontade de participar nela. (...) Mas a pesquisa formal basicamente a mesma. Podem dizer-se coisas muito vlidas politicamente com uma linguagem pobre e que no entra nas pessoas. Cecilia Bartoli cantou em So Carlos, em Lisboa, no passado 24 de Maro. No numa pera de Rossini, como reclamaria a misso do teatro, no ano do bicentenrio. Mas oferecendo um conjunto de peas para voz e piano do compositor, numa memorvel demonstrao da vitalidade da redescoberta do canto rossiniano. Se h esttica de canto que foi deteriorada quase irreversivelmente com os novos conceitos de drama musical que imperaram durante a segunda metade do sculo passado e a primeira metade do presente, essa foi certamente a perseguida por Rossini. Tourabi foi tambm o artfice de uma aliana nica, entre um pas rabe sunita -- o Sudo -- e um pas no rabe xiita -- o Iro, selada quando o Presidente iraniano, Ali Akbar Hashemi Rafsanjani, visitou Cartum em Dezembro de 1991. impossvel saber se foi Rafsanjani quem manipulou Tourabi, ou vice-versa, mas seguro que Sudo obteve vantagens polticas e financeiras, porque se transformou no coordenador de vrios movimentos islamistas, observou Balta, enumerando a FIS, na Arglia, a An-Nahda, na Tunsia, a Irmandade Muulmana, na Jordnia, o Hezbollah, no Lbano, a Gama'at Islami, no Egipto, e o Hamas, na Faixa de Gaza -- uma internacional de barbudos. A faixa etria e as horas do dia mais perigosas coincidem em todos os distritos, mas Beja foi o que registou menor incidncia de condutores alcoolizados em Dezembro: apenas 2,4 por cento. Durante o ltimo ms do ano passado a Brigada de Trnsito aplicou o teste de alcoolmia a 25.093 condutores, em todo o continente e detectou 1.578, ou seja, 6,2 por cento do total, que conduziam sob o efeito do lcool. No foi por isso, contudo, que a Vidisco se sentiu menos ofendida, de modo que acabou por decidir meter o caso em tribunal, porque aquilo no se faz. A televiso paga por todos ns, para qu? Para eles fazerem publicidade privada, como se fosse um programa pago? No digno de um programa de um organismo oficial, que na verdade de uma associao de multinacionais. Tentmos encontrar logo um advogado que estivesse dentro da legislao, o que foi difcil por causa da quadra de festas. 
Agora, temos um advogado e se ele considerar que há suficiente matéria legal para isso, vamos a tribunal. Resta saber quem poderá ser processado. Mas será que vai ficar por aqui? O que houve entre a primeira e a segunda metade desse ano parece, assim, uma espécie de intervalo -- um tempo fora da História. Enquanto o possível medo de represálias fazia com que as acusações dos jogadores contra Octávio nunca se tivessem concretizado, ele, em contrapartida, teve a coragem de dar a cara em alguns momentos de grande polémica. Assim, no seu primeiro Porto-Sporting como adjunto, com a rivalidade Pinto da Costa-João Rocha estava no auge e com Artur Jorge a não conseguir ganhar nas Antas, Octávio seria acusado de agredir à cabeçada e a murro alguns responsáveis sportinguistas, indo assim mais além do que o próprio treinador principal que, impávido e sereno, comandou a sua equipa sem se envolver nas guerras marginais em que Pinto da Costa e Octávio mostravam estar como peixes na água. Quando Quinito foi contratado para o FC Porto e se propôs recuperar, entre outros, Gomes e Madjer, o homem de Palmela afastou-se e, nas suas costas, começaram a aparecer muitas das histórias a seu respeito que, até então, tinham ficado escondidas nos segredos dos bastidores. Mark Miller é o treinador-jogador, mas é possível que Toni Formosa venha a ser uma espécie de manager da equipa, com assento no banco, se aceitar a proposta que lhe fez o presidente do clube. Terá que dar uma resposta até à próxima semana. Nos jogos com o Ekranes, os dois golos foram marcados pelo líbero Buttigieg, um jogador elegante e talvez o melhor do país. Os outros internacionais são Cluett (guarda-redes), Caucchi e Buttigieg (defesas) e Buhagiar (normalmente defesa-esquerdo). R. -- Parecer-me-ia lógico, num país pobre, de recursos escassos como o nosso, que, no mínimo, as famílias e os estudantes suportassem uma percentagem correspondente à dos ingleses. P. -- O que iria tornar o ensino superior um ensino mais elitista, dos ricos. Abriu ontem ao público a primeira iniciativa da galeria do Instituto de Arte Contemporânea (IAC). No Pavilhão Branco do Museu da Cidade, em Lisboa, a exposição de Adriana Varejão inaugura uma nova área de intervenção do IAC. Os objectivos políticos são clarificados no texto do director: criar um local de reflexão, enfatizando a cidade como sede do pensamento e de problematização de temas que fazem parte do nosso tempo, num campo que é, nesta fase, dedicado à multiplicidade de formas e conceitos que se originam a partir do conhecimento de culturas e geografias que nos são de menor adjacência. Assim, a galeria pretende acolher imagens ligadas ao que se convencionou designar por multiculturalismo. A intenção descentralizadora do programa Rotas, assumido pelo Ministério da Cultura e que o IAC integra, é pois confrontada com uma iniciativa tendente a centralizar a discussão artística e a capitalizá-la. Iniciar a programação através da apresentação da obra de Adriana Varejão é uma opção politicamente perfeita: porque é uma artista brasileira, porque é uma artista que recorre (entre outros) ao manancial infinito das imagens deixadas no Brasil pelo colonialismo português, porque está integrada nos circuitos mais dinâmicos da internacionalização artística actual. Adriana Varejão situa-se no campo das reflexões sobre o cruzamento de culturas. Nesse sentido, cruza imagens dos diferentes tempos e civilizações, criando imagens de síntese e procurando problematizar, através do objecto artístico contemporâneo, os resultados de uma experiência histórica universal.
Um catálogo com textos de Isabel Carlos (do próprio IAC) e Paulo Herkenhoff (comissário para a Bienal de São Paulo) completa a apresentação. Leonor Beleza, presidente da mesa do Congresso do PSD, foi convidada a colaborar com o Governo no debate sobre as reformas do sistema eleitoral. Os jornalistas tinham perguntas agressivas para o Presidente, mas ficaram algo desarmados com o ar triunfal de Clinton. Um ainda indagou se ele tencionava mesmo lançar a invasão, contra a opinião da maior parte dos americanos e do Congresso. Clinton, que afinal de contas não teve de lançar invasão nenhuma, deu exemplos de outras intervenções militares, como em Granada ou no Panamá, em que os Presidentes não pediram autorização ao Congresso. Nem todas as decisões podem ser populares. Não se pode conduzir a política de acordo com as tendências das sondagens. Não creio que o povo americano queira que eu fizesse isso. como se pode dizer aos americanos que Cédras é um crápula e dois dias depois afirmar que se trata de um honrado militar? Clinton balbuciou qualquer coisa sobre a inutilidade dos ressentimentos e vinganças, citando Jean-Bertrand Aristide, o Presidente exilado que deverá, até 15 de Outubro, regressar ao Haiti, e de quem a CIA diz ser um doente mental com perigosas ideias anti-americanas. O mesmo documento afirma que a forma jurídica e os critérios de gestão nela implícitos se revelaram inapropriados à missão que foi cometida ao Teatro de São Carlos, quando da sua transformação em EP em 1980: organizar de forma permanente espectáculos de música, ópera e bailado e para os divulgar pelo país, para dar a conhecer as obras de autores nacionais e para formar e manter um corpo de cantores / actores e de músicos. Ao longo de quase 200 anos, a história do Teatro Nacional de São Carlos acompanhou a vida cultural portuguesa, sendo imperioso criar condições para que, no novo renascimento que Portugal atravessa, as estruturas da área da cultura se adequem ao acrescido dinamismo da sociedade. Em conclusão, será de natureza privada a entidade responsável pela nova gestão do Teatro Nacional de São Carlos, diz, a terminar, o comunicado. Os Missionários do Espírito Santo vão alargar à cidade do Porto o trabalho que já desenvolvem na região de Lisboa, de apoio aos imigrantes lusófonos que residem em Portugal. A decisão deverá ser concretizada a curto prazo, depois de o capítulo provincial (assembleia) daquela congregação religiosa ter debatido a acção do Centro Padre Alves Correia (Cepac), de Lisboa -- o organismo, da responsabilidade dos missionários, que esteve em diversas acções de apoio à legalização dos imigrantes clandestinos, e que denunciou a corrupção existente na delegação de Faro do Serviço de Estrangeiros e Fronteiras, investigada pela Polícia Judiciária (ver PÚBLICO 20/07/94). Nesse sentido, eu gostaria de ver alargado o trabalho do Cepac a cidades como o Porto, porque ele é um sinal de atenção à missão no nosso país. Da última vez em que estivemos em Paris nossa casa foi um apartamento na Rue de la Pompe, no 16 arrondissement. Uma alternativa mais barata para hotel, com a vantagem adicional de nos permitir simular uma certa domesticidade francesa. Viajamos para fugir da nossa rotina mas somos seduzidos pela rotina dos outros e ter um lugar certo onde comprar a baguette para o café da manhã passa a ser um indispensável prazer parisiense. Conhecer não a vida mas a vidinha de um lugar depende de estabelecer-se na comunidade por alguns anos. Durante dez dias passamos várias gerações na Rue de la Pompe. Na nossa vizinhança imediata existiam quatro agências funerárias e duas lojas especializadas em queijos.
A concentrao de funerrias se deve, imagino, proximidade do cemitrio de Passy. Lojas s de queijos so comuns em Paris e uma experincia turstica interessante e barata entrar numa delas e respirar fundo. Voc sai com a certeza de que chegou muito perto do corao selvagem da Frana ou ento de uma cura definitiva para a sinusite. A coincidncia de funerrias e queijarias na nossa circunstncia no significava nada, portanto, mas no pude escapar da mrbida observao de que estvamos cercados pelas pompas da morte na Rue de la Pompe. O queijo tambm a encomendao cerimonial de um morto, no caso uma poro de leite. James Joyce chamou o queijo de defunto do leite, e a qualidade de um queijo o resultado da aco de bactrias vivas num corpo morto. Ou seja, de apodrecimento controlado. o que acontece tambm na preparao de carnes secas e aves e peixes faisandes e no correcto acondicionamento de corpos humanos para a eternidade. os dois assimilam o facto que a vida uma doena incurvel com uma taxa de mortalidade de 100 por cento, e se tornam melhores por isso. Rodrigo Mil Homens venceu ontem, na classe Runabout Limited, a segunda etapa do Europeu de Jet Ski, que durante o fim-de semana se disputou no lenol de gua do Rio Lima, na praia do Prior, em Viana do Castelo. Miguel Valente, por seu lado, foi o vencedor na classe Sport Limited. Estes foram os nicos portugueses a conseguir um primeiro lugar nas vrias classes que fizeram parte desta etapa do Europeu, que contou com a participao de 186 concorrentes, 28 dos quais portugueses. Finalmente, apesar de j ter sido editado em 1992, merece referncia o clssico norte-americano Spiderman -- o Homem-Aranha --, uma criao original de Steve Dikto (desenho) e Stan Lee (texto), que evocado por ocasio do 30 aniversrio da sua criao (1962) numa edio em comic-book de luxo pela editora espanhola Forum (Planeta-De Agostini). La Saga del Traje Alienigena tem a assinatura de Ron Frenz, Ruck Leonardi (desenho), Tom DeFalco e Roger Stern (argumento). BB -- Em geral? [ Risos ...] A funo da arte a de expandir as possibilidades da linguagem visual. Por outras palavras, o que a arte nos d mais possibilidades de ver as coisas de diferentes maneiras. E creio que o bastante. O mesmo se passa em relao literatura. Acredito que a funo da literatura e da arte a de expandir as possibilidades do que as pessoas podem dizer, mas no necessariamente para fazer delas melhores seres humanos. possvel que tenha essa consequncia, mas no se deve misturar as coisas. Quais que so as prioridades? O fundamental, para mim, que a arte seja livre de constrangimentos. P -- H uma nova moral nos Estados Unidos? J presente no local sob os auspcios das Naes Unidas para garantir o respeito das zonas de excluso area decretadas na Bsnia, a Aliana Atlntica no dever encontrar grandes dificuldades no alargamento da sua misso. Mas, apesar da imagem de coeso que ontem se esforaram por exibir durante uma reunio no Luxemburgo, os ministros dos Negcios Estrangeiros da CE e o secretrio de Estado norte-americano, Warren Christopher, no esconderam algumas nuances sobre a estratgia a seguir face Bsnia. No existe, pois, um conceito operacional de urgncia, como bem atestam as urgncias dos hospitais. 1 Prioridade -- situaes exigentes: situaes mais ameaadoras para a vida e que necessitam de interveno instantnea. Exemplo: fractura da laringe com obstruo respiratria completa. 2 Prioridade -- situaes emergentes: situaes que exigem interveno imediata num perodo de poucos minutos. 
Exemplo: pneumotrax de tenso [ ar dentro da cavidade pleural a comprimir os pulmes e o corao, eventualmente ]. 3 Prioridade -- situaes urgentes: situaes que necessitam de interveno dentro da primeira hora. Exemplo: hemoperitoneu devido a hemorragia intra-abdominal contnua [ leso dentro da cavidade abdominal a sangrar continuamente ]. 4 Prioridade -- situaes deferveis: situaes que podem ou no ser imediatamente evidentes, mas que necessitaro de tratamento subsequente. Outro aspecto que suscita preocupao o uso, por vezes indiscriminado, de antibiticos. Entre as resistncias a estes frmacos provocadas por este tipo de actuao -- por parte dos profissionais e da populao --, inclui-se, como um dos exemplos mais graves, a ineficcia dos medicamentos contra a tuberculose, doena cuja incidncia continua a deixar Portugal muito mal colocado no panorama europeu. A emergncia da sida, qual est associada uma srie de infeces oportunistas, o problema da hepatite C (recentemente identificada e ainda sem cura nem tratamento) e o recrudescimento da papeira (mais de onze mil notificaes durante o ano passado) constituem outras das facetas mais negativas da situao portuguesa em matria de doenas transmissveis. Mas a margem de 87 para 10 a favor da Lei de Reautorizao dos Institutos Nacionais de Sade, de que fazia parte a clusula respeitante ao tecido fetal, fez renascer as esperanas, entre muitos dos interessados em prosseguir as pesquisas envolvendo tecido fetal, de que esta medida tenha fora suficiente para ultrapassar o veto presidencial. A votao representa tambm uma vitria significativa para os que se propem efectuar este tipo de investigao, uma vez que conseguiram convencer muitos senadores antiaborto que defender a utilizao de restos fetais no a mesma coisa que defender a prtica que lhes d origem. Voltando aos terrores da Lapa dos Morcegos: fartos de tantas vtimas, os aldees decidem armar um cavaleiro capaz de defrontar a fera. Terrvel foi o combate e assustadores os gritos da batalha, que o improvisado cavaleiro acaba por vencer, expulsando a fera, que foge por entre o pblico e vai aterrorizar outras paragens. De entre os que assistiam a esta pea da companhia Aquilo, as crianas eram, sem dvida, as mais fascinadas. Algumas choravam, aterrorizadas, outras respondiam aos urros do monstro, e um petiz chegou mesmo a propor aos pais uma incurso na gruta, para mat-la, fera, pois ento. No a desvalorizao que vai resolver os problemas de fundo do sector txtil. assim que Jorge Seabra, gestor txtil, avalia a recente desvalorizao da moeda portuguesa que, no entanto, considerou insuficiente. Para Jorge Seabra, a questo tem de ser posta a dois nveis. Por um lado, a efectiva perde de competitividade dos produtos portugueses face a produtos concorrentes provenientes do espao europeu, em especial da Itlia e da Espanha. nesse nvel que, em sua opinio, a desvalorizao foi insuficiente. Mas, por outro lado, no a desvalorizao do escudo que vai resolver o problema de fundo de competitividade da indstria portuguesa. Uma autocombusto ocorrida ontem de manh em depsitos da empresa de tratamentos pretolferos Nesta, em Sines, no chegou a provocar ferimentos nos operrios nem prejuizos avultados. A Sociedade Lisboa 94 poder vir a desvincular-se do apoio, de 25 mil contos, produo da pera O Corvo Branco, de Philip Glass e Bob Wilson, com libreto de Lusa Costa Gomes, encomendada pela Comisso Nacional para as Comemoraes dos Descobrimentos (CNCDP). 
Marcada inicialmente para o fim da presidncia portuguesa das Comunidades, em Junho de 1992, depois anunciada para Abril de 1994, no Centro Cultural de Belm (ver PBLICO de 10/2/1992) a pera s poder ser apresentada em Janeiro / Fevereiro de 1995. Vasco Franco, vereador responsvel pela PM lisboeta, contesta e salienta que a aco da polcia alfacinha mais importante do que parece primeira vista. Alm do controle da venda ambulante, os efectivos acorrem s situaes de catstrofe, fiscalizam bairros degradados, para evitar o aparecimento de novas barracas. Muitas vezes, a PM a nica autoridade que passa em determinados bairros. S este ano, at Setembro, a polcia alfacinha j retirou da via pblica 1600 veculos abandonados, misso essencial numa cidade com pouco estacionamento, sublinha. Alm disto, ainda acompanham as aces de despejo. A verdade que se tratam de tarefas executadas noutros municpios por fiscais camarrios. Mas, com menos eficcia que a PM, assegura Vasco Franco. Entretanto, Couto Ferreira apresentava ontem um alto teor de cido rico no sangue e encontrava-se em acelerado emagrecimento, segundo informou o director-clnico do Hospital Prisional S.Joo de Deus, Manuel Pinu. Por agora, as alteraes registadas ainda no so irreversveis mas a presena de um elevado teor de cido rico no sangue pode ter consequncias a nvel renal, frisou o citado mdico. Couto Ferreira encontra-se a ser acompanhado pelo mdico mas recusou qualquer tipo de tratamento, caso entre em coma. O lder do PSD-Porto, Lus Filipe Menezes, desafiou ontem o presidente da Cmara do Porto a assumir o mandato de deputado durante a discusso do Oramento de Estado para 1998 para que prove o seu empenho na defesa da regio, mas Fernando Gomes, em jeito de comentrio, passou ao lado do repto. s picardias e jogos de palavras do doutor Lus Filipe Menezes no tenho rigorosamente nada a dizer, porque tenho mais com que me preocupar, disse. Este grupo religioso j foi visitado pelas autoridades policiais, em fins de Novembro, no seguimento de queixas em matria de urbanismo, contra esttuas de dimenses desmesuradas erguidas perto do mosteiro. Os cavaleiros foram, inclusive, proibidos de construir um templo pirmide de propores megalmanas. Entretanto, nos Estados Unidos, os sobreviventes do desastroso ataque policial contra os membros do culto davidiano, o massacre de Waco, juntamente com os familiares das vtimas mortais, vo processar o presidente norte-americano, Bill Clinton, e o FBI. O exerccio do poder nas organizaes , por vezes, descrito em termos de jogos e de jogadores. Os autores de The Strategy Process referem o especialista em cincias polticas Graham Allison, que, em 1971, descreveu os jogos nas organizaes e nos governos como complexos, subtis, simultneos e sobrepostos. a coligao de regras, em cada momento define o jogo. Mintzberg e Quinn identificam uma srie de jogos de poder, de entre os quais retirmos e adaptmos os que apresentamos no quadro junto. O essencial, na compreenso dos jogos de poder, conseguir saber quem est envolvido e onde est a fora em cada situao concreta. A gesto de topo -- alvo a atingir em muitos destes jogos -- pode ganh-los se os conhecer bem e souber jog-los. 23 de Maio -- Manifestaes espontneas em protesto contra o facto de Daniel Cohn-Bendit, lder do Maio de 68, ter sido proibido de residir em Frana. A Unio Nacional dos Estudantes de Frana, o sindicato dos Professores do Ensino Superior, o Movimento 22 de Maro e os Comits de Aco Liceal convocam uma manifestao para o dia 24 em solidariedade com Cohn-Bendit. 
cerca de 30 mil jovens manifestam a sua solidariedade com Cohn-Bendit. s 20 horas, o general De Gaulle fala ao pas, propondo a realizao de um referendo. Os manifestantes reagem interveno, agitando lenos brancos e gritando Adeus De Gaulle. O chefe do Estado-Maior da Fora Area, general Mendes Dias, decidiu proceder a algumas alteraes nos quadros das Oficinas Gerais de Material Aeronutico (OGMA), tendo sido determinada, na passada sexta-feira, a substituio do general Rui Espadinha, na direco daquele estabelecimento fabril. Interinamente, o brigadeiro Portela ir assegurar a chefia das OGMA, mas, nos prximos meses, dever suceder uma reestruturao mais profunda. Seria bom que os lderes partidrios pr-referendo fossem mais claros expondo razes objectivas, como por exemplo as de foro constitucional, tico ou consuetudinrio. Os cidados gostariam de saber porque se deseja retirar aos deputados o direito que tm de aplicar a sua sapincia e poder na feitura de leis que a todo o povo e ao pas dizem respeito. E tambm gostaramos de saber porque se exige a transferncia de tais poderes para o veredicto do povo, sendo sabido que se trata de assuntos de elevada complexidade poltica, social e jurdica, para os quais o cidado comum tem menor preparao tcnico-cientfica. A investigadora acha mesmo que no h razes para se excluir as protenas animais das raes, desde que exista uma certificao da origem dessas protenas que garanta que elas provm apenas das partes no perigosas das carcaas. Antes de 1988, a contaminao era imparvel e, hoje, estamos a tomar precaues excessivas numa altura em que o risco j muito mais pequeno. E diz que, daqui a uns anos, a BSE, que era uma doena rara at h dez anos, tornar a ser novamente isso: uma doena rara. A guerra entre os jornais desportivos est no ponto de rebuado. A Gazeta, que passa a ser editada cinco dias por semana, parece em vias de alcanar o seu objectivo: obrigar o arqui-rival A Bola a tornar-se um jornal dirio, o que, segundo P&N apurou, ser um facto a partir de Janeiro do prximo ano. E, de caminho, instalou-se na redaco de A Bola um clima de autntico PREC, com a demisso do chefe de redaco, Joaquim Rita, e a venda, por parte de Aurlio Mrcio, da quota que detinha no jornal. Mrcio amealhou 70 mil contos e est de malas feitas para o Record, acompanhado, na sua transferncia, por Joo Alves Costa e Norberto Santos. Quem disse que as transferncias eram s no defeso? J.M. -- A que est. Temos prestado uma ateno diminuta interaco entre a realidade fsica, o problema ambiental e o sistema poltico. Se pegarmos no caso de frica, a grande quantidade de gente que ali morre de fome no resulta do problema ambiental ou de uma agricultura subdesenvolvida, mas dos governos. So pases mal geridos que, por isso, no conseguem fazer chegar os alimentos aos seus povos. Curiosamente, na Amrica do Sul, que dantes estava bem pouco desenvolvida, as populaes conseguem bastar-se em alimentos e vivem da exportao de alguns produtos. Portanto, voc deveria preocupar-se com os sistemas polticos vigentes. E.G. -- Continuo a no estar convencido de que se trate apenas de um problema poltico. Porque a produo por hectare em frica tem diminudo drasticamente, o campo est a desertificar-se. O lenol fretico baixa todos os anos, as pessoas tm de cavar poos cada vez mais fundos. Muita dessa desertificao tem sido disfarada com o uso crescente de adubos, no tanto em frica como na ndia e noutros stios. uma causa macia de subnutrio e fome. 
Na semana passada, Fidel Castro avisara que se Washington no alterar a poltica quanto aos emigrantes cubanos poder ficar sujeita a uma inundao, pois que a Havana deixar de tentar impedir os seus cidados de partir e os parentes de os virem buscar. O Presidente levantou o espantalho de um xodo como o de 1980, ano em que 125 mil cubanos se dirigiram de barco para os Estados Unidos, onde tantos dos seus compatriotas j se encontravam a viver no estado da Flrida. Os bombardeamentos podem produzir uma reviravolta na guerra que ameaa, seno a integridade territorial do pas pelo menos a estabilidade do regime de Mobutu Sese Seko que o domina h mais de trs dcadas, admitiram analistas citados pela agncia francesa. Mas so escassas as informaes sobre as vrias frentes de combate, desconhecendo-se at que ponto as foras governamentais progridem no terreno. S h rumores, por exemplo que os militares tomaram, com a ajuda de antigos soldados ruandeses (hutus) e mercenrios, posies perdidas h dez dias na estrada para sudeste, para Walikale, e a cidade de Bafwasende, a Leste, e que a aviao se prepara para atacar Punia, situada entre Kisangani e Bukavu. Assim, o projecto de lei ontem aprovado por unanimidade na Assembleia da Repblica, em vez de afirmar que as taxas ficam suspensas at ao fim do ano (como o texto inicial, da iniciativa do social-democrata Rui Rio, previa), passou a estabelecer como limite para a suspenso a entrada em vigor do diploma que regule a utilizao de cartes de dbito de pagamento automtico. E Rui Carp, o vice-presidente da bancada laranja responsvel pelas questes econmicas, foi claro ao afirmar no plenrio que, num curtssimo perodo de tempo o Governo dever deixar esta questo resolvida. Outras das alteraes introduzidas no projecto de lei diz, alis, respeito iniciativa legislativa que inevitavelmente se seguir. que, enquanto o primeiro projecto afirmava que uma vez suspensa a taxa, a Assembleia da Repblica apresentaria um projecto de regulamentao do uso dos cartes, o projecto ontem aprovado apenas afirma que dever ser aprovada at 31 de Dezembro do corrente ano legislao que preencha cabalmente o vazio legislativo existente. Ou seja, no ser necessariamente o Parlamento a faz-lo, e a convico dominante na direco da maioria que, a partir de agora, o Governo ir ter uma participao decisiva. A teoria diz que necessria a existncia de uma certa distncia entre o charme e o anticharme para o Psi ser produzido. Se ns cumprimimos a matria, no deixamos os quarks distanciarem-se o suficiente para produzirem o Psi, explica Paula Bordalo. H uma distncia mnima para o Psi se formar e teoricamento se no se conseguir observar o nmero suficiente de Psis em relao fsica normal, porque podemos estar na presena do plasma, continua. Desde as vrias recolhas de dados com a experincia do oxignio, passando pelo enxofre e agora com o chumbo, o detector de mues -- a partcula em que o psi se decompe e que os fsicos procuram para saber se o psi foi produzido -- foi sempre mostrando uma diminuio do psi. O zimbabweano admitiu mesmo que Sampras dos poucos jogadores que costuma observar em aco. Ele to suave. Todas as suas pancadas so quase perfeitas, justificou. Com esta presena nos quartos-de-final, o seu melhor resultado em provas do Grand Slam, Black angariou pontos suficientes para subir cerca de 30 lugares no ranking, mas no s isso que o tenista africano leva de Flushing Meadow. 
O mais importante saber que sou capaz de bater jogadores do ' top-10 ', afirmou ele, referindo-se eliminao neste Open de Michael Stich (8 ATP) e Thomas Enqvist (9 ATP). Mas o dia j tinha comeado bem para as cores norte-americanas, com o triunfo de Meredith McGrath e Matt Lucena na final de pares mistos. O conjunto norte-americano derrotou a tambm norte-americana Gigi Fernandez e o checo Cyril Suk por 6-4, 6-4, conquistando o ttulo e o cheque de 7500 contos. Tnhamos decidido desde o primeiro encontro que amos divertir-nos e que no ficaramos zangados se as coisas corressem mal, afirmou Lucena, que tambm revelou que os dois concordaram em jogar apenas cinco dias antes de a competio se iniciar. McGrath j tinha sido finalista nesta variante em 1989, o ltimo ano em que o ttulo tinha revertido tambm para um par da casa. Segundo Artur Moreira, a notria leu a escritura e s faltava assinar. Depois, continua, disse que s o faria na presena de duas testemunhas. No dia seguinte, Armanda e Moreira voltaram ao cartrio, acompanhados das testemunhas. Sem qualquer explicao, a notria disse que no fazia a escritura, recorda o vendedor. E as testemunhas no chegaram a ser ouvidas. Seguiu-se uma troca de palavras menos amistosa e Elvira Maris acabaria por chamar a polcia, declarando ter sido maltratada e que o bom funcionamento do cartrio havia sido perturbado. A autoridade chegou e procedeu identificao dos dois indivduos. O que eu pergunto por que que para a polcia a minha identificao suficiente e para a senhora notria no , questiona Maria Armanda. Eu estou a ser alvo de discriminao. Ela est a brincar comigo, com o vendedor, com as testemunhas, com todos ns, conclui. Por sua vez, Moreira afirma conhecer outro travesti que comprou uma casa j h 18 anos, sem ter tido qualquer problema. Contactada pelo PBLICO, Elvira Maris declarou no poder fazer uma escritura quando aparece uma pessoa com uma identidade diferente da presente no documento comprovativo da sua identidade. A Alemanha no lhes concedeu asilo e os seus pases muitas vezes no os querem de volta. So detidos, para evitar tentaes de clandestinidade. Nas prises de deportao alems, esto cerca de quatro mil indesejados espera da expulso. At l, podem passar 18 meses em celas superlotadas. Sem terem cometido qualquer crime. Sem nada poderem fazer. A no ser esperar pelo destino de quem nasceu no pas errado. No incio do sculo, era uma esquadra de polcia. As celas tinham capacidade para albergar at 140 detidos por um curto perodo de tempo. Perante as novas necessidades, o velho edifcio na Kruppstrasse foi reciclado. Hoje a mais conhecida priso de deportao berlinense. Portugal uma rebaldaria. E como no h Constituio, ento vamos todos tomar atitudes arruaceiras. De bandeira branca ao ombro, reclamando o encerramento das grandes superfcies ao domingo, o comerciante, de face congestionada, resumiu sua maneira a inteno dos 150 lojistas presentes na reunio realizada anteontem noite, em Vila Nova de Gaia. Depois de discutidos o melhor dia e a melhor hora para a manifestao, a maioria pronunciou-se pelo ajuntamento no prximo dia dois de Maro, sbado, a partir das quatro horas da tarde, na sala de visitas do Porto: a Praa General Humberto Delgado. Um falso mdico passeou-se durante pelo menos dez anos pelo Hospital de Santa Maria. O doutor Dinis, segundo a Polcia Judiciria, chegou a dar consultas no estabelecimento de sade lisboeta e a encaminhar pacientes para especialistas a quem convenceu da condio de colega. 
O director de Santa Maria admitiu ontem ter sido detectado um indivduo que se fazia passar por mdico, imediatamente entregue s autoridades, mas garantiu desconhecer que ele desse consultas. O deputado do PSD Guilherme Silva esteve reunido h dias com as estruturas sindicais dos juzes, a quem comunicou o facto de o seu partido, apesar de estar sensibilizado para as queixas dos juzes, apenas tencionar desbloquear a situao num contexto em que se gere na Assembleia da Repblica um amplo consenso democrtico, designadamente com a adeso do PS que, segundo a direco parlamentar do PSD, se tem mostrado fechado a considerar o assunto numa perspectiva de regime. Esta perspectiva de regime de que fala o PSD implica a busca de uma soluo no quadro do Estatuto Remuneratrio dos Titulares dos Orgos de Soberania, e no apenas uma alterao pontual deste diploma com base na interpretao legal avanada pelo Sindicato dos Magistrados do Ministrio Pblico. Acredito que se referissem a animais indefesos, pois o termo cobardia no se aplica a quem enfrenta toiros. Apresenta esses dois grandes escritores -- por convenincia --, mas olvida nomes importantes no mundo das letras e das artes. No lhe diz nada Ernest Hemingway, Prmio Nobel da Literatura, um homem cativado pela festa e que escolheu a terra de Espanha para sua ltima morada? E Federico Garcia Lorca, que deixou to belas obras e alguns sentidos poemas dedicados a um famoso toureiro? Esquece Picasso, Goya, Mariano, Benlliure, que honraram a festa com as suas pinturas e esculturas? Quantos artistas, quer estrangeiros quer nacionais, a festa de toiros tem motivado. o cinema nasceu em Portugal? E o football? Contudo, somos capazes de fazer isso tudo e bem. Por fim, restar um lote de aces que ser vendido na Bolsa de Lisboa. Os preos das aces para investidores nacionais sero mais vantajosos comparativamente aos valores para os no residentes. No Ministrio da Indstria existe o desejo de manter o mximo de capital possvel em mos nacionais. A justificao que este um sector em que podemos dar cartas, pelo que no convm uma alocao excessiva de poder de deciso para o exterior. No entanto, por definir continuam as parcelas exactas de capital que sero alienadas em Portugal e no estrangeiro. O Governo ainda no delineou os contornos definitivos da operao, o que dever acontecer no final de Abril ou princpio de Maio em Conselho de Ministros. Actualmente, uma das questes colocadas pelos investidores a de qual ser o preo a pagar por cada ttulo. Mas tambm aqui ainda nada est definido. No entanto, o PBLICO soube que foram realizadas duas avaliaes da empresa de celulose. Uma pela dupla BFE/Salomon Brothers e outra pelo BPI em associao com a UBS (Union des Banques Suisses). O II Encontro Nacional de aquarofilia iniciou-se ontem com a inaugurao de exposies de filatelia, aqurios, artesanato temtico e fotografia, estando ainda includas no evento outras iniciativas, como workshops sobre fotografia e montagem de aqurios, animao musical e projeco de filmes. Esta tarde tem incio o colquio, que conta com a participao de especialistas nacinais e estrangeiros em aquariofilia e actividades subaquticas, bem como responsveis por alguns aqurios europeus e o director do Oceanrio da Expo'98. A pensar nos mais novos, o Programa Aquajnior permite a utilizao de programas multimdia relativos vida marinha. O Porto tem um novo grupo de teatro que obedece ao princpio da diversidade. 
A troca de experincias com outros grupos, a produo de espectculos, a organizao de debates e de ateliers de formao contam-se entre os objectivos do grupo, fundado por quatro actores profissionais. O primeiro workshop, orientado por um especialista japons em commedia dell'arte, Kuniaki Ida, comeou esta semana. A agitao volta das filmagens suscitou igualmente um novo interesse por aquele romance. A Ilhu Editora teve de pr nas bancas mais uma edio cabo-verdiana -- a terceira desde que a obra conheceu a luz do dia --, de modo a satisfazer a curiosidade dos novos leitores deste primeiro e bem sucedido livro de Germano Almeida, cuja bibliografia no cessa de aumentar desde que decidiu lanar-se nesta tarefa de dar corpo s histrias e que, segundo ele, vo ter consigo todos os dias no seu escritrio de advogado, no Mindelo. Segundo Francisco Manso, os custos do seu filme devero situar-se entre os 250 mil e 300 mil contos, oramento que ele considera razovel para os padres portugueses. Portugal, atravs do IPACA, RTP e vrios outros organismos pblicos e privados, assegura o grosso do financiamento, cabendo a parte restante ao Brasil e Cabo Verde, e ainda a entidades ligadas Unio Europeia. Os juristas franceses esto algo apreensivos com o novo Cdigo Penal, que entra hoje em vigor e que substitui o velho Cdigo Napoleo de 1810, ao cabo de um trabalho que durou 18 anos. Quando em todo o mundo caram os muros com que alguns tambm tentaram defender a irracionalidade dos seus princpios, eis que em Portugal se estabelecem quilmetros infinitos de aramados, que esquartejam o pas em talhes, onde uma minoria autorizada a explorar at ao extermnio o que a natureza a todos oferece. Pelo caminho, com o conluio de uns poucos que ocuparam ou ocupam altos cargos na Administrao (e que envergonham muitos outros que honrosamente se batem por um adequado ordenamento cinegtico), tm ficado enxovalhados, espezinhados e chumbados, o direito propriedade privada, o interesse nacional e o direito vida. Com uma viso facciosa e mope do que o desenvolvimento rural (j que as receitas da caa raramente ficam nos locais onde esta se pratica), continua a Secretaria de Estado da Agricultura e do Desenvolvimento Rural (SEADR) a empenhar-se em propalar sem quantificar os proveitos desta actividade. Mas quanto perde o pas por cada ave de rapina ilegalmente abatida? Quanto perde o pas por cada predador envenenado? Quanto perde o pas por cada espcie rara morta? Quanto gasta anualmente a JAE na reposio dos milhares de placas que anualmente servem de alvo a caadores sem escrpulos? Em quanto contabilizar os prejuzos causados por centenas de ces de caa que so anualmente perdidos ou abandonados pelos caadores? Em quanto ficaria a limpeza de toda a imundcie provocada na natureza pelos almoos e lanches que fazem parte deste ritual de matana? Quem deve ser responsabilizado pela poluio dos nossos campos e guas, provocada pelas toneladas de chumbo anualmente disseminadas por milhares de disparos? E as perdas de rendimento ao nvel regional, causadas pela eliminao, nas zonas de caa, das prticas agrcolas e pastoris consideradas incompatveis com as actividades cinegticas? E os postos de trabalho assim eliminados? Entretanto, durante toda a manh de ontem, uma testemunha tentou, na esquadra da PSP, dizer o que tinha visto e ouvido. Mas esse direito foi-me sempre negado, afirma a testemunha, A. Carvalho. O polcia disse-me que o agrediu, mas agora, horas depois, desmente tudo, adiantou. 
O caso foi entregue PSP das Caldas da Rainha e remetido ontem ao tribunal, que decidiu que o condutor no pode sair da rea da residncia, embora possa conduzir, tendo sido instaurado inqurito para averiguaes. Tambm em Outubro, quando a Cria Diocesana denunciou publicamente o fenmeno das profanaes, Aristides Lima, dirigente do PAICV, foi uma das vozes a levantar-se na condenao dessa prtica e a manifestar comunidade catlica a solidariedade do seu partido. R. -- Nestes casos, a boa poltica que durma em casa. Assim, ao menos sabe-se onde e com quem. A vizinha do apartamento da frente insiste em fazer ioga nua, vista de todos. Queixa-se ao senhorio? O Teatro Experimental de Cascais apresenta Inventrios, no Auditrio Mirita Casimiro. MONTEMOR-O-NOVO A Unidade de Infncia do Centro Dramtico de vora estreia hoje noite, no Cine-Teatro Curvo Semedo, O Meu Amigo Rodrigo. Polticos israelitas e da Organizao de Libertao da Palestina (OLP) afirmam-se dispostos a punir os fundamentalistas islmicos que no domingo causaram 19 mortos numa povoao a norte de Telavive. O ministro israelita dos Negcios Estrangeiros, Shimon Peres, afirmou ontem que o Governo de Yitzhak Rabin vai pressionar o lder da OLP, Yasser Arafat, no sentido de que se actue energicamente perante a Jihad Islmica, o grupo radical que reivindicou o facto de dois dos seus militantes suicidas -- naturais da Faixa de Gaza -- haverem sido os responsveis pelo atentado de Beit Lid. Cristo ressuscitou? Entre metade e um quarto dos estudantes catlicos acreditam em Deus mas no em Cristo. um fenmeno de desintegrao social. As consequncias so imprevisveis. Voltando a mostrar-se particularmente vontade em condies de pouca aderncia, o brasileiro Ayrton Senna (McLaren / Ford) conseguiu ontem o melhor tempo na primeira sesso de qualificao para o Grande Prmio da Europa que amanh se disputa em Donington (Inglaterra). No entanto, a luta pela pole position no est ainda terminada, com o francs Alain Prost (Williams / Renault) a prometer dar luta ao brasileiro, hoje, na derradeira sesso de qualificao. Aquilo que os pilotos temiam confirmou-se, com um primeiro dia de treinos muito chuvoso, a dificultar as suas tentativas de qualificao. Prost comeou por ser o mais rpido mas resolveu recolher s boxes, aguardando por um final de sesso com menos gua. Pouco depois, era a vez de Senna bater o seu tempo. Embora a pista de Donington tenha uma drenagem melhor que a de Interlagos, acho que correr aqui mais perigoso. Com a entrada do Tratado de Maastricht na Assembleia da Repblica prevista j para a prxima semana-- depois de o Conselho de Ministros ter decidido ontem o envio de uma proposta de resoluo para ratificao parlamentar--, o debate sobre o processo e o timing adequado sua aprovao passou imediatamente ordem do dia. Muitas questes subsistem ainda, nomeadamente sobre a necessidade de proceder a uma reviso constitucional que compatibilize o texto da Constituio com o texto do Tratado. Hoje mesmo, o Parlamento ver-se- confrontado com a proposta de resoluo que o CDS vai apresentar, visando dot-lo de poderes de reviso constitucional de modo a permitir o recurso ao referendo que os centristas defendem para ratificar as propostas de Maastricht. Quando no avio se acende a luz verde e comeam a encaminhar-se para a porta e a sair ao ritmo de um por cada dois segundos, as batidas do corao j normalizaram. J nem ouvem a voz de j do instrutor-largador. Sentem a palmada que ele lhes d no ombro e l vo. parecem bonecos articulados. Por vezes, muito raramente, no se ganha para o susto. 
Como foi no quarto salto, em que um deles passou a velocidade meterica, com o pra-quedas dorsal em vela romana [ m abertura em que a calote do pra-quedas se mantm na forma de facho, sem superfcie de sustentao ]. O jovem instruendo tinha aproveitado bem os ensinamentos e abriu o pra-quedas de reserva. Respirou-se fundo, no ar e em terra. J no cho recriminam-se entre si, por causa de pequenos descuidos. No fui eu! A Ligier-Renault, a Lotus-Mugen-Honda e a Sauber-Mercedes sucedero Jordan nos referidos ensaios. A Ferrari chegar amanh ao circuito de Barcelona, onde apresentar no seu novo modelo uma das inovaes mais interessantes para a nova poca: a fixao das suspenses directamente incorporadas no chassis. Paralelamente aos testes de reabastecimento, o McLaren-Peugeot de Mika Hakkinen e o Sauber-Mercedes de Karl Wendlinger foram os mais rpidos em pista, cronometrados na suas voltas mais rpidas em 1m19,32s, enquanto o Lotus-Mugen-Honda de Pedro Lamy foi o mais lento, gastando 1m27,52s, muito aqum do italiano Alessandro Zanardi, piloto-ensaiador da equipa, que gastou apenas 1m22,95s. Frentzen (Sauber-Mercedes), com 1m20,02s, Irvine (Jordan-Hart), com 1m20,14s, Panis e Bernard (Ligier-Renault), respectivamente, com 1m21,64s e 1m22,24s, seguiram-se dupla mais rpida, no circuito de Montmel. A maioria das queixas, verdade seja dita, no tinham nada de transcendente ou desconhecido. Repetiam que as missas so uma chatice e que as homilias oscilam entre o horror e a banalidade. Algumas pessoas atribuam misria litrgica a suposta descida na prtica religiosa. Concluso porventura demasiado linear. Insistiam em que os jovens dos 18 aos 25 anos j no aturavam o que os pais tinham aguentado. O que, sendo talvez verdade, esquece que tambm h jovens para tudo. O irrequieto Jos Magalhes resolveu brindar deputados, jornalistas e amigos com uma prenda pirata. Uma disquete com a traduo portuguesa do Tratado de Maastricht com os conhecidos e altamente verberados erros oficiais, a que juntou uma bem humorada etiqueta: Disquete sem vrus (utilizao de software preservativo antieuropeu no aconselhada). Uso e difuso livres. Pirateada e distribuda gratuitamente no Natal de 1992 por Jos Magalhes a todo e qualquer interessado, de boa ou m f. Consta que enviou uma a Manuel Monteiro e outra a Paulo Portas. Uma pirataria completa. No, no mais uma caso de vida ou telenovela mexicana de televiso, ou uma histria para adormecer. Esta bem real e um exemplo de que pode acontecer a qualquer me que, ao tirar uma ecografia, descobre que no um, mas trs filhos que vai dar luz. Uma alegria para alguns e um choque para outros. O choque foi na ecografia, vo fazer j 18 anos, e s suspeitvamos de um. Mas reagi muito bem e fiquei a saber que engravidei duas vezes. Esta me explica que, em casos como este, os cuidados so todos multiplicados por mil. No caso para menos, pois so gmeos, mas neste caso, os chamados gmeos falsos, dizigticos na linguagem tcnica, por serem gerados em vulos diferentes. A Ins e o Joo so quase inseparveis, embora, quando pequenos, ela assumisse a funo de chefe nas brincadeiras e tivesse um instinto maternal em relao ao irmo. So muito diferentes, ela pela sua independncia e competncia e ele pela sua preguia e boa disposio. Sempre na mesma turma escolar, separaram-se no 10 ano. E foi quando se notou que era a irm o grande apoio nos estudos, conta a me. A Ins quer ser arquitecta, enquanto que o Joo j mostra tendncia para a rea da sade. A soluo definitiva de situaes como as da central da Av. 
Casal Ribeiro, do Campo das Ceboulas e de numerosas outras centrais dedicadas aos transportes inter-regionais e inter-urbanos est porm dependente de uma obra de grande envergadura, ainda sem localizao completamente decidida e cujo incio no dever ocorrer antes dos prximos dois ou trs anos. Uma vez que esta central esteja em funcionamento, passar a receber tambm as carreiras que em Janeiro sairo do Campo Pequeno para Sete Rios. O espao a libertar nessa praa poder ento ser consagrado aos carros de turismo que agora ocupam o topo do Parque Eduardo VII. Tambm uma residente h 30 anos nos Moinhos da Funcheira, Isabel Martins da Silva, considera excessivo o que pagou pela ligao de esgotos, 181 contos, e da gua, 81. Para esta moradora, a autarquia ainda no fez as obras necessrias porque considera aquela urbanizao como o bairro dos ricos, como afirma ter sido dito pelo vereador do urbanismo, Miguel Vieira, da CDU. Vrias solues tm sido propostas para contornar este obstculo. Uma delas consiste em supor a coexistncia de Universos paralelos. mas existe num outro, porque a nunca assassinou a sua av. Uma outra alternativa consiste em supor que este tipo de situaes incoerentes nunca se poder verificar, porque a prpria Natureza se encarregar de o impedir. Como? Talvez, autorizando apenas as viagens ao passado que no dem origem temida morte da avozinha. Ou talvez, simplesmente, proibindo qualquer tentativa de utilizao dos buracos de minhoca para construir mquinas do tempo. Ou ainda, desencadeando a destruio imediata das mquinas que alguma vez vierem a ser construdas. Depois de um comeo de campeonato decepcionante, a equipa de basquetebol do FC Porto confirmou ontem a sua subida de forma, vencendo o lder do campeonato por 85-84, em Ovar, com um cesto de trs pontos conseguido por Jlio Matos a trs segundos do fim do jogo. Com este resultado, o FC Porto isolou-se no segundo lugar do Nacional, enquanto o Benfica igualou a Ovarense no comando. O Dalkon Shield representa um episdio horrvel na histria do controlo da natalidade, mas hoje claro que os problemas se deviam a esse DIU em particular e no afectam todos os dispositivos do gnero, diz Jade Singer, responsvel do Centro de Sade de Santa Mnica. De facto, necessrio uma certa dose de f para considerar o uso do DIU, admite Debra Bronstein, vice-presidente responsvel pelo marketing na GynoPharma. E muitas associaes de consumidores, incluindo a Associao Internacional para a Educao das Vtimas do Dalkon Shield, continuam a fazer campanha contra o dispositivo, afirmando que no existe suficiente informao sobre os seus riscos. A multinacional farmacutica desmentiu as notcias, acusando o jornal de afirmaes incorrectas e sublinhando ter toda a confiana na segurana de emprego e na eficcia do sedativo, desde que ele seja utilizado de acordo com as prescries referidas na literatura que o acompanha. Comercializado sob as designaes de Midazolam (injectvel) ou Dormicum (ampolas) e venda em Portugal, o Versed um sedativo utilizado em pequenas intervenes cirrgicas, nomeadamente para anestesias locais. Segundo a Roche, o medicamento -- utilizado nos ltimos cinco anos sem problemas por mais de 15 milhes de americanos -- foi clinicamente testado nos Estados Unidos e na Europa, no tendo sido registado qualquer morte, cujas causas pudessem ser atribuidas administrao do ' Versed '. 
Os casos referidos pelo New York Times como sendo imputveis ao medicamento foram considerados pela Roche como mortes sem relao causal com a aplicao do medicamento, tratando-se de doentes cujo estado de sade j era grave e que apresentavam problemas cardiovasculares e ferimentos muito graves. Os representantes da CNN em Bagdad comunicaram a composio do comboio e a cor dos automveis sua sede nos Estados Unidos. Estes pormenores seriam fornecidos s autoridades americanas e, deste modo, talvez o risco de um ataque areo aliado sobre o comboio fosse reduzido. H uma dezena de dias que a estrada entre a fronteira iraquiano-jordana e Bagdad alvo de bombardeamentos quase dirios por parte de aviao aliada. Encontra-se cortada em diversos pontos e preciso por vezes tomar estradas secundrias, tambm alvo dos raides, como constatou este enviado ao efectuar o trajecto no dia 30 de Janeiro. Para o bem ou para o mal, at ao momento, quem foi, na sua opinio, a figura portuguesa do ano de 1995. E porqu? Foi um blgaro: o Iordanov. Porque marcou dois golos magnficos ao Martimo e, assim, o Sporting ganhou a Taa de Portugal. J no vamos o padeiro h treze anos. Por que razo to grave este facto? Porque -- consideram os relatores, interpretando as diversas contribuies ao congresso -- a natureza difcil dos problemas, a escala de tempo longa da sua resoluo, a sua raiz cultural e o facto que o objecto das polticas no outro seno a mudana de comportamentos das comunidades educativa e cientfica, impem uma implicao orgnica dessas comunidades nas polticas que lhes dizem respeito. Isto, acrescentam, sob pena de um estpido desperdcio de recursos, de energias e de motivao. As polticas anti-participativas que os relatores consideram existir nesta rea so por eles julgadas particularmente inadequadas num perodo de expanso e de sustentado auxlio comunitrio como o actual, mas elas so, para mais, particularmente vivas no actual momento de crescimento do sistema, devido ao mais vivo confronto entre novos e velhos elementos. A primeira dificuldade que se depara s empresas que conseguem concretizar um processo de investimento em Moambique o de recrutamento de quadros para permanncias prolongadas no pas. Para alm de problemas salariais, mais ou menos complicados, de alojamentos difceis de encontrar e caros, a questo mais sensvel acaba por ser de ordem familiar, principalmente as relacionadas com filhos em idade escolar e com a respectiva educao. Se no Maputo existem escolas particulares e pblicas suficientemente credveis para suprir as necessidades, nas restantes concentraes urbanas isto j no acontece. No segundo centro populacional de Moambique, a cidade da Beira, os portugueses residentes conseguiram, por iniciativa prpria, criar uma escola que lecciona o ensino primrio e que para o ano vai iniciar-se como escola preparatria. A organizao do 17 Festival Nacional de Gastronomia no pensa para j na sua internacionalizao, prevendo-se apenas que anualmente seja um pas da Europa comunitria a ser convidado, bem como um PALOP. No primeiro dos casos est Espanha, que poder estar representada j na prxima edio. As hostilidades abrem hoje, por tradio, com os paladares promovidos pela Regio de Turismo do Ribatejo, que entregou a responsabilidade da refeio a um restaurante do Cartaxo. Tal como em todos os restantes dias, o almoo est marcado para ter incio s 13h. Esta poca, o Sporting ainda no tinha conseguido marcar mais de dois golos nos jogos em Alvalade. F-lo ontem, batendo um Braga pouco atrevido e superando mais um fantasma da equipa. 
Mas, na bancada, j poucos acreditam no ttulo. A transaco de 117.203 aces nominativas e 34.376 ao portador da Ocidental Holding SGPS constitui ontem uma das notas dominantes do mercado accionista nacional. Movimentados na Bolsa de Valores de Lisboa (BVL), os lotes foram intermediados pela Comercial Dealer. No princpio da nossa relao electrnica, acordava a meio da noite e ficava deitado a pensar se teria correio de Bill. Ele parece escrever as suas mensagens noite, dormir (talvez) e mand-las na manh seguinte, escreve John Seabrook, explicando que a primeira pergunta por E-mail foi sobre o prprio meio de comunicao. Sou a nica pessoa que l o meu E-mail, por isso ningum se pode envergonhar ou pensar que a mensagem vai circular por a ... E-mail no uma boa forma para nos zangarmos, uma vez que no se pode interagir. Seabrook notou que as mensagens de Bill no tinham princpio ou fim, coisas como caro, seu ou atentamente. Bill nunca se lhe dirigiu pelo nome e despedia-se com um enigmtico &, que em linguagem de correio electrnico significa responda. Para os empregados da Microsoft comum encontrarem Bill Gates electronicamente antes de o conhecerem pessoalmente e, imagem do chefe -- o mais esperto de todos --, passam a vida a descreverem-se uns aos outros como espertos e superespertos. um conceito vago. H uma certa vivacidade, uma capacidade para absorver factos novos. Para perceber uma nova situao e, depois de algum lha explicar, dizer imediatamente: ' Que tal assim? ' . Fazer uma pergunta pertinente. Absorver em tempo real. Capacidade para recordar. Relacionar coisas que ao princpio no parecem ligadas. O rbitro Pinto Correia esteve bem durante toda a primeira parte e durante quase toda a segunda. No entanto, acabou por perdoar um segundo carto amarelo a Rui Neves, condescendendo igualmente com Dimas. O referido relatrio explica que a substituio dos pequenos ecrs pelos multiplexes -- at ao final de 1998 est previsto o aparecimento de mais trs (Loures Shopping, Norte Shopping e Vasco da Gama) -- dever aumentar o nmero de espectadores: de 12 milhes em 1996 para 18 milhes!) em 2000. Em termos de distribuio, os filmes norte-americanos detm praticamente 95 por cento da quota das bilheteiras portuguesas, a mais elevada da Unio Europeia. Segundo o relatrio, as principais distribuidoras nacionais ou so subsidirias norte-americanas ou empresas portuguesas que distribuem os filmes dos grandes estdios norte-americanos. Dois exemplos: A Castello-Lopes tem acordos com a Fox e a Miramax, a Lusomundo com a Disney e a UIP e adquire filmes da BMG. Momento -- Andou boa parte do ano a perder tempo, tendo em conta apenas os seus interesses, a actuar no meio-campo. Segundo observadores citados pelo Times, a China encontra-se numa posio muito mais fraca do que estava em 1982-84, perodo em que decorreram as negociaes que levaram ao acordo para a transferncia de soberania. Nessa altura, recordam, Deng Xiaoping estava no auge da liderana do processo de reforma, era um septuagenrio vigoroso e o desenvolvimento econmico parecia imparvel. Holanda: Bons jogadores de futebol . Falam ingls muito bem, no parecem ter uma grande cultura prpria. s vezes, um pouco parvos, geralmente descontrados. Itlia: Bons a falar e a fazer negcio . Pas bastante catico, mas muito industrioso. Muito convencidos. Um dos povos mais agradveis da Europa, amistosos. Mas muita pobreza. Fisicamente, o povo mais belo da CE. -- Mas o senhor to cuidadoso com a sua mota que se arrisca a ser preso s para no a deixar na rua? -- Mas normalmente assim! 
As outras coisas no so to ... O que os hngaros temem uma onda de nacionalismo nos pases vizinhos que perigue as escolas e instituies culturais de hngaras. Isto poder levar muitos hngaros a desses pases a decidir que a emigrao para a Hungria a melhor opo. Para resolver os complexos problemas dos hngaros que vivem nos pases vizinhos, o Governo criou um gabinete especial chefiado por Geza Entz. Os Orlando Magic foram ao The Omni, em Atlanta, perder pela primeira vez em dez jogos seguidos da edio deste ano da NBA, a Liga Norte-Americana de Basquetebol profissional. Os Atlanta Hawks venceram por 107-105 um jogo muito emotivo, que s ficou decidido a 32s do fim. Foi quando John Konkac marcou os dois lances livres que desempataram o jogo, e de seguida impediu que os Magic chegassem ao cesto, obrigando-os a um turnover. Por outro lado, no caso do haxixe os riscos para a sade so comparveis aos de produtos como o lcool, tabaco, caf e certos medicamentos. Mais um argumento: o consumo do haxixe no provoca dependncia, ao contrrio do lcool ou da nicotina. H alis estudos americanos que provam que o haxixe um bom relaxante, logo, se diminui o stress diminui um dos factores de risco do enfarte miocrdio. A legalizao das drogas duras mais difcil, pois tem que ser feita escala internacional. e no precisariam de injectar-se nos cantos do Casal Ventoso. Isto diminuiria a criminalidade, haveria menos mortes por overdose e de outras doenas, como a sida. Domingos -- Afinal no estava de rastos, como se quis fazer querer. Entrou confiante, integrou-se bem nas movimentaes e mostrou que, com ele em campo, bem mais fcil rentabilizar o jogo ofensivo. Fez trs remates, todos eles perigosos, e marcou um golo que s est ao alcance de um verdadeiro ponta-de-lana. No tinham razo de ser as desconfianas de equipa tcnica. Fernando Couto -- Esteve imperial, sempre muito certo a comandar a defesa. H alturas em que parece ter um man que atrai a bola na sua direco. Os futuros chips de 256 megabits podero ser usados em aplicaes que exigem grandes quantidades de memria, como o processamento de imagens em supercomputadores e noutras tecnologias de ponta. A aliana entre a Hitachi e a Texas -- que uma resposta associao da IBM, Siemens e Toshiba anunciada em Julho passado nesta mesma rea -- poder ser ainda alargada a outras firmas, como a NEC e a American Telephone and Telegraph. Alberto Ralha nunca foi alvo de grandes crticas pela simples razo de que raramente algum se lembra da sua existncia. Beneficiando do facto de negociar com uma classe tradicionalmente pouco reivindicativa, a dos docentes do ensino superior, Alberto Ralha designado por os sindicalistas como boa pessoa, mas mau conhecedor das questes do ensino superior. No fora o dinamismo do seu director-geral, Pedro Lince, e o mandato deste secretrio de Estado ficaria irremediavelmente marcado pela inoperncia. enquanto as universidades pblicas ameaam fechar as portas por falta de dinheiro para pagar a docentes e no docentes, cada vez maior o nmero de estabelecimentos de ensino privados de qualidade duvidosa a ministrar somente cursos que outros recursos no exigem seno a caneta e o papel. Esto suspensas as buscas do corpo do pescador que, tera-feira, foi engolido por uma onda, na zona da Boca do Inferno, no Guincho. O mar continua mau, bate muito ali, e as guas esto barrentas, o que torna impossvel a aco dos mergulhadores, explicou o capito do Porto de Cascais, Fernando Tavares de Almeida. 
Contrariamente ao que esperavam Lus Rodrigues e os seus correligionrios, o caso, porm, ficou adormecido no ministrio das Finanas. A doutora Manuela Ferereira Leite [ que sucedeu a Rui Carp na secretaria de Estado do Oramento ] disse-nos que quando assumiu funes encontrou o processo meio espalhado por aqueles buracos, confirmou ontem ao PBLICO o ento vice-presidente da concelhia local do PSD, Valdemar Saleiro. Uma das coisas que nos disseram que l vinha que havia na Cmara despesas no justificadas no valor de 194 mil contos, refere um dos outros elementos do PSD local que ento acompanhou o caso. Valdemar Saleiro -- que tinha tambm responsabilidades partidrias ao nvel do distrito de Beja -- diz que no se recorda de ter ouvido falar neste nmero, mas confirma que a secretria de Estado o informou de que o relatrio apontava coisas que tinham de ser esclarecidas. A sesso de ontem do Mercado Monetrio Interbancrio apresentou-se logo na sua abertura bastante pressionado pela procura de fundos, o que motivou uma subida das taxas de juro, que se foi acentuando medida que se avanava na sesso. Com a finalidade de estabilizar o mercado, o Banco de Portugal anunciou a meio da manh uma interveno ocasional de cedncia de fundos at ao montante de 150 milhes de contos, mediante a colocao de BT/TIM a um dia e taxa de 11,75 por cento. Apesar da presso evidenciada no mercado, apenas foram colocados 53.677 milhares de contos. O grupo A. Silva & Silva registou, no ano passado, resultados lquidos consolidados de 13,5 mil contos. Os resultados foram afectados pela realizao de provises extraordinrias na sua participada Assiconstri -- entretanto vendida Somague -- no valor de 330 mil contos, por forma a no transferir para o seu comprador riscos provenientes de situaes pendentes de reclamaes a clientes, eventualmente geradoras de diminuies patrimoniais futuras, refere um comunicado da empresa ontem divulgado. Em termos individuais, a A. Silva & Silva registou lucros de 444,6 mil contos. O volume de facturao ascendeu a 42 milhes de contos, os resultados operacionais consolidados foram de 1,1 milhes de contos (contra 652,4 mil contos registados em 1995), enquanto os resultados correntes totalizaram 342,4 mil contos (mais 159,2 por cento do que no ano anterior). So pequenas partculas metlicas e foram descobertas incrustadas nos ossos dos calcanhares do piloto Jorge Albuquerque. A sua importncia? que ningum consegue explicar como que elas podero ter ido a parar a no ser devido acelerao provocada por uma exploso. E possvel que o material que as compe seja aquele de que era feito o recipiente que conteve a bomba. Trata-se de uma das provas fundamentais apurada pela V CEIAC, a par da descoberta dos vestgios de explosivos em peas do avio. Clinton apressou-se a dizer que tais contactos no deviam ter acontecido e a sua porta-voz, Dee Dee Myers, indicou que documentos sobre esses encontros foram compilados e seriam entregues a Fiske, respeitando a intimao feita por este. O procurador especial chamou a depor seis funcionrios da Casa Branca e quatro elementos do Departamento do Tesouro, para esclarecerem os contactos mantidos. Entre os primeiros, conta-se o advogado Bernard Nussbaum, amigo de Clinton e que se demitiu na semana passada, em relao com o caso. A opo do Alentejo vai para o chamado Turismo Verde, em zonas de inequvoca qualidade ambiental. Um turismo que promova a descoberta da diferena e no a massificao, diz Andrade Santos, presidente da RTE. 
Alm da costa martima, a melhor preservada do pas, o Alentejo oferece turismo rural e agro-turismo, turismo cinegtico e o aproveitamento de albufeiras, recorrendo sempre boa qualidade do seu patrimnio natural. E a riqueza do seu patrimnio construdo, a cultura musical regional, o artesanato e a gastronomia tradicional completam, em linhas gerais, o quadro da regio. Ultimamente, a implantao de campos de golfe tem surgido como um complemento turstico no desprezvel, ainda que assumindo, por vezes, discutveis propores (ver caixa). O romance da princesa Ana -- segunda descendente e nica filha da rainha -- com o comandante Tim Lawrence, 37 anos, j fizera as delcias da imprensa popular britnica, nomeadamente na altura em que o casal apariceu em pblico, por diversas vezes, aps o divrcio de Ana. A data e local do casamento, que dever realizar-se numa cerimnia privada, no foram divulgadas pelo Palcio de Buckingham. Face ao impacte dos planos do construtor, a comisso de trabalhadores da Renault Portuguesa foi ontem ao Ministrio da Economia lembrar ao Governo o compromisso francs de criar 180 novos postos de trabalho em Cacia e investir nesta unidade 12 milhes de contos. A resposta do Governo que o assunto tem estado na agenda dos contactos com o grupo francs, uma posio que para os trabalhadores no representa qualquer garantia. Os representantes dos trabalhadores esperam pelo comit de grupo europeu extraordinrio para uma deciso quanto adeso a uma eventual greve dos efectivos do construtor. Contudo, o porta-voz da comisso de trabalhadores, citado pela Lusa, defendeu que a reduo de postos de trabalho na Renault Portuguesa mais significativa, ao ter passado de 3500 trabalhadores em 1992 para 1200 em 1997. Em 35 anos, os danos causados ultrapassam dois mil milhes de pesetas (2400 milhes de contos), diz o relatrio. Acrescenta que esta estimativa se refere unicamente aos danos directos dos sinistros e no inclui os custos da luta contra os incndios -- verbas avultadas, como pode concluir-se pelas destinadas ao ano em curso, que atingem os 130 mil milhes de pesetas (156 milhes de contos). Outro aspecto preocupante o da frequncia crescente de anos negros, em que se registaram recordes de zonas ardidas. De 1978 at agora, Houve quatro anos em que se registou a perda de reas superiores a 400 mil hectares. O ano passado esse valor foi de 432 mil hectares, estando o recorde no ano de 1985, com 484 mil hectares. A Assembleia Parlamentar da Francofonia (APF, ex-AIPLF), reunida em Abidjan, na Costa do Marfim, condenou ontem a insurreio desencadeada h um ms por certos elementos das Foras Armadas da Guin-Bissau e exortou as partes em conflito ao dilogo no respeito pela legalidade constitucional. Numa resoluo adoptada em sesso plenria, a APF reafirma a legitimidade do actual governo, eleito em eleies livres e democrticas e condena sem reservas a rebelio iniciada em 7 de Junho de 1998. A Assembleia exprime o seu apoio total ao governo do Presidente Nino Vieira, e lana um apelo premente a uma cessao imediata das hostilidades. Sendo o primeiro de uma nova gerao de jogos a trs dimenses para consolas de 16 bits que a Nintendo lanou no mercado de vdeojogos, Starwing surpreendente pela qualidade dos grficos que apresenta, pelo som, perfeitamente adequado ao ambiente grfico, mas sobretudo pela extraordinria velocidade a que se desenrola. 
O jogador -- que desempenha o papel de Fox McCloud -- tem de completar um total de 18 misses, sobrevoando planetas ou enfrentando os inimigos no espao, onde as dificuldades so acrescidas, j que, para alm dos opositores, tem de desviar-se de autnticas chuvas de asterides e outros objectos que vagueiam sem rumo. Tem tambm de zelar pela segurana da sua equipa de pilotos, ajudando-os sempre que esto em dificuldade. Aps algum treino, os comandos so fceis de manejar, permitindo ao jogador executar piruetas ou utilizar o propulsor para acelerar momentaneamente a sua nave. H trs formas de visionamento do cenrio, escolha do jogador. Nas cenas de espao a viso de ' cockpit ', com o recurso a uma mira, de grande utilidade. O programa de trabalho e o oramento para os prximos trs anos para o Observatrio Europeu das Drogas e das Toxicodependncias, com sede em Lisboa, foram ontem discutidos numa reunio em que participaram representantes de todos pases da Unio Europeia. O PCP, esse, est sempre bem e com pouco trabalho pela frente. O que era preciso fazer, est feito: derrotar uma iniciativa do Governo. Distribudos os trabalhos, falta perceber como se chegou aqui. O que tambm no difcil. Tudo nasce num triplo erro governamental. Reduzir a procura do consumo de droga a prioridade do rgo Internacional de Controlo dos Estupefacientes da ONU. Um relatrio divulgado hoje, que d conta dos receios causados por uma cultura propcia ao consumo e que condena as vozes mais tolerantes sobre o uso de narcticos, revela uma descida do nmero de consumidores de herona na Europa e uma subida do cultivo de cannabis e do consumo ocasional de estimulantes e alucinogneos. A herona cada vez mais uma droga fumada e menos injectada. A Norquifar refere ter obtido do presidente do Infarmed, Aranda da Silva, um compromisso no sentido de permitir uma distenso mais razovel, nunca inferior a trs anos, do tempo para a aplicao do decreto e queixa-se de ter sido surpreendida, pouco tempo depois, com as notificaes. O cumprimento do decreto [135/95] nos prazos em que agora o Infarmed veio exigir quase impossvel porque o novo licenciamento pede documentos que so difceis de obter num curto espao de tempo -- como sejam a obteno de alvars camarrios ou pareceres de segurana do Servio Nacional de Bombeiros, diz Srgio Figueira, da Norquifar. Quando os prazos se esgotarem, explicou ainda, os armazns correm o risco de ser fechados por qualquer aco de inspeco daquele instituto. Alguns prazos esto, segundo Srgio Figueira, a poucos dias de chegar ao limite. A Norquifar, que j se tinha queixado de se estarem a criar empregos por decreto, entende que a contratao de um licenciado a tempo inteiro para os armazns no s demasiado onerosa para as pequenas e mdias empresas que representa, como injustificvel, pois um armazm grossista no vende directamente ao pblico. Afirmou um dia ao PBLICO que o seu inimigo principal era o poder poltico. H um ano, achava muito divertidas as notcias que a davam como possvel candidata do CDS-PP s legislativas. Nunca pensei entrar na poltica!, jurava Manuela Moura Guedes. ao fim de 16 anos, deixava a RTP para ir apresentar o Telejornal da TVI. Foi sol de pouca dura.. Em Junho, afastada dos noticirios h meses, era outra vez notcia. Rumores de que estava a caminho da SIC -- apesar das resistncias de pesos pesados da redaco do canal de Carnaxide --, enquanto outros a davam com um p nas listas do renovado PP. 
Est tambm prevista a presena dos presidentes das entidades estatais mais ligadas aplicao do programa: Instituto Nacional da Habitao e IGAPHE Instituto de Gesto e Alienao do Patrimnio Habitacional do Estado. A indignao dos dirigentes do Sindicato dos Txteis tem como principal alvo a Cmara Municipal, que alegadamente ter colocado entraves que fizeram baixar o valor do imvel. Em causa estar, nomeadamente, a impossibilidade de alterar a fachada do edifcio de quatro andares, cada um deles com cerca de seis metros de p-direito. O prdio foi agora vendido empresa Construes Progresso no mbito de negociaes particulares. Estas foram iniciadas h meses, depois de se considerarem esgotadas as hipteses de venda por arrematao judicial, dado os valores oferecidos -- num mximo de 186 mil contos -- estarem muito aqum da avaliao. Foi melhor do que o que eu esperava, comentava uma mulher no elevador, acerca do Show de Moda Primavera / Vero que anteontem noite teve lugar no Shopping Center Cidade do Porto, no Bom Sucesso, e que se dever vir a repetir oportunamente. Aconteceu numa passarelle montada sobre a pista de gelo e rodeada por cadeias dirigidas aos diversos convidados de cada uma das lojas que acederam em participar. Os sem convite permaneceram atrs das cadeiras e nas varandas dos trs pisos do shopping. Curioso foi o facto de o show se ter dividido em duas partes e das noivas surgirem logo aps o intervalo e no no final da passagem, como hbito nestas coisas. Antes, ao fim da tarde, os manequins e crianas passaram modelos da Mango, Union Blue, Authentik, Quiosque, Lanidor, Jacadi, Inquietao, Action Sport, Nexus, Petit Patapon, Cenoura, Boxer Shorts, Nastra, Bambini, Tin Tin e Miss Selfridge. noite, numa prespectiva mais clssica, desfilaram coleces da Pronuptia, Gianonne, Lace, Mac Moda, Pinto's, Cortefiel, Alain Manoukian, Rondssimo, Globe, Springfield, MyGod, Vitalis, rbrica, Manel Boutiques, Veneza 5, Nogaret, Acapicua, Sugar, Decnio e Paulina Figueiredo. Um rol de propostas, prt--porter. R. -- A ideia agrada-me e j fiz, inclusive, algumas experincias. De qualquer modo, aquilo que se passa, em termos de electrificar a sanfona ou os outros instrumentos, que os espectculos do Realejo so na sua maior parte acsticos. Essas experincias de electrificao no tm resultado, at agora, muito bem, embora no disco haja uma faixa que aponta um bocado nesse sentido, a cantiga de Santa Maria, com um movimento meio arrockalhado. Pode ser um ponto de partida ... P. -- Mas por enquanto o Realejo continua a ser um grupo de e para interiores, no duplo sentido da palavra? Considerado culpado mas absolvido pela justia militar italiana, o ex-nazi Erich Priebke foi de de novo preso mas isso no lava o sentimento que cobriu uma maioria da Itlia: a vergonha. Pelo veredicto, pela maneira como o processo foi conduzido. Para os juzes, os crimes de Priebke prescreveram. Como disse o Presidente Scalfaro, nunca prescrevero na memria dos italianos. A Direco do Torreense, cuja a equipa de futebol milita na 2 Diviso de Honra, demitiu-se em bloco na Assembleia Geral realizada na noite da passada segunda-feira. Razo: a Cmara Municipal de Torres Vedras (CMTV) no tem ajudado o clube financeiramente. Chegmos a um ponto em que deixaram de existir condies financeiras que garantam uma gesto responsvel do clube, garantem os dirigentes. 
Antnio Jos dos Santos, o presidente demissionrio, prometeu continuar a assegurar a gesto corrente do clube e admitiu reconsiderar a sua posio e ficar at final do mandato, mas s depois de saber com quanto que Cmara pode auxiliar o clube. Jos Augusto Carvalho, presidente da CMTV, considera profundamente injustas as acusaes lanadas sobre a autarquia. A cmara s se comprometeu a contribuir com igual montante ao total da soma dos restantes scios-empresas e cumpriu, disse Jos Augusto Carvalho. A Federao Portuguesa de Andebol vai analisar amanh, em reunio do executivo, a no realizao, no passado sbado, do jogo FC Porto- Ginsio do Sul, referente segunda jornada do Nacional. O executivo federativo est j a fazer um levantamento de toda a documentao, que dever ser complementada com relatrio da equipa de arbitragem nomeada para dirigir a partida. Segundo uma fonte da FPA, o jogo foi marcado para o Pavilho das Antas e o FC Porto recebeu a notificao atravs de comunicado, logo pode-se concluir que o recinto j no se encontrava interdito. A resoluo do problema pode agora passar por um acordo entre os clubes. Caso isso no venha a verificar-se, a Federao vai instaurar um processo onde o FC Porto ter de apresentar as razes que levaram a equipa a comparecer em Aveiro e no nas Antas. um pacato cidado, livre de toda a suspeita, um homem simptico e prestvel, guarda em casa 24 cadveres mutilados. Descoberto por acaso, revela-se algum que encara os crimes como algo natural. Para a psiquiatra que o observa, um ser intrigante; para a polcia, uma encarnao do demnio. Jeff Goldblum e Alan Bates em confronto. J perdemos algumas cartas altas, ainda assim, temos algumas figurinhas, desabafou ao PBLICO um dirigente nacional do PP para justificar o silncio dos notveis. Fernandes Thomaz, Lus Queir e Celeste Cardona so as personalidades que restam, enumerou, para depois concluir que Monteiro no pode esquecer isso, caso decida recandidatar-se. A moo da distrital do Porto, A casa comum da direita, tambm procura congregar um maior nmero de apoiantes. Slvio Cervan conseguiu a adeso do presidente da distrital de Aveiro, S Correia, prximo de Paulo Portas, e da concelhia de Vila Nova de Famalico, uma estrutura do distrito de Braga, de Antnio Pedras. A aplicao aos avies comerciais de passageiros dos sistemas electrnicos de defesa antimssil, em uso na aviao militar, est a ser equacionada pelo governo norte-americano. Ontem, na Casa Branca, reuniu-se pela primeira vez uma comisso constituda por 21 especialistas em sistemas de defesa e contramedidas electrnicas. Segundo Al Gore, vice-presidente dos Estados Unidos, esta comisso deve apresentar um relatrio de concluses na prxima tera-feira. Os dez msicos que estaro em palco, incluindo Bryan Adams, fizeram um nico pedido -- querem comida vegetariana nos camarins. Os promotores do concerto recomendam aos espectadores: levem guarda-chuva no v S. Pedro pregar uma partida. Reconhecendo possuir algumas saudades do Parlamento, a autarca afirmou que preside a um concelho apaixonante, apesar das contradies, mas lamentou que este tenha estado esquecido pelo poder central e desmobilizado pelo poder local. Da a meter uma cunha aos antigos colegas foi um passo. Muitos aspectos da legislao das autarquias, sustentou Edite Estrela, so inibidores da aco e da criatividade e verdadeiros obstculos a vencer pelos autarcas. 
prefervel haver leis mais flexveis, para serem cumpridas, do que leis muito restritivas que, por vezes, no so cumprveis, disse, criticando a suspeio com que os autarcas so tratados pelo Estado. Em troca defende que os prevaricadores sejam castigados exemplarmente. Em Paris, a bolsa tambm registou alguns ganhos com a abertura em alta do Dow Jones, apesar de uma sesso calma que resultou num aumento de 0,56 por cento, com o CAC 40 a encerrar nos 2940,89 pontos. Em Madrid, o ndice geral subiu 0,44 por cento. Duas outras personalidades que se digladiam, sem dvida, pelo primeiro plano no crculo do poder so o presidente da Cmara de Moscovo, Iuri Lujkov, aliado dos banqueiros e das mafias da capital, e Anatoli Chubais, o ex-vice-primeiro-ministro recm-nomeado chefe da casa civil da Presidncia. Gestor genial, reformista moderado e autor do programa de privatizaes do Governo, Chubais tem a seu favor o apoio dos governos ocidentais e do FMI, que tendero a ver nele o garante da continuidade da poltica das reformas, quando Ieltsin j no puder impor a sua vontade. Mas, tendo tambm em conta a histria recente, no um dado adquirido que Ieltsin esteja acabado. No incio deste ano, em plena catstrofe tchetchena e aps dois ataques cardacos, a popularidade do Presidente no chegava aos dez por cento. e, no auge da campanha eleitoral, foi visto a gritar e cantar, a correr e a danar o twist e o rock'n roll. Inauguram no Lagar do Azeite, em Oeiras, pelas 21h30, o X Salo Livre de Humor / Oeiras 97 e o VIII Festival Internacional de Humor. Patente ao pblico at ao prximo dia 26, de Segunda a Domingo, das 14h00 s 19h00. O Dirio da Repblica no tem o exclusivo de publicao das boas notcias legais ou regulamentares. Prova indiscutvel disso a Circular 3/94/DEP/1 do director-geral dos Servios Prisionais que veio regulamentar o controlo e reteno da correspondncia dos reclusos, com vista a que o mesmo esteja em harmonia com o quadro constitucional de previso e tutela dos direitos fundamentais. A partir de agora, a correspondncia s ser aberta na presena dos reclusos, para verificar do eventual envio de bens no autorizados e a mesma s ser lida em casos devidamente justificados. O jornal popular alemo Bild anunciou ontem que o construtor automobilstico alemo Audi vai pagar 200 milhes de marcos (cerca de 20,4 milhes de contos) pela marca italiana Lamborghini. A notcia citava especialistas da indstria automvel, mas ainda nenhuma das partes envolvidas na transaco divulgou o montante envolvido. A Audi j tinha anunciado o seu interesse na compra da totalidade do capital da Lamborghini, cujo principal accionista o filho mais novo do ex-presidente indonsio Suharto. Jorge Petiz, em Porsche Carrera RSR, concorrente ao Campeonato Nacional de Velocidade (CNV) de Clssicos, foi ontem o piloto mais rpido no conjunto das duas sesses de treinos para a Rampa da Arrbida, pontuvel para o Nacional e trofus monomarca e Challenge FIA de Montanha, prova que esta manh se disputa. Petiz realizou a melhor subida em 1m44,084s, superando o seu prprio rubricou ao volante do BMW M3 do CNV (classe N2), fixado em 1m45,887s. Na classe at 2 litros (N1), Carlos Borges (Toyota Carina E) imps-se com 1m51,484s. Agarrou-se causa como uma nufraga, e seria compensada. maneira do regime. 
Com uma medalha por servios prestados comunidade enquanto Trmerfrau -- destas, s foram entregues duas --, uma certido numa capa vermelha com o smbolo da RDA a dourado, uma viagem em paquete para os activistas da primeira hora, onde viajou ao lado de Erich Honecker, da bailarina Great Paluka e muitos outros. E a esttua. A maior honra, porm, ter sido esse convite para falar em Berlim, frente a 100 mil pessoas no dia do trabalhador da construo, em 1979. Se foi ela a autora do texto? Qual qu, eles que prepararam tudo. Eu s tive de ler, o que, pela maneira como o diz, ter sido um grande alvio. Poder-se- dizer que o estilo resulta da sua profisso, fotojornalista. Trata-se do contrrio. Ele faz jornalismo fotogrfico, porque esta profisso a que melhor se adequa ao seu estilo, sua maneira de olhar o mundo. H, no seu modo de ser, como que uma espcie de timidez, que o leva a fazer fotografia no intrusiva, uma das maiores dificuldades do ofcio. No conheo fotografia sua onde os sujeitos estejam pouco vontade, furiosos por estarem a ser fotografados, envergonhados, com o sentimento de estarem a ser violados na sua privacidade. Os que olham para a sua cmara, com mais ou menos prazer, com ou sem indiferena, no mostram hostilidade ao fotgrafo. E os que no olham para a cmara, no parece, pelo que esto a fazer, que reagiriam contra o que os observa. No conheo fotografia sua que revele uma intimidade secreta, algo que ningum gostaria que se soubesse ou visse. difcil, sendo-se fotgrafo, cultivar sem concesses este respeito pelos outros. Muito mais ainda, sendo-se fotojornalista, profisso em que o voyeurisme pode ser elevado ao estatuto de virtude, como acontece tantas vezes com as revistas e os jornais sensacionalistas, como praticam to obsessivamente muitos jornalistas nas televises contemporneas. Nas suas fotografias, Alfredo Cunha revela uma humanidade quase inocente, uma comovente igualdade perante os outros. No se consegue perceber que o fotgrafo ganha a sua vida custa dos outros. so a demonstrao exacta da ternura tmida que o habita. O jornalista no parte do princpio que tem direitos sobre os outros, nem sobre os seus sentimentos, nem sobre as suas obras. Ele sabe o que o direito informao, mas no o reduz aos seus prprios direitos, que considera limitados pelos seus deveres. No conheo fotografia sua na qual a dor, o desespero, a clera, a volpia ou a intimidade sejam explorados. E, no entanto, as suas fotografias so capazes de nos revelar as pessoas, os sentimentos, as situaes. Tanto mais que Alfredo Cunha tem simpatias humanas, culturais e polticas. Fotografa os poderosos por profisso, os sem poder por vocao. Visivelmente, prefere os pobres, os que trabalham, os que sofrem. Nestas condies, muito difcil no fazer fotografia empenhada, engag, com funo e utilidade polticas. Ele consegue-o. Fotografa um lado da condio humana, mas f-lo sem pragmatismo, sem intuito propagandstico, sem outra inteno que no seja mostrar-nos o que ele v, como ele v, o que ele prefere. No encontro, a Sun apresentou a sua nova estratgia The Road to Java e o projecto Java Centre Program, iniciativas que pretendem promover a adopo do Java pelas empresas, integrado numa arquitectura de network computing (baseada na rede e no conceito de thin clients). 
Ao abrigo do Programa de Centros Java (Java Centre Program), a Sun vai continuar a abrir Centros Java (Java Centres), geridos pela prpria Sun, e Centros Autorizados Java (Authorized Java Centres), geridos por parceiros, que fornecero servios profissionais para instalao de projectos baseados em Java nas empresas. Neste momento, j existem cerca de 225 centros destes em todo mundo, onde cerca de trs a cinco mil profissionais oferecem um conjunto de servios de consultoria, integrao de sistemas, formao e manuteno. A sua unificao, agora, ao abrigo do Programa de Centros Java, uma tentativa da Sun de reorganizar os servios j existentes na rea das solues de network computing baseadas em Java -- explicou a COMPUTADORES Mark Tolliver, vice-presidente responsvel por Market Development. Ao que o PBLICO apurou, as razes que tm levado o ncleo duro da metalrgica a opor-se entrada da Cortal / Seldex no elenco dirigente tem pouco que ver com o receio de um eventual controlo da empresa pelo concorrente, e muito mais com a inconvenincia de desvendar estratgias empresariais em preparao. Com efeito, nos ltimos anos a F. Ramada tem tomado uma srie de medidas de reestruturao e modernizao. o caso da diminuio substancial do quadro de pessoal, que passou de quase 1200 trabalhadores em 1987 para os actuais 600, o que custou quase meio milho de contos empresa. Tambm a perda do mercado angolano levou a administrao da F. Ramada a preparar, atravs da recm-formada empresa de import-export Ramada Internacional, a conquista de mercados alternativos, nomeadamente os dos pases do Magrebe, da Amrica do Sul e da Espanha, esta ltima uma das grandes apostas, precisamente, da Cortal. H ainda a inteno de avanar com um projecto imobilirio no centro de Ovar -- nos terrenos da antiga sede da empresa --, com o que a administrao conta arrecadar 2,5 milhes de contos, para um investimento calculado em um milho. A Organizao dos Pases Exportadores de Petrleo (OPEP) vai pedir a todos os seus membros, na prxima reunio, marcada para 13 de Fevereiro, para que reduzam a produo de crude, de forma a inverter a tendncia de queda do preo do barril. Segundo o seu presidente, o venezuelano Alirio Parra, so necessrios cortes da parte de todos os Estados, pelo que se impe a sua adeso sem excepes a esta medida. Problemas: Distribuio e venda de droga. Obviamente que no quero dizer que as carcias de uma me possam ser substitudas de forma mecnica. O contexto onde as coisas se passam importante e nada disto quer dizer que os elementos psicolgicos no sejam importantes, observa Uvnas-Moberg. Mas o que eu digo que, quando h, por exemplo, uma criana que est numa incubadora e se no pode contar com a estimulao sensorial que a me normalmente lhe concederia, conveniente proporcionar-lhe essa estimulao. A investigadora lembrou que tambm existem drogas que tm o mesmo efeito. Uma droga que nos deixa ligeiramente sedados, que aumenta a nossa capacidade de interaco social, o lcool em baixas doses, que tambm faz subir a oxitocina, continua Uvnas-Moberg. H tambm drogas que propiciam a libertao de oxitocina, como certos neurolpticos, e existe tambm oxitocina em ' sprays ' nasais, que certas mulheres usam para facilitar a amamentao, quando h problemas na produo de leite. A revista comeou a ser feita exclusivamente por Lourdes Castro e Ren Bertholo. Os primeiros nmeros eram como que uma carta aos amigos. Depois, iniciou-se a colaborao de todos os elementos do grupo e de muitos outros portugueses, ligados colaborao escrita, emigrados ou no, como Joo Vidal, J. M. 
Simes, Jos Gil, Helder Macedo, Nuno Bragana, Cristovam de Pavia, Alfredo Margarido, Antnio Areal, Jos-Augusto Frana; ou estrangeiros, como Andr Pieyre de Mandriargues, Karl Laszlo, Benjamim Peterson. o 11, Primavera de 1963, homenageia Yves Klein). O trabalho fazia-se no atelier -casa de Lourdes e Ren, que possuam uma mquina de impresso serigrfica onde todas as imagens originais da revista eram realizadas. A ttulo de exemplo, citem-se originais (que os assinantes da revista recebiam devidamente numerados, assinados e datados) de todo grupo e ainda de Vieira e Arpad, Saura, Millares, Peter Saul, Corneille, Tinguely, Klein, um objecto op de Soto, postais sobre desenhos originais de Le Parc, Cruz-Diez, Alberto Greco, Aleschinsky, Telemaque, etc. Tendo-se tornado difcil avanar com um colectivo to alargado e dispersivo, Lourdes e Ren resolveram restringir a concepo da revista a eles prprios, Christo e Voss e acordaram em que cada nmero seria da responsabilidade individual de um deles, com a colaborao dos restantes. Assim concebem e realizam os melhores nmeros da revista (do 9 ao 12, sados entre 1962 e 1963), sob a sucessiva direco de Voss, Ren, Christo e Lourdes Castro. e, finalmente, Lourdes, que iniciava a explorao das suas sombras, concebe uma obra essencialmente visual: mais de 50 postais organizados em pginas picotadas, onde o recorte e o contorno so decisivos, mas a imagem fotogrfica se revela tambm essencial. O secretariado da comisso poltica distrital de Leiria do PS rene-se na prxima segunda-feira e a substituio do governador civil ser um dos temas a abordar, como afirmou ao PBLICO o seu presidente, Jos Canha. Apesar de a escolha para governador civil de Leiria recair sobre Alberto Costa, Jos Canha informou que a comisso poltica de Leiria tem mantido contacto com o ministro nos ltimos dias, com o intuito de lhe transmitir o seu parecer. O peso (a durao) diferente que d a cada som cria uma riqueza fascinante na iluso de timbres e de dinmicas. A manipulao do tempo (do ritmo, da essncia do fenmeno musical) recria todas as outras dimenses do som, numa capacidade de reinventar cada arquitectura musical que fez deste recital um acontecimento nico. Voluptuoso na suite em r menor de Louis Couperin, Leonhardt tornou o prlude non-mesur de Anglebert numa verdadeira dissertao filosfica, cujo final, no seu despojamento elptico, surgiu como um ponto de interrogao metafsico e perturbante. Para cada trecho de Franois Couperin, Royere, Forqueray, um esprito mais comedido no prescindiu de enorme fantasia, por vezes de transbordante emoo numa atitude comunicativa e interiorizada, simples e aristocrtica, reveladora de uma fascinante personalidade de intrprete. O resultado da sua primeira aplicao poder ser visto, em breve na Cidade Universitria, em Lisboa, local onde ir ser construda, com base neste molde de beto, uma galeria tcnica, cuja concepo permitir diversas intervenes no subsolo sem qualquer incmodo para os transeuntes. Trata-se, segundo o projectista Martins de Oliveira, que concebeu o molde, de uma conduta visitvel, capaz de reduzir ao limite mximo todos os inconvenientes produzidos pelas permanente operaes de manuteno levadas a cabo pela Cmara Municipal, EDP, Telecom e TLP. Com este tipo de modelo, a renovao das vrias redes, ao nvel do saneamento bsico, sero executadas sem perturbar terceiros, explicou Martins de Oliveira. 
Embora as aces dos trabalhistas, enquanto oposio, tenham sido extraordinariamente cautelosas em relao Irlanda do Norte, no poder podero enfrentar um cessar-fogo renovado tendo a completa noo das consequncias de um possvel falhano. E enquanto por um lado a administrao trabalhista no depender dos unionistas para obter o apoio parlamentar, por outro, necessitar que eles se mantenham comprometidos para assegurar o avano das negociaes de paz. Se o IRA apelar a um cessar-fogo aps as eleies muito provvel que se d incio a um segundo processo de paz. Mas ser que ir ser muito diferente do primeiro, e quais so as suas hipteses de sucesso? O sismo de segunda-feira em Los Angeles no poupou ningum, nem mesmo as estrelas. Michael Jackson fracturou a cabea e os dedos da mos, a cantora Dolly parton ficou sem nariz e os actores Oliver Hardy e Mae West perderam acabea- mas no literalmente. Todas estas tragdias artsticas deram-se com as rplicas daquelas estrelas no museu de cera da cidade, que no escapou aos efeitos do sismo. Sylvester Stallone foi uma das figuras que se manteve impassvel e no se desmoronou enquanto a terra tremia. Em carne e osso como em cera, eis um homem impassvel. O secretrio de Estado da Integrao Europeia, Vtor Martins, seguir hoje directamente de Bruxelas (onde preside ao Conselho do Mercado Interno) para Rabat (Marrocos), para manter conversaes com o ministro marroquino dos Negcios Estrangeiros sobre os resultados do Conselho de Ministros dos Estrangeiros dos Doze que se realizou a 17 de Fevereiro em Lisboa, onde foi aprovada uma proposta visando a concluso de um acordo de comrcio com Marrocos. A par de um medicamento antiacidez, o grupo aconselha especificamente dois antibiticos de entre a Claritromicina, Metronizadol e Amoxilina, prescritos apenas durante uma semana. que se receia que a escolha de um tratamento incorrecto e disseminado escala planetria faa aumentar a resistncia da bactria aos antibiticos. O grupo tambm recomenda fortemente a eliminao da bactria em doentes com um tumor raro chamado linfoma de MALT (Tecido Linfide Associado Mucosa), quando ainda est numa fase de baixo grau de malignidade. Vrios estudos tm mostrado que estes linfomas regridem depois de tratamentos dirigidos HP, o que sugere uma relao entre os dois. A Assembleia da rea Metropolitana de Lisboa (AML) continua a debater-se com o problema da falta de instalaes, mas pelo menos j tem mesa. A eleio decorreu ontem, num acto com algumas peripcias, mas sem grandes surpresas, j que se cumpriu o acordo previamente estabelecido entre a CDU e o PS. Se no se der qualquer fiasco de ltima hora, provvel que o PDL mantenha uma maioria confortvel dos 299 lugares da Assembleia. Os coreanos pensam, em geral, que a poltica no devia ser controlada pelos homens de negcios poderosos, afirma Ahn Byung Joon, professor de Cincias Polticas na Universidade Yonsei de Seul. e uma vitria do PUN com essa amplitude parece perfeitamente plausvel. Chung deu voz a uma poderosa corrente de insatisfao, especialmente entre os eleitores das classes mdias. Embora a economia da Coreia do Sul tenha crescido cerca de nove por cento no ano passado, so muitos os cidados que vem com preocupao o dfice comercial do seu pas, que de 8,8 mil milhes de dlares [ cerca de 1250 milhes de contos ]. Chung afirma que o problema haver demasiada interferncia do Governo nas finanas e poucas medidas contra as barreiras proteccionistas dos japoneses. 
O pblico no deve ser desinformado por esta campanha publicitria, disse o comissrio para os Assuntos Sociais, Padraig Flynn. os no fumadores devem ter prioridade. Estas declaraes surgem a poucos dias de, mais uma vez, o Conselho de Ministros da Sade da Unio, discutir uma proposta que visa banir completamente a publicidade ao tabaco. Em Portugal e na Itlia esta publicidade j no possvel, mas em vrios outros pases da Europa ainda persiste, embora com restries diversas. As propostas abolicionistas tm encontrado a oposio de lobbys tabagistas gregos e espanhis. Esperam-se, agora, as oportunas reportagens do golpe de 1974 sem tanques nas ruas, nem cravos, nem vivas ao MFA. E mesmo assim de madrugada, com bola branca a avisar os mais sensveis. Nessa altura a RTP ter atingido a perfeio. E ns teremos esgotado a pacincia ... Teatro e msica, pelo grupo Meia Preta, pelas 22h00, na Fortaleza, em Armao de Pra. Em Lagoa, inaugurada a Fatacil -- Feira de Artesanato, Turismo, Agricultura, Comrcio e Indstria. Tomislav Ivic, que foi treinador do Benfica e do FC Porto, esteve na Luz na quarta-feira a assistir ao jogo entre as suas duas antigas equipas [ 1 mo da Supertaa ]. Penso que o Benfica tem muitos talentos, tem belssimos jogadores, mas penso tambm que o FC Porto, pelo que vi na quarta-feira, mais agressivo sobre a bola do que o Benfica. A exposio abre com o quadro de Jlio Pomar Fernando Pessoa e inclui, entre outras, obras de Carlos Botelho, Joo Abel Manta, Joo Hogan, Bernardo Marques, Mrio Eloy e Maluda. Inaugurao de uma exposio de escultura de Rui Vasquez. s 19h30 no Hotel Alfa, em Lisboa. Patente ao pblico at 10 de Maro. Foi descoberto em 22 de Maro deste ano por astrnomos americanos, quando um rasto da sua cauda foi avistado na atmosfera de Jpiter. Em Julho do prximo ano, muitos dos telescpios do mundo inteiro vo querer assistir ao seu fim. Mais de 20 filmes esto em competio a partir de hoje na 46 edio do Festival de Cinema de Locarno, que se realiza at 15 de Agosto. Da lista fazem parte filmes como Au nom du Christ, do realizador Roger Gnoan M'Bala (Costa do Marfim), Beijing Zazhong, de Zhang Yuan, considerado o primeiro filme punk-rock chins, Bhaji on the Beach, de Gurinder Chadha (Gr-Bretanha), Koraku Zaru, do japons Kenchi Iwamoto, L'crivain public, de Jean-Franois Amiguet (Frana), L'Ordre du jour, do palestiniano Michel Kheifi, La Ribelle, do italiano Aurelio Grimaldi, La Vida Conyugal, do mexicano Lus Carlos Carrera, e Marcides, do egpcio Yousry Nasrallah, entre outros. Para o Leopardo de Ouro e o Prmio da Cidade de Locarno, o jri deste ano constitudo pelos realizadores Chantal Akerman (Blgica), Olivier Assayas (Frana), Kathryn Bigelow (Estados-Unidos), Ferid Boughedir (Tunsia), Alexei Guerman (Rssia), alm de um pintor italiano e de um produtor da televiso sua. O festival prope ainda um panorama sobre a produo sua recente, filmes sobre filmes na seco Cinma-Cinmas, uma homenagem ao realizador italiano Valerio Zurlini e a primeira retrospectiva integral da obra de Sacha Guitry. Para ressarcir a Cmara das despesas a suportar, bem como do direito montagem do equipamento, ir ser estabelecido um contrato de comodato de um bloco sanitrio e dois abrigos para passageiros. Em princpio chegou pensar-se instalar um jornal electrnico, mas isso implicava a compra dum computador demasiado caro para o municpio. 
Apesar de algumas informaes que do conta alguns casos em que o nome da firma aparece citado, o presidente da Cmara de Alcobaa, Miguel Guerra, referiu que o contrato ainda no estava assinado, e que a empresa tem tido um comportamento impecvel com a Cmara. P. -- Pode querer fomentar a criao de um grupo financeiro, tal como o Banif pretendeu fazer. R. -- At agora, a poltica do Governo para as privatizaes tem sido orientada de modo a obter o melhor encaixe financeiro. A forma de o conseguir pr venda as aces que ainda detm nas empresas nas melhores condies. No existem motivos para alterar, agora, a sua estratgia. um homem alto e magro, mos de dedos esguios e gestos largos a abraar desenhos, projectos, frases. Numa voz grave, falou sobre o Oceanrio de Lisboa e depois, descontrado, perdeu-se a contar coisas dos tempos da infncia e juventude, os caminhos que foi tomando e que o levaram a ser considerado o maior especialista do mundo em grandes aqurios. Depois de ter feito cinco grandes aqurios e muitas outras obras de dimenses monumentais, confessa que a coisa mais importante que fez foi mexer no metropolitano de Boston, remodelar toda a rede, torn-la agradvel e simples de entender. Fala aqui um pouco de tudo, das cobras da infncia em Nova Iorque s preocupaes sobre a corrida aos armamentos nos anos 60, do prazer de ver golfinhos a brincar livremente, de vulces e glaciares a esculpir a paisagem da Terra. Ao falar de Portugal, comea subitamente a trautear um fado de Amlia, de um disco que comprou h muitos anos, no sabe onde nem porqu, e que gosta de ouvir e de cantar sem saber que palavras so aquelas. Um destes trgicos erros foi o maniquesmo. Este fazia-nos acreditar que os nossos eram filhos da luz e os adversrios, das trevas. Pensvamos que tudo o que nos diziam era razovel e verdadeiro, mesmo sem o ter meditado, e que os nossos adversrios estavam sempre enganados. Nessa base, propnhamos projectos no global, com os quais pretendamos resolver todos os problemas do pas. Foram, principalmente, as atitudes sectrias e exclusivistas que nos levaram derrocada democrtica, entre outros factores. Em 1993, pensamos de forma diferente. Procuramos acordos, tratamos de conciliar posies, estamos receptivos para compreender e para aceitar critrios diferentes dos nossos. A criao de um observatrio com indicadores de convergncia, que dever abranger mais de uma dezena de sectores, um dos projectos a concretizar com a nova fase de intervenes estruturais da Comunidade. Atravs deste rgo, que funcionar como base estatstica, as autoridades portuguesas pretendem acompanhar o percurso de convergncia de cada um dos sectores, com base em trs grandes grupos de indicadores: estado de situao, performance conseguida e investimento. Alm do estudo dos sismos e das caractersticas tectnicas das regies onde eles tm lugar, as exploses nucleares podem ainda servir outros fins. De facto, h muitos anos que h quem proponha a utilizao de pequenas bombas nucleares em certas grandes obras de engenharia como terraplanagens de montanhas ou abertura de albufeiras. Recorde-se alis que, em Dezembro de 1992, quando a China realizou a maior exploso no nuclear de sempre -- arrasando a montanha Paotai, situada na Zona Econmica Especial de Zhuhai, na ilha de Sanzao, 25 quilmetros a sudoeste de Macau --, na qual foram gastas 11 mil toneladas de dinamite, no faltou quem especulasse sobre a possibilidade de um dia realizar obras desse tipo com engenhos nucleares. 
A ex-Unio Sovitica, que levou a cabo um ambicioso programa de exploses nucleares subterrneas pacficas durante anos, parece ser a regio do globo onde este tipo de know-how foi mais desenvolvido, ainda que se desconhea o nvel actual de domnio desta tecnologia. Ainda que a grande preocupao do pblico se costume situar ao nvel da radioactividade libertada, no caso das exploses subterrneas os especialistas dizem que ela pode ser facilmente controlada, havendo, porm, o perigo de se contaminar lenis subterrneos de gua. Por outro lado, o que alguns receiam que uma forte exploso deste tipo possa provocar, ao nvel das placas, fragilidades que venham a tornar mais graves as consequncias de um futuro sismo natural. trata-se das Laffer Utilities (Laffer o nome de famlia do heri), apresentadas como um conjunto de programas para perder tempo quando devia era estar a trabalhar. Algures nos textos do programa l-se que ele capaz de automatizar tudo o que voc faz no escritrio e no tem que ver com trabalho. Com uma base de dados de clip-art, um programa de impresso de apostas, uma seleco de anedotas temticas, um criador de cartazes e algumas surpresas mais, as Laffer Utilities so uma boa maneira de ... perder um emprego. Lembrar as razes foi ainda preocupao de Al Lowe ao preparar este disco, por isso mesmo, a par com alguns textos seus sobre os anos passados a criar Larry, uma divertida entrevista em vdeo, Al Lowe Undubbed, para a televiso alem -- e um vdeo e demo interactivo de Freddy Pharkas, um jogo de All Lowe -- h ainda espao para lembrar o jogo que est na origem do azarado heri da srie: Softporn Adventure. O texto refere-se, nomeadamente, s especificidades destes reclusos e constata que a comunidade prisional, intermediria da sano social atravs da privao da liberdade, passou a acolher uma populao difcil que ora se revolta, ora se adapta, ora se refugia na doena fsica ou psicolgica. E que efeitos se podem esperar da aplicao da pena de priso queles que cometeram delitos pela exigncia da toxicodependncia? --interroga a diviso de estudos e planeamento da Direco Geral dos Servios Prisionais. Bucareste acusou a Hungria de ter pretenses territoriais na Transilvnia -- uma regio atribuda Romnia em 1920, aps o fim do Imprio Austro-Hungaro -- e de apoiar a concesso de autonomia s regies dominadas pelos magiares romenos, uma vez que esta legitimaria a sua ingerncia nos assuntos internos de outro pas. Para mais, o Governo hngaro recusou at agora a assinatura de um acordo sobre a intangibilidade das fronteiras dos dois pases vizinhos. O dilogo poltico a nica via para melhorar as relaes entre a Hungria e a Romnia, afirmou quarta-feira Teodor Malescanu. Mas as palavras do ministro romeno foram recebidas pelos responsveis da UDMR como uma jogada de oportunismo. Isto porque o problema da minoria hngara denegriu a imagem internacional da Romnia, que tenta a todo o custo fazer aprovar o seu pedido de adeso ao Conselho da Europa (CE), que ser votado no incio de Outubro. O objectivo do acordo consolidar o cessar-fogo assinado o ms passado, e proceder troca de prisioneiros entre os dois campos. A guerra civil no Tadjiquisto, que fez mais de 50 mortos, comeou em 1992, quando as foras do neo-comunista Rakhmonov derrubaram o governo dos islamistas, que, nas ltimas semanas, tm somado vitrias e se aproximam perigosamente da capital. No Dubai, as vedetas candidatas a dividir um bolo de um milho de dlares so Goran Ivanisevic, Thomas Muster e Boris Becker. 
Mas as atenes sero certamente divididas com a armada espanhola, composta por Carlos Moya, Albert Costa e Flix Mantilla. Em Marselha (539 mil dlares), os favoritos so Marcelo Rios, Thomas Enqvist e Michael Stich. Por c, prossegue a segunda etapa do circuito satlite CESAE nos courts rpidos da Associao Acadmica de Coimbra. Dos portugueses presentes no quadro final, apenas dois actuaram ontem e ambos foram eliminados. O campeo nacional Bruno Fragoso perdeu com o eslovaco Boris Borgula por 6-1, 7-6 (7-3) e Tiago Vinhas de Sousa foi derrotado pelo brasileiro Mrcio Carlsson por 6-2, 6-0. Contudo, as alteraes no se ficam por aqui. Para conseguir que a originalidade volte ao local, o arquitecto quer ainda restaurar a chamada Casa do Nuno (onde habitava Nuno, o filho de Camilo) e onde actualmente funciona a sede da junta e o posto mdico. Apesar da discusso que estas obras esto j a merecer na freguesia, o restauro da igreja paroquial , sem dvida, o assunto de que mais se fala. Desde h j alguns anos que chove a eito na centenria Igreja de Ceide. O ltimo Inverno foi j muito difcil de passar e, recordam os paroquianos, que chovia tanto dentro da igreja como c fora. O padre Gabriel Pereira Lopes, proco da freguesia, queixa-se tambm do facto da igreja ser muito pequena e por isso, e para alm do programado restauro, entrou na Cmara de Famalico um proposta para a ampliao do templo. A Comisso de Obras da Igreja de Ceide quer alargar o edifcio de uma forma que no ter agradado a Siza Vieira. O arquitecto esteve j, por vrias vezes, no local e ter manifestado a sua oposio modificao da estrutura da igreja, defendendo antes a manuteno do actual edifcio. Mais quatro pessoas foram mortas ontem num bairro pobre de Istambul, durante os recontros entre manifestantes alevitas e a polcia, pouco antes de representantes de uns e outros terem chegado a um acordo pouco auspicioso. As ltimas mortes de que h notcia ocorreram no bairro de Umraniye, na margem asitica da maior cidade turca, onde cerca de 1500 alevitas saram para a rua para protestar contra a interveno da polcia num outro bairro, Gazi Mahallesi, onde nos ltimos trs dias morreram entre 17 e 23 pessoas em manifestaes. E, em sntese, remeteu para o seu advogado, Lus Avides Moreira. A posio de Brando sustenta-se numa carta da Lacto-Lusa, de 6 de Agosto de 1993, que anula o contrato com a Socifa. Inteno que foi aceite pela sociedade, conforme carta do dia 16. E alarga o rol dos lesados: os credores da Socifa e o prprio fisco, por no pagamento do IVA devido na prestao de um servio. Silncio sepulcral, s os pssaros se ouvem. Mas Manoel de Oliveira continua a ouvir rudos. um barco que vai a passar no mar, murmura um assistente. Aguarda-se que o barco se confunda ao longe com o horizonte .. Sustm-se as respiraes. Ser desta? Malkovich (Michael) e Deneuve (Hlne) so um casal -- ele americano, ela francesa -- que vem a Portugal porque Michael, historiador, julga que pode encontrar na Arrbida documentos que lhe comprovem uma tese segundo a qual Shakespeare era um judeu de origem espanhola. No convento, o casal depara com alguns estranhos personagens: Baltar -- Lus Miguel Cintra --, o director dos arquivos; o seu assistente, Baltazar -- Bnard da Costa --, e a cozinheira -- Elosa Miranda. Para os cubanos, o Che um semi-Deus vindo de outro lado que quis fazer deles homens novos inventando o trabalho gratuito, a abnegao poltica, o internacionalismo. Mas que resta hoje destes sonhos? 
Um monte de boas intenes abandonadas junto dos caixotes de lixo das cafetarias em dlares, onde jovens e velhos vasculham, ou pisadas pelos saltos-agulha das centenas de jineteras (prostitutas) do Malecn e da Quinta Avenida. Aos 15 anos, muitas crianas do Che deixam a escola primria e encontram-se na rua para buscarse la vida de qualquier forma. Para os antigos, o Che est prestes a morrer uma segunda vez. preciso um meio dia de trabalho a um operrio para ganhar a famosa nota vermelha de trs pesos com a efgie do Che que lhe permitir comprar trs bananas no mercado livre campons. O dlar e o marco fixaram respectivamente a 159.775 e 87.35 escudos face aos 156.942 e 86.871 verificados na sesso anterior. Este ano vai ser aberto um concurso para projectos de investigao sobre a desigualdade entre homens e mulheres na sociedade portuguesa. O objectivo conhecer melhor os fenmenos e as tendncias da desigualdade entre sexos, para que esse conhecimento possa servir de base tomada de decises polticas. Temos um exemplo logo aqui ao lado na nossa vizinha Espanha, onde os jogadores recebem bem, mas jogam futebol. Como bonito ver os estdios cheios e os espectadores verem bom futebol. C em Portugal, o que vemos? Fraudes, mortes nos campos de futebol, rbitros que so tendenciosos a favor de quem querem e outras coisas mais que s trazem vergonha ao nosso futebol. A polcia efectuou, na manh de ontem, nada menos do que oito detenes no Bairro de S. Tom, em Paranhos, na sequncia de vrias buscas domicilirias. A operao policial, que teve incio cerca das 9h00, visou detectar indcios de trfico de estupefacientes. Num dos domiclios, os agentes detiveram trs homens (um estucador de 56 anos, um padeiro de 19 e um empregado de balco de 24) e trs mulheres (uma reformada de 50 anos, uma estudante de 16 e uma tecedeira de 22), aparentemente da mesma famlia, tendo sido ainda apreendida uma aprecivel quantidade de droga. a outra rota por barco para portos como Southampton, Roterdo e Anturpia. ser correcto transaccionar fsseis com coleccionadores particulares, enquanto os paleontlogos (envolvidos muitas vezes em descobertas importantes) esto impedidos de fazer dinheiro com os seus achados? No poder esta situao levar corrupo da classe? Esta a altura ideal para discutir tica, disse ao PBLICO Lawrence Flynn, da Universidade de Harvard. O PBLICO apurou tambm que as negociaes entre a Marconi e a CN sobre a redistribuio das participaes estatais j esto bastante avanadas. A reestruturao do quadro accionista na TMN dever constituir um dos primeiros passos. A Telecom Portugal poder vir a assumir uma participao significativa no operador de telemveis -- ainda que inferior a 50 por cento -- e, em contrapartida, a Marconi ter manifestado, mais uma vez, o seu interesse no trfego internacional para a Europa. Actualmente, o capital da TMN est distribudo, em trs partes iguais, pela Telecom, pela TLP e pela Marconi. A assembleia geral da Marconi deu tambm luz verde emisso de papel comercial por parte da empresa -- que dever acontecer a breve prazo --, para alm de ter deliberado no sentido de os ttulos da companhia passarem a revestir a forma de aces escriturais, de modo a permitir uma maior liquidez do papel. Mrio Carrascalo desempenhou as funes de governador de Timor-Leste de 1982 a 1992. Nessa altura, por causa do massacre do cemitrio de Santa Cruz, descolou do regime de Jacarta, passando a manifestar algumas posies crticas, nomeadamente em relao s foras de segurana. 
Foi substitudo por um timorense bastante mais pr-Indonsia, o actual governador Ablio Osrio Soares. As crticas custaram-lhe o exlio, tendo sido nomeado embaixador da Indonsia na Romnia. Em 1997 regressou a Timor-Leste. Reencontrou-se com o seu irmo Manuel, tambm ele um timorense pr-integrao que progressivamente foi caminhando ao encontro das posies da resistncia. numa atitude indita, Robin Cook telefonou ao seu colega portugus antes de embarcar para Jacarta, para falar sobre Timor. Um dos assuntos abordados nessa ocasio foi a venda de armamento britnico ao regime de Jacarta. Nos meios diplomticos h, alis, quem interprete estas manifestaes de sensibilidade em relao questo de Timor por parte do Executivo de Tony Blair com uma tentativa de compensar a m imagem criada com as significativas exportaes de armas made in England rumo Indonsia. Apesar de ter herdado do Executivo de John Major as mais polmicas licenas de exportao de armas para Jacarta, o Executivo trabalhista no escapou s crticas dos activistas pr-direitos humanos, que acusam os novos governantes de no terem travado os negcios, nomeadamente a venda dos caas Hawk fabricados pela British Aerospace e alegadamente utilizados em Timor. No entanto, apenas trs meses depois da entrada em vigor do novo horrio de funcionamento do Mercado de Manuel Firmino, que estipulava um novo perodo de abertura entre as 17h00 e as 20h00, foi decidido voltar a encerrar o mercado da parte da tarde. A reivindicao partiu de alguns comerciantes, desagradados pelo facto de a abertura do mercado tarde no ter atrado novos consumidores. E nem a campanha publicitria desenvolvida em rgos de comunicao regionais, promovendo o mercado e dando conta do alargamento do seu horrio de funcionamento, levou mais pessoas ao Manuel Firmino. O fracasso desta medida ditou que, desde o incio deste ms, o mercado voltasse a funcionar apenas da parte da manh, encerrando portas s 14h00. Um dos comerciantes no poupa crticas Cmara, considerando que a campanha publicitria foi mal conduzida, que as obras esto atrasadas e que a alterao do horrio deveria ter sido feita aps a concluso das obras. Nem sequer puseram ' placards ' pela cidade, desabafa este comerciante, que considera que os anncios do alargamento do horrio da parte da tarde deveria ter sido mais publicitado. E acusa ainda a Cmara Municipal de no ter investido nos ltimos anos no Mercado de Manuel Firmino, por pretender que os comerciantes se transfiram para o novo Mercado Municipal de Santiago. O facto causou a maior indignao junto dos estudantes daquela faculdade, cuja associao de estudantes se apressou, alis, a colocar cartazes em todos os locais da escola avisando os recm-chegados mais distrados para que no passem cheques em branco. Apesar do grotesco da situao, qualquer caloiro que procurasse saber das diligncias que necessita de efectuar para se inscrever em Cincias, deparava com uma longa lista de preceitos, intitulada Aviso e que explicava que todos os colocados na faculdade no ano lectivo de 1994/95 (1 ano/1 vez) faro a sua matrcula por via postal (correio registado), ao que se seguia uma listagem dos documentos a enviar. A meio da lista, era solicitado o envio de um cheque emitido a favor a Faculdade de Cincias da Universidade de Lisboa, devidamente identificado com o nome do aluno no verso, para pagamento de propinas. De montantes, nada vem referido. 
Ora, conforme explica Nuno Bio, elemento da Associao de Estudantes, ainda no est fixado o valor das propinas a pagar pelos estudantes, ainda no se sabe quem ter direito a reduo, o regulamento interno da faculdade ainda nem sequer foi elaborado. Alis, aos alunos-caloiros era tambm pedido o envio do requerimento de reduo de propinas se pretender apresent-lo. Nuno Bio no tem dvidas de que se trata de uma tentativa frustrada por parte dos orgos competentes da faculdade de obrigar os caloiros, menos informados destas coisas, a pagar, para poder afirmar depois que h muita gente a pagar propinas. Um dos objectivos de Portugal nas negociaes do Uruguay Round do Acordo Geral de Tarifas e Comrcio (GATT) poder ter uma soluo positiva, dada a proposta que os Estados Unidos ontem apresentaram. Trata-se da aprovao de um perodo de 15 anos para a abertura das barreiras alfandegrias ao sector txtil. A Teleweb, uma nova empresa portuguesa fornecedora de acesso Internet, anunciou estar disposta a investir um milho de contos de forma a atingir o seu objectivo de 50 mil clientes no ano 2000. A empresa totalmente detida pela Finantel SGPS SA, que detm participaes na Ensitel e na NCM-Nokia e atingiu em 1997 um volume de vendas consolidado de 9,2 milhes de contos, com resultados lquidos de 244 mil contos. Andava pela rua de cabea perdida. Com cinquenta e trs anos, dizia ele de si para si, portava-se como um garoto. Que vergonha! Mas no via nenhuma sada. No momento em que chegou ao jardim pblico da rua Au-Sable viu, numa lea deserta, uma silhueta magricela. Aproximou-se e o corao comeou a bater com fora. Era mesmo o Pierre-Jean, da outra vez. Subiram-lhe as lgrimas aos olhos, e por pouco no beijou o seu antigo conhecido. O outro esquivou-se delicadamente, mas convidou-o a subir a bordo. Aparentemente imparvel nos seus projectos de expanso (para grande dor-de-cabea das marcas europeias) o anncio da Honda surgiu uma semana depois do acordo conseguido entre o Japo e a Comunidade Europeia para a abertura gradual do mercado aos carros nipnicos a partir de 1992, e dois dias depois da morte do seu Oyaji-san (o grande pai), o homem que via na Europa o grande mercado a conquistar. Soichiro Honda foi o homem que construiu um imprio, que encarna o renascimento da indstria japonesa no ps-guerra, desde a pequena fbrica de motocicletas, em 1947, ao grupo que factura hoje 4,5 mil milhes de contos e tem 13 milhes de contos de lucros. A par do seu envolvimento com empresas de efeitos especiais, como o caso da ILM, alguns projectos recentes contriburam para dinamizar a companhia. o caso da associao com a Time Warner Cable para a concretizao da TV interactiva em Orlando (na Florida), com a Nintendo para a criao do novo sistema de videojogos Ultra 64, ou com a Sprint of the Drums para criar uma rede de alto dbito para a indstria do entertainment produzir filmes ou material de publicidade. Com a AT&T, criou a joint-venture Interactive Digital Solutions para o desenvolvimento e oferta de solues de vdeo interactivo, servios de informao ou de entretenimento, atravs das redes telefnicas e de TV por cabo. Foi tambm a primeira empresa nomeada pela companhia telefnica japonesa NTT para proporcionar servios interactivos na rede digital que este operador est a instalar.
Com sede em Mountain View, na Califrnia, a sua presena em feiras ou conferncias internacionais usual e, no Siggraph deste ano (a 21 edio do maior certame actual de informtica, com 25 mil visitantes), foi o expositor com mais espao. A SGI aproveitou para anunciar a criao de uma nova empresa -- a Silicon Studio -- para o desenvolvimento de aplicaes para os novos mercados de ' media ' digital. A nova empresa trabalhar com programadores, criadores artsticos e distribuidores na expanso das suas actividades correntes nos domnios do filme, vdeo, TV interactiva, vdeojogos ou parques de diverses. Crimos a nova subsidiria para promover o crescimento da SGI no mercado do ' entertainment ', referiu revista Post Update Mike Ramsay, presidente da Silicon Studio, que diz pensar que a tecnologia para a rea do ' entertainment ' comandar o mercado nos prximos cinco anos. Documentos internos da empresa apontam o mercado da TV interactiva e dos videojogos como aqueles que, a longo prazo, vo ter uma maior procura, seguindo-se ento o vdeo e o filme digital e, por ltimo, os efeitos especiais. A Renault Portuguesa tem actualmente em curso um processo de despedimento colectivo de 144 trabalhadores. Os administradores nomeados pelo Estado portugus votaram contra esta deciso, mas os franceses, maioritrios, decidiram ir em frente, o que constitui mais uma prova do confronto aberto instalado na empresa. Como os despedimentos ainda no foram consumados, a Renault continua com 765 trabalhadores e com uma produo diria de 160 automveis do modelo Clio. A administrao da fbrica j fez no entanto saber que sua inteno reduzir a produo diria para 110 unidades. Entretanto, os trabalhadores da fbrica de Setbal realizaram uma greve na quinta-feira passada e marcaram nova paralisao para a prxima segunda-feira, que incluir uma marcha sobre Lisboa. A administrao francesa chegou a defender a venda da fbrica de Setbal a outro construtor automvel. O Governo portugus desenvolveu alguns contactos nesse sentido, e os coreanos da Kia chegaram mesmo a deslocar-se s instalaes. Mas as autoridades nacionais no viram grande receptividade e decidiram afastar essa hiptese, porque no queriam ficar com o nus de no conseguirem arranjar comprador, o que constituiria um trunfo para os franceses. Na concertao, h ainda muito a fazer, constatou Maria Jos Constncio, que prometeu o seu empenhamento pessoal para que a falta de concertao entre as entidades locais e o governo central deixe de ser um ponto fraco do PDI do Vale do Ave. H que coordenar a aco dos diversos ministrios e municpios. Em Portugal, infelizmente, os diversos ministrios e as autarquias tm os seus programas, mas, por vezes, no tm uma coordenao rigorosa, advertiu. Para evitar uma aplicao desgarrada de milhes de contos no Vale do Ave, Cravinho preconiza um sistema de gesto forte, que ponha todos a trabalhar de forma articulada para uma boa aplicao das verbas. Do lado portista, o dia foi calmo e sem problemas, entre Montechoro e as Aoteias, em cujo complexo desportivo a equipa fez ontem tarde o ltimo treino antes da partida. Tranquilidade absoluta, nenhuma hostilidade da parte de ningum, uma paz realmente sem mcula. Reinaldo Teles, que chefia a comitiva o nico dirigente presente e no se espera a presena de mais nenhum dos altos responsveis. Como o jogo d na televiso julgo que no vem mais ningum, disse-nos o homem que dirige o departamento de futebol portista. O deputado socialista Ferro Rodrigues mostrou-se descontente com as respostas do Governo, que considerou vagas e evasivas. 
O deputado disse mesmo que o o Governo est, deliberadamente, a querer desvalorizar politicamente um problema que um escndalo financeiro gravssimo que envolve muitos milhares de contos em fugas ao fisco. Tudo isto porque a resposta ao Grupo Parlamentar Socialista sobre esta matria foi dada pelo subsecretrio de Estado adjunto da secretria de Estado do Oramento, Vasco Valdez. Quando espervamos a presena do ministro das Finanas, Braga de Macedo, surgiu no o ministro, nem a secretria de Estado adjunta e do Oramento, mas sim um subsecretrio de Estado adjunto da secretria de Estado adjunta do ministro adjunto do primeiro-ministro, referiu o deputado socialista. Para Ferro Rodrigues, no se trata de uma questo de competncia, mas sim da forma como o Governo est a tratar este assunto. Palcio Galveias. Campo Pequeno. 3 a 6, das 10h s 19h. Sb., e dom., das 14h s 19h. Um conjunto de grandes telas, em que a exuberncia cromtica pretexto realizao de obras simpticas, precede a apresentao de um conjunto de estudos para os painis do Metropolitano de Lisboa -- entendidas as paredes como suporte de pintura, no sentido que possua antes de Maria Keil o transformar. no so todos os que intitulam um quadro Um co, talvez. Enquanto isto, foi ontem anulada a misso governamental angolana que deveria ter ido Jamba, quartel-general da UNITA, recolher parte dos presos de guerra. Abel Chivukuvuku, membro da Comisso Conjunta Poltico-Militar (CCPM) para a fiscalizao do cessar-fogo, alegou imprensa razes tcnico-logsticas para a anulao da viagem, relacionadas quer com o nmero reduzido de presos que desejaria voltar para Luanda quer com a programao da cerimnia. O silenciamento das emisses da RTPi na Guin-Bissau, aqui noticiado ontem e sbado, vem recolocar a questo das complexas relaes de Portugal com as suas antigas colnias. Um fantasma que se arrasta nestes 20 anos ainda marcados por uma descolonizao que foi -- ou teve de ser -- o que foi, verdade, mas que importa exorcizar de uma vez por todas. Sem complexos, mas com a coragem de um projecto srio, consequente e responsvel ao nvel da cooperao, em geral, e do audiovisual em particular. Foi o que no aconteceu, mesmo nestes ltimos tempos de obra feita do cavaquismo. R. -- O Presidente tem um prazo at Maro para o assinar e apresentar ao Congresso, como um pacote que dever ser aceite ou rejeitado. Caso ultrapasse esse prazo, o Congresso poder discutir cada alnea, o que pode adiar indefinidamente a sua aplicao. possvel que quem apresentou essa ideia considere que seja melhor para os Estados Unidos e para o mundo ter qualquer acordo do que no ter nada. O trabalho em famlia dos Hestons pai e filho (Charlton, o intrprete, e Fraser, o realizador) revelou-se bastante interessante na Ilha do Tesouro que h pouco vimos e justifica o interesse pela reincidncia, agora volta de outra figura clssica: Sherlock Holmes, investigando a desapario de um militar britnico, o que o leva a uma intriga de cobia e vingana. Legal Vdeo, 1992, 101 min. O painel considerou que os empreiteiros teriam menos tendncia para relatar eventuais problemas de segurana e exortou a NASA a manter a sua presena nas fbricas dos fornecedores. A NASA no se deve iludir pelo aparente sucesso inicial de todos os esforos de transio, sublinha o documento. O administrador da NASA, Daniel Goldin, ao fazer a sua leitura do relatrio afirmou que este tinha considerado que o programa de vaivns se encontrava de excelente sade. 
O painel tambm aponta algumas reas onde preciso continuar a colocar alguma nfase e outras onde necessrio melhorar, disse Goldin. A NASA concorda com essas afirmaes e j dei instrues para que sejam postas em prtica o mais depressa possvel. S recentemente e de uma forma restritiva, alguns tribunais comearam a considerar que os processos contra os jornalistas devem ser suspensos at que estejam decididos os processos contra aqueles, geralmente figuras pblicas, de quem falaram. A lei de imprensa, na sua actual verso, nestes como em muitos outros aspectos, fere gravemente o direito de ser informado dos portugueses, pelo que se aguarda que seja brevemente discutido na Assembleia da Repblica um novo projecto de lei de imprensa que confirme o reforo da liberdade de expresso, de informao e de imprensa. Orlando Miguel e Jos Pedro partiram de bicicleta, levaram sacos-cama, roupa, comida e algum dinheiro. O primeiro deixou um bilhete aos pais dizendo que no se preocupassem e que no ia fugir. Foram cerca de 500 jovens atletas, representando escolas e clubes de vrias regies do pas, incluindo a ilha da Madeira, que participaram no III Encontro Nacional de Andebol Feminino, j considerado o maior da modalidade, organizado pela Associao de Andebol de Lisboa. As expectativas dos promotores no tero sido goradas e a palavra sucesso era repetida, com insistncia, pelos organizadores. Creio que o encontro foi muito proveitoso. Para alm do convvio, as jovens atletas puderam praticar o seu desporto favorito e mostrarem as suas capacidades, afirmou ao PBLICO Isabel Cruz, da Associao de Andebol de Lisboa. 3 milhes de contos. Robert Koch, um Don Juan norte-americano de 51 anos, foi desmascarado e preso depois de ter burlado mais de 200 mulheres com promessas de casamento. Fazia-se passar por um vivo rico, propritario de fbricas e sedento de carinho, e, depois de conseguir que as suas presas lhe confiassem grandes quantias em dinheiro e jias, desaparecia sem deixar rasto. Reconhecendo que ningum melhor que os prprios sabem o que deve ser feito, Nery pensa que deve haver uma separao clara entre subsdios a equipamentos, subsdios produo e aos custos permanentes de manuteno de uma companhia. Exigncias que obrigam os grupos a montar peas a correr s para no serem penalizados monetariamente -- o que no saudvel para ningum, muito menos para o Gil Vicente. Renitente em divulgar quais sero os novos critrios -- embora tenha algumas ideias gerais --, Nery espera pelos resultados dos inquritos. Certo que ser um jri que decidir a atribuio dos subsdios e que dever incluir criadores no envolvidos no concurso, crticos e outros profissionais ligados ao sector. Vieira Nery julga que as decises devero ser tomadas por uma comisso mista que envolva o Estado e avaliadores exteriores. Mas no acredita no convite dos nomes pelos nomes. Uma fita carregadora com 20 cartuchos de bala real para espingarda G-3, 17 balas de salva, 27 invlucros, uma granada de mo defensiva M-63, uma granada ofensiva M-62, uma granada de fumo, uma munio para espingarda Mauser e uma munio de 20 milimetros, constituam o armamento abandonado sem qualquer tipo de sinalizao. O material, depois de inspeccionado pela GNR de Lamego, foi entregue no Centro de Instruo de Operaes Especiais. A comear em Lou Reed, passando por David Bowie e de Iggy Pop, h toda uma tradio de estrelas anglo-saxnicas em fase de crise existencial, que acabam por ir parar a Berlim, onde vm a gravar discos de um pessimismo to cerrado quanto brilhante. 
Apesar de o Muro ter cado, os U2 trataram de revitalizar esta tradio num disco de uma negritude sem par na sua discografia prvia que, se no constitui uma ruptura to radical com o passado quanto The Unforgettable Fire. No entanto, por esse mesmo estado de esprito que o anima, um disco diferente na carreira do quarteto irlands. As letras simplificam-se, para trazer flor da pele uma amargura infinita, enquanto os arranjos denotam um frenesim recalcado, a que a produo de Lanois, Eno e Lillywhite vai acentuando as nuances dramticas. Achtung Baby arte pop no seu znite, isto , beira do hospital psquico. Os adeptos da face mais ligeira da colaborao de Kurt Weil com Bertolt Brecht devem odiar esta reviso hard-core do seu reportrio, como as incurses dos Young Gods pela msica de circo e vaudeville no devem ter feito a felicidade dos adeptos da sua vertente mais industrial. um disco pouco indicado para amantes de ortodoxias, o que, de modo algum, o torna inconsistente, tratando-se afinal de uma vanguarda actual, que revisita luz dos seus prprios princpios outra vanguarda, que a antecedeu sob vrios aspectos. O resultado brutal e espectacular, baseado em verses radicais do reportrio mais famoso de Weil / Brecht, repletas de descargas de electricidade e avalanches de rudos urbanos, que da estrutura musical da famosa dupla no deixam mais que o esqueleto, mas lhe acentuam a sua carga teatral e inerente dramatismo, ao ponto do sufoco. O Sindicato dos Maquinistas dos Caminhos de Ferro Portugueses convocou mais uma greve s horas extraordinrias a partir de segunda-feira, informou ontem o porta-voz da CP. H algo de inacabado, amputado na nossa cultura, especifica Antnio Jos Saraiva, uma espcie de infncia para alm do seu termo. Foi isso que nos levou procura de outro pai alm-Pirenus, na CE. Uma segunda linha a que engloba os tecnocratas de optimismo standardizado e maniqueu, com a sua subcultura do sucesso, do consumismo, do top, da produtividades, da eficcia. No decurso da conferncia de imprensa, Carvalhas quis sublinhar a conjuntura em que est a ser preparado o Congresso. Decorre num quadro em que cresce o desencanto, a frustrao e o protesto dos trabalhadores e de amplos sectores da sociedade pelas consequncias de uma poltica que nos seus principais eixos no se afasta daqueles que constituram a matriz poltica que os portugueses quiseram ver derrotada. Carvalhas -- que continuar a assegurar a liderana do partido -- manifestou ainda a sua convico de que o Congresso ser um importante impulso para o reforo do PCP. A ideia de instalar na frente costeira da margem direita do Douro um equipamento virado para o mar, embora longe da dimenso que se pretende agora para o Centro de Cincia e Tecnologias do Mar, tem quase 20 anos e as diversas tentativas efectuadas at hoje nunca chegaram a bom termo, estivesse no Porto ou em Matosinhos o local escolhido. Tudo comeou com a ideia, em 1980, de substituir a j ento degradada Estao de Zoologia Martima Augusto Nobre, ou Aqurio da Foz, como mais conhecido, encerrado h mais de 30 anos. 1980 -- Por iniciativa de um bolseiro alemo, Michael Weber (hoje docente convidado do ICBAS), os rotrios do Porto ponderam duas ideias: reconstruir o velho aqurio da Foz, ou construir, de raiz, um novo aqurio, no Porto. Cmara do Porto, Universidade e Administrao Central aderem ao projecto. O primeiro local apontado so os terrenos do futuro parque da cidade. 
No entanto, o consumidor poderia optar pelo envio de um cheque sobre o estrangeiro passado pelo banco do comprador em moeda do pas onde se vai fazer o pagamento -- a forma mais barata --, ou passar um cheque no pas da compra em moeda local. Cinco dias depois Pedro Conceio deslocou-se cidade com os documentos indispensveis para avanar com o processo de matrcula (bilhete de identidade, passaporte e nmero de contribuinte). No dia seguinte, o carro podia ser levantado, tendo sido efectuado o pagamento de cerca de 1.600 contos em cheque sobre o estrangeiro. Com o carro, foi entregue uma guia de circulao vlida para 90 dias em territrio espanhol, o livrete, uma factura de compra e uma aplice respeitante a um seguro contra todos os riscos vlido por 30 dias (custo: 38 contos). Foi uma deciso muito ponderada, muito difcil e muito amadurecida, disse ao PBLICO Anabela Moutinho, que foi lacnica no que se refere aos motivos. No cargo apenas deste Outubro, ela assegurava, entre outras coisas, a ligao entre o IPACA e os espaos potenciadores de novos pblicos, como os festivais de cinema, nacionais e internacionais, as escolas e os cineclubes. Um projecto que ser concretizado por meio de protocolos, mas que Anabela Moutinho no teve tempo de concretizar por falta de tempo til. Esta uma poca de transio, muito importante e difcil para o sector, admitiu. O PBLICO tentou saber se havia divergncias de pontos de vista entre Anabela Moutinho e Costa Ramos, que ela no confirmou nem desmentiu. A minha deciso no foi pacfica, no seria honesto da minha parte diz-lo, posso dizer que foi pacfica e sem equvocos a maneira como o processo aconteceu, declarou. O autor receava, aquando da sua publicao, que este livro fosse incompreendido. Razes? Em particular, a linguagem simblica, certamente hermtica que foge ao estilo que mais tarde escolhi, justifica-se numa nota de contracapa. A ausncia da pretenso didctica, inequivocamente pautada em livros subsequentes, que narram igualmente a saga dos angolanos, como Mayombe ou As Aventuras de Ngunga, s aparente, porque esta componente, embora aqui se afigure numa estratgia enunciativa bastante esquiva, est flagrantemente presente. Quisera acabar com a ovalidade do mundo e conseguira. Mas o quebrar do sonho aliou-se impossibilidade de viver no mundo sem ovalidade. E ainda no havia mquinas que realizassem os sonhos individuais. S os de grupos. (pg. 158) Poder haver uma sugesto maior? Ela tentou olhar o lado esquerdo, mas uma montanha a separava. Ele fez o mesmo para o lado direito, mas a mesma montanha o impedia. Cada um contemplou o seu lado, reconhecendo-se, incapaz de transpor a montanha (pg. 11). -- Quando os corvos forem derrotados, no ser s aqui na montanha que o Sol ser azul. Por toda a parte ele dardejar rosas sem espinhos ... -- dizia ele. E ela sorria quela verdade desejada. -- Os meninos brincaro com o vento da madrugada, com ele fixando o capim terra ... -- E os morcegos comero mel e no excrementos ... -- concluiu ela (pg. 51). Este assunto, alis, foi tambm discutido na CCPM, que quinta-feira efectuou a sua 27 plenria, aps um interregno de 20 dias. O caso Quilengues, em que morreram trs turistas britnicos e um neo-zelands, foi um dos assuntos em discusso, dada a contradio entre as partes. A UNITA apresentou recentemente um suposto chefe do grupo que assassinou os turistas e que seria um oficial do ex-Ministrio da Segurana de Estado. As declaraes do prisioneiro no convenceram ningum e o Governo rebate que a zona de Quilengues de acantonamento de tropas da UNITA. 
Vrias pessoas citadas como tendo participado no acto desmentiram as verses da UNITA, e a Comisso Mista de Verificao e Fiscalizao (CMVF) encarregada de esclarecer o assunto no conseguiu apurar nada por dificuldades em chegar ao local. Duas organizaes francesas, a Associao Fora Operria Consumidores (AFOC, Altos Alpes) e a Unio Departamental das Associaes Familiares (UDAF), assinaram um acordo de cooperao com associaes de consumidores italianas que levou criao de uma estrutura franco-italiana denominada Consumidores sem Fronteiras. Esta organizao constituir, segundo os seus promotores, a base de uma nova agncia de informao europeia aos consumidores, a inaugurar no prximo ms de Junho. Os Consumidores sem Fronteiras propem-se, para j, envolver-se na resoluo dos litgios transfronteirios e organizar aces conjuntas de informao e esclarecimento aos consumidores dos dois pases. Onde a cidade do homem, no do lobo mas irmo? (...) Em 1975, nos Estados Unidos, a comisso trilateral divulgava um relatrio intitulado Crise da Democracia, no qual culpava pela contestao autoridade fundada na hierarquia, na competncia e no dinheiro ... intelectuais e grupos prximos, que estavam a tornar as democracias ocidentais, liberais, parlamentares, ingovernveis porque ameaadas por uma gravssima crise de autoridade, fomentada por intelectuais e jornalistas. E conclua que S se pode sair desta crise com o restabelecimento frreo do princpio da autoridade. De ento para c, o sistema tem vindo a domesticar esses sectores, seja atravs da seduo material seja pela atribuio de honrarias, pelo que tais sectores abandonaram a contestao s instituies. Paralelamente, o controlo sobre os rgos de informao de tal ordem apertada que s vem a pblico a verdade oficial, a do Big Brother. So trs os principais pontos litigiosos: a participao de organizaes opostas ao processo de paz, a possibilidade dos palestinianos de Jerusalm Oriental serem candidatos e eleitores, e ainda as competncias do Conselho Autnomo. O chefe dos negociadores israelitas, Yoel Singer, realara anteriormente progressos em relao a uma srie de questes, salientando que as negociaes continuaro, dentro de duas semanas, e pela primeira vez em Jeric. Na cimeira de hoje, em Erez, Arafat vai tentar convencer o primeiro-ministro israelita a fixar uma data para a retirada do Exrcito das cidades rabes da Cisjordnia e para a realizao de eleies, mas Rabin provavelmente hesitar uma vez mais, enquanto no obtiver garantias de segurana para os 130 mil colonos judeus. O lder da OLP pretende tambm a abertura dos territrios, fechados desde 22 de Janeiro, para que 50 mil palestinianos possam trabalhar em Israel, embora queira tambm que Rabin explicite a sua ideia de uma separao total dos dois povos. Arafat receia que a separao seja sinnimo de priso e no de independncia. A vida de Albino Luciani, que seria o papa Joo Paulo I, encerra dois mistrios: as circunstncias da sua morte e o teor da sua conversa de duas horas com a irm Lcia, vidente de Ftima. Depois desta conversa, Luciani nunca mais seria o mesmo, garante a sua famlia. por isso que, explica, no tem pena de Hillary Clinton. Eles [ Hillary e Bill Clinton ] podem ter alguma espcie de acordo e quem somos ns para dizer se bom ou mau? Acho que eles tm mais maturidade em relao ao sexo, como em relao s drogas, do que podem mostrar. Vivo num Estado de Ironia. Campos faz parte destes rgos, bem como Manuel Alegre que cedo se colocou ao lado do eurodeputado em nome de solidariedades antigas. 
A deciso de Guterres foi tomada depois de segunda-feira ter reunido o ncleo duro dos seus conselheiros polticos, tendo mantido outros contactos durante todo o dia. Esta precipitao dos acontecimentos vem de encontro a algumas das crticas feitas actuao da Comisso Permanente, criticada por ter deixado arrastar durante semanas o contencioso com o eurodeputado, em vez de desde logo, como muitos defenderam, chamar Campos para este prestar esclarecimentos e colocar uma pedra sobre o assunto. Na busca passada casa do suspeito, de 38 anos, que trabalha como marceneiro, foi encontrada a caadeira disparada trs dias antes contra a porta do clube, localizado naquele bairro, uma espingarda de presso de ar, diversos cartuchos e um panfleto de herona. Apesar de haver suspeitas de o indivduo estar relacionado com o trfico de drogas duras nas Galinheiras e tambm na zona de Odivelas, onde possui outra casa, nenhuma quantidade significativa foi encontrada. Elogiei a escolha do meu anfitrio, enquanto discorria, a despropsito, acerca do papel do consejero de cracion, que tinha talvez inventado aquela espcie de centro de mesa, com batatas acabadas de fritar (sim, batatas fritas), certamente em fatias espantosamente finas e em azeite a ferver, tratadas a modos de souffl. Iniciei-me com uma morcilla de Burgos, enrolada numa massa fofa, e dediquei-me com carinho a um naco de bonito, frio e escabechado. A conversa seguia animada, uma dose de marketing, 200 gramas de imagem, levadas ao lume com public affairs q.b. Gosto de levar a srio o meu papel de consultor encartado. Eduardo Loureno fechou, na sexta-feira, ao fim da tarde, os trabalhos da VI Reunio Internacional de Camonistas, que decorreu durante quatro dias na Universidade de Coimbra, com uma inspirada lio de encerramento em torno das ligaes entre Cames e Petrarca e da luz platnica que sobre ambos ter, ou no, incidido. Anbal Pinto de Castro, organizador do encontro, obrigado pela funo a proferir algumas palavras finais, confessou o quanto lhe parecia sacrlego falar aps a interveno do recm-galardoado com o Prmio Cames. de crer que estivesse a ser sincero. Eduardo Loureno tem, de facto, esse dom, to contraditrio com a sua proverbial modstia, de dar a tudo quanto escreve um tom definitivo. Provoca em quem o ouve a sensao de que aquilo que diz, o diz da forma mais justa, se no da nica forma justa. Um talento que, por norma, cabe apenas aos poetas. A France 3, uma das principais estaes pblicas da televiso francesa, abriu as hostilidades e declarou greve em nome do horrio laboral e da poltica salarial. A France 2 segui-lhe os passos e as emisses dos dois canais foram interrompidas. A direco j mostrou boa vontade, mas a greve prossegue em todas as delegaes do pas. Concebidos para funcionarem como instrumentos privilegiados dos gestores de sistemas de informao, estes Compaq Proliant viram reforados os seus dispositivos de segurana e preveno de quebras no sistema -- reforo esse numa perspectiva de centralizao fsica deste tipo de equipamentos, sustentadores de redes cada vez mais complexas resultantes das evolues de downsizing e rightsizing a que muitos sistemas de informao estiveram ou viro a estar sujeitos. Estes sistemas exigem um acompanhamento e controlo permanentes (as 24 horas de cada um dos dias da semana), pelo que a sua centralizao num nico espao permite racionalizar e conter os custos da sua gesto. 
Dos dispositivos de segurana e preveno faz, evidentemente, parte o Insight Manager, aplicao de gesto j conhecida das linhas anteriores de servidores mas agora na sua verso 2.3. A Orquestra Clssica do Porto abre hoje, em Guimares, a 4 edio dos Encontros da Primavera. O primeiro concerto dirigido por Piero Bellugi, com os solistas Pedro Corostola (violoncelo), Maria do Rosrio Ferreira e Palmira Troufa (sopranos). No Pao dos Duques de Bragana, s 21h45. Final do Festria -- IX Festival Internacional de Cinema de Tria, com a exibio s 16h30 do filme premiado com o Golfinho de Ouro. Um industrial do Porto, Manuel Magalhes, de 48 anos, abate a tiro dois scios, quatro disparos num, seis noutro, por terem passado cheques em seu nome. Um casal de Aveleda, povoao do Minho, acusa o padre da aldeia, Joaquim Carneiro, de ter relaes sexuais com um filho seu de 12 anos. O povo defende o sacerdote. As autoridades arquivam o processo por falta de provas. O arcebispado, porm, transfere o acusado para Paris. necessrio que a construo [ da barragem do Ca ] seja efectivamente suspensa e que sejam reunidos os meios humanos e necessrios a um estudo completo de toda a rea antes que se crie uma situao irreversvel. E no se invoquem os prejuzos imediatos de tal deciso, pois correspondero, certamente, a uma parcela diminuta dos encargos com essa obra faranica e desajustada que a Expo'98. So afirmaes do reitor do Universidade do Porto, Alberto Amaral, no editorial do n 25 do Boletim daquela Universidade, inteiramente dedicado ao tratamento da grave situao do patrimnio histrico-cultural do vale do rio Ca, como igualmente refere o autor. Comdia inglesa, anos 50. Um friso inenarrvel de personagens marginais e/ou marginalizadas, com o ps-guerra em pano de fundo. Pea nica e inimitvel no contexto do teatro ingls deste sculo, oscilando entre o naturalismo e o lirismo. Receita que funciona lindamente, graas a um elenco talentoso e bem dirigido e a uma direco plstica (Jos Carlos Barros) eficaz. TEATRO DA TRINDADE. De 3 a sb., s 21h30; sb. e dom., s 16h. Cumpridos 16 dias de uma greve de 45, a paralisao na Rdio Televiso da Guin-Bissau (RTGB) foi suspensa ontem at ao prximo dia 25, devido a uma cedncia dos trabalhadores que tm ouvido forte e feio da populao fora de Bissau que no tem acesso s emisses da RTPi -- especialmente por causa dos jogos da seleco portuguesa, acompanhados como se fosse a seleco guineense. E foi assim que ontem -- precisamente por causa do Portugal-Turquia -- a televiso guineense reabriu ... hora necessria para transmitir o Portugal-Turquia, o que motivou grandes ajuntamentos em locais pblicos que dispunham de televisores, tanto em Bissau como no interior do pas. A Lisnave no conseguiu atingir as metas fixadas pela sua administrao para o exerccio de 1994. No final do primeiro semestre do ano passado, a empresa previa obter um volume de vendas de 24 milhes de contos, mas acabou por no ir alm dos 21 milhes. No que se refere a resultados correntes, as estimativas apontavam para um valor negativo de 2,6 milhes de contos, mas que na realidade ultrapassou os seis milhes. Ao nvel do Grupo, isto , a Lisnave propriamente dita e as restantes 21 empresas em que os Estaleiros Navais de Lisboa participam no capital, os resultados saldaram-se por um prejuzo de 16,2 milhes de contos. Este valor reflecte um agravamento de 6,5 por cento face aos 15,2 milhes de contos de resultados negativos em 1993. As vendas consolidadas foram de 26,7 milhes de contos em 1994, menos 1,2 milhes de contos do que no ano anterior. 
Do ponto de vista concorrencial, o agravamento das perdas da empresa explica-se, em parte, devido entrada no mercado de novos estaleiros situados em pases de muito baixo custo de mo-de-obra (pases do ex-bloco de Leste), e a aumentos da capacidade de docagem em reas bem localizadas relativamente aos grandes fluxos de trfego martimo, alm de tambm disporem de mo-de-obra barata (pases do Mdio e Extremo Oriente). O PRESIDENTE etope, Mengistu Hail Mariam, anunciou ontem a constituio de um novo Governo, de cuja formao ficou responsvel o chefe da diplomacia, Tesfaye Dinka, agora promovido ao cargo de primeiro-ministro., revelou a rdio de Addis Abeba. A emissora anunciou igualmente que Mengistu demitira dois dos seus mais antigos ministros, o vice-Presidente Fiseha Desta e o secretrio do Comit Central, Legesse Asfaw. Tesfaye, um economista formado nos Estados Unidos, tem a misso de renovar o Governo e de alargar a sua base tribal, nos termos de uma deciso adoptada pelo parlamento no incio da semana. A PRXIMA ronda de negociaes sobre a paz em Moambique dever ter incio no dia 2 de Maio em Roma, confirmaram ontem o Ministrio italiano dos Negcios Estrangeiros e a comunidade religiosa de Sant'Egidio, mediador destas conversaes. Em comunicados separados, as duas entidades deram conta de que os contactos preliminares principiaram ontem em Roma. Segundo a agncia Lusa, a delegao da Renamoj se encontra na capital italiana, onde so esperados este fim-de-semana os negociadores de Maputo. Apanhado pela ronda do PBLICO quando chegava quela praia acompanhado pela mulher e pelo filho, na tarde de quinta-feira passada, o ministro dos Negcios Estrangeiros portugus escusou-se a ser fotografado, com o argumento de estar em frias, acelerando o passo (mais tarde, quando abandonava a praia, j depois das 20 horas, acabaria por ser apanhado pela objectiva). Conhecido pelos vizinhos banhistas, l foi acenando at se instalar mesmo junto gua. No longe de Guilherme Oliveira Martins, o assessor poltico de Mrio Soares, que deixar Belm em Setembro para assumir por inteiro a sua condio de futuro deputado pelo Partido Socialista. H muito frequentador do local (os pais possuem uma residncia em ...Boliqueime), Oliveira Martins tem visto os Tomates a transformarem-se na moda do Vero algarvio. o filho de Marcelo Rebelo de Sousa fazia anos e o ex-comissrio de Lisboa 94 promovia uma festa, em local mais ou menos secreto. Pela Praia dos Tomates, restava apenas um discreto centrista, Fausto Quadros, alm do porta-voz da Comisso Europeia, Joo Vale de Almeida. O australiano Michael Doohan (Honda) fez ontem a desfeita aos adeptos italianos, ao vencer o Grande Prmio de Itlia de motociclismo em 500cc, quarta prova do Mundial da modalidade, disputada no circuito de Mugello, onde os espectadores puxavam fervorosamente por Max Biaggi (Honda), que no foi alm do segundo posto. A Honda acabaria por fazer o pleno no pdio, com Alex Crivill a conseguir o terceiro posto. Nos arredores de Ponta Delgada uma residncia do sculo XVII, inteiramente restaurada. Quatro quartos, simples por 850000 e duplos por 980000; servem refeies quando solicitado e admitem animais. A seduo de uns olhos de longas pestanas sedosas tem agora em Cil Sublime, da Bourjois, um precioso auxiliar. Enriquecido com protenas e pr-vitamina B5, aplicado com uma escova especialmente criada para evitar o esborratamento do rmel. N. R. 
A razo por que se realou no ttulo da notcia em causa o benefcio do whisky foi porque ela se refere a um estudo que realava esse facto, o que se compreende quando se sabe que se trata de um estudo britnico. O PBLICO tem dedicado frequentes notcias aos benefcios do consumo moderado de lcool em geral e do vinho em particular. Veja-se, a ttulo de exemplo, os textos Corao, vinho e legumes, PBLICO do passado dia 16; lcool faz bem ao corao, 19.11.93; Glrias e misrias do lcool, 15.8.93; lcool protege corao?, 3.9.91; Vinho contra colesterol?, 12.8.91. O quadro de intenes levanta uma outra questo. Na prtica, privilegia-se as intervenes de fachada em detrimento do bem-estar dos habitantes das aldeias. A ilustr-lo o exemplo de Pido, onde as coberturas em telha das habitaes (mesmo estando em bom estado) esto a ser substitudas por outras em lousa, enquanto as casas que j tm coberturas em lousa (ainda que chova no interior) no beneficiam de qualquer ajuda. Pessoalmente causa-me alguma perplexidade que se trabalhe para a fotografia do visitante e no tanto para o bem estar de quem l habita, confessou Jos Reis. os restantes 25 por cento so participao nacional, sustentada pelos promotores dos projectos apresentados. Mas tambm aqui h aspectos pouco claros, j que esta disposio no seguida por todos. Quantos emigrantes muulmanos residem na Europa? 3 milhes residem em Frana, 1,7 milhes na Alemanha e 850 mil na Gr-Bretanha. Na Holanda, Blgica, Itlia e Espanha, os nmeros oscilam entre 250 mil e 300 mil muulmanos. A Jugoslvia tem 3 milhes, a Albnia 1,7 milhes e a Bulgria 800 mil. O sindicato dos maquinistas tambm no contabilizou os custos desta greve. No entanto, garante que os gastos com os autocarros fretados pela CP em Agosto para efectuar o transporte alternativo dos passageiros (principalmente emigrantes) dos comboios internacionais chegavam para resolver os seus problemas. Fora 4, uma placa de mrmore disposta sobre um soclo, onde efeitos de ondulao criam a iluso da passagem do vento sobre a superfcie do mar. Joo Cutileiro tem um peixe, que ironicamente se apropria de todas as formas de peixes / fontanrios de jardins, e que reenvia para a sua grande escultura de ninfas com um barco, mais longe, num dos lagos. Jos Pedro Croft fez uma enorme caixa oca de pedra, dentro da qual disps bolas do mesmo material. a nica pea que exige algum esforo ao visitante, que levado a debruar-se e espreitar para o interior da caixa. Poder assim tomar conscincia do contraste entre o volume sugerido e a massa das bolas, e associar a caixa a outras formas semelhantes, feitas no mesmo material, que j tenha visto: tmulos, sarcfagos ... Manuel Rosa e Rui Sanches, por fim, mostram-nos esculturas que tm por referente mais ou menos explcito o corpo. mas a coluna ela prpria uma metfora do corpo que sustenta, da fora que tudo suporta. e, no meio de uma estrutura de mrmore, colocou dois vasos com gua a diferentes nveis, que se foi sujando -- e enchendo de moedas -- com o passar dos dias. A certa altura, este corpo, que o corpo da escultura, foi mesmo quebrado, levando sua substituio. Esttua (porque um corpo), escultura, organismo que vive, ferido e morre? Todas as interpretaes so possveis, tudo pode acontecer a este corpo. BRASLIA Pesquisa Datafolha publicada hoje revela um dado supreendente: recusando uma postura radical, a esmagadora maioria (77%) dos eleitores quer o PT participando do Governo Fernando Henrique Cardoso. Tem sentido -- alis, muitssimo sentido. Muito mais do que nos tempos na ditadura, a solidez do PT est, agora, ameaada. 
Nem Lula nem o partido ainda encontraram um discurso para se diferenciar. Eles se dizem oposio, mas ainda no informaram o que vo combater. Muitas das prioridades do novo governo coincidem com as prioridades do PT. Desde o ltimo dia 13, Confisses de Adolescente pode ser vista pelos teens portugueses. A srie exibida aqui pela Cultura estreou na TVI de Portugal. Alm disso, a co-produo com o canal francs TCF1 para a realizao de mais 30 episdios continua sendo negociada. Cmera Manchete o nome do novo programa jornalstico que estria quarta-feira, s 22h30, na Rede Manchete. Sob o comando de Ronaldo Rosas, o programa mostrar reportagens especiais de Snia Pompeu. A direo do novo semanal ser assinada por Ewaldo Ruy. Os jogadores se dividem pelos dez quartos do alojamento, equipados com frigobar, ar condicionado, televiso e telefone. uma coisa do Primeiro Mundo, afirmou o levantador Maurcio (leia matria ao lado). Alm de Maurcio, Carlo e Paulo, a seleo deve contar hoje com Giovane. O atacante, que deveria ter se apresentado anteontem noite, pediu mais um dia de folga ao treinador. Na volta de uma viagem ao exterior, vale a pena trazer uma impressora matricial. Free shops dos aeroportos internacionais tambm vendem o equipamento. O modelo Lx 810, da Epson, vendido em Miami por US$ 178. O preo de lista nas revendas brasileiras de US$ 422. Esse equilbrio era tido como pr-condio para o sucesso do plano econmico. Parte dos recursos para a formao do FSE foi deslocado do oramento da sade e educao. Na poca, o ento ministro da Fazenda, Fernando Henrique Cardoso, fez um pronunciamento em cadeia nacional para anunciar a inteno do governo de destinar o FSE a investimentos sociais. O assessor de imprensa do Ministrio da Fazenda, Srgio Danese, disse ontem que o ministro da Fazenda, Rubens Ricpero no iria comentar o assunto porque no tinha informaes suficientes. O projeto original do governo destinava ao TSE R$ 334,9 milhes. Como no houve acordo entre governo e tribunal quanto ao volume de recursos, a dotao foi includa na reserva de contingncia sem especificao de despesa. Posteriormente, diante da ameaa do tribunal de entrar com uma ao judicial, o governo mandou ao Congresso uma alterao ao projeto, aumentando para R$ 452,7 milhes a dotao do TSE. Essas medidas reduziram a disponibilidade de dinheiro no sistema bancrio e os grandes bancos passaram a no fornecer recursos para as pequenas instituies. Quebraram aquelas que estavam descasadas, ou seja, que financiavam no mercado interbancrio de um dia (CDI) os emprstimos e carteiras de ttulos estaduais e municipais com prazos longos. Tel -- Claro. Aqui s joga quem est bem. Ningum fora sua escalao porque h quem escale o time no So Paulo. Ele s no jogava porque no estava bem. Folha -- E o que o senhor acha do escndalo da arbitragem carioca? Tel -- Uma vergonha. Apenas dois rbitros resolveram contar todos os podres, enquanto a federao tem mais de 70. O futebol precisa seguir o exemplo da CPI do oramento e apresentar todos os podres. Se eu dirigisse uma federao, apresentaria balanos mensais e liberaria minhas contas bancrias. A Fifa e a CBF deveriam entrar de sola nesse caso e em todas as outras federaes. A maioria das empresas que produzem leite das marcas interditadas no tinha sido comunicada ontem sobre a liberao do produto. O presidente da Cooper, Benedito Vieira Pereira, 49, afirmou que pretendia distribuir leite C nos postos de venda hoje. 
Prandi disse ainda que a empresa est elaborando normas factveis de serem executadas para a soluo ou minimizao dos problemas existentes no local. O Charade vai concorrer na faixa do Suzuki Swift e do Twingo, da Renault, afirma Caparelli. Herbert Berger, diretor-superintendente da empresa, diz que o Charade se aproxima do Honda Civic em tamanho e custa bem menos. O Applause, um sed quatro portas, com motor 1.6, o carro mais caro da Daihatsu. O top de linha custa US$ 30 mil. A mudana do local de jogo que deve acontecer tambm na partida contra o Corinthians, no prximo dia 17 foi determinada pela CBF, que no viu garantias de segurana no estdio santista. Na Vila, quando recebo a bola, tenho que ficar olhando sua trajetria, para no ser surpreendido. S depois que levanto a cabea para fazer um lanamento, reclama Neto. JFK -- A PERGUNTA QUE NO QUER CALAR Telecine, 20h30. O destaque do HBO o indito Exterminador do Futuro 2 -- O Julgamento Final, em que Schwarzenegger um rob com aparncia humana que vem do futuro para proteger um garoto. Para o terceiro ru, Alexandre Cardoso, 21, o Topeira, o juiz determinou uma pena de 20 anos. Souza tambm negou aos rus o direito de apelarem da setena em liberdade. Os trs esto presos desde 30 de julho de 93. Comeou bem antes do que se previa a batalha pela futura sucesso na Fifa apenas seis meses depois do super-acordo que, nas vsperas da Copa, reconduziu o brasileiro Joo Havelange ao sexto mandato consecutivo. Pelo acordo, os trs continentes mais obstinados em cortar o reinado de Havelange, a frica, a sia e a Europa, aceitaram cancelar os seus movimentos de oposio em troca, basicamente, de dois compromissos. Havelange aceitaria engolir o italiano Antonio Matarrese como o seu vice-executivo e, alm disso, esqueceria os seus modos autoritrios, coordenando a entidade de maneira colegiada. O Ambulim foi um dos centros que contriburam para um estudo apresentado na 5 Conferncia Internacional sobre Transtornos Alimentares, de 29 de abril a 1 de maio em Nova York. Dados sobre abuso sexual em bulmicas no Brasil, ustria e Estados Unidos foram centralizados por Harrison Pope, da Escola de Medicina de Harvard. Essa diviso gera algumas distores terrveis. Um dos injustiados Alfredo Volpi, que recebe apenas um painel, com cinco telas que servem para ilustrar sua evoluo de figurativo a abstrato. Afora historicismo, isso menosprezar um fator interno arte brasileira, que independe de contexto internacional. Volpi foi dos mais influentes pintores do pas para alm da questo da autonomia. O panorama sofre prejuzos demais em favor da tese. Abstratos entre medianos e medocres, como Fukushima, Prsio, Raimo e Douchez, tm o mesmo ou maior destaque que Volpi e nada que se possa chamar de autonomia para oferecer como lenitivo. Talvez isto seja muito barulho por nada. Original: Ccero. Disse que no conseguia vislumbrar artifcios fraudulentos ou prtica de peculato no protocolo assinado por Qurcia. Afirmou que o conjunto de fatos, em princpio, aponta o envolvimento de Qurcia. Recebeu a denncia. Segundo o mdico, o caso no preocupa. Romrio no se exercitou nas cobranas de falta e pnalti. Toda a comisso tcnica sabe que Romrio de treinar pouco, geralmente se poupando entre dois jogos difceis. Nem o PSB nem a coligao tm competncia legal para trocar o vice da chapa sem a concordncia de Bisol, que teve o nome aprovado em conveno. Outra maneira de um partido forar a substituio seria expulsar o candidato, com base em seu estatuto. Neste caso, o registro da candidatura seria cancelado pela Justia. 
Uma boa parte do pblico dos Stones hoje yuppie. uma tendncia, dizia o lojista Nivaldo Silva Costenaro, 34, cabeludo baterista de uma banda de blues. Escuto Stones desde os 13 anos de idade. O f daquela poca vai ser f sempre, acrescentou Costenaro. Apesar de limitar a venda de quatro ingressos por pessoa, a Mesbla no evitava ontem que uma mesma pessoa comprasse mais de uma vez. A queda nas vendas teve reflexos nas negociaes entre as confeces e as lojas. No est havendo cancelamento de pedidos. Mas existem indstrias que esto sendo procuradas pelos lojistas para postergar as entregas, conta Eduardo Costa, diretor da Abravest, que rene a indstria de vesturio. A seca que atingiu as reas produtoras de gros no deve causar grandes estragos na safra 1994/95. A primeira previso do IBGE (Fundao Instituto Brasileiro de Geografia e Estatstica) indica queda de 0,62% na rea plantada nesta safra em relao a anterior. Pocock, que publicou os resultados no ltimo nmero da revista British Medical Journal, do Reino Unido, estudou a influncia do chumbo desde a gestao. Segundo ele, a exposio ao material durante a gravidez ou nos dois primeiros anos de vida no representa perigo. OS PAPIS DE FLAVIO-SHIR -- A mostra, que faz parte das comemoraes dos 50 anos de pintura do artista japons, rene 25 obras em pequenos formatos. Foi utilizada tcnica mista, incluindo desenho, pastel, ceragrafia e fotografismo. De seg a sex das 11h s 19h, sb das 11h s 14h. At 30 de maro. Preo das obras: de US$ 2.000 a US$ 4.000. A aplicao dessas observaes ao caso americano e s relaes entre negros e brancos sugere uma nova maneira de conceituar os argumentos j conhecidos do legado da escravido. No se trata simplesmente da questo da escravido certamente ter tido efeitos duradouros sobre a cultura negra, nem mesmo dela ter exercido um amplo efeito negativo sobre a auto-confiana e auto-estima dos negros, mas mais especificamente da experincia da escravatura ter desvirtuado e tolhido a evoluo do algoritmo etnocntrico que os negros americanos teriam desenvolvido no decorrer normal dos acontecimentos. Os brancos fizeram tudo em seu poder para invalidar ou menosprezar cada sinal de talento, virtude ou superioridade entre os negros. Eles tiveram que fazer isso se os escravos fossem superiores em qualidades que os prprios brancos valorizavam, onde estaria a justificativa moral para mant-los escravizados? E, assim, tudo o que os afro-americanos faziam bem teve de ser colocado em termos que menosprezassem a qualidade em questo. Mesmo a simples tentativa de se documentar esse ponto deixa uma pessoa exposta a acusaes de condescendncia e, assim, os brancos de fato conseguiram cooptar os julgamentos de valor. ainda mais bvio que impossvel falar abertamente sobre o superioridade de muitos atletas negros sem ser sujeito a acusaes de que se estar sendo anti-negro de uma maneira enviesada. Pela segunda vez desde quando comeou a coordenar as aes no Rio, h duas semanas, o Exrcito mudou o nome das operaes. Agora, os oficiais envolvidos se referem sempre ao comando das aes como Centro de Coordenao de Operaes de Combate ao Crime Organizado (Ccocco). Cinco linhas paralelas, de mais de 400 km cada, foram descobertas por cientistas australianos no sul do pas. Elas esto separadas por espaos de 80 km a 100 km. As linhas, invisveis da superfcie, foram detectadas atravs de dados de satlites. Pesquisadores acham que as linhas podem ser falhas geolgicas. 
O derramamento de leo do petroleiro Exxon Vldez, em maro de 1989, causou estragos no valor de US$ 286,8 milhes, segundo um jri em Anchorage (Alaska). A ao est sendo movida por pescadores, lojistas, propietrios de terra e nativos. O valor mais que o dobro do estimado pela Exxon, mas menor que o original pedido, de US$ 895 milhes. Bombeiros e pessoal de resgate foram colocados em alerta mximo em Argel no final da tarde de ontem, segundo a rdio estatal. Veculos de resgate estavam a apenas 500 metros do Aibus 300. Optar entre um aparelho conjugado e outro simples outro ponto que merece ateno. Para fazer uma boa compra, a tcnica do Procon recomenda ao consumidor que verifique se j h dentro de casa os tradicionais equipamentos que desempenham as mesmas funes do conjugado, s que separadamente. Caso a opo seja pelo aparelho multiuso, o comprador deve checar se o produto tem assistncia tcnica, diz ela. Como a idia de enxugar a Constituio enfrenta resistncia inclusive nos partidos que apiam o governo, a equipe de FHC resolveu fazer as reformas por partes. Primeiro aprova-se o texto enxuto e depois negocia-se a aprovao, sem prazo definido, das leis complementares e ordinrias. O Pentgono usa a Internet, que conecta computadores a sistemas telefnicos, para que seus funcionrios troquem informaes. Os arquivos so protegidos por senhas e cdigos, que acabaram mostrando-se vulnerveis. O conselho fiscal entende que os acionistas no devem mais suportar investir dinheiro bom em companhia sem condies de se reerguer. Tem que torcer muito e rezar bastante, porque milagres acontecem. QUADRINHISTAS DO BRASIL inteiro se renem entre sexta-feira e domingo em Arax (MG). Haver oficinas e um concurso. Informaes pelo telefone (034) 661-2458. O MERCADO AMERICANO para desenhistas o tema da oficina com David Campitti (veja trabalho dele ao lado), roteirista de histrias do Super-Homem e criador de vrios personagens. Outros profissionais brasileiros, que atuam nos EUA, tambm participam. Informaes pelo (011) 263-4700. Ontem pela manh, os atletas que no atuaram contra o Vitria, participaram de um coletivo com a equipe principal. Os que jogaram, fariam treino de recuperao tarde no Projeto Acqua. Murici reclamou da postura do time do Vitria. Eles bateram muito. O que fizeram com o Pereira foi um absurdo, disse. Pereira levou um piso na cabea, mesmo estando cado no cho. Murici espera que o Inter no tenha a mesma atitude. A primeira misso lunar dos EUA em 21 anos teve incio ontem, quando um foguete Tit 2 colocou no espao a astronave no tripulada Clementine 1, que vai passar dois meses em duas rbitas da Lua para realizar completo mapeamento mineralgico e topogrfico do satlite da Terra. O nome oficial do projeto Depse 1 (Deep Space Program Science Experiment). Ele patrocinado pela Organizao de Defesa de Msseis Balsticos e pela Nasa, em uma das primeiras operaes espacais com fins civis e militares. O lanamento ocorreu ontem na base area de Vanderberg, na Califrnia, costa oeste do pas, s 8h30 locais (14h30 em Braslia). O custo total do projeto de U$ 55 milhes. O atual perfil dos poupadores ajudar a manter o dinheiro aplicado. Segundo o BC, mais da metade dos recursos da caderneta so de poupadores de mdio e grande porte em tese, menos sujeitos a achar que o dinheiro perdeu rendimento com a queda da inflao. A previso de saques reduzidos sobre a poupana compartilhada por especialistas. 
O ceticismo em relao a planos econmicos faz a populao manter reservas, diz o ex-diretor de Poltica Monetria do BC, Lus Eduardo de Assis, diretor de investimentos do Citibank. s vezes peco por falta de experincia, mas me considero um piloto rpido, afirmou ontem o brasileiro, em entrevista Folha por telefone, da Itlia. Diniz comeou sua carreira automobilstica em 1989, no Brasileiro de Frmula Ford, campeonato em que obteve a sexta posio na classificao final. A Polcia Civil de Ourinhos (371 km a oeste de So Paulo) prendeu ontem tarde o ex-lder religioso Jonas Rbio, 45, acusado de matar na quarta-feira a estudante Claudirene Contijo, 13, com um tiro de espingarda. O delegado Celso Antonio Borlina, 38, disse que Rbio confessou o crime. Rbio era acusado por Claudirene de t-la estuprado no ano passado, poca em que era o lder da Assemblia de Deus na usina So Luiz, onde a estudante morava. A me da estudante prestou queixa do estupro e, depois, das ameaas de morte que Rbio teria feito a Claudirene, disse Borlina. Zanini diz que est preparando a fundao de um novo partido, que trar explcito em seu programa a ressalva de que contra o racismo. Ser um movimento nacionalista independente. O novo manifesto do movimento no ficar pronto antes das eleies. Sendo assim, no podemos dar nosso apoio aberto, explicou. Ele afirma ter negros e judeus entre seus correligionrios. Segundo a empresa, ser o primeiro alimento produzido totalmente atravs de tcnicas de engenharia gentica a ser vendido. O tomate foi projetado para manter o sabor que tem depois de colhido durante tempo prolongado na prateleira dos supermercados. Pases produtores e consumidores de caf entraram em um tipo de pr-pacto internacional que no intervenha nos preos do produto, afirmou o porta-voz da Organizao Internacional do Caf (OIC). Temos as bases para um novo acordo. Pode-se agora recomendar um texto definitivo para o conselho da organizao, disse o porta-voz. Os pases integrantes do instituto enviam o texto com o novo acordo para seus governos antes da aprovao final, que deve ocorrer na semana que vem. Passarinho j descarta claramente a candidatura ao Planalto. Meu compromisso com o Par inarredvel, afirmou ontem depois de conversar com Amin. O senador catarinense ainda hesita. Mas faz questo de lembrar que, aos 46 anos, ainda ter vrias outras oportunidades para disputar a presidncia. A maior facilidade criada a supresso das barreiras alfandegrias para a importao de produtos de um pas pelo outro. Isso vai facilitar as atividades de franqueadores que precisam importar produtos para distribui-los a seus franqueados que operam em outro deles. Por exemplo: o master-franqueado da Arby's no Mxico enfrentar menos problemas e menos burocracia para importar o rosbife (que, ao contrrio do que ocorre com a Arby's Brasil, vem dos EUA) e as batatinhas (canadenses) que serve em seus restaurantes prprios e franqueados. O maior impacto do Nafta a curto prazo ser sobre as franquias do setor automotivo. Pelo acordo, so reduzidas de imediato todas as barreiras tributrias e alfandegrias relacionadas importao por qualquer pas do acordo, de veculos automotores, autopeas e acessrios fabricados nos demais pases membros. Roberto Capuano, presidente do Creci-SP (Conselho Regional de Corretores de Imveis), diz que o mercado de usados sofre com a carncia de financiamentos desde 70. Em 79 houve uma leve reabertura de crdito, mas nada significativo, afirma. Segundo ele, hoje as concesses de financiamento bancrio so restritas e elitistas. 
A Benetton volta a provocar polmica com um cartaz da srie United Collors. Desta vez, com uma camiseta branca furada a bala e empapada de sangue acima de calas militares. A empresa informou que a roupa era de Marinko Gagro, soldado croata-bsnio morto em julho do ano passado. L'Osservatore Romano, o jornal do Vaticano, qualificou a campanha de US$ 15 milhes como terrorismo de imagem. Os franceses Le Monde e Le Figaro e o alemo Frankfurter Allgemeine recusaram o anncio. Ilmar Galvo e Moreira Alves votaram pela concesso do mandato de segurana. Eles sustentaram que a renncia interrompeu o processo de impeachment. Moreira Alves tambm votou a favor de Collor em outros dois mandatos de segurana, mas foi vencido nos dois julgamentos. Aquele pensamento provocou-me um arrepio estranho e delicioso. No falei nada, mas visualizei a Sra. Oke, sentada no salo amarelo, o mesmo salo onde nenhum Oke de Okehurst exceto ela ousava permanecer sozinho, envergando o vestido de sua antepassada e confrontando-se, por assim dizer, com aquela coisa vaga, plangente, que parecia permear o aposento ... aquela vaga presena -- assim me parecia do galante poeta assassinado. O filme Barfly (traduzido no Brasil como Condenados pelo Vcio) foi inspirado na vida de Bukowski. O escritor, identificado com a cultura beatnik, admitia somente ter sido inspirado por um outro autor da Califrnia, John Fante (Pergunte ao P, entre outros), que Bukowski descobriu nas suas leituras em bibliotecas pblicas quando jovem. Michael Schumacher culpou um degrau existente nas zebras do circuito de Adelaide por seu acidente ontem no final da primeira sesso de treinos oficiais. Quando analisei a pista na quinta-feira, pedi FIA que melhorasse as condies daquele ponto, disse o alemo. Na sua opinio, o Estado no pode continuar em setores onde a iniciativa privada j comprovou sua eficincia. Em entrevista Folha, Angarita elogiou a iniciativa do Sindicato dos Eletricitrios de So Paulo, que articulou uma ao judicial para tentar suspender a venda de aes da Eletropaulo. Cutler admitiu que as conversas entre funcionrios da Casa Branca e do Tesouro sobre investigaes sobre o banco de McDougal no eram recomendveis, mas no constituram crime. Essas conversas levaram em maro demisso de Bernard Nussbaum, amigo da primeira-dama, do cargo de assessor jurdico da Casa Branca. Fiske comeou seu trabalho em 20 de janeiro. Comisso Parlamentar de Inqurito sobre o caso Whitewater comea a funcionar dia 29. O relatrio no diz se as reunies entre funcionrios da Casa Branca e do Tesouro violaram a tica. O secretrio do Tesouro, Lloyd Bentsen, disse que a questo tica vai ser examinada agora. S o procurou tempos depois, quando o secretrio de Assuntos Especiais, Eliezer Baptista, resolveu preparar o macroplanejamento do pas uma idia esplendorosa que, infelizmente, no resistiu ao impeachment de Collor e convocou Mouro para sua equipe. Com medo de perder poderes, Modiano chamou Mouro e, numa atitude mesquinha, comunicou-lhe que, como o financiamento do projeto seria bancado pelo BNDES, ele iria para l, mas na qualidade de representante do banco. A tela de abertura do programa simptica. Uma sala tem como cones uma pilha de discos, um aparelho de som, uma enciclopdia e uma janela que mostra a encruzilhada onde Robert Johnson teria feito um pacto com o diabo, trocando a alma pelo sucesso. Peppermint Tea House -- The Best of Shoukichi Kina. 
Lanado dentro do projeto Asia Classics, um subselo do Luaka Bop (o mesmo que lanou Tom Z no exterior), essa coletnea mais um passo de Byrne na sua busca do sublime na msica. No primeiro volume de Esses Byrne mostrou soberbas trilhas sonoras de filmes indianos. Com Kina, ele chegou bem perto da perfeio. Em todo o shopping, que est decorado com lanternas, leques e at um portal japons, h exposies de iquebana (arranjos florais), bonecas, cermicas, armaduras, mscaras, quimonos e espadas. Em outro espao, h exibio de vdeos que demonstram, por exemplo, a cerimnia do ch e o teatro kabuki, alm de pontos tursticos do Japo. Prximo praa da alimentao, um sushiman estar preparando pratos tpicos, como sashimis e o sushis, que estaro venda. Para isso, ele afirma pretender inaugurar uma cmara setorial para discutir as questes especficas da economia no Estado. Para atender poltica desejada pelo governador eleito, Barelli vai comandar uma reviso da antiga pasta (Secretaria das Relaes do Trabalho). O economista diz que, para gerao de empregos, vrias propostas tero que ser analisadas. Segundo ele, nem os grandes pases souberam resolver o problema do desemprego. Segundo ele, italianos e belgas j descobriram que comprar imveis em Natal um timo negcio. Um diplomata da Blgica comprou apartamento para passar as frias de julho e janeiro, mas fora desse perodo ele aluga o imvel por US$ 1.000, diz Bezerra. Apesar de morarem aqui do lado em Miami, Teresa e Emerson Fittipaldi no do nem tchuns para a Copa. Pegam os dois pimpolhos Joana e Luca e se mandam para Lake Powell, Arizona. Para tanto imprescindvel priorizar algumas aes como a recuperao de unidades armazenadoras de cereais e das rodovias, alm do incentivo utilizao das ferrovias. Tambm necessrio alertar toda a sociedade para a importncia de se reduzir as perdas de alimentos. Aos horticultores preciso oferecer informaes de mercado seguras e atuais para a programao da produo e desenvolver novas alternativas para o acondicionamento e embalagem dos produtos. Helmet brutal. metal punk. Tem tamanho, peso intrnseco que dispensa volume alto. A banda foi cortejada por vrios selos multinacionais, que viam nela um possvel novo Nirvana. Acabou assinando com o Interscope, ligado Warner, por US$ 1 milho por trs discos. Como notou o jornalista Jim Greer, da revista americana Spin, uma boa grana, mas nada muito absurdo. Meantime, disco de estria da banda no Interscope, vendeu 500 mil cpias. As duas guitarras do Helmet atacam sempre juntas, em pequenas clulas separadas por um silncio nunca menos que perturbador. Algo como um samba de breque transposto para o domnio eletrificado do rock. As letras irnicas, eruditas, denunciando que os msicos talvez tenham estudado um pouco demais so gritadas. No h nenhuma nfase na melodia. O Helmet uma grande seo rtmica, definiu o baixista da banda, Henry Bogdan, em uma entrevista Spin. Mtodo japons exige calhamaos de exerccios; opositores dizem que 'tcnica peca pelo mecanicismo'. A matemtica ainda o bicho de sete cabeas para grande parte dos estudantes do 1. e 2. graus. Na semana passada, a Secretaria Nacional de Direito Econmico baixou uma portaria, que aplica uma multa de 250 Ufirs (R$ 170) aos seis colgios por cada matrcula recusada. A Justia foi aplicada porque o Estatuto da Criana e do Adolescente garante o direito de escolas a todas as crianas, disse Mrcia Mendes, presidente da Associaos dos Pais de Alunos do Pitgoras, um dos seis colgios acusados de recusar as matrculas. O mesmo no se pode dizer da pintura de Cy Twombly. 
Da segunda gerao de expressionistas abstratos, Twombly tambm do segundo time. um grande pintor, e sua ltima fase o que mais sustenta a afirmao; mas irregular, tem fracassos evidentes. Ainda que Robert Hughes, da revista Time, ache que o tema de Twombly mesmo o fracasso, um fracasso sempre um fracasso. Os brasileiros escolhem Cardoso foi o ttulo na primeira pgina. O jornal compara a eleio presidencial brasileira francesa, afirmando que o premi Edouard Balladur, ex-favorito, tambm pode perder, em abril de 95. A explicao da reviravolta no reside em uma pretensa especificidade brasileira, que seria um coquetel de leviandade, inconstncia e pusilanimidade, tudo ao som de salsa, ironiza o editorial. Os empresrios abriram mo de posies histricas, eventualmente visando sua proteo, para construir e defender idias exclusivamente de interesse coletivo para o desenvolvimento global. Folha -- Quais so as implicaes que o encerramento da reviso traz para a economia? Resposta -- J pedimos a criao de um conselho para produzir essa definio. medida que o grupo de servios que entram nos lares e nas empresas evolui, a definio de servio universal vai evoluir tambm. Deve haver, no mnimo, uma linha digital chegando a cada casa, mas acho que servio universal deveria significar muito mais do que isso, mais do que o fio. Amato -- O cenrio ideal para o Plano Real seria a vitria inequvoca de um dos candidatos j no 1 turno das eleies. Folha -- Quem vence? Eu tenho uma origem poltica. Meu av materno foi prefeito de Limeira, meu pai foi duas vezes prefeito de Itapira, onde passei toda minha infncia. Desde cedo fui orador da turma, sempre gostei de declamar. Da a alguns anos, os Gershwins foram para Hollywood, a fim de fazer o que intimamente desprezavam: escrever canes para filmes. Mas, mesmo que quisessem no conseguiriam fazer nada menor. Num nico ano, 1937, eles compuseram They Can't Take That Away From Me, Let's Call the Whole Thing Off, A Foggy Day, Nice Work if You Can Get it, They All Laughed, Love Walked In' e Love Is Here to Stay', e essas so apenas as que ficaram universalmente conhecidas. Infelizmente, 1937 seria tambm o ltimo ano de George, porque ele morreria em julho, de um tumor cerebral, aos 38 anos. A desgraa que, cientificamente, como demonstra a moderna criminologia, quase nada dessa poltica criminal simblica serve para atenuar o gravssimo problema da criminalidade. A interveno militar localizada, que pode imediatamente oferecer algum alvio, tem como efeito principal o seguinte: o crime s muda de lugar. Com a interveno no Rio, que se cuide o resto do pas! Quarenta por cento dos gerentes japoneses esto insatisfeitos com os salrios recebidos, segundo pesquisa divulgada por um instituto privado em Tquio. Cerca de 72 % dos empresrios da construo querem que o prprio setor negocie a converso dos contratos para a URV, enquanto 28 % desejam que o governo estabelea as regras. Folha Qual a sada? Broz No tenho a receita exata, tenho a certeza de que algo precisa ser feito diante do confronto entre nacionalismo albans e srvio. O candidato deu rpida entrevista no saguo do aeroporto e dirigiu-se ao Hotel Delmonicos, na Park Avenue, em Manhattan, onde a diria de US$ 195 (CR$ 253 mil) no fim de semana e US$ 250 (CR$ 325 mil) de segunda a sexta-feira. Lula e outros 20 convidados almoaram a convite do cnsul-geral do Brasil em NY, Marco Cesar Meira Naslausky, no restaurante Peter Luger Steak House, tradicional no bairro do Brooklin. A conta, paga pelo embaixador, ultrapassou US$ 1,4 mil (CR$ 1,8 milho). 
A lição parece clara em ambos os casos: não será um artifício legal nem a boa vontade dos governos que protegerão a riqueza dos poupadores ou o poder de compra dos assalariados, mas sim um ambiente de estabilidade econômica consistente. Pelas razões alinhadas, que não refletem apenas conjeturas teóricas, mas a experiência duramente vivida pela nação nas últimas duas décadas, se impõe dar um basta definitivo a todos os remanescentes de indexação formal, sejam eles nas taxas de juros nominais (TR), no recolhimento dos tributos (Ufir e correlatos) e nos salários (IPC-r). O castelo de cartas ruiu logo. Há uma esquerda reformista no PSDB. Mas ela parece ser mera exceção. Agora, o partido retirou a máscara. Mostrou que a maioria dos seus políticos profissionais cuja respeitabilidade não pode ser posta em dúvida oscilava entre o centro nacionalista e os radicais do PMDB. Considerados globalmente, não faziam diferença. Eram o retrato do coração e do cérebro deste partido. Nada mais! O tancredismo concedeu ao seu principal teórico, professor e senador Fernando Henrique Cardoso, a oportunidade de esboçar a filosofia política do mudancismo. Na prática, os que iriam constituir o cerne e a base do partido iam mais longe: condenavam o autoritarismo, o oportunismo, o clientelismo e o fisiologismo. Ostentavam um patamar aparentemente sólido, que evidenciava a radicalidade dos estratos médios e dos intelectuais envolvidos na rejeição do status quo. Na largada, Ayrton Senna, que havia feito a pole position, manteve a primeira colocação. Por causa de um acidente envolvendo o finlandês J.J. Letho, da Benetton, e o português Pedro Lamy, da Lotus, o safety car entrou na pista. Durante seis voltas, os pilotos tiveram que manter suas posições por razões de segurança e aproveitavam para aquecer os pneus enquanto os carros envolvidos no acidente eram retirados. O caso Paubrasil já rendeu à Receita Federal, segundo o coordenador-geral de fiscalização do órgão, Luiz Henrique Barros de Arruda, cerca de US$ 1,5 milhão em crédito tributário -- a soma do imposto devido, multas, correção monetária e juros de mora -- de cerca de 20 empresas envolvidas no escândalo que procuraram a Receita espontaneamente. Essas empresas, diz Arruda, reconheceram ter emitido notas frias, sem o recebimento de nenhum serviço da empresa do pianista João Carlos Martins e de seu sócio Ettore Gagliardi. Os jogadores do Palmeiras treinam às 9h, no Centro de Treinamento. O treino deve ser só recreativo. Depois do treino, os jogadores voltam ao Lord Hotel, onde estão concentrados desde a noite de ontem. Interessado em organizar um livro de arte sobre viajantes, Ronaldo Graça Couto, da produtora Metavídeo/Metalivros, tomou conhecimento da pesquisa e procurou Belluzzo. Couto procurou a Odebrecht. Entre 40 outras idéias, Brasil dos Viajantes foi selecionado para comemorar os 75 anos de atividade do grupo empresarial. Como naquela época ainda não existia a fotografia, eram os pintores que faziam os retratos das famílias. Rembrandt era o preferido das pessoas mais importantes. Ele era tão bom artista, que muitos pintores quiseram aprender a pintar com ele. O livro Apresentando Rembrandt -- Pintor, Gravador, Professor (editora Klick), de Alexander Sturgis, conta muitas outras coisas sobre a vida e as obras do pintor. O governo do Estado realizará um concurso público para preencher 200 vagas de agente fiscal. As inscrições começam no próximo dia 12, nas agências dos Correios. A taxa é de CR$ 4.300,00. Cerca de 20 mil pessoas fizeram cursos profissionalizantes no Senac (Serviço Nacional de Aprendizagem Comercial) no ano passado, na Paraíba. 
Segundo o diretor regional da entidade, Glauco Pereira Chaves, a crise no mercado de trabalho estaria dificultando o aproveitamento dos formandos. Tudo indica que os telejornais da Globo vão ganhar novo repórter especial. Antes de estrear como editor senior da Playboy, Nirlando Beirão fez uma série de testes de vídeo. Cientistas dos EUA confirmaram ontem em Quito (Equador) a contaminação causada por atividades petrolíferas na Amazônia equatoriana. Segundo eles, casos de doenças de pele em índios estariam relacionados à contaminação. O Centro para Direitos Econômicos e Sociais e médicos da Universidade Harvard informaram que amostras de água têm níveis entre 10 e 1.000 vezes maiores que os recomendados pela Agência de Proteção Ambiental dos EUA. Para tentar resolver o problema do tráfico de drogas e da violência no Rio de Janeiro, o governador do Rio e o Presidente da República chamaram o Exército. Agora, os soldados e a polícia estão trabalhando juntos para prender os traficantes. Eles estão fazendo uma guerra contra o tráfico. Só quando há problemas muito graves, como o do Rio, o Exército é chamado para ajudar. Quando José Sarney começou a escrever semanalmente para a Folha, chegaram protestos à redação. Como aceitar que, depois de uma passagem tão ruim pela Presidência, Sarney dispusesse de espaço privilegiado no jornal? O tom familiar, coloquial, benigno de suas crônicas (agora reunidas em Sexta-feira, Folha, ed. Siciliano) foi pouco a pouco vencendo as resistências do público. Como cronista, Sarney toma o cuidado de não indignar ninguém, e é difícil antipatizar com ele. Como a separação das marcas é tida como uma tendência irreversível, caberia à Volkswagen construir sua própria fábrica. A TV-A atribui ao Plano Real a conquista de 17 mil assinantes em outubro, contra a média de 10 mil assinaturas/mês no início do ano. Segundo o ministro, os juros desses empréstimos serão compatíveis com os praticados no mercado internacional (8 % a 12 % ao ano). Ciro não quis confirmar se a TR (Taxa Referencial) deixará de corrigir esses financiamentos. Ciro minimizou a estimativa de inflação de 3% em outubro, de acordo com a Fipe. A própria Fipe prevê que em novembro isso tudo passa e a inflação cai, disse. Na segunda seguinte, o dia 4 ao dia 8, os bancos ficarão abertos até uma hora após o expediente normal, apenas dez delegacias regionais do Banco Central e em agências do Banco do Brasil encarregadas de entregar o real aos demais bancos. A distribuição acontecerá até o final do mês, partindo das capitais para o interior do país. Cada banco buscará no BC ou no BB o volume de dinheiro compatível com suas operações. Apenas 5 milhões de contribuintes haviam entregue sua declaração de Imposto de Renda até ontem pela manhã. A estimativa da Receita Federal era receber 7 milhões de declarações. O número final do total de declarações entregues só deve ser divulgado na próxima semana. Os 12 computadores da Future Kids oferecem jogos educativos que treinam o raciocínio lógico. No estande da Tec Toy há três consoles Mega Drive, além de um Master System e quinze cartuchos de jogos. As crianças podem ficar 15 minutos em cada um dos jogos. Caso não haja fila, o período de uso pode ser maior. A partir do dia 17, a Playtronic também terá um estande no Iguatemi. Quatro meses depois do atentado, o governo argentino ainda não conseguiu avançar nas investigações sobre o atentado. No início do mês, o presidente Carlos Menem criou um fundo especial de US$ 1 milhão para recompensar quem fornecesse informações que ajudem nas investigações. 
Quando os britnicos retomaram o Egito em 1801 a pedra foi parar no Museu Britnico, mas os franceses fizeram cpias dela. O resultado foi que um francs, Jean-Franois Champollion, que s visitou o Egito em 1828, foi o primeiro a decifrar os hieroglifos. Uma carta de Champollion de 1822 Academia de Inscries e Belas Artes, mostrando seus resultados, o nascimento da egiptologia, segundo Vercoutter. Hoje, a praia ganhou nova infra-estrutura com bares, quiosques, sistema de iluminao, banheiros e quadras de esportes. Os jovens tm se encontrado em Atalaia, principalmente entre quarta-feira e domingo, quando so realizados jogos de voleibol. A verdade simples. Lula cresceu e prosperou enquanto tudo dava errado no pas. Fcil subir nos palanques, defendendo que necessrio mudar tudo o que est a. Seu sucesso estava, essencialmente, sustentado no fracasso. No clima de sinistrose, as fragilidades do PT no ficaram to visveis opinio pblica: a dificuldade de se fazerem alianas capazes de sustentar Lula quando virasse presidente; os aloprados que conquistaram posies estratgicas no partido; a falta de generalizada convico sobre o valor da democracia. No final da reunio, o premi italiano, Silvio Berlusconi, leu um documento de 45 pargrafos, pelo qual 140 pases consideram que o crime organizado o maior inimigo das democracias. Segundo nmeros da ONU, os cartis do crime como a Mfia, a Yakuza japonesa e os cartis de Cali e Medelln, na Colmbia, faturam anualmente US$ 750 bilhes em atividades ilcitas. Os contratos de gaveta, vistos pela CPI como mais um indcio de que as empreiteiras atuavam em cartel, so compromissos sigilosos entre as empresas. O sigilo uma das exigncias contratuais. Alm do contrato com a Servaz, h outros do mesmo tipo, envolvendo mais empreiteiras. Os documentos foram encontrados em papel ou retirados de disquetes apreendidos na casa de Ailton Reis, diretor da Odebrecht. Com a Servaz, segundo o documento obtido pela Folha a Odebrecht acerta o pagamento de US$ 110 mil mais uma porcentagem referente a obras de abastecimento de gua em Roraima, ainda a serem licitadas. Alm disso, tem que se alterar o aparato jurdico para favorecer aes de negociao. Todo esforo deve ser no sentido de transformar o local do trabalho em primeira instncia do processo trabalhista, com o poder de compor conflitos. S assim se evitar a estratificao da legislao, permitindo que o sistema acompanhe de maneira dinmica as transformaes na economia. A Justia do Trabalho passaria a tratar apenas dos conflitos individuais, que no sejam resolvidos no prprio ambiente de trabalho. H a necessidade de uma representao permanente do trabalhador que converse com o empregador e signifique garantia de que no haver perseguio aos negociadores, diz Siqueira. Foi enterrado ontem de manh no cemitrio Chora Menino, em Santana (zona norte de So Paulo), o ex-chefe do Departamento de Fotografia da Folha Waldemar Cordeiro, 63. Cordeiro morreu devido a complicaes decorrentes de um enfisema pulmonar (diminuio da rea disponvel para troca de ar nos pulmes). A patinadora Tonya Harding, banida das competies por ter planejado atentado contra a rival Nancy Kerrigan, ser o destaque de setembro da revista Penthouse. Segundo o jornal USA Today, a revista publicar fotos nas quais Harding aparece mantendo relaes sexuais com seu ex-marido, Jeff Gillooly. A legislao prev multa de 300% sobre o valor da venda para quem se recusar a emitir a nota. Depois, haver campanha de fiscalizao e aplicao de multas para quem for pego em flagrante. 
O objetivo da campanha aumentar a arrecadao dos impostos. gratificante exibir opinies progressistas, dessas que fazem chover cartas de apoio no Painel do Leitor. A verdade, porm, no d cartas nem votos. A verdade ofende somos uns boais. O fato que o caos sangrento de nossas ruas o resultado conjunto de nossas prprias aes, embora ningum individualmente tenha a inteno de produz-lo ou sequer tenha poder para isso. A responsabilidade a um s tempo de todos e de ningum. Ns, motoristas, pais e jovens brasileiros, somos os piores inimigos de ns mesmos. um equvoco reduzir a liberalizao comercial ao neoliberalismo. Afinal, a abertura comeou na economia brasileira movida por consideraes pragmticas e objetivas. Simplesmente tornara-se impossvel, por razes tecnolgicas, comerciais e financeiras, estimular o desenvolvimento com base na substituio de importaes. Tal poltica convertera-se, com o avano da globalizao econmica, em sinnimo de proteo ao atraso, ao desperdcio e ineficincia. O governo agora apenas antecipou uma rodada de reduo de tarifas prevista para janeiro de 95. E, com essa medida, pode matar dois coelhos. De um lado refora a estabilizao, pois qualquer eventual aumento irresponsvel de preos depara-se agora mais rapidamente com a concorrncia de similares importados; de outro, ajuda a aliviar a supervalorizao do real, pois quanto mais importaes houver, maior ser a procura por dlares. Fazer avanar uma reforma estrutural (como a abertura da economia) e, ao mesmo tempo, calibrar um processo de estabilizao no porm uma tarefa fcil. Resta portanto saber se as medidas anunciadas, em tese adequadas, tero na prtica o efeito que se espera. Com o dlar comercial j a R$ 0,85 e o governo dizendo que no h limite para baixo, a necessria abertura pode involuntariamente desaguar num arriscado escancaramento. O livro de Marilena Ansaldi tambm oferece detalhes sobre uma das mais bem-sucedidas experincias culturais de So Paulo: o Teatro da Dana, que funcionou a partir de 1975 na Sala Gil Vicente do Teatro Ruth Escobar. Esse palco, alm de revelar inmeros talentos que depois se refugiaram na Europa, como Sonia Motta, serviu para Marilena inaugurar a fase mais frtil de sua carreira. BRIZOLA O 3 NO RIO, com 13 % . Estava vivo o sentimento tnico e nativista. Seu instrumento de ser ouvido era a fora da violncia. Em dois pases esses ndios ainda no esqueceram o trauma da conquista: Cortz, no prprio Mxico, e Pizarro, no Peru. o delrio que alimenta o Sendero Luminoso. a vigorosa histria mexicana que surpreende em Chiapas. O acordo no se tornou pblico, mas, agora, se sabe que foram feitas muitas concesses polticas. Nas grandes cidades, multides imensas saram s ruas em apoio aos rebeldes. A rebeldia no a revoluo, como bem acentua Octavio Paz. A rebeldia um ato pessoal; a revoluo uma manifestao coletiva. Mesmo asim, Van Himst pretende surpreender a defesa saudita, colocando dois atacantes velozes. Alm do zagueiro Grun e do centroavante Weber, o lateral-esquerdo Vital Borkelmans encontra-se tambm com um carto amarelo e pode ficar de fora hoje. Eles se intitulavam filiais da MIT Dealing Management Inc., dos Estados Unidos. A empresa norte-americana informou PF que no tem filiais ou representantes no Brasil. A MIT brasileira no aplicava o dinheiro. O golpe foi denunciado por funcionrios da empresa. A ao da PF comeou s 16h30 de anteontem e s terminou ontem s 15h30. A MIT operava no mercado desde junho passado. A PF ainda no levantou os nmeros exatos do golpe, mas acredita que pelo menos cem pessoas tenham perdido seu dinheiro. 
Em nvel internacional, h um processo de conglomerizao de empresas que, ao permitir a integrao de softwares, conferiu aos grandes grupos vantagens expressivas sobre os independentes. Tem que se pensar, para o Brasil, em uma empresa com modelo acionrio flexvel, que permita incorporar, em uma nica marca, os esforos individuais desses criadores e dispor da sinergia necessria para investir no mercado internacional. O jogador disse que antes de voltar Espanha visitar o Estado onde nasceu, a Bahia. Ontem levou, no Rio, os dois filhos a uma visita de rotina ao pediatra. Se sentir que h um desejo coletivo, ele vai encaminhar as emendas constitucionais, disse Luiz Henrique aps uma hora de conversa com Itamar, ontem, no Planalto. Luiz Henrique disse que o PMDB vai procurar os presidentes e lderes dos outros partidos em busca do consenso sobre a votao da reforma fiscal e tributria. A aprovao das emendas considerada essencial para o Plano Real. O corredor comear na marginal Tiet, na altura do Cebolo, passar pelo Ceagesp, praa Panamericana, Faria Lima, Lus Carlos Berrini e terminar, por enquanto, no shopping Morumbi. O escritrio de Jlio Neves j comea a preparar novos estudos para o prolongamento deste corredor alm do shopping Morumbi, em direo ponte do Socorro. No sei se fui claro: seria mais fcil um pinguim cruzar trs Saaras do que FHC comer carne de bode espremido a um magote de sertanejos suados. Compreendeste? No fosse a eleio, d. Luciano Mendes teria de negar Cristo trs vezes para que nosso senador empuleirasse no lombo de um cavalo, envergando aquele chapu de cangaceiro. O caso de Lula semelhante. O candidato do PT bateu na trave em 89. Agora imagina que chegado o momento. Sente o bafejar da sorte a acariciar-lhe a cara. Os metalrgicos do Estado de So Paulo da base da Fora Sindical (data-base em novembro) romperam negociao com os setores de autopeas, forjaria e parafusos e preparam greve. Os empresrios ofereceram 10,4 % em novembro para So Paulo, Osasco e Guarulhos e 6 % mais 6,27 % em maro para o interior. A categoria pede 69,69%. O Sindicato dos Metalrgicos do Rio e do Grande Rio (data-base em outubro) se rene hoje com representantes da Firjan (federao das indstrias do Rio) pela primeira vez desde o incio da greve da categoria, na tera-feira. O sindicato pede 99% e os empresrios oferecem 13,56%. A greve por empresa atingia ontem sete empresas e 12 mil empregados. O governador da Flrida, Lawton Chiles, 63, resolveu processar o governo dos EUA para recuperar os US$ 739 milhes anuais que ele diz estar gastando em despesas com imigrantes ilegais em seu Estado. Ele acusa o governo federal de no cumprir bem sua misso de impedir a entrada no pas de estrangeiros e de no repor o dinheiro gasto pelos Estados com eles. A deciso tem evidentes objetivos eleitorais. Chiles, que pertence ao Partido Democrata e ao mesmo grupo neoconservador que ajudou Bill Clinton a chegar Casa Branca, vai tentar se reeleger em novembro deste ano. Sua popularidade est mais baixa do que nunca e um de seus possveis oponentes filho de George Bush. O governo se comprometeu a no garfar a poupana, nas palavras do presidente do BC. Estabeleceu que as tarifas pblicas vo todas marchar ao compasso da URV at meados de maio. Depois ser a vez dos financiamentos rurais e, no incio de junho, acenar os rumos que podero tomar os pontos fundamentais do plano a poltica monetria e a poltica cambial. Ao justificar sua necessidade, o presidente da Repblica abriu uma fresta para a realidade na propaganda sobre as virtudes da URV. 
Mesmo indexado, o salrio convertido em cruzeiros reais continua a ser corrodo pela inflao. Mais de 110 mil pessoas assistiram partida, no estdio Santiago Bernabeu. O atacante Romrio no atuou bem. Afastado de seu companheiro de ataque, Stoichkov, pegou pouco na bola. Para Itamar, Stepanenko 'est perdido'. O presidente Itamar Franco disse ontem que o ministro Alexis Stepanenko (Minas e Energia) autor de vrios bilhetes pedindo apoio ao candidato Fernando Henrique Cardoso est perdido. Ele est perdido na China, disse Itamar, com bom humor. Stepanenko viajou para a China h uma semana e deve voltar na sexta-feira. A demisso do ministro dada como praticamente certa. Ir a uma feira livre serve de aula para os principiantes aprenderem a comprar frutas, legumes e verduras mais baratos e de melhor qualidade. Mas preciso disposio para andar a feira toda, com caderno e lpis na mo, comparando os preos dos produtos. O advogado Mariano Gonalves Neto, autor da ao popular que se arrasta h 12 anos, acusa os ex-ministros de aprovarem uma superavaliao dos terrenos entregues pelo grupo Delfin como pagamento de uma dvida que, em 1982, chegava a Cr 70 milhes. O ex-ministro da Fazenda Ernane Galvas (governo Figueiredo) disse que sua participao no episdio limitou-se a um de acordo no processo que teria vindo pronto do Banco Nacional de Habitao, subordinado na poca ao Ministrio do Interior. O episdio emblemtico do tipo de poltica externa exercitada pelo governo Clinton e ajuda a explicar por que ela to mal avaliada. At agora, o Bradesco Visa financiava em at quatro parcelas, com os juros cobrados sobre os cruzeiros reais. Para o lojista, a nova modalidade no traz mudana alguma. Ele receber o valor vista aps 30 dias. O candidato Presidncia pelo PT (Partido dos Trabalhadores), Luis Incio Lula da Silva, o convidado de Marlia Gabriela no Cara Cara. CARA CARA Bandeirantes, 0h. Marco Antonio Nahum, advogado dos canadenses, nega que o premi venha ao Brasil tratar da expulso. Isso ainda no ficou acertado pelo governo canadense, disse. A famlia de Christiane j gastou aproximadamente US$ 1 milho para tentar a sua libertao. Contratou a empresa Winfreys, a maior especialista em atividades de lobby, em todo o Canad. Terminado o Mundial, todas as atenes das selees j se voltam para a Olimpada de Atlanta. Um exemplo o tcnico norte-americano, Taras Liskevich, que no esconde seu objetivo: conquistar a medalha de ouro olmpica. Para isso, a comisso tcnica no se cansa de estudar os outros times. Nesta temporada, um dos mais visados foi o Brasil. Ele filmaram todos os jogos das brasileiras. A dcada de 90 no tem sido boa para a China. O time, que nos anos 80 dominou o vlei feminino, voltou a decepcionar no Mundial. Novamente, no conseguiu ficar entre os quatro melhores. Em 92, na Olimpada de Barcelona, as chinesas terminaram em stimo lugar, colocao que provocou grandes mudanas na equipe. O Santo Andr perdeu por 2 a O para o Rio Branco, de Americana, ontem tarde no estdio Bruno Jos Daniel, em Santo Andr. Para o Rio Branco, bastou o primeiro tempo do jogo para determinar o resultado. Souza, ponta-esquerda do time de Americana, abriu o marcador aos 25min. Paulo Cesar, cobrando falta na cabea da rea, fechou o marcador aos 43min, sem chances de defesa para o goleiro Slvio, do Santo Andr. Para se ter uma idia, no Brasil a melhor marca de Hilma, com 3,12 m.. No por acaso, o tcnico Bernardinho diz que o time cubano ataca do terceiro andar. Parar esse ataque no bloqueio uma misso quase impossvel. 
O presidente Itamar Franco tem, como qualquer cidado brasileiro, o direito de apoiar a candidatura para a Presidncia da Repblica de quem bem entender. Pode tambm cobrar dos membros de sua administrao que apiem o seu candidato ou deixem o governo. Ainda assim, os planos do Planalto de dedicar apoio total ao ex-ministro Fernando Henrique Cardoso, como esta Folha revelou na edio de ontem, esbarram em problemas. Embora no ocorra no Brasil h muito, absolutamente normal, nos pases de maior tradio democrtica, que o governo tenha um candidato e o apie. Ronald Reagan no poupou esforos para eleger seu vice, George Bush, presidente dos Estados Unidos, em 1988. A defesa apresentar o recurso extraordinrio no ltimo dia do prazo: trs dias teis aps a publicao no Dirio da Justia da deciso do TSE sobre o embargo declaratrio pedido pelos advogados de Lucena. Os advogados de Lucena calculam que isso no deve ocorrer antes de tera ou quarta, com Lucena j eleito, segundo acreditam. O embargo uma medida para que o TSE admita a constitucionalidade da questo o que permite que o assunto seja levado ao STF. FHC -- No h temor de recesso. No estamos fazendo poltica recessiva. No estamos diminuindo investimentos do setor privado e, no setor pblico, o BNDES est mudando a orientao de investimentos do Nordeste. Estamos num ciclo de expanso. Folha -- Mas no momento em que a nova moeda for criada, se os preos forem convertidos no pico, no ser necessrio um aperto monetrio, uma recesso temporria? A agncia de comunicao Free Press, do comit do candidato Fernando Henrique Cardoso (PSDB-PFL-PTB), reconheceu ter errado na divulgao de uma entrevista do diretor para Assuntos Internacionais do Banco Central, Gustavo Franco, concedida a ela como pea de campanha. Na sua defesa encaminhada ontem ao TSE, ela admitiu ter cometido um erro de interpretao na produo da reportagem na qual Franco afirma que o PT, caso ganhe as eleies presidenciais, pode dar um calote na dvida interna. Promotor pblico, procurador da Justia e um dos criadores da Comisso Justia e Paz da Arquidiocese de So Paulo, Bicudo lista como outras prioridades temas de sua especialidade: a reforma da Justia e do sistema penitencirio. Folha -- Quais so suas linhas prioritrias de atuao na Cmara dos Deputados? Para tentar se recuperar do jet lag e da correria da exposio do marido no Yerba Buena, Regina Cas fez ontem sada pela tangente. Junto com seu guapo Luiz Zerbini, foi at Muir Woods um bosque nas imediaes da cidade, onde s queria era abraar rvores de preferncia, enormes sequias. O grupo Po-de-acar, 2 maior rede de supermercados do pas, iniciou negociaes para a aquisio das 27 lojas G.Aronson, todas localizadas em So Paulo. A Folha confirmou os entendimentos junto a empresrios do setor de eletrodomsticos e a funcionrios do Po-de-acar. Oficialmente, a rede nega. Folha -- Em torno dele no estavam pessoas que hoje esto na sua candidatura? FHC -- Certamente, e da? O candidato do PT apresentou o vice de seu principal adversrio como algum identificado com o autoritarismo. At aqui, os tiros de Lula parecem no ter abalado o alvo. Meu passado muito me orgulha, d de ombros Maciel. O governador se convenceu ainda de que, se abandonassse a candidatura, poderia se tornar refm de Qurcia e perderia o poder de influncia na escolha do candidato do PMDB. A declarao de Fleury de que continua candidato no convenceu alguns prefeitos que o acompanhavam. Ele j est abraando a candidatura Qurcia, disse Itamar Borges (PMDB), de Santa F do Sul. 
Fleury aproveitou a viagem para responder ao ex-secretrio Jos Machado de Campos Filho, que o chamara de traidor por no apoiar Qurcia. Para quem quer ser candidato ao Senado, faltou bom senso e juzo, disse. Ao visitar a ponte rodoferroviria de Rubinia, criticou o deputado quercista Alberto Goldman (PMDB-SP), ex-ministro dos Transportes. Ele veio aqui e liberou US$ 5 milhes, 1% do valor da obra., declarou. Colaborou Aurlio Alonso, da Folha Norte. Apesar de todas as restries, a esmagadora maioria dos consultados (86%) acredita que Lula tomar posse sem resistncia de qualquer setor da sociedade. H uma pequena parcela, no entanto, que catastrofista ao ponto de afirmar que Lula sequer conseguir tomar posse. So apenas 2%. Para outros 9%, haver resistncias mas no a ponto de inviabilizar a posse. Burle Filho disse que combinou a ida de seu auxiliar aps telefonar para seu colega carioca, Antnio Carlos Biscaia. Segundo Burle Filho, Biscaia no lhe adiantou nada por telefone sobre o assunto. Queremos saber se o bicho de So Paulo e o do Rio so ramificaes de uma grande organizao criminosa. Ao mesmo tempo, estamos esperando denncias sobre a atuao dos bicheiros paulistas, disse Burle Filho. Mas o deixaram partir. Infante -- E pagaram as passagens. Me deram uma licena de dois anos, acreditavam que o exlio ia me neutralizar. Tinha uma famlia para sustentar, e pensavam que isso me impediria de fazer qualquer coisa. Pensavam que meu livro Trs Tristes Tigres seria recebido sem grande estardalhao. Outro lado de Lara: ela estudou canto lrico, adora msica e costuma cantar em shows e jam sessions com amigos. Como fazia em Madri. A idia para o livro surgiu aps uma entrevista que Andra fez em meados de 93 para o Tokyo Journal com o jogador Zico, poca atuando no Kashima. O editor da revista me telefonou e deu a idia. Fizemos um projeto e comecei a elaborar a estrutura do livro, diz Andra. Nos fundos de commodities h liquidez diria aps 30 dias. Na transio para o real, so mais recomendados os fundos de renda fixa DI (Depsito Interfinanceiro). RIO DE JANEIRO Aos trancos e barrancos, eis que mais uma vez o pas tenta dar um passo em direo a um futuro mais feliz. O panorama assustador: ameaa de greve, possibilidade de hiperinflao e descontrole de preos, perda salarial, nuvens negras para todos os lados. Exercendo o direito de no estar a favor ou contra ningum, gostaria aqui de remar contra a mar e pasmem manifestar meu otimismo. Por que no? Se h um artigo em falta no mercado hoje todo tipo de mercado, diga-se o otimismo. Por que no ousar ainda que ingenuamente esperar que as coisas desta vez se ajeitem? Olha o Carnaval de salo! S bela! S piranha! S lulite! Eu no s lulite, s gostosa. Rarar! E bomba! Bomba! Simo Emergncia! Um camel cearense inventou o kit Aids: um par de joelheiras e 45 camisinhas! Rarar! Comeou a esculhambao?! Carnaval! E os nomes dos blocos? Balana Rolha, Sacode a Rosquinha e Peru Esperto! Bloco da Galinha Azul Solta a Franga. Alpinista Sexual s Scala Monte Lbano. No, eu vou pescar. Ah bem, pensei que voce ia pescar. mole? mole mas sobe! Principalmente no Carnaval! T no ar mais uma calnia do Macaco Simo! Pior o Itamar que ficou intoxicado com o prprio pastelo! Nervosas! Atacadas! Inadimplentes do sexo em geral! Tem pelado no salo! Os Piranhos do Kung Fu invadem o salo! Essa a grande novidade do Carnaval 94: os pelades. Antes o Scala e o Monte Lbano s contratavam peladas. Depois do Clube das Mulheres (em Portugal, Clube das Bigodudas) os pelades invadem os sales. Gente, cada piranho! 
Parece remador de porta avio! E bem Clube das Mulheres: sunga, punho e gravatinha. E eu sei que tudo bofe mas parece tudo gay. Capa da revista Honcho. Esttica gay! Um rinoceronte de sunga mais sexy! Pelo menos agora na televiso tem pra todo mundo. No Brasil ns j estamos com oito tipos de opo sexual: perua, bofe, bofeca, boneca, traveca, hermafrodita, drag queen e drag king. E tem gente que ainda fica na mo? Rarar! Carnaval devia ser o ano todo. Cinco dias pra amador! A fazenda de renas Napapiiri, nas proximidades (tel. 38-4048) promove jantares com carne de rena e faz exibies folclricas. Preo: cerca de US$ 43. Dos museus, o melhor o Arktikum (leia reportagem ao lado). Mas h dois muito instrutivos, a cu aberto por isso, as visitas vo s de junho a setembro. Achei a idia muito interessante e educativa. A gente precisa de reciclagem em geografia, pois tudo muda muito rpido. L em Portugal, eu costumo comprar a Folha, afirma Barra. Em So Jos do Rio Preto, as duas bancas mais tradicionais da cidade tiveram filas na distribuio do terceiro fascculo do Atlas Folha/The New York Times. Esses ncleos coloniais, entre eles o do Vale do Ribeira, foram ocupados a partir de terras vendidas pelo governo. Para quem vai fazer a pesquisa no computador a separao de dados no faz nenhuma diferena: tudo funciona como se fizesse parte de um s arquivo. Segundo Milton Julio Zanluqui, da Paraklin Pra-raios, entre os tipos mais comuns (veja exemplos ao lado) esto o sistema de gaiola (Gaiola de Faraday), composto de seis partes principais: captador do tipo terminal areo, cabo de cobre, suportes isoladores, tubo de proteo, malha de aterramento e conector de medio. Esse equipamento envolve o prdio e indicado para construes que ocupem reas horizontais, como galpes e indstrias. preciso que tenhamos a conscincia de que os nossos tratados, que envolvem questes transcendentais que buscam uma circulao de bens e servios sem barreiras entre nossos pases, exigem uma viso da histria. No podemos deixar morrer o esprito de Iguau. Ele fechou um tempo de divergncias para abrir uma era de convergncias. Jos Sarney escreve s sextas-feiras nesta coluna. O Rio sempre foi um lugar que deu exemplo para o Brasil, disse o corregedor eleitoral, tentando desfazer o carter de interveno branca atribudo sua viagem ao Rio de Janeiro. Ns no vamos acreditar que um povo to culto como o da Guanabara possa ter problemas como andam dizendo por a, afirmou, referindo-se populao do antigo Estado da Guanabara, que desapareceu na fuso com o Estado do Rio de Janeiro. Um, o convencional, o que o Plano Real contempla e tem como instrumentos essenciais a importao taxa cambial fixa e baixa e quase sem tarifa aduaneira de mercadorias, tornando impossvel pela competio os produtores e comerciantes brasileiros aumentarem seus preos; e a reduo da demanda por bens de consumo e de investimento, mediante juros altos e corte do gasto pblico, impedindo as remarcaes de preos pelo excesso de oferta suscitado. O outro, heterodoxo e inconvencional, consiste em controlar as presses inflacionrias em sua origem, nas cadeias produtivas, mediante a fixao negociada entre todos os setores participantes de tetos para os aumentos de valores, sejam estes preos ou salrios. A agitao ontem foi grande tambm na Bolsa de Mercadorias com frequentes boatos de dificuldades de instituies financeiras em cumprirem seus compromissos no mercado futuro do ndice Bovespa. A projeo de juros acumulados para este ms no mercado futuro caiu de 3,76% no dia anterior para 3,71% ontem. 
O impacto provocado pela jazzstica trilha de A Marca da Maldade, com temas influenciados por Stan Kenton e at por pianistas de bordel, chamou a ateno de Edwards, que em 1958 procurava um msico novo, talentoso e barato para compor o prefixo da telessrie Peter Gunn. O jovem (34 anos), talentoso e barato Mancini entrou rachando. A fanfarra que a seguir comps para Dizem que Amor (High Time) s no ficou tambm gravada na memria de todo mundo porque, daquela vez, Edwards nem sequer chutou na trave. De todo modo, uma slida e longa parceria se configurava, alternando comdias e dramas cuja qualidade cinematogrfica variava infinitamente mais que as virtudes de suas trilhas sonoras. Quando os dois acertavam juntos (Bonequinha de Luxo, Vcio Maldito, A Pantera Cor-de-Rosa), era uma festa para todos os sentidos. E faturamento em dobro para o compositor, que no perdia uma chance de estourar no hit parade com uma cano. Por duas delas, Moonriver e Days of Wine and Roses, ambas com letra de Johnny Mercer, ganhou o Oscar. O baiano Dias Gomes, 70, est se preparando para ressuscitar um clssico da teledramaturgia brasileira. Semana que vem, comea a reescrever Irmos Coragem, novela de Janete Clair que a Globo exibiu com enorme sucesso entre junho de 1970 e julho de 1971. A emissora pretende lanar o remake em janeiro de 95, s 18h, no lugar de Tropicaliente. No centro de todos esses nossos esforos de qualidade est o nosso cliente. tudo pelo cliente. A principal meta da melhoria da qualidade da Degussa no Brasil a satisfao total do cliente. o cliente quem define os requisitos que teremos de cumprir e, at mesmo, superar, praticando qualidade em sua plenitude e obtendo, assim, seus benefcios, que so competitividade, lucratividade e satisfao. Entendemos que palavras como respeito, ateno e tica, incluindo a eficincia na prestao dos servios, devem mais do que nunca, nortear nossa conduta nas relaes com nossos clientes. PT, PDT, PC do B e os outros contras (partidos contrrios reviso) exigiram que as votaes do Congresso revisor comeassem pela questo tributria. A nova reivindicao irritou os partidos favorveis reviso, que criticaram a proposta. No dia 1 de maro, a fundao inaugurou mais uma escola. Ela foi construda em Marlia (cidade onde foi fundada o banco) com capacidade para mil alunos. Como nas outras escolas, ali os alunos recebem gratuitamente o ensino, o material didtico, o uniforme, a alimentao e tratamento mdico e odontolgico. A excluso de Maradona, por ter jogado dopado contra a Nigria, abalou os jogadores. O time, antes bastante aguerrido e solidrio, passou a discutir em campo. A Argentina fez uma opo bastante ofensiva. Variava entre um 4-3-3 e um 4-2-4. O tcnico Alfio Basile tentou aproveitar seus melhores jogadores. Ainda h muito o que fazer para desgravar completamente as exportaes de impostos indiretos, conforme permitem as regras do Gatt e impe a realidade de um ambiente internacional competitivo. Mas a Unio foi ao limite em termos de desgravao das exportaes. Antes da urgente reforma constitucional, a extenso da desonerao tributria das exportaes para todas as etapas da cadeia produtiva enfrenta dificuldades operacionais insuperveis no mbito federal. Seria invivel estimar, por exemplo, com a acuidade necessria para a concesso de crdito fiscal, qual a incidncia efetiva de PIS e Cofins em cada etapa do processo produtivo anterior produo dos insumos usados diretamente na produo para exportaes. Reginaldo Duarte pertence a uma famlia de polticos de Juazeiro do Norte (CE), mas nunca havia disputado eleio. 
Sempre fugia da raia. Achava que no tinha vocao poltica, explica. Em 90 ele no pde fugir. O Tasso Jereissati quis homenagear a minha regio e pediu um nome. Fui escolhido e no queria aceitar. Mas aceitei porque foi uma gentileza do governador, diz. Jnice Tristo afirma que acompanha a vida poltica de Elcio lvares h duas dcadas. Nunca havia disputado mandatos eletivos. Acha que foi escolhido como suplente pela sua imagem empresarial formada em mais de 50 anos. O tcnico Sacchi elogiou ontem a bravura de seus jogadores, especialmente de Baggio, com seu toque mgico para ganhar partidas. Ainda assim, foi a mdia que amplificou as acusaes, ao divulg-las sem uma checagem mais aprofundada. Culpa da mdia, ento? primeira vista, sim, mas a que se chega ao verdadeiro n da questo. Por partes: a lista de nomes mais completa foi ao ar durante o Jornal Nacional, da Rede Globo, por volta de 20h. De que adiantaria os jornais do dia seguinte omitirem os nomes, se eles j haviam sido arrastados lama por um noticirio que atinge cerca de 60 milhes de pessoas, certamente mais do que a tiragem conjunta de todos os jornais brasileiros? O treinador Levir Culpi no quis adiantar o substituto de der Aleixo. Segundo ele, a sada do meia-ponta representa uma perda importante para o time. Vamos com calma achar o substituto do der, disse. Culpi, no entanto, acha que o time pode suportar a presso e confirmar a vantagem em So Paulo. O time pode empatar a segunda partida para se classificar. O assessor de comunicao da Reduc, Fernando Fortes, disse que os grevistas esto desrespeitando a determinao do TST. O que nos preocupa o cumprimento da ordem judicial. Os grevistas esto irredutveis. O TST fala em manter a produo de combustveis e gs. Se eles no substiturem o pessoal, vo estar praticando desobedincia judicial, disse Fortes. Nenhum eleitor evanglico deve se sentir culpado por ter opinio diferente de seu pastor ou lder espiritual, diz a cartilha. O texto defende o voto para a Presidncia baseado em programas de governo. E condena a disseminao de boatos, como os que atingiram Luiz Incio Lula da Silva na eleio de 89. A IRLF s aprova o aborto quando a gravidez coloca em risco a vida da me. Mesmo quando a gravidez decorre de estupro ou quando o feto no tem chances de sobrevivncia, o aborto visto como crime. A IRLF condena ainda a eutansia e os mtodos contraceptivos no naturais, inclusive a plula. A tcnica tradiciona de tomografia no conseguia ver o abcesso no dente. Com o uso do mtodo tridimencional, foi possvel localiz-lo e descobrir a causa da morte. O artista, cujo nome de batismo Lus Carlos dos Santos, pagou uma fiana de CR$ 100 mil e foi liberado. A Folha procurou falar com Melodia sobre o incidente. O telefone da casa do msico permaneceu conectado a uma secretria eletrnica. Foi barra pesada, pelo som pareciam armas superpotentes, no foi um tiroteio qualquer, afirmou Tatiana Marques Pedrosa, 24, estudante de psicologia. Tatiana mora na rua So Clemente, Botafogo, em frente principal via de acesso ao morro. Morando h seis anos ali, Tatiana disse que nunca viu um perodo to violento. H dois meses que a coisa est pior. Com tanta violncia, sinto uma angstia muito grande, afirmou. A expectativa do Sebrae era que o volume de negcios chegasse a US$ 7 milhes na Fenit. Em 95, no entanto, a participao dos pequenos subsidiados pelo Sebrae na Fenit vai diminuir. Vamos reduzir o nmero de expositores para cem. A idia privilegiar os de maior qualidade e as feiras estaduais, diz Souza. 
Ao lado da Fenit foi realizada a Fenatec (Feira Internacional de Tecelagem), com representantes da Argentina, Alemanha, Frana e Estados Unidos. O IPMF padece de males profundos. Mas est correto o conselheiro Paulo Planet Buarque. Analisado isoladamente, um imposto democrtico e que, se levado sua consequncia lgica, o Imposto nico, poderia consumar a revoluo tributria que o pas deseja e precisa. MARCOS CINTRA CAVALCANTI DE ALBUQUERQUE, 48, doutor em economia pela Universidade de Harvard (EUA), vereador da cidade de So Paulo pelo PL e professor da Fundao Getlio Vargas (SP). Foi secretrio do Planejamento e de Privatizao e Parceria do municpio de So Paulo (administrao Paulo Maluf). Tambm nisto Bscoli estava em seu elemento: poucos podiam ser mais hilariantemente viperinos desde que a piada no fosse dirigida contra voc. E havia outro motivo para que ele atrasse a ira de tantos: seu sucesso com as mulheres. Cite uma cantora da bossa nova ou estrela da televiso dos anos 60 e h poucas chances de que ele no a tenha namorado sempre dando a impresso de que fazia isto por tarefa. Pela bossa nova, namoro at o Trio Irakitan, dizia. De acordo com Maluf, a equipe de elaborao do programa ter 30 dias para trabalhar e o nome mais cotado para coorden-la o do presidente nacional do PFL, Jorge Bornhausen. Segundo ele, a parte econmica do programa poder ser elaborada pelo ex-ministro da Fazenda Gustavo Krause (PFL-PE). Segundo Maluf, o programa de governo seria o ponto de convergncia para a discusso uma futura aliana eleitoral na sucesso presidencial. Ser um programa capaz de unir todas foras que querem levar o Brasil para o Primeiro Mundo. Um programa capaz de combater a inflao, o desemprego e que abra uma corrente de investimentos para o pas, disse Maluf. Se eu fosse voc, passava a andar de txi especial e apresentava a conta concessionria. Ligue imediatamente para o Procon (tel. 829-3055) e solte os cachorros. O livro est em toda parte e eu sou daqueles que no conseguem ver a imagem em outra dimenso. O que fao? O Brasil um pas do Terceiro Mundo, ainda sem cara e sem personalidade, mas metade do seu territrio a Hilia, que tem fascinado os iluminados, de Humboldt a Euclides da Cunha; acontece que, no passo de Jeca Tatu em que caminha, o Brasil ameaa transformar a Hilia, no sculo 21, que j nos olha com cara feia por cima do muro, num gigantesco cinzeiro, cheio da cinza das rvores e da cinza dos ndios. O Brasil s sair da sua mediocridade se se deixar incorporar Amaznia. E jamais chegar a isto se no tiver a total dedicao de algum como Ricupero, um excepcional cadre, que em pouco tempo convenceu os brasileiros da sua inteligncia e competncia na vida pblica. Quando Ricupero foi pinado da sua misso amaznica, lembrei Osvaldo Aranha, que dizia que o Brasil um deserto de homens e de idias. A TVs Bandeirantes e TVA/ESPN abrem espao para o futebol americano hoje. A Bandeirantes mostra o videoteipe de Kansas e Atlanta, s 17h45. A TVA apresenta em VT a partida entre Detroit e Dallas, s 7h. Promotores da Siclia pediram Justia local o julgamento de 23 pessoas, acusadas de conspirarem para assassinar Salvo Lima, poltico democrata-cristo morto pela Mfia pouco antes das eleies de 1992. A polcia deu ordem para prender mais sete pessoas suspeitas de envolvimento. O caso Lima coincide com a priso de 54 pessoas, como parte de uma operao anti-Mfia no oeste da ilha. Ignacio Perea, 32, foi condenado a cinco penas de morte em Miami. 
Ele sequestrou e violentou trs meninos com a inteno de lhes transmitir o vrus da Aids de que se sabia portador. As penas foram para sequestro, violao e tentativa de homicdio. a primeira vez nos EUA que um tribunal considera o vrus HIV como uma arma letal. O modelo de privatizao previsto na MP foi organizado pelo presidente do BNDES, Prsio Arida. No Planalto, no existe consenso sobre a MP. Inicialmente, a equipe econmica chegou a estudar a privatizao das empresas do grupo Nuclebrs, alm de Angra 1 e 2, mas no chegou ao fim. No apenas Marluce Dias da Silva a poderosa superintendente da Globo que considerada gnio em casa. To tchans quanto ela, seu marido Eurico o dono de uma coleo de restaurantes estrelados no Rio. O PDT prope uma auditoria para a dvida interna. Os papis da dvida seriam obrigatoriamente trocados por outros de vencimento a longo prazo (a partir de 15 anos, segundo o documento). O programa prope a monetizao da dvida interna (pagamento com dinheiro impresso) acompanhada da criao de uma nova moeda (assegurada a converso da poupana em termos favorveis) e imposio de rgido controle monetrio. Essa proposta permitiria ao governo negociar em posio favorvel o resgate dos ttulos pblicos, j que o programa admite o no-pagamento parcial da dvida, assegurada a poupana. O PDT pretende reduzir os impostos federais a quatro. Sobre reforma agrria, o projeto prev terra para 20 milhes de pessoas com assentamento de comunidades em reas vizinhas a estradas vicinais. O grupo suo baseia sua msica na combinao de barulhentas guitarras sampleadas, vocais guturais e pesadas batidas danantes. Ao vivo, sua msica no difere muito do pop agressivo de Ministry, Young Gods a quem substituiu no BHRIF, Nine Inch Nails e KMFDM. Os alemes, por outro lado, fazem um tecnopop assobivel e nostlgico no seu uso de sintetizadores analgicos anteriores ao surgimento da tecnologia dos samplers digitais. Segundo o COE, eram remotas as chances de Lauro e Alex serem encontrados com vida porque eles haviam levado comida suficiente para apenas um dia. S sobrevivemos porque encontramos duas cabanas de caadores com mantimentos e cobertores, disse Nascimento. O PPR tentou derrubar a reteno das verbas de habitao e educao no primeiro turno, mas uma manobra do governo barrou a mudana. Na ocasio, o PMDB apoiou o governo, mas agora no estaria totalmente unido para repetir a dose. A equipe econmica considera que cedeu tudo o que podia durante a votao da emenda no primeiro turno. O governo concordou em incluir no Oramento de 94 investimentos em educao nos mesmos nveis do ano passado e destinar para habitao trs vezes mais do que o registrado em 1992. Os nmeros destes investimentos ainda esto sendo fechados pelo Ministrio do Planejamento. O presidente Itamar Franco disse que sero mais de 50 demisses nas estatais do Ministrio dos Transportes por causa da desobedincia MP de converso URV. Vrias empresas ligadas a este Ministrio converteram os salrios pelo pico e no pela mdia dos ltimos quatro meses. Itamar disse que at agora poucas demisses foram efetivadas porque h sempre um processo legal e burocrtico que precisar ser cumprido. Mas todos sero demitidos, acrescentou. O presidente disse que pediu ao ministro das Minas e Energia, Alexis Stepanenko, que cheque tambm as estatais ligadas a ele para saber se tambm houve desobedincia. Participa de reunio partidria em Maravilha (Santa Catarina). tarde, vai a Belo Horizonte e visita o jornal O Estado de Minas em companhia de Hlio Costa, candidato do PP ao governo do Estado. 
noite vem a So Paulo para participar do programa Fogo Cruzado. Segundo Marcondes, o valor do patrimnio do hospital de US$ 50 milhes (R$ 42,7 milhes) e o total de dvidas, de R$ 50 milhes. Aps a negociao das dvidas, a reabertura do hospital est condicionada ao destombamento peloo Condephaat (rgo que cuida do patrimnio histrico da cidade) de uma rea de 10 mil m2, que dever ser comercializada. As frias nos EUA podem terminar no balco do consulado, onde o passaporte devolvido s 16h. O da advogada Zilda (no quis dar o sobrenome), 49, veio junto com uma carta dizendo que seu visto foi recusado. A Diviso de Vigilncia Sanitria informou que foram registrados 402 casos de clera, com quatro mortes, este ano no Estado. Os municpios com maiores incidncias so Nossa Senhora do Socorro, com 85 casos, Aracaju, 66, e Laranjeiras, 46. O Banco de Leite Humano da Secretaria Estadual de Sade montou um estande na praa Fausto Cardoso, centro de Aracaju. O objetivo informar sobre a importncia da amamentao. A atriz Cssia Kiss e a primeira-dama do Estado, Maria do Carmo Nascimento Alves, visitaram ontem o local. Iniciado em 1958, o estudo de O Capital por jovens intelectuais da USP s terminaria cinco anos mais tarde. Entre eles estavam figuras hoje conhecidas, como o filsofo Jos Arthur Giannotti, o crtico literrio Roberto Schwarz e o economista Paul Singer. FHC foi o primeiro a tirar consequncias de tal estudo. Em 1962, publica Capitalismo e Escravido no Brasil Meridional, primeira obra de sua autoria a se tornar referncia nas cincias sociais do pas. Para melhorar a ventilao, podem ser criadas janelas nos telhados ou pequenos vos.com telas para evitar a entrada de insetos. Os vos permitem a sada do ar quente, que fica na parte superior do ambiente. Vidros e janelas do transparncia casa e facilitam a entrada do ar. Mesmo com os ventos do litoral, suficiente o vidro de seis milmetros, no temperado. uma soluo econmica e segura, diz Vinograd. No piso, carpetes esto vetados. Deve-se usar pedras (pedra gois e ardsias), madeiras no-brilhantes (que riscam facilmente) e cermicas no-esmaltadas. Outra soluo, utilizada pelos decoradores Oscar Mikail e Fernando Rodrigues Alves na casa de praia do apresentador de TV Gugu Liberato, o piso com p de mrmore e cimento queimado branco. Uma casa de praia tem que ser prtica, ao mesmo tempo que confortvel, diz Mikail. Rubens Barrichello promete um anncio nos prximos dias. Fica com o pacote Jordan-Peugeot. Por enquanto, se dedica preparao fsica. Est com quilos de sobra. Christian Fittipaldi, por sua vez, negocia com a Tyrrell. Seu Sauber pode acabar trocando o certo pelo incerto ao esperar por Karl Wendlinger. A Diviso de Vigilncia Sanitria de Sergipe registrou nos primeiros 115 dias deste ano 87 casos de clera com uma morte. A chefe da Vigilncia, Nolia Soares, disse que, apesar da existncia de um surto da doena, o controle da qualidade das guas dos rios, poos e lagos tem evitado a propagao do vibrio. Aristides est esperando a publicao do texto do FSE aprovado pelo Congresso para concluir se a desvinculao dessas verbas atinge ou no direitos individuais. Pela desvinculao, o governo fica desobrigado de cumprir os artigos da Constituio que mandam aplicar percentuais fixos do Oramento em educao e habitao. Aristides tem defendido junto ao governo a aplicao do art. 212 da Constituio, que prev que Unio deve destinar pelo menos 18 % da receita de impostos educao. Os Estados e municpios tm que destinar 25 % desta receita. 
O papa Joo Paulo 2 escolheu a irm Emlia Ehrlich para ocupar uma das trs secretarias no snodo que acontecer em outubro. A reunio de bispos discutir a vida de padres e freiras. a primeira vez que o papa indica uma mulher para um cargo desse nvel na Igreja Catlica. As bibliotecas do Vaticano podero ser consultadas em breve atravs da Internet, rede de computadores com milhes de usurios. O material ser transferido para a memria de computadores. O projeto das empresas Xerox e Ernst & Rubican. Foram analisadas apenas a primeira amostra dos produtos e ainda deve ser feita contraprova. A Otker e a Somel vo pedir contraprova. A Mocotex vai sofrer anlise para saber onde ocorre a contaminao. Uma das poucas surpresas na eleio anticlimtica por que passou o pas foi a alta incidncia de votos brancos e nulos nas eleies legislativas e para os governos estaduais. Se abordo este tema aqui no para arriscar-me na anlise poltica, coisa que outros faro melhor do que eu. Meu intuito tentar entender que papel teve a imprensa nessa histria. So os garotos do Nabuco que vo administrar o negcio. Deciso do Supremo Tribunal Federal permite aos micropartidos disputar a Presidncia. O STF derrubou dispositivos da lei eleitoral que restringiam a participao dos partidos de acordo com sua representao na Cmara. 15 de maio -- Sarney desiste de disputar a prvia do PMDB contra Requio e Qurcia. Ela vencida por Qurcia, indicado candidato Presidncia. A absteno de 52 % dos peemedebistas credenciados a votar. A primeira etapa do M2.000 Summer Concerts amanh em Santos e domingo no Rio, inaugura a temporada 94 de caa aos shows. Espalhados pelos 7.408km do litoral brasileiro, os shows deste incio de ano tem l suas atraes internacionais. Nada comparvel aos shows de Madonna, Michael Jackson e Paul McCartney no final de 93. Saem os megas e entram os micros, minis e mdis. A dupla planeja agora temporada de dolce far niente. Em meio a tantos scuds por causa dos imbroglios de seu vice Jos Paulo Bisol, Luiz Incio Lula da Silva teve anteontem um providencial refresco. Preocupado com a facilidade de comunicao de seu adversrio, Francisco Rossi, Mrio Covas acatou as recomendaes de assessores do presidente eleito, Fernando Henrique Cardoso, e se submeteu s tcnicas de marketing. Avesso a um tratamento mais tcnico de candidatos durante as campanhas eleitorais, Covas protagonizou a mudana mais significativa no horrio eleitoral gratuito. A vida de alunos como Z no fcil. Eles tm que comparecer no Juquery de segunda a sexta-feira, das 8h s 17h. Em um dia da semana, Z faz planto noite. Fica mais de 36 horas ininterruptas no pronto-socorro. Uma vez por ms, d planto no fim-de-semana. Segundo ele, os traficantes tambm teriam interesse em negociar uma reduo em suas penas. Eles tm medo de ser mortos, porque a coisa est ficando violenta demais. Entregar-se, com a garantia de reduo da pena, seria um bom negcio, disse Maciel. A diria do aluguel de um Uno Mille sai por R$ 63, com quilometragem livre, na Avis. Na locadora Clean Car a diria do mesmo carro sai por R$ 72 com limite de 200 quilmetros por dia. A Localiza tem diria promocional de R$ 41 sendo que cada quilmetro rodado custa mais R$ 0,26. Na promoo o cliente ganha uma diria grtis a cada trs. Um advogado nova-iorquino transformou seu apartamento no Brooklyn num paraso tropical, com rvores, um sol artificial e pelo menos cinco jacars na foto, os policiais no momento da retirada de um deles. Os animais foram entregues custdia do zoolgico do Bronx. 
Um snodo da Igreja Catlica concluiu ontem que as freiras devem ter maior poder e responsabilidade. Mas postos de comando da igreja permanecero exclusivamente masculinos, disse o cardeal Eduardo Martinez Somalo em Roma. Pargrafo 3 -- Da aplicao do disposto neste artigo no poder resultar pagamento de benefcio inferior ao efetivamente pago, em cruzeiros reais, na competncia de fevereiro de 1994. Pargrafo 4 -- As contribuies para a Seguridade Social, de que tratam os arts. 20, 21, 22 e 24 da Lei n 8.212, de 1991, sero convertidas em URV e convertidas em UFIR nos termos do art. 53 da Lei n 8.383, de 30 de dezembro de 1991, ou em cruzeiros reais na data do recolhimento, caso este ocorra antes do primeiro dia til do ms subsequente ao de competncia. Pargrafo 5 -- Os valores das parcelas referentes a benefcios pagos com atraso pela Previdncia Social, por sua responsabilidade, sero atualizados monetariamente pelos ndices previstos no art. 41, Pargrafo 7, da Lei n 8.213, de 1991, com as alteraes da Lei n 8.542, de 1992, at o ms de fevereiro de 1994, e convertidas em URV, pelo valor em cruzeiros reais do equivalente em URV no dia 28 de fevereiro de 1994. A polcia do Mato Grosso do Sul vai combater a pesca predatria durante a piracema (poca de reproduo dos peixes), que termina no dia 31 de janeiro. At esta data, a pesca est proibida. Os pescadores detidos em flagrante usando tarrafas (redes de pesca) sero acusados de crime contra fauna, que prev de um a trs anos de priso. A polcia de Arealva (386 km a noroeste de So Paulo) prendeu trs rapazes acusados de estuprar a funcionria pblica A.P.S., 30, no ltimo domingo. Renato Batista Pedroso, 21, Douglas Jos da Silva, 21, e A.L.F., 16, esto incomunicveis. Segundo o delegado Roberto Ilhesca, Silva teria confessado do crime. Outros trs suspeitos esto foragidos. Ele confirmou as idias que eu j tinha, um candidato srio, responsvel. Sobre minhas questes em particular, na rea de educao, ele me impressionou muito. Tem muitos conhecimentos gerais e faz digresses um pouco longas. Daria a mesma nota para ele e para o Rossi. A Nigria conquistou ontem o ttulo da 19 Copa Africana de Naes, ao derrotar Zmbia por 2 a 1, em Tnis (Tunsia). O zagueiro Elijah Litana abriu o placar para Zmbia aos 3min de jogo. O atacante Emmanuel Amunike empatou dois minutos depois e marcou o gol da vitria aos 2min do segundo tempo. Uma proposta plausvel a de se fixar um prazo determinado, trs ou quatro meses, para que a sociedade se adapte voluntariamente URV, estabelecendo regras privadas de converso. Findo esse prazo, a converso para URV seria obrigatria, conforme regras previamente fixadas pelo governo. No h hiptese do Plano ser lanado antes do Congresso votar, primeiro, a emenda constitucional que cria o Fundo Social de Emergncia e, em seguida, o Oramento federal para 1994 com deficit zero. E com o Carnaval no meio. Isso significa que, tudo indo bem, a criao da URV fica para maro. A atriz e apresentadora norte-americana de TV Ricki Lake e outras 14 pessoas passaram a noite de anteontem na cadeia em Nova York, depois de haverem realizado um protesto nos escritrios do estilista Karl Lagerfeld em Manhattan. As 15 pessoas so membros da Peta (People for Ethical Treatment of Animals), entidade norte-americana que luta pelos direitos dos animais. Elas se algemaram entre si e nos mveis do escritrio em protesto contra a utilizao de peles animais em criaes do estilista. 
O cantor Stevie Wonder anunciou anteontem que sua turn norte-americana a primeira desde 1989 reverter fundos para campanhas contra a fome. O dinheiro arrecadado com a venda do single Take the Time Out ir para uma campnaha organizada pelo American Express. A falha ocorreu no momento de conferir, reembalar e repassar os pacotes de moedas para a tesouraria do banco. Calliari explicou que a posio das embalagens permitiu a queda de dois sacos, que depois foram parar na lata do lixo. Segundo Calliari, o barulho na agncia causado pela reforma do prdio abafaram o rudo da queda. O diretor-geral da PF, coronel Wilson Romo, disse que recebeu recomendaes expressas para investigar tudo a fundo. A vitria, porm, no foi suficiente para acalmar o ambiente no clube santista. O presidente Miguel Kodja Neto ignorou a diplomacia que normalmente marca o futebol para acusar o ex-jogador e empresrio Pel. Ele foi um timo jogador, mas um pssimo dirigente, disse Kodja, que quer o afastamento de Pel, assessor de assuntos internacionais, e do vice Samir Abdul Hack. Pelechian -- Sim, e gosto. Folha -- O sr. acredita ter influenciado estes filmes? Joo Paulo Bordon, vice-presidente da Swift-Armour, alega no ter sido comunicado sobre a realizao da assemblia da empresa para ratificar a concordata. O anncio legal foi publicado recentemente. O grupo Bordon enfrenta briga de famlia. O balano do Eldorado S/A informa que as irms Verssimo, filhas do fundador, ocupam cargos na vice-presidncia e diretoria do grupo. Sem acesso s contas do grupo, elas informam que no assinaram o balano. Elas no apenas restringem a liberdade e o futuro dos que tentam deixar Cuba como tambm oferecem expectativa de reformas democrticas foradas aos que ficam na ilha. O principal objetivo norte-americano evitar a repetio da fuga de mais de 125 mil cubanos em direo Flrida ocorrida em 1980 a partir do porto de Mariel. Na ocasio, Fidel Castro permitiu a sada de milhares de pessoas insatisfeitas com seu regime e abriu um novo espao para se manter no poder. Profissionais com longa experincia em cargos gerenciais e administrativos devem se preocupar, ao fazer o currculo, em mostrar a essncia da carreira, resumindo suas conquistas e realizaes. Muitos, no entanto, preferem descrever em detalhes toda sua trajetria educacional e profissional o que acaba deixando o currculo extenso e de difcil leitura. O filme bom, divertido. As mulheres jornalistas que esto na tela tm inmeros problemas. Alguns at se parecem com os meus. Mas nenhuma delas tem medo de no encontrar o carro na sada do cinema ou de ser agredida pelo guardador, ou de parar no sinal de trnsito na volta para casa. Amanh e por mais 15 dias tem mais filme. O de meia-noite timo, mas em Botafogo e acaba tarde. O das cinco tambm bom, mas no shopping mais concorrido da cidade e no vai ter ingresso. Nem vaga para estacionar, embora seja seguro. a Companhia de Dana de Deborah Colker que faz a abertura. Com figurinos de Tufi Duek e Yam Reis e festa doppo com a turma do ValDemente. Em discurso feito na semana passada em Johannesburgo, Arafat conclamou os muulmanos a uma jihad pela cidade de Jerusalm. O discurso, transmitido posteriormente por uma rdio israelense, colocou o dilogo em perigo. Nenhum filme brasileiro foi escolhido, quebrando assim a expectativa criada em torno de Alma Corsria de Carlos Reichenbach e Mil e Uma de Suzana de Moraes. O Brasil est assim fora das selees de Cannes-94. Tampouco nenhum filme latino-americano passou pelo crivo da Quinzena dos Realizadores. 
Ainda assim, é expressiva a presença nas outras seções do festival. Na competição estará o mexicano La Reina de La Noche de Arturo Ripstein. A paralela Um Certo Olhar selecionou Los Naufragos do chileno Miguel Littin e Sin Compassion do peruano Francisco Lombardi. Adeus America de Jan Schutte, O Casamento de Muriel de P.J. Hogan, 71 Fragmentos da Cronologia de Um Acaso, de Michael Haneke, Fresh de Boaz Yakin, Venus de Neve de Sotiris Gortisas, Costas às Costas, Cara a Cara de Huang Jianxin, Rainha Bandida de Shekkar Kapur, Sem Pele de Alessandro d'Allatri, Wrony de Dorota Kedzierzawska, Três Palmeiras de João Botelho, Katia Ismailova de Valeri Todorovski, Os Silêncios do Palácio de Moufida Tiati e o filme coletivo Homem, Deus e Monstro. O adversário que os romenos mais temem não é nenhuma seleção, mas o cansaço. O fuso horário será um duro obstáculo. A tabela da primeira fase prevê uma viagem Los Angeles-Detroit-Los Angeles para a seleção romena, em um espaço de oito dias, na primeira fase. O próprio agredido informou à polícia de que não se tratava dos agressores. No gramado, antes do jogo, representantes das torcidas dos grandes times de São Paulo (Corinthians, São Paulo, Palmeiras e Santos) fizeram uma manifestação pela paz. Como exemplo no combate à violência, o prefeito carioca cita a criação da Guarda Municipal, um grupo de atividades especiais que tem atuado mais na liberação de áreas tradicionalmente ocupadas por camelôs. Foram reabilitados 65 pontos da cidade, usados como pontos de venda de drogas e que viraram quiosques de flores, conta Maia. A princípio, a dúvida era sobre a necessidade de se jogar com dois volantes, meias que também auxiliam na defesa, Dunga e Mauro Silva. Esta armação dá à equipe um caráter excessivamente defensivo, o que desagrada àqueles que esperam da seleção brasileira um futebol alegre, sempre visando ao gol. O funcionamento é bastante simples. O cliente passará seu cartão de débito ou de crédito Bradesco Visa no terminal da empresa conveniada e fixará as datas de pagamento de suas compras. Na data acordada, os computadores do Bradesco debitarão (saque) a conta do cliente e creditarão (depósito) a empresa vendedora. O objetivo é facilitar a vida dos clientes e reduzir os custos da instituição financeira. O custo do processamento dos cheques é de US$ 0,70 por folha. O do pré-datado eletrônico, US$ 0,15. Sem educação, reforma ficará 'capenga'. O período da tarde do Seminário Internacional foi dominado pelo conceito de Qualidade Total. Não no âmbito da empresa, mas de um país inteiro. Tema: mercado, relações de trabalho e educação. As duas palavras não chegaram a ser ditas, mas resumem o consenso: investir em treinamento nas empresas é bom, mas não basta. No domingo, em Campos (280 km ao norte do Rio), Brizola concentrou as críticas em Cardoso e Lula. Disse que FHC é incompetente administrativamente e que Lula é inexperiente e que não gosta de trabalhar. Segundo os organizadores, havia 6.000 pessoas no comício às 21h, quando Brizola começou a falar. Quando ele terminou, às 22h10, metade das pessoas havia abandonado o local. A aposentadoria é, em tese, o tempo de realizar planos antes abandonados pela necessidade de trabalhar, mas nem sempre é isso que acontece. Muitas vezes as mudanças na vida do recém-aposentado levam a uma sensação de perda de capacidade. Essas mudanças geralmente coincidem com a crise da meia idade, que é definida não pela faixa etária, mas como um certo momento em que as pessoas reavaliam as escolhas feitas na sua vida. Que a organização investiu muito tempo e dinheiro na sensibilização para o programa e que, agora, é com eles.
Que no faria nada, a no ser cobrar os resultados do investimento. Zagalo assistiu ao jogo entre Flamengo e Corinthians, domingo, no Maracan. Elogiou dois jogadores flamenguistas e um corintiano. Ele afirmou que os atacantes Magno e Svio, do Fla, podem disputar o torneio Pr-Olmpico, em maro de 1996 na Argentina. Primognito de Tsunezaemon Maeda, Takayuki tinha trs anos quando o pai, a me e seis irmos chegaram a Ituverava, procedente de Sagaken, no Japo, para trabalhar como colonos na fazenda Santa Tereza. Tsunezaemon, que havia sido barbeiro em Lima, no Peru, onde morou por 14 anos, acumulara dinheiro com o qual comprou casas e prdios ao retornar ao Japo. Segundo Arnaldo Leite, coordenador do Programa de Capacitao Gerencial do Sebrae (Servio Brasileiro de Apoio s Micro e Pequenas Empresas), o programa se distingue de similares de outros Estados por ter aval prvio do Tribunal de Contas. De acordo com Leite, o Tc considerou desnecessrias as licitaes nas operaes do programa porque o Sebrae uma entidade civil sem fins lucrativos. Na tarde de ontem, diante de dezenas de jornalistas, Pertence zerou os computadores da Justia Eleitoral. A operao uma garantia de que todo o sistema estar vazio para receber os nmeros oficiais da apurao. Ao todo, sero usados cerca de 5.800 computadores. Os de maior porte esto nos TREs (Tribunais Regionais Eleitorais) e no TSE. O Contru (Departamento de Controle e Uso de Imveis) desinterditou ontem dez salas de cinema. Trs do shopping Ibirapuera, seis do Belas Artes e o Cinearte, no Conjunto Nacional. Continuam interditados o Cine West PLaza 3, o Comodoro e o Iguatemi, que s deve voltar a funcionar em dezembro. A produtividade das fbricas de nvel internacional foi o dobro da verificada nas demais, enquanto a qualidade foi cem vezes melhor. Elas registraram uma produo por hora de trabalho 43% superior s demais, afirma o relatrio. A frequncia na entrega de mercadorias para os clientes tambm revela grandes diferenas entre as empresas world class e as concorrentes. O ex-auxiliar de produo em cervejarias Joilson da Silva, 23, casado com um filho, pedia mais informaes sobre o abrigo. Eles do comida pra gente nesse lugar? Podemos dormir? O motivo seria a indefinio das regras de converso dos aluguis. O advogado Waldir de Arruda Miranda Carneiro, especializado em locao diz que a converso espontnea pode ser interessante dependendo da negociao entre as partes. Se o inquilino conseguir um bom desconto na hora da mudana do aluguel de cruzeiros reais para URV negcio para ele, diz. Isso porque ningum sabe quais sero os critrios da converso automtica, explica. Uma parada no Boathouse Cafe (East 72 com a Park Drive North) pode ser uma boa idia para quem est indo ao Metropolitan pelo Central Park. A lanchonete simples, mas faz um saboroso cachorro-quente por US$ 3. Quem gosta de comida italiana, deve experimentar o Christo's (Lexington, quase esquina com a rua 49). O cardpio varia entre pratos de massa al dente (cerca de US$ 15) e peixes grelhados (US$ 20). H algo de novo nestes quatro filmes que Cac Diegues fez para a TV Cultura, em cima de quatro canes de Gil, Caetano, Chico e Jorge Ben Jor. No maestria de virtuose, no rigor formal racional, no mensagem revolucionria, no filiao s artes conceituais, no achado de marketing. Queremos tambm negociar com as indstrias rentabilidade mnima de 15%, diz Arajo. Outra sada que ser buscada o aumento da venda de carne in natura, que hoje de apenas 30 % do total produzido. A meta chegar a 50 % , diz Ferreira Jr. 
Outro dia mesmo, durante a Copa, um jornalista ingls que escreve sobre o futebol para os principais jornais do mundo, reclamava dos poucos autores de expresso que escreveram sobre o fut. No caso do autor de O Amanuense Belmiro (recomendo a leitura de um dos inaugurais romances urbanos brasileiros) no havia nada especial a demandar a pergunta feita atravs do Alcino. Os agentes de turismo do Japo esto dando tratos bola para novo tour em solo brasileiro. Querem organizar pacotes para visitas ao tmulo do tricampeo Ayrton Senna eternamente favorito de 10 entre 10 japoneses. O nmero a soma dos US$ 8,6 bilhes arrecadados mais US$ 3,3 bilhes de dvidas das empresas (transferidas para os novos controladores) mais US$ 3 bilhes que o governo deixou de investir. Montoro disse que sobre esses US$ 15 bilhes o governo deixa de pagar US$ 1,5 bilho/ano de juros anuais, mais US$ 1,5 bilho/ano em amortizaes de dvidas e mais US$ 2 bilhes/ano em aportes de capitais s empresas. A produo de petrleo na Argentina aumentou 30 % entre 1991 e 1993. O salto na produo atribudo ao sucesso da privatizao da indstria petrolfera. De 1991 a 1993, a produo de petrleo local pulou de 20 milhes para 34,60 milhes de metros cbicos. ltima dica: no espere subir ao cesto do balo para iniciar suas fotos. A preparao para o vo, por si s, j um espetculo parte. CIRO COELHO editor-assistente de Fotografia. A ex-prefeita Luiza Erundina participa hoje da divulgao de um balano comparativo entre sua administrao 89 a 92 e o primeiro ano da gesto Paulo Maluf frente da Prefeitura de So Paulo. O balano aponta para uma queda nos investimentos nas reas de sade, educao e habitao e reduo nos salrios da prefeitura. Uma pesquisa da Associao Nacional de Agncias de Viagens (Abav) revelou que o setor foi incrementado 29 % em mdia desde a implantao do Plano Real. Para o presidente da Abav, Srgio Nogueira, a expectativa que o mercado de viagens e turismo recupere este ano os mesmos ndices favorveis de 1990, ano em que foi registrado o maior volume de vendas em passagens areas. As agncias movimentaram cerca de US$ 2,5 bilhes em 90. Na sonolenta Eindhoven, cidade do Brabante (regio do sul da Holanda), seriedade e pontualidade so apreciadas. Gostamos de gente que cala a boca e faz seu trabalho, como Ronaldo. Romrio era querido, mas em certos bares ele no entrava, diz um taxista. O tricolor, aps tantas decepes na temporada, vai s finais da Conmebol, caa-nqueis to inexpressivo que o prprio So Paulo inscreveu apenas jogadores jovens, que se denominaram Lees, ao invs do tradicional Expressinho. Apesar disso ou exatamente por causa disso, conseguiu a classificao diante do Corinthians, que ficou no meio do caminho: nem escalou sua fora mxima, nem optou por um time de juniores com alguns reservas de reforo. Assim, o tricolor conseguiu impor seu melhor conjunto. Mas ter sido apenas isso? Vamos confrontar os titulares que tudo perderam com esses meninos que so a derradeira esperana de um ttulo neste ano: enquanto os titulares de Tel privilegiam o meio-campo, com trs, quatro, s vezes cinco volantes congestionando o setor, o que, na prtica, no confere nem poder ofensivo ao time, tampouco proteo infalvel defesa, os garotos jogam com Mona, o nico volante tpico. O resto composto de meias-armadores e atacantes. Ser mera coincidncia? Em respeito histria, diga-se que o mal que a UDN fez ao pas no decorreu de seu moralismo, mas de sua amoralidade. 
O mal residia, enfim, no fato de que a poltica real da UDN se traduzia no que conspirava entre quatro paredes, sem uma antena parablica que lhe pusesse termo e sem uma imprensa independente que revelasse, livre de paixes partidrias, sua vocao para renunciar ao Estado de Direito ao toque da primeira corneta. A imprensa independente no quer cassar de Itamar Franco o direito de fazer seu sucessor. Tambm no lhe pede que cometa o maior pecado de um governante, que no governar e no servir ao pblico. Que o faa, no entanto, todos os dias do ano, no apenas quando o pas est boca da urna, e nos limites da lei. Embora j estejam com as chaves do imvel novo, s devem ir para l em fevereiro. Nossa vontade era mudar antes. O casal quer trocar o piso do banheiro, revestir o piso das reas sociais com madeira e instalar armrios. Metade dos Estados e municpios do pas federativa e economicamente invivel, segundo Aspsia Camargo. A sociloga e cientista poltica falou sobre o tema na ltima conferncia do encontro da Associao Nacional de Ps-graduao em Cincias Sociais (Anpocs). Na noite de ontem, havia cerca de 3.000 pessoas retidas no aeroporto de Ezeiza, o principal de Buenos Aires. Os vos da tarde de ontem entre Buenos Aires e So Paulo ou Rio foram retidos em Ezeiza. Sei disso. Procure saber se voc se entrega, ou se voc foge de suas emoes. Mas no faa perguntas como esta, porque o amor no grande nem pequeno. apenas o amor. No se pode medir um sentimento como se mede uma estrada. Se voc tentar medi-lo, estar enxergando apenas seu reflexo, como a da lua em um lago; no estar percorrendo seu caminho. Eu estou decepcionado, afirmou Coulthard aps o treino. O escocs atribuiu o seu desempenho ruim ao fato de desconhecer o carro sob as novas regras. Pela primeira vez neste sculo, vastos setores das classes mdias ocidentais no conseguiro efetivar a proeza social -- cimento da democracia e do capitalismo -- realizada pelas geraes precedentes: garantir a seus filhos um nvel de vida igual ou superior ao que seus pais haviam desfrutado. Ensino universitrio, contas de poupana nos bancos, investimentos em aes e imveis no asseguram a transmisso do patrimnio familiar dos pais para os filhos. Esse quadro de inseguranas e de incertezas que j dura h duas dcadas contribui para transformar o integrismo, o racismo, a intolerncia poltica num formidvel desafio ao sistema democrtico neste nosso fim de sculo. LUIZ FELIPE DE ALENCASTRO, 48, historiador, pesquisador do Cebrap (Centro Brasileiro de Anlise e Planejamento) e professor do Instituto de Economia da Unicamp (Universidade Estadual de Campinas). Emlia conta a Adelaide, tambm, sobre o livro que est escrevendo, que trata da famlias mais importantes de So Paulo. Adelaide critica Emlia por esta sair pouco de casa e, em seguida, deixa a sala irritada pela pouca ateno qu lhe dispensada pela me. Cada curso dura dois sbados, em um total de 14 horas, e custa US$ 250. As prximas datas dos cursos so dias 23 e 30 de julho. Santana era morador antigo de Lambari d'Oeste. Os trs cortadores de cana eram de Alagoas e estavam na cidade havia 15 dias. Lambari d'Oeste (5.000 habitantes) virou municpio h dois anos. Tal cobertura, assistemtica e voltada para buscar escndalos, leva-os, tambm, a trabalhar sobre fatos consumados. No estamos concluindo que este comportamento intencional, mas sim que h uma forte predisposio cultural (preconceito) em criticar o Legislativo antes mesmo de conhecer tudo que cerca algumas decises. 
Os agentes polticos, sociais e econmicos tambm tm de se reciclar, pois sua ao sobre o Legislativo paulistano s ocorre em funo de interesses especficos. Que a democracia contempornea fortemente corporativa o sabemos, mas esperamos que no continue a ser to cheia de particularismos! RUI TAVARES MALUF, 35, mestre em cincias polticas pela Unicamp (Universidade Estadual de Campinas), analista da Superintendncia de Experincias Metropolitanas Internacionais da Emplasa (Empresa de Planejamento Metropolitano da Grande So Paulo). Vinte marcas criaram t-shirts especialmente para o evento, sob o tema Atitude Forum, Zoomp, Zapping, Reinaldo Loureno, Walter Rodrigues, Der Haten, Viva Viva, Armadilha, Ellus, Idice, Sucumbe a Clera, Cia. do Linho, Special K, Lorenzo Merlino e Herchcovitch. Do movimento underground esto presentes Anderson Rubbo, Cac di Guglielmo, Divas e Alessandro Tierni. O direo geral de Paulo Borges. No galpo sero mostradas fotos dos bastidores dos lanamentos de moda em registros de Claudia Guimares, da Folha Imagem. O mercado acionrio foi bastante agitado ontem. A Bovespa (Bolsa de Valores de So Paulo) movimentou R$ 450,1 milhes. o maior volume desde 1 de novembro ltimo (R$ 763,6 milhes). O mercado trabalhou com a expectativa de aprovao, pelo Conselho Monetrio Nacional, de medidas de socorro a bancos em dificuldades. O ndice Bovespa operou em alta durante toda a manh e incio da tarde, mas fechou em baixa de 0,34%, considerando as medidas do CMN como restritivas ao funcionamento do sistema financeiro. Seria uma repetio do que ocorreu em 1989, durante o Plano Vero, quando desapareceram 51,87 % de inflao do ndice oficial de correo monetria. Muitas empresas entraram na Justia para obter o direito de contabilizar essa corroso inflacionria em seus balanos, e ganharam as aes em primeira instncia. Os processos continuam tramitando. O dlar futuro na BM&F foi cotado a R$ 0,977425 para os negcios com vencimento no prximo dia 29. O ndice Bovespa futuro fechou a 39.600 pontos, com expectativa de valorizao de 7,10 % ao ms. Aposto uma boneca Barbie como os pais de crianas entre cinco e doze anos ainda vo perder vrios fios de cabelo procurando por um tal de Mighty Max debaixo do sof, no cinzeiro do carro ou atrs da geladeira. Mighty Max um boneco de plstico, que convive com monstros e morcegos em um universo miniaturizado. O brinquedo a mais recente sensao da apetitosa loja Fao Schwarz, da Quinta avenida, em Nova York. Juan Manuel Fangio, sobrinho do argentino pentacampeo de Frmula 1, est testando na Califrnia um novo prottipo que vai dar o que falar na Frmula Indy. Trata-se do Eagle uma reedio com motor Toyota e chassis Lola do legendrio carro feito por Dan Gurney. Assinaram o acordo Luxemburgo, Blgica, Holanda, Itlia, Frana, Alemanha, Espanha, Grcia e Portugal. No so signatrios Inglaterra, Irlanda e Dinamarca. Os pases visitados podero exigir a comprovao de meios de subsistncia para a permanncia do turista. Essa permisso de entrada de brasileiros, sem o visto, partiu de uma negociao entre Portugal e a Unio Europia. Na segunda-feira, a me de Christian, Suzy Fittipaldi, dissera que gostaria de pedir ao filho para que abandonasse as pistas. Ontem, ela foi ao aeroporto e se disse contente por t-lo de volta ao colo. s 11h30 de ontem, no escritrio da Fittipaldi Promoes, empresa da famlia, Christian afirmou que, enquanto estiver motivado e achar boas as condies de segurana, continuar na Frmula 1. Folha O Banco Central est se espelhando na experincia de algum outro pas, em operao semelhante? 
Tavares -- Não usamos nenhum modelo porque temos uma característica muito nossa, que é o tamanho do Brasil. É um continente e um país de Terceiro Mundo. Vai ter real chegando de barco, caminhão, avião, todos os meios de transporte. Já tinha feito dieta, ginástica localizada, exercícios com aparelhos e não conseguia eliminar essas gordurinhas. Achei que a 'lipo' seria uma boa alternativa. A cirurgia durou duas horas. Silvia tomou anestesia peridural (aplicada na região da coluna) e pôde retornar para sua casa no mesmo dia. No Tatuapé, onde a falta de atendimento é comum nos fins-de-semana, havia ontem clínicos e cirurgiões trabalhando. Ninguém foi encontrado ontem nas secretarias municipais e estaduais da Saúde para comentar o assunto. Na semana passada, o mercado de gado leiteiro foi marcado por uma surpresa: o gado girolando atingiu preço superior ao holandês puro-sangue nos leilões. Dia 19 último, o 2º Special Girolando atingiu a média de R$ 3.600 por 44 fêmeas. Dois dias depois, o Salute Milk baixou a média para R$ 3.000 na venda de metade das 30 vacas holandesas que foram apresentadas para negócio. Segundo o Instituto Médico Legal de Taboão da Serra, todas as vítimas levaram mais de um tiro, a maioria na cabeça. A arma utilizada foi uma pistola automática 380. Os crimes no Itaim Paulista ocorreram na rua Enseada de Itapacoria, 327, por volta das 20h de sábado. Muller chega às 5h25 de hoje a São Paulo, vindo de Londres. O jogador contou o motivo que o obrigou a não acertar com o Everton. Não garantiram casa, carro, assistência médica e passe livre depois de quatro anos, disse o diretor Kalef João Francisco, que falou ontem com Muller. O contrato, diz ele, expirou ano passado e só foi renovado em fevereiro. Ele disse que a receita da ferrovia não cobre as despesas. Gazeta Filho confirmou que os dormentes se deterioraram muito antes do previsto. Segundo ele, os cerca de mil dormentes podres serão trocados em curto prazo. O percurso pelas avenidas Consolação, Rebouças e Francisco Morato durou exatos 20 minutos, sem trânsito moroso. Foi possível cumprir os 13,4 quilômetros do percurso em uma velocidade média de 40 km/h, incluindo-se a espera nos semáforos. O que corresponde a uma rentabilidade de 13,03% sobre o patrimônio líquido. O grupo Votorantim recebeu o ISO-9002 para o cimento Votoran, produzido em Volta Redonda. Os serviços de inteligência da Argentina que trabalham com a ajuda do Mossad (serviço secreto de Israel) e do FBI (Polícia Federal dos EUA) atribuem o atentado ao Hizbollah, grupo muçulmano xiita pró-Irã. O grupo e o governo iraniano negaram envolvimento. Ontem, o juiz Juan Jose Galeano, responsável pelo caso, viajou para a Venezuela. A corrida sucessória começa esta semana com um quadro mais claro e definido do que o da semana passada. Embora ainda não possa ser definitivamente descartada, a hipótese da candidatura do ex-presidente José Sarney encontra-se pelo menos momentaneamente bastante afastada, abrindo espaço para que a campanha comece a deslanchar com base no atual grid de largada. A única alternativa que resta a Sarney é de fato que o Superior Tribunal de Justiça acate a denúncia de estelionato apresentada pelo Ministério Público Federal contra Orestes Quércia e que, neste caso, o ex-governador de São Paulo renuncie à candidatura do PMDB. É uma possibilidade remota. Não é só a expectativa de uma tramitação demorada até a decisão do STJ o que pode inviabilizar uma eventual candidatura Sarney por falta de tempo até as eleições, como também há informações dando conta de que o ex-presidente poderá em breve anunciar seu apoio a Fernando Henrique Cardoso, do PSDB.
Estudo da Associao dos Engenheiros da Petrobrs sobre a estrutura de preos dos combustveis mostra que a Petrobrs tem uma margem de at 29% no preo do combustvel que poderia ser usada para reduzir o valor da gasolina. O clculo baseia-se na comparao dos preos brasileiros com o americano. A Folha procurou a Petrobrs mas no obteve retorno. Os Bulletin Board Systems (BBS) esto oferecendo um novo servio aos seus usurios: a conexo com a rede das redes, a InterNet. Essa rede, criada h 20 anos pelo Departamento de Defesa dos Estados Unidos para conectar os seus vrios centros de pesquisa, considerada a maior rede de comunicao entre computadores, contando com milhes de usurios espalhados por vrias cidades do mundo. Os servios mais procurados na InterNet so correio eletrnico e transferncia de arquivos, conhecida como FTP (File Transfer Protocol). Para se ter uma idia da importncia da InterNet, a Casa Branca est conectada a essa rede, disponibilizando publicaes e documentos para qualquer pessoa do mundo. O acantonamento Itere oferece futebol, vlei, caminhadas, brincadeiras, e est com vagas para as temporadas de 17 a 23 e de 24 a 30 de julho. Fica na fazenda Itere, em Juquitiba, a 120 quilmetros de So Paulo. Steven quer aproveitar a viagem ao Brasil para completar sua discografia dos Mutantes e conhecer Arnaldo Baptista. A advogada diz que o Mais! adulterou a obra para satisfazer a vontade de neonazistas brasileiros. Vera Lucia Vassouras, que negra e se diz militante do PDT, sustenta que os autores do livro em nenhum momento afirmaram que os negros so geneticamente inferiores ao brancos. No texto publicado no Mais!, os autores do livro afirmam que as pontuaes de QI aumentam com o status econmico, em ambas as raas. Mas a magnitude da diferena entre negros e brancos nos desvios padres no se reduz. Na realidade, ela aumenta medida que as pessoas ascendem a escada socioeconmica. H excees. Uma blusa preta pode ser usada com um suti branco se a saia for branca, explica. A angua deve ser usada sob roupa transparente. Nunca use angua curta com uma saia comprida. CONCERTOS INTERNACIONAIS GLOBO, 0h. A comdia romntica Suburbano Corao conta a histria de Lovemar (Andra Beltro), uma sonhadora costureira do subrbio que idolatra o locutor de rdio Rogrio (Marco Nanini). Adaptado da pea homnima de Naum Alves de Souza, o especial tem ainda no elenco Marisa Orth, Pedro Paulo Rangel e Diogo Vilela. A direo de Guel Arraes. Alm de Braslia, devem atrasar o relgio os Estados de So Paulo, Rio Grande do Sul, Santa Catarina, Paran, Rio de Janeiro, Minas Gerais, Esprito Santo, Gois, Mato Grosso do Sul, Mato Grosso, Amazonas e Bahia. A alterao deve ser feita de acordo com a hora local, respeitando portanto as diferenas normais de fusos horrios. Quem est duas horas mais cedo que Braslia mudar o horrio duas horas antes, portanto. Com a mudana, o metr de So Paulo vai funcionar hoje uma hora a mais, como se tivesse duas meias-noites. Os relgios sero atrasados meia-noite para 23h, e os trens continuaro a correr at a segunda meia-noite. Isso vale para os trens das linhas Norte-Sul e Leste-Oeste. O metr Paulista continua fechando s 20h30 e, portanto, no ser afetado pela mudana; as outras linhas do metr funciona das 5h00 s 24h00. A proliferao nessa rea maior pelo fato dos indgenas no pagarem impostos com a atividade, mas apenas taxas. Muitos dos cassinos tm grandes grupos por trs, que usam incentivos aos ndios como fachada. A emoo cresce quando atravesso o salo de festas e chego s salas do museu, do Memorial de Menininha do Gantois. 
Aqui era o quarto pobre, simples, limpo e acolhedor. A cama não era um leito de enferma, era um trono de rainha. Apoiada nos travesseiros, o busto levantado na animação da conversa, o rosto concentrado no jogo dos búzios, no instante da adivinhação, Menininha do Gantois personificava a verdade do Brasil, de um Brasil mais profundo e mais belo, situado além da corrupção, da injustiça, da violência, da mentira, das pequenezes, da delação transformada pelos pobres homens da baixa política em suprema virtude nacional. Ai, mãe Menininha, acode-nos nesta hora de quase desespero, dá-nos o alimento da confiança e do sonho. Aqui, neste espaço onde se reverencia sua memória, eu a recordo amiga de toda uma vida, nossas longas vidas vividas na intensidade da paixão: com ela aprendi a bondade e o povo. Me ensinou que só o povo constrói grandeza e o faz desinteressadamente, no dia-a-dia da generosidade. O Instituto Brasileiro do Patrimônio Cultural reinaugura dia 18 o Museu Victor Meirelles, em Florianópolis. O museu funciona na casa em que nasceu Meirelles, autor do quadro A Primeira Missa no Brasil, sua obra mais famosa. O prédio foi restaurado. A Cartilha da Segurança Escolar foi lançada anteontem, em Porto Alegre. O autor do projeto, vereador Isaac Ainhorn (PDT), disse que o objetivo é orientar os estudantes sobre segurança pessoal, segurança no trânsito e prevenção contra as drogas. O patrimônio era de CR$ 26,3 trilhões, equivalentes a US$ 16,3 bilhões, contra US$ 6,9 bilhões dos fundões tradicionais, que vêm encolhendo. A grande vantagem dos fundos de commodities é a possibilidade de saques diários, sem perda do rendimento, após 30 dias. Um grupo de mil policiais militares dos EUA sob o comando do ex-chefe de polícia de Nova York, Raymond Kelly, começou a trabalhar ontem em Porto Príncipe com o objetivo de ajudar a manter a ordem pública. Trezentos soldados de cinco países da região do Caribe também chegaram ontem ao Haiti com a mesma missão. O coordenador da Inteligência Fiscal da Receita, Marcos Vinícius Lima, disse que foram encontrados em 14 estabelecimentos de um conceituado colégio privado de São Paulo os equipamentos importados por uma fundação educacional ligada ao mesmo grupo. Esse colégio foi multado semana passada em 2 milhões de Ufir, quase R$ 1 milhão, valor que corresponde ao II (Imposto de Importação) e IPI (Imposto sobre Produtos Industrializados) sonegados por meio da fraude. Manoel Carlos Marques Beato, do restaurante Fasano, e José Sebastião Figueiredo, sommelier e proprietário do La Bicocca, garantiram suas vagas no Concurso Brasileiro de Sommeliers. No segundo semestre, o concurso escolherá um destes profissionais (responsáveis pelo serviço de vinhos num restaurante) para representar o Brasil no concurso mundial, no Japão, em 95. Jack Girafa Charlton, técnico da Irlanda, usou seu pescoço comprido e seus quase dois metros de altura para vislumbrar a passagem da equipe para as quartas, depois de uma estréia vitoriosa contra a Itália na primeira fase. Acabou saindo nas oitavas. Com a desclassificação, Jack Girafa não teve o comportamento pacato que se espera da gigante das savanas. Criticou a Fifa até pelas diretrizes sobre suprimento de água durante as partidas. Embora seja concorrente respeitado, a Nielsen não representa uma ameaça real, diz Flávio Ferrari, diretor da Ibope Mídia. Acrescenta que o cinquentenário instituto que, a exemplo da Nielsen, tem atuação internacional, se dispõe a oferecer eventuais relatórios diferenciados do concorrente. No final do último ano, Coelho dividiu a cena com o tenor espanhol Plácido Domingo.
Há um mês, estrelou Salomé, de Richard Strauss. O jornalista LUÍS ANTÔNIO GIRON viaja à Áustria a convite da agência Austríaca de Turismo e da Lufthansa. No sábado, dia 2, a CET (Companhia de Engenharia de Tráfego) restabeleceu o sentido normal de circulação da rua Visconde da Luz, no Itaim-Bibi (zona sul), da João Cachoeira para a Clodomiro Amazonas. A mudança foi determinada como opção de retorno ou acesso à avenida Santo Amaro. Parte do crescimento de Fernando Henrique na última pesquisa Datafolha deve-se à evolução do tucano entre os eleitores que declaram simpatia pelo PMDB de Orestes Quércia e pelo PPR de Esperidião Amin. FHC foi de 33% para 41% entre os peemedebistas (oito pontos percentuais de crescimento) e de 31% para 42% no eleitorado do PPR (11 pontos). 24 de novembro de 92 -- Um Boeing 737, da China Southern Airlines, bate em um morro pouco antes de aterrissar na cidade turística de Guilin, causando a morte de 141 pessoas. 23 de julho de 93 -- Um jato britânico Aerospace 146, da China Northwest Airlines, caiu na região autônoma de Ningxia Hui. Dos 133 passageiros e tripulantes que estavam na aeronave, 55 morreram. Pergunta -- Quais suas opiniões sobre a atuação do Ministério Público na investigação da contravenção? Resposta -- Injusta principalmente quando nos acusa de tráfico de drogas, de participar de quadrilha e sequestro. A Ráscal Pizza & Cozinha criou um cardápio especial para o almoço da Páscoa de amanhã, com uma massa com um recheio e um molho à base de peixes. Pelo preço de CR$ 6.200, os clientes poderão se servir à vontade do ravióli com recheio de salmão. Os que preferirem carne podem optar por um nhoque preparado à base de vitela. A pizzaria foi inaugurada pelo Grupo Viena há dois meses e oferece 21 tipos de pizzas no forno à lenha, além de massas especiais, grelhados e saladas. Presidente arruma cargos para integrantes do 'Grupo de Juiz de Fora'. Na reta final de seu mandato, o presidente Itamar Franco decidiu arrumar empregos, a partir de janeiro próximo, para seus amigos que compõem o chamado Grupo de Juiz de Fora. Parecia concurso de perguntas idiotas a cobertura do carnaval nos salões, terça. Manchete, Gazeta e Bandeirantes empataram. Você se acha louca? Os cargos oferecidos são, por exemplo, gerência de produção industrial, engenharia de projeto ou desenvolvimento de produtos. Pesquisa da Laerte Cordeiro & Associados, com base em anúncios de emprego, indica que a área voltou a ocupar a segunda posição na demanda. Países ex-comunistas como Polônia e Hungria têm interesse em entrar na Otan. Eles temem, porém, que um relacionamento especial entre Moscou e a aliança os deixe sem garantias de segurança. De seu lado, a Rússia manifestava preocupações semelhantes diante da aproximação da Otan em relação aos antigos membros do Pacto de Varsóvia (a aliança militar pró-URSS, extinta em 1990). São três as certezas entre as partes que negociam o aumento de salário para o funcionalismo: a decisão não pode ser adiada, a aparência de equilíbrio fiscal deve ser preservada e Ricupero ficará no cargo em quaisquer circunstâncias. Comandantes militares voltaram a receber comunicado esta semana, destinado à tropa, dizendo que o reajuste do soldo sairá este mês. Os maestros candidatos ao prêmio de melhor regente são Claudio Abbado, Simon Rattle, Valery Gergiev, Mariss Jansons e John Eliot Gardiner. Para melhor cantor, figuram José Carreras, Thomas Hampson, Ben Heppner, Sergei Leiferkus e Bryn Terfel. Entre as mulheres, estão Cecilia Bartoli, Jessye Norman, Anne Sofie von Otter, Dawn Upshaw e Galina Gorchakova.
O Classical Music Award promovido pela BBC Music Magazine e pelo jornal The Independent, e tem o patrocnio da empresa Kenwood. Outros prmios incluem personalidade do ano, melhor produo de pera, melhor transmisso de TV, melhor grupo de msica antiga, entre outros. Antonio Jorge, diretor tcnico da Amil, diz que a empresa ainda estuda a converso, mas que os carns de junho j sero emitidos em URV. Ele garante que o aumento real nas mensalidades no vai superar os 22 % propostos pela Abramge. Outras empresas, entretanto, vo praticar um reajuste maior na converso. Pssimo comeo de temporada para os dois melhores tenistas do mundo. Pete Sampras e Michael Stich perderam no Torneio de Doha, no Qatar, para jogadores mais acostumados ao calor forte o marroquino Karim Alami e o haitiano Ronald Agenor. O nico favorito a vencer foi Stefan Edberg, que arrasou o tambm sueco Anders Jarryd por 6/1 e 6/1. Sampras esteve irreconhecvel. Alami, de 20 anos e nmero 204 do ranking, teve a maior vitria de sua carreira. Tinha vo marcado para esta noite (ontem), porque no imaginava ser capaz de vencer Sampras, disse Alami. SO PAULO -- Se o leitor se deu ao trabalho de ler todas as repercusses sobre o real publicadas ontem pela Folha, ter verificado que, como sempre ocorre nessas ocasies, h opinies contrrias, a favor e mais ou menos. Se separar as opinies conforme a caracterstica do entrevistado, ter verificado igualmente que todos os empresrios ouvidos so a favor. Seria bom grifar a o todos. Os 160 expositores da mostra da indstria de cosmticos estimam vendas de US$ 150 milhes at a prxima segunda-feira. So esperados cerca de 60 mil visitantes. A Cosmtica'94 acontece no Parque Anhembi (zona norte de SP) e aberta ao pblico. A Secretaria das Finanas da Prefeitura de So Paulo informou ontem que a partir de segunda-feira, dia 8, a UFM (Unidade Fiscal do Municpio) valer R$ 28,15. O reajuste de 6,08 %, equivalente ao IPC-r de julho. Os contribuintes que pagarem seus tributos (IPTU, ISS etc.) hoje sairo ganhando, pois o clculo ser feito pela UFM de R$ 26,54. Ser lanado no prximo dia 9, no auditrio da Folha, a partir das 19h30, o livro Tributao no Brasil e o Imposto nico, organizado pelo economista Marcos Cintra. Autor da proposta do Imposto nico, que pretende reduzir a enorme gama de tributos a um, Marcos Cintra far uma palestra sobre o assunto. Ainda em setembro, no final do ms, encerram-se as inscries nos colgios Santa Cruz (para a 1 srie do 1 grau) e Santo Amrico. As escolas limitam os vestibulinhos a algumas sries que tm maior procura por vagas. MAIORIA DOS PROGRAMAS EST LIGADA REDE DE ESGOTOS*. A modelo Monique Evans, um dos destaques do Carnaval carioca deste ano, vai submeter-se a uma tomografia computadorizada na prxima semana para verificar a origem de um pontinho que apareceu em seu rim. No acho que seja cncer, pode ser algo relacionado aos meus medos e angstias, afirmou. Monique, 37, disse que descobriu a marquinha, que no pedra no rim quando se separou do marido, em junho passado. Senti dores e, como tenho uns casos de cncer na famlia, decidi fazer o exame, contou a modelo. Antonio Delfim Netto, deputado federal pelo PPR-SP, ontem no Jornal do Brasil. Itamar e as autoridades que o acompanhavam chegaram ao cemitrio 1h50 de ontem. Ele permaneceu no velrio por aproximadamente uma hora e seguiu para descansar no hotel Glria at o horrio do enterro. Segundo amigos do presidente, ele repetiu vrias vezes que Ariosto era o filho homem que nunca teve. Ele voltou a ter crises de choro. 
Na dianteira, o freio é a disco ventilado de 320 mm, com pistão duplo. Na traseira também há disco ventilado de 230 mm. Ambos têm acionamento hidráulico. A suspensão frontal é por garfo telescópico invertido, marca Marzochi, com curso de 200 mm. A suspensão traseira é progressiva com um só amortecedor Boge. O TJM (Tribunal de Justiça Militar) vai começar a ouvir neste mês as testemunhas de acusação do processo contra os 120 Policiais Militares denunciados pelo massacre de 111 presos no pavilhão 9 da Casa de Detenção, no Carandiru (zona norte de São Paulo). O massacre ocorreu quando a tropa de choque da PM invadiu o pavilhão para acabar com uma rebelião, em 2 de outubro de 92. Mal, pelo menos por enquanto, na eleição presidencial, o PMDB mostra na corrida dos senadores que ainda tem cacife. São peemedebistas os líderes no Rio Grande do Sul (José Fogaça), Paraná (Roberto Requião) e Ceará (Mauro Benevides). No espectro político, Fogaça e Requião são rotulados como membros da centro-esquerda. O Conselho Municipal de Defesa dos Direitos das Mulheres e das Minorias promove hoje em Natal, a partir das 14h, no calçadão da rua João Pessoa (centro), um ato público. A Coordenadoria Especial da Mulher de Londrina (PR) promove debate, às 20h30, no Catuaí Shopping Center, com a sexóloga Marta Suplicy. Em Curitiba, as comemorações vão incluir envio de fax, plantio de árvores e passeatas. Em Salvador, haverá hoje a inauguração na estação da Lapa (centro) de um balcão de informações sobre as questões da mulher. Em plantão especial, funcionários da Delegacia Regional do Trabalho e do Instituto Pedro Melo expedirão carteiras de trabalho e de identidade para mulheres. A excursão está sendo organizada pelo clube More Fun, que promove a troca de correspondências entre gays que não querem expor publicamente suas preferências sexuais. Até sexta-feira, 38 pessoas já haviam reservado suas passagens. Queremos lotar dois ônibus com as 80 pessoas e fechar o hotel, disse o administrador de empresas E., 31, um dos organizadores. Curiosamente, quanto mais os novos programas e redes facilitam a vida dos informatizados, maior a tendência deles de ficarem grudados no computador. Hoje em dia, qualquer americano medianamente equipado pode fazer diante do teclado tarefas que antes exigiam que ele tirasse o bumbum da cadeira: mandar e receber fax e mensagens eletrônicas, reservar passagens aéreas, controlar a conta bancária, acessar bancos de dados e dezenas de outros serviços. Para marcar a doação da coleção de Takeo Hirata para a biblioteca do Masp, dois designers japoneses fizeram ontem uma palestra na Fiesp. Takuo Hirano e Tetsuyuki Hirano, pai e filho, vieram falar sobre o novo conceito de design que estão desenvolvendo. Através da sua empresa, a Hirano Design International Inc., que tem sede no Japão e filial em Chicago, eles promovem o casamento entre o design e a administração de empresas. Eu estou apaixonado, declarou o presidente Itamar Franco, 63, no hall do Teatro Nacional de Brasília, onde assistiu anteontem o balé Quebra Nozes com a pedagoga June Drummond, 31. June não respondeu se também estava apaixonada. Apenas sorriu. Itamar disse que o assédio da imprensa continua atrapalhando muito seu namoro com June. Com cinco livros publicados, e um sexto em preparação, o escritor aguarda apenas, para a definitiva consagração, que Hollywood consiga adaptar alguma de suas histórias malucas para as telas. Strip-tease, que a Companhia das Letras lança hoje no Brasil, é o último romance de Hiaasen. A mesma editora publicou, em 93, o livro de estréia do escritor, Caça aos Turistas.
O ciclista espanhol, 48, se suicidou em Caupenne d'Armagnac, no sul da Frana com um tiro. Em 1973, Ocaa venceu a Volta da Frana, a maior competio do gnero no mundo. Piotr Wator, 21, do Gajowianka, da 3 diviso, morreu domingo durante partida, em Cracovia. Wator se chocou com um companheiro de equipe. Cerca de 80 % da rea do municpio considerada de preservao ambiental. O aumento na arrecadao possibilitou que, depois de 30 anos, a prefeitura pudesse comprar trs caminhes, um nibus e um trator, disse o prefeito Fausto Camper, 43 (PMDB). E a Regina Cas me disse que churrasco da Brahma, vulgo churrasquinho de Los Gatos, assim: voc enfia o dedo na picanha e fica rodando. Rarar. Dedo na brasa! E sabe quem eu ainda no vi? O Chato Mesquita do Pernil. O Jabaury Jr. a gente encontra toda hora. Esse t trabalhando. T cobrindo at vaca no pasto! Rarar! No caso do carto Unidas, o possuidor tem desconto de at 5 % na locao, possui tratamento preferencial e melhoria na categoria de carro. O carto TVA vai permitir a seus portadores, a partir deste ms, a participao nas primeiras exibies realizadas em espaos culturais do Banco Nacional. A American Express estuda tambm lanar cartes de afinidade. Os cartes Sollo j realizaram contratos, entre outros, com o Jockey Club de So Paulo, a Fundao Getlio Vargas, a Associao Paulista de Propaganda, a Birello e a Jog. Moradora da Mooca, nos ltimos cinco anos Regina virou o ano com Hilton, o namorado, no Recanto da Cachoeira, uma estncia no municpio de Socorro (SP). Brigada com o namorado, Regina optou por estrear na So Silvestre. Foi timo, voc fez um minuto melhor do que o esperado, disse a Medeiros ao cruzar a linha de chegada. Ganhou um abrao, um beijo no rosto, e foi para a Mooca estourar uma champanhe meia-noite. O sindicalista Luiz Antnio de Medeiros correu a So Silvestre em busca de resultados. E chegou l. Com uma ponte de safena e uma mamria implantadas h um ano e meio, fez o trajeto em 1h41 58 minutos atrs do vencedor. Na chegada do ano em que disputar uma vaga no Senado pelo PP-SP, deu mais de duas dezenas de entrevistas, posou para fotos, e deixou 1993 nas emissoras de rdio e TV. As chibatadas rompem a pele e as cicatrizes ficam por toda a vida. A dor faz os golpeados desmaiarem. Ao ouvir a negativa a seu recurso, Fay no demonstrou reao. Em Cingapura, a pena de chibatadas obrigatria para tentativa de homicdio, roubo, estupro, trfico de drogas e vandalismo. Na discusso com a bancada, FHC deve tratar da reviso constitucional e da delegao que recebeu do partido para negociar as alianas eleitorais. A reunio da Executiva do PMDB deve ser palco de mais um embate entre os adeptos da candidatura do ex-governador Orestes Qurcia e os antiquercistas. No PPR, a discusso ser sobre a poltica de alianas e se ela inclui o PSDB, alm dos novos candidatos do partido a presidente. O mais cotado o senador Esperidio Amin (SC). Apesar da entrada da frente fria no Estado, so poucas as chances de chuvas no interior, regio mais afetada pela falta de chuvas. Mas o abrandamento da inverso trmica (fenmeno causado por massas de ar quente que impedem a disperso dos poluentes) e a volta dos ventos devero contribuir para melhora da qualidade do ar. FHC ressalvou que no conhecia a proposta com detalhes e que o governo est aberto a negociaes. Entretanto, disse no saber que perdas so essas, e que at agora todo mundo est ganhando com a URV. Pelo que a Folha apurou, a equipe econmica no est disposta a aceitar modificaes nos dispositivos sobre salrios da medida provisria que instituiu a URV. 
No podemos aceitar medidas que contrariem o combate inflao, disse FHC, ao responder sobre o assunto. Falando aos membros da Comisso de Assuntos Econmicos do Senado, o ministro disse estar feliz por no haver, at agora, sentenas judiciais contrrias s medidas do plano econmico. Ele disse que a Justia do Trabalho em So Paulo havia decidido que no havia perdas salariais para os metalrgicos do Estado. Ao contrrio do que tem acontecido em suas ltimas declaraes pblicas, FHC evitou ontem fazer ataques aos chamados aumentos abusivos de preos por parte dos oligoplios grandes empresas que dominam determinados segmentos da economia. O ministro limitou-se a dizer que havia conversado por telefone com assessores do presidente Itamar Franco sobre a proposta de uma nova legislao contra abusos do poder econmico. Esse prazo, no Brasil, muito flexvel. O Opala durou 23 anos, o Chevette, 20, a Kombi segue firme desde 1957. As redes de revendedores das duas marcas festejam a dissoluo. Os distribuidores Ford esto exultantes. Foram recebidos na semana passada em Detroit por Alex Trotman, principal executivo da Ford norte-americana. Assim como a China, aceleramos a reforma no campo, comeamos por a. Todos sabem que as reformas na agricultura foram a locomotiva que levou a China a seu estado atual. Por isso, j em 1991, comeamos a introduzir a propriedade privada da terra, desmontando o sistema socialista. Hoje ns temos o maior setor agrcola privado entre os pases da Comunidade de Estados Independentes (aliana que substituiu a URSS). Por enquanto, s esto definidos investimentos da ordem de US$ 600 milhes nos prximos trs anos na Argentina e mais US$ 500 milhes no Mxico, anunciou Douglas Ivester. Nos ltimos cinco anos, s na Argentina, a Coca-Cola investiu US$ 800 milhes na aquisio de novos equipamentos e desenvolvimento de novas tecnologias, lembrou Ivester aps encontro com o presidente argentino Carlos Menem. Uma das consequncias disso outra aberrao na sade brasileira: em 1991, os hospitais psiquitricos cadastrados rede federal consumiam 7,5% das despesas com internaes realizadas pelo SUS e representavam o equivalente a 18% dos leitos. Esses nmeros vo de encontro com a tendncia mundial, segundo a qual o paciente deve ficar o menor tempo possvel internado, e os esforos devem ser para reintegr-lo na sociedade. Apesar de seu discurso duro contra eventuais aumentos de preos, Ciro admitiu, pela primeira vez, rever a estratgia de reduo das alquotas de importao. Ele disse aos empresrios -- que mantiveram suas crticas poltica de abertura comercial indiscriminada -- que o governo pode aumentar de novo as alquotas de importao em todos os setores. Os juros esto em alta. O rendimento projetado das cadernetas saiu de um patamar de 42% nos depsitos na semana passada para 46% nas aplicaes nos prximos dias. Francisco Lafayette, administrador do Banco Banorte, diz as cadernetas esto com rendimento atraente, mas que os fundos de commodities do a vantagem adicional do resgate dirio com rentabilidade a partir da carncia de 30 dias. Cada modelo de fac-smile, com suas especificaes tcnicas peculiares, recebe uma patente do tipo MI (modelo industrial). No um PI porque s funciona com algo que j existia (o telefone). O registro do tipo DI (desenho industrial) diz respeito configurao e cor do produto. O Coprotest uma verso revista e melhorada das anti-higinicas e pouco seguras latinhas para coleta de fezes para exames parasitolgicos. 
O produto no tem similares em todo o mundo e est sendo muito bem aceito nos Estados Unidos, afirma Adolfo Moruzzi, 49, diretor da NL. A praa deveria ser reconstruda aps a construo do Piscino. A praa, por ser tombada, deveria ficar como era antes da obra. Segundo o sindicato dos arquitetos, o projeto da prefeitura modifica o desenho original. Entre as modificaes, est a construo de um anfiteatro ao ar livre. O sindicato fundamenta o pedido de embargo da obra com a omisso da prefeitura, que no submeteu o projeto de reurbanizao da praa ao Condephaat. A assessoria de imprensa da Secretaria de Vias Pblicas disse ontem que a prefeitura s vai se manifestar depois que for notificada pela Justia. Jason o maior assassino de todos, com 126 vtimas. Myers vem a seguir, com 46. Freddy est na rabeira, com 37. Coincidentemente, Jason tambm levou o maior nmero de balas (132), contra 27 de Michael Myers e apenas seis de Freddy Krueger. Os avais so respeitveis, mas no bastam. Tem que se demonstrar atravs de contas e de raciocnios que o expurgo significar perda. Como no se pretende o monoplio da verdade, apresento o raciocnio (e as contas) que fundamentam a tese de que no haver perdas. Quem pensa o contrrio, que trate de comprovar que o raciocnio est furado. Ao desembarcar no Canad, troque a moeda norte-americana por dlares canadenses. No esquea dos culos escuros e uma capa de chuva. Existem outras formas de burlar o reajuste anual. Alguns proprietrios fazem um contrato para o apartamento e um acordo, muitas vezes verbal, para a locao da garagem e/ou do telefone. Nesse caso, embora o valor do aluguel do imvel fique fixo por 12 meses, os da garagem e do telefone podem ser reajustados at mesmo mensalmente. H ainda aqueles que optam pela locao de temporada. Ou seja, alugam o imvel por um prazo mximo de 90 dias e ainda podem receber o valor adiantado. A equipe do ministro da Fazenda, Rubens Ricupero, avalia que o processo de crescimento do volume de dinheiro em circulao, decorrente da queda da inflao, est praticamente encerrado. Segundo o diretor de Assuntos Internacionais do Banco Central, Gustavo Franco, as emisses de reais cresceram bastante na primeira semana de julho, mas esto estveis desde ento. Blatter disse que o Comit no levou em considerao o fato do jogador ter se desculpado e comparecido ao hospital para visitar o norte-americano Tab Ramos. Tambm no nos interessa se o jogador estava internado ou no. O desenlace, porm, no veio na forma do golpe de timo com reforma radical, mas foi precipitado pelas duas balas que tiraram a vida de Colosio. Mas o resultado, surpreendentemente, foi o mesmo no que se refere reconstituio do princpio da autoridade. Uma hora aps a morte de Colosio -- na realidade minutos depois do pblico receber a trgica notcia, Octavio Paz pedia um basta aos excessos verbais e ideolgicos de alguns intelectuais e jornalistas e s numerosas e irresponsveis apologias da violncia. Cantora rejeita som 'enlatado'. A cantora e violinista Meca Vargas no faz coro com os contentes com o computador. Para ela, a mquina produz msica enlatada, desprovida de fora viva. Os paulistanos esto pagando menos tambm pelos produtos da cesta bsica. Outra pesquisa do Datafolha mostra que estes alimentos recuaram 0,47% na ltima semana de novembro. Mesma tendncia teve o custo da cesta bsica pesquisado pelo Procon, em convnio com o Dieese. Este levantamento de preos que inclui tambm produtos de higiene e de limpeza mostra recuo de 1,58% em relao sexta-feira anterior. Estamos na transio. Desculpe-se. 
Agosto ser outro mundo. Mais perto da eleio. O governo no est errado em formular regras que possibilitem o alongamento. No Brasil, coisa esquisita, j se negociou CDB de seis meses uma eternidade, hoje em dia. A droga salmeterol mais eficaz para tratar asma a longo prazo do que a comumente usada, segundo estudo na ltima edio da revista da Associao Mdica Norte-americana (Jama). A droga pode ser inalada atravs de bombinhas de aerossol. Ela teve ao prolongada como dilatadora dos brnquios (tubo por onde o ar passa aos pulmes). O alemo no teve muita chance de lutar pela pole. Aps fazer uma nica volta rpida, Schumacher emprestou seu carro para o companheiro Jos Verstappen, que havia rodado nos treinos da manh. O piloto holands, em sua primeira volta, perdeu o controle do Benetton e foi parar na brita. Final de treino para ele e Schumacher. Em relao aos remdios, houve um aumento abusivo, perto dos 60% em janeiro. Dallari, aps negociaes com os laboratrios, candidamente anunciou que para janeiro, no se pode fazer mais nada, mas em fevereiro os reajustes sero iguais inflao. Sequer lhe ocorreu que em fevereiro o excesso de aumento poderia ser compensado. Distrao, claro. Mas, em fevereiro, os laboratrios ultrapassaram a inflao outra vez. Em maro, esboa-se a mesma tendncia. A Fenasoft deixou de ser um reduto de especialistas em informtica para atrair tambm quem pretendia comprar o primeiro computador. Estou comprando o micro -- o porttil Aero, da Compaq -- para acelerar as pesquisas da minha tese de mestrado, disse Deborah Caldas. Enquanto isso (ou entrementes, como nas antigas histrias em quadrinho), o ex-governador ACM descola suntuoso emprstimo para socorrer os cacaueiros da Bahia, quatro anos de carncia, juros de 2% subsidiados, um emprstimo de pai (governo) para filho (fazendeiros). No fundo, uma indecente doao do dinheiro pblico. Enquanto isso (outra vez a vontade de escrever entrementes), a Receita Federal divulga a lista dos devedores do Imposto de Renda que nada pagaro. Mais da metade do rombo na Previdncia tem como causa a sonegao de empresas. Entrementes (resistir trs vezes quem h de? ), o governo de FHC ameaa reduzir as aposentadorias para cinco mnimos. O Depav tambm est zoneando o parque. O trabalho, coordenado pela diretora Vera Bononi, 49, prope solues para o aproveitamento do espao construdo, em especial a marquise. A Asuapi doou 10 bicicletas para a guarda municipal fazer o policiamento do parque. Para entrar em ao, a tropa de ciclistas est esperando apenas que acontea o cerimonial de inaugurao, diz Santini. A guarda usar uniformes especiais. Segundo Santini, com as bicicletas, o trabalho ser mais rpido e eficiente. Ele afirma que no tem previso para o incio do trabalho dos ciclistas. Na disputa pelo mercado de impressoras quem sai ganhando o consumidor. Vale a pena conferir os produtos e comparar a qualidade de impresso dos modelos jato de tinta. A Epson mostra na Comdex quatro modelos com tecnologia jato de tinta, que devero substituir os modelos matriciais no dia-a-dia dos usurios e empresas. O Departamento de Estado dos Estados Unidos ordenou ontem reforos na segurana das representaes diplomticas de Israel no pas para prevenir eventuais ataques terroristas. Em Nova York, o FBI (Federal Bureau Investigation, a polcia federal dos Estados Unidos) recebeu uma ligao annima no final da noite de anteontem informando que o Consulado de Israel sofreria um atentado a bomba. 
O secretrio disse que aguardava apenas uma definio de Paulo Maluf quanto a seu futuro poltico para anunciar medidas de base. As denncias que vm sendo feitas procedem, disse Raia. J os quatro leiles realizados durante a Expomilk venderam 124 cabeas de raas holandes, jersei, pardo-suo e girolanda. O faturamento bruto atingiu R$ 615,7 mil, equivalentes a US$ 721,8 mil. Uma vaca pardo-suo conseguiu a maior cotao da Expomilk, R$ 36 mil (US$ 42,2 mil). Foi apresentada pela Agropecuria Amrica e arrematada pela Citrovita Agropecuria. Folha -- O sr. j fez algum discurso contra a monarquia no Parlamento? Skinner -- No permitido falar muito contra a monarquia no Parlamento porque supostamente voc deve jurar fidelidade rainha. A carga tributria das instituies financeiras poder aumentar em 73% caso o Congresso Nacional aprove o aumento da contribuio social sobre o lucro dos bancos e a incluso da obrigatoriedade de recolhimento do PIS (Programa de Integrao Social). Mesmo assim os bancos continuam entre os setores da economia que recolhem menos tributos. O ganho de arrecadao com o aumento da carga de impostos dos bancos vai ajudar a fechar em US$ 16,1 bilhes o FSE (Fundo Social de Emergncia). O governo teve que elevar de 15% para 20% o percentual dos recursos dos tributos que vo fazer parte do FSE para compensar a perda de recursos do Fundo de Participao dos Estados e municpios. Essa deciso dever dar mais de US$ 12 bilhes ao FSE, s com tributos que pertencem Unio. O pas vive, neste momento, a parte mais crucial da grande luta pela implantao da cidadania. quando o rastreamento das pistas da contraveno comea a bater em personalidades acima de qualquer suspeita, dentro do pequeno crculo ntimo da elite cosmopolita brasileira um amlgama que rene intelectuais de esquerda e direita, eleitores de Lula a Maluf, passando por todo o espectro intermedirio, empresrios, profissionais liberais, jornalistas e membros do judicirio. A corrupo institucional brasileira vai muito alm de ratos gordos como Joo Alves. Caio Gorentzvaig, cotista da Petroplastic, disse que o arresto cair automaticamente quando seu pai, Boris, se der por citado e oferecer bens imveis penhora. Prev que isso acontea em dez dias, quando seu pai voltar a So Paulo. O tucano tem passado a maior parte do tempo em seu quarto e, ao contrrio dos 60 hspedes, no tem feito exerccios. Mesmo assim, o governador eleito tem seguido a dieta hipocalrica da clnica, onde a ingesto diria de 350 calorias. boa, mas pouca. Apesar de ter dito h dias que entre o justo e legtimo e o legal, opta pelo justo e legtimo, Lula disse que a reforma agrria, em sua eventual governo, ser feito dentro da lei. Lula fica na frica do Sul at quarta-feira noite, quando viaja ento para a Alemanha, onde permanece at sexta-feira noite. Disaster Movie, de 1979, foi uma revelao: apesar da produo apertada, esse curta-metragem revelava um talento mais que incontestvel. um pouco em funo disso que Anjos da Noite decepciona. A incurso de Wilson Barros ao universo dos seres noturnos paulistanos revela um trao tipicamente brasileiro: uma espcie de vergonha de ser brasileiro. Talvez por isso a So Paulo de Anjos da Noite parea com qualquer lugar do mundo. O talento de Barros continua l, apesar de Marilia Pra se empenhar em tomar conta do filme com um histrionismo fora de lugar. Mas de alguma forma representa a tendncia formalista que marcou o cinema brasileiro na segunda metade dos anos 80. 
Alm de Serpa e Teixeira, j estavam sob investigao judicial eleitoral os candidatos Mrcia Cibilis Viana (federal, PDT), Paulo de Almeida (federal, PSD) e Aluizio de Castro (estadual, PPR). Ontem, o plenrio do TRE do Rio determinou a recontagem dos votos da 81 seo da 77 zona. O tribunal anulou 108 votos de Nilton Cerqueira (federal, PP) e 93 de Emir Larangeira (estadual, PFL) na 113 zona (Niteri). Os votos foram preenchidos com a mesma letra. Tambm foram anuladas duas urnas em Barra Mansa. A Reebock teve resultado melhor do que o esperado no segundo trimestre. O lucro cresceu 24% para US$ 51 milhes e as vendas, 18%, para US$ 776 milhes. A Nike tambm teve bons resultados, pondo fim a especulaes de que teria passado o auge das famosas fabricantes de tnis dos EUA. O hotel Deville, perto do aeroporto de Guarulhos, diz que, alm de executivos, tem hospedado muitos turistas, principalmente com destino ao Caribe e Disney. Afirma que, na primeira quinzena de julho, o ndice mdio de ocupao foi de 80%. Para elaborar a lista, o grupo mede o custo de vida em quase 200 lugares. So analisados os preos de 155 produtos e servios. Atrs de Buenos Aires, esto duas cidades asiticas (Hong Kong e Taip) e duas europias (Zurique e Genebra, ambas na Sua). As declaraes foram recebidas por Maradona como uma tacada de beisebol na cabea, como o prprio meia definiu. Maradona negou veementemente as crticas da me de Franco. No sei o que dizer. Estou surpreso, foi o que Franco disse a sobre as declaraes da me. A resoluo normativa de 4 de junho de 1987 do Instituto Brasileiro de Turismo (Embratur) afirma que os meios de hospedagem de turismo dos tipos hotel, hotel de lazer e hotel-residncia, de qualquer categoria, ficam obrigados a dispor de local especfico, devidamente sinalizado e apropriado ao embarque e desembarque de usurios portadores de deficincias, alm de estacionamentos para seus veculos. A lei diz ainda que 2% dos apartamentos tm que ser adaptados e devem haver facilidades para acesso e circulao em todas as dependncias sociais do hotel. As rodas de liga de alumnio ou magnsio tm manuteno menos frequente. A Secretaria da Fazenda do Estado de So Paulo determinou que, quando um contribuinte realizar venda de mercadorias em feiras, exposies ou em locais chamados de outlets ou feira de promoes e permanecer na rea determinada por mais de 60 dias, ser obrigatria a inscrio do referido local no cadastro de contribuintes do ICMS (Fund. portaria CAT 116/93). A restituio do IRPF poder ser feita a terceiros, desde que: se de valor at 80 Ufir, mediante simples autorizao por escrito do beneficirio, acompanhada de cdula de identidade e CPF do representante e do representado, para verificao das assinaturas; e se de valor acima de 80 Ufir s poder ser paga a procurador (Fund. instruo normativa DRF n. 38/92). Segundo ele, o que retardou o trabalho da equipe foram as contuses de alguns jogadores importantes, como Marcelo Negro e Giovane. J pensando no Mundial da Grcia, que ser disputado em setembro em Atenas, o tcnico brasileiro retoma o comando dos treinos da equipe a partir da prxima segunda-feira. Correo: hoje trazemos as respostas das palavras cruzadas de sbado. A resposta para as palavras cruzadas de hoje foram publicadas ontem. Segundo Rosa, nutricionista e professora da PUC de Campinas, junto com os alimentos se ingerem e digerem afetos. Isto : os diversos tipos de refeies so investidos de smbolos que alteram o valor que as pessoas atribuem ao que comem. 
Para chegar a suas concluses, Rosa entrevistou um grupo de funcionrios pblicos (tcnicos administrativos) que trabalham e comem no centro velho da cidade de So Paulo (ruas Lbero Badar, So Bento e adjacncias). Observou tambm os lugares onde essas pessoas almoam. VITRIA Josias de Souza ensinou sexta-feira, nesta Folha, que pesquisa no urna. Eu tambm acho, mas preciso avisar o pessoal do PT. Nunca vi um partido render-se to rpida e incondicionalmente como o PT est fazendo. notvel o baixo astral da militncia, de quadros dirigentes e de simpatizantes em geral. De acordo com o amassado, sem que seja preciso interferir na pintura. Agora, as negociaes so internacionais: Amlia vai conversar com a Krupp alem e a Japan Steel. Cada obra consome 1,5 tonelada de ao (US$ 6 mil) mais US$ 10 mil de mo-de-obra. Multiplique isso por 15, some o valor do transporte e se tem US$ 400 mil. Sete Ondas no est partindo do marco zero. Amlia j tem o OK inicial de trs pases: Alemanha, Frana e EUA. Na Alemanha, a obra ficaria na Casa da Cultura dos Povos em Berlim. O Centro Internacional de Arte e Escultura, de Vaison-la-Romaine, sul da Frana, tambm tem interesse pela obra. Nos EUA, o Museu da OEA (Organizao dos Estados Americanos) j tem agendada uma exposio de Amlia, mas ela no sabe se instala l suas Sete Ondas. A negociao com patrocinadores o processo mais complexo, segundo a artista. As esculturas, ela diz que faz em seis meses. Nos perodos de choques, 41% dos contratos do setor de confeces precisaram ser renegociados entre compradores e vendedores. Nas pocas de normalidade da economia inflacionria brasileira, esse percentual ainda chega a 7,5%, contra 2,3% do Chile, pas que j alcanou sua estabilizao. Duas pessoas morreram em um acidente ocorrido na noite de ontem no km 2 da rodovia Joo do Amaral Gurgel, que liga Caapava a Jambeiro, no interior de So Paulo. Um Gol e um Voyage se chocaram. Eles viajavam em sentido contrrio. O motorista do Gol e uma passageira do Voyage morreram na hora. A polcia ainda no sabe qual foi a causa do acidente. O diretor de Assuntos Internacionais do Banco Central, Gustavo Franco, disse que o BC vai comprar dlares por uma cotao bem abaixo de R$ 1,00, preo pelo qual vende a moeda norte-americana. Ontem, no mercado privado, sem interferncia do Banco Central, o dlar foi negociado a R$ 0,90. Segundo Franco, o BC vai comprar dlares quando a cotao cair a um ponto determinado, no conhecido pelo mercado. O povo, assim, mais destinatrio de um discurso poltico, que ele mesmo no profere, quando muito escuta, que seu emissor ou senhor. Por isso, estando o demos na recepo e no na produo do discurso poltico, melhor dizer regime demtico que democrtico. O kratos no do povo; seu, se tanto, o ouvido, o olhar. Mas, nesta sociedade, cabe papel relevante ao conforto. Os discursos da frugalidade, da austeridade ficaram para as repblicas gregas e romanas, aquelas que Montesquieu ainda celebrava, em meados do sculo 18, quando dizia que o regime republicano se caracteriza pela disposio de seus cidados a renunciar ao bem privado em favor do bem comum e por isso, acrescentava ele, trata-se de regime impossvel na modernidade, na qual se busca a vantagem pessoal. Segundo o jogador, o Palmeiras s ter dificuldades para garantir o ttulo na partida contra o Corinthians pois deve enfrentar antes adversrios teoricamente fceis (Mogi Mirim, Ituano e Santo Andr). Em o ltimo jogo, ns teremos um prazer, ou de sermos campees ou de dar o ttulo ao So Paulo, garantiu Moacir. 
Nosso objetivo no campeonato, portanto, ser definido depois do domingo, quando enfrentamos o So Paulo, disse o volante. A entrada de capitais, para ser esterilizada monetariamente, teve de ser absorvida pelo crescimento da dvida pblica, o que por sua vez requereu um supervit fiscal primrio crescente. h mais de um ano, o Mxico encontra-se numa situao de desequilbrio potencial permanente no balano de transaes correntes e no oramento fiscal, que foi compensado mais recentemente por um endividamento crescente do setor pblico nas praas financeiras internacionais, em particular na Bolsa de Nova York. A quinta rodada da Superliga Nacional Masculina de Vlei da temporada 94/95 prossegue hoje com mais quatro jogos. O Palmeiras/Parmalat que est em terceiro lugar na classificao geral, tenta defender sua invencibilidade no torneio diante do Flamengo, no Rio. Na tese, despreza o papel do povo num governo. Mas, nos anos 90, os yuppies desenvolvem conscincia. Joe Pesci vai ser a conscincia em carne-e-osso. O ator vive um sem-teto, que o acaso coloca diante do estudante. A TV por assinatura no Brasil est nascendo concentrada nas mos de quatro grupos: Organizaes Globo, Multicanal (subsidiria da Companhia de Minerao do Amap), Rede RBS e Abril. E abre s para almoo e drinks ao cair da tarde. Apesar de ningum l ser mais amigo do rei, a turma continua na ativa. Os policiais foram a uma casa, indicada pelas duas mulheres, onde estariam outros integrantes da quadrilha, mas nada foi encontrado ali. O Parque So Carlos habitado por moradores de renda baixa. A casa em que Faria ficou era de dois quartos. O quarto em que ficou era pequeno e sua porta permaneceu sempre aberta. O conjunto fica prximo estrada de Madureira, uma das principais vias de Nova Iguau. Segundo a polcia, a estrada procurada por grupos de extermnio para desova de cadveres. Esta a simplicidade que mata, quando mal traduzida em simploriedade. Mas Michael Tilson Thomas, regendo tudo com uma segurana natural e nenhuma pompa, parece o regente por excelncia desse nosso tempo, que ao menos na msica um tempo de refinamento e inteligncia, alm de comunicao. Simptico, fluente (com as palavras tambm), vontade no papel de regente jovem (embora beirando os 50 anos), no h um nome melhor para dirigir a New World Symphony, uma orquestra de bolsistas. Nem simples, nem muito menos beethoveniano o Concerto para Violino de Tchaikovski. uma obra infelizmente popular. Est longe de ser o melhor de Tchaikovski, mas o diabolismo da parte solista no deixa nunca de impressionar. Diabolismo no falta ao solista Robert McDuffie: um violinista de mil dedos, afinadssimo e sem medo da msica. Mas toca tambm com certa brutalidade, impacincia ou agressividade com a msica. um msico atltico, no limite extremo da escola americana de interpretao. Os juros despencaram ontem com a expectativa otimista para a inflao de dezembro. As taxas de juros equivalentes dos CDBs diminuram 2,31 pontos percentuais em relao mdia da ltima segunda-feira. Em 1920, a sujeio do sulto Mohamed 6 ao Tratado de Svres desagradou os nacionalistas liderados por Mustaf Kemal, militar que lutou nas guerras balcnicas e na Primeira Guerra Mundial. J em Ancara que se tornaria capital Kemal presidiu a grande Assemblia Nacional. Eleito presidente e primeiro-ministro, ele cuidou pessoalmente das operaes militares durante as guerras grego-turcas (1920-22). 
Depois do tratado de paz de 1923, aboliu o califato, a poligamia, substituiu prticas legais islmicas por outras de inspirao europia, adotou o alfabeto romano e o calendrio gregoriano e concedeu s mulheres direito de voto. O assalto foi impedido pela segurana do carro blindado. Houve tiroteio. O carro-forte do Banespa estava estacionado no ptio interno quando um caminho basculante. com homens armados na traseira. bloqueou o porto. O turismo de pesca ainda atrai 75 % dos visitantes que vo ao Mato Grosso do Sul. Muitos hotis, no entanto, perceberam o crescente interesse pelo ecoturismo e diversificaram suas atraes. Promovem safris fotogrficos, focagem noturna de jacars e outros passeios. Cobram dirias a partir de US$ 50 (CR$ 22,9 mil) por pessoa, com penso completa. H opes mais econmicas, como os campings, por US$ 5 (CR$ 2.200,00) por pessoa. Aguap (067-241-2889) -- A 59 km de Aquidauana, a fazenda tem 7.000 hectares, campo de pouso e sete apartamentos. Os pacotes, com penso completa, incluindo passeios, vo de US$ 150 (CR$ 68,8 mil), dois dias, a US$ 300 (CR$ 137,7 mil), cinco dias. O pacote de pescaria, com barco, motor, piloteiro e penso completa, 20% mais caro. No camping, a diria de US$ 5 (CR$ 2.200,00). Nessa obra literalmente seminal, Miller estabeleceu o padro do que seria dali para diante a sua literatura: memrias pessoais vistas com a mincia de um estudo biolgico e narradas no tom exaltado de um profeta bbado. A matria-prima desse primeiro livro o perodo que o autor passou em Paris nos anos 20, perambulando de prostbulo em bar, de quarto emprestado a banco de estao, enquanto tentava escrever. O otimismo de Van Himst se justifica, em parte, pela entrada do centroavante belgo-croata Josip Weber no time. Ele marcou cinco gols na vitria sobre a Zmbia, no comeo do ms. Foi o mais expressivo resultado j atingido por uma seleo belga na histria. Weber deu maior profundidade equipe. Encontramos um homem-gol de que sentamos falta h muitos anos, disse Van Himst, ele prprio um dos maiores jogadores que a Blgica j teve. Carmelita Aralse Tebele, 89, moradora do oitavo andar, passou mal com a fumaa e teve de ser carregada pelas escadas at uma ambulncia. Beth Tebele, 62, filha de Carmelita, foi medicada junto com a me. O comerciante Albano Miguel Ferreira, 37, disse que estava na rea de servio de seu apartamento, no primeiro andar, e viu o fogo comear em um barraco nos fundos da loja. No caso dos microcomputadores, o aumento foi de 200 unidades para 500 unidades mensais, no mesmo perodo. H uma busca frentica por eletrnicos que facilitem o dia-a-dia do cidado em casa e no escritrio, afirma Quintana. Andrade afirmou que a limpeza feita diariamente. No largo Santa Ceclia, funcionrios da Administrao Regional recolheram material acumulado pelos indigentes e desinfetaram bueiros. Nos primeiros 20 dias deste ms foram assassinados no Rio 223 pessoas mdia de 11 por dia. O ndice o mesmo registrado em novembro do ano passado. O diretor da DRFVAT, Leonilson Ribeiro, disse que a presena de militares e policiais nas ruas causou a diminuio do nmero de carros roubados e furtados. Muitas vezes em vos quase sem escalas, algumas espcies voam mais de 10 mil km, do rtico at a lagoa. o caso de um maarico-de-papo-vermelho, anilhado na lagoa do Peixe e observado 13 dias depois no Estado de Nova Jersey (nordeste dos EUA). 
Apesar de ser a primeira rea no Brasil a ser includa, em 1991, rede hemisfrica de reservas para aves migratrias na categoria de reserva internacional, o Parque Nacional da Lagoa do Peixe, criado em 1986, no existe de fato, pois ainda no h um plano de manejo e de desenvolvimento. O aumento da cesta bsica na cidade de So Paulo na terceira semana de junho foi de 10,27% em cruzeiros reais, segundo pesquisa do Procon em convnio com a Fipe. No acumulado do ms atinge 43,85% , superando as variaes de maro (40,91%), abril (40,23%) e maio (40,23%). Em 94 acumula alta de 701,5%. Entre as maiores variaes da semana esto a sardinha em lata (30,52%) e papel higinico (28,18%). J as maiores quedas, dos preos mdios, ficaram com a cenoura (- 13,82), batata comum (5,14%) e po francs (- 1,25%). O grupo alimentao, porm, vem tendo as maiores altas do ms. J o Regency St. John Resorts ter amplida sua rea esportiva e aprimorado seu servio de hspedes. O Hyatt Regency Aruba tambm ter melhorias em seu cassino e a instalao de duas piscinas de hidromassagem. Essas dificuldades, que surgiram desde o lanamento da idia, estimularam economistas e governo a buscar caminhos mais rpidos. o caso da segunda opo, uma verso mais apressada da primeira hiptese. O governo cria a URV e estabelece um prazo de trs a quatro meses para que a sociedade negocie converses voluntrias. E anuncia, no momento mesmo da criao do indexador, que a partir de determinada data a converso ser obrigatria, conforme regras predeterminadas. Dando um tempo para os acertos, FHC cumpre a promessa de que ser tudo voluntrio. Mas na segunda fase, a converso obrigatria exige regras de converso, tablita e possivelmente um congelamento para impedir especulao na nova moeda. E isso um choque com data marcada. Mas uma vantagem deixar todos os passos definidos desde o momento de lanamento do plano. O Banco Mercantil de So Paulo realizou lucro lquido de CR$ 52,3 bilhes no primeiro semestre. O que corresponde a 3,65% sobre o patrimnio lquido. O lucro lquido do Banco Bandeirantes no primeiro semestre foi de CR$ 44 bilhes, com um retorno de 12,6% sobre o patrimnio lquido. Teve direito at rara presena de Maria Bethnia na platia a ltima performance carioca de Gilberto Gil e Caetano Veloso. Junto com sua Magda Collares, Fausto Silva decola dia 13 para temporada de trivial variado cultura e showbizz em Nova York. Na quarta-feira, o So Paulo tomou um vareio do Palmeiras. Se o jogo terminasse 6 a 0 no primeiro tempo no seria mais que o fiel espelho da partida. Parte porque o Palmeiras jogou com extremo empenho e raro talento. Parte porque o tricolor entrou em campo mal escalado por Tel, que tentou consertar logo aps, com a entrada de Vtor na lateral e a passagem de Cafu para o lugar de Jamelli. Por fim, pela inexpressiva atuao das duas maiores estrelas tricolores Leonardo e Cafu. Esta tarde, por fora da suspenso de Axel, j comea melhor no papel, com Vtor na lateral e Cafu no meio. Mas hora de Tel dar um toque de classe nesse meio-campo com Vlber e voltar com Palhinha mais frente, que esse jogo vale o campeonato. Em algumas cidades, onde os pais tm poder de deciso no sistema educacional, os projetos de novas escolas incorporam computadores, CD-ROM, modem e linhas telefnicas para permitir que os estudantes se comuniquem on line. Americano exagerado. Costuma encarar a tecnologia como a soluo milagrosa para todos os problemas. Computador na sala de aula no salva ningum da ignorncia. Mas pode facilitar a vida dos estudantes. Traz dicas de produo grfica, como alinhamento. 
Tem vrios apndices de software para DOS ou Windows. A venda em bancas de jornal ou pelo telefone (0800) 11-5353. Voltado para usurios leigos e avanados, os 13 captulos do dicas para aproveitar melhor os recursos do sistema operacional da Microsoft. Inclui disquete de atualizao do 6.0 para 6.2. Preo: CR$ 16,9 mil. Tome-se o caso de Ventersdorp, 200 km a oeste de Johannesburgo, uma das 14 cidades declaradas pelo governo reas de turbulncia, estgio suave do estado de emergncia. Em Ventersdorp, a nova e a velha frica do Sul so vizinhas. Na delegacia de polcia, a nova bandeira de seis cores est hasteada no mastro e o juiz David Postman acabou de concluir a contagem dos votos. Creio que essa vocao de poeta determinou um dos maiores problemas (h vrios) na realizao da obra de Murilo Mendes. H, com efeito, um poetismo em seus poemas. Murilo sempre est partindo do pressuposto de que ele poeta. Afirma-o, de maneira explcita, em sua obra. O resultado muitas vezes ingnuo, infantil. A CompUSA de Los Angeles vendia, na semana passada, a impressora jato de tinta Epson Stylus 800 por US$ 199. Na loja de Nova York, o Compaq Aero est por US$ 996,97 (se chegar s 9h da manh, ganha o cupom de desconto de US$ 100). No aceita carto de crdito internacional. A J&R Computer World, apesar de meio fora de mo, tem preos competitivos. A equipe de Maluf no a primeira a deixar de lado o problema de enchentes. A prefeita Luiza Erundina priorizou o transporte pblico. Jnio Quadros tambm gastou em obras virias e Mario Covas marcou sua gesto pela construo de corredores de nibus exclusivos, como o da avenida Nove de Julho. Ontem a cidade conferiu os estragos causados pela chuva de quinta feira, a maior chuva de maro desde 1930. Reynaldo de Barros, secretrio de Vias Pblicas, participou de uma cerimnia pela manh e depois no pde mais ser encontrado em seu gabinete. O Banco Mundial, o FMI e o Gatt so instituies pblicas que deveriam ter suas polticas submetidas a esferas pblicas de deciso e ter a transparncia necessria para serem avaliadas e monitoradas pelos cidados. Recentemente, o Banco Mundial constituiu um grupo independente de inspeo para analisar queixas sobre seus projetos, iniciou uma nova poltica de informaes e o Gatt organizou o primeiro seminrio com a participao de ONGs. A Secretaria da Sade de Goinia recebeu ontem CR$ 163,7 milhes, atravs de convnio assinado com a Fundao Nacional de Sade. Desse total, CR$ 127 milhes sero destinados campanha de combate dengue e ao pagamento dos 200 agentes de sade contratados para a campanha. Gois ter dentro de dois anos o seu Zoneamento Ecolgico-Econmico. O superintendente de Meio Ambiente, Clarismino Jnior, disse que um dos objetivos identificar as macrozonas existentes no Estado. Segundo ele, a proposta inicial zonear no primeiro ano a Bacia do rio Meia Ponte. As faixas, listras e crculos desenhados nos gramados norte-americanos da Copa no so apenas um recurso esttico. Eles servem para camuflar o improviso dos estdios da Copa. Os nove estdios do Mundial so normalmente usados para o futebol americano, que tem marcas prprias na grama. Comentei com minha amiga espanhola, Dolores de Pana, a profuso de beijos melados que massacraram minhas aveludadas bochechas na Bienal. Ela subiu nos tamancos flamencos. por essas e outras que a gente acaba pegando uma micose na chica e depois no sabe explicar por qu, bradou a espanholita. E props que eu desencadeasse nesta coluna uma campanha nacional contra saudaes lambuzadas. Ta, gostei da idia! Al, vtimas do abuso oscular do Brasil! Faam como eu! 
Em sinal de protesto, de hoje em diante, quem me cumprimentar com beijo suado leva uma muqueta no focinho. Viva Howard Hughes! O IBGE divulgou tambm ontem a inflao registrada no Rio e So Paulo entre os dias 9 de maro e 7 de abril. O IPC-amplo (cesta de consumo de famlias com renda de at quarenta salrios mnimos) registrou 42,65% em So Paulo e 41,96% no Rio. Esses ndices revelam queda de inflao de 0,23 ponto percentual em So Paulo e 0,59 ponto percentual no Rio, em comparao com o perodo anterior. O candidato do PMDB chegou a Bauru s 10h de ontem, num jatinho fretado pelo partido. Foi recebido no aeroporto por militantes, prefeitos e vereadores. Qurcia disse que tem sido vtima de uma campanha srdida com o objetivo de denegrir sua campanha. O modo de escapar disse simples: aplica-se um reajuste de, digamos, 40% ; em seguida, avalia-se qual a inflao diria, digamos de 1,5% , e a se aplica ao preo cheio um desconto decrescente. No primeiro dia aps o reajuste de 40% , o desconto de 38,5% ; no segundo de 37% e assim por diante. Assim, um carro de CR$ 10 milhes, sairia no primeiro dia aps o reajuste por CR$ 6,15 milhes. No segundo dia, por CR$ 6,3 milhes e assim sucessivamente. uma URV s avessas. A generalizao dessa indexao diria no s acostuma a economia, como passa a exercer presso para que o governo oficialize e universalize a prtica. O assunto veio tona durante conversa do advogado Osmar (Nuno Leal Maia) com o jardineiro Kennedy (Alexandre Moreno), jovem negro que sofrera insultos racistas do patro, o empresrio Raul Pelegrini (Tarcsio Meira). Uma semana depois, trs organizaes paulistas resolveram contestar a informao na Justia Federal. Os termos da reforma foram assinados quarta-feira pelo prefeito Paulo Maluf, pelo abade Dom Isidoro Oliveira Preto (mosteiro) e por representantes do Banco de Boston, Philips e Akzo/Tintas Ypiranga, empresas que patrocinam as obras. As reformas no mosteiro e a colocao de um novo calamento no largo de So Bento devem consumir US$ 200 mil. Esperava aparecer como o nico responsvel pelo plano de estabilizao. O fracasso das medidas antiinflacionrias poderia lhe custar o acesso ao Planalto. Desta forma, para se proteger de uma reviravolta inoportuna dos ndices econmicos, FHC e seus amigos do governo tiveram o cuidado de fixar um calendrio vantajoso. Foram detidas 25 pessoas no escritrio e em outros cinco endereos ligados a Castor de Andrade. As listas apontam quase todas as divises e departamentos da Polcia Civil do Rio como recebedoras de quantias mensais. Minha experincia, nos dois ltimos anos, supervisionando programas de qualidade total dentro do grupo Rhne-Poulenc tem mostrado isso com frequncia. Cabe ao empresrio, ao empreendedor, seja pequeno, mdio ou grande, criar o ambiente adequado para a valorizao de um recurso humano com essas qualificaes. Do contrrio, todo o esforo feito estar destinado ao fracasso. Jovens estudantes na Belo Horizonte dos anos 20, o futuro mdico Juscelino Kubitschek e o futuro advogado Jos Maria Alkimin costumavam sair juntos em busca de moas namoradeiras. Iam aos bailes no Automvel Clube e reparavam nas poucas donzelas que ousavam frequentar os cafs. Um dia Juscelino comunicou a Alkimin que estava namorando a moa Sara Lemos, filha do poltico Jayme Lemos. Alkimin achou que tambm era hora de firmar namoro e pediu a JK que o apresentasse s amigas da namorada. Para ele, os clubes precisam encontrar uma frmula que concilie os dois aspectos. 
Segundo Nujud, as contas do Corinthians ainda no foram fechadas, para se saber se o clube teve lucro ou prejuzo. Qual o maior vexame que j deu? Sempre os mais recentes so os piores. Ao parar de fumar, acabei perdendo a estribeira algumas vezes. Nunca tive muito interesse, apesar de j ter experimentado. Boa notcia: o Parreira j comprou sua Mitsubishi! Claro, a nica que lhe d garantia de ir at a Copa de 98! Porque no escala aquele goleiro jnior, o Pitarelli. Que defendeu os trs pnaltis do So Paulo. E os so-paulinos podem se conformar com o Zico. Que perdeu apenas 4 pnaltis em sua vida. Rarar! Pssima Notcia: o Brizola lanou sua biografia em vdeo. Campanha no ar! Pelo preo simblico de CR$ 1.000. Simblico?! Em se tratando do vdeo, pra mim uma fortuna. Simblico seria CR$ 1,00. E olhe l! A a gente tirava a fita e gravava em cima! Ento eu vou lanar o Prmio Simblico! Pago CR$ 1.000 pra quem assistir o vdeo at o fim. O Roberto Dvila no vale! Porque foi ele que fez! Acho que o nico que viu at o fim! As aparncias so as principais vtimas do sarcasmo de Altman, desde as peruas que discutem a cor de seus cabelos at a noiva, que usa aparelho, escovando os dentes alegre diante do espelho. O humor de Altman vai na direo das convenes de uma sociedade que, tendo que aceitar tudo sob as aparncias, acaba perpetuando uma vasta rede de hipocrisia. Pertencemos a um grupo de caridade. Fazemos um nmero de dana. Danamos em hospitais e manicmios, diz uma das tias do noivo, uma senhora, para as colegas peruas. H uma boa dose de realismo tambm. Os personagens de Cerimnia de Casamento parecem sados das pginas de recm-casados do The New York Times, uma das sees mais provincianas da imprensa mundial e de maior leitura num jornal que vende a imagem de um dos mais cosmopolitas do mundo. O diretor de crdito agrcola do Banco do Brasil, Said Miguel, disse ontem na Comisso de Agricultura da Cmara dos Deputados que os agricultores j renegociaram R$ 1,7 bilho das dvidas em atraso. Faltam serem renegociados mais R$ 700 milhes. Os acordos j feitos abrangem 33 mil agricultores. O financiamento pblico seria baseado em fundos sociais (FGTS, FAT), na atrao de recursos de penso e nas parcerias com a iniciativa privada. A taxao de grandes fortunas e o dinheiro de privatizao -- de estatais no-estratgicas (exclui minrios, petrleo e telecomunicaes), complementaria esses fundos. O ex-presidente Fernando Collor, sua mulher Rosane e o secretrio Lus Carlos Chaves jantaram anteontem noite no restaurante Golden Horn, em Aspen, no Colorado (EUA). Durante o jantar, pessoas que estavam em uma mesa prxima de Collor e Rosane tiraram fotos. Embora no fosse o alvo, o ex-presidente assustou-se com os flashes. Na sua avaliao, a oposio sabe que o plano bom, mas no quer aprov-lo agora, pois seria bater palma para o adversrio. Votar contra seria perigoso porque o plano tem apoio popular. Segundo o presidente da Matis-Paris Brasil, Loureno Lopes, 44, a empresa francesa foi escolhida por ter mais estrutura e nmero de produtos para clnicas de beleza, medicina esttica e cirurgia plstica. Alm dos produtos, a Matis oferece infra-estrutura a seus franqueados. A empresa possui laboratrio prprio na Frana e fabrica os equipamentos usados em suas clnicas. Os produtos so profissionais (para institutos) e para o consumidor. So Paulo foi literalmente invadida pela cozinha japonesa. E o tempo vem mostrando que o fenmeno no mero modismo. A cidade tem uma larga populao de origem japonesa e isso sustenta a proliferao e a qualidade de seus restaurantes. 
Mas se em muitas casas japonesas da cidade o sashimi e os sushis so os carros-chefes em outros se destacam por iguarias nem to comuns. o caso do Iti Fuji, restaurante fundado 12 anos atrs pelo japons Hashiro Yawata, 74 (proprietrio de um dos mais antigos restaurantes japoneses da cidade, o Kokeshi, aberto h 33 anos). Trinta empresrios argentinos e 25 brasileiros se renem na prxima sexta-feira em seminrio da FGV para debater perspectivas do setor privado nos dois pases com o Mercosul. Os empresrios brasileiros acreditam que os setores mais propcios a investimentos na Argentina so: cerveja, celulose, alimentos, petroqumica, telecomunicaes e gs. Segundo Luiz Estevo, Collor, vestido com uma cala marinho e camisa de algodo branca, sem mangas, estava confiante no trabalho dos advogados. Tambm esteve na Casa da Dinda Euncia Guimares, amiga de Rosane. Collor recebia telefonemas o tempo inteiro, dos advogados, de polticos amigos e de pessoas que prestavam solidariedade. Um amigo da famlia ligou para informar sobre o estado de sade do irmo do ex-presidente, Pedro Collor, que est sob tratamento nos Estados Unidos. Subiu ao palco com o candidato a vice, Aloizio Mercadante, e os candidatos ao governo do Estado, Jos Dirceu (PT), e ao Senado, Luiza Erundina (PT) e Joo Hermann (PPS). Lula respondeu perguntas do pblico sobre sade e cultura. Mercadante foi escalado para responder quando o tema foi aposentadoria. Apartamento na rambla Armenia, Montevidu. Valor: US$ 69 mil. 50% de apartamento na avenida Atlntica, Copacabana, zona sul do Rio. Valor da frao: US$ 100 mil. A Reuters Holdings, baseada em Londres, pretende lanar um servio financeiro por TV em meados do ano. Inicialmente restrito Europa, a empresa o tornaria mais tarde disponvel em todo o mundo. Entre os teens, os tringulos esto em alta com direito a formao de casais a trs. No filme, uma garota (Alex) colocada por engano no mesmo quarto de dois rapazes e se apaixona por Eddy. Mas Eddy ama Stuart e eles acabam se desencontrando. A Eufrasio Veculos, revendedora Ford, aposta no showroom sazonal e se prepara para comear a atuar em Campos do Jordo por dois meses. Pesquisa Update, da Cmara Americana, revela que os empresrios esperam queda na inflao com o real. O ndice mensal de expectativa de inflao teve este ms o resultado mais baixo desde que foi criado h trs anos. verdade. Sei que agora tenho um longo caminho a percorrer. Tenho que estudar muito. Este foi meu primeiro filme, mas sei que posso ser uma grande atriz. Sinto isso. Quero ter mais experincia, estudar e me dedicar 100% a isto. Quero ser uma das melhores atrizes do mundo. Este meu objetivo. E como voc se sente como estrela de cinema? Meu Deus! Tenho muito trabalho a fazer antes de me sentir assim. Mas as coisas que tm acontecido com mim so espantosas. Eu venho de um bairro muito pobre, muito pobre de Nova York. Voc no pode imaginar que quo longe eu venho. Eu me tirei das ruas e comecei a estudar. Aprendi a falar direito e tentei fazer minha vida melhor para mim. Quando se est num ambiente como o que eu vivia, voc no sabe que existe todo um mundo diferente fora dali. Tive muita sorte mesmo. Eu podia ser uma drogada hoje, podia viver na rua. Espero que possa voltar l e dizer s crianas que elas no precisam ficar presas l, que existe todo um outro mundo fora dali. Hoje as coisas esto melhores para a comunidade latina nos EUA. At em Hollywood, h pessoas como Sonia Braga e Andy Garcia. Quando eu era criana, tinha vergonha de ser latina. Alec Eu quero viver, eu quero no fazer nada. 
Ali, no torneio que encerrava a temporada 93, Steffi aguardava mais um trofu, olhando para o cho, sem ligar para as 12 mil pessoas em volta da quadra. Minutos antes, tinha pulverizado a espanhola Arantxa Sanchez. Nos discursos de agradecimento em Wimbledon e Us Open, meses antes, a alem lembrara o pblico de que no ficava feliz em vencer torneios na ausncia de Monica Seles, afastada das quadras depois de ser esfaqueada na quadra. A proposta da Anfavea uma indexao disfarada. Do assessor especial da Fazenda para preos, Jos Milton Dallari. Chegaram famintos e esfarrapados s cabeceiras do Purus aps quatro meses de viagem. Mas desvendaram o mistrio da sua ligao com outros rios, feita atravs de canais abertos pelo homem para o comrcio e o contrabando. De volta ao Rio de Janeiro, Euclides preparou alguns dos mapas que serviram de base para o tratado de fronteiras entre o Brasil e o Peru. Foi assassinado em agosto de 1909, um ms antes do acordo entre os dois pases. Escrevia poca Um Paraso Perdido, livro sobre a Amaznia, interrompido com a morte. A tarefa foi transferida a Falco. Trata-se de um candidato que estreou em comcio sob vaias e quer, pela via do preconceito, enfraquecer quem lidera as pesquisas, disse Falco sobre FHC. Ns no vamos dar corda ao debate rasteiro, acrescentou o presidente do PT, que acha improvvel o STF permitir a candidatura de Jos Sarney por um partido que no seja o PMDB. Os motoristas da EMTU (Empresa Metropolitana de Transportes Urbanos), empresa do governo do Estado, tm o mesmo piso salarial das empresas privadas. Segundo a EMTU, cerca de 215 mil pessoas utilizam diariamente o corredor do trlebus. A famlia do bicheiro Jos Scafura, o Pirunha, espera para qualquer momento a libertao de Andr, 15, sequestrado na noite de tera-feira. Neto de Pirunha, Andr foi capturado perto de sua casa, na Abolio (zona norte). A famlia diz estar reunindo os US$ 100 mil do resgate. Segundo o pai de Andr, Luis Carlos Scafura, os sequestradores provaram que ele est vivo, ao responder corretamente algumas perguntas sobre a vida do rapaz. Scafura afirmou que os sequestradores j ligaram trs vezes. Na sequncia, em 1498, Vasco da Gama chegou s ndias e inaugurou uma nova era. Um cabo, ensinam os dicionrios, uma faixa de terra que entra pelo mar o oposto de fiorde, que uma poro de mar que avana pelo continente, como acontece na Noruega e Sucia. o mesmo que capo, cabedelo ou promontrio, extenso de terra maior que a ponta e menor que a pennsula. As vendas brutas caram de US$ 3,78 bilhes para US$ 3,57 bilhes, consequncia de redues tanto no mercado interno como no externo. A receita lquida, excluda a carga tributria, foi de US$ 1,15 bilho em 93, contra US$ 1,27 bilho em 92. O mercado brasileiro de cigarros apresentou uma queda de 6,5% com o total de unidades vendidas passando de 127,8 bilhes em 92 para 119,5 bilhes no ano passado. A Souza Cruz, que lidera o mercado, teve sua participao reduzida de 83% em 92 para 79,6% em 93, consequncia de uma retrao de 10,5% nas suas vendas. Durante a apresentao do balano, Mton Cabral disse que a empresa reagiu no final do ano, voltando a deter mais de 80% do mercado, graas ao sucesso da marca Derby. A Ensec, brasileira, foi uma das cinco empresas pr-qualificadas para participar de concorrncia de US$ 11 milhes para reconstruir o sistema de segurana do estacionamento do World Trade Center, em Nova York, destrudo com a exploso de um carro-bomba. 
O Sinicesp impugnou concorrncia da Prefeitura de Santana de Parnaba argumentando que o edital para obra de infra-estrutura contraria a Lei 8.666. O hotel ter cinco alas com motivos de cada um dos cinco esportes mais populares dos Estados Unidos futebol americano, basquete, beisebol, tnis e surfe. A segunda fase do projeto prev a construo (para 1995) do All-Star Music Resort, no mesmo estilo e com mesma capacidade, que homenagear os estilos musicais do country, jazz, rock, calypso e musicais da Broadway. Entre os catlicos que se intitulam no-praticantes, 94% no tomam deciso baseado nos conselhos religiosos. mais alto do que os que se dizem ateus ou sem religio (93%). Indagados sobre se concordavam com a posio da Igreja com relao ao uso de camisinha, 74% discordaram totalmente e 13% discordaram em parte. Convencido de que Munhoz tem uma boa retrica, o publicitrio pretende deixar o candidato falar o que quiser para, antes da gravao final, decidir o que ir ao ar. Centrado no slogan Fome e desemprego . Agricultura a soluo, Munhoz vai falar durante o tempo a que tem direito sobre a necessidade de aprimoramento do setor. A Folha realizou um teste em um dos nibus que integram a nova linha Penha-Lapa, acompanhando a dcima viagem de ontem. A principal diferena verificada a sensao de frio na barriga quando se passa pelas curvas do elevado Costa e Silva (Minhoco), mesmo em uma velocidade mdia de 60 km/h. Quem ainda no escolheu sua carreira ou sonha dar uma guinada na vida profissional pode recorrer aos servios de orientao vocacional oferecidos por algumas instituies em So Paulo. Geralmente procurados por estudantes adolescentes, esses servios so abertos a qualquer interessado e tm a vantagem de ser gratuitos ou cobrar preos baixos se comparados aos de empresas de consultoria. O objetivo do projeto, segundo Miro, facilitar a investigao policial e a obteno de provas contra o crime organizado (delitos praticados por atividades de quadrilha ou bando). Uma das novidades a permisso para a polcia se infiltrar em quadrilhas para obter provas. Atualmente, esse tipo de prova no aceito na Justia. O projeto tambm permite polcia impedir, interromper e interceptar a escuta e a gravao das conversas telefnicas. Mas isso s teria validade com a aprovao de um projeto de lei complementar Constituio, regulamentando a escuta telefnica. Esse projeto tambm do deputado do PDT e j foi aprovado pela Cmara. Quem queria comprar, por exemplo, meia dzia de pes e um litro de leite, mas s tinha cruzeiros reais no bolso, teve que enfrentar fila de at duas horas. Um consumidor que tivesse apenas uma nota de CR$ 50.000,00 para fazer uma compra de valor menor tinha de solicitar um cheque de converso no valor aproximado da mercadoria e receber o troco em cruzeiros reais. As vitrinas sempre guardam surpresas, variando desde o caro refinamento da avenida Montaigne (que concentra a alta costura, no Champs Elyses) at as pechinchas nas pequenas lojas espalhadas pela cidade e nos magazines. Paris ... o encanto dos sentidos. Mesmo abstraindo-se tudo o que a cidade representa como patrimnio cultural da humanidade, ainda assim ela resulta parisdisaca, adornada de todo o sortilgio. Com essas definies, a equipe teria condies de estabelecer as regras o real. Amaral disse que as reservas internacionais sero mesmo a garantia da nova moeda. Existem algumas opes de como fazer isso, disse Srgio Amaral, chefe de gabinete do ministro Rubens Ricupero. 
O real ter que ter uma referncia em relao ao dlar, segundo Amaral, que no quis dizer se o cmbio ser fixo ou varivel. Primeiramente, gostaria de dizer que o vosso caderno D+!!! Adoraria se vocs engordassem um pouco mais o Folhateen. E eu gostaria de saber como fazer para adquirir a carteirinha de estudante. No interior, o jeito mais fcil procurar o grmio estudantil ou a direo de sua escola. Eles devem ter informaes para orientar voc. O ex-deputado Ibsen Pinheiro (PMDB-RS) tambm no quis falar sobre a absolvio do ex-presidente. Ex-presidente da Cmara, Ibsen conduziu a votao autorizando o impeachment de Collor, em 29 de setembro de 1992. O prefeito de So Paulo, Paulo Maluf (PPR), e o candidato derrotado do PMDB Presidncia, Orestes Qurcia, tambm no quiseram comentar o resultado. O meia Mazinho deve substituir o meia defensivo Mauro Silva segunda-feira contra os Estados Unidos pela segunda fase (oitavas-de-final) da Copa do Mundo. O tcnico Carlos Alberto Parreira afirmou ontem que a seleo brasileira precisar tocar mais a bola contra um adversrio que ele prev muito defensivo. O nvel de detalhamento do atlas Folha/The New York Times no tem similar no mercado brasileiro. Os mapas de pases, como os do terceiro fascculo, mostram 16 tipos de vias de comunicao (de estradas a aeroportos de trfego local e internacional), rios, lagos, cachoeiras, montanhas (acompanhas da altitude), parques nacionais, reservas de gs natural etc. Senti que eu j conhecia o Brasil antes de ter vindo aqui. Saar descobriu no Brasil o bambu verde. Sua idia fazer na 22 Bienal uma instalao com bambus no segundo andar que seja vista por cima, do terceiro, com toda sua cor e padro. A partir de hoje a rea de convivncia do Sesc Pompia abriga instalao indita do artista plstico Nuno Ramos, Montes. O trabalho faz parte do projeto Exerccios Visuais do Sesc Pompia, que visa a ocupao inusual de espaos da ex-fbrica. Os integrantes dos movimentos Pinheiros Vivo e Vila Olmpia Viva receberam com alegria a notcia. Acho que o Consema foi sensvel ao que est acontecendo. A obra vai causar um impacto to grande na regio que deve ser analisada pelo Estado, disse Horcio Galvanese, do Pinheiros Vivo. Para o arquiteto Siegbert Zanettini, do Vila Olmpia Viva, agora um rgo independente est analisando o impacto da obra sobre o meio ambiente. Segundo Zanettini, a anlise do Consema foi resultado do compromisso assumido, na sexta-feira passada, pelo governador Fleury. O ndice de Preos ao Consumidor (IPCA) de Belo Horizonte fechou a ltima quadrissemana de novembro com variao de 3,12%, contra os 2,54% registrados em igual perodo de outubro. Em comparao quadrissemana anterior, quando o IPCA ficou em 3,28%, houve reduo. O dado do Ipead (Instituto de Pesquisas Econmicas e Administrativas/MG) e mede os gastos das famlias com renda de 1 a 40 salrios mnimos em Belo Horizonte. O IPCR, para famlias com renda de 1 a 8 mnimos, ficou em 3,06%, contra os 2,85% do mesmo perodo de outubro. Teremos, ento, um meio-campo ainda mais slido na proteo zaga e mais inteligente, veloz e agressivo, na combinao com o ataque. Teremos o balano entre defesa e ataque que tanto Zagalo e Parreira defendem, mas que o time no revela em campo. Alberto Helena Jr., 52, colunista da Folha. Sua coluna na Copa diria. Fria Metal entrevista os integrantes do Ramones, banda surgida em Nova York na dcada de 70. Johnny (guitarra), Joey (vocal), CJ (baixo) e Mark (bateria) falam sobre o surgimento do grupo, a influncia de Iggy Pop, processo de criao e o revival do punk considerado ridculo e ilegtimo por CJ. 
Entre os clips que o programa exibe esto Blitzkrieg Bop, I Wanna Live e Poison Heart. Furia Metal MTV, 21h30. Entendo que a minha misso no torcer. o dilaceramento permanente desta profisso possessiva, exclusivista, desconfiada e ciumenta. Isto no impede que, no ntimo, eu deseje alguma espcie de uma complexa, delicada e difcil soberania para ns. Na hora da invaso da delegacia, s havia um motorista no planto. Os invasores quebraram portas, janelas, grades e cadeados. Os outros 13 presos no tentaram fugir. O delegado disse que tm se tornado frequentes os linchamentos na regio. Citou outros ocorridos recentemente em Carinhanha, Santa Maria da Vitria, Paratinga e Guanambi. Com esse caso em Caetit, chega a 14 o nmero de linchamentos registrados na Bahia desde o incio deste ano. O aquecimento das vendas nos ltimos dois meses fez com que o empresrio carioca Arthur Sendas desengavetasse antigo projeto de construo de shopping center, hipermercado e conjunto residencial em rea do grupo localizada na Via Dutra, no final da Linha Vermelha. O grupo Sendas entra com o terreno de 154 mil metros quadrados, e o Nacional Iguatemi Empreendimentos, dono de shoppings centers no Nordeste e scio de Tasso Jereissati no Iguatemi de So Paulo, com os recursos para bancar o empreendimento. No sbado, Goldman relatou a conversa a alguns quercistas no Palcio Bandeirantes. Ele disse que a conversa foi preliminar. Goldman defendeu a necessidade de novas negociaes. Em sua avaliao, a conversa com o deputado fluminense mostrou que ambos desejam a aproximao. O msico e humorista Juca Chaves inaugura no dia 5 de outubro, a partir das 22h, um novo teatro em So Paulo. O Jucabar-Theatro Inteligente vai funcionar na rua Major Sertrio, 661, no centro de So Paulo. A renda da noite de inaugurao ser doada ao Fundo Social de Solidariedade do Estado. 33... dias sero completados na prxima segunda-feira, dia de eleio, sem que os vereadores da Cmara de So Paulo votem algum projeto importante. Dos 55 vereadores, 24 so candidatos Assemblia Legislativa, Cmara dos Deputados ou Senado. Se todos conseguirem ser eleitos, quase metade (43%) dos vereadores paulistanos vo trocar de Casa Legislativa. Yoshiaki Nakano -- economista, ex-secretrio especial de assuntos econmicos do Ministrio da Fazenda e ex-secretrio nacional de poltica econmica (gesto Bresser Pereira). Pasta indefinida. Vladimir Rioli -- Pode ficar com a presidncia do Banespa. economista e ex-vice-presidente do Banespa (governo Fleury). Como num grandioso mea culpa, os ltimos filmes que Eastwood dirigiu Os Imperdoveis e Um Mundo Perfeito apontam justamente para uma crise do heri, de sua autoridade restauradora, de sua integridade monoltica. Em consequncia, pe a nu uma crise dos prprios gneros com que trabalha. Em Um Mundo Perfeito, o tema e seu desenvolvimento so de uma clareza exemplar. No Texas, em 1963, um menino sem pai, Philip Perry (T.J. Lowther), tomado como refm por um criminoso fugitivo, Butch Haynes (Kevin Costner). Ambos partem numa fuga para o norte (rumo ao pai de Haynes, que mora no Alasca). O programa sobre a 2 Guerra Mundial traz informaes sobre 2.100 eventos, tem 900 fotos e 140 mapas de batalhas, permitindo estudar as tticas usadas na guerra, alm de textos e vdeos de gravaes originais. Quem prefere as misses espaciais tem disposio cerca de oito horas de informaes sobre 1.600 misses, incluindo o programa Apollo e o passeio do astronauta Neil Armstrong na Lua. A organizao tambm iniciou um inqurito para averiguar o massacre de cerca de 500 mil pessoas. 
Encarregado do caso, o marfinense Rene Degni-Segui se disse favorvel a um tribunal internacional para julgar os acusados. Pessoalmente prefiro um tribunal internacional a dependermos dos tribunais nacionais (de Ruanda), que tero dificuldade em ser imparciais, disse o professor de direito da Costa do Marfim. A surpresa descrita por Pedro Collor atingiu todos os partidos que integram a chamada Nova Fora da poltica local, liderada por Lessa. At o final da tarde de ontem, nenhum dos partidos da Nova Fora havia se manifestado sobre o assunto. Lessa disse que est disposto a ouvir uma proposta formal do governador. O Congresso Internacional de Direitos Autorais enviou, na ltima sexta, uma recomendao ao ministro da Cultura, Luiz Roberto Nascimento Silva, para reativar o rgo governamental que cuidava da rea da propriedade intelectual. O CNDA (Conselho Nacional de Direito Autoral), que respondia pela fiscalizao, consulta e assistncia nas matrias relativas a direitos autorais, foi desativado pelo governo Collor. FHC e Malan no esclareceram, at o momento, quanto o governo gastou e quando foram comprados os papis. Ainda assim, s o anncio de que as garantias sero adquiridas junto ao mercado poder elevar os gastos com as futuras compras, raciocinam senadores como Espiridio Amin (PPR-SC), Ronan Tito (PMDB-MG) e Gilberto Miranda (PMDB-AM). Assessores da equipe econmica j tm pronta parte da argumentao. FHC e Malan devero defender que o uso das reservas foi necessrio para salvar o acordo da dvida, sem o qual o pas no normalizar suas relaes financeiras internacionais. Para explicar as compras em sigilo, o argumento justamente a necessidade de evitar o encarecimento dos papis. O acordo da dvida implica a troca dos papis atuais por uma combinao de cinco tipos diferentes de novos papis. A operao dar um desconto inicial de US$ 4,3 bilhes sobre o valor do dbito. Essas condies, para renegociar US$ 35 bilhes, foram aprovadas pelo Senado no final de 93. O mesmo no ocorre com a mistura de tinta para a parede. Segundo a consultora de cores Renata Loureno, o trabalho pode ser feito por qualquer pessoa seguindo as instrues da lata, sem a necessidade de um pintor profissional. Mas deve-se tomar o cuidado de acabar com infiltraes na parede para que a tinta possa ser bem aproveitada e dure mais tempo. lvaro Vidigal, da Bovespa, registra a prudncia dos candidatos s eleies de hoje, que esto sabendo preservar a instituio acima dos interesses pessoais. Preocupao de Vidigal: Manter junto s autoridades a imagem de seriedade cuidadosamente cultivada no decorrer desses anos. A Secretaria Estadual da Sade investiu R$ 11 milhes na campanha, de acordo com o diretor do Centro de Vigilncia Epidemiolgica, Wagner Costa. Na primeira fase, no dia 11 de junho, foram vacinadas cerca de 3,2 milhes de crianas em So Paulo. Neukirchen disse que a proposta apresentada aos bancos inclui uma injeo de 2,7 bilhes de marcos (US$ 1,55 bilhes) para aumento do capital acionrio. Alm disso, a companhia pede que os bancos aumentem suas linhas de crdito em 500 milhes de marcos (US$ 287,1 milhes) e concedam uma moratria de trs meses no pagamento das dvidas. No mercado, estima-se que as dvidas do grupo superam os nove bilhes de marcos. Os bancos devem responder proposta do grupo at o dia 12. A crise no MG aprofundou-se no incio de dezembro com o declnio nos preos internacionais do petrleo. A pesquisa, divulgada no ltimo sbado, aponta o candidato do PSDB em primeiro lugar, com 48% das intenes de voto. 
A definio do tempo no horrio eleitoral gratuito foi encaminhada pela corregedoria a todos os partidos que disputam a sucesso do governador Luiz Antonio Fleury Filho. A escolha dos tcnicos Johan Cruyff, do Barcelona da Espanha, e Tel Santana, bicampeo mundial pelo So Paulo, como colunistas da Folha durante a Copa dos EUA recebeu elogios de ex-jogadores da seleo brasileira. Todos lembraram a importncia dos dois treinadores na histria do futebol e no cenrio atual mundial. Folha -- Como vai atuar no jogo contra o Palmeiras? Sandoval -- Vou entrar com determinao, sabendo que vai ser difcil. Confio na nossa equipe e espero sair do Parque Antarctica com a vitria. Uma nova escalada obscurantista na China afastou ontem de Cannes o cineasta Zhang Yimou, diretor do clebre Lanternas Vermelhas. Os negcios com ouro na BM&F (Bolsa de Mercadorias e Futuros) somaram 54 toneladas em maio, o que representa uma queda de 2,70% em comparao ao ms imediatamente anterior, quando foram movimentadas 55,5 toneladas, de acordo com informaes da entidade. A mdia diria de negcios com o metal em maio foi de 2,43 toneladas. Em relao ao mesmo ms do ano passado, quando os negcios atingiram 139,8 toneladas de ouro, a reduo de 61,37%. A mdia diria naquele ms foi de 6,6 toneladas, segundo dados da Bolsa de Mercadorias & Futuros. Se no fizssemos isso, os jogadores sofreriam mais. Os horrios das refeies foram adaptados ao fuso local, disse Sant'Anna. Apenas para estadas mais breves, de um a dois dias, o preparador teria optado por manter o fuso de Los Gatos. O Conia, por exemplo, a parte do congresso que discute o uso da informtica pelos profissionais liberais. O Conia ter 18 palestras dirigidas s reas de medicina, odontologia, veterinria, arquitetura, urbanismo, engenharia e direito. nico no mundo. Felicito. uma coisa a se estudar. Mas deixa escapar o que pensa. Na cabea do treinador ou no papel, o esquema ttico pode ser ofensivo. Mas no o ser, na prtica, se no houver determinao para vencer. J Vail oferece entre outras coisas espetculos ao longo do vero realizados num anfiteatro aberto. O destaque, sem dvida, o bal Bolshoi da Rssia, que h pelo menos cinco anos consecutivos se apresenta na cidade. O Flamengo ficou na espera do Grmio, procurando apenas aproveitar o contra-ataque. Mesmo assim, no criou muitas chances. O gol da vitria flamenguista foi marcado aos 20min do segundo tempo. Magno desceu pela direita e cruzou para a rea. Nlio teve apenas o trabalho de escorar para o gol vazio. Emissora: Srie em seis episdios, exibidos de segunda a quinta, na Cultura. Propaganda - O Poder da Imagem o tipo de srie que poderia ser adotada com proveito nos currculos escolares. Os responsveis pelo programa vasculharam uma multido de arquivos de filmes (russos, norte-americanos, franceses, ingleses, alemes) e selecionaram imagens produzidas pelas agncias oficiais desses pases que compem uma formidvel ilustrao da histria do sculo 20, tal como vista por seus protagonistas. O governo do Estado diz que a greve ilegal. Os professores da UFPB (Universidade Federal da Paraba) tambm entraram em greve geral ontem. Eles reivindicam 105% de reposio de perdas salariais. Os funcionrios da universidade esto em greve desde o ltimo dia 19. Patrcia -- Minha literatura vem da imagem, da TV, do cinema. Eu tento usar o processo de edio, como se trabalha em uma mesa de edio. Folha -- Como teu mtodo? O ingls John Ritchings vai escrever um livro sobre os meses que foi torturado pelo galo da vizinha, diz o jornal. 
O galo, Corky, cantava das 3h s 7h diariamente e Ritchings no conseguia dormir. A estrela smbolo do PT vai emoldurar com destaque o cenrio dos programas do candidato Luiz Incio Lula da Silva. Seus programas de rdio tero reportagens de rua, vetadas na TV. Segundo Markus Sokol, coordenador de comunicao, a estratgia de comunicao foi delineada em um seminrio realizado em dezembro passado. Executivos em cargos de liderana assumem seus cargos com mais uma pesada funo: implementar cada passo da estratgia que eles prprios esboaram. Em junho, Christensen esteve no Brasil a convite da Fundao Dom Cabral de Belo Horizonte, para acompanhar a segunda etapa do curso STC-Executivo. As construes lembram vagamente uma verdadeira vila de pescadores. Perto de qualquer uma, o cenrio de Tropicaliente parece uma grande aldeia de milionrios. bonito isto daqui, dizia admirado um pescador nativo, que participava de uma gravao como figurante no ltimo dia 30. Licena potica parte, as praias e as dunas de Fortaleza foram escolhidas como cenrio de Tropicaliente por Paulo Ubiratan, diretor-artstico da Globo, por causa da beleza natural do lugar. Crianas mais velhas que nunca seguiram a dieta podem ter retardo mental, desenvolvimento motor prejudicado e alteraes significativas do comportamento (isolamento e agressividade). Pesquisas tm mostrado que o fenilcetonrico deve seguir a dieta durante toda a vida. Os adultos que voltam a comer protenas podem ter piora da concentrao e da memria e uma maior agitao. Quando astronautas embarcarem numa estao orbital ou, eventualmente, estiverem a caminho de Marte, tero como companhia necessria alguns bilhes ou trilhes de micrbios comiles. A Nasa (agncia espacial dos EUA) encomendou empresa Micro-Bac o desenvolvimento de um sistema de purificao de esgoto utilizando bactrias conhecidas por devorarem material orgnico. O estudo custou US$ 400 mil. Os testes de laboratrio deram certo, e agora a empresa se prepara para comercializar essa mini-usina tratadora de esgotos. Em um dos subprodutos mais irnicos da pesquisa, a tecnologia criada para uma casa orbital de bilhes de dlares, a estao Freedom, poder facilitar a construo de casas populares na Terra, segundo Ercole Amrico Carpentieri Jr., diretor da filial brasileira da Micro-bac. A tecnologia pode reduzir a necessidade de rede de esgotos em determinados locais. O que deve ser destacado de forma resumida em um item sobre qualificaes profissionais so os pontos fortes da experincia e da formao acadmica e os atributos que podem diferenciar o candidato no mercado. Segundo Laerte Leite Cordeiro, consultor especializado em recolocao, o currculo deve ser dividido em trs partes: rea geral de interesse (administrativa, contbil, financeira etc.), qualificaes profissionais e descrio da trajetria de empregos (empresas, cargos, perodos). Comenta o contrabando de plutnio, dizendo que sria advertncia para que governos controlem os excessos deste material que o fim da Guerra Fria criou. O jornal kuaitiano diz que o governo do pas fechou um salo de beleza por uma semana por infringir a lei que probe mulheres de trabalhar em sales para homens e vice-versa. O Houston Rockets conquistou sua 27. vitria na temporada da NBA (contra apenas quatro derrotas), em casa, sbado, ao vencer o Philadelphia 76ers por 100 a 93. Mais uma vez o piv Hakeem Olajuwon foi o fator de desequilbrio a favor do Houston, um dos mais destacados times do campeonato. Ele converteu 23 pontos e ganhou 17 rebotes. O Philadelphia tem apenas 12 vitrias e j perdeu 19 vezes. 
Contra o Houston, o cestinha da equipe foi Jeff Honaceck, com 20 pontos, seguido por Clarence Whiterspoon, com 16. Lamentavelmente, mais uma vez, um pas do subcontinente sul-americano suspende as garantias do Estado de Direito. Desta feita, porm, no foi uma aventura castrista ou uma injuno poltica qualquer que levou o governo venezuelano a este ato extremo, mas sim a conjuntura econmica. E, de fato, os ventos no sopram em favor dos venezuelanos. Eleito em 1988, o ex-presidente Carlos Andrs Prez (social-democrata) encetou uma poltica econmica neoliberal que imps sacrifcios populao dando incio a uma srie de violentos protestos. Prez enfrentou duas tentativas de golpe militar e, a exemplo de Fernando Collor, acabou sofrendo impeachment por corrupo e, hoje, encontra-se numa cela espera de julgamento. Aprofundando as dvidas, Leila se deixou cair na tentao do duplo. Se no h mais bossa nova realmente nova, o melhor mimetizar o criador do estilo, o cantor Joo Gilberto. O resultado dessa tentativa de metempsicose est no recm-lanado CD Isso Bossa Nova (EMI-Odeon), stimo disco em 12 Anos de sua carreira. A lei tambm prev que at o final de junho o governo enviar ao Congresso um projeto sobre a elevao do valor real do salrio mnimo. Se o mnimo sofrer alterao, a Constituio determina que as aposentadorias tambm sejam revistas. O Mogi Mirim venceu ontem o Unio So Joo de Araras por 2 a 0 em Mogi e deu mais um passo para conquistar uma vaga na Copa Bandeirantes. Esse torneio reunir os seis primeiros colocados da Srie A-1 e o campeo da A-2 do Campeonato Paulista. Em So Paulo, no necessrio deslocar-se muito procura de artigos de segunda mo. H pontos de concentrao de lojas que facilitam a pesquisa de preos. Entre uma loja e outra podem ocorrer grandes diferenas. Nas aulas, Candelori usa reportagens de jornais para introduzir seus temas e colocar toda a classe para falar. Na lista dos assuntos esto a campanha contra a fome, racismo, imigrao nordestina, aborto, corrupo poltica e aborto. Tem gente que acha a aula intil. So as mesmas pessoas que acham que sua responsabilidade social se limita a pagar impostos, diz Letcia Zioni Gomes, 16. preciso cobrar o governo para as coisas funcionarem, completa Bruno Fanizzi, 15. Fanizzi conta que antes das discusses em aula era uma pessoa de extrema direita, que no pensava nas suas opinies. A aula boa porque o professor jamais impe a opinio dele. Ele levanta a discusso e deixa o conflito de idias rolar, diz Manoela Nicoletti, 15. Segundo Candelori, seu curso de sensibilizao para temas ticos. Eles no suportariam a aridez da teoria, mas discuto temas como espao pblico e privado atravs de fatos como o comportamento do Itamar no Carnaval, diz. Sobre o controle antidoping, a Fifa estabelecer uma fax hotline, em operao 24 horas por dia, para que os mdicos das delegaes consultem a organizao da Copa antes de fornecer qualquer medicamento a seus jogadores. As consultas sero feitas por escrito para evitar contestao. O brasileiro Joo Havelange consolidou ontem, em Nova York, a sua candidatura para o sexto mandato consecutivo como presidente da Fifa. O presidente da Uefa, Lennart Johansson, negou que a entidade europia v apresentar oposio a Havelange. O fonoaudilogo tem uma rea vasta de atuao, apesar de estar muito concorrida. A gente pode trabalhar com atores de teatro, crianas, na preveno de doenas ou no exame de trabalhadores expostos a rudos. O atacante do Barcelona obteve o Onze de Ouro, distino outorgada por leitores da revista francesa Onze Mundial. 
Recebeu 13.576 votos (33,94%) em 40 mil respostas. O time anunciou ontem a contratao do craque argentino. Naquele ano, Fleury j aparecia com 20% das intenes de voto em setembro, um ms antes do primeiro turno. A menos de uma semana da eleio deste ano, Munhoz no passou dos 12%. Fleury foi para o segundo turno e venceu o atual prefeito de So Paulo, Paulo Maluf (PPR). Na pesquisa Datafolha divulgada no ltimo sbado, o primeiro colocado, Mrio Covas, aparece com 47%, nove pontos percentuais acima da soma dos ndices dos demais candidatos. ACM disse que no conseguiu entender nada do novo plano econmico elaborado pela equipe do ministro Fernando Henrique Cardoso. Eu estou igualzinho ao presidente Itamar, igualzinho ao ministrio, igualzinho a 150 milhes de brasileiros, que no sabem coisa alguma sobre este plano. Segundo ACM, quem disser que conhece o plano, de um modo geral, est mentindo. O governador acrescentou que pedir ao Nosso Senhor do Bonfim para que o plano d certo. O acordo inclui votar, sem novas emendas, o projeto de Oramento para 95. Os lderes acertaram que, havendo quorum, o Oramento de 95 s no ser votado hoje se no ficar pronta a publicao do parecer aprovado na Comisso Mista de Oramento, acolhendo cerca de R$ 2,5 bilhes em emendas. O STJ decide que Halpern e Decaro, indiciados pelo MPF sob a acusao de fraude processual, devem ser julgados pela Justia Federal em So Paulo. Os deputados petistas Luiz Gushiken (federal) e Luiz Azevedo (estadual) pedem que o Ministrio Pblico estadual reabra processo sobre as importaes. O partido atua no movimento sindical atravs da Corrente Sindical Classista, filiada CUT. Controlam ou so influentes em sindicatos importantes em So Paulo, como o dos metrovirios, o da Sabesp e bancrios. Os sindicatos dos metalrgicos de Betim (MG), de Ribeiro Preto (SP) e da Bahia tambm so da tendncia. Com o resultado, o time perdeu a chance de garantir antecipadamente a vaga nas quartas-de-final. A equipe entra na quadra amanh s 20h30 (15h30 de Braslia) para disputar com o Canad, no chamado grupo da morte, um lugar na prxima fase do Mundial (leia matria nesta pgina). Entre os exemplares destes vinhos que chegaram s nossas praias, todos safra 92, o Bourgueil Lchellerie, milita decididamente na primeira fila. Ele um vinho ligeiro, de aroma simples de frutas vermelhas e boa acidez, ideal para ser bebido refrescado. O Chinon Les Granges e o Saumur-Champigny Domaine de la Martinire so pouco mais encorpados. O Les Granges tem aroma e sabor marcado por morangos e evolui bem na boca, enquanto o Saumur, tambm frutado, mais srio e (levemente) tnico. O sushiman Oscar Tadaioshi Izumi baixa com hashi e raiz forte hoje nos domnios de Wilma Kovesi para entregar os segredos da cozinha japonesa. Ningum ignora os terrveis problemas que o pas atravessa, dos quais a corrupo policial e os nveis de violncia so apenas uma faceta. Ainda assim, de se supor que existam bons agentes de polcia. particularmente a estes bem como a toda a sociedade que interessa a apurao completa e rigorosa dos massacres. Toda vez que um crime cometido por representantes do poder pblico fica sem punio, o nus recai sobre a corporao como um todo, gerando um turbilho que indiferncia os bons dos maus policiais. evidente que quem perde so os primeiros. O pior, porm, que nesse processo o prprio Estado cai em descrdito e acaba cedendo mais espao s foras do caos. Folha -- O Mercosul foi objeto de declaraes meio eufricas nos ltimos dias. Que pontos, no entanto, o deixaram frustrados? 
Amorim -- No posso dizer que algo me frustre, levando em conta que h trs anos a idia do Mercosul surgia no Tratado de Assuno. Talvez preferssemos que o comrcio estivesse totalmente liberalizado, sem as listas de produto em regime de adequao (sobre os quais so mantidos os impostos alfandegrios) que afetam de 5 % a 8 % do comrcio regional. E como ningum de ferro e Orlando fica perto, duas horas e meia de avio, a turma depois d uma passadinha para um milk shake com Mickey e Pateta. Aberto desde 1948 no Beverly Boulevard quando os clientes eram Errol Flynn, Frank Sinatra e Billy Wilder, o restaurante Dominick's continua na moita. Tudo leva a crer que estamos num processo de endemia, sustenta o patologista Bruce Mendes Campos, responsvel pela anlise do sangue. De 205 pacientes com suspeitas da doena, 47 revelaram-se positivas neste ano. Estamos assustados, afirmou o diretor do hospital, Aloysio Campos da Paz, professor visitante das universidades de Oxford (Inglaterra), Stanford, Harvard e Nova York (EUA). Para viabilizar a viagem de pelo menos parte dos 20 integrantes do coral advogados, bancrios, secretrias, estudantes, aposentados e apenas um padre, a diretora diz que se uma empresa se dispuser a cobrir os custos da viagem, o coral oferece em troca a gravao de um disco promocional. Qualquer doao seria muito bem recebida, diz madre Maria. Os interessados em viabilizar a participao do coral no festival belga podem entrar em contato com a direo do coral nos telefones (011) 231-5346 e 288-3514. Olhando com ateno os jogos da Copa possvel chegar a uma concluso oposta de Parreira. Foi justamente graas arte de alguns de seus jogadores que equipes como a Bulgria, a Romnia e a Nigria chegaram muito mais longe do que jamais tinham conseguido. Num campeonato em que predominaram as defesas fechadas, o que fez a diferena foi a imaginao ouso dizer, para horror de Parreira, a magia de um punhado de craques: Romrio, Baggio, Stoichkov, Hagi, Brolin... No final feita uma auditoria para ver se a auto-avaliao feita pela fbrica corresponde realidade. Segundo Migues, algumas fbricas da Autolatina ainda no atingiram o padro de excelncia Q1. As fbricas que atingiram receberam uma placa e, em alguns casos, um veculo sorteado para seus funcionrios. A disciplina de cirurgia plstica da Faculdade de Medicina da USP est lanando um programa de cirurgia plstica em crianas com a colaborao do Banco Real. O objetivo divulgar as tcnicas mais modernas de tratamento das deformidades congnitas nas diversas regies do pas. A primeira etapa do programa comea hoje e vai at o dia 2 em Joo Pessoa (PB). Um novo protocolo desenvolvido por um grupo de mdicos de quatro instituies pblicas de So Paulo est trazendo para o Brasil o Interferon Beta, para o tratamento de pacientes com esclerose mltipla (doena degenerativa que causa enrijecimento muscular, debilidade das funes motoras e perda parcial da viso). A droga foi aprovada em julho passado pela FDA rgo de fiscalizao de medicamentos dos Estados Unidos para o tratamento da doena. Informaes pelo tel. (011) 887-8740. O ex-ministro e candidato presidencial Fernando Henrique Cardoso disse Folha que frequentava no passado os sebos do centro, perto da Faculdade de Direito da USP. Quando vai aos EUA, ele no se limita a comprar blazers caros. Tambm vai livraria Barnes and Nobles, em Nova York. Nem todos tm tempo e pacincia para sebos. O filsofo Jos Arthur Giannotti diz que se sente angustiado em livrarias e prefere comprar seus livros por catlogos. Tantos livros, to pouco tempo para ler. 
O nibus era da Viao Boa Vista e transportava cerca de 40 passageiros na hora do acidente, segundo a Polcia Rodoviria. Com a coliso, cinco passageiros ficaram feridos e foram atendidos no Hospital Mrio Gatti. Todos passam bem. De um lado, acirrou-se a reao marcadamente militante, mais diretamente identificada com as propostas poltico-ideolgicas das esquerdas organizadas, cujas manifestaes procuravam realar o que se entendia na poca por cultura nacional e popular, rechaando a influncia imperialista e suas armas culturais entre as quais incluiam-se a televiso voltada para o consumo e para a alienao, as formas artsticas americanizadas, a cultura pop e at... a guitarra eltrica. Esta vertente, francamente conteudista, derivava das experincias realizadas no perodo pr-64 pelos Centros Populares de Cultura (CPCs), ligados Unio Nacional dos Estudantes, que privilegiavam a mensagem e procuravam falar uma idealizada linguagem do povo. Ele teve importante papel ao manter a poltica externa e muitos assuntos internos fora da esfera decisria da UE, ganhando a simpatia britnica. J assessores do federalista Delors, que em dez anos na presidncia da CE aumentou os poderes do cargo e entrou em conflito com Margaret Thatcher (antecessora de Major), temem que a escolha de Santer sirva para enfraquecer os poderes do presidente, o que abriria caminho para o eixo Alemanha-Frana consolidar o seu domnio. O fato de haver razovel consenso acerca desses temas no significa, porm, que seja fcil ou rpido aprovar tais reformas no Congresso. Elas dependem em boa parte de emendas constitucionais, o que est longe de ser simples de fazer. Ora, se o presidente chegar posse sem que estejam no mnimo delineados, divulgados e negociados com os setores organizados da sociedade os projetos que consubstanciem tais reformas, torna-se muito mais difcil faz-los aprovar no Congresso no prazo de um semestre definido pelo prprio FHC. Por tudo isso, no cabe formalismo em excesso. Cabe, lgico, todo o respeito s prerrogativas do atual governante. Mas, at por ser tambm o candidato do presidente, FHC deve comear a trabalhar j. As paulistas Mancha Verde (Palmeiras) e Gavies da Fiel (Corinthians) aceitaram, mas as cariocas se recusaram. A carta de Stepanenko engrossa desde ontem o inqurito contra o ministro no TSE. Trata-se de outra evidncia irrefutvel. Itamar no precisa da ajuda de Vicentinho. Tem diante de si um portentoso exemplo de desvio de conduta. Alis, j poderia ter agido. Seu silncio soa a cumplicidade. Mas os pontos mais importantes no foram tocados. No ouvi falar nada sobre arbitragens. No se fala na cobrana dos rbitros que apitam maldosamente, para prejudicar uma equipe. Isso aconteceu muitas vezes este ano. Outro ponto que as bolas que vo ser usadas no campeonato no devem ser escolhidas por interesses comerciais. Cerca de 300 cubano-americanos protestaram na frente do prdio da misso cubana na ONU, onde as negociaes ocorreram. Funcionrios do escritrio cubano tentaram abafar os protestos na rua colocando caixas de som nas janelas do prdio com msica alta. Em Miami, duas bombas caseiras (coquetis Molotov) foram encontradas ontem em frente revista cubana Replica, que apia as negociaes entre Cuba e EUA para tentar resolver a crise. As bombas no chegaram a ser acionadas. Se podemos nos orgulhar de nosso futebol, no podemos dizer o mesmo dos nossos rbitros, aqui representados exatamente por Marsiglia. Falando em futebol, em bom futebol, devemos saudar a chegada de Ronaldo aos Estados Unidos. 
As empresas Tejofran e Power informaram ontem, em nota enviada ao TRE (Tribunal Regional Eleitoral), que prestam servios aos comits do PSDB e PMDB. A Tejofran e a Power so acusadas por Rui Ramos, ex-integrante da campanha tucana, de financiar a estrutura poltica do candidato do PSDB ao governo, Mrio Covas. Na cheia, entre outubro e abril, quando a mdia mensal de chuvas supera 290 milmetros, a irrigao do arroz feita por inundao. No perodo mais seco, quando as chuvas no passam de dez milmetros/ms, os produtores plantam soja, que exige menos gua. Na sua primeira declarao, Queiroz omitiu US$ 1,7 milho em bens. Entre eles estavam a empresa ticas Trevo, terrenos e pontos comerciais em Salvador. O presidente do partido no Rio, S Freire, disse que o pedido de cassao foi feito porque Walter Queiroz no teria comparecido s reunies marcadas para que ele explicasse sua nova declarao de renda. J o sistema de tratamento de esgotos residenciais, com a estao do ABC, dever atingir 150 toneladas/dia. Somando-se os dois nmeros, chega-se a 500 toneladas/dia, aproximadamente 50% das 1.100 toneladas de poluio orgnica diria jogadas no rio. Antes da derrota por 2 a 0 na primeira partida entre os dois clubes, quinta-feira no Mineiro, o tcnico botafoguense Renato Trindade desenhou a ttica adversria num quadro negro do vestirio. Ele circundou com giz os cinco meias. O segredo do Atltico o quinteto. Eles vm de trs com a bola e nos dificultam, disse. Ele foi o responsvel pela incluso de peixes amaznicos no livro dos recordes da International Game Fish Association. A Polcia Militar do Amazonas est desenvolvendo uma campanha educativa de trnsito. Ela est alertando os motoristas que saem de Manaus pela rodovia AM-10. Policiais orientaro os motoristas com placas educativas que indicaro os cuidados bsicos contra acidentes. Agora suponha que a economia toda vire um bando de gente que ganha mesada e de gente que d mesada. Para quem recebe em URV, timo. Para quem paga, pode no ser um grande negcio ter de pagar mais dinheiro todos os dias. mais ou menos o que est acontecendo na discusso de quem tem imvel com quem paga o aluguel, por exemplo. Ou do dono da escola com seu pai, que paga a mensalidade. Se voc tem um pai po duro, no tenha dvidas pea a URV na sua mesada. Entre os itens importados, devem chamar ateno na Feicon os revestimentos de cermica para piso trazidos da Itlia. Importados pela Cramus, os pisos de uso comercial (para hotis e shopping centers) destacam-se pela grande resistncia. Algumas ainda esto sendo calculadas pela Caixa. Eles tero ainda de pagar multa equivalente a R$ 647,90. Alm dos dois, tambm foram condenados outros 12 diretores da CEF. A necessidade de reparar os equipamentos durante a viagem teria alterado os planos de trabalho. Damascene viaja raramente de navio, mas no um estranho ao mar. Quando jovem, esteve engajado na Marinha grega. Conhecia bem grandes embarcaes e saberia como agir numa emergncia. A situao tende a se agravar, uma vez que nenhuma das partes parece mostrar disposio de recuar. O governo recebeu ontem outra m notcia: empresrios se declararam contrrios ao CIP. O motivo que ele obriga a empresa que contratar jovens a manter um tutor para ensinar o trabalho aos novos empregados. Com isso, dizem, aumentam os custos de contratao dos jovens. Ao contrrio do que poderia esperar a direita norte-americana na vigncia da Guerra Fria, a vinda de Soljenitsin no pde ser capitalizada em discursos e aparies pblicas. 
Por dois motivos: primeiro, a indisponibilidade do escritor, que simplesmente no estava interessado em percorrer os Estados Unidos ou debater suas idias; segundo, seus comentrios sobre a decadncia moral do Ocidente logo o tornaram inconveniente. Sua ltima apario foi numa formatura em Harvard, em 1978. Depois disso, caiu no ostracismo. Sua partida dever ser igualmente discreta. Seu editor, Claude Durand, disse Folha por telefone, de Paris, que Soljenitsin no tem inteno de fazer qualquer despedida pblica dos Estados Unidos. Sei que jamais terei uma chance de trabalhar to calmamente de novo. Na Rssia, serei dilacerado pelos acontecimentos e pelas tragdias das pessoas. Est visto que nem o conjunto, nem qualquer outra frase dele, d frase negada por Fernando Henrique o sentido que ela tem por si mesma. A leitura de todo o trecho leva, alis, a outra mentira, esta sobre os tempos de TV do PSDB e do PT. No a primeira vez, e espero no ser a ltima, que Fernando Henrique e esta coluna se atritam pelo mesmo motivo. A primeira fez dez anos h pouco. Foi quando noticiei que Fernando Henrique estava em contatos sigilosos com o governo Figueiredo, oferecendo um plano de conciliao entre o regime que se exauria e o que nasceria. Para isso, a presidncia no poderia ficar com o candidato bvio na poca, Ulysses Guimares, que supostamente representava o risco de confronto com os militares. Tancredo tivera com o governo conflitos impeditivos, aqueles que o levaram a dissolver o PP de ento. No preciso repetir, agora, quem era o homem que Fernando Henrique apresentava como adequado ao plano, sob o argumento de que tinha trnsito nas esquerdas para encabear um pacto conciliador. O professor Leito de Abreu, que era ministro do Gabinete Civil, no vive para confirmar as visitas ofertantes de Fernando Henrique. Mas h testemunhas da veracidade da notcia, inclusive o ex-presidente Figueiredo, que repeliu a proposta. Apesar da veracidade ainda hoje comprovvel, na ocasio Fernando Henrique me fez graves acusaes. Com a mesma hombridade que exibe mais uma vez. Pois experimente ir a um show de msica, por exemplo, ofender Tim Maia ou Joo Gilberto na boca do palco, e voc ver que a atitude de Edmundo foi, no mnimo, compreensvel. Pois , Palmeiras e Corinthians fazem hoje a finalssima do Brasileiro. Provavelmente ser um belssimo jogo. O Corinthians ter de atacar os 90 minutos para reverter a enorme vantagem palmeirense. Como Collor, o novo presidente falou em: reforma do Estado, abertura da economia, avano da privatizao e flexibilizao dos monoplios estatais. Isto comeou a acontecer no governo anterior ao do presidente Itamar Franco. Morreu no domingo noite aos 80 anos o cientista Roger Sperry, ganhador do Prmio Nobel de Fisiologia e Medicina de 1981. O anncio foi feito segunda-feira pelo Instituto da Califrnia para Tecnologia Avanada (Pasadena), onde Sperry trabalhou at 1984. Na relao divulgada ontem em Washington, o Brasil citado com outros 34 pases. O Japo o principal alvo da lista por barreiras aos produtos, servios e investimentos norte-americanos, que segundo o USTR, colaboraram para que o dficit dos EUA no comrcio com os japoneses atingisse US$ 60 bilhes no ano passado. O relatrio dedica 42 pginas ao Japo, cujas barreiras importao de produtos e servios so muito maiores do que as dos demais membros do Grupo dos Sete (pases mais industrializados) e representam um entrave inaceitvel ao sistema de comrcio global. 
Tendo em mente o Japo, os EUA reinstalaram recentemente a clusula conhecida como Super-301, mecanismo legal que amplia os poderes de retaliao comercial da Casa Branca. A Super-301, que j foi aplicada contra o Brasil quando esteve em vigor em 1989 e 1990, estabelece um prazo de 18 meses para que as negociaes bilaterais levem eliminao do que os EUA consideram barreiras ao comrcio. Depois disto, os EUA podem impor tarifas punitivas de at 100% s importaes do pas ofensor. F do tcnico so-paulino Tel Santana ( muito modesto), Bianchi v o time paulista to forte como em 1992 e 93. O principal jogador da equipe o meia-direita Jos Basualdo. Ele atuou pela seleo argentina na Copa do Estados Unidos. Entre a defesa e o ataque, a bola quase sempre passa por seus ps. A Llian sem calcinha fez mais estrago do que um bando do Comando Vermelho com revlver. Ela uma profissional e conseguiu o objetivo dela. Mas o presidente tem de ter postura. O bom que o Fernando Henrique no estava presente e no se comportou como o Maurcio Corra. S tenho um sentimento: estupefao. Todo presidente uma autoridade pblica e deve cuidar da imagem daquilo que ele representa. O fato do presidente ter posado ao lado de aquela atriz foi um vexame para toda a nao brasileira. As letras do Digable Planets no refletem este radicalismo. So mais cool, falam de amor, crescimento e amizade sempre com um vis engajado, porm. O prprio nome da banda, segundo Ann Marie, vem de leituras sistemticas de Sartre, Marx, Kafka. Significa planetas bacanas. O nome seria uma tentativa de passar s pessoas o conceito de auto-suficincia cada ser um planeta mas tambm de inter-relao entre os planetas. Da os nomes de insetos para ns, continua Ann Marie. O atlas que a Folha vai distribuir aos leitores em 19 fascculos tem trs patrocinadores: o Banco Itamarati, a Companhia Brasileira de Alumnio (do grupo Votorantim) e a OAS Participaes. No foi difcil encontrar patrocinadores para o projeto, segundo Antonio Carlos de Moura, 35, diretor comercial da empresa Folha da Manh S/A, que edita a Folha. Uma mesa de encostar, em jacarand claro, feita mo no Rio no sculo 18, vai a leilo no prximo ms em So Paulo por US$ 15 mil, preo equivalente a dois carros populares zero quilmetro, como o Fusca. O leilo ser na casa Renato Magalhes Gouva Escritrio de Arte, nos Jardins (zona oeste). Colosio, candidato do Partido Revolucionrio Institucional (PRI, governista) foi morto a tiros em Tijuana durante um comcio. Ele era o favorito para as eleies programadas para 21 de agosto. Foi substitudo como candidato por Ernesto Zedillo, que chefiava sua campanha. O percurso, que ser divulgado momentos antes da prova, deve incluir a avenida Bandeirantes, via Anchieta, represa Billings e chegada na av. Nove de Julho, em frente ao Banana Banana. Ao final, todos os corredores comemoram a prova com uma feijoada na casa noturna. Ali estar exposto o Lancer Evolution, carro de competio da Mitsubishi. Ele acusado pela morte de pelo menos 83 pessoas e de ter ferido outras 200. O Chacal chegou ontem s 10h30 (5h30 no Brasil) ao Palcio da Justia, usando algemas e cercado por um forte esquema de segurana. SO PAULO A eleio presidencial est na sua pior fase, que aquela das articulaes polticas de bastidores, da costura sigilosa de alianas, das rasteiras e baixarias entre adversrios e correligionrios. a fase do vale-tudo que antecede a escolha dos candidatos pelos partidos. uma fase trgica para ns, eleitores. 
Imagino que para a grande maioria dos polticos este seja o momento de glria porque exercita o que considera uma virtude: a arte da esperteza, a poltica segundo os cnones herdados da velha tradio das raposas polticas. H quem se orgulhe dos golpes que consegue imaginar e confunda artimanhas com estratgia. que os dirigentes petistas acreditavam tanto na vitria no primeiro turno que dispensaram os polticos, quer dizer, a direita. Agora ela est de volta. Jos Genoino, entrevistado, j falava como novo centro do poder. Ns vamos colocar a campanha na rua, dizia, sorrindo. A secesso est s comeando. Maria Mole: mistura de bebidas, conhaque com martini e/ou vodka. Muito N: coisa ou conversa negativa demais. Futebol e poltica no se misturam. Essa a postura que prevalece para a maioria dos jogadores da seleo brasileira de futebol. Apenas o goleiro Gilmar (Flamengo) revelou o seu voto para as eleies presidenciais deste ano. Gilmar pretende votar no senador Esperidio Amin (PPR-SC). No quadro de hoje, voto nele, disse o goleiro do Flamengo. Para governador ele ainda est indeciso. s 21h15, o tucano Jos Serra, recm-eleito senador, juntou-se ao casal presidencial. s 22h43, o governador eleito de So Paulo, Mrio Covas, e sua mulher Lila chegaram para o jantar. Covas disse que Z Milionrio um velho amigo seu. Jos Carlos Arajo foi scio de Octvio Cavalcanti Lacombe (morto em 1992), que fundou h mais de 30 anos a Paranapanema, companhia de explorao de minrios principalmente estanho e cassiterita na regio amaznica. O ladro verificou qual era o saldo da conta de Maria Estela e, em seguida, retirou R$ 225 do caixa. Antes de ir embora, o ladro pegou o ttulo de eleitor e o CIC da assistente de secretaria e os jogou atrs do caixa. Ele disse que era para eu apanh-los e para no tentar reagir e impedir sua fuga, afirmou Maria Estela. Mas, quando o ladro ia deixar o caixa, dois outros clientes do banco chegaram. Arrelia foi o primeiro palhao brasileiro a ter um circo na televiso. Era o Circo do Arrelia, na TV Record, em 1953. Ele ficou famoso no todo o pas. Arrelia se chama Valdemar e tem 87 anos. Quando menino, gostava de estudar e no queria ser de circo, como seus pais. Para Serra, se no for aprovada a emenda constitucional que desvincula dos fundos de participao dos Estados e municpios as receitas dos novos tributos, de nada adiantar o aumento de impostos. Ainda no d para ficar tranquilo. O prazo curto e os adversrios so muitos, disse Covas. Para ele, a prxima semana ser decisiva para o governo, quando as MPs que tratam do aumento de impostos tero de ser necessariamente votadas. Elas no podem ser reeditadas porque a criao ou alterao de impostos s podem ser feitas no ano anterior ao de sua cobrana. O Banespa ainda no esclareceu as supostas dificuldades de uma agncia no exterior por haver pago parte das importaes sem licitao de equipamentos israelenses no governo Qurcia. O BC no teria aprovado a operao. Se o Ministrio Pblico Estadual comprovar danos ao patrimnio pblico com as importaes de Israel, os responsveis estaro sujeitos a ao de ressarcimento. Mas o Concerto para Harpa, Obo e Orquestra de Cmara (1980) e sobretudo sua Chane 2 para violino e orquestra (1985) apontam para uma escuta mais livre, hedonista. Resumindo: Lutoslawski radicalizou Bartk naquilo que este tinha de noturno, de dramtico e moderno. Os acasos e interrupes misteriosas da msica noturna bartokiana se fizeram, em Lutoslawski, silncios e vibraes aleatrias da orquestra. Jogos. 
Os folhetos tero uma rede de distribuio indita at agora em qualquer campanha institucional do governo: carteiros da ECT (Empresa de Correios e Telgrafos), agncias e postos de atendimentos do Banco do Brasil, Caixa Econmica Federal, alm de postos lotricos. Durante 15 dias, os bancos vo lhe dar reais em troca de cruzeiros reais. Se for necessrio, esse prazo ser prorrogado. Por isso, voc no precisa correr para trocar o seu dinheiro. Essa a inscrio obrigatria nos folhetos e cartazes. O atacante Casagrande aponta a falta de entrosamento como uma das causas para o seu baixo desempenho. Sei que no estou rendendo tudo que posso, mas estou em evoluo, disse. Folha -- O que voc acha das crticas que vem recebendo da torcida e da imprensa? Segundo o mdico Flvio Pozzuto, as funes neurolgicas do torcedor se mantm sem alterao ele respira com ajuda de aparelhos. O mdico diz que o edema neste caso irreversvel. A leso foi grande e vai deixar sequelas. A polcia pretende localizar todos os envolvidos na briga at o dia 21 de outubro. O Conselho Monetrio Nacional aprovou ontem uma nova linha de emprstimos do Banco Central para socorrer bancos de qualquer porte. Os emprstimos podero ser pagos em 90 dias e prorrogados pelo mesmo prazo. a terceira linha especial de socorro criada pelo BC desde o lanamento do real, todas com o objetivo de atender bancos que tiveram sua sade financeira comprometida. A frase do ex-presidente uma referncia a uma das promessas feitas por Cafeteira, um ex-sarneyzista que insinuava, durante a sua campanha, mandar Sarney para fora do seu Estado. Se derrotar o candidato do PPR, Roseana se tornar a primeira governadora eleita na histria do pas. No caso de impedimento do presidente da Repblica ou de vacncia do cargo nos primeiros dois anos de mandato, a substituio ser feita atravs de eleio direta em 90 dias. Se a vacncia ocorresse nos ltimos dois anos de mandato, a eleio seria feita pelo Congresso, em 30 dias. Nos dois casos, at a eleio a Presidncia seria exercida pelo presidente da Cmara dos Deputados. Ficando novamente vago o cargo, o presidente do Senado e, sucessivamente, do Supremo Tribunal Federal (STF) assumem a incumbncia. O nmero de aes ordinrias em junho apresentou queda de 10,36% comparado com o ms de maio. Foram requeridas 606 aes em junho contra 676 no ms anterior. Em relao ao mesmo ms do ano passado a queda ainda maior 34,98%. A pesquisa feita pela Hubert Imveis e Administrao. Em junho foram pedidas 145 aes revisionais de aluguel. Comparadas com o ms anterior, as revisionais tiveram queda de 9,94%. Porm, se o total de junho for comparado com o mesmo ms de 93, verifica-se aumento de 28,32%. No 1 semestre de 94 foram feitos 662 pedidos no total. Segundo o secretrio, US$ 43,5 milhes foram liberados pelo prefeito quatro meses atrs para ampliao ou reforma em dez hospitais e unidades de sade. Entre elas, esto as reformas e ampliaes dos hospitais do Tatuap (US$ 7,4 milhes), Vila Nova Cachoeirinha (US$ 17 milhes) e Tide Setubal (US$ 9,4 milhes). Raia disse ainda que sua equipe levantou as necessidades de material de consumo da rede e que US$ 3 milhes esto sendo liberados agora para gastos do primeiro trimestre. Ao saber que os militares pretendiam a democracia e o fim da guerra colonial, os portugueses comearam a dar cravos aos soldados, que os colocavam na ponta dos seus fuzis -- da o nome Revoluo dos Cravos. No processo que seguiu derrubada da ditadura, o poder caiu na rua. Decises governamentais eram ultrapassadas pela fora de manifestaes. 
A Secretaria Municipal da Administrao afirmou ontem que poder rever o decreto do prefeito Paulo Maluf que probe a contratao de portadores de doenas com possibilidade de se tornarem incapazes no futuro. O decreto foi publicado no Dirio Oficial do dia 29 de julho. No pargrafo nico do seu primeiro artigo, ele afirma que os candidatos no podem apresentar patologia com perspectiva presente, de incapacidade no futuro. Os fatos esto a e so estarrecedores. J passou da hora de governo, prestadores de servios e usurios se empenharem para no s aprofundar as discusses sobre a ineficcia do sistema de sade, como tambm para buscar alternativas e solues viveis, a fim de proporcionar um atendimento mais digno populao em geral. A crise da sade tem soluo, basta vacin-la contra as ingerncias poltico-partidrias, as omisses e desacertos dos responsveis pelo setor. O Supremo interpreta, em ltima instncia, a Constituio e as leis, mesmo que elas digam respeito a contratos privados. Como poder faz-lo, porm, respeitando ele prprio a lei, quando existem legalmente duas moedas? Como julgar em conscincia contratos escritos ou implcitos que dependem de uma moeda como meio de pagamento e de outra como unidade de conta? A primeira varia tanto, dia-a-dia, que uma semana suficiente para produzir um aumento de mais de 10% nos vencimentos antecipados dos congressistas e dos magistrados. Cabe perguntar: Se o Supremo no est de acordo com a suposta falta de regra das duas moedas, que existem de fato desde a emisso da ltima medida provisria, por que no a proclamou imediatamente inconstitucional e se serviu dela privadamente? O Supremo Tribunal por acaso uma Justia privada ou o baluarte da Justia Pblica? Como se v, trata-se de questes altamente perturbadoras. Assim, se voc est usando uma planilha, por exemplo, pode selecionar parte (ou todo) e levar o que foi selecionado para outra planilha ou qualquer programa que estiver sendo usado. Os arquivos das pastas tambm podem ser abertos a partir do menu, facilitando o uso de arquivos e aplicaes. O operrio da construo civil negro e desempregado Rodney King, 28, vai receber US$ 3,8 milhes da Prefeitura de Los Angeles como compensao pelos efeitos da surra que levou de quatro policiais brancos em maro de 1991. A absolvio em abril de 1992 dos policiais provocou 48 horas de conflitos raciais em LA que resultaram em 58 mortes, 4.000 feridos e US$ 1 bilho em prejuzos. A contracultura, o movimento pelos direitos civis, a terceira onda gay e o politicamente correto nasceram aqui. Ningum fica parado com o vento que sopra do Pacfico. O politicamente correto institucionalizou-se, mas nem os americanos o suportam, exceto, claro, os eternos paroquiais. uma nova religio. A prpria Receita Federal pode ter, indiretamente, inspirado este golpe. Quando o secretrio da Receita era Osires Lopes Filho, vrias cartas foram expedidas a profissionais liberais e empresas convidado-os a acertar as contas com o leo. O delegado informa que foi aberta sindicncia interna no ministrio para tentar apurar o caso e que a Polcia Federal dever tambm abrir um inqurito. Para avaliar as perspectivas deste ano, a Folha ouviu economistas e consultores que tm entre seus clientes muitas das principais empresas brasileiras. Todos trabalham com trs cenrios: o provvel, o otimista e o pessimista. De uma ponta a outra, as variaes so imensas e refletem a instabilidade de uma economia em inflao crnica e alta. Para a taxa de inflao de dezembro de 1994, por exemplo, as previses variam de civilizados 2% para explosivos 150%. 
Uma resposta está numa palavra: tranquilidade. Num Mundial, esta palavra tem um significado determinante. A tranquilidade pode levar uma equipe ao título ou pode destroçá-la no caminho. Num grupo de 30 pessoas, entre jogadores, técnicos, médicos e massagistas, que devem conviver durante tanto tempo, surgem problemas diariamente. O McDonald's proibiu o cigarro nos 1.400 restaurantes de sua propriedade nos Estados Unidos. Mais 3.600 concessionários da marca aderiram à proibição. William Rhodes, vice-presidente mundial do Citibank, veio ao Brasil para reunião com executivos do banco. Hoje deve se encontrar com FHC. O problema é grave. Se não fizermos nada, pode haver uma tragédia. O nosso jogo aqui é prevenção, disse o secretário de Segurança Pública do Estado de São Paulo, Antônio Correia Meyer. O secretário comandou ontem uma reunião sobre segurança nos estádios, na sede da Federação Paulista de Futebol. Estiveram presentes dirigentes do futebol, e autoridades policiais civis e militares. Cornélio Pires -- Folclorista e contador de causos (1884-1958), foi o primeiro produtor independente de discos do Brasil. Raul Torres & Florêncio -- Dupla que atuou dos anos 40 aos 60 e criou um estilo romântico de interpretação. Reis não quis dizer se também abandonará Esperidião Amin. Um acidente envolvendo um caminhão e um ônibus, na última sexta-feira, matou 16 estudantes, três motoristas e deixou nove feridos no km 5,4 da estrada, que é chamada de rodovia da morte. De acordo com a assessoria, o secretário dos Transportes, Márcio Ribeiro, determinou que os engenheiros do Departamento de Estradas de Rodagem (DER) apressassem os estudos para a duplicação. Crianças norte-americanas encontram em maio mais dois programas educativos em CD-ROM. Through the Woods (Pelo Bosque) e At the Seashore (Na Praia) foram desenvolvidos pela IBM. Through the Woods é voltado para crianças que estão no primeiro ano do 1º grau. Cada um dos produtos deve custar US$ 329. Incluem ainda um outro CD, que serve como guia para uso em sala de aula. A maior parte da produção (250 t) já está comprometida com uma indústria de processamento da polpa. O restante será comercializado em supermercados, feiras e indústrias de sorvetes do Paraná. O fazendeiro, que cultiva também 1 hectare de rosas, diz que optou pelo morango após experiências nem sempre vantajosas com outras culturas. Eu tenho um projeto, que vou revelar aqui pela primeira vez, que tem a assinatura de um dos homens mais inteligentes e criativos do mundo, Eliezer Batista. Ele está me municiando e me dando o instrumental de que careço para imaginar a solução dos problemas do Estado. Schumacher era a esperança de conferir alguma emoção a um Mundial que já parecia ter dono. E emoção significa público, o qual atrai patrocinadores. Depois, com a morte de Senna, vieram as exigências por medidas de segurança na F-1. Mas a bordo de um 'cadillac' preto conversível, o presidente Menem atravessou a pista do parque da Sociedade Rural Argentina sob os aplausos da platéia de cerca de 10 mil agricultores. Não chegamos ao paraíso, mas já conseguimos sair do inferno, disse Carlos Menem da tribuna de Palermo. Ontem, os bancos já se adequaram às novas regras. Todas as linhas foram travadas em três meses, disse Mário Luna, gerente de Departamento de Crédito e Financiamento do Bradesco. Luna afirmou que é cedo para fazer uma avaliação completa do impacto das novas medidas. O técnico Jair Pereira deve escalar um time de reservas amanhã, às 17h, contra o Flamengo, em Caio Martins. Quero todos totalmente recuperados para as quartas-de-final, afirmou o treinador corintiano.
Ele considerou surpreendente o desempenho do russo Alexander Popov. O ex-jogador defendeu tambm a adoo de penas para menores de 18 anos que cometam crimes. Ningum, por mais criana que seja, pode ir para o estdio com uma arma. A participao aumentaria 3% at meados de 95. O trabalho da publicao inglesa, divulgado pelo Financial Times, indica que h menos preocupao com riscos polticos e econmicos do que com a falta de liquidez e diversificao. No caso de Alvarez, a intimidade com os EUA levou-o a conhecer a mentalidade norte-americana, seus gostos, seus costumes. De certa forma, afiou-o para a luta em que se engajaria a partir dos anos 60. desse combate, da maneira como ele chegou a criar as formas que geraram uma obra original, que O Olho da Revoluo tira seu interesse. O livro chega num momento em que a idia de cinema engajado est meio por baixo. Cuba tambm est com a imagem um tanto abalada (crise, xodo por mar etc). Mas justamente por isso O Olho da Revoluo chega num momento apropriado. perto de casa e tranquilo, diz Rossi, em meio a goles de refrigerante em lata, dentro de seu Opala estacionado atrs de uma igreja. Folha -- O estado de stio seria um golpe? Serra -- Seria e o foi apresentado assim, numa reunio da qual participei. Nela, o presidente confidenciou no acreditar mais que terminaria o seu mandato. Foi seis meses antes do golpe. As dificuldades que bancos federais e estaduais provavelmente enfrentaro reforam as dvidas quanto convenincia do setor pblico ser proprietrio de tantas instituies financeiras. O Brasil dispe de um dos mais modernos sistemas bancrios privados do mundo. O fato de que os bancos pblicos venham a ser, recorrentemente, fontes de prejuzo vem s reforar as desvantagens da atuao do Estado nesse setor. Na Bblia, h duas histrias que tentam explicar a origem de tantas lnguas. Essas histrias so chamadas de mitos. A histria da Torre de Babel (Gnesis, captulo 11, no Antigo Testamento) diz que a Terra tinha uma s lngua e um s modo de falar. Ento, os homens encontraram um lugar plano e resolveram construir uma torre que chegasse at o cu. Trata-se de uma estratgia de inflao reprimida, pronta para rebelar-se pela via da exploso cambial no momento em que os agentes econmicos se convencem de que no sustentvel, e em que os capitais comeam a fugir. Ela dever inviabilizar a atual oportunidade de estabilizar afetivamente a economia brasileira e, de quebra, dever fazer uma srie de estragos irreversveis, que em muito debilitaro seu potencial de desenvolvimento a longo prazo. O PRN no tem mais candidato Presidncia da Repblica. O partido expulsou ontem Walter Queiroz por considerar que ele faltou com a verdade no episdio da dupla declarao de renda. Depois de registrado, Queiroz apresentou uma segunda declarao ao TSE (Tribunal Superior Eleitoral), acrescentando bens que alegou ter esquecido nos primeiros papis que foram apresentados. No h diferena essencial entre as armas individuais do trfico e as do Exrcito. Elas pertencem basicamente categoria fuzil automtico, isto , um hbrido surgido na Segunda Guerra entre os tradicionais fuzis e metralhadoras de mo ou submetralhadoras. Fuzis disparam munio mais poderosa e a um alcance maior, de at um quilmetro. J as submetralhadoras disparam balas semelhantes s dos revlveres, mas podem faz-lo em rajadas, a um alcance curto, de dezenas de metros. Nesta ala merece meno o seu Bruto Ros, um espumante elaborado com uva Baga (100%) que conserva um pouco da cor e muito de frutado desta variedade tpica da regio. 
Este um vinho bem seco e encorpado capaz de escoltar uma refeio completa. O de 91, tem bolhas finas, abundantes e persistentes, que lhe do um paladar cremoso marcado como o aroma por um toque atraente de cascas ctricas e framboesa. J no captulo dos goles decididamente rubros, desponta o Joo Pato (75% Baga, 25% Cabernet Sauvignon). Quem experimenta a ltima verso (safra 91) descobrir um vinho novo ainda, adstringente, mas de aroma e sabor intensos. Um vinho marcado pelo belo contraponto entre a especiaria da Baga e as frutas vermelhas da Cabernet, quase uma marca registrada na criao deste vinicultor portugus. A atacante Ana Moser (26 anos, 1,85 m e 70 kg), revelou ontem que, depois do vlei, sua segunda paixo o futebol. Quando era menina gostava muito de jogar futebol com meu irmo, disse a jogadora, que, ao contrrio do vlei, onde atua como atacante, prefere jogar como quarto-zagueiro no futebol. Anteontem, o Banerj, que administra a dvida pblica estadual, no conseguiu que o Banco Central negociasse R$ 84 milhes em ttulos estaduais, trocando-os por ttulos federais. o que se chama de alavancagem com um emprstimo de US$ 10, por exemplo, pode-se entrar num jogo de US$ 100 ou mais. Se a aposta for errada, o risco de bancarrota uma forte possibilidade. A expanso dos derivativos no foi gratuita. Os economistas alinham vrios motivos bsicos, entre eles a possibilidade de inverso aberta com a queda dos regimes socialistas, a onda liberalizante que varre a Amrica Latina, a forte expanso econmica dos pases do Sudeste Asitico e tambm a baixa e estvel remunerao de tradicionais aplicaes. Nos EUA, os CDBs, at antes da alta dos juros, rendiam 3% ao ano para uma inflao de 2,5%. O IRA (Exrcito Republicano Irlands, separatista) admitiu o envolvimento de seus militantes no assassinato de um funcionrio do correio durante um roubo na cidade de Newry (Irlanda do Norte). No ltimo dia 11, Frank Kerr, 53, foi morto com um tiro na cabea quando trs homens assaltaram um posto do correio. Os trs levaram cerca de US$ 210 mil. Eles so a razo de tudo o que fao, de tudo aquilo em que acredito, disse, com grandes espaos entre as palavras. Tentava, a custo, firmar a voz, sob flashes dos fotgrafos. Ricupero, na verdade, foi o ltimo do grupo que o acompanhava familiares e assessores a sucumbir emoo, pelo menos publicamente. Vice de FHC, Maciel nunca usou tanto a expresso ternurar. Ele a emprega quando tem que dizer no a uma pessoa sem desagrad-la no caso, a seus colegas de PFL que querem repartir os cargos com o PSDB antes do 1 turno. Quem adere depois da vitria fisiolgico. Como melhorar o ensino superior sem transformar as universidades em escoles, onde os professores s repetem e se afastam da pesquisa? Essa foi a principal questo do 1 dia de debate, do qual participaram os professores Jos Arthur Giannotti (Departamento de Filosofia da USP), Jos Augusto Guilhon Albuquerque, (Departamento de Cincia Poltica da USP), e Luiz Pinguelli Rosa (diretor da Coordenao de Programas de Ps-Graduao da Universidade Federal do Rio de Janeiro). O bom uso dos modems e fax/modem depende dos programas para gerenciamento e transmisso de dados que acompanham os equipamentos. Todos os fax/modem saem de fbrica com dois programas um para gerenciamento das transmisses dfa e outro para gerenciamento das transmisses de dados. Na delegacia, que fica no Jardim Mutinga trabalham seis policiais com dois carros. A Caravan foi queimada. Resta um Gol. Os outros carros incendiados so trs Passats, um Chevette, dois Fuscas e um Escort. 
Somos quercistas no atual momento. Ele desenvolvimentista, defende a nao soberana contra o imperialismo, representa as foras populares. Temos vrtebra poltica, no somos gelia. O PT no tem estatura ideolgica para enfrentar o candidato conservador. Por isso est como cego em tiroteio. Pesquisa da Cia. City de Desenvolvimento mostra que os empresrios procuram locais com infra-estrutura, facilidade de acesso, boa oferta de mo-de-obra, prximos de So Paulo e das rodovias e longe de movimentos sindicais. Temos problemas com clientes e fornecedores que no conseguem encontrar a empresa, afirma Srgio Ueta, gerente administrativo da Ueta Indstria e Comrcio de Aparelhos Eletrnicos. A empresa, com sede em Caieiras (35 km a norte de SP), pretende se mudar para o Empresarial Jaragu, onde adquiriu 6.000 m2. Ao mesmo tempo o atual papado esmagou a teologia da libertao, teorias de concorrncia feroz so implantadas em toda parte, o fascismo est de volta, Fidel Castro se tornou uma espcie de Somoza e at Martin Luther King, ficou provado, era plagirio. No plano dos costumes, a reviravolta se apresenta sob aspecto viral, epidmico. Os outrora odiosos valores de famlia, sucesso e consumo renascem, sobretudo entre a populao jovem. Sexo livre, fumo e droga esto condenados. O narcisismo assume uma feio cada vez mais frvola. O substitutivo d aos ministrios da Fazenda e Justia poder de pedir interveno judicial nas empresas acusadas de praticar aumento injustificado de preos. Os senadores que faltarem s sesses do Congresso revisor, a partir de hoje, tero cortes no valor de um dia de trabalho por falta. O desconto seria de CR$ 83 mil por dia, com base nos vencimentos de fevereiro. A deciso foi anunciada na tarde de ontem pelo presidente da reviso constitucional, senador Humberto Lucena (PMDB-PB), que tomou a deciso mesmo tendo em mos parecer contrrio da sua assessoria jurdica. Lucena se declarou pressionado pela imprensa e pela necessidade de buscar o qurum nas sesses da reviso. Segundo o senador, a deciso quanto ao corte dos salrios dos deputados a deciso cabe ao presidente da Cmara, deputado Inocncio de Oliveira (PMDB-PE). Inocncio j havia anunciado a adoo do corte de um 30 avos dos salrios dos deputados por falta em sesso da Cmara. Os primeiros descontos aconteceriam na folha de pagamento deste ms, que se encerra dia 15. At as 20h, ele no havia se pronunciado sobre a extenso da medida tambm aos trabalhos da reviso. O vice-presidente da Cmara, Adylson Motta, disse que o regimento da Cmara impede o corte. Favorveis ao pressuposto da integrao, devemos ser criteriosos, no descartando e nem acatando, de pronto, tudo o que nos oferece o governo federal. Os exemplos de Simon e Brizola revelam que a precipitao e a busca de resultados polticos imediatos constrem o pior caminho para o desenvolvimento e so vcios que o curso da histria no tarda a desnudar. RICARDO ANTNIO SILVA SEITENFUS, 45, doutor em Relaes Internacionais pelo Instituto de Altos Estudos Internacionais em Genebra, coordenador do curso de mestrado em Integrao Latino-americana da Universidade Federal de Santa Maria (RS). Foi secretrio especial para Assuntos Internacionais do Rio Grande do Sul (governo Pedro Simon). O avio em que o presidente da Argentina, Carlos Menem, viajava no ltimo dia 30 de dezembro para Anillaco, sua cidade natal, iria ser atacado por um mssil, disse ontem o jornal de Buenos Aires Ambito Financiero. A segurana teria sido avisada por dilomatas e desviado a rota do avio. O governo nega a suposta tentativa de atentado. 
Comearam ontem pela manh em Nova York os desfiles de prt--porter outono-inverno. A primeira estilista foi a Donna Karan, com a linha DKNY. Os desfiles se realizam pelo segundo ano em tendas ou pavilhes localizados no Bryant Park, que fica atrs da Biblioteca Nacional, na rua 40. Deputados ligados a ACM diziam ontem, em conversas com colegas, que Benito Gama (PFL-BA) foi vetado para ser vice de FHC porque o grupo avalia que so cada vez mais reduzidas as chances de vitria do tucano. Ontem, vrios operadores da Bovespa creditavam a queda do ndice da bolsa percepo do mercado financeiro de que o PFL leia-se ACM est desembarcando da candidatura de FHC. As constantes desavenas com seu pai, John Paul Getty, fizeram John Paul Getty 2 entrar no affaire. As Trs Graas. A clebre escultura de Antonio Canova, atualmente na Gr-Bretanha, est sendo cobiada pelo museu Getty, de Malibu, na Califrnia. Hoje a mdia brasileira de cerca de 1.800 quilos por hectare. Segundo os produtores, o incentivo pesquisa elevar este desempenho para 2.400 kg/ha. As possibilidades do trigo irrigado tambm so animadoras. O plantio no Cerrado atingiu 5.000 kg/ha nas lavouras comerciais e mais de 8.000 kg/ha em reas experimentais. Os quatro grandes jornais deram, na segunda-feira, a mesma manchete para informar como acabou o Grande Prmio do Brasil, que aconteceu domingo passado. Era Senna erra e Schumacher vence (em O Globo, a frase ficou pouco maior que isso). Com essa constatao enviesada de que o alemo s chegou em primeiro porque o brasileiro cometeu uma bobagem, a imprensa coroou o festival sennista da semana anterior, quando todas -- sem exceo, todas -- as coberturas apontavam a vitria de Ayrton Senna em Interlagos. A Folha chegou a escrever que o melhor piloto do mundo (Senna), na direo do melhor carro do mundo (a Williams) teria sua consagrao no autdromo. E que s Schumacher acreditava em sua possibilidade de vencer. Senna, todo mundo viu, rodou quando estava em segundo lugar e com poucas chances de recuperar o primeiro. Enterrou as previses mais do que otimistas, e deixou ver quanto viciado o noticirio esportivo. O compromisso com a preciso (e, por extenso, com o leitor) vale menos do que a torcida da imprensa nessas horas. A imprensa j se esqueceu de que os oligoplios so o vilo da URV e do plano FHC. O pas convive com o novo indexador h um ms, o ministro virou candidato e os preos chamados de abusivos continuam em vigor. O assunto saiu da pauta, ainda que esteja dentro do bolso do leitor. JUNIA NOGUEIRA DE S a ombudsman da Folha. A ombudsman tem mandato de um ano, renovvel por mais um ano. Ela no pode ser demitida durante o exerccio do cargo e tem estabilidade por um ano aps o exerccio da funo. Suas atribuies so criticar o jornal sob a perspectiva do leitor recebendo e checando as reclamaes que ele encaminha Redao e comentar, aos domingos, o noticirio dos meios de comunicao. Cartas devem ser enviadas para a al. Baro de Limeira, 425, 8 andar, So Paulo (SP), CEP 01202-001, a. c. Junia Nogueira de S/Ombudsman. Para contatos telefnicos, ligue (011) 224-3896 entre 14h e 18h, de segunda a sexta-feira. No ranking dedicado aos homens aparecem o ator Sean Connery e o cantor Sting. Entre os corpos mais feios est o da top-model Kate Moss, que segundo o jornal tem ombros cados e o peito liso. Os deputados que negociam o plano de estabilizao econmica do governo criticaram o ministro Fernando Henrique Cardoso, que admitiu ontem a possibilidade de disputar a Presidncia da Repblica. 
A declaração do ministro repercutiu mal também entre os líderes políticos no Congresso. A frase do ministro desagradou até mesmo ao PSDB. Para o plano econômico, não ter dito essa frase seria melhor, afirmou o senador Beni Veras (PSDB-CE). O São Paulo começou o primeiro tempo apático, mas acordou aos 10min com uma falta não marcada do lateral Mac Allister em Euller, quando este ia entrar na área. Em seguida, Euller fez mais duas jogadas. A partir dos 22min, o São Paulo passou a marcar sob pressão e anulou o adversário. O Boca não conseguia sair jogando e perdia bolas em seu campo. O autor do ataque seria Nagi Mohamed Mustafa e o líder do grupo seria Basseem Khalil, morto num café do Cairo (capital) após tiroteio contra a polícia. O atentado teria sido planejado pelo grupo radical islâmico Gama'a al-Islamiya, que lançou campanha contra o governo egípcio em 1992 para a criação de um Estado Islâmico fundamentalista. Khalil e Mustafa, disfarçados, tentaram matar Mahfouz um dia antes, mas ele não estava em casa. Muitos negócios foram iniciados ou fechados durante o Siaf. Produtores da Paraíba, por exemplo, venderam abacaxi a um grupo de empresários espanhóis, no valor de US$ 323 mil. A cooperativa de Curaçá (BA), no Vale do São Francisco, praticamente acertou uma joint venture com um empresário português para produzir melão e uva. Empresários alemães avançaram nas negociações para instalar unidades de beneficiamento de polpas de frutas. Pode-se dizer, em outras palavras, e a grosso modo, que o aluguel do primeiro mês de reajuste sempre embute (por estimativa) uma inflação que reduzirá o aluguel nominal ao seu valor real na metade do período de reajuste subsequente. Portanto, na primeira metade, o locador sai ganhando, pois recebe mais aluguel do que vale a locação. Na segunda metade, a vantagem é do inquilino, que paga menos aluguel do que vale o uso do imóvel. A média no período de reajuste equilibra as vantagens de cada contratante durante todo período de uniformidade nominal do aluguel. Tudo isso deve ser considerado para que se entenda que uma exata transposição do aluguel praticado em cruzeiros reais para o sistema da URV só é possível se os contratantes procederem a uma extração da média dos aluguéis nos últimos meses que corresponderem a periodicidade de reajustamento do contrato, para, só então, transformar tal média em URVs. Folha -- No caso de Vereda da Salvação, como foi esse trabalho conjunto? Serroni -- A peça previa três casas numa clareira de uma floresta. Depois de conversar com Antunes, sempre faço uma maquete. Como a peça trata de religião, de misticismo, achamos que devíamos usar muitos troncos, dar uma verticalidade, o que tem a ver com a ascensão etc. Decidimos então, por uma questão de economia cênica, abolir uma das casas e só insinuar as outras duas, fazendo apenas suas entradas, que parecem as de um templo. Isto foi fundamental, pois conseguimos aumentar nossos criatórios e agora usamos como garanhões selecionadores apenas aqueles realmente de ponta, diz. O resultado positivo da estratégia pode ser comprovado agora, dizem os criadores, uma vez que hoje são os EUA que estão buscando cavalo árabe no Brasil. Os franqueados brasileiros começam a utilizar uma nova arma para aumentar seu poder de influência junto à cúpula das empresas franqueadoras. Trata-se do conselho de franqueados, que começa a surgir em algumas redes como McDonald's, Água de Cheiro, Multicoisas e Localiza. Já bastante difundidos nos EUA, os conselhos têm atenuado problemas e melhorado o tradicionalmente difícil relacionamento franqueado X franqueador.
Atuando em conjunto, os franqueados ganham fora e passam a ter voz ativa em questes vitais para o negcio, como definico de produtos, prazos de pagamento e estratgias de propaganda. A liminar, ainda no cumprida pelo governo do Esprito Santo, determina que o delegado responsvel pelo caso, Francisco Badenes, seja reconduzido ao cargo. Em maio ltimo, Badenes foi afastado e transferido para o interior. O juiz determina ainda o restabelecimento dos meios necessrios ao funcionamento da Comisso de Processos Administrativos Especiais, responsvel pelo inqurito. O lder Tasso Jereissati (PSDB) passou de 61% para 58%, perdendo trs pontos percentuais em relao ltima pesquisa. No deve haver segundo turno no Estado. Wilson Barbosa (PMDB) subiu seis pontos, atingindo 55%. O candidato tambm deve vencer a eleio no primeiro turno. A margem de erro da pesquisa Datafolha de 3,0 pontos percentuais, para mais ou para menos, exceto no Distrito Federal, que de 4,0 pontos. A direo do datafolha dos socilogos Antonio Manuel Teixeira Mendes e Gustavo Venturi. Nada do que eu havia lido ou ouvido antes de vir para c me preparara para as emoes que eu iria viver e vrias vezes me senti numa montanha-russa emocional. Em vrias ocasies eu me vi, como anotei em meu dirio, beira das lgrimas. Mas em muitos outros momentos eu me esforava para conter risadas. Ou ento lutava contra uma depresso repentina, tentava controlar minha raiva, fazia piadas com algum ou fazia fora para no sentir saudades de casa. Art. 1 Dispensar a obrigatoriedade da expresso de valores em cruzeiros reais constante dos incisos II e III, do art. 8, da medida provisria n 482, de 29 de abril de 1994, desde que, no caso de fixao dos preos em Unidade Real de Valor (URV) seja exposto, em lugar visvel e de fcil leitura, o valor da URV do dia. Art. 2 obrigatria a expresso dos valores em cruzeiros reais nas notas fiscais. As negociaes com os petebistas se prolongaram at o incio da madrugada de ontem. Alm do PSDB e do PFL, coligou-se em torno da candidatura Covas o pequeno PV. Mas at possvel encontrar pelo menos um ponto interessante no disco: algumas de suas letras, compostas por Michael Callahan e Marc English. Na faixa mais interessante do disco, Wooden Nails, a primeira, o tema abordado a superao da dependncia de drogas e da depresso. Uma melodia melanclica emoldura a cano. O fato que a sombra do R.E.M. uma presena muito forte nesse Building Our House. Os vocais de Callahan e English so calcados ao extremo nos de Michael Stipe e Mike Mills. O beat de certas canes acaba tornando-as filhotes das compostas pela banda de Athens, como Losing My Religion. Alm disso, deve lanar mais cinco novidades at o final do ano, alm de aumentar sua distribuio. Apesar dos esforos, estima fechar o ano vendendo 75 milhes de litros de sorvetes (US$ 120 milhes). Em 93 vendeu 83 milhes de litros (US$ 128 milhes). Ou na hiptese remota de sua votao antes de 30 de agosto. Para Holanda, no possvel a mudana pretendida pelos bancos. Por dois motivos: a eventual queda na arrecadao e problemas jurdicos que impedem mexer na base de clculo do imposto por meio de MP, j que ela foi fixada em emenda constitucional. O governador do Paran ligado politicamente ao ex-governador Roberto Requio que candidato ao Senado pelo PMDB. Requio um dos mais ferrenhos adversrios de Orestes Qurcia, que disputa a Presidncia da Repblica pelo partido. Enfim, nem tudo est perdido neste futebol automatizado e excessivamnete defensivo que a Copa est consagrando. 
Afinal, por mais que tentem matar o craque, ele sempre sobrevive, aqui ou ali, onde menos se espera. Alberto Helena Jr., 52, colunista da Folha. Sua coluna na Copa diria. O presidente argentino, Carlos Menem, disse que lanar sua candidatura para a eleio de 2003 se for eleito em 1995. Depois de dois mandatos, Menem no poderia ser reeleito em 1999. Espero quatro anos e volto a me apresentar, disse. Duas bombas mataram duas pessoas em Teer ontem, segundo a agncia oficial de notcias Irna. No foram apontados suspeitos pelo atentado embora tenha sido dito que a polcia achou pistas dos terroristas. A esfora incidia sobre o valor das propriedades, as quais estavam devidamente registradas nos diagramas pblicos. Tratava-se de um imposto sobre o capital. Outro tributo importante, e tambm sob a forma de contribuio voluntria, foram as trierarquias. Tratavam-se de contribuies para construir os geis barcos trirremes. Tanto a construo quanto a manuteno da equipagem corria por conta dos que tinham mais recursos. O Movimento Zona Sul foi criado no ano passado, durante a polmica a respeito da realizao das obras de prolongamento da avenida Faria Lima (zona sul). Comandado pelo advogado Luiz Antonio Siqueira Dias, o grupo defendeu as obras, que foram iniciadas recentemente. O Folha Informaes atendeu no ltimo fim-de-semana 3.978 pessoas, que acionaram o servio para saber o resultado de Guarani e Palmeiras. O pico de audincia aconteceu entre as 20h e 21h de sbado, quando o sistema atendeu 309 ligaes. O Folha Informaes um servio do Banco de Dados da Folha de S. Paulo (tecnologia da Telesis Sistemas em Telecomunicaes). Nenhum dirigente do PMDB compareceu ontem, em So Paulo, cerimnia de instalao do Instituto Ulysses Guimares de Estudos Polticos e Sociais. Ulysses morreu em 1992 como presidente nacional do partido. Orestes Qurcia e Luiz Antonio Fleury Filho no estiveram na sede da entidade. Foi tambm o caso de candidatos a cargos majoritrios ou dirigentes regionais. Os carros ganharam ainda proteo sob o crter e sob o tanque de gasolina, para suportar com menor risco os choques contra as pedras. A altura do solo foi ampliada em alguns centmetros para facilitar as investidas fora de estrada. A crise no Mxico deve fazer com que todos os pases emergentes recebam um menor volume de capitais externos de renda fixa (aplicados, por exemplo, em eurobnus). Ele acredita, no entanto, que o mercado acionrio brasileiro deve crescer em 1995. A instituio responsvel pela guarda (custdia) do equivalente a US$ 3,7 bilhes de investimentos estrangeiros nas Bolsas. Nos ltimos dias, houve uma sada de US$ 100 milhes da custdia da instituio, por causa da crise do Mxico. Meirelles afirma que esses recursos no saram de negcios tradicionais de Bolsas, mas sim de operaes de box (financiamentos tendo como lastro aes). Ele prev, contudo, que o mercado acionrio crescer e a custdia de aes de investidores estrangeiros na instituio chegar a US$ 5,5 bilhes ao final de 1995. Apesar de evitar dar um no definitivo, Marise deixou claro que deve recusar o convite de Brizola. Fiquei muito honrada com a proposta, mas o governador Brizola tem nomes muito bons para compor sua chapa, afirmou. O governo do Estado informou, atravs de nota de sua assessoria de imprensa, que as declaraes do presidente Itamar Franco e do presidente eleito Fernando Henrique Cardoso sobre medidas federais de combate ao trfico de drogas configuram posies anlogas s que o governador do Estado vem defendendo. 
Segundo a nota, as providncias capazes de alterar o quadro atual de combate aos traficantes dizem respeito Polcia Federal e s Foras Armadas, com o objetivo de interromper o fluxo de armas proibidas e cocana no Rio. Confiante numa eventual vitria do candidato Fernando Henrique Cardoso, a cpula do PSDB j articula a formao de um superpartido para no depender do PFL no Congresso. Os tucanos consideram perigoso deixar um possvel governo de FHC sujeito s manobras dos deputados e senadores pefelistas. Grava disse que os dois sofreram rompimentos de feixes musculares em msculos da coxa direita. O problema de Viola num msculo da parte de trs da coxa e de Clio est no lado interno. O mdico negou que esteja tratando mais algum. O Cobra privilegiava os atos impulsivos e as cores fortes e seus principais nomes foram Appel e Alechinsky. Doucet se inspirava muito em suas viagens e refletia isso no ttulo de vrios de seus trabalhos, como Guatemala Blues e Mostar Sarajevo. Tambm so obras suas Labyrinthe de la Lumire e Turbulences d'Abysses. O candidato do PDT disse que quer fazer o mesmo. Os grandes comcios sero realizados apenas em setembro. Ao contrrio do que fez em campanhas anteriores, Brizola quer promover comcios rpidos, com trs oradores. O filme passa como vento pelas questes. nada pequenas. dos tratamentos psiquitricos contemporneos e se fixa no que existe de mais bvio na histria. Isso termina por arrast-la destestavelmente, sem que se chegue a parte alguma. Paradoxalmente, por a que se podem ver as virtudes de Figgis. A ONU admitiu que os srvios no haviam cumprido as exigncias, mas recusou autorizao para bombardeio. Yasuhi Akashi, reprsentante da organizao, argumentou que os srvios estavam em meio ao processo de retirada. J nas primeiras palavras a Medida Provisria que institui a Unidade Real de Valor incorre em erro e abre brecha para contestaes na Justia. A constatao do jurista Saulo Ramos, ex-consultor Geral da Repblica e ex-ministro da Justia, que foi convidado a analisar o rascunho das medidas e na ocasio alertou para dois problemas: o artigo primeiro e o pargrafo primeiro da MP 433 praticamente instituem duas moedas no pas, o que proibido. Alm disto, a manuteno da Ufir para correo de tributos ilegal, segundo o jurista. Outros especialistas apontam mais defeitos tcnicos na MP. Diz o artigo primeiro que fica instituda a URV, dotada de curso legal para servir exclusivamente como padro de valor monetrio. Isto restringe URV todos os poderes da moeda do pas. um erro tcnico grave e que pode resultar num conflito judicirio muito grande, segundo Saulo Ramos. As negociaes para a volta de Mansell F-1 tiveram incio logo aps as 500 Milhas de Indianpolis, realizada no ms passado, e continuaram durante as 200 Milhas de Milwaukee, disputada no ltimo dia 5, quando recebeu a visita de alguns engenheiros da Williams. Quinto colocado no Mundial de F-Indy, com 46 pontos, Mansell disse que pretende continuar a correr na categoria at o final da temporada, condio que ele imps a Frank Williams para voltar equipe pela qual ganhou o ttulo mundial de 1992. Nos ltimos dias, aumentaram as especulaes sobre quem ocuparia o lugar de Mansell na Newmann/Hass. O mais cotado na bolsa de apostas o brasileiro Raul Boesel, que ontem no consegui terminar a corrida em Detroit. Eles estranharam o fato dele no sair do barraco de manh e chamaram a polcia. Jair morava sozinho no barraco e, segundo seus vizinhos, era alcolatra. O segundo caso aconteceu no Pari (centro). 
Um indigente conhecido como Daniel foi encontrado cado na rua Canind na madrugada de ontem por policiais que estavam fazendo patrulhamento. Quem que vai, ento? S intelectual quatro-olhos? S aquela turma do Cebrap? S socilogo e antroplogo? Socorro! Ento bom a Joyce Pascowitch ir se preparando. O Fernando Collor, evidentemente, no foi convidado para a posse. So what, no mesmo? Ele compensou em Aspen, dando um malho naquela flor de nome Rosane, casualmente, na frente de fotgrafos. Na alquimia medieval, a rvore simbolizava a transformao, sempre cercada de globos reluzentes que corresponderiam aos planetas. Para o psicanalista Carl Jung, esta seria a origem dos globos pendurados nas rvores contemporneas. E, para Bruno Bettelheim, as pequenas velas e lmpadas que a iluminam seriam vestgios das antigas fogueiras que os pagos do norte da Europa ateavam no alto das montanhas para antecipar a chegada do Sol e o fim do inverno. Mais 10%. Para 94, a indstria de mquinas agrcolas prev crescimento de 10%. Acordo na Cmara Setorial garante crdito de US$ 650 milhes para o Finame Rural. Em contrapartida, a proporo de empresrios no partido, que era de 22%, hoje alcana a marca de 54%. o caso, entre outros, do deputado Srgio Machado (Villejack Jeans), do senador Teotnio Vilela Filho (Mata Verde Agropecurio, Usina Seresta), do senador Albano Franco (presidente da Confederao Nacional da Indstria). Essa mudana no perfil social da bancada no ocorreu no maior partido conservador, o PFL, nem no PT, partido de esquerda. O produtor iniciou a colheita, em fevereiro passado, de 170 hectares de amendoim e 240 hectares de soja. Em 93, ele plantou 290 ha de amendoim e 97 ha de soja. Com o dinheiro do amendoim Guidi conseguiu comprar duas camionetes e um trator de 150 Hp, est construindo uma casa em Pontal e ainda tem para receber US$ 470 mil referente venda de 49,4 mil sacas. Parte desse dinheiro vai para a colheita deste ano. Trabalho com amendoim h 27 anos e sempre tive lucro. Nos primeiros dez anos, o lucro por safra alcanava 40%. Hoje, varia entre 12% e 15%. O Banco apresentou em 1993 um nvel de alavancagem alto, demonstrando maior agressividade nas suas operaes. Os nveis dos depsitos aumentaram 440%, em 1993, basicamente em funo de depsitos interfinanceiros e a prazo. MRIO ALBERTO DIAS LOPES COELHO consultor da Austin Asis. Existe ambiente poltico, social e econmico para se produzir automveis no Brasil? Sim, embora a abertura dos portos no esteja sendo feita impunemente. Em termos polticos, o pas saiu das eleies com um Estado mais vigoroso e democrtico, em melhores condies de enfrentar a inflao e o desemprego. Pode se reorientar para uma maior produo industrial e contribuir para a criao de mais emprego e riqueza, consequncia natural do processo. Do ponto de vista lgico, um plano de estabilizao equivale a uma promessa. O governo promete sociedade que, de agora em diante, vai mudar de vida vai garantir o equilbrio das contas pblicas e vai parar de abusar do seu monoplio de criao de moeda. Se isso ser cumprido na prtica, s o tempo dir. Mas, para que a promessa possa ser efetivamente cumprida, fundamental que se acredite que o governo ir de fato cumpri-la. Alm de brincar, uma experincia. Pela TV voc v. A diferena que no videogame voc sente. como se estivesse l, acrescenta Rodrigo. H outros apelos da fita que seduzem os gamemanacos, especialmente os fs de Ayrton Senna. No incio de cada corrida, por exemplo, as principais dicas do circuito so fornecidas pelo piloto. A imagem de Ayrton aparece em fotos digitalizadas. 
Um dos responsveis pela seleo das moas da Ford, que no quis se identificar, aponta o italiano Maldini (considerado pela imprensa europia o bumbum mais bonito da Copa), que recebeu apenas um voto, como o mais bonito. Este sim, seria modelo com certeza, disse. Os reservas da seleo brasileira Viola e Ronaldo e vrios componentes da Nigria tambm foram apontados. O Viola ou o Ronaldo seria um 'black man' perfeito disse um dos modelos masculinos da Ford, que tambm no quis se identificar. Esteve assim com o astro da conferncia, lorde Keynes um sujeito sutil, bom expositor, muito persuasivo, conta Campos. A delegao britnica era a mais forte em termos intelectuais, embora outros participantes tivessem dela uma opinio algo debochada. Corria uma piada segundo a qual Keynes era inteligente demais para ser consistente; Dennis Robertson, outra estrela, era consistente demais para ser inteligente; e Lionel Robbins, o terceiro nome de prestgio, no era nem inteligente nem consistente. Durante a negociao, ontem tarde, Muller telefonou para o presidente do So Paulo, Fernando Casal de Rey, e se lamentou. Ele reclamou de que no lhe pagariam o prometido, no lhe dariam um carro e que s bancariam seis meses de aluguel de sua casa e no os quatro anos, como esperava, disse o dirigente. Ela preferiu ser a primeira dama da ilha de Skorpios. Foi nesse papel que, encarnada por outra Jacqueline (Bisset), Jackie O. debutou na tela, 15 anos depois de ter-se transformado na mais pranteada e sedutora viva deste sculo. Merecia coisa melhor. Alm de muito ruim, o filme O Magnata Grego (The Greek Tycoon) dedicava mais ateno ao seu segundo marido, interpretado por Anthony Quinn, que alis faria o papel de Scrates Onassis (sogro de Jackie) no telefilme Onassis: The Richest Man in the World, produzido em 1988. O presidente da Portuguesa, Manuel Pacheco, deve entrar tera-feira com uma ao contra o Vasco da Gama. Ontem tarde, Pacheco disse que esperar at tera-feira para que o clube carioca envie uma cpia do seguro de vida previsto no contrato de emprstimo do meia Dener, morto tera-feira. Mas, no Rio, o diretor de futebol do Vasco, Eurico Miranda, confirmou que o seguro no foi feito. Segundo Miranda, a legislao brasileira probe que um seguro desta natureza tenha como beneficirio uma empresa ou associao. Resultado: presso nos preos combinada com a falta de produtos. Se a queda da inflao certamente produziria efeitos positivos na candidatura Fernando Henrique Cardoso, as prateleiras vazias municiam os adversrios. certo que, desta vez, o governo est mais atento do que no Plano Cruzado e vai usar, segundo o ministro Rubens Ricupero, todos os artifcios para evitar uma febre de consumo. Se vai conseguir, outro problema. Mais uma vez, se v que a deseducao o problema que mais sai caro ao Brasil. PS Realmente notvel a evoluo de Lula nos ltimos 15 anos. Recebi ontem por fax entrevista que ele concedeu em 1979 para a revista Especial, onde expe sua viso sobre a mulher. A gente no pode pensar em jogar a mulher no mercado de trabalho enquanto houver excesso de mo-de-obra. Alis, nessa entrevista ele, corretamente, defende a legalizao do aborto com argumentos razoveis e, agora, por motivos eleitoreiros, cedeu presso da Igreja Catlica. A campanha eleitoral encerrou-se na noite de quinta-feira, com um pronunciamento de oito minutos de todos os candidatos. A propaganda eleitoral nas redes de TV custou ao uruguaio mais de US$ 14 milhes (cerca de US$ 7,00 por cada eleitor). 
Pela lei, os partidos recebem recursos do governo de acordo com os resultados da ltima eleio. Acho importante notar que a Bienal mudou a maneira de conseguir patrocnio cultural no pas, que antes era feito base do me d um dinheiro a. Vendendo cada sala especial para um patrocinador exclusivo, demos a ele um retorno muito mais palpvel, com anncios em meios de comunicao e o abatimento de impostos. Tratamos a Bienal como empresa. Folha -- O nmero de visitantes pagantes da Bienal do dia da abertura (12 de outubro) at o dia 7 deste ms foi de 103 mil. Se a Bienal for at o dia 11 de dezembro, dever ter um pblico total de no mximo 250 mil pagantes, alm de quase 200 mil crianas, seguindo a projeo. O Sr. acha um bom resultado? Para o volante Dinho, um dos jogadores responsveis pelo setor de marcao do Santos, impossvel segurar o ataque do So Paulo do tcnico Tel Santana. A sada, diz, partir para o ataque. Sobre sua sada do So Paulo no incio do Campeonato Paulista o jogador diz que a questo assunto encerrado. Depois da Portuguesa, esta tarde no Canind, o Palmeiras far trs jogos seguidos em seu estdio: contra o Bragantino, tera-feira, a Ferroviria, quinta-feira, e o Amrica, domingo. Se vencermos esses quatro jogos, chegaremos a 80% de aproveitamento dos pontos disputados. Acredito que, mantendo esse percentual, seremos campees, disse Luxemburgo. Gianetti da Fonseca afirmou que a avaliao tem que ser feita por pares (pesquisadores da mesma rea), que no faam parte do grupo analisado. Freddy Tatoo vive atualmente em Bolonha e j est acostumado ao circuito internacional da tatuagem. Ele j tatuou, pelo menos, mil pessoas. Pelo grupo brazuca, ainda estaro presentes Tyes e Caio (Rio de Janeiro) e o especialista em body piercing, Andr Meyer (So Paulo). Eles vo ser indiciados por suposta corrupo de menores, favorecimento prostituio e crcere privado. O advogado Jos Carlo Dri, que defende os responsveis pela boate, se recusou a falar. WILSON BALDINI JR. O norte-americano Evander Holyfield coloca o ttulo mundial dos pesos-pesados (acima de 86,183 quilos), verso Associao Mundial (AMB) e Federao Internacional (FIB), hoje, contra o seu compatriota Michael Moorer. Quatro vereadores na cidade e o vice-prefeito afirmaram que votaro em Fernando Henrique Cardoso, embora seus partidos (PMDB e PPR) tenham candidatos prprios a presidente. Voto no Fernando Henrique porque ele fez o Plano Real, afirmou o vice-prefeito, Manoel Lopes Duarte, do PPR. A coreografia criada por Bil T. Jones para An American Evening chama-se I Want to Cross Over, com msica gospel cantada por Liz McComb. O cenrio, de Donald Baechler, se compe de pedaos de um barco, uma pequena casa e um tubo de chamin. Negro e portador do vrus da Aids, Bill T. Jones uma das expresses mais poderosas da dana contempornea. Agora, alm de dirigir seu grupo sediado em Nova York, ele tambm vai atuar como coregrafo-residente do Ballet da pera de Lyon, que acaba de nome-lo para o cargo, antes ocupado por Maguy Marin. Ex-bailarino do grupo de Trisha Brown, Stephen Petronio outra estrela do momento. Para An American Evening ele criou Extra Veinous, cujo ttulo significa o contrrio de intravenoso. A coreografia de Susan Marshall que o Ballet de Lyon dana hoje chama-se Central Figure, com msica de Philip Glass. Inspira-se no mais antigo e melhor bailarino de minha companhia, que morreu no ltimo vero, ela diz. 
O tema de Julia Child era uma simples quiche, mas era poltico: mostrar aos paranicos americanos que o ovo e a manteiga fazem pratos deliciosos e que a patrulha do colesterol deve dar espao ao prazer da comida. Marcella Hazan falou sobre alcachofras e catequizou o pblico sobre a supremacia do leo de oliva extra virgem para cozinhar, apesar da exorbitncia do preo. Patricia Wells ensinou o preparo de codornas marinadas e cuscuz marroquino e incentivava o pblico a repartir food experiences (o nome moderno para receber amigos para comer) como ela mesma faz em sua casa na Provence (sul da Frana), regio em moda nos Estados Unidos. Os irmos Gershwin eram to diferentes que se completavam s maravilhas. George fazia a msica, Ira a letra, embora, como em toda parceria, cada qual desse palpites na especialidade do outro. Primeiro, George fazia a msica; Ira capturava o esprito da cano e apunha-lhe ttulo e letra. s vezes sugeria a George uma mudana no andamento, para que este se adequasse melhor letra. Um caso tpico foi o de Someone to Watch Over Me (1926), que nasceu brejeira e Ira recomendou que ela se tornasse uma balada romntica. George era um dnamo, bomio e namorador; Ira era mais recluso e solidamente casado. George era indiferente politicamente; Ira era liberal, com inclinaes socialistas. A versatilidade do George como compositor s era igualada pela de Ira como letrista. Os dois eram bambas tanto nas canes mais rtmicas, alegres e humorsticas, como nas mais srias, romnticas e profundas. Segundo a administrao do aeroclube, a quadrilha conseguiu decolar s 4h, sob forte nevoeiro e sem iluminao. O avio do mdico Domingues Braille, dono de uma clnica em So Jos do Rio Preto. O jogador, sabendo que ganharia mais de US$ 250 mil, fez o que a voz ordenou. A roleta girou, e ele perdeu. Droga, disse a voz em sua cabea. Dividido em seis partes, o livro traz 32 captulos que procuram abranger todos os recursos do programa. bastante didtico e utiliza bem desenhos para exemplificar efeitos especiais. O texto curto e prtico e todos os captulos so recheados de notas, dicas e alertas, ilustradas com as respectivas figuras Note, Tip e Stop. Com direo de Roberto de Oliveira, a Bandeirantes prepara um documentrio sobre a Campanha Contra a Fome e o trabalho do rapaz. O especial dever ir ao ar no Brasil e em vrios pases poucos dias antes da eleio para o Prmio Nobel da Paz. O mais grave, no entanto, so os problemas que os clientes podero enfrentar. A quem recorrer no caso de um descomprimento do que prometido? Notamos ainda que, quando um passageiro brasileiro se inscreve num programa de milhagem de uma companhia area internacional que serve o Brasil, passados 30 dias, recebe a proposta da Iapa. Ou seja, se tal procedimento no for ocasional e tiver realmente a conivncia das transportadoras areas, estas esto expressamente contribuindo para a concorrncia desleal e ilcita perante os agentes de viagens alm de no preservar seus clientes, que no so consultados sobre se querem ou no receber tais propostas ou se autorizam a divulgao de seus nomes e endereos. Como partiu do PT o pedido de encontro com o presidente Nelson Mandela, o prprio partido assumiu os custos do trecho da viagem que inclui a frica do Sul. Segundo a tesoureira da campanha de Lula, Tatau Godinho, os custos se resumem passagem area, j que o candidato ficou hospedado na casa do embaixador do Brasil na frica do Sul, Antnio Amaral de Sampaio. Ela no tem os valores exatos pagos pelo partido. 
MAIS RETIRADOS: O nmero da esquerda representa a posio atual e o da direita, a posio na quinzena anterior. Consulta realizada nos dias 28 e 29/03/94 junto s locadoras Cine Arte Vdeo, Gentile Vdeo Hobby Vdeo, Over Vdeo, Real Vdeo, Vdeo Clube do Brasil, Vdeo Factory e Wolf Vdeo. RECOMENDADOS: avaliao feita a partir de fitas emprestadas por distribuidoras de todo o pas. Derrida -- A palavra engajado tem uma histria. Quando a gente se diz engajado, corre o risco de evocar modelos anteriores e o engajamento hoje deve encontrar formas novas. Mas o trabalho no Parlamento uma forma de engajamento, claro. Folha -- Obrigada pela entrevista. A notcia veiculada por esta Folha (em 10/10/1994) de que o presidente eleito Fernando Henrique Cardoso pretende propor o fim da unicidade sindical (sindicato nico representando a mesma categoria e na mesma base territorial, conforme est previsto no artigo 8, II da Constituio Federal de 1988) juntamente com o contrato coletivo de trabalho, demonstra sua coerncia com a modernizao das relaes sociais. Com efeito, os estudiosos da matria afirmam que a negociao coletiva de trabalho est diretamente relacionada com o modelo de organizao sindical (cf. Amauri Mascaro Nascimento, em Direito Sindical, editora Saraiva, 1989, pg. 313). No houve mortos. Cerca de 30 feridos leves foram atendidos na Santa Casa da cidade. O prefeito Mercedes Ribeiro de Miranda (PMDB) decretou estado de calamidade pblica na cidade, que ficou sem gua e luz durante 22 horas. Todos os anunciantes que aderiram idia dos clubes infantis esto atrs de um mercado de dimenses gigantescas: os miniconsumidores americanos na faixa de idade entre 4 e 12 anos gastaram no ano passado US$ 7,3 bilhes das suas prprias mesadas e influenciaram suas famlias a comprar mais US$ 130 bilhes. Tanto dinheiro parece tornar desprezvel a tica do marketing: a Delta recebe centenas de cartas de crianas denunciando pais ou parentes que preferiram viajar por outras companhias areas, configurando um claro estmulo ao dedodurismo precoce. Helcio Emerich jornalista, publicitrio e vice-presidente da agncia Almap/BBDO. Fabiane recebeu CR$ 35 mil para usar camiseta da candidata e gritar o nome de Qurcia. Deus me livre. No voto em Qurcia de jeito nenhum. Estou aqui a trabalho, disse. Ela pertncia a um grupo de cem garotas de Braslia contratado para fazer propaganda de Ana Paula Junqueira. Acho absurdo um poltico pagar algum para gritar seu nome, reclama. O tcnico irritou-se com uma pergunta e no a respondeu. O Brasil nunca esteve em apuros, segundo o tcnico. A frente do glaciar, que fica na beira do rio, desprende blocos de gelo do tamanho de um prdio de 30 andares. D para sentir a terra estremecer e o barulho parece o de um trovo. Fomos convidados a fazer um minitreking, ou seja, caminhar claando crampones (sapatos com pregos de ferro) sobre o glaciar. Durante o passeio, escutvamos os rudos da movimentao lenta deste enorme rio de gelo e contemplvamos os contrastes cromticos da superfcie congelada refletindo o cu azul. preciso ter em conta que a elevao das importaes com a reduo das alquotas estimada em 1% da pauta das importaes brasileira. Esse percentual razoavelmente pequeno porque significa US$ 250 milhes, disse. Amaral disse ainda que esse aumento de importaes decorre do aumento da demanda que se exerce sobre toda a oferta de produtos. ENFERMAGEM -- Acontecer no prximo dia 6, no Ces-Senac (av. Tiradentes, 822), seminrio sobre a caracterizao do pessoal de enfermagem no Estado de So Paulo. 
A presena deve ser confirmada at o prximo dia 5, pelo tel. 221-2155. OFICINA DE LIVROS -- A Oficina de Artes do Livro oferece 8 vagas para o curso Papel Artesanal: Processamento de Fibras Vegetais, que vai de 5 a 14 de abril. Informaes pelo tel. 212-2051. O documento tambm sugere a criao de uma sistemtica mensal de acompanhamento. Elvis morreu quando entrou para o exrcito. A caixa com cinco CDs Elvis: From Nashville To Memphis enterra em parte o velho mito explicitado pelo beatle. O pacote traz as gravaes essenciais do cantor na dcada de 60, exatamente depois de sua baixa no exrcito norte-americano, em 6 de maro de 1960. Esses so os anos em que Elvis deixou de ser o rei do rock'n'roll para tornar-se o smbolo pr-fabricado de uma gerao que sonhava com Honolulu e pracinhas. Nessa poca, o cantor forjou alguns de seus maiores sucessos como, para ficar num s exemplo, It's Now or Never. Tambm nesse perodo Elvis Aaron Presley sucumbiu ao monstro chamado Elvis Presley. O cantor no esteve livre do assdio e do culto de seus fs nem mesmo durante os dois anos em que esteve ligado ao exrcito. E justamente por isso que no se deve parar, preparando terreno ao futuro presidente. Alm de aumentar o nmero de cassaes, deve-se estimular as demais CPIs sobre Empreiteiras, financiamentos eleitorais e CUT a CPI da CUT ter o dom de trazer tona possveis desvios da burocracia e da praga do corporativismo, o que, acreditem, vai acabar ajudando no apenas o PT mas a democracia. Mas a roubalheira apenas a ponta do lamaal: o problema essencial, a grande delinquncia, a baixa taxa de seriedade e compromisso dos homens pblicos, responsvel por estarmos atolados em nveis jamais vistos de misria e violncia e, como diz o salmo, a que no se pode estar em p. PS Milagre mesmo gente como Joo Alves trocar a leitura de volantes de loteria pela Bblia quem sabe o poder pblico no o ajuda, dando-lhe uma cela especial para aprofundar tais leituras. Vocs so pessoas de um Brasil que tem sinais da renovao. Se Roseana, os senadores, deputados e prefeitos esto me apoiando no porque tenhamos firmado qualquer compromisso pessoal. Jamais ningum me pediu nada. Mas, nada mesmo, disse FHC. s 16h, antes de embarcar para a cidade de Cod (MA), onde seria realizado um comcio s 18h de ontem, Fernando Henrique afirmou que vai aguardar o posicionamento do senador Jos Sarney (PMDB-AP) em relao s eleies presidenciais. Folha -- Como voc recebeu a notcia de que seria substitudo? Giovane -- Normalmente. Seleo isso, joga quem est melhor, no tem esse negcio de nome. A melhor equipe que deve jogar. ... casos positivos de dengue em Monte Aprazvel (38 km de Rio Preto-SP) foram divulgados ontem pelo Ersa (Escritrio Regional de Sade) de Rio Preto. A regio vive uma epidemia da doena. A cidade de Monte Aprazvel tem 68 casos confirmados. Rio Preto tem trs casos positivos. Os policiais federais de Mato Grosso do Sul entraram em greve ontem, em adeso ao movimento iniciado no Distrito Federal. A reivindicao de equiparao salarial com a Polcia Civil do DF, o que representaria um reajuste de 300%. Em Mato Grosso, a paralisao est prevista para comear hoje. Todas receberam beijos de Itamar. Fu foi a mais calorosa. Beijou, abraou duas vezes e conversou por alguns minutos com o conterrneo presidente. Eu torci muito por voc, viu?, disse Itamar a Fu. Cada usurio pode importar programas at o valor de US$ 200 mil. Os pedidos at US$ 20 mil dispensam guia de importao. Se o valor da compra for baixo, melhor pedir encomenda por correio normal. 
Pelo correio expresso o frete custa prximo de US$ 50. Se no quiser se aventurar pela importao direta, existem empresas, como a Brasoftware (tel. 011/253-1588) que importam programas sob encomenda. Segundo Ricardo Jordo, gerente de marketing da Brasoftware, o produto -- entregue no mximo em 15 dias -- sai por um preo similar importao feita pelo usurio. A taxa de servio da Brasoftware coberta pelo desconto que obtm junto ao fornecedor. ARRIET CHANIN -- A artista mostra 15 monotipias trabalhadas em papel artesanal de folha de bananeira e 12 gravuras em metal, que so instrumentos musicais e conchas. De seg a sex das 11h s 19h e sb das 10h s 13h. Preos das obras: de R$ 50 a R$ 500. At 30 de setembro. As novas impressoras a laser da HP vm com um novo padro de velocidade 12 pginas por minuto (ppm) e so de 30% a 40% mais rpidas que as da gerao anterior. As LaserJet 4 Plus e 4M Plus substituem os modelos 4 e 4M. Tm resoluo de 600 x 600 pontos por polegada (dpi), o que permite obter imagens com mais definio. O empresrio e corretor de imveis, Roberto Capuano, foi reeleito pela terceira vez presidente do Conselho Regional de Corretores de Imveis do Estado de So Paulo (Creci). Ser a sua quarta gesto frente do rgo. Enquanto Gloria Pires ficou nervosa em sua estria como modette, o marido Orlando Moraes amou e pediu bis. De acordo com a Enciclopdia Britnica, o linchamento uma prtica que se verifica em momentos de instabilidade ou de ameaa de anarquia. A sensao de insegurana e a debilidade do poder pblico eram sem dvida caractersticas dos Estados Unidos do final do sculo 18, quando (segundo a verso mais aceita) a palavra teria sido inventada a partir da prtica adotada por um juiz Lynch, de condenar e executar seus adversrios sem o devido processo legal. Lamentavelmente so tambm caractersticas de diversas regies do Brasil de hoje. O brbaro linchamento de trs pessoas ocorrido esta semana no Paran pode ter sido particularmente chocante e sem dvida o foi, filmado e transmitido pela televiso em toda a sua brutalidade, mas constitui apenas mais um na lista dos crimes desse tipo registrados no pas. Apenas na Bahia, por exemplo, o comando da Polcia Militar estadual informa que foram 350 casos nos ltimos quatro anos. Formada por tcnicos da Fazenda e das Minas e Energia, a comisso tem prazo de 60 dias para conluir o trabalho. O objetivo e melhorar a rentabilidade do setor. Segundo o ministro das Minas e Energia, Alexis Stepanenko, essa melhoria dever ser buscada atravs da reduo de custos e aumento de eficincia. No dia sete de dezembro de 1941, avies japoneses praticamente destruram a frota norte-americana no Pacfico, ancorada em Pearl Harbour, no Hava. Os EUA, depois de um histrico discurso de seu presidente, Franklin Delano Roosevelt, entraram em guerra contra o Japo e seus aliados no Eixo, Alemanha e Itlia. O chamado mundo livre (que ento inclua tambm a URSS) unia-se contra o nazi-fascismo. E o Brasil? No Departamento de Estado, em Washington, desconfiava-se de Getlio, ditador com uma poltica em certos momentos prxima ao fascismo mussoliniano. Vargas, na verdade, preferia ficar de fora, ou aliar-se com os vencedores. A Argentina, por exemplo, era bem mais pr-nazi do que o Brasil. Welles foi mandado como uma espcie de embaixador cultural. No foi o nico. E ele gostava de Roosevelt e detestava os nazistas. J estou trabalhando com as principais figuras da vida poltica do pas com senadores e secretrios de governo. 
Agora estou trabalhando no desenvolvimento de um programa para aumentar a contribuio das polticas pblicas para o capital social. Mas so necessrias estratgias da base da sociedade. A substncia tambm usada em vrios medicamentos vendidos no Brasil, que dizem ser capazes de aliviar dores causadas por varizes e eliminar cimbras noturnas em pessoas que sofrem problemas de circulao nas pernas. Levantamento realizado entre pacientes que ingeriram cpsulas com a substncia entre 1969 e 1992 indicou que 16 pessoas podem ter morrido em consequncia de contra-indicaes causadas pelo sulfato de quinino. Conforme acordo autorizado pelo BC, o saldo da dvida do Estado com a Nossa Caixa de US$ 1 bilho ter que ser pago este ano, em doze parcelas mensais, atualizadas e com encargos. Corresponde a duas vezes e meia o patrimonio liquido da instituio. O Banespa carrega US$ 8,1 bilhes de crditos do setor pblico. Segundo o detetive Paulo Arajo, a delegacia recebeu um telefonema de homens que se identificaram como traficantes do morro Azul (no Flamengo). Segundo o detetive, eles ameaaram invadir a delegacia, no horrio de visita, para resgatar dez companheiros. Os fundos de commodities projetam para este ms, na mdia, rentabilidade bruta de 3,72%, segundo a Anbid. A rentabilidade lquida, para saque em 1 de setembro, vai depender da variao da Ufir. Por enquanto, este indexador projeta variao de 1%, com o que os fundos de commodities renderiam 3,03%. Os norte-americanos que se reuniram ontem na Cmara de Comrcio Brasil-Estados Unidos, em So Paulo, ficaram conformados com a derrota de sua seleo. J estamos contentes por ter chegado s oitavas-de-final, disse Phillip Trent, 24, que assistiu o jogo ao lado do amigo Brian Fagerburg, 24. Ambos seguraram uma bandeira dos Estados Unidos durante a partida. O desembargador Doreste Batista, convidado pelo governador do Rio, Nilo Batista, para ser o supersecretrio de Segurana do Estado, disse ontem que, se confirmado, vai pedir apoio do Exrcito para subir morros e colocar policiamento ostensivo nas ruas. At o meio-dia de ontem, o desembargador estava tentando um contato com o governador para definir se aceitava ou no o cargo. Ao lado do ala Charles Barkley, Shaq hoje um dos maiores astros do basquete mundial. piv do Orlando Magic, equipe da NBA, liga profissional de basquete dos EUA. Conquistou o ttulo mundial deste ano com a seleo de seu pas, o Dream Team (Time dos Sonhos) 2. Enfermeiro uma designao vlida apenas para quem concluiu o curso superior de enfermagem. Os outros profissionais da rea so tcnicos, auxiliares e atendentes. O tcnico tem o segundo grau completo pode optar entre o curso tcnico de enfermagem ou curso de formao de tcnico. Os limites da privatizao so as necessidades do governo. Nada est excludo, afirmou Montoro Filho. Para ele, a privatizao entende desde a venda de ativos pblicos at aes de estatais. Disse que o governo vai vender desde automvel at participao acionria em empresas de energia eltrica. O dinheiro das vendas pode ser utilizado tambm para o pagamento da dvida do Banespa. E tudo, segundo a Jovem Pan, para fazer proselitismo, para as promessas de sempre. Fernando Henrique apareceu no rdio e na televiso dizendo que no existe nada contra o seu vice. E que, portanto, nem pensa numa troca de Guilherme Palmeira. Uma defesa bem parecida com aquela de Lula, no comeo das denncias contra Jos Paulo Bisol. No tnhamos uma segurana exemplar, mas ela era suficiente para permitir o alvar de licena dos bombeiros, afirmou. 
Segundo ele, o alvar foi queimado, com tudo o que se perdeu. Taranto no soube dizer se os parentes das vtimas podero pedir alguma indenizao. A MTV no exibiu ontem o desenho animado Beavis e Butt-Head por determinao de liminar expedida pela Vara Central da Infncia e Juventude, em So Paulo. Entre 20h e 20h30, horrio em que o cartoon costuma ir ao ar, a emissora transmitiu uma sequncia de videoclipes. Antes, informou sobre a proibio da Justia num comunicado de 15 segundos. Os nativos ainda se utilizam de lanas primitivas para pescar nas rasas piscinas naturais repletas de polvos e peixes tropicais. Mas h a pesca comercial de atum controlada por norte-americanos de origem portuguesa, chineses de Taiwan e sul-coreanos em embarcaes de 1,2 tonelada que valem at US$ 13 milhes. Patologistas de hospital de Riverside (Califrnia) tiveram que vestir roupas especiais para autopsiar corpo de uma mulher que teria qumicos txicos no sangue, porque o cheiro os fez desmaiar. Cristou representa os clubes alemes do Bayern Leverkusen e Shalk-04. O empresrio disse que pretende ver tambm outros jogadores brasileiros, entre eles Cafu, do So Paulo, Mirandinha, do Paysandu, e Claudinho, da Ponte Preta. O caso nasceu de uma vendetta, obra e desgraa de um certo Francesco Farina, dono do Modena, um time rebaixado 3 diviso. Farina tentou salvar o Modena na poltica e no tapeto. Obviamente, no conseguiu. Por isso, metralhou Antonio Matarrese, presidente da federao. Farina acusou os clubes de sonegarem impostos com a cumplicidade da federao. Para multiplicar o porte das suas denncias, ele apresentou as suas peroraes em dois stios diferentes, s procuradorias de Roma e de Milo. Muitas de elas so inconsequentes, improvveis, sem documentao. De todo modo, na Bota, Justia cabe a misso de investigar, antes de meramente julgar. Gloria Attanasio, magistrada da capital, meramente decidiu o que a sua funo lhe exigia, e enviou 250 detetives s sedes dos tais 34 clubes para, l, coletarem os livros contbeis e os recibos correspondentes ao versamento dos impostos. Para a construo do palco foram utilizadas 170 toneladas de ferro e alumnio. A quantidade de ferro daria para construir 180 carros iguais a um Santana. E o volume de alumnio daria para fazer 275 mil latas de refrigerante. O palco foi criado por Mark Fischer, responsvel pela montagem do palco da ltima turn dos Stones Steel Wheels, do novo show do Pink Floyd e do espetculo ZooTV do grupo irlands U2. Apesar da popularidade, Chico de Miguel perdeu as duas ltimas eleies para a Prefeitura de Itabaiana, que passou a ser controlada por polticos locais ligados ao governador Joo Alves. Ele atribui as derrotas ao Judicirio, acusado de auxiliar seus opositores. No explica como, mas garante que no vai perder a prxima eleio em Itabaiana. O deputado afirma no ser um homem rico. Hoje, tenho s uma fazenda e pouco mais de 500 cabeas de gado. Mas tudo que ganho reverto tambm em ajuda para o povo, que recebe remdios, feira e dinheiro quando est precisando. Nunca fui egosta, afirma. Outros institutos mostram nmeros diferentes, mas no so divulgados pela televiso. A Liga de Assistncia e Recuperao, rgo ligado Prefeitura de Salvador, est desenvolvendo um projeto para a confeco de brinquedos a partir de sucatas. Crianas de 11 a 14 anos vo aprender a transformar sucatas em brinquedos educativos. A Secretaria de Sade da Paraba informou que Joo Pessoa est entre as cinco capitais brasileiras que atingiram 90% de cobertura vacinal na ltima campanha de multivacinao, realizada em 90. 
Neste ano, a secretaria espera vacinar cerca de 480 mil crianas. A Rssia vetou documento da Conferncia sobre Segurana e Cooperao na Europa sobre a ex-repblica iugoslava da Bsnia. O veto impediu a declarao que pedia aos srvios da Bsnia que parassem ataques no encrave muulmano de Bihac. A reunio terminou ontem em Budapeste. Se o preo do nibus for convertido pela tabela da montadora, o impacto ser de 10% a 12% na tarifa, diz Dias. As tabelas das montadoras, segundo ele, esto 40% acima do preo de mercado. O Frum j tem os preos mximos pagos no mercado. nibus leve, US$ 55 mil, para o pesado, US$ 70 mil. Antes do Mundial, a delegao brasileira participa dos Jogos da Amizade que tm incio dia 23, na Rssia. William Gibson um escritor normalmente associado fico cientfica. De fato, a maioria de suas histrias acontece num tempo algo frente do presente, mas Gibson rompe completamente com a idia fundadora da FC no cinema ou na literatura de futuro como evoluo. A estratgia de Gibson consiste em projetar no futuro o que j est latente no mundo contemporneo. Nos romances e contos do escritor, o cenrio muito prximo do atual, s que visto com uma certa distoro alucingena, um certo exagero de imaginao. Valores mdios das locaes comerciais aumentam 50,16%. Os valores mdios das locaes de imveis no residenciais na cidade de So Paulo subiram em junho 50,16%, segundo a Hubert Imveis e Administrao Ltda. A evoluo foi menos acelerada que a verificada em maio (55,16%). A maior desacelerao ocorreu na regio da Avenida Faria Lima. A Polcia Federal prendeu ontem em Braslia Vicente Wilson Rivera Ramos, o Vicentico, filho de um dos chefes do Cartel de Cli (organizao colombiana que trafica drogas). Vicentico havia escapado do cerco da PF, no domingo, quando foram apreendidas 7,5 toneladas de cocana em Guara (TO). Filho de Vicente Rivera Gonzalez, ele j foi condenado na Holanda a 14 anos de priso, segundo a PF. Os receios de Durante quanto reao da platia eram compartilhados pelo restante da comitiva. Documentos reservados da segurana da Presidncia da Repblica alertavam, na semana anterior, para a possibilidade do presidente ouvir vaias. Mais que isso: os papis da segurana afirmavam que, em razo da inflao alta, no se deveria descartar sequer o risco do pblico atirar objetos no camarote presidencial. Em telefonema a Maurcio Corra, ministro da Justia, o governador do Rio, Leonel Brizola, desaconselhou a ida de Itamar ao Sambdromo. A reao do pblico imprevisvel, disse. Avisado do alerta do governador, Itamar desdenhou os riscos e manteve a viagem. Corra trocaria a preocupao pela descontrao. Agarrado ao copo de usque, foi outro destaque da noite. A Kodak ainda passou a atender encomendas de peas plsticas para a Philips, Wapsa e NGK, entre 16 clientes. Uma nova diviso de servios, batizada de Kis, opera na administrao de copiadoras e outros equipamentos de imagem para grandes empresas. um nicho promissor. Segundo Galan, os novos produtos e servios acrescentaro 10% ao faturamento, que foi de US$ 360 milhes em 93. No prximo ano sero mais 15%, prev. White -- Acho que sim. A prpria psicanlise est baseada numa concepo tropolgica da conscincia humana. A Interpretao dos Sonhos, um dos textos fundadores da psicanlise, apresenta toda uma tropologia do ato de sonhar e d a base para uma potica que combina a teoria da tropologia com a noo da inveno potica. A idia de tropos provisria. Uma das coisas que os linguistas ainda precisam estudar a teoria dos tropos. Roman Jakobson trabalhou nessa direo. 
Lacan desenvolveu os conceitos de metfora e metonmia para caracterizar certos modos de conscincia. No se pense que esta ruptura modernizante passa pelo PT. O partido um aglomerado heterogneo, que s se mantm unido pela perspectiva de vitria de Lula. Teses modernas e sociais como o controle civil sobre o sistema de sade esbarram em resistncias enormes dos setores hegemnicos do PT, sob o argumento de que amarrariam a atuao do governo, logo agora que o partido se prepara para assumir o poder. Tampouco passa por FHC, representante da contemporizao bem explicada, ou por Antonio Britto cujo voto, em favor da anistia total aos agricultores, semana passada, ressuscitou os piores prognsticos sobre seu estilo. O motorista teria perdido o controle do carro, batido em um Monza e em uma rvore e capotado na rua Enas Luiz Carlos Barbante. Maria Elisa Flora Demarchi, 15, est internada em estado grave no Hospital do Mandaqui. O boneco servir para criar o que chamamos de 'efeito aurola' no motorista. Pesquisas comprovam que quando o motorista v a fiscalizao ele reduz a velocidade por pelo menos trs quilmetros, diz Lehfeld. O boneco vai ser colocado em locais sem congestionamento. Vai trabalhar das 7h s 17h30, inclusive nos finais de semana. A permanncia em cada cabine ainda no foi definida. Perry descreveu trs fases para a operao: a entrada no Haiti e a conquista de todas as posies estratgicas (com durao de seis a horas a dois dias), a criao de um ambiente seguro (de trs a seis meses) e a transferncia do controle da situao para a ONU. O presidente Bill Clinton passou a maior parte do dia de ontem em reunies com seus assessores militares e de segurana nacional na Casa Branca. Vila Madalena e Pinheiros ganharam novas opes noturnas nesta semana. O bar e restaurante Lanterna inaugurou na tera-feira um espao para danar. Agora o jardim entre a galeria e o salo do restaurante propriamente dito tem um bar com som dos DJs Bidi, Bartolo e Hori. O trnsito nas avenidas Paulista e Brigadeiro Lus Antnio (zona central de So Paulo) parou no dia 9 de setembro de 86. O aposentado Eliseu Francisco de Lyra (na poca com 44 anos), depois de discutir com o irmo, saiu do carro, subiu no cap, tirou as roupas e sapateou. A AI, organismo internacional com mais de 1,1 milho de membros, divulgou o documento ontem em So Paulo. O relatrio, referente ao ano de 1993, tambm destaca a impunidade dos acusados pelos massacres do Carandiru (outubro de 92), da Candelria (julho de 93) e de Vigrio Geral (agosto de 93). O apelo pop do evento j est causando reaes. Ohtake diz que at o compositor e cantor baiano Caetano Veloso manifestou desejo de assistir ao show de Nusrat, nesta segunda-feira, em So Paulo. Um monte de artistas vai querer ver os concertos, diz Ohtake. O secretrio espera a resposta do pblico. Toda platia de msica erudita tradicionalista, explica. Em Campos do Jordo tambm. O pblico de l extremamente treinado em msica do sculo 19. Para mim muito estimulante oferecer uma idia espacial para um trabalho que comeou a partir de uma idia musical. Basicamente, o que tenho feito com os msicos com os quais trabalho. O que mudou talvez seja meu vocabulrio, que est menos estruturado. A dinmica, a maneira como os movimentos so construdos, hoje diferente. 1. Currculo com mais de trs pginas. 2. Que o selecionador tenha que ler todo o currculo para descobrir o perfil do profissional. O crtico (/deve ser), um insolente da razo? 
Um (mecenas) nobre que deifica o saber antes e mistifica o dar depois ou ele deve ser um iconoclasta, espalhar o que sabe enquanto o tempo faz das novidades fatos descartveis. Ser que j no nos basta a humilde compulso passiva dos leitores que aceitam nos ler para serem informados? Mas o leitor tambm sabe se vingar com o desdm e a indiferena com que capaz de folhear as pginas da sua busca. Um crtico de jornal cotidiano deve se conformar com o papel de narrador e seu destaque secundrio ao simples gesto de uma pgina ser virada e ficar para trs. A notcia um fato, no um exerccio acadmico. A dimenso do fato est na relatividade da sua importncia para cada leitor e no no espao narcsico que ocupa sob uma assinatura. Em Medicina, ganharam Richard Dart e Richard Gustafson, da Universidade de Arizona por fracassarem ao tratar um paciente mordido por uma cascavel com choques eltricos aplicados no lbio durante cinco minutos. Em Fsica, ganhou a Agncia Meteorolgica Japonesa por um estudo de sete anos sobre a hiptese de terremotos serem causados por peixes rebolando suas caudas. Na madrugada de ontem, os prdios foram vigiados a distncia por homens da Rota, que estavam sem farda. Segundo o coronel, eles viram o armamento da quadrilha ser retirado de um dos prdios em um Gol. Esse carro foi achado no incio da tarde em Engenheiro Marsilac (zona sul). Com mandados de busca e apreenso, os homens da Rota invadiram os trs apartamentos. No houve tiroteio. De acordo com o coronel, a polcia descobriu a quadrilha atravs de uma denncia annima. Eduardo Loureno -- No se deve esperar uma interveno que tenha efeitos imediatos como a dos polticos. Os escritores aqui reunidos pretendem alertar a comunidade internacional sobre os ataques sofridos pela liberdade de pensar e de escrever em vrios pases do mundo. Os exemplos mais clebres e trgicos so os de Rushdie e Nasreen. O nosso protesto de ordem moral, temos a obrigao de defender uma das grandes tradies da nossa civilizao, que a da liberdade de expresso. Jos Saramago -- A contribuio vai depender do eco que o Parlamento possa ter na opinio pblica. Podemos dizer coisas importantes, tomar grandes decises, mas se no tiver repercusso... Tudo depende da capacidade que o Parlamento tiver de transmitir as suas idias imprensa, ao rdio e televiso. No sei se os jornalistas esto conscientes da grande responsabilidade que tm. Os seus 100 quilmetros j levaram este ano 200 mil toneladas de soja. Custo: US$ 8 por tonelada. No Sul, US$ 24! Por que no publicar que o PT fez um seminrio no Nordeste sobre ela e concluiu pela necessidade de sua construo? Esta uma obra do Brasil, transformadora da estrutura do pas. No comporta qualquer reserva, e bendito o pas em que os homens pblicos exigem dos candidatos a soluo de problemas nacionais, minorar a fome, dando leite s crianas que se alimentam de lixo, e estradas para desenvolver o pas. O ex-jogador assumiu o cargo ontem em substituio a Carlinhos, demitido aps a derrota para o Bahia (0 a 1), anteontem. As torcidas partem para cima da PM a fim de que nos concentremos num s local. Entre os denunciados por corrupo passiva esto o procurador de Justia aposentado Aldegy do Nascimento, 26 delegados, inspetores e peritos da Polcia Civil e Ary Chagas de Aguiar, assessor da promotora Lcia Atalla. Os 38 nomes fazem parte de um aditamento primeira denncia feita pelo Ministrio Pblico no escndalo do bicho, em junho. O pai de Romrio foi sequestrado na segunda-feira noite, no Rio. 
Os sequestradores estariam exigindo US$ 7 milhes (cerca de CR$ 9,6 bilhes) para libert-lo. Abalado, o jogador ameaou no disputar a Copa do Mundo, em junho e julho, nos EUA, caso o pai no seja libertado. O stimo filme de Spike Lee acerta nos figurinos e trejeitos da poca e resgata preciosidades do soul negro dos anos 70. As msicas incluem The Jackson 5, Stevie Wonder Jimi Hendrix, James Brown e outros 18 bons nomes. O problema a histria. Quase nada acontece. H algumas brigas engraadas entre irmos, cenas de separao conjugal e uma morte. Em algumas passagens, h dilogos interminveis e vazios. Durante o almoo familiar, por exemplo, fala-se muito sobre quase nada e ningum mastiga. Soa forado demais. No se sabe ainda como Oosterbroek morreu. Seu corpo no tinha marcas de tiros. Aparentemente ele quebrou o pescoo quando tentava fugir do fogo cruzado. A polcia anunciou a libertao de seis homens que estavam trancados no poro de um prdio do CNA em Johannesburgo. Segundo o porta-voz da polcia, Dave Bruce, os seis foram torturados. O deputado Adroaldo Streck (PSDB-RS) ficou encarregado de confirmar com Britto o encontro. Britto j informou a Streck que favorvel, mas ficou de discutir a questo com o PMDB gacho. Camila viajava no colo de Dalva, na frente do veculo. O corpo de Camila foi lanado para fora do carro. A menina morreu na hora. Dalva seria submetida ontem a cirurgia no hospital de So Paulo. A famla voltava para casa, em Americanpolis, depois de participar de casamento na Barra Funda e deu carona a amigos. Outras cinco pessoas ficaram feridas. Niemeyer -- , foi em 1936. Folha -- o que o sr. pode dizer dele em termos pessoais? Jazz rap? Esquea. Ou melhor, no esquea porque ainda se faz muita coisa boa, mas deixe de querer ser bacana citando-o como a ltima tendncia. Para aquele verniz atualizador, passe desde j a usar a expresso blues rap. E, para um efeito extra, cite G. Love and Special Sauce. Seu lbum de estria, lanado no ltimo vero americano, causou pouco ou nenhum impacto na parada. Mas uma boa escavao nas sees de crticas de revistas de msica importadas revela a verdadeira adorao que G. Love estimulou entre os jornalistas especializados. O secretrio-geral da Presidncia, Mauro Durante, tambm tem emprego garantido. Itamar quis encaix-lo no Tribunal Superior do Trabalho, mas a OAB (Ordem dos Advogados do Brasil) no deixou. O presidente articulou, ento, lobby certeiro em direo ao Sebrae (Servio de Apoio Pequena e Mdia Empresa) e ainda pediu a FHC que o ajudasse a pr Durante na presidncia da entidade. Esses dados fazem parte do livro Conflitos no Campo Brasil 1993, lanado na ABI (Associao Brasileira de Imprensa), no Rio de Janeiro, pela CPT (Comisso Pastoral da Terra), rgo da Igreja. Constam do livro 15 quadros estatsticos relacionando os nomes das vtimas e os responsveis por crimes ligados a conflitos de terra. As causas da violncia no campo tambm so discutidas. O diretor executivo do FMI tambm enfatizou a necessidade da queda dos juros no Brasil para incentivar a volta de investimentos macios nos setor produtivo. A uma pergunta sobre se a volta dos investimentos externos no dependeria do selo de aprovao do Plano Real atravs de um acordo formal entre o FMI e o Brasil, Camdessus disse que o que vai fazer a taxa de juros cair no um acordo com o FMI, mas a credibilidade interna e externa do Plano Real. Bancos e financeiras receberam bem a resoluo 2.071 do Conselho Monetrio Nacional. Baixada sexta-feira, ela cria a taxa de juro flutuante. Abre a perspectiva de aplicaes por prazo mais longos. 
O juro flutante permite reavaliar periodicamente as taxas de juros das aplicaes e emprstimos. A interveno diplomtica russa foi decisiva para evitar o envolvimento ocidental na guerra civil na Bsnia. Ao se oferecer para intermediar o conflito, a Rssia deu aos seus aliados srvios a possibilidade de uma retirada honrosa. Isso praticamente eliminou a ameaa de ataques areos da Otan. Essa a avaliao do Instituto Internacional de Estudos Estratgicos de Londres (IIEE), um dos mais importantes centros de pesquisa militar do Ocidente. Um ataque agora no faz mais sentido, desde que os srvios continuem a recuar, disse Folha o coronel Andrew Duncan, do IIEE. O espantoso que fora da polcia tambm se viu o mal no incidente de Cavalera. O jornal carioca O Globo recolheu depoimentos sobre a pisada no confirmada nem pela polcia e explicada por Cavalera como um simples tropeo, normal na movimentao de palco, em seu depoimento na delegacia. Todos, contra ou a favor, partem do princpio de que aquele foi um gesto de protesto. Foi um protesto moleque. S poderia ter partido de uma cabea de merda. Um psicanalista (sic), Jos Nazar, sugere, acreditem se quiser, nada mais nada menos do que linchamento como punio. O jurista Miguel Reale Jr. prefere um pessedebismo pedaggico, ao aprovar a atitude da polcia em det-lo como exemplo para a juventude. A tumorectomia garante sobrevida comparvel e melhor qualidade de vida do que a mastectomia total, escreveu a chefe do novo estudo, Anna Lee-Feldstein. Cientistas da Universidade da Califrnia em Irvine (EUA) publicaram a pesquisa na ltima edio da revista da Associao Mdica Norte-americana, (Jama). Com isso, o Cruzeiro deve comear a partida com Michelli, Magno, Derlan, Marcus Vincius e Anderson; Emiliano, Juliano e Anderson Leo; Ricardinho, Nlson e Herbert. A equipe vem treinando junta h pouco tempo. O atacante Nlson e meia Anderson Leo chegaram agora ao time e o ponta-direita Ricardinho, o meia Juliano e o zagueiro Derlan foram promovidos no final de 93. Recm-promovido, Derlan, 17, j um dos destaques do Cruzeiro por sua habilidade nas sadas da defesa para o ataque. O jogador chegou no clube h dez meses e, em novembro, foi para a equipe de juniores. Estamos muito motivados. A casa vai estar cheia e ns vamos mostrar porque viemos disputar a Copa, afirma. Como sua musa Iris gosta do tema, Silvio Santos vai agora todos os weekends ao teatro. Depois da pea de Leilah Assumpo, os dois foram juntinhos assitir a Aluga-se Um Namorado. Amaram. Pargrafo 3. -- Nos contratos celebrados ou convertidos em URV, em que haja clusula de reajuste de valor por ndice de preos ou por ndice que reflita a variao ponderada dos custos dos insumos utilizados, o clculo desses ndices, para efeitos de reajuste, dever ser nesta moeda at a emisso do real e, da em diante, em real, observado o art. 38 da lei n. 8.880 de 27.05.94. Pargrafo 4. -- A Taxa Referencial -- TR -- somente poder ser utilizada nas operaes realizadas nos mercados financeiro, de valores mobilirios, de seguros, de previdncia privada e de futuros. A sensao que se tem quando se est andando num tren puxado por ces semelhante do esqui aqutico. Na minha primeira tentativa, fiz exatamente a mesma coisa que na primeira vez que pratiquei esqui aqutico: ca e me soltei. Precisei cair mais algumas vezes at aprender a usar o breque. Alm de dominar esta habilidade, as duas nicas coisas realmente necessrias para se viajar de tren pelo Alasca eram estar em razovel boa forma fsica e ter mais de 13 anos de idade. 
Eu e meu marido, advogados de meia idade, apenas um pouco fora de forma, satisfazamos essas condies. Todos os plnios costumam almoar no Esplanada Grill, refeies de US$ 40, em mdia. Os motivos vo do bom atendimento certeza de achar ali pessoas como ns. E se encontram noite no bar Cabral, embora no se conheam. No necessrio. Os plnios tm no Cabral, aberto no final de 1992 numa travessa escondida da avenida Cidade Jardim, sua meca. Luciano Huck, 22 um dos proprietrios do bar, o profeta da turma. L, os plnios fazem amigos, influenciam pessoas, revem colegas, contam as novidades -- as ltimas viagens para os Estados Unidos, quem comprou carro novo, quem vai ter que comear a trabalhar com o pai, os que ainda no conseguiram telefone celular. Pedem cerveja, tequila e batidas, num gasto mdio de US$ 30 por noite, e falam mal dos mauricinhos que, como diz Rubinho Gimenes, so cheios de querer ser. Eles no. Eles so. O jovem corretor de seguros Kiko Villela, 22, . Cursando economia na Faap, Kiko espera ansioso o seu telefone celular, que est para sair. imprescindvel hoje em dia, regulamenta. O brinquedinho eletrnico tem uma funo interessante no mundo dos plnios: substitui o antigo torpedo, bilhete desferido entre jovens em bares. Em vez de mandar um papel pelo garom com cantadas escritas, o plnio liga para a plnia de sua predileo e pronto -- est feito o contato imediato. Entretanto, h possibilidade de queda nas cotaes externas nos prximos meses, em especial no segundo semestre, quando se verifica a colheita de arroz no hemisfrio Norte. A queda das cotaes internacionais viabilizaria o ingresso do produto a preos mais baixos, fato que impediria maiores avanos das cotaes internas. Segundo a previso da Conab, a produo nacional de arroz deve chegar a 10,6 milhes de toneladas de gros. No ano passado, o pas colheu 9,5 milhes de toneladas. O diretor-presidente da Yashica do Brasil, Kazuo Tamura, 56, morreu em um acidente de carro ontem de manh em So Paulo. Ele dirigia sozinho um Santana 90 em alta velocidade na avenida do Estado, no Cambuci (regio central), quando, provavelmente, perdeu o controle do carro e bateu num poste. Isoladamente, os EUA e o Japo, por exemplo, praticaram alquotas ainda menores 5,46% e 4,79%, respectivamente. O levantamento mostra que a Constituio de 1988 elevou de 15,27% para 20,53% a alquota mdia sobre o consumo no Brasil em 1989. O ministro da Aeronutica, Llio Viana Lbo, aproveitou a comemorao do Dia do Soldado ontem para queixar-se das dificuldades salariais dos militares. Vivemos ainda dias difceis, num contexto econmico, social e poltico em que as necessidades bsicas de nossas foras ( ...) tm sofrido severamente as agruras que toda nao atravessa, disse, em saudao enviada ao Exrcito. Os fs de Frank Zappa vo ter uma boa surpresa em janeiro. O lanamento de um CD com uma coletnea de msicas de Zappa interpretadas por bandas cover dos Estados Unidos, Alemanha, Blgica, Inglaterra, Holanda, Itlia, Sucia e Brasil (com a banda The Central Scrutinizer). A banda brasileira nasceu em 1990 para fazer uma homenagem a Zappa. Era um cara que sempre apostou no que acreditava. Lutava contra a bestialidade das pessoas e sempre mostrou isso de uma forma bizarra, conta Mano Bap, 29, vocalista da The Central Scrutinizer. Para o diretor de crdito da Febraban, Christoph Heinrich Von Beackedorff, no h novos tomadores, s esto sendo rolados os emprstimos que esto vencendo. 
Ambos apontam o baixo volume de captao de recursos com a venda de CDBs aos investidores como o principal fator de insegurana dos bancos na definio dos custos dos emprstimos. Desse modo, apenas parte do fundo de viagem seria empregada na compra de traveler cheques ou cdulas. O restante ficaria em uma aplicao com aniversrio coincidindo com o vencimento da fatura mensal do carto. O caso passou a ser investigado porque pessoas que foram ao velrio de Pereira teriam dito que o cadver suava e teve sua pele arrepiada dentro do caixo. A famlia foi alertada mas no chamou nenhum mdico e fez o enterro depois de dez horas de velrio. No cinema, sorte do espectador que ter no uma, mas duas mostras internacionais: a Internacional de Cinema, que agora vai ser competitiva e tem Pedro Almodvar e Quentin Tarantino entre os convidados, e a Banco Nacional de Cinema, que ser mais que uma amostra do evento com sede no Rio, com um ciclo em homenagem ao rei do filme B, Roger Corman, tambm presente. Shows h para todos os ouvidos. O 9 Free Jazz Festival ampliou seu leque e programou uma noite trepidante sob o comando do godfather of soul James Brown e cedeu espao para a modernidade de Guru, US3 e dos Digable Planets. Fora do festival, o De La Soul a atrao mais promissora. Como eles aumentaram 6,28% nas duas semanas, acertamos que eles iriam parar os aumentos. Dallari recebeu tambm a Associao Brasileira da Indstria de Mquinas. A entidade tambm se queixou de aumentos que variam de 5,5% a 11,5% no custo de alguns fundidos, forjados, ao plano e rolamentos. Foi ao final de um ano e trs meses que o prefeito teve sua melhor avaliao. Recebeu 27% de timo e bom, 30% de ruim e pssimo e 42% de avaliao regular. A boa performance de Maluf em maro ltimo coincidiu com a fase em que ele anunciava sua candidatura presidncia e investia em inauguraes de grandes obras, como o tnel do rio Pinheiros. A Gelaguela maior fabricante nacional de sobremesas individuais geladas est investindo US$ 200 mil no sistema de franquias. A empresa, que faturou US$ 1 milho em 93, vai inaugurar at setembro mais quatro unidades da Gelaguela Sobremesas. O objetivo consolidar a marca. Esta taxa, no entanto, no reflete o que est acontecendo de fato com os preos nas ltimas semanas. Isto porque os ndices de inflao s captam totalmente as mudanas de preos 30 dias aps o aumento. sempre considerada a mdia de quatro semanas em relao s quatro anteriores. A partir de agora, a taxa deve sempre recuar. Os preos estiveram praticamente estveis nas ltimas semanas. Juarez Rizzieri, coordenador do ndice de Preos ao Consumidor da Fipe, prev para agosto uma taxa entre 1% e 2%. A inteno votar e aprovar a MP na prxima semana, entre os dias 26 e 27. O prprio presidente Itamar Franco pedir empenho de todos os ministros para mobilizar os seus partidos para a votao. O ministro Ricupero ficou satisfeito com aprovao da MP da URV na comisso mista do Congresso que examinou a medida. Para o ministro, disse Simon, as mudanas feitas so assimilveis pelo plano. Os novos proprietrios sero integrados rede de franquia da Pakalolo, que conta atualmente com 76 pontos de venda. Tambm sero submetidas ao mesmo processo as 28 lojas da Body for Sure, a grife esportiva do grupo. Eu acredito no efeito barriga no balco, afirma o empresrio Humberto Nastari, 37. Na condio de senhor de meia-idade com idias adolescentes, tio Dave vem observando com bastante inquietao que seu sentido do paladar parece estar se esvaindo pelo ralo. 
Peixe passou a ter gosto de carne para mim e uma noite destas, no Rodeio, eu confundi salsicha caseira com frango kebab. Chocante. Constrangedor. Ento me deparei com um estudo que afirma que algumas pessoas realmente tm um paladar mais aguado do que outras. E, horror dos horrores, so as mulheres que tm o paladar mais apurado do que os homens (pare de dar essas risadinhas, voc a no fundo da classe). Parece que a Universidade Yale, aquela que nos deu o inesquecvel George Bush, comprovou a existncia de superdegustadores, ou seja pessoas que possuem um nmero de papilas gustativas muito alm do normal. Adventure in Castle um programa para aprender matemtica. Em ingls, traz noes sobre clculos de soma, subtrao, multiplicao e diviso. Custa US$49, na Brasoftware. Tel. (011) 253-1588. Journey in the Universe um programa em ingls, voltado para crianas que querem aprender noes de astronomia. Est sendo vendido na Brasoftware em promoo. Sai de US$ 79 por US$ 49. MacMillan Dictionary for Children um dicionrio de ingls em CD-ROM. Conta com recursos de figuras e sons, que mostram o significado das palavras. Custa US$45 na Multimdia Center (tel 011 959-2650). Ele disse, no entanto, que a lei antitruste aprovada pelo Congresso reflete o clima que o pas est vivendo nessa fase que antecede o lanamento do real. Mas afirmou que a lei antitruste aprovada pelo Congresso tem as deformaes tpicas da situao conjuntural. Apesar da declarao dos separatistas, o governo britnico continua disposto a iniciar as negociaes com o Sinn Fein, brao poltico do IRA, antes do Natal. O ministro britnico para a Irlanda do Norte, Patrick Mayhew, afirmou ontem em Belfast que o cronograma do processo de paz ser mantido. Mayhew disse que o IRA poderia demonstrar que realmente no autorizou a operao de Newry devolvendo os US$ 210 mil levados durante o assalto. FHC -- Esse comit faz tudo errado. Ruth -- Por isso perdemos as eleies. (FHC comea a rir). Sabe por que eu amo tanto voc, querido? Porque est sempre rindo. (Comeam ambos a chorar. Um estudo do Corecon (Conselho Regional de Economia), do Rio de Janeiro, divulgado ontem, est projetando, na hiptese de adoo do real em abril, uma inflao de 2,3$%. Com base nesta projeo, o estudo prev uma inflao com a nova moeda de 30% em 94. O SPQ (Sistema de Projees Qualificadas) resulta da mdia de opinies de 16 economistas que fazem anlise de conjuntura. Segundo o presidente do Corecon, Hlio Portocarrero, a inflao do real no ser puxada pelos oligoplios, apontados pela equipe econmica do governo como os principais viles na subida de preos aps a implantao da URV. Para ele, a alta de preos na fase da nova moeda dever ser impulsionada pelos setores competitivos como o agrcola e o de servios. O projeto ainda no oficial. Segundo o professor Vicente Amato Neto, 66, h informaes que todos ouvem, no sentido de fazer o estacionamento. Ontem, a diretoria da atltica e Amato Neto que iniciou a organizao dos alunos para a preservao do clube se reuniram para discutir alternativas para a rea. Cedras no tem o controle de sua polcia. Para evitar distrbios, vamos distribuir s TVs uma fita para mostrar populao o que viemos fazer aqui. Esperamos uma mudana de atitude da polcia. Esto ocorrendo violaes dos direitos humanos. Se o sr. Cedras no tomar os passos apropriados, ns vamos dizer num clima de respeito que passos ele deve dar. Tambm vamos distribuir 1 milho de cestas bsicas, para matar a fome da populao at a transio para a democracia. 
A Roma venceu ontem a Lazio, tambm na capital italiana, por 3 a 0, na partida mais importante e empolgante da dcima primeira rodada do Campeonato Italiano de 1994/95. Animado com um teste de vestirio, o tcheco Zdenek Zeman, tcnico da Lazio, colocou em campo o atacante croata Alan Boksic, cuja volta ao time estava prevista para a prxima semana. No mercado futuro do ndice Bovespa, a cotao para maio ficou em 20.000 pontos, projetando rentabilidade de 66,78% ao ms. No mercado futuro de dlar, a expectativa de desvalorizao cambial para abril ficou em 42,40%, contra 42,53% no dia anterior. O comportamento das pessoas no trabalho est passando por uma nova e importante alterao. Gesto participativa, times de qualidade e outros conceitos foram assimilados pelas empresas ao mesmo tempo em que elas passaram a exigir funcionrios mais abertos ao dilogo e cientes da importncia do cumprimento das regras de boas maneiras dentro do ambiente de trabalho. As empresas notaram que sua imagem a maior prejudicada quando um funcionrio comete uma indelicadeza em pblico e diante de um cliente ou subordinado, afirma o consultor de marketing pessoal Otto Reiter, 69. O refinamento do executivo mais um diferencial na hora da sua avaliao, completa. Nos Estados Unidos, universidades e algumas empresas chegam a oferecer cursos de boas maneiras nos negcios. No Brasil, essa atitude ainda depende da iniciativa de cada um. A questo que enfrentaram era retomar a arte intensamente dramtica de sua tradio sem se escravizar a ela. A soluo que deram, obter equilbrio entre figurao e abstrao em que a profundidade no nem clssica (pers -- pectiva) nem modernista (planificada), mas sugerida por jogos cromticos semilivres s vezes livres demais, como voc vai ver. Cada um o fez a seu modo. Dos cinco, Baselitz, Immendorff e Kirkeby fizeram melhor. Kirkeby, que vem ao Brasil para a abertura, tambm ter sala especial na 22 Bienal, a partir do dia 12. Tanta coincidncia s pode ser instrutiva; pode mostrar mais a Alemanha que qualquer contato poltico afinal, como dizem eles, o ser reside na linguagem. Habite-se. Alm da proposta de 18 cassaes, o relatrio final da CPI sugere que oito deles sejam objeto de processo criminal. So os deputados que, segundo o texto, devem ter os elementos comprobatrios encaminhados ao Ministrio Pblico. Esto nesse grupo Ricardo Fiuza (PFL-PE), Joo Alves (sem partido-BA), Cid Carvalho (PMDB-MA), Jos Geraldo (PMDB-MG), Manoel Moreira (PMDB-SP), Fbio Raunheitti (PTB-RJ), zio Ferreira (PFL-AM) e Paulo Portugal (PP-RJ). Alm disso, avaliou como fraca a organizao existente at ontem na campanha. A partir da prxima semana, vo viajar para os Estados os membros da coordenao que no pertenceram ao ncleo que passa a deter as decises. Cerca de 300 pessoas foram convidadas para a cerimnia. De esse total, 40 so parentes e amigos do ex-jogador. Entre os convidados esto o rei da soja, Olacyr de Moraes, o empresrio Alfredo Saad e o comandante Rolim Amaro, dono da empresa area TAM. Os funcionrios da prefeitura reclamam que Ldice est adiando desde maio o pagamento das perdas salariais da categoria. Eles tentaram agendar um encontro com Lula para pedir que ele ajudasse na negociao. Uma reproduo gigantesca da Maja Desnuda, de Goya, ocupa uma das paredes da casa. No banheiro, um funcionrio vende balas, cigarros, chocolates, gua mineral, remdios (Engov) e, claro, camisinhas. 4 de novembro -- morto em emboscada o lder da ALN e ex-deputado Carlos Marighella. 19 de junho -- Mdici anuncia a construo da rodovia Transamaznica. 
As questes colocadas pela body art nos anos 60 foram retomadas agora como reflexo sobre uma realidade em que os corpos esto submetidos a um trabalho de transformao que reala o sentido trgico. A Aids foi apenas um catalisador para a volta do prprio corpo como objeto esttico em diversos trabalhos. O vdeo uma experincia fsica, ( ...) tem um efeito direto nos corpos das pessoas. O vdeo pode ser um instrumento poderoso para tocar as pessoas diretamente, na percepo, em reas que a cultura ocidental no leva em conta como um caminho para o conhecimento. Desde a Idade Mdia, esse caminho feito atravs do intelecto e no do corpo. O corpo foi negligenciado, disse Viola Folha em 92. A ao alega que a Petroplastic no considera a Dow acionista da Triunfo. E comprova que a Petroplastic recebeu US$ 5 milhes para concordar com a transferncia, para a Dow, do controle da Atochem (fundadora da Triunfo). Boris Gorentzvaig, da Petroplastic, diz que os US$ 5 milhes eram adiantamento em acordo de gaveta com a Dow. Mas a Dow dos EUA no quis cumprir o contrato e transferir tecnologia a uma concorrente, diz. O rgo responsvel pelo mercado de telebingos a Susep (Superintendncia de Seguros Privados), uma autarquia federal subordinanda ao Ministrio da Fazenda. Compramos o ttulo para verificar, levei para casa e no enxerguei nada, disse ontem Vera Melo Arajo, chefe do Departamento de Fiscalizao da Susep. Fbio Igel, Joo Herreras e Marcelo Loureiro pilotam festa no Cabral. em prol do Fundo Social de Solidariedade a apresentao de hoje de Mulher, a Melhor Opo de Investimento no Teatro Itlia. Divulgado na semana passada, o fraco resultado em novembro da produo industrial e das novas encomendas s indstrias acabou com as esperanas de que a Alemanha tivesse sado da pior recesso do ps-guerra no ltimo trimestre do ano. Os dados detalhados sobre o desempenho do PIB entre outubro e dezembro no sero revelados at maro. Os economistas afirmam que a recuperao puxada pelas exportaes que o governo previa para o segundo semestre de 1993 no aconteceu. A federao das pequenas e mdias empresas revelou numa pesquisa que seus filiados esto prximos do pnico em relao situao econmica. Suas expectativas para o futuro prximo da economia so as piores em dez anos. No preciso, governador, porque, sabendo da hospitalidade paranaense, eu nem trouxe carteira. Os anos 80 foram um divisor de guas na industrializao brasileira. Vera Lcia Barbosa, 40, morreu na ltima quarta presa pelo cinto de segurana ao carro em que estava, que caiu no rio Tiet. O acidente foi s 21h50, perto da Ponte Cruzeiro do Sul (zona norte). As outras trs pessoas que estavam no carro conseguiram se salvar. O tombamento de um caminho no km 226 da pista Rio-So Paulo da via Dutra s 5h de ontem provocou engarrafamento na rodovia. O acidente causou a morte do menino Ren Scorsa, que estava no Logus dirigido por Srgio Duran, 33, que bateu no caminho. Sobral, embora seja uma das principais cidades do Cear, ainda no dispe de um sistema adequado de esgotos. O projeto de canalizao de um crrego no municpio de Contagem ( ...) indispensvel para os moradores porque, durante o perodo de chuvas, o rio transborda e alaga alguns bairros de Contagem. Nenhum municpio brasileiro, nem mesmo So Paulo, dispe de um sistema adequado de esgotos. Enchentes do-se pelo pas inteiro. A escolha to particular de Sobral, que por ser uma das principais cidades do Cear tambm uma grande concentrao eleitoral do Estado, coincide com sua escolha para a ao de campanha que l fez o candidato Fernando Henrique no fim-de-semana. 
A mineira Contagem outra grande concentrao eleitoral, com prefeito do PSDB e onde o partido, j como propaganda, fez a conveno de escolha de Fernando Henrique. Covas -- Quando se falando em emprego, no se tratando s de uma questo administrativa. Quando se fala do Real, fala-se de posies que traduzem aquilo que deve ser o papel de So Paulo. Folha -- Pases como a Argentina, que passaram por ajustes parecidos com o do Brasil, tm hoje um desemprego recorde ... Qual a matria odiada na escola? Lugar de que mais gosta? Os dados so do SCI (Servio de Segurana ao Crdito e Informaes) que armazena informaes comerciais sobre 1,6 milho de empresas no Brasil. Do volume de protestos, a regio Sudeste respondeu por 39,1% do total e a Sul, 23,5%. O Estado de So Paulo representou 22,8% dos protestos. O conselho tambm manteve multas aplicadas Indstrias J.B. Duarte e a seus dirigentes pela prtica de atividades restritas a instituies financeiras. A corretora Antonio Delapieve e seus dirigentes foram multados por negociar ttulos de renda fixa a preos superavaliados. Hoje, os mediadores internacionais lorde Carrington e Henry Kissinger (ex-ministros do exterior do Reino Unido e EUA, respectivamente) se encontram com os lderes rivais negros Nelson Mandela (CNA) e Mangosuthu Buthelezi (zulu). Pelo menos dez pessoas morreram e 80 se feriram no terceiro dia consecutivo de luta entre faces rivais em Kabul, Afeganisto. Tropas do presidente Burhanuddin Rabbani enfrentam soldados do primeiro-ministro Gulbuddin Hekmatyar. Biotecnologia -- busca o aprimoramento e criao de novos produtos a partir da manipulao de animais e plantas destinados sade e esttica humana. Apesar de ter conseguido a guarda de Natasha, Conceio est aflita. Tenho os papis, mas no tenho a minha filha. Agora s deixo a Frana com ela. Esse o meu grito, se inflama. Segundo Conceio, tem existido certa indiferena da polcia local em relao ao caso. A estratgia montada por Qurcia tem o objetivo de ganhar a prvia do PMDB que vai definir o candidato do partido Presidncia no prximo dia 15, bem como manter o PMDB unido durante a campanha. Os ataques a Sarney s sero feitos se a contabilidade quercista constatar que a vitria na prvia est ameaada. Esse o tema mais discutido na imprensa do pas; anteontem, ele foi objeto de debate especial do gabinete de governo japons. No caso de ontem, um garoto de 14 anos foi encontrado enforcado numa viga em sua escola em Yorii, 50 km a sudoeste de Tquio. A RBS tem ainda trs concesses de TV em Santa Catarina e 30% da TV Cachoeiro, de Cachoeiro do Itapemirim, no Esprito Santo. O nico caso que se tem registro de aplicao rigorosa da lei ocorreu no final da dcada de 60. Destaca que o Partido Liberal Democrtico (PLD), derrubado do poder no ano passado, dever apresentar seu presidente, Yohei Kono, como candidato a primeiro-ministro do Japo. Publica documentos comprados de James McDougal, ex-scio de Bill e Hillary Clinton na empresa Whitewater. Os documentos indicam que a primeira-dama dos EUA estava mais envolvida no dia-a-dia da administrao da firma do que fora admitido at agora. Depois de uma disputa acirrada entre as indstrias para conquistar os melhores mercados, o ranking das gigantes multinacionais que dividem o bolo mudou. A japonesa NEC continua na liderana. O segundo lugar foi conquistado pela Ericsson, que em agosto de 93 ocupava a quarta posio em contratos assinados para fornecimento de equipamentos. 
A NEC tem mais de 55% (US$ 455 milhes) dos contratos assinados pelas concessionrias da Telebrs, incluindo a a segunda fase da implantao do sistema mvel em So Paulo e a terceira no Rio, previstas para 94. O engenheiro Bruno Maranho, lder do Movimento por Terra, Trabalho e Liberdade, pertence a uma das famlias que dominam os latifndios no Nordeste. Uma invaso do grupo, feita h dois anos, em Pernambuco, na usina Massauassu, prejudicou justamente a famlia Maranho. Depois de conquistar seu prmio de estria no concurso Smirnoff em So Paulo, o jovem estilista Srgio Machado emplacou mais uma. dele o segundo lugar do Smirnoff International Fashion Awards, disputado anteontem em Dublin. Alphaville uma mistura de fico cientfica, filme noir, e a irreverncia das histrias em quadrinhos. A histria narra a ida de um agente secreto desumana cidade futurista de Alphaville, controlada por um crebro eletrnico que baniu totalmente os conceitos de amor e solidariedade. Para construir o ambiente futurista do filme, Godard recorreu quase que totalmente fotografia. A cmera de Raoul Coutard conseguiu transformar Paris numa cidade glida, sem contar a ousadia de montar sequncias inteiras em negativo. O experimentalismo do filme no esconde sua proposio simblica, a alienao na sociedade tecnolgica. Uma audincia hoje na 7 Junta da Justia do Trabalho em Belm reabre a discusso sobre a obrigatoriedade do diploma especfico para o exerccio do jornalismo. O Sindicato dos Jornalistas do Par exige a demisso dos cerca de 150 jornalistas irregulares (que no tm curso universitrio de Jornalismo) das empresas de Belm. Como o Estado no tem recursos, nem vai ter para todos, o atendimento acaba deixando de fora justamente os mais pobres, que no conseguem acesso aos servios pblicos. O problema poltico porque envolve, por exemplo, a gratuidade da educao, da sade, da previdncia mnima. Excetuada a educao bsica de primeiro grau, que deve ser gratuita e obrigatria, para todo o resto preciso discriminar quem pode pagar. Dados do IBGE (Instituto Brasileiro de Geografia e Estatstica), de 1989, mostram que 30,6 milhes de brasileiros. Dos fumantes, 18,1 milhes so homens e 12,5 milhes, mulheres. O homem fuma entre 11 e 20 cigarros por dia e a mulher, entre 5 e 10. A PF (Polcia Federal) indiciou ontem duas pessoas por vender carro com gio. Foram indiciados o dono da revendedora 3.000 Automveis, Leonardo Romanioli Filho, e o vendedor Lus Silveira, da concessionria Frame. Os dois foram presos quando o taxista Jos Fioravanti, 61, foi entregar US$ 1.000 a Romanioli como gio da compra de um Santana Quantum. Ele estava acompanhados de trs agentes da PF. Um deles era seu filho, Srgio Fioravanti. A partir desta edio, a tabela com o levantamento dos indicadores Folha-Sebrae passa a incluir a comparao da capacidade ocupada no ms pesquisado com o mesmo ms no ano anterior. O uso da comparao anualizada permite verificar a variao da atividade industrial sem a influncia de fatores sazonais. Com a incluso de mais esse dado comparativo, a tabela fica mais completa. Ela tem tambm textos com explicaes sobre cada item pesquisado. A verso mais corrente sobre o incio do enriquecimento de Qurcia de que ele comprou terrenos em reas de periferia que seriam beneficiadas pelos projetos de melhoria urbana da Prefeitura. Na sua administrao, executou um plano virio com largas avenidas interligando os bairros, que provocaram um boom imobilirio. Nicolau diz que hoje o produtor de leite C recebe US$ 0,22, enquanto o custo est em US$ 0,24. 
Dallari orientou os produtores a negociarem com a indstria. Esses aspectos podem ser observados nos relatos das pacientes: quando tenho que arrumar a casa e no estou com vontade, escuto vozes que me incentivam a comear a tarefa. Depois de um tempo, as vozes aumentam e acabo ficando atordoada e no consigo terminar o que comecei. Estes problemas no so frequentes nos homens, pois alm de terem uma possibilidade menor de casarem e terem filhos, esses no so os tipos de papis socialmente esperados para o sexo masculino. O restaurante, especializado em carnes e saladas, oferecer uma garrafa de champanhe espanhol para os casais na noite do dia 12. Quem no quiser beber durante o jantar, poder levar o champanhe para casa. As pessoas esto cansadas do hype que se cria em torno das roupas, dos estilistas e do luxo abusivo. Conforto fundamental. Eu no me associo com moda. Para mim roupa e moda so coisas diferentes. Folha -- o que a mulher precisa hoje, em termos de roupas? Desorganizao e violncia marcam torneio que consolida o 'efeito Copa'. Os clubes de futebol do Brasil esto valorizando cada vez mais a posse de bola e a utilizao de lanamentos como armas para se chegar ao ataque. o que confirmam as estatsticas do Campeonato Brasileiro, encerrado ontem com o novo ttulo palmeirense. Emily Lloyd vai ser a desbocada punk Tank Girl, dos gibis ingleses. Ainda na Inglaterra, Spielberg negocia os direitos da srie de TV Doctor Who. Chris Columbus est ligado adaptao dos super-heris Quarteto Fantstico, que inauguraram a era Marvel nos gibis h 30 anos. Mas o oramento de US$ 5 milhes insinua que a informao no sria. So comuns casos como a filmagem do gibi Sargento Rock, que teve os direitos comprados por Joel Silver h cinco anos. Arnold Schwarzenegger deu entrevistas como ator principal, foi substitudo por Bruce Willis h dois anos, mas o projeto nunca saiu do papel. Na campanha eleitoral, Alencar e Nilo estiveram em lados opostos quanto ao das Foras Armadas no combate ao crime. Alencar defendeu a adoo do estado de defesa, que prev a suspenso de garantias individuais. Trata-se, segundo ele, de um instrumento de defesa da democracia. Ao contrrio, nesta situao econmica de extrema gravidade, todos gostariam de ajudar o pas. Para isso, porm, necessrio que o nosso Banco Central busque no a independncia do Tesouro, que impossvel, mas que deixe de quebrar-lo com a sua poltica agressiva de juros. O shopping West Plaza j comeou sua promoo de Dia das Mes. Diariamente, est promovendo desfiles de moda para seus consumidores. Os desfiles, com roupas a venda no prprio shopping, visam orientar o consumidor a respeito da moda. Estilistas estaro disposio dos clientes. Segundo ele, isso aconteceu aps a priso de soldados venezuelanos em territrio brasileiro, em fevereiro de 1993. Depois desse incidente, as Foras Armadas venezuelanas tiveram de esfriar suas aes na fronteira com o Brasil. A Guarda Nacional da Venezuela passou a conversar com os ndios e a convenc-los de fazer o papel dela, ou seja, reprimir os garimpeiros, diz Altino. A maioria dos bons jogadores brasileiros atuava em clubes profissionais, filiados FBF. No houve acordo para uma trgua durante a Copa. A soluo foi negociar diretamente com os jogadores. A CBD aliciou Lenidas (Vasco), Luizinho e Waldemar de Brito (So Paulo), entre outros. Era uma poca em que os cariocas imperavam em campo e fora dele. Por razes que a prpria razo desconhece, no sorteio do local do jogo decisivo, a bolinha quase sempre apontava o assustador estdio de So Janurio. 
O torcedor paulista sonhava com uma era futura, em que So Paulo chegaria supremacia, se possvel de forma arrasadora. Passaram-se uns 50 anos e o sonho se realizou, trazendo alegria e tambm, inesperadamente, frustrao. Ficamos todos , isto , os torcedores paulistas que vem dos idos de 1940, como o americano mdio, no fim da Guerra Fria. J no h o imprio do mal para combater. Vasco, Botafogo, Fluminense e at o Flamengo so fantasmas do passado. O bombeiro suspeita que o golfinho tenha morrido afogado. Ele teria ficado preso em uma rede de pesca sem condies de subir tona para respirar. Os bombeiros pediram ajuda ao Cebimar (Centro de Biologia Marinha) da USP para apurar as causas da morte do golfinho. At as 17h, a Prefeitura de Caraguatatuba no havia retirado o golfinho da praia. O governo vai usar a URV para corrigir impostos, garantiu FHC. A URV produzida por trs ndices, e um dos ndices o mesmo da Ufir. Vai ter paridade, j desde o comeo, explicou. Quanto s tarifas pblicas, no sero fixadas em URV para evitar reajustes dirios, mas tero correes peridicas pelo indexador. Do ponto de vista de proteo ao consumidor, vou reajustar o preo da tarifa pblica no dia normal como se ela fosse URV. Ou seja, pela mdia real dos ltimos quatro meses, disse. Onze milhes de aposentados ganham mnimo. Esses vo ter um aumento real, pelo que disse o ministro Cutolo (Srgio, da Previdncia), na passagem para a URV, de 17%. E alguns vo ter de 30%. Ele atribuiu a discusso sobre o valor dos benefcios a uma iluso monetria. As pessoas pensam que recebem US$ 80, mas 30 dias depois esse valor muito menor. Encerrada a festa, Maluly viu o rapaz esperando. O deputado se aproximou. Eu precisava de uma ajuda do senhor, deputado ... O Prncipe Charles, do Reino Unido, fez respirao boca-a-boca num boneco (foto) na inaugurao do novo prdio do Centro de Ambulncias de Wellington, capital da Nova Zelndia. O homem que atacou Charles anteontem no ter direito a fiana e ficar preso pelo menos at o fim da visita de cinco dias do prncipe ao pas. Litunia e Ucrnia elogiam a Otan Os presidentes da Litunia, Algirdas Zebrauskas, e da Ucrnia, Leonid Kravtchuk, elogiaram ontem o programa de Parceria pela Paz da Otan (aliana militar ocidental, liderada pelos EUA), que prev operaes conjuntas com pases do Leste Europeu. Segundo eles, os laos com a aliana no vo afetar as relaes com a Rssia. No verdade, como diz Gusmo, que a misso enviada a Israel tenha feito confronto de preos. O relatrio sigiloso no traz qualquer lista de preos. As evidncias de superfaturamento foram levantadas pelos professores Armando Lagan e Mrcio Rillo, da USP, a pedido da Folha. Sustentadas em depoimentos PF, foram confirmadas em trs percias. A ltima delas, com a participao de trs cientistas, comprovou superfaturamento de 343%. Stephen Freehill, 16 (foto), compareceu ontem a um tribunal de Cingapura para uma audincia preliminar. Ele acusado de vandalismo. Pode ser condenado ao aoite, como seu compatriota Michael Fay, 18, preso na mesma ocasio. Fay foi punido com quatro vergastadas e agora cumpre pena de priso. Foras do norte do Imen anunciaram a tomada da base area de Al Anad, a principal dos sul-iemenitas. A base fica a apenas 50 km de den, a capital do sul. A guerra comeou h 13 dias por causa de um confronto entre o presidente Ali Abdulllah Saleh e seu vice. No damos conta de atend-los. A resposta ao pedido de ajuda foi gigantesca, diz Virginia de la Guardia, porta-voz da Mdicos Sem Fronteiras. A maioria dos voluntrios so jovens. 
Mas, comenta Enrique Albizu, presidente da Medicus Mundi, precisamente nas situaes de emergncia que necessitamos de gente mais especializada. O Santos volta a viver um clima tenso para o prximo jogo, domingo contra o Santo Andr, em busca da primeira vitria no Campeonato Paulista. O tcnico Pepe, irritado com a atuao do time no empate em 1 a 1 com Ituano anteontem, disse que est faltando tranquilidade e talento aos jogadores. Pepe ainda tem esperana que o centroavante Guga renove o seu contrato, vencido em 31 de dezembro. Se isso acontecer, tem que ser rpido, do contrrio no adianta mais, afirmou. Nascido no dia 23 de maio de 1972, em So Paulo, Rubens Gonalves Barrichello comeou sua carreira automobilstica no kart, em 81, quando foi vice-campeo jnior da capital paulista. Em 1989, aps ganhar diversos ttulos paulistas e brasileiros no kart, Barrichello disputou o Campeonato Brasileiro de Frmula Ford, terminando em quarto lugar. A Sunab (Superintendncia Nacional de Abastecimento) autuou sbado duas lojas do ParK Shopping, em Braslia. Ambas no apresentavam preos de produtos expostos nas vitrines. A multa para lojistas que descumprirem a determinao pode chegar a R$ 128 mil. Segundo Eduardo Lago, superintendente do orgo, j foram aplicadas 400 multas em todo o pas desde o incio do ms. A Mercedes-Benz deve instalar a linha de montagem do Swatch, carro compacto com design do fabricante dos relgios da marca, em Sarreguemines, cidade do norte da Frana. A deciso foi tomada pelo conselho da montadora alem, na ltima sexta-feira. O anncio oficial ser feito aps o dia 20. Beckenbauer aconselha o lateral a conversar com o treinador, procurando reverter a situao. O caso Effenberg, jogador cortado por ter feito gesto obsceno torcida, tambm foi criticado. Foi um ato muito rigoroso. Beckenbauer tem o direito de dizer o que quiser. Rodado no incio da dcada de 50, poca urea do macarthismo perseguio a artistas e intelectuais comunistas ou acusados de simpatizar com essa ideologia liderada pelo senador Joseph McCarthy nos EUA, o filme ganha verso para CD-ROM, com direito a um extenso material de referncia. Preo: US$ 74,71. O filme narra uma greve de mineiros em uma cidade do Novo Mxico, nos EUA. H uma trama paralela, com Ramon e Esperanza, casal que aps a greve se separa. Caso isso ocorresse, eles seriam colocados em liberdade to logo chegassem no Canad. Documento do Ministrio das Relaes Exteriores do Canad, obtido pela Folha, d apoio ao lobby (grupo de presso) feito pela Embaixada do Canad junto a senadores e deputados brasileiros para que os canadenses sejam expulsos. Segundo o cnsul, era crescente a presso do Congresso norte-americano sobre o Departamento de Estado para que se adotasse uma poltica idntica do Brasil. Queremos dar um visto por dez anos. mais eficiente do que por apenas quatro anos. Estamos sendo forados, ao invs, a reduzir o prazo para trs meses. uma situao absurda, disse Taylor, 49. Um jogador que estava nos planos de Pepe era o meia Carlos Alberto Dias, que acabou indo para o Flamengo, j que a diretoria do Santos vetou o seu nome. No momento, Pel, o maior jogador da histria da equipe, tambm a maior esperana de um time forte. Durante esta semana, o agora dirigente esteve no Japo, em busca de uma empresa que, no futuro, invista no Santos. Recentemente, o time firmou um contrato de US$ 50 mil mensais com a Lousano, que vale at o fim do Campeonato Paulista. O aumento das despesas acontece at mesmo nos ministrios da Fazenda e do Planejamento, responsveis pela montagem do Oramento federal. 
No Ministrio da Fazenda, as despesas crescero de US$ 1,13 bilhes em 93 para US$ 1,36 bilhes em 94 (ou mais 20,07%). No Planejamento, o crescimento de 33,19%, elevando as despesas de US$ 87,6 milhes em 93 para US$ 116,69 milhes em 94. O deputado Aloizio Mercadante (PT/SP) divulgou ontem nmeros do Tesouro sobre a execuo oramentria de janeiro a novembro de 1993, demonstrando que o governo conseguiu um supervit operacional de US$ 9,2 bilhes neste perodo. Embora esteja a quase 40 km do estdio, o hotel foi escolhido porque est prximo da California State University, onde o Brasil treinar durante esta semana. A chegada ao hotel foi s 12h10 (16h10 em Braslia). Toda a delegao subiu diretamente para os quartos. Os jogadores e Parreira no falaram com a imprensa. Para estimular a participao, a cada dois dias de trabalho as meninas ganham uma boneca. A Repblica Movimento de Emas trabalha h 20 anos com adolescentes carentes. Cheia de energia, a interpretao da Orquestra Estadual da Hungria vem amortecida um pouco pela artificialidade da gravao, de quase 15 anos atrs, mas compensa a sonoridade de estdio com o vigor. A Sinfonia toda uma espcie de demonizao da Nona de Beethoven, que muito mais seu assunto do que o Fausto de Goethe. Nisto tambm na resistncia e repetio de Beethoven Liszt um foco importante, com reflexos na obra de Tchaikovsky, Scriabin e Strauss. No poderia haver realizao maior para um compositor: substituir Beethoven, ou ter pelo menos a iluso de ser um novo ponto de partida. Para Simon Franco, 55, presidente da Simon Franco Recursos Humanos, no se pode jogar fora 15 anos de experincia. Segundo ele, Carvalho no est avaliando que pode ampliar seu espectro de opes sem a necessidade de uma mudana radical. A reproduo assistida est regulamentada no Brasil desde novembro de 1992, embora sua prtica tenha sido iniciada oito anos antes. Determinadas pelo Conselho Federal de Medicina (CFM) as normas ticas foram inspiradas nas de outros pases Estados Unidos, Frana e Itlia, por exemplo. Mas, como nesses pases, os avanos nas pesquisas mdicas tornaram os cdigos de tica ultrapassados. A possibilidade da gravidez ps-menopausa e o uso de vulos de fetos abortados (veja texto pg. 6) so exemplos mais gritantes. Ficaram de fora de nossa regulamentao, disse Folha Antonio Henrique Pedrosa Neto, diretor do CFM. Ele afirmou, no entanto, que pelo menos a gravidez ps-menopausa ser objeto de nova resoluo. Polcia Civil encontrou dois homens amarrados, amordaados e de olhos vendados, em um barraco da favela Nova Braslia (zona norte). Mauro Pereira e Gilson Paulino dos Santos disseram ter sido sequestrados por traficantes, que os teriam confundido com policiais. Fiscais do Tribunal Regional Eleitoral e PMs do Batalho de Choque foram recebidos a tiros ontem tarde na Vila do Joo, Manguinhos (zona norte do Rio). Os fiscais retirar propaganda irregular. No tiroteio, morreu Adriano Herculano da Silva, 18. O casamento ser realizado pelo reverendo Onaldo Pereira, 38, ordenado h oito anos nos Estados Unidos. O pastor responsvel no Brasil pela Comunidade Pacifista Crist, fundada na Alemanha em 1708. A seleo brasileira feminina de basquete a campe do Torneio Internacional de Basquete, um quadrangular amistoso disputado em Recife (PE) e que contou com a participao das selees de Cuba, Argentina e Eslovnia. O time comandado pelo tcnico Miguel ngelo da Luz ficou com o ttulo ao derrotar anteontem noite na final a forte seleo de Cuba por 115 a 93 (54 a 44 no primeiro tempo). A cestinha do jogo foi Hortncia, que marcou 38 pontos. 
Alm da fazenda do Sabi, participaram do leilo a Terra Boa e a Mata Velha, tradicionais criadoras de nelore. O principal destaque na pista foi a vaca Santya, selecionada para ser doadora em transferncia de embries, que foi vendida por 40,2 mil URVs. Filha do touro Chummak, ela est prenha e foi comercializada com uma bezerra ao p. A Times Square tambm um ponto de encontro e uma alternativa teen, afirma o diretor da Multiplan. Para Spinelli, o espao representa uma nova tendncia para as reas de convenincia dos shoppings. Paula Toller Isso uma coisa que a gravadora vinha sugerindo para a gente h muito tempo. Gostamos da idia, resolvemos pegar velhos sucessos e fazer neste formato. Gravaes acsticas se encaixam com o nosso tipo de som. Sempre tivemos canes mais lentas. Bruno Fortunato A gente sempre teve msicas que do para tocar no violo. A inteno louvvel e esta Folha tem denunciado sempre a ciranda financeira como obstculo a uma autntica estabilizao. O momento e a forma escolhidos para mudar o sistema financeiro, entretanto, dependem fortemente de variveis polticas. Est em estudo no Ministrio da Previdncia um conjunto de medidas para desestimular o aumento de aposentadorias no prximo ano. Elas sero oferecidas a Itamar, como alternativa propalada quebra do caixa previdencirio. Causou m impresso, na quarta, a participao do ministro da Justia, Alexandre Dupeyrat, na reunio que tratou do projeto sobre abuso do poder econmico. Chegou atrasado, foi duro com seus crticos e saiu antes do fim. Em Olhos de Serpente (Snake Eyes) de Abel Ferrara, Madonna faz o papel dela mesma: uma falsa atriz, falsa loira, que paga para estar num filme. de longe sua melhor atuao no cinema. Ferrara pe o excelente Harvey Keitel para interpretar o seu papel: o do diretor que faz da falsa loira uma verdadeira atriz. Olhos de Serpente o filme dentro do filme dentro do filme. Eddie (Harvey Keitel) dirige a histria de um casal de ricos drogados em crise (Madonna e James Russo). Eddie tambm est em crise com sua mulher (interpretada pela mulher de Abel Ferrara) e no resiste tentao da atriz, que paga para ele filmar (Madonna produtora de Olhos de Serpente). Caso o tetracampeo seja o Brasil (seja feita a vossa vontade assim na Terra como nos gramados), confirma-se que os ltimos sero os primeiros. Vi, num jogo quase inacreditvel da Copa, o Eire jogar futebol muito melhor do que a Itlia. Venceu por 1 a 0. J bastava, para orgulho da Irlanda, que a Inglaterra sequer tivesse chegado Copa. Mas na Republica Dominicana que o cruzeiro atinge seu ponto alto. O navio ancora numa praia, Serena Cay, que se torna exclusiva dos 1.600 passageiros. Na areia h apenas espreguiadeiras, coqueiros e barraquinhas de artesanato. O mar cristalino. Quem quiser, vai em excurso at o resort local, Casa de Campo. Sem dizer que o seu clube dispensou, humilhantemente, o seu arqueiro Zubizarreta -- depois da derrota para o Milan, 0 a 4, na deciso da Copa da Europa. Chegamos at aqui, claro, tambm graas a Cruyff. S Salinas discordou dessa posio. Dos dez cestinhas da atual temporada, apenas um joga fora do garrafo. Dos cinco primeiros artilheiros, quatro so superpivs. Todos eles estaro esta noite em Minneapolis. Chance imperdvel para aqueles que apreciam enterradas, tocos e a mais primitiva troca de porradas. A NBA nem existia e o profissionalismo no esporte apenas engatinhava quando apareceu o primeiro gigante do basquete norte-americano. Seu nome: George Mikan. Sua virtude: 2,08 m. 
Em torno do gabinete presidencial no Planalto ele quer os dois maiores colaboradores na campanha eleitoral: o amigo, empresrio e secretrio-geral do PSDB Srgio Motta e o ex-reitor da Unicamp e coordenador do programa de governo Paulo Renato de Souza. Cardoso diz que a frente para sua eleio foi formada exclusivamente em torno de um programa. O nmero mais alarmante se pensarmos que daria para abastecer toda a populao brasileira por oito meses, diz Dalmo Rosalen, da Secretaria do Meio Ambiente. H cidades que enfrentam programas de racionamento de gua. Um exemplo a bacia do Vale do Piracicaba (a 170 quilmetros a noroeste de So Paulo). Flach Hoje esto estabelecidas penas de priso, mesmo que eventualmente no sejam cumpridas ou sejam convertidas em prestao de servios comunidade. Mas eu imagino que deveria se dotar, pelo menos o Judicirio, de medidas que possibilitassem a concesso do perdo judicial. Folha -- O senhor acha que o usurio questo de sade, de educao ou de polcia? Flach No terreno das idias, eu priorizo como uma questo de educao e de sade. Eu estou muito preocupado em estimular as atividades preventivas. Nossa preocupao um projeto que estimule no jovem uma vida sem drogas, promovendo-se um comportamento construtivo. A espanhola Arantxa Sanchez, tenista com mlehor desempenho nesta temporada, foi eliminada na primeira rodad do Virginia Slims Masters em Nova York, que rene as 16 melhores jogadoras do ano. Sanchez, n 2 do ranking mundial, perdeu num jogo dramtico para a francesa Julie Halard por 6/2, 1/6 e 7/6, com 7/2 no tie-break decisivo. Na 22 posio do ranking, Halard foi empurrada pelo pblico no Madison Square Garden. Se for uma greve s por grevismo, da prejudica a quem favorvel a isso. A, poderia afetar o Lula, no a mim, disse FHC. O candidato tucano a presidente deu essa declarao ontem tarde, depois de discursar para cerca de 550 empresrios na hora do almoo (leia texto nesta pgina). A PMD (psicose manaco-depressiva) uma doena psiquitrica que leva a uma alterao abrupta do comportamento. As pessoas com PMD oscilam entre dois plos. Tm fases de depresso e de euforia (mania). Nem todos passam pelas duas fases. Os irmos Osny Silveira Neto e Guilherme Silveira abrem hoje o bar Horcio no Itaim. Jos Arthur Giannotti, Marilena Chaui, Gloria Kalil e Jorge da Cunha Lima faziam parte de platia-cabea que acompanhou anteontem no Masp a palestra de Claude Lefort apresentado por Srgio Cardoso. Essa poca marca a formao da base da moral samurai, resultado da disciplina fsica e mental do zen-budismo, dos ditames do confuncionismo e do esprito militarista reinante. Os EUA entram na vida japonesa em meados do sculo 19, quando obrigam o shogun a assinar um tratado de comrcio. Esse sinal de fraqueza associado a crises internas levam queda do sistema em 1867. Um ano depois, a capital passa a ser Tquio. A hidroginstica uma alternativa para quem no tem acompanhante. A professora Cludia Morgado, 25, da academia Competition, recomenda exerccios leves, como a corrida. O importante levantar bem o joelho, manter o ritmo e encostar o calcanhar no cho a cada movimento, diz. Primeiro foi uma oportunidade de esclarecer muitos pontos que eu no conhecia antes. Mesmo entendendo suas explicaes sobre o programa de governo, entendo que ele tinha que ter um programa de governo, que vai dar um meio de cobrar depois. Um programa um planejamento para se atingir certas metas. A falta de um programa poderia dar a impresso de no ter metas. Gostei da sua posio sobre a religio. 
Mesmo criticado, ele defende sua posio claramente em favor dos evanglicos e de seu ponto de vista religioso. Em muitos momentos ele simplesmente deixou de se posicionar. Talvez ele no tenha idias elaboradas sobre os problemas do Estado. Ele foi mais direto, mais atacante, deu respostas mais profundas e melhores. Minha expectativa no foi atendida em vrios pontos. Redes, multimdia, sistemas operacionais ou informtica na educao. Esses so alguns dos temas que estaro sendo discutidos no Congresso da Fenasoft, promovido paralelamente feira. O congresso dividido em trs temas, com o objetivo de atender de profissionais de informtica a executivos e empresrios interessados em novas tecnologias, pasando pelo usurio final. O tcnico da Holanda, Dick Advocaat, disse que sua preocupao anular o esquema defensivo da Irlanda. Ele afirmou que pretende reforar seu meio-campo, mas no adiantou a escalao da equipe. Folha -- O senhor foi indiciado em inqurito da Polcia Federal por manipulao de arbitragem, estelionato e formao de quadrilha. O senhor teme ser condenado? Miranda -- Em relao a mim isso no vai dar absolutamente em nada. No sei em relao a os outros indiciados. Me indiciaram precipitadamente, aodadamente. coisa orquestrada, dirigida. Com pregaes que duram at dois minutos. Os candidatos da Bahia esto encontrando certa dificuldade em contratar cantores para animar suas campanhas. O maior aumento aconteceu em direito (mais 366 candidatos) e a maior queda, em engenharia (menos 980). O curso que mais cresceu foi odontologia na USP em Bauru 83,5%. O nmero total de candidatos no vestibular caiu em relao ao ano passado. De 140.518 inscritos, a Fuvest registrou 139.369. A inteno de Fleury vender as aes para equilibrar as finanas e comear a pagar a dvida do Estado, estimada em R$ 31 bilhes. Segundo a assessoria de imprensa do governador, Fleury j pediu um levantamento de preo das aes, junto a consultores especializados, para estabelecer o valor mnimo das aes, que vo ser levadas a leilo. O ministro da Fazenda, Rubens Ricupero, disse que o resultado do IPC-r no uma surpresa. Ele explicou que o ndice foi calculado do dia 16 de junho (ainda sobre cruzeiros reais) ao dia 14 deste ms. O IPC-r de julho, segundo o ministro, tem muito pouco da nova moeda. Normalmente ns utilizamos dados histricos sobre a produtividade em cada regio, alm de informaes de agricultores, diz. Segundo Formaggio, esto em estudo outras formas de obter a produtividade das culturas. Ele estima em dez horas o tempo necessrio para classificar as reas e culturas plantadas em uma extenso de 180 quilmetros. O prefeito de Juazeiro, Manoel Salviano (PSDB), disse que qualquer funcionrio que tenha participado da recepo foi espontaneamente, sem nenhuma presso da prefeitura. Acompanhada de Renata Queirz, mulher do candidato do PSDB ao governo do Cear, Tasso Jereissati, e de Patrcia Gomes, mulher do ministro da Fazenda, Ciro Gomes, Ruth Cardoso visitou trs cidades cearenses antes de Juazeiro. Entre as mais de 140 participantes da feira, estavam presentes companhias como a Xerox, Kodak, QMS, Digital e Pennant, empresa da IBM para a rea de impresso. A Kodak, por exemplo, apresentou as impressoras/copiadoras ColorEdge 1.560 e 1.565, capazes de copiar em cores frente e verso de documentos de vrias pginas. Podem produzir pequenas brochuras, mala direta, folhetos, realizando at sete impresses por minuto. Kim E sobre como eu fiquei totalmente nua o tempo todo. Assista ao filme de novo: voc no vai me ver completamente nua a no ser numa cena, no final do strip tease. 
E por trs ... Est tudo na sua cabea. Como quando eu conheci Alec. Jurava que ele usava um chapu de cowboy. Alec Nunca usei um chapu daqueles na vida! Nasci em Lono Island. O que um chapu de cowboy estaria fazendo ali? Apesar de haver anunciado que o treino da Bulgria seria aberto ao pblico, os dirigentes mudaram de idia pouco antes do incio do treinamento e proibiram a presena de torcedores no campo da Southern Methodist University, em Dallas. Os dirigentes da seleo pediram aos seguranas da universidade que afastassem o pblico, composto em sua maior parte por jovens, que j se preparava para acompanhar os treinos. noite, o projeto prev aulas de alfabetizao para jovens de 14 a 22 anos nos Cieps. Na eleio de 89, Leonel Brizola prometia a construo de 10 mil Cieps. No projeto no h definio de quantos podem ser construdos. O custo mdio de cada um dos Cieps no Rio de Janeiro foi de cerca de US$ 1 milho. Nossa preocupao promover o desenvolvimento. E como no h crescimento sem educao, a causa das causas para ns a elevao do nvel educacional do povo brasileiro. Claro sem descuidar de tudo mais, mas vendo com clareza a questo das prioridades, declara Brizola. Em Manaus, o representante da famlia Graa Manuel Tavares da Graa, candidato derrotado a deputado estadual pelo PP. Paixo irmo de Jos Tavares da Graa, preso em agosto de 93 em Belm (PA) com 435 quilos de cocana. tambm primo de Curica e de Floriano Graa. Fui procurado pelos ladres para comprar a carga. Fomos eu, o Violim, e os investigadores Gilberto Brito, Euripedes Tozzo, Mauro, Marco e o Adner. Prenderam todos e apreenderam a mercadoria. Eles desviaram quatro toneladas da carga. Quatro Kombis foram para o Deic. S isso foi devolvido para a Riachuelo. No realismo moderno de John Cassavettes (1929-89), por exemplo, o ator construa o personagem em tempo real diante da cmera, baseado no improviso. Cada hesitao, cada silncio, cada desvio do olhar era um ganho na contabilidade do sublime que o diretor-autor planejava. Na semana passada, o jogador Buchwald criticou a escalao de Matthaeus como lbero, pedindo que ele jogasse no meio-campo. O tcnico da Coria, Kim Ho, afirmou que, no primeiro tempo, a equipe estava intimidada pela Alemanha, o que natural, pois eles so campees do mundo. O secretrio-geral do Conselho de Segurana da ONU, Boutros Boutros-Ghali, pediu que as potncias que negociam um tratado de paz com as faces em luta na Bsnia formem uma fora-tarefa para agir na regio. Boutros-Ghali teme que as potncias se retirem e suspendam sanes contra a Srvia pondo em risco outras partes da ex-Iugoslvia se o plano for aceito. Folha -- Qual o segredo da vitria? Gilbert -- Acho que Andre usou essa mesma presso a seu favor. Ele funciona desse jeito. Se voc bate nele, o revide vem forte, pode apostar. Alguns tenistas so assim, usam sua carga emocional como combustvel. Haver mais e melhores escolas, laboratrios, universidades e bolsas de estudo. Pequenas e mdias empresas, Foras Armadas, deficientes fsicos e artistas tero apoio e recursos. Segundo FHC, o Brasil possui esses recursos, muitos de eles naturais, e vantagens comparativas para realizar essas metas. Segundo ele, o tubo era independente do sistema de propulso do submarino e o vapor que escapou no radiativo. O Emeraude no carrega armamento nuclear. A energia nuclear usada s para movimentar a embarcao. Entre cada parte do submarino h isolamento, evitando que acidentes numa parte atinjam as outras. 
O relaes pblicas da corporao, tenente-coronel Fernando Belo, disse que o encontro foi uma oportunidade para agradecer e elogiar a colaborao da PM. Belo negou que na reunio tenha sido discutida a incorporao aos quadros da PM de cerca de 2.000 soldados do Exrcito. A possibilidade foi levantada em encontro do governador eleito, Marcello Alencar (PSDB), com oficiais do CML. As emissoras se negaram a se retratar e o juiz ameaou terminar com a cobertura do julgamento, sem especificar como. Ito pode anunciar hoje que a transmisso est proibida. A lei do Estado da Califrnia d ao juiz de cada caso autoridade para impedir a presena de cmaras no tribunal se achar que ela prejudica a Justia. ANGOLA, UMA TRAGDIA ESQUECIDA -- O fotgrafo Andr Penner expe 29 fotos coloridas que retratam o cotidiano da guerra civil em Angola. Abertura hoje s 19h30. Seg a sex das 9h30 s 18h30. Preo das fotos: US$ 300. At 17 de junho. BASTIDORES DOS MUSEUS -- O trabalho feito pelos fotgrafos Eduardo Castanho, Saul Queiroz, Rogrio Voltan, Ricardo Hantzchel, Fausto Chermont e Eliana Lopes retrata o que as pessoas no vem ao visitar um museu. Na Anhanguera, que une So Paulo a Ribeiro Preto, costuma haver neblina pesada entre o km 21 e o km 43, tambm perto de Perus e Cajamar. Isto ocorre principalmente noite e a visibilidade pode chegar a zero. A Rodovia Castelo Branco, que segue para o oeste do Estado, apresenta vrios pontos com neblina. O mais perigoso na regio da Serra das Conchas, entre o km 129 e o km 162, logo aps Tatu, para quem vai para o interior. O paulistano no acredita em queda de inflao nos prximos meses. O ndice deve aumentar, segundo 45% dos entrevistados pelo DataFolha. A impresso de que o ndice de alta dos preos cai se restringe a 27% das respostas. Tudo fica como est na opinio de 23%, enquanto 5% no sabem responder. A derrota (1 a 0) para a fraca seleo dos Estados Unidos conturbou o ambiente da seleo mexicana. Um grupo de torcedores arremessou ovos podres e insultos contra o time, na volta para a Cidade do Mxico na noite de domingo passado. Nascidos para perder. Segundo a agncia de noticias Notimex, a reao pode ser uma amostra de que como vai reagir a torcida se o Mxico no se classificar para a segunda fase da Copa. Adoraria ser candidata. Tenho certeza de que se pudesse concorrer a algum cargo, me elegeria. Eu adoro poltica, confessa Mercedes Rossi de Almeida, 73, a me do candidato ao governo de So Paulo Francisco Rossi (PDT). O painel vai funcionar das 8h s 22h. O pblico alvo so os trs milhes de pedestres que circulam diariamente na regio. Empresa funciona das 9h s 19h, diariamente. Na Samp, dois titulares esto fora de combate: o zagueiro Marco Rossi e o armador srvio Jugovic. Pior, o mister sueco Sven-Goran Eriksson tambm no poder contar com o garoto Bertarelli, contratura na coxa direita. Retorna ao banco o bom volante eslovnio Katanec. E, depois de uma semana com gripe, recupera a sua forma o holands Ruud Gullit, artilheiro do time com 14 tentos. Na terceira colocao, 34 pontos, a Juventus de Turim visita o Genoa, 22, no limiar do rebaixamento. A Juve vive a transio para uma nova temporada. Acabou-se a gesto do ex-craque Giampiero Boniperti na administrao do clube. Em seu lugar, vai assumindo um outro ex-atleta, Roberto Bettega. Com Boniperti se despede o tcnico Giovanni Trapattoni, que ceder o posto a Marcello Lippi, hoje no Napoli. A torcida da Senhora se conforta com a melhora de Roberto Baggio, que no ter de operar o joelho. 
O projeto vem sendo implementado, em carter experimental, em trs escolas: Morumbi, Visconde de Porto Seguro (particulares) e Godofredo Furtado (estadual). O programa ter ao integrada com pais e alunos atravs da formao de grupos de informao, diz Antonio Carlos Gomes da Silva, superintendente do HC. 4) A realizao no Cairo (Egito) da Conferncia Internacional sobre Populao e Desenvolvimento da ONU tem como maior objetivo a divulgao mundial de polticas para o controle de natalidade como forma de reduo da pobreza. A populao mundial est em torno de 5,66 bilhes. VERA LCIA DA COSTA ANTUNES coordenadora de geografia do curso e colgio Objetivo. Com a determinao da Justia, os empregados ficam impedidos de fazer o pagamento das aes. O chefe da gerncia jurdica do BNDES, em So Paulo, Arnaldo Montenegro, 43 disse que ser impetrada uma ao de agravo de instrumento com pedido de reconsiderao ao juiz Andrade Martins. O banco no informou porque o sigilo no foi mantido tambm para os grupos estrangeiros. A relao entre os dois piorou quando Fishel arranjou uma amante, no final dos anos 80. Ele teria deixado a mulher e trazido a amante para Roma. Vrias vezes os dois foram pegos fazendo amor na minha clnica. Eu tolerei pois no quis me intrometer na vida privada dos meus funcionrios, disse Antinori, para quem Fishel era um excelente tcnico. Era um miservel quando chegou. Em poucos anos ganhou mais de US$ 1 milho. Aps vrias divergncias com Fishel, Antinori o demitiu em 1990. Nessa poca, Fishel teria conhecido o ginecologista britnico Robert Winston, a quem convidou para um encontro cientfico promovido por Antinori. O cineasta italiano Franco Zefirelli revelou que, para iludir o servio militar durante o perodo fascista, se tornou um partisan e acabou matando um soldado alemo. As sinalizaes que tem dado o presidente eleito, Fernando Henrique Cardoso, sobre a reforma patrimonial do Estado so auspiciosas. Mas tambm preciso a sociedade mobilizar-se para sensibilizar o futuro Congresso Nacional na promoo de reformas constitucionais indispensveis ao xito das mudanas pretendidas. O patrimnio pblico no fetiche a ser adorado e sim bem a ser utilizado principalmente em favor de quem no tem acesso a escolas, nem assistncia mdica, nem a empregos. Abram Szajman, 58, presidente da Federao do Comrcio do Estado de So Paulo e do Conselho Deliberatico do Sebrae-SP (Servio Brasileiro de Apoio s Micro e Pequenas Empresas). Alm de Mauro Salles que surpreendeu a galera ao revelar seu ct de fotgrafo profissional no nico da carreira, o foco principal do encontro foi a mulher de J.R.Duran, Alexandra Brochen. nltk-3.7/nltk/test/framenet.doctest000066400000000000000000000244151420073152400174560ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. For license information, see LICENSE.TXT ======== FrameNet ======== The FrameNet corpus is a lexical database of English that is both human- and machine-readable, based on annotating examples of how words are used in actual texts. FrameNet is based on a theory of meaning called Frame Semantics, deriving from the work of Charles J. Fillmore and colleagues. The basic idea is straightforward: that the meanings of most words can best be understood on the basis of a semantic frame: a description of a type of event, relation, or entity and the participants in it. For example, the concept of cooking typically involves a person doing the cooking (Cook), the food that is to be cooked (Food), something to hold the food while cooking (Container) and a source of heat (Heating_instrument). 
In the FrameNet project, this is represented as a frame called Apply_heat, and the Cook, Food, Heating_instrument and Container are called frame elements (FEs). Words that evoke this frame, such as fry, bake, boil, and broil, are called lexical units (LUs) of the Apply_heat frame. The job of FrameNet is to define the frames and to annotate sentences to show how the FEs fit syntactically around the word that evokes the frame. ------ Frames ------ A Frame is a script-like conceptual structure that describes a particular type of situation, object, or event along with the participants and props that are needed for that Frame. For example, the "Apply_heat" frame describes a common situation involving a Cook, some Food, and a Heating_Instrument, and is evoked by words such as bake, blanch, boil, broil, brown, simmer, steam, etc. We call the roles of a Frame "frame elements" (FEs) and the frame-evoking words are called "lexical units" (LUs). FrameNet includes relations between Frames. Several types of relations are defined, of which the most important are: - Inheritance: An IS-A relation. The child frame is a subtype of the parent frame, and each FE in the parent is bound to a corresponding FE in the child. An example is the "Revenge" frame which inherits from the "Rewards_and_punishments" frame. - Using: The child frame presupposes the parent frame as background, e.g the "Speed" frame "uses" (or presupposes) the "Motion" frame; however, not all parent FEs need to be bound to child FEs. - Subframe: The child frame is a subevent of a complex event represented by the parent, e.g. the "Criminal_process" frame has subframes of "Arrest", "Arraignment", "Trial", and "Sentencing". - Perspective_on: The child frame provides a particular perspective on an un-perspectivized parent frame. A pair of examples consists of the "Hiring" and "Get_a_job" frames, which perspectivize the "Employment_start" frame from the Employer's and the Employee's point of view, respectively. To get a list of all of the Frames in FrameNet, you can use the `frames()` function. If you supply a regular expression pattern to the `frames()` function, you will get a list of all Frames whose names match that pattern: >>> from pprint import pprint >>> from operator import itemgetter >>> from nltk.corpus import framenet as fn >>> from nltk.corpus.reader.framenet import PrettyList >>> x = fn.frames(r'(?i)crim') >>> x.sort(key=itemgetter('ID')) >>> x [, , ...] >>> PrettyList(sorted(x, key=itemgetter('ID'))) [, , ...] To get the details of a particular Frame, you can use the `frame()` function passing in the frame number: >>> from pprint import pprint >>> from nltk.corpus import framenet as fn >>> f = fn.frame(202) >>> f.ID 202 >>> f.name 'Arrest' >>> f.definition "Authorities charge a Suspect, who is under suspicion of having committed a crime..." >>> len(f.lexUnit) 11 >>> pprint(sorted([x for x in f.FE])) ['Authorities', 'Charges', 'Co-participant', 'Manner', 'Means', 'Offense', 'Place', 'Purpose', 'Source_of_legal_authority', 'Suspect', 'Time', 'Type'] >>> pprint(f.frameRelations) [ Child=Arrest>, Component=Arrest>, ...] The `frame()` function shown above returns a dict object containing detailed information about the Frame. See the documentation on the `frame()` function for the specifics. You can also search for Frames by their Lexical Units (LUs). The `frames_by_lemma()` function returns a list of all frames that contain LUs in which the 'name' attribute of the LU matches the given regular expression. 
Note that LU names are composed of "lemma.POS", where the "lemma" part can be
made up of either a single lexeme (e.g. 'run') or multiple lexemes (e.g.
'a little') (see below).

    >>> PrettyList(sorted(fn.frames_by_lemma(r'(?i)a little'), key=itemgetter('ID')))
    [, ]

-------------
Lexical Units
-------------

A lexical unit (LU) is a pairing of a word with a meaning. For example, the
"Apply_heat" Frame describes a common situation involving a Cook, some Food,
and a Heating Instrument, and is _evoked_ by words such as bake, blanch, boil,
broil, brown, simmer, steam, etc. These frame-evoking words are the LUs in the
Apply_heat frame. Each sense of a polysemous word is a different LU.

We have used the word "word" in talking about LUs. The reality is actually
rather complex. When we say that the word "bake" is polysemous, we mean that
the lemma "bake.v" (which has the word-forms "bake", "bakes", "baked", and
"baking") is linked to three different frames:

- Apply_heat: "Michelle baked the potatoes for 45 minutes."
- Cooking_creation: "Michelle baked her mother a cake for her birthday."
- Absorb_heat: "The potatoes have to bake for more than 30 minutes."

These constitute three different LUs, with different definitions.

Multiword expressions such as "given name" and hyphenated words like
"shut-eye" can also be LUs. Idiomatic phrases such as "middle of nowhere" and
"give the slip (to)" are also defined as LUs in the appropriate frames
("Isolated_places" and "Evading", respectively), and their internal structure
is not analyzed.

FrameNet provides multiple annotated examples of each sense of a word (i.e.
each LU). Moreover, the set of examples (approximately 20 per LU) illustrates
all of the combinatorial possibilities of the lexical unit.

Each LU is linked to a Frame, and hence to the other words which evoke that
Frame. This makes the FrameNet database similar to a thesaurus, grouping
together semantically similar words.

In the simplest case, frame-evoking words are verbs such as "fried" in:

   "Matilde fried the catfish in a heavy iron skillet."

Sometimes event nouns may evoke a Frame. For example, "reduction" evokes
"Cause_change_of_scalar_position" in:

   "...the reduction of debt levels to $665 million from $2.6 billion."

Adjectives may also evoke a Frame. For example, "asleep" may evoke the
"Sleep" frame as in:

   "They were asleep for hours."

Many common nouns, such as artifacts like "hat" or "tower", typically serve
as dependents rather than clearly evoking their own frames.

Details for a specific lexical unit can be obtained using this class's
`lus()` function, which takes an optional regular expression pattern that
will be matched against the name of the lexical unit:

    >>> from pprint import pprint
    >>> PrettyList(sorted(fn.lus(r'(?i)a little'), key=itemgetter('ID')))
    [, , ...]

You can obtain detailed information on a particular LU by calling the
`lu()` function and passing in an LU's 'ID' number:

    >>> from pprint import pprint
    >>> from nltk.corpus import framenet as fn
    >>> fn.lu(256).name
    'foresee.v'
    >>> fn.lu(256).definition
    'COD: be aware of beforehand; predict.'
    >>> fn.lu(256).frame.name
    'Expectation'
    >>> fn.lu(256).lexemes[0].name
    'foresee'

Note that LU names take the form of a dotted string (e.g. "run.v" or
"a little.adv") in which a lemma precedes the "." and a part of speech (POS)
follows the dot. The lemma may be composed of a single lexeme (e.g. "run") or
of multiple lexemes (e.g. "a little").
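Because frames and LUs are cross-linked, you can also start from a Frame and
enumerate the LUs that evoke it through its ``lexUnit`` mapping, whose keys
are LU names in the same "lemma.POS" form. The short sketch below assumes the
"Apply_heat" frame discussed earlier (the `frame()` function also accepts a
frame name in place of an ID number); the exact LU inventory may vary between
FrameNet data releases:

    >>> from nltk.corpus import framenet as fn
    >>> apply_heat = fn.frame('Apply_heat')
    >>> 'bake.v' in apply_heat.lexUnit
    True
    >>> apply_heat.lexUnit['bake.v'].name
    'bake.v'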
The list of POSs used in the LUs is: v - verb n - noun a - adjective adv - adverb prep - preposition num - numbers intj - interjection art - article c - conjunction scon - subordinating conjunction For more detailed information about the info that is contained in the dict that is returned by the `lu()` function, see the documentation on the `lu()` function. ------------------- Annotated Documents ------------------- The FrameNet corpus contains a small set of annotated documents. A list of these documents can be obtained by calling the `docs()` function: >>> from pprint import pprint >>> from nltk.corpus import framenet as fn >>> d = fn.docs('BellRinging')[0] >>> d.corpname 'PropBank' >>> d.sentence[49] full-text sentence (...) in BellRinging: [POS] 17 tags [POS_tagset] PENN [text] + [annotationSet] `` I live in hopes that the ringers themselves will be drawn into ***** ******* ***** Desir Cause_t Cause [1] [3] [2] that fuller life . ****** Comple [4] (Desir=Desiring, Cause_t=Cause_to_make_noise, Cause=Cause_motion, Comple=Completeness) >>> d.sentence[49].annotationSet[1] annotation set (...): [status] MANUAL [LU] (6605) hope.n in Desiring [frame] (366) Desiring [GF] 2 relations [PT] 2 phrases [text] + [Target] + [FE] + [Noun] `` I live in hopes that the ringers themselves will be drawn into - ^^^^ ^^ ***** ---------------------------------------------- E supp su Event that fuller life . ----------------- (E=Experiencer, su=supp) nltk-3.7/nltk/test/generate.doctest000066400000000000000000000036641420073152400174520ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. For license information, see LICENSE.TXT =============================================== Generating sentences from context-free grammars =============================================== An example grammar: >>> from nltk.parse.generate import generate, demo_grammar >>> from nltk import CFG >>> grammar = CFG.fromstring(demo_grammar) >>> print(grammar) Grammar with 13 productions (start state = S) S -> NP VP NP -> Det N PP -> P NP VP -> 'slept' VP -> 'saw' NP VP -> 'walked' PP Det -> 'the' Det -> 'a' N -> 'man' N -> 'park' N -> 'dog' P -> 'in' P -> 'with' The first 10 generated sentences: >>> for sentence in generate(grammar, n=10): ... print(' '.join(sentence)) the man slept the man saw the man the man saw the park the man saw the dog the man saw a man the man saw a park the man saw a dog the man walked in the man the man walked in the park the man walked in the dog All sentences of max depth 4: >>> for sentence in generate(grammar, depth=4): ... print(' '.join(sentence)) the man slept the park slept the dog slept a man slept a park slept a dog slept The number of sentences of different max depths: >>> len(list(generate(grammar, depth=3))) 0 >>> len(list(generate(grammar, depth=4))) 6 >>> len(list(generate(grammar, depth=5))) 42 >>> len(list(generate(grammar, depth=6))) 114 >>> len(list(generate(grammar))) 114 Infinite grammars will throw a RecursionError when not bounded by some ``depth``: >>> grammar = CFG.fromstring(""" ... S -> A B ... A -> B ... B -> "b" | A ... """) >>> list(generate(grammar)) Traceback (most recent call last): ... RuntimeError: The grammar has rule(s) that yield infinite recursion! nltk-3.7/nltk/test/gensim.doctest000066400000000000000000000117031420073152400171330ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. 
For license information, see LICENSE.TXT ======================================= Demonstrate word embedding using Gensim ======================================= >>> from nltk.test.gensim_fixt import setup_module >>> setup_module() We demonstrate three functions: - Training word embeddings on the Brown Corpus; - Loading the pre-trained model and performing simple tasks; and - Pruning the pre-trained binary model. >>> import gensim --------------- Train the model --------------- Here we train a word embedding using the Brown Corpus: >>> from nltk.corpus import brown >>> train_set = brown.sents()[:10000] >>> model = gensim.models.Word2Vec(train_set) Training the model may take some time, so after it is trained it can be saved and reloaded as follows: >>> model.save('brown.embedding') >>> new_model = gensim.models.Word2Vec.load('brown.embedding') The model maps each word in its vocabulary to an embedding, so we can easily get the vector representation of a word. >>> len(new_model.wv['university']) 100 Gensim already provides several supporting functions for manipulating word embeddings. For example, to compute the cosine similarity between two words: >>> new_model.wv.similarity('university','school') > 0.3 True --------------------------- Using the pre-trained model --------------------------- NLTK includes a pruned sample of a pre-trained model that was trained on 100 billion words from the Google News Dataset. The full model is available from https://code.google.com/p/word2vec/ (about 3 GB). >>> from nltk.data import find >>> word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt')) >>> model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_sample, binary=False) We pruned the model to include only the most common words (~44k words). >>> len(model) 43981 Each word is represented by a vector of 300 dimensions: >>> len(model['university']) 300 Finding the top n words most similar to a target word is simple; the result is a list of n (word, score) pairs. >>> model.most_similar(positive=['university'], topn = 3) [('universities', 0.70039...), ('faculty', 0.67809...), ('undergraduate', 0.65870...)] Finding the word that does not belong in a list is also supported, although implementing this yourself would be straightforward. >>> model.doesnt_match('breakfast cereal dinner lunch'.split()) 'cereal' Mikolov et al. (2013) showed that word embeddings capture many syntactic and semantic regularities. For example, the vector 'King - Man + Woman' is close to 'Queen', and 'Germany - Berlin + Paris' is close to 'France'. >>> model.most_similar(positive=['woman','king'], negative=['man'], topn = 1) [('queen', 0.71181...)] >>> model.most_similar(positive=['Paris','Germany'], negative=['Berlin'], topn = 1) [('France', 0.78840...)] We can visualize the word embeddings using t-SNE (https://lvdmaaten.github.io/tsne/). For this demonstration, we visualize the first 1000 words.
| import numpy as np | labels = [] | count = 0 | max_count = 1000 | X = np.zeros(shape=(max_count,len(model['university']))) | | for term in model.index_to_key: | X[count] = model[term] | labels.append(term) | count+= 1 | if count >= max_count: break | | # It is recommended to use PCA first to reduce to ~50 dimensions | from sklearn.decomposition import PCA | pca = PCA(n_components=50) | X_50 = pca.fit_transform(X) | | # Using TSNE to further reduce to 2 dimensions | from sklearn.manifold import TSNE | model_tsne = TSNE(n_components=2, random_state=0) | Y = model_tsne.fit_transform(X_50) | | # Show the scatter plot | import matplotlib.pyplot as plt | plt.scatter(Y[:,0], Y[:,1], 20) | | # Add labels | for label, x, y in zip(labels, Y[:, 0], Y[:, 1]): | plt.annotate(label, xy = (x,y), xytext = (0, 0), textcoords = 'offset points', size = 10) | | plt.show() ------------------------------ Prune the trained binary model ------------------------------ Here is the supporting code to extract part of the binary model (GoogleNews-vectors-negative300.bin.gz) from https://code.google.com/p/word2vec/ We use this code to get the `word2vec_sample` model. | import gensim | # Load the binary model | model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary = True) | | # Only output word that appear in the Brown corpus | from nltk.corpus import brown | words = set(brown.words()) | print(len(words)) | | # Output presented word to a temporary file | out_file = 'pruned.word2vec.txt' | with open(out_file,'w') as f: | word_presented = words.intersection(model.index_to_key) | f.write('{} {}\n'.format(len(word_presented),len(model['word']))) | | for word in word_presented: | f.write('{} {}\n'.format(word, ' '.join(str(value) for value in model[word]))) nltk-3.7/nltk/test/gensim_fixt.py000066400000000000000000000001111420073152400171370ustar00rootroot00000000000000def setup_module(): import pytest pytest.importorskip("gensim") nltk-3.7/nltk/test/gluesemantics.doctest000066400000000000000000000301221420073152400205100ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. For license information, see LICENSE.TXT ============================================================================== Glue Semantics ============================================================================== .. include:: ../../../nltk_book/definitions.rst ====================== Linear logic ====================== >>> from nltk.sem import logic >>> from nltk.sem.glue import * >>> from nltk.sem.linearlogic import * >>> from nltk.sem.linearlogic import Expression >>> read_expr = Expression.fromstring Parser >>> print(read_expr(r'f')) f >>> print(read_expr(r'(g -o f)')) (g -o f) >>> print(read_expr(r'(g -o (h -o f))')) (g -o (h -o f)) >>> print(read_expr(r'((g -o G) -o G)')) ((g -o G) -o G) >>> print(read_expr(r'(g -o f)(g)')) (g -o f)(g) >>> print(read_expr(r'((g -o G) -o G)((g -o f))')) ((g -o G) -o G)((g -o f)) Simplify >>> print(read_expr(r'f').simplify()) f >>> print(read_expr(r'(g -o f)').simplify()) (g -o f) >>> print(read_expr(r'((g -o G) -o G)').simplify()) ((g -o G) -o G) >>> print(read_expr(r'(g -o f)(g)').simplify()) f >>> try: read_expr(r'(g -o f)(f)').simplify() ... except LinearLogicApplicationException as e: print(e) ... Cannot apply (g -o f) to f. 
Cannot unify g with f given {} >>> print(read_expr(r'(G -o f)(g)').simplify()) f >>> print(read_expr(r'((g -o G) -o G)((g -o f))').simplify()) f Test BindingDict >>> h = ConstantExpression('h') >>> g = ConstantExpression('g') >>> f = ConstantExpression('f') >>> H = VariableExpression('H') >>> G = VariableExpression('G') >>> F = VariableExpression('F') >>> d1 = BindingDict({H: h}) >>> d2 = BindingDict({F: f, G: F}) >>> d12 = d1 + d2 >>> all12 = ['%s: %s' % (v, d12[v]) for v in d12.d] >>> all12.sort() >>> print(all12) ['F: f', 'G: f', 'H: h'] >>> BindingDict([(F,f),(G,g),(H,h)]) == BindingDict({F:f, G:g, H:h}) True >>> d4 = BindingDict({F: f}) >>> try: d4[F] = g ... except VariableBindingException as e: print(e) Variable F already bound to another value Test Unify >>> try: f.unify(g, BindingDict()) ... except UnificationException as e: print(e) ... Cannot unify f with g given {} >>> f.unify(G, BindingDict()) == BindingDict({G: f}) True >>> try: f.unify(G, BindingDict({G: h})) ... except UnificationException as e: print(e) ... Cannot unify f with G given {G: h} >>> f.unify(G, BindingDict({G: f})) == BindingDict({G: f}) True >>> f.unify(G, BindingDict({H: f})) == BindingDict({G: f, H: f}) True >>> G.unify(f, BindingDict()) == BindingDict({G: f}) True >>> try: G.unify(f, BindingDict({G: h})) ... except UnificationException as e: print(e) ... Cannot unify G with f given {G: h} >>> G.unify(f, BindingDict({G: f})) == BindingDict({G: f}) True >>> G.unify(f, BindingDict({H: f})) == BindingDict({G: f, H: f}) True >>> G.unify(F, BindingDict()) == BindingDict({G: F}) True >>> try: G.unify(F, BindingDict({G: H})) ... except UnificationException as e: print(e) ... Cannot unify G with F given {G: H} >>> G.unify(F, BindingDict({G: F})) == BindingDict({G: F}) True >>> G.unify(F, BindingDict({H: F})) == BindingDict({G: F, H: F}) True Test Compile >>> print(read_expr('g').compile_pos(Counter(), GlueFormula)) (, []) >>> print(read_expr('(g -o f)').compile_pos(Counter(), GlueFormula)) (, []) >>> print(read_expr('(g -o (h -o f))').compile_pos(Counter(), GlueFormula)) (, []) ====================== Glue ====================== Demo of "John walks" -------------------- >>> john = GlueFormula("John", "g") >>> print(john) John : g >>> walks = GlueFormula(r"\x.walks(x)", "(g -o f)") >>> print(walks) \x.walks(x) : (g -o f) >>> print(walks.applyto(john)) \x.walks(x)(John) : (g -o f)(g) >>> print(walks.applyto(john).simplify()) walks(John) : f Demo of "A dog walks" --------------------- >>> a = GlueFormula("\\P Q.some x.(P(x) and Q(x))", "((gv -o gr) -o ((g -o G) -o G))") >>> print(a) \P Q.exists x.(P(x) & Q(x)) : ((gv -o gr) -o ((g -o G) -o G)) >>> man = GlueFormula(r"\x.man(x)", "(gv -o gr)") >>> print(man) \x.man(x) : (gv -o gr) >>> walks = GlueFormula(r"\x.walks(x)", "(g -o f)") >>> print(walks) \x.walks(x) : (g -o f) >>> a_man = a.applyto(man) >>> print(a_man.simplify()) \Q.exists x.(man(x) & Q(x)) : ((g -o G) -o G) >>> a_man_walks = a_man.applyto(walks) >>> print(a_man_walks.simplify()) exists x.(man(x) & walks(x)) : f Demo of 'every girl chases a dog' --------------------------------- Individual words: >>> every = GlueFormula("\\P Q.all x.(P(x) -> Q(x))", "((gv -o gr) -o ((g -o G) -o G))") >>> print(every) \P Q.all x.(P(x) -> Q(x)) : ((gv -o gr) -o ((g -o G) -o G)) >>> girl = GlueFormula(r"\x.girl(x)", "(gv -o gr)") >>> print(girl) \x.girl(x) : (gv -o gr) >>> chases = GlueFormula(r"\x y.chases(x,y)", "(g -o (h -o f))") >>> print(chases) \x y.chases(x,y) : (g -o (h -o f)) >>> a = GlueFormula("\\P Q.some x.(P(x) and 
Q(x))", "((hv -o hr) -o ((h -o H) -o H))") >>> print(a) \P Q.exists x.(P(x) & Q(x)) : ((hv -o hr) -o ((h -o H) -o H)) >>> dog = GlueFormula(r"\x.dog(x)", "(hv -o hr)") >>> print(dog) \x.dog(x) : (hv -o hr) Noun Quantification can only be done one way: >>> every_girl = every.applyto(girl) >>> print(every_girl.simplify()) \Q.all x.(girl(x) -> Q(x)) : ((g -o G) -o G) >>> a_dog = a.applyto(dog) >>> print(a_dog.simplify()) \Q.exists x.(dog(x) & Q(x)) : ((h -o H) -o H) The first reading is achieved by combining 'chases' with 'a dog' first. Since 'a dog' requires something of the form '(h -o H)', we must get rid of the 'g' in the glue of 'chases'. We will do this with the '-o elimination' rule. So, x1 will be our subject placeholder. >>> xPrime = GlueFormula("x1", "g") >>> print(xPrime) x1 : g >>> xPrime_chases = chases.applyto(xPrime) >>> print(xPrime_chases.simplify()) \y.chases(x1,y) : (h -o f) >>> xPrime_chases_a_dog = a_dog.applyto(xPrime_chases) >>> print(xPrime_chases_a_dog.simplify()) exists x.(dog(x) & chases(x1,x)) : f Now we can retract our subject placeholder using lambda-abstraction and combine with the true subject. >>> chases_a_dog = xPrime_chases_a_dog.lambda_abstract(xPrime) >>> print(chases_a_dog.simplify()) \x1.exists x.(dog(x) & chases(x1,x)) : (g -o f) >>> every_girl_chases_a_dog = every_girl.applyto(chases_a_dog) >>> r1 = every_girl_chases_a_dog.simplify() >>> r2 = GlueFormula(r'all x.(girl(x) -> exists z1.(dog(z1) & chases(x,z1)))', 'f') >>> r1 == r2 True The second reading is achieved by combining 'every girl' with 'chases' first. >>> xPrime = GlueFormula("x1", "g") >>> print(xPrime) x1 : g >>> xPrime_chases = chases.applyto(xPrime) >>> print(xPrime_chases.simplify()) \y.chases(x1,y) : (h -o f) >>> yPrime = GlueFormula("x2", "h") >>> print(yPrime) x2 : h >>> xPrime_chases_yPrime = xPrime_chases.applyto(yPrime) >>> print(xPrime_chases_yPrime.simplify()) chases(x1,x2) : f >>> chases_yPrime = xPrime_chases_yPrime.lambda_abstract(xPrime) >>> print(chases_yPrime.simplify()) \x1.chases(x1,x2) : (g -o f) >>> every_girl_chases_yPrime = every_girl.applyto(chases_yPrime) >>> print(every_girl_chases_yPrime.simplify()) all x.(girl(x) -> chases(x,x2)) : f >>> every_girl_chases = every_girl_chases_yPrime.lambda_abstract(yPrime) >>> print(every_girl_chases.simplify()) \x2.all x.(girl(x) -> chases(x,x2)) : (h -o f) >>> every_girl_chases_a_dog = a_dog.applyto(every_girl_chases) >>> r1 = every_girl_chases_a_dog.simplify() >>> r2 = GlueFormula(r'exists x.(dog(x) & all z2.(girl(z2) -> chases(z2,x)))', 'f') >>> r1 == r2 True Compilation ----------- >>> for cp in GlueFormula('m', '(b -o a)').compile(Counter()): print(cp) m : (b -o a) : {1} >>> for cp in GlueFormula('m', '((c -o b) -o a)').compile(Counter()): print(cp) v1 : c : {1} m : (b[1] -o a) : {2} >>> for cp in GlueFormula('m', '((d -o (c -o b)) -o a)').compile(Counter()): print(cp) v1 : c : {1} v2 : d : {2} m : (b[1, 2] -o a) : {3} >>> for cp in GlueFormula('m', '((d -o e) -o ((c -o b) -o a))').compile(Counter()): print(cp) v1 : d : {1} v2 : c : {2} m : (e[1] -o (b[2] -o a)) : {3} >>> for cp in GlueFormula('m', '(((d -o c) -o b) -o a)').compile(Counter()): print(cp) v1 : (d -o c) : {1} m : (b[1] -o a) : {2} >>> for cp in GlueFormula('m', '((((e -o d) -o c) -o b) -o a)').compile(Counter()): print(cp) v1 : e : {1} v2 : (d[1] -o c) : {2} m : (b[2] -o a) : {3} Demo of 'a man walks' using Compilation --------------------------------------- Premises >>> a = GlueFormula('\\P Q.some x.(P(x) and Q(x))', '((gv -o gr) -o ((g -o G) -o G))') >>> print(a)
\P Q.exists x.(P(x) & Q(x)) : ((gv -o gr) -o ((g -o G) -o G)) >>> man = GlueFormula('\\x.man(x)', '(gv -o gr)') >>> print(man) \x.man(x) : (gv -o gr) >>> walks = GlueFormula('\\x.walks(x)', '(g -o f)') >>> print(walks) \x.walks(x) : (g -o f) Compiled Premises: >>> counter = Counter() >>> ahc = a.compile(counter) >>> g1 = ahc[0] >>> print(g1) v1 : gv : {1} >>> g2 = ahc[1] >>> print(g2) v2 : g : {2} >>> g3 = ahc[2] >>> print(g3) \P Q.exists x.(P(x) & Q(x)) : (gr[1] -o (G[2] -o G)) : {3} >>> g4 = man.compile(counter)[0] >>> print(g4) \x.man(x) : (gv -o gr) : {4} >>> g5 = walks.compile(counter)[0] >>> print(g5) \x.walks(x) : (g -o f) : {5} Derivation: >>> g14 = g4.applyto(g1) >>> print(g14.simplify()) man(v1) : gr : {1, 4} >>> g134 = g3.applyto(g14) >>> print(g134.simplify()) \Q.exists x.(man(x) & Q(x)) : (G[2] -o G) : {1, 3, 4} >>> g25 = g5.applyto(g2) >>> print(g25.simplify()) walks(v2) : f : {2, 5} >>> g12345 = g134.applyto(g25) >>> print(g12345.simplify()) exists x.(man(x) & walks(x)) : f : {1, 2, 3, 4, 5} --------------------------------- Dependency Graph to Glue Formulas --------------------------------- >>> from nltk.corpus.reader.dependency import DependencyGraph >>> depgraph = DependencyGraph("""1 John _ NNP NNP _ 2 SUBJ _ _ ... 2 sees _ VB VB _ 0 ROOT _ _ ... 3 a _ ex_quant ex_quant _ 4 SPEC _ _ ... 4 dog _ NN NN _ 2 OBJ _ _ ... """) >>> gfl = GlueDict('nltk:grammars/sample_grammars/glue.semtype').to_glueformula_list(depgraph) >>> print(gfl) # doctest: +SKIP [\x y.sees(x,y) : (f -o (i -o g)), \x.dog(x) : (iv -o ir), \P Q.exists x.(P(x) & Q(x)) : ((iv -o ir) -o ((i -o I3) -o I3)), \P Q.exists x.(P(x) & Q(x)) : ((fv -o fr) -o ((f -o F4) -o F4)), \x.John(x) : (fv -o fr)] >>> glue = Glue() >>> for r in sorted([r.simplify().normalize() for r in glue.get_readings(glue.gfl_to_compiled(gfl))], key=str): ... print(r) exists z1.(John(z1) & exists z2.(dog(z2) & sees(z1,z2))) exists z1.(dog(z1) & exists z2.(John(z2) & sees(z2,z1))) ----------------------------------- Dependency Graph to LFG f-structure ----------------------------------- >>> from nltk.sem.lfg import FStructure >>> fstruct = FStructure.read_depgraph(depgraph) >>> print(fstruct) # doctest: +SKIP f:[pred 'sees' obj h:[pred 'dog' spec 'a'] subj g:[pred 'John']] >>> fstruct.to_depgraph().tree().pprint() (sees (dog a) John) --------------------------------- LFG f-structure to Glue --------------------------------- >>> fstruct.to_glueformula_list(GlueDict('nltk:grammars/sample_grammars/glue.semtype')) # doctest: +SKIP [\x y.sees(x,y) : (i -o (g -o f)), \x.dog(x) : (gv -o gr), \P Q.exists x.(P(x) & Q(x)) : ((gv -o gr) -o ((g -o G3) -o G3)), \P Q.exists x.(P(x) & Q(x)) : ((iv -o ir) -o ((i -o I4) -o I4)), \x.John(x) : (iv -o ir)] .. see gluesemantics_malt.doctest for more nltk-3.7/nltk/test/gluesemantics_malt.doctest000066400000000000000000000050461420073152400215340ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. For license information, see LICENSE.TXT .. see also: gluesemantics.doctest ============================================================================== Glue Semantics ============================================================================== >>> from nltk.test.gluesemantics_malt_fixt import setup_module >>> setup_module() >>> from nltk.sem.glue import * >>> nltk.sem.logic._counter._value = 0 -------------------------------- Initialize the Dependency Parser -------------------------------- >>> from nltk.parse.malt import MaltParser >>> tagger = RegexpTagger( ... [('^(John|Mary)$', 'NNP'), ... 
('^(sees|chases)$', 'VB'), ... ('^(a)$', 'ex_quant'), ... ('^(every)$', 'univ_quant'), ... ('^(girl|dog)$', 'NN') ... ]).tag >>> depparser = MaltParser(tagger=tagger) -------------------- Automated Derivation -------------------- >>> glue = Glue(depparser=depparser) >>> readings = glue.parse_to_meaning('every girl chases a dog'.split()) >>> for reading in sorted([r.simplify().normalize() for r in readings], key=str): ... print(reading.normalize()) all z1.(girl(z1) -> exists z2.(dog(z2) & chases(z1,z2))) exists z1.(dog(z1) & all z2.(girl(z2) -> chases(z2,z1))) >>> drtglue = DrtGlue(depparser=depparser) >>> readings = drtglue.parse_to_meaning('every girl chases a dog'.split()) >>> for reading in sorted([r.simplify().normalize() for r in readings], key=str): ... print(reading) ([],[(([z1],[girl(z1)]) -> ([z2],[dog(z2), chases(z1,z2)]))]) ([z1],[dog(z1), (([z2],[girl(z2)]) -> ([],[chases(z2,z1)]))]) -------------- With inference -------------- Checking for equality of two DRSs is very useful when generating readings of a sentence. For example, the ``glue`` module generates two readings for the sentence *John sees Mary*: >>> from nltk.sem.glue import DrtGlue >>> readings = drtglue.parse_to_meaning('John sees Mary'.split()) >>> for drs in sorted([r.simplify().normalize() for r in readings], key=str): ... print(drs) ([z1,z2],[John(z1), Mary(z2), sees(z1,z2)]) ([z1,z2],[Mary(z1), John(z2), sees(z2,z1)]) However, it is easy to tell that these two readings are logically the same, and therefore one of them is superfluous. We can use the theorem prover to determine this equivalence, and then delete one of them. A particular theorem prover may be specified, or the argument may be left off to use the default. >>> readings[0].equiv(readings[1]) True nltk-3.7/nltk/test/gluesemantics_malt_fixt.py000066400000000000000000000003371420073152400215470ustar00rootroot00000000000000def setup_module(): import pytest from nltk.parse.malt import MaltParser try: depparser = MaltParser() except (AssertionError, LookupError) as e: pytest.skip("MaltParser is not available") nltk-3.7/nltk/test/grammar.doctest000066400000000000000000000035301420073152400172760ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. For license information, see LICENSE.TXT =============== Grammar Parsing =============== Grammars can be parsed from strings: >>> from nltk import CFG >>> grammar = CFG.fromstring(""" ... S -> NP VP ... PP -> P NP ... NP -> Det N | NP PP ... VP -> V NP | VP PP ... Det -> 'a' | 'the' ... N -> 'dog' | 'cat' ... V -> 'chased' | 'sat' ... P -> 'on' | 'in' ... """) >>> grammar >>> grammar.start() S >>> grammar.productions() [S -> NP VP, PP -> P NP, NP -> Det N, NP -> NP PP, VP -> V NP, VP -> VP PP, Det -> 'a', Det -> 'the', N -> 'dog', N -> 'cat', V -> 'chased', V -> 'sat', P -> 'on', P -> 'in'] Probabilistic CFGs: >>> from nltk import PCFG >>> toy_pcfg1 = PCFG.fromstring(""" ... S -> NP VP [1.0] ... NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15] ... Det -> 'the' [0.8] | 'my' [0.2] ... N -> 'man' [0.5] | 'telescope' [0.5] ... VP -> VP PP [0.1] | V NP [0.7] | V [0.2] ... V -> 'ate' [0.35] | 'saw' [0.65] ... PP -> P NP [1.0] ... P -> 'with' [0.61] | 'under' [0.39] ... """) Chomsky Normal Form grammar (Test for bug 474) >>> g = CFG.fromstring("VP^ -> VBP NP^") >>> g.productions()[0].lhs() VP^ Grammars can contain both empty strings and empty productions: >>> from nltk.grammar import CFG >>> from nltk.parse.generate import generate >>> grammar = CFG.fromstring(""" ... S -> A B ... 
A -> 'a' ... # An empty string: ... B -> 'b' | '' ... """) >>> list(generate(grammar)) [['a', 'b'], ['a', '']] >>> grammar = CFG.fromstring(""" ... S -> A B ... A -> 'a' ... # An empty production: ... B -> 'b' | ... """) >>> list(generate(grammar)) [['a', 'b'], ['a']] nltk-3.7/nltk/test/grammartestsuites.doctest000066400000000000000000000062001420073152400214300ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. For license information, see LICENSE.TXT ========================== Test Suites for Grammars ========================== Sentences in the test suite are divided into two classes: - grammatical (*accept*) and - ungrammatical (*reject*). If a sentence should parse according to the grammar, the value of ``trees`` will be a non-empty list. If a sentence should be rejected according to the grammar, then the value of ``trees`` will be ``None``. >>> from nltk.parse import TestGrammar >>> germantest1 = {} >>> germantest1['doc'] = "Tests for person agreement" >>> germantest1['accept'] = [ ... 'ich komme', ... 'ich sehe mich', ... 'du kommst', ... 'du siehst mich', ... 'sie kommt', ... 'sie sieht mich', ... 'ihr kommt', ... 'wir kommen', ... 'sie kommen', ... 'du magst mich', ... 'er mag mich', ... 'du folgst mir', ... 'sie hilft mir', ... ] >>> germantest1['reject'] = [ ... 'ich kommt', ... 'ich kommst', ... 'ich siehst mich', ... 'du komme', ... 'du sehe mich', ... 'du kommt', ... 'er komme', ... 'er siehst mich', ... 'wir komme', ... 'wir kommst', ... 'die Katzen kommst', ... 'sie komme', ... 'sie kommst', ... 'du mag mich', ... 'er magst mich', ... 'du folgt mir', ... 'sie hilfst mir', ... ] >>> germantest2 = {} >>> germantest2['doc'] = "Tests for number agreement" >>> germantest2['accept'] = [ ... 'der Hund kommt', ... 'die Hunde kommen', ... 'ich komme', ... 'wir kommen', ... 'ich sehe die Katzen', ... 'ich folge den Katzen', ... 'ich sehe die Katzen', ... 'ich folge den Katzen', ... 'wir sehen die Katzen', ... 'wir folgen den Katzen' ... ] >>> germantest2['reject'] = [ ... 'ich kommen', ... 'wir komme', ... 'der Hunde kommt', ... 'der Hunde kommen', ... 'die Katzen kommt', ... 'ich sehe der Hunde', ... 'ich folge den Hund', ... 'ich sehen der Hunde', ... 'ich folgen den Hund', ... 'wir sehe die Katzen', ... 'wir folge den Katzen' ... ] >>> germantest3 = {} >>> germantest3['doc'] = "Tests for case government and subcategorization" >>> germantest3['accept'] = [ ... 'der Hund sieht mich', ... 'der Hund kommt', ... 'ich sehe den Hund', ... 'ich helfe dem Hund', ... ] >>> germantest3['reject'] = [ ... 'ich sehe', ... 'ich helfe', ... 'ich komme den Hund', ... 'ich sehe den Hund die Katzen', ... 'du hilfst mich', ... 'du siehst mir', ... 'du siehst ich', ... 'der Hunde kommt mich', ... 'die Hunde sehe die Hunde', ... 'der Hund sehe die Hunde', ... 'ich hilft den Hund', ... 'ich hilft der Hund', ... 'ich sehe dem Hund', ... ] >>> germantestsuites = [germantest1, germantest2, germantest3] >>> tester = TestGrammar('grammars/book_grammars/german.fcfg', germantestsuites) >>> tester.run() Tests for person agreement: All tests passed! Tests for number agreement: All tests passed! Tests for case government and subcategorization: All tests passed! 
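The same three-key dictionary format (``doc``, ``accept``, ``reject``) works for suites of your own. A minimal sketch, reusing the ``german.fcfg`` grammar from above (the suite name is illustrative; the sentences are taken from the suites above):

 | from nltk.parse import TestGrammar
 |
 | # One tiny suite: sentences that must parse vs. sentences that must not.
 | smoke_suite = {
 |     'doc': 'Smoke test',                          # illustrative name
 |     'accept': ['ich komme', 'du kommst'],         # should yield non-empty parse lists
 |     'reject': ['ich kommst'],                     # should yield no parses
 | }
 | tester = TestGrammar('grammars/book_grammars/german.fcfg', [smoke_suite])
 | tester.run()                                      # prints "Smoke test: All tests passed!" on success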
nltk-3.7/nltk/test/images/twitter_app1.tiff, nltk-3.7/nltk/test/images/twitter_app2.tiff [binary TIFF image data omitted]
bh/ah6#nxov'&r27s?tWINh7TYuF]0#5EF GEw(FsT/.!h[Q`*|SD>adP[5_KL\ dZ5 :xi`r:;[DsFRsbxat0p CeP<͌IswCFE/ZӭzڡJ }_R o7Qx{{XWzsǩU=Z $vnV `a6+JUcJaW J lavG fE`PC9;?4uPU PY5-Y!5/[ۜa[|stDA\@Т F*gك~;* ;aS4̧·NL;5[m.Tż@AR3kN@hA,)\%ل3 !f56B@ qg|@PcGkR- 6 <>A^H!rIQH o"!+@1^5g\%/At _ޅ*ר^  3gzU p?`aᰙx^VW"fz)jbsڝ |Xg*N ws8ZAh5 @>Uh=D L`?N62%5NF @qV "C*gS#J $h*Հi\(hx-ih) `] UȌQI< ,@`;P/#^$Xzn"_(iV@^dJWP- ]2 `X>"z6 m|zX+@yHPB,0~y*' a:g | +F+`i0..i`%rN!B8L@(0 Cd旞SȈ-` FJV \c`b%H x)Yp (%4 $2{ x@H º?gHF@"g ()r e~$I"L")RX& 8&! i]ИFiRx^ BHPf 'J+KԹ/L`V--Q)s"# ˴ e1;2 /6eBAР !+@m`@ WHezq␾3I^(is03@m+NԵj#A' ݔ;}P 'j٢Pz@)Da4d.Ʋ潰"[+RogP% it% DQIGRX um_Xhsu^WbXE.fXbcC6PeAWt,+&Z$tQltoXzp'(2)r/Λ.ۺ/†Qkq\3tz$'T4ȴ|=6̜FȬ1@ @zKiuE쾛lB  V8HN4adxHВ *HꠙiQ0 nV ZT@])Q0<v"nSo^%ppUO(ApqH(0#!`њt]O#yAP#sAJXX]5;hx$EWK"aZ,L7,btn j+ܜtN=5q3]m(PlrdW Ԙ ghBU@&=(Dp?U@]HA1G }Z;Ii`Z7*C"X}0fކj&Ŵ24YHLb/#QDqt7@7BQ(AK*`˔UY9jW0e&"eLPbu>Wq'o~N(0XBml(DtBHQ SnnKi2&R+F8MvOIA(eSJyQLV 3m%7tT#q16l&|5OYzXy4^V `4y=!#,SO„t2"hdRZioɽ8't@O%DU2H q޷)ohF)X@#Q@-lAq#Ęb|QqV+Řbaq3Ƙc|qq8ǘ?Ș#b1(e;5ФC0%t)t*y$e^ + C a&">p% HV;tbhC_@bA4 :x:-{н7, ˕9c9b9)A, S%BUD1@C` p8@` 8^ۓ73<3зAaSIq@Da\:x&)5{<+&C8c h| P\H̄ %\lbL;  >czX-(jcYJgJc?Ťl2L)?DƈJ< /9V5p7(b C0y)*;;Ɇ:C$D4@ŵh RnFp %\r ZkNs7B47X7sxyX#')0X=@PjDPc 1"X ]?^Gu_9`{U|88 #~WWX XX-X;1v 2"CLax$TTS&X63$  Z"ll_ŠH[xcXTX-`:LY\"~,Ȳu'ɆpS`{ O`D}(|hō vE#Մ5Bv \'L}1dw/{  (h]hP_Dإ͒TCOQ$p0| tbɇ@ ͲI[c[xې@`5v}Ι (,8}؈3G0{) ұ PSژ<Pވޝ"[X=} t3*Xrfp((_y0]U]F%sNY֍lL8V貂0-R3[ s0 U}DӖY(ٛٴwY͝Zd*=YCE_8T\x_H/Zd?(EIXh!I Te1e'ؚ(Hyl XrY__ߨ\_RGy;H+V(؉ v-$%%5Ÿ2\A0w"^oPsKb/2c1/K[pZ[PrE^65N.CUy(@Sm @u 3@wP߀6cf4p~ PԁA  .C˅1/J9hh ^}V,: Ghm{C=͠cdU,dZEI!Aa{P ;k& Fʧ9K[f~pr&{a] e%&>WڦLڠХ8QE`}C X (jXcVcp)*=nٛ6),hP`*Zsna~" j꾬6k>6G a2f va(!%m؈AҸcsSㅙY:YŝYY`Vd/r"ƽkl(%*XeDoP{`- 0Q}}oc2n>$ pX` Nfffc Nc>c]}y!=-]vho`2Z͐@ a a `ij>k`hUa+0nbf7߲Qfwnogq5?cwewpƲ!8 ]ac(Y}㕚پ;qt=ho2ږXY<t[f]^`wuS&]%t-]֎n~H{I(~h&i.8t u!+wWy۞. f2nv^0^& ^E:WG͉O_o1EX" 1! =- *b9J:P(T 6˨ʠ#^]7C it 0lPHB6 ExC-:Y@QAh%t.@ F5$a In4pd3ѐ& ItA E+itC(qݭ؂ْŤPd4qP^!WۙRIt6K1! P@EHp "EAA ρh@ri*|:6 FJVQBh`iEf!(kl+ılkg2 &ʲj3l>дpV/0 )B^#SI EJ8q Htnn@i ȮBa@Ul*mZH+m#@- CF!B@`ky`Y* 5យL&ω0u "d:$ H%rT%v'µEV|R^X(0q[]؎i`dl'I|(@)dXo@3P CFUed?O~$F1JHkRYz`.NZۑ-_Oqn&rai(yI\%ܒ@o4MSbW7N:NggԶ- U yf uy@rB=4? e٬YEcg. z@Aa aVbM/O׼#ig aTz JZur/).1nn~,4КGA霩&_| > DX Q(u zIc`' B*`HY  Zm^B&H5Ny@Oj-L5VZ]k큖=F~ S@#K!$4=R, S=aR 5gPDKmne˿]H~'ŁA"`$H0\w tB9 :/A4Kô\ iMI&ڛzpN"N@g% )8dDkAA~GbD1$7\hDud}i(YL,*If9H)%(˘C YrrN?(% p \r]6ޜS } [v3ZZH-t (T"\^h酥SʉS*sKt;"(2lA] d"0' X<3=4;YEhfQ9G`&2!=#Ahfv4,ƨIF0m҉BV=- .S5@0p"TM^IwfdTH\ ?s5I͹ӂnT!MTY+)U. i:*N b(D]4 : Dn e\J\Jezp%H;*HeG=@EL9B_L)0,"eW:;bbK/UDQKX 2mtt\"q@X㎶:< j!ó#=Y6:D + b)/ i x`C )CpQGby^ʡ{ꛜT՚s[;d JFXD q+>; :r{SCoT&7QXHdA2g͌voѽ/\ p8@r.N&܆oTsE@%e*C\Vђ<ҀP@=G>=9 :9ЏYbckĂcm鴯BPPrʟ9Gݵ#$hGH0.9*A  U_u]v'eݟvݳ?@üVǒS4,G?ȮAN(8Y0ZԸSuI&J/*eP PFC6`.a,P%ptkakqMD.- 4@E@$oFB)%v |`Q%avd8 BG[1$EC1|%pJo+ɚ@1Q"v 8*cށqT4ȢjP4`00P P4Л814&K6ސeJ-a 0֪4 ` Cjz( 9(A`K]Iwo#"7q8 D‘%zc!V1),jNp4؈.18d@ P@19@0/PSFvbVB`Hn`460M2 S Q ъ4ǘRaI+Zsrty3 >S4;)?@KN4i P '3cHKP&\I,@R)x @~TatfoZt ꒔d4`%A#J2VK`34%..gPY?7h.#_q4.1_T17.KQE!ĬH@9R0)Rn.6%?N&M& h"u ]KH4:t - O(II( (yq4&h@cjb!MH @1`1jS 0/TAu^4B$7'W1w ,v<1CdYYIZZZ] @P/7՚SE4MH3SH[5V![oB)b06V%`\ar&4^'(R(%GrfUE))(8 |=ZDŽ,$ 9QbOD1#5#1`D5$RI$K%%RY_kkkO7<e0")p(>/07X!n$ K`f &|r{ 4dNee W2Wq@vawu&t& $P[h 2ݠӞ8 X 58=.=ܫ&fgl^OD. \'A8V6\s8d[d48Y=!=gxCjDZى6xxC*u5Xxa8>[x#K|x[$1x=/xՌx~CӇkՅx#9 ˍ"XsЫ8q$/o8g h.ݒ8!) %x K>C8q9͔xY1׏N)yzn=9gNW,xAcӒ ӒaEywyY#k}bW@aBh( %˜9OY!IȾ67 9u: ecߟN9/3]y.ɟCZ;z/wIXYrVG_a1qaZ#m5y[N BF OV𴱵oo7 %Ū?3ߨxZ:T:).4{q :#['+/#2ߤ AS[W[_c[gkynBGNѷ;w@{nMtyt[Q2V@@ "*)[羛71̎ $T#\'œ+O!  
J,N bg< H֫ 0CV[c\-\wǜ{lCP%&`*_@DH<\ʜ8{=\ܿ\̜\͜\Μ[73]Н ]ѝ<|˽#3]7ӝ;?C]GԝKOSǝ'+|]c]g֝kos]wם{U՘Nϝ]ٝݟ]ڝxخݿ]ܝ]=}ݝޝ]ߝѝ݁}]^#]X`MX'KO! DS^gk;|'<P6 $4an{Box_\ Mx+XG_ B$VAp0(EH߱8EMb~f ?  `/ZN0"O`Z(~U2x!#Z^;@%@*Üb!Vyz .[#|ab.{)pgv&}nW4`4M^AP f _ʭ϶E׳(?s{B@X4 Bᐇӹn@` 4cq]PZqta1L@/f@eDyěNTL#Q׃anX3҉TgP۴Z=&M`ZL| @^7c\[y^`\6bv?!d\_1cOfoQ48{0 VHR/nڅj Id$ XǃQiRi_.ZK[l ƺPoHG4څ&xa.c Pj)YbBp@V*ur*yFc0b9R`Bq(Id pr@*yjfʼnBYld@! r1L44g x b8ӛ`PPH+sJ]O$CeD J@t,>[53g7ԕ;SUV#{4BP5n2+ "L&?;OȒ4%IswŀYaez^|_7~_`8`782 @4G {fQ8P"oɬ 0-"HdiG,e~bgIE"Qf~ `zR( yV*0|+A@↝܊`FΟH2@%E:@@{d9w'Tf rAR/J0@ KdsAP0pI 1s/P(A*P4`~J2,L[1 N0;!(oP"!:qQ]+ oD*LX$c4 (C /MxX.8: <0zKAXA8H.Th5}$(~g,/Hq$#b;\$}6&*;i( 3/$ RyRB`z]=CF-s J#c Dt x7%:g `ptpu@`qh]Z0"ǘ$bc}KPPAS#5γ @%Cp Y `w ARjG+&%\ ,=L>Tzp^I J qhp~Ø_,:=0Ti {FDpES\*T+B<9w% Ljv[ G7{%< :c+,`m P.p}'8n~$c,l,"~pdx0kX:<\yul X1+ 1@% PȊb18wҾc m:c۽ݣ4gP0.L[E]:ӮE󯜴`ab +\=Iu>8oΔ7xX]v k:QQzT6>7)w1 T23M"4vGyGAG}T(GkH0_V<'UdUsѠQ؁FM !o T!478U%5EU y08$Š|  i'I0"j Fpt t(R@PmA0`!# 0` @wo-/ }xtZy]0ZݡFH$4h֎ qP  `v<\Q#Ƃ&u=ji^_(q^8Zڹ6ZծZ ѳxx9^\ghh}JuOQ~RT+Df^hvjцTXr7㈂dx$HTh3m <b aaZa 1LQVMPpRe7eR 2 >hn(nFn`d%0-$NAXg4W5g6w789p?O BaP6DbQ8Vb*@ b"#^P | k,:*PTPq f0pX&X" HdS{0pX E,a<b(i1HIavXW+ЗۚL]tX(D`UMv h5ǁ,A`;"o-J:5ZvenqUUfO=Ԋ 3}BjM뀦^',"⾅s /ʅH+j`+J@!` t%MXTḐ z}Yf^.@5*Bx_ hm`Ve(@NU`&(m8~ %J bJXx>X z, b z Ad?ű=O` {(%f_L&'&9iR,n*@*y y0H:~%5H" s$.k +0YV\WH=쨈Rjv`&R0h4Hő °Q& pC))X!4n,*"rtN VvԘ5t. HT )Ad3J,+"P]Ip{rUDLuF[Zk@qy!ɂj@gieJ[9jh]0 -"P kk&0UkJmBh'Cs5mk_c(RVmKhO/l/@a[fVr*z@@+1eRHL)4 zP) C'Jr&_Zt J^B`$H1gb7R khtVt.`JlRB"llT*8E8G2зqBHh~Q(T`Zbml憈+u(M)<9kb$AɅ\Fq2]2l)x,COy>EpPQ\xTQ yOdQ?  a8e$ tDHYm-ĹR^K}/SbLY1DəS.fLٝ3њSNjMY5ٛSnnMٽ3ZT@x s0Xc,p h0V.lrny$"@iHSHK)(yM1;/&\Z(>B9iMfK1'``Px xw@r2r6$btA o/ޫ{$N׬PA{y +J E_J-Ƙ f0 kWdD-зػcŲZۇdVPPzj hR @ѯ`[Z2M#V0x5m!tñ?N lk-uCڸq.M/".0aI M(S" FH7*`"gI1(QDipY[+hȨK5V(1`* BZl-\|$edxtM P!f42 D#!3R"mH 4޳9'Ry4 EjRP$f@Pj@[1GSq[b-k RqU -A0ZA If&'3='4)żR#WM4*MáNa&aZ& ZpnCm cVAZ>KZ@hĠA0ԆA e6UUFoZ}@b@0o@qywAAs=4$ u%O"$7KoST4 Hzhd..& s7e 1ZuZHt[!Tҧ [e8" d!]5J]@-5qWUx5~)!#QgQ$@qvQRtAz( \hbAp&\ 押FYBXZ tAfMMAVd"a۔GٍمەQɕɖ0a,r"`@ͰAVL=r( =s"*rA':Evjqyz6Y!G qL2 j]W12[cBYs$axB ff*Q FabkB | y`b!yZQM d3!@@   2 X#)8" ÌtɌk ⵪BeISy6j&AIF&!ݨAi @@tj bt&Ad o5'baӬ ,= #ź#0*Vh}E z !gx\gn[8<0yzhw]yY3x"K_{~3h';CA$X'`zvc%+Af[  ~_;ߒ,D9 >`ǭXɭ'ڙa8SG;kͷ:!;K\wYn@hR[V'dV2mf !\&؀/{\.$.⇈|+<8|<\A[ZE~IMQ>Upb8gc1GܙK % D6"=B#^B)[#c)V>Q^Z~1 ‰V!GIwØ5쾨"޹V> !dixG !4.a\~JTb,~y<ɟ9~񐖧UImjT^>adMQIomjF!)i~r6! 5 5B+ϵɺ @p( CN)/0 "CAA@ \i5`c6AQxz>E1~Ih[9vd`/q2d)@ ,J[a:[tźye } 5T*C,)eo9gIiVYkim^y?f ̀_؅J~ `1k/ {}0_yWZ|kv|C%iaJ "1Ĕy$H Od&^%p~#SМ B'f |EQsGq{Gvyp\c?x"l-"e%Xkaf5Msd7M9Ns;N="-hLs߸& `Faxfa#bx+b3cx{&KdFSeyf[ecfykfū8 :hz&hFizfi2akz滯kñl{&˳lFӵ6_׹n{no{o{7dz{~{rS޿}g}~;+?`?sNL6@!`V 1iQ^AAa#BsxO!g̎Ba a3CwsM,.vCbC#DvQ$<.$DbSV+EsmL.q:,Ecc3BXhQў7FcsƓC4m?G d<O䅑6GH!$d~ЩBư䤛vOIA(e~Di(TWJa)Tq.eԻ_=fge4cLy2fT˙`ɆI&,͚VkMy6f,l?O BaP6DbQ8V-FcQv=HdR9$M'JeRd]/LfS9m7NgS}?PhT:%GRiTe6OTjU:VWVkU_kgZmVeo\nW;w^oW`pX<& a+{ Ue:9pP`L:BZ8M  f4\$/u-&1p"_E=|_7$uL  ®wBr7$r}** !!GTKN5 Ð= Eđ,MLUől\ljǡ˛t )`)({@" ă Dj `XcQ>@e8f "8 LGRnf6M9N$P[$R\7 4P22.P2d=@e!8J=ʵ_!,7_71 jH(G((:0P2`c`Aqfch(0E0sG.ͳNյnݷ{i'پWFM!.J1 q0ef3hG$rVFz)AHl+c\xS~_!a=~3X҂XhAk*Ut  $nĹ-7> #<clu1!(/3lNxR:BH"0 ٲTqf ?4nY=g𔂁H_q!l3L(`N2ր\ḱHݢA@pH,¡pl:ĢqHZ/ƣqz? ȤrI,O(ʥrl_0̦sIo8Χs@ШtJ-Hҩth9ԟGZm)M YD<O׫w0hP <r@b 3Y( <-ː#ye)uYfT2iD4fbo(a:=* Xl!}8/{͇kRe \ iLoi!Ie4 gAP`|@*P `FB=M< @Y%l hWm`XEhZ?$ C$YG$*K$:O%JS%YZW%j[%zPSuESUUy>3cv &bQ`q\y 4+T M3 !a*N0 F$܆f6uhZ6kjƺĈ@(0ni@Z "\,إ2aQdQ! eP3Xb`? 
& iZx2X*  daX_g2ďM٪9YV{_2#2\'2+2ܻ/333\7cTSP5J`m[rnG ?O94 BPL/P5L،Ò@ڊukh&ۚ "<W-@mB92v3]1K8;{0B2G>_>>߻?8TDVa@a ^ @A B0&+IY81K| 5jlSJ[oJ} xP* 08YO0KцV@xqB<i "q0~z~ J ~y87a3 a Yΐ ׸ c|p'ϘbT^=iH@haT4v QS MΦܗ`* DM<`'xF@ٞ #14gY4hz^d v{RmQyGN@',lx јY(pǁȊ>$BR(G ;\<\}=_"E#M^(`A `@6U 3frJ&V CvE",^\ΧDp[w 㼷޻{DŽǨ-{L(T<`<2h[QIr @, PC")qjEGE#йPa@G'ոt`кDҺ_L9-og [ֺ\{ػ#$˩;_lܻtۻL{A]Wz&伟[3ߜ49|S_y]Hߣs߼bK}_; g#?{R+c/_!*_$"Ɯ>`b@"@0F`BJR]eY >xrMpH.  ` YX=H(;J _HTDI88*  !JRaXʃ@ ;nv^a `J hE`p^@ a&aa N<=T6"a"2#b8aIE!R%Ť @a#z'P?3*b*+bObDILDTb\5"f&/b/~()r#1"2c(CIbDԐ".br/#.5b2# )N1cf78bz3D3F4H4j.v8;6*=c=V99B::j5c@ #7cAd*B1cHCV$;#ZCdbF_~ABdjGH^CĄ@@?O BaP6DbQ8VԌ1 =HdR9$M'JeRd]/LfS9m7NgS}?PhT:%GRe/`O *TVWVkUv_XlV;%gZmVeR pI5nxx`pX<& bqXf7HYfsYw=htZ=&M\(퉽_1mnw[}bYF}Fry\g7tz]>oUk؍og~|^?'zq:-|~_?~U.26Ll=- 5 ÐLUől]p muǑ (bKGL%ɒlFN3/'ʒ+̵-2!&$K43L5͉ͳ9Γ;0% ;DAД- C|iDQmG$OJO-t=O EJuMSMUUtK Mem[tT}_ arX%ZPV%g^VkʹL3[w-s=tݗmw[Uw}ߗ֦. {z7|%׊5naFbM/n] >)>F :ތH&QYM lű9ճn˷hjά a| |OŠ AT. CL8GiRI%ITY-Ki5My=OEQU.MS=N?oFd|Y떛Um[u]CV/cO)eo!SjsIiVYkimyo||yW/sOuow:=^2qW{}(#p$ @DApdA# BpJ °;CCDq$KDDSEp$.C)FqkFsGq{G`FR%Ird'I)Jr+6"#%D/K1Ls$3LD5=Rʃ-9Ns;N=Os(ͪޖ3 ȟ`(Qҡ},3 HP=}.|QHeLUucyMRt]UOUk[LA%#g0Xb:s], >Ϡ|RpVx[V'TH[j&f0rP'"tHZ1r`m@*`Y@ zY ah-n,X6lxX&!%/]%y޷}ߠ[Fpv]ׂ}:vhJ REamgI#Y8hBnw;@U`z,fwr)xi(!v&J٨yy[fN `BRvU Wq&Qt0 t t"cHqzGbh @ȉQlstp@d[a!@㜃r57[el{8\Ƒۉ\L0ØbY_l7pvaڏl FHW2jPʜ ,a2^@]Y@Iǃ9q@y+WS[i@3 cluF@`7_!QQ5:pͩ2 tND0]d ~ pi~ 4@ < 8:bA(!.%3cZ퍨FDh(HtׂQ$O$jNf6չ8W$]zca"_12e`Oq6 r EHeD'J7p ("9X Qv=X_hy3&aЁ*BX|6RnGH\< s .G[$ )=W@rx!)7 \>@E(c@ /Bd$q.9b!qy&0 p;Tb9lM'Pl~=H0X`:z{A(: N6Uà( `AP8@/LRv `c`!tG ! cp0%2^”fiD  B`i{c@~`ԓ"HpR"P`k*/h d~H,/<1"74[3KJ$Ԝ0t -eȠ`*6֯싍5v.Ư S*ǴL-6ֺA[rc͍:,$g72BA(GMo5g,z"TShQZ*ɟQj1FM([f=010r;,[t%rڼK } AB8Iken r;T`+nA]@τ͎rCLσZpx5qaj6 ,v~MY};]4m ga3!C>WvfWY^VY0wbל|`oÕG8!fE^;wqJ*ٗzƁ*%8X\Y`'x0Cx,x g=AB}` ~4 RBk)X#] 07hxAщ lNwx8A _!;r^mt@2-W!$' *t\eN@e,p:@'vlp<H8󀁩1,ň&qr`4YbqP*hs;˽$<<%<c‘{( ψҋ@g^Yh0^P(刨v@@Vɐ+(b x P%azq %y_:g  iq(  o{XBh~Ӽ;Ө` G@@ @R1/d'bɐP#¾3A 0JB,#HiC\0;#uhz; 8a0\'`iD G'yiCl@y`H9@pgj?pU7a0AIL/xu4 YĂpFt҈ӨԻA\mޝ`vF$x lJKsu*QH#3wi!b!Kԛױ<˺FSڻZ1MUA(eYJ,HW5uX=P=d=p=MNG,@:¼;ļ[Ƽ{ȗR\r=uUwGqu }K |t/d2>{` )Tܽ\S{KT%r>V?h|@<cD|@6zb(HXoU& ^2[0wd@8EP!9 )+RRLeLFTsL{\hiƫ;K(F}:Nʼ HP`_$| ؂TP^$SxyǫG}G\;ᅂN%-ZDEI,[,gLLm(.Pz+CŻ}\ U{Y:~LMtTA^VK}q|Yº˭믈alcCUl֕cZ;s?c-݈U ߌbqĝ+,$q!njhSɈ(t[nD1<H()W %^EB$cO^IΒ[ ,9eڽ܎3>@yihxb t3Nh|8 !\2AwU_U<ս\FZt5 0r%HZ|ҹ BhVH㫀&40@mQb8&kM88KUj# Ͳ7l^P.^bXh(`0π~8 ѽ)l*S^HL=[+:W,Achae}'$)*1-emFwKKq3D(i .8 L,XH|ZBH$(zFagO &jEDU}dVUn8p;F۵f['u|ݜG;"ݱSYdYd@ dSMKzӔ˱<ܾ FnגG5uGMvewmLNjMjvKpKLNmPURfq=(?O BaP6DbQ8V0+XPB%(v)"N| @X) a`㋆̐D7 j΀QDR``ȌT@ - (<c\[o8VSCƼL-Dye84> u6@*BnpZ=A@ri-F 5V`mN gL0@huR` ҎBc-5#n46H1G)%,TʩW}$`ʌva;hBGh_"d . `HHl=H$ @"CxDL* P?Hsh4{^-.aMAV@=V+D_p0Mr 9jU=X,`@ AXSƠA$i xJJIb,At<h.a{#a83tGQ#j u˜V!ip(C8jXBcY#08h H9@ %=!-HƄ3tFɄpr൅6 z@&8rQ90{SHi@G mfXY 1.а@ z/L=W@Dj8&t:5bqq% œT Ty4g (Z7HI2eZW`#|Ħ}\ƘcZ0`%<^ B|cO) r 8f1oS$@nhzqdeNe=рXUUDi\X # d G# X` 6Ѳ80$8'@ɇb˜i,5H5[@Rh2=Qմo=#쩌ar.،2U` ^,*ZIT8 ڣ_Ys-ĹR](* HmcYP#q+恷)vm?hzB(" 48ALJ xw֛SzqeC}*S6.ڸl2FLQwUݛݶ;3.!8 %   d HYīr[XQJj^j#X;E&{`e @ ” ޵S֠TJ qс@Q~?b cB7 clۇn5W0DT:%to ¡l9kjn05qFR{4U[+']2˯VF(A@DY1F~!Ei.6hlj^'J$B¶oo%pE Zh~;@L_e_,B`L nr{ .kRAlp!uAMa dFn@EbhP,~p0C]H'΀FPIk>jkD4K`5sA/~eU &~ƕ 0f tR-|-Dq)-115q9A6 b B!/UM4ynHu@#2;@u3B`,i\ b0>RDJ| d AZI!b: aXU2  *ve N p&f`!9 l%c|z/Y-!*DC2Q< `>~6B؁bPdh J3!j @sM 06O*!.!`M ,),^J ".aUFM0M6!ZanȆ +(3!' aDMbypH ' 2 R!%Vi q@Fb c 5zap `>q4:BD/m3!D E[ T .@l@ )2(#"nh@!A N&dz.Rnd '6$ޓ %0` IS$2 >q@/b(>#*aʤ6sk$"73v\w*yj!uj A`m `X-3533A4SI4 &Y!"%"" ,2:00@ @Fy%= 2YΚB& eafUVoQ{_vGg`HC5ahGŘm?).SՌBPӱQ Qap<@ uHyBֳ" ʧ!Ll TM8#ZE? 
ggQ{g `ٚy5elĿ=_>El:"3&a٤)ZT`!eFFPL5SUzU{P{A*`Gn(qluWY`SC/] 1/-]!!"⭄"$< 8!@ $تˏl "$'a26|EL P  xUZ(0P1H7y@ &TU-X4| C-*T҉rfE$2~v5p88P7nxc7LZBARQ|$*}â(Z1G)!(2(~ 6lY+ @eHUiv0O "22h΁PE`  |`2(NY`R( v?O Ax jt( .ZY@.a\Gnf)Z XiDDUEl`h+iЖH.44|Pb+IP`/ bw bn (>j ffO~$d0-1GWo8RzIPpU4T9f3*_psBrPS_pj/`J4yf t'r r) [5T8`%)viWZB^PnuZTVoS( "Lɠ[%9g-3D8(Ȫ/Pz !Й'0TH%mIrV, 20gIX!4T,tȒ4%)/[)-K,mRciA"4e炋²±;=B+Z,2/˃@p,!ehbv~=ô&no,hx no)k&G1& p$m3%4AST w9$:0|BwH(  fh L;.fLٝ#U#Dl:'#4!5 L7{0#mC4g &`T d  >_EA\Ku. 7`!dBK0% @xAAQY#,WzP 1"DbB4C4Ed)`%<ŀ I%0 U"4OX(5<NI'FN,ʌ l1ᤙڈ>*-}A\(Y 2HW' ÀsHCR]IP%F L@Ԇ) |Oߟ0y(FHL%Dp8Әҝ:IE:g衣f0jPP,&cr )Eh p2W8*-spSN!Sp=(-7iTՅA[H9M %6'^I묵>97j|@G HH 1{ H i""SONnwH3+aI LTj`5)ҭ*"^l4RΦ/3i(&)e-ђUOSW"Ko df,v^ R1$Lv~F$$.PmWED5/RBAp##Dco*m@E\~R&Z QIWUAW&Q} CE}甈hvۺÂ㱴Ҷ%swNs nZ}jmTAh%ə7'drSʙV2|ˊ-Arc̆8 *tF&O9F(88hώcz&cgAB-*uCdq Îae0F^>Ѐ$?B$t%=@b |.Ԋ(DY_q \g;:s:͇˪fp}6 a[cd6Pt6a:)l  j.^kZł :8A!9ʪ6;!>c[Q~H \6&@1-ڬ( 9'P`}z &|{7‘"~1nP/:;5CļXæL48@@ CC@BX!!C=p=l`e=*d"^" ?D3z & Hj#hP xDzxÅImN{`dtZceE`yiaha| z\E^_LaF#}Xcj6;Hg!`"Ue%l `WH8, bĠ82Ёzshj js tx{H(~(wLXl_Fȣ2C9$:;g;Nsu ~km&+PECAF43@NӴ,FEh^x#4IK:SL [邂i,j55" ˈ p{+-.BCn`Y48}~t$`\|1Ҁ0r:'zJ|@<= Xa&B+w[/xY@}PȍWP&6ikKm6o7`VdmjVmVFG4+JtۨThx;Ȼ l,D@d;4xVxqr†8 0聅VͫUm:s>C=C^S)E^NӬS<*Р__b!X0[Ž:UDKm{D> ?ةmȈUL @Q=If^,=DG: XPEjM\֡ .3.>c?c@du/3+0Cdi}`aX a:Ђ }<\tt@$*we #iDhi\Z)uhKV] 8f)9_p#PyYKdТ8l&HjZ`f`x#ՏXY{-!HfP& Z ev[ &fuaP(lІq\noqV%ti ,=sB8](hx0a3N8Jpffgh iknmno{l:^"XXZ$$I) &L#@( I!\YGJ zJ:Z3pkH˰Z0pX] ػhwlzR#aDdO1t Û(` irhkfƬXk̀+,P 0hh `>ĎھH[ظX!؅iki-_C|ЂhyeZ*qNηo,0^g 8gNui0p`mj"vk XeŁ&IZ?loV^llښ`}*#$حn\q⨢-.@ kp `W辌r;~8&@(x *vϣ ޳kAypζp*1gFFlIn(Q;ł8 j gO` 020 (PhG Ł u~߀Xcly0NmU[9Wh{j8 vh=‚ 4 8nA(K UPD'bXX)W@ ^xAɩ@ m@`Al!PEj>\/v~jKlPJ/j2a_U#8= 8P(`baIXoh][l=L5Yt{`.ޕ #ydhl@])_v3q[bZ23`A dvWfgZ Ł O(rk '-xBAb0 %bx04~u''pJHy ֽ_Ӓ[jzP> I_ RPq @|Z`4_@\fG1%SLm8O' H xC-,n @lU-.RqRp^PƋ0,N[XEmqꒈHr!RXlvPE3V0r\z.x0&pA 7%}jT'fdx/2Rh^r|/+0&2+[. &"ZX0 a#O`S`k/%ɠshEe JcXo(H!dTh)($bhP 8rL VmȠ"n(>˳nK麮^,,3 ðCı'fr\(:pi3 x/A0t nn(πi2ȳ- Rط. )%F*^X^0gꀇ 4 8^8*c "aBE\8X6TFf^l<^F&(w4t0z!*#efQᇓٚπ~[:Aa`&H癷 6# U3Xy{ @N "@ )n"@*R? ?usi)wg"2]ђ" )8@8!W0FfAٶVj@8 Ix+ElJʻW%4HB%VxZ iUR}p2o-#$;`@PK) jigBht>A#R&K8-h@(Fp]3: O h/ST &^HZ k\SѠE1T`GP$ZGiŒ 6 Dtl00՝@tMCwRahqtZt=a,!$p6泻˚`H(6{TBUk)M xPNB8=VfLN3-mF7c\sU qFк(csBBy}q2X5=exw`bBw 2u5;Ifz =bcXp ,k=k&bbGO@&FG17%EX|*6;+ZPl¹!2K V]Q\ ;4:G *ȱ4bDwT,P =lXBXGZ{ Ź QH5ҀK k[@!ĔB(Eb mGYxi@7MPZ e 2G*AP-f@`JbO4YF j9`j;JA2 VAȜ"ay$ >a@DH6B@sP dǡVVn# `U./X\ a\)avPL te(낸cRhF bԭ  $?Mj~` D+(MAˮN|..ZBN|>@k @(-+QrljK˱ q\¼dd$*ܥ B@I`u4y!4pg@lC`$1a͒= D $c{fQ"^.րP' +Dvc p) J@A-ڪ& oq;MDi1 $)[!NJ! 
@ B)v;Z jQ $Ρ NkjT8jbpCQ-+1iiq{N8f,z ,F,>`f'glJJ U2x$i)`-<hk:$IN_ 2.6V/ 0\ bxA0L̴){XlJ m  $ qV2fa|t( FGXbBxIy@JD.\a#I`>_80o01*J{O?l)\ d h,L,魷-'-J-*+.x~J$%qx¢Pm{ cxVww~j2|-7$(@-h8 aV@u}aG antA#(ߦodav}ej*$.$3swt6cwvb`X^6jcw qVlG 7Vmr8,/)$``l7]wk?_K*EEu(%ai@-8W~ıj6u!}aCs 0xvkbKia poW^ZTy 6ӂ͍gɂՋsbwm؛mְJw0&$9}966iq8 q4aD1*: cw9V6yᅸlwꇖQ}W%m5WAV \KE;ٿ]^u^ÝӝYם۝ߞY瞕VTanvA#hx9wwbSO3T |Ǡ+.8A)H+o*Z(aBYOS!8!cԂzU !"b* ψե] gH/ᐢ1h,c:ψAd'F9`<ϭӭU`H ]׮PpP/!QTyǛ^#['+a,@//AEA$@`1K$80D8׵e`fka*a 6Ye AT@PزvS["A82Xټ b@<;<ۃ`  Y;Ӝœ+/3\7Ü;?C\GĜKOS\WŜ[<#]'kos\wǜ{\Ȝ܏JUƜ\ʜܯ\˜ܿ\̚Uɸ<͜\Μ\ϛI*|Н ]ѝ#]&x]ҝ7ӝ;?C]GԝKOc ]Sc]g֝kos]wםQw;ժ}{؝ݏ]ٝݟ*{}1yic-91˲]wݝ;wڪsI<#Edu` @vbے$3i>F@k3* %紥{6>Љ8af` RaCaZ>jj y;N/&HĮ$l%aǰ kL ki~G`7$7^^ Ɬb43UBʢz \/cJ%P:j *s :9 >LS/~7ӝe(Bm\jF",*:@xX_ O&%WZ(dXk>u[Meps=g X4 Bt>!DXDn @tU-Ez d!^a1Lj Id$ p @a-k`X]:~`]I2liAގF " ^I (S]VH ( ~V:|+/Ia]OV]jy`q551 rrA(ʏPlANk \\=<^V[fEiVPh*Èp:h@4pk}!$cVN6^`#ժC >|&`a;B ZWj_F"@_zЍs& byaBZAGZfFE,8  E QY𩛚FڋSj]px@kRQ H!$tu Pd`Aa0n\JlՖZil_ iqb-'dϘk*<$ÿtjuA Z B^+倰PZ,6/Eel%(P0 z8Ψq#@5;)U,™SJm9`ƞSC*k`Dpd1$Ǚ3,iٟ4H<ԋfZ1|f 3@ p O0 1B k 2҅,A4X d{.a&?;XWX4be$9>Hvm5ȁqeX d!HdFhh T}! qU3I S1a5@.0?H >0!8Hfcm :#4o H`[ fi )(jmG EDUI% ZLB 1S#2"QP=5V]+sr=jM ("ԣ$Xc HaQt Z!T1x7.KNY^b&fQX Bˋ|splO0T fX*`FcHQЇ7\*ͮŸ\orС9<3ӚRn5! n \B `Ɏ l`IрJ}"ucu?@Z1,5m p6W hAvBKɦ' 8Z% Q+%f [fKXopTPo ۛΙ<;px'tzwOGu>zQUJUW* z#5LT #`T~8LM$d` qt0'TQR*(iê:[p@\,0 0  7@?0 Ц8 .\|cɢsnaUf hR^;txTP`oG3ĝ{K0!@"=A *:XlT; ꇈh`h9( k]hXv5X `haP u@al )e( !8l[]HxgƂ 30<@̼μ k8\"&Qxj-KAA ";>PBd'B hH>X邨)VEbH O8XD lhƜjdl|aF+F;;f(: aH8pp3<,".DʀP( )Xi i 8|lWI 0YpaxF3Gx +KC(ihkwIIԞ(g*XADrꂀ~]1`!8v̫Kp FFP!qjr FٹI <! VhqH#&H kNftO0 C 0\ȰgKlˌ˨˹4KўԹq  r_Q?bCAA$yYP,KlKP @y`$i΂ KD´d3 > ?  p`PhP meK\H^t48MIhK"t1dH -Sm8n<]<-- p`U\gu0!x(XqDtt,rbsƁrG,kGDu1 ɹНм9`Hx؂?`C{QU$hRL孼80pNmrI ophp.P2Q8{ȼ 4=/!}/̽\xXd'dT_AR)puL٭^ VD[`}C`|dJ( 2X43ՄS=-0npb; Wfffnfgg$XqY`< /h)QIC=jP1c+>iЩ`( D"YLn`~HԀh}~G0W(/B)ȱ $ l;Yl0 sϘ-O0{Qw*B.aVW j2"?$ro'}Gh>X|>6:>Q+ 8w<^N2>_Sp&ȕ$h%q0bax|΂x1<iFs^eZ"~4hmFmbp+PmP\Tuԓ@\{wa‘i3,ё}&th/qs Pn٪'!P]mkko@ Xh_dV``c(8gR"xq#6Q{D &+<Edh"^f<}ݮ~F%hhxtv 77ohXi]h~xz8ܞJ&7z|ا8_vk`\cv8(f9aFc2½z">^&y88 ;C LgŖV&llZeBDob>d&eng8(]n^IDdLNFUZ l࿂P f "pgW pxفH{ XPY ݼKC.],p $? E +:r҄U9A0= jHeP)mР[ p ѰP+Z `2ዧ ~<@PV# .& bqXf7drY "$P9d+>C 2KWPA@3AM!.rV,4  sJ0ev,0XF"0xuJ|9Luq% N`Iü:} (=!aY)F`i4 R0CT?P (@]=KR+ *$䠴YT*'d! 1ș1fgqUGK H1(]Ә#1l^Oa?H8f0Ѓj+G k0U]˹z2Lْn%v(X=j>u^:J7k* (:@@$!2h7_y E`8s`&.ܝ Q/bW k7G~ A&R룄#j'&nƤ0Cdn d!!Џ " > NBJ4A(nF; \6B(P@F&@ Abp/,bB  ƤA!3hZϢ!A"7 P`$/lj6`фUOVm8dI :Vڍw0Ŝ|"@pӠC9>.o<1z1&$&|(꿎,,*%At%&RJlK% PwdDPl 6mBmJLĬNp2D HP*\: "l(LH#WpBM"BM`NDJ8Q! @|! QH@ueLt'~ *,9#/lda ai  Fg nnʠ'rz%(0@t @ @A4AtAA!B4 뢽Cz7$/Atb&dZ9lZ!h bH%@'`Aa:aDTD!pLMa_ % T-8 /p@AHd!)t D`DOE.EE d<ZA`d~lGA`qv9@n*rcmƞn# P4 @~1Ԉ !_m[KmK1>!B *ABPT@f8i`j " ia afJ5OId<Ԝ~"VAO+B PCaf^D1G L\3\UȏL]ƈ cУn;h~n^A,{LUC#RD#͔ 5cDIDEUPT\j bzAZ8C@PD+^ Cgl1!AB jyOjEt[Pkht<dALa Q+f[nޟ~ꜹFO|f@]5֠@/!{# <506a:]H#``;1dVZ01\ւף}AtA"d 9Fhi BjD G qCmt xօVNˌwH?*E?005&U|D1 Ehq!  $N$݀APPzVHzA"Id\p*(ZS-Ψ  C 5K`-0G m`ǑIa $!piCV0ۀH&0Jm>SbZm^CFHT4UlDHA{|/||mV-NT^/ۼ̀Bc(/h4ZM6UT+Xp}@Q qM; p | z ubO8ypUJ Y(j+! 
tL\-,` XV~!+.X>(z'b`LZ3A}H" !hjh*œ,Al3GDB%Tưk(GhexhҲjz XZ@jj,0:x`1kecH",F,B ' `ke~S' HLNv3p6JFJ@TLmQݹf4"DqHA8XIIo/0CUJ}J^ƒi]0?{0 q?:F;Wc8}-!qD}Dc `|!Kǎ3pQX% pFhw)̘apmDOC'i!LowqDK"!%FQSR :=Hh,A|P_ah(9#`|(HYG>>Q@e:i&Mq-Ռ Y-,r-IqImS;1zw)IIa/.2i jK_ f~kMy}eFu3=&My &<?Ĝd+b XA`9"иCc2lAHѢ۾EPQ(K#JzRjUK6TTjSVUzV]YY=ׄZy{`*Ҙ1<'ǩkg-tz]>Wv{]wxr2ɱz}^g|~_?~uIP?P , Ll <5 Ð= Eđ,LBi*B<]eƑmuJsly!Ȓ,#L%ɒl'KWv{]v x|^?'z}^g|~_?S~_ , +l% - 5 \9 Eđ,MLUőjEьeƑmuǑ|!Ȓ,#L%ɒl }DE,!D=RSE|s HExD >= =F V2JUspL)v,CZMS's8~;0 r?HB HLinomntrRGB XYZ  1acspMSFTIEC sRGB-HP cprtP3desclwtptbkptrXYZgXYZ,bXYZ@dmndTpdmddvuedLview$lumimeas $tech0 rTRC< gTRC< bTRC< textCopyright (c) 1998 Hewlett-Packard CompanydescsRGB IEC61966-2.1sRGB IEC61966-2.1XYZ QXYZ XYZ o8XYZ bXYZ $descIEC http://www.iec.chIEC http://www.iec.chdesc.IEC 61966-2.1 Default RGB colour space - sRGB.IEC 61966-2.1 Default RGB colour space - sRGBdesc,Reference Viewing Condition in IEC61966-2.1,Reference Viewing Condition in IEC61966-2.1view_. \XYZ L VPWmeassig CRT curv #(-27;@EJOTY^chmrw| %+28>ELRY`gnu| &/8AKT]gqz !-8COZfr~ -;HUcq~ +:IXgw'7HYj{+=Oat 2FZn  % : O d y  ' = T j " 9 Q i  * C \ u & @ Z t .Id %A^z &Ca~1Om&Ed#Cc'Ij4Vx&IlAe@e Ek*Qw;c*R{Gp@j>i  A l !!H!u!!!"'"U"""# #8#f###$$M$|$$% %8%h%%%&'&W&&&''I'z''( (?(q(())8)k))**5*h**++6+i++,,9,n,,- -A-v--..L.../$/Z///050l0011J1112*2c223 3F3334+4e4455M555676r667$7`7788P8899B999:6:t::;-;k;;<' >`>>?!?a??@#@d@@A)AjAAB0BrBBC:C}CDDGDDEEUEEF"FgFFG5G{GHHKHHIIcIIJ7J}JK KSKKL*LrLMMJMMN%NnNOOIOOP'PqPQQPQQR1R|RSS_SSTBTTU(UuUVV\VVWDWWX/X}XYYiYZZVZZ[E[[\5\\]']x]^^l^__a_``W``aOaabIbbcCccd@dde=eef=ffg=ggh?hhiCiijHjjkOkklWlmm`mnnknooxop+ppq:qqrKrss]sttptu(uuv>vvwVwxxnxy*yyzFz{{c{|!||}A}~~b~#G k͂0WGrׇ;iΉ3dʋ0cʍ1fΏ6n֑?zM _ɖ4 uL$h՛BdҞ@iءG&vVǥ8nRĩ7u\ЭD-u`ֲK³8%yhYѹJº;.! zpg_XQKFAǿ=ȼ:ɹ8ʷ6˶5̵5͵6ζ7ϸ9к<Ѿ?DINU\dlvۀ܊ݖޢ)߯6DScs 2F[p(@Xr4Pm8Ww)Kmnltk-3.7/nltk/test/images/twitter_app3.tiff000066400000000000000000003076021420073152400210340ustar00rootroot00000000000000MM*?O BaPd6DbQ8V-FcQv=HdR9$M'JeRd]/LfS9m7NgS}?PhT:%GRiTe6OTjU:VV+Z_lPe8%o\nW;w^oW`pX<& bqXf6q]do։%fsYw=htZ=&MjuZf]lg9mmnw[}px\>'ryZ/tz]>Wv{]wwy'z}^g|~_?~_ , l% - ʅCP= Eđ,MLTÑ\]eƑmu}!"ԕ 4)+̵-ˋ[. s(r55M|N;(ONT AД- Ǵ=Eю$t$HǠ{S-O EQԕ,/MT P8$ BaPd6DbQ8V-FcQUQ$M'JeRd.9tm7NgS}?PhT:%GRiTe6OTjU:VB_zv_XlV;%gZmR"ZW;mwoW[`pX<& bqXf6ZYWl7]wx|P>-ÏNwo|~_?0* 04l$?P- 4@o<6Al?đ,MLTB\]dñuǑ}l!Ȓ,Ʋ;IRl')+̴2i&K 1̓,Jʑ|w!':"pMsiyH t .aH&pG@@JFQԅ%JR@Ӏ Q@#?HG >5ux\ `{_Ͷgh@Ur+\@՘Km8 -JL`h`0)(7F>]eJ|RY2k&J  vpZv;Z.a?0H7Xe6a]`k(-T삈Z'znz ;%YTM"c>Ƞ w[@ihIagǰe i|8WQW@qQ${NYZD xߐM <ǡ Cj&Xϸ? !M yB!eA=!4lب p+c}.4 C=}0$ Q9%m@Uv#}8xM6QD84<XwyűZ3@L8RHY$&%TBM H 0^J` o X2 3EnCI>sFPbDȹ`r(`_2Äa 2 `Y<>pbd@!)=(8 x8u5P0 ]7aXt^ !(\F| p `|V@|'HhQJ,^hɲpRZMIE)+xR%,kJd 1<4;KltZu?X`UF@. DwS&@k5u 8H &[z]F%!2pV[ys/f ř&f5f7&_ 1MәHxЉa(> C f' ` H2$ O^gXڻ9A[H@O^H՗l z@Asp:ALBF]n㹷[}fo (4\ F. !N᜗G roz/'FP5p9=6 !NJi Tf:M ^n ($Md Z P^נ `BYaQ^eǴ2BNK  r^OaLM4@ckkGb @gH P|P(" ߂(, `\t Ml\lP \b!F: [` @bh@>~X 4eIQ K# \ `nd H!sM^ wN!l 3a<cbiHraPc jav dzL2F Oa N P Ow Ȟ 3\ xld $MNeUA QgbA[`E FH"؋8@ z 'N@H"% ~aU`\hF[`0 , &g@fppjHD#&9#$E$!j 30R( ](`()1 G:ZN)r#Iy3abLtrLP9P %odk86;#Bp$96 ,-r-.'3©S<==ZI졀 {<AnOa4  `| Бd !J NN i~  ! ^f4Ng@8 OaA^g@CV  dfM?T5FvULQUUYU `6gS4h"4 GFkTKQ @ `Ob46k Fl!\`<TTUIVruuw>kL%3 ` `D)ތ.!)|鄝  OaM2 4A1`%MU( AITaOa (zѯNg0P (42{@ G{0fv!ƌ1"rr4Q`@ Nb! avMT !ܐfdQy!0!N^A=`| C)La?XkfxA~_@ Jl Js% jj kVk0<'ЂȀGHJ4& AҞɨ`_f Tt Ba:s|t7N Q @8!6kfrQxz62 bWG{ @bMW /2}@} {@n0n{(|u}GS|Ae@Ban i `]W bi!ܐ4@^_.l ȥ{Èo\ &CX N!{>dYY x"6e@nɀT" |AkN fTI@T, j{XB8H Z|!7X>8CvQ@WA% y-XbHX@p`b2Ԝ!Z `&!2 S!X T`@a`,OPiO@H I-M}Ȟsj!>)RΨU> a`P\b4ohڔڠ:%X.:`Ÿ` `Pe\:h-. Aۧ ! 
w:h`"Y`vQ;UdabN!cHHr baaeDbN@ | FC5|^ E4Z&_A\ұh7h/hs+ %>sȎ Yh4aNe P I9!ҪwxM(\d'Fz q ssY< |D7if) GD^FAv7jK#NQֈվED龀Cy_z@ ivy^TR$yșQ:uAxԍ#O@osTQ>n!m>CS ~?3HB+kż5Rjvl| `S8 Oj 97..#DdaN\& O4GՎQG(u!E]9gBr"f8 } W b]t}d{PKaf8 aT2! L!"n!Ș a <ۅNuAjE4: >gBjK r5b ==)Y%t;1 p Npascozn\A#^)Xh /MOZ`Zj T zJ :}ZX`g8 `VB&z h0a" _(_, N%+X`ao H !!ޞc5!WMj)촒 ~vS  =?A_+kY?d— AP_b `)B oOL\tOýEp@ I~Op D $؎9o -.cM1@ !"`@U.[.FNx,NUUn]:Zp' PFr +6AU(XHL>ob{30EM-fsY;.L`{ON*\K' K67@ x!gsEiTuB^w epڵv079&Pdd({( !3پőΝ`70|`ù T͓`9*#hQ - %tEA%+H" !Hb"H,#NX3gWR''2L,&9ݷK`p5g`e淙RnJt6z|.s\m[HpJUX͐ÆZűwn]rF(rssj&?}g}~~?`@x`T 6@!`UA2 YPyo bb.`.,QtUF$˜{FSK0\ /kp4Z*TObPs Y4\EF TU:@i0)X8d0a a~pAb*9D|J 4C1DdE hF((pG+2&%P&T Hj 4єyI$`%!0qy !bD/# %DjG =$Ab&',QaT@ FX,@P$*,UKU{Dx9]rB1 i #A0b¡3!TLl}E-Q '!4A O6{.-$U@$?Ҍ*ܩ ?مhz '|%(V 3) g҈mXB!'&#\X{A@Sr@|xh>dHY |jɮarV0I,EOp@AebCp se98sk@Cgh ePrِ$^@m{E?pp]¸#,$a8X{0P`Zg ;L d" `&KoX94Э%88Í @P0b":sp~N<'! I?8x=誌Vw;y4RJ* p. `'(CZc)v5) xJ(d 3x  Pk01xRЩA羈>꾻*e?,S(?,C@@ o$ӰhDH0JK 0M a `@f>p>>>|1 iӑO?ČA?KDL210{H1X8186.$Ed@3)Q\{2@jA2Yi"4*?2-"c.5h |p(`ӆM8Ǡ> OI\|ɜɼz+O [.Pi@HjHE($(*䒲IQ I(L܏5KL LL,L Azy۶vRxXJ!Zx }67ܔ 8|L4Y9a9!Mm@wQԡ{ YG. N&%☮-$^-UWcE.MNUvYe晪oٶu"cw.]nczyű:^ɳNյ٦~߹`liOo!-Gs\=EIOUYs=^uݏemtݿua/N-^o`~ϵ^gp|AU׫ow^(3@HUPD D M PN AX-G =!PBXM B] 1P+8U a9PC}&"DXDIQ.&8bQQN*EXf.EؽaQ2cDiQ6Fg:GXyNN9GRBHVEd4R.FH# rFJIY-%ęRlH9'RRJXs'4RVJ]+ޜZKYm-Ĺn]K}/S ɉ1DəS.fL1tњSNjMY5 ЛnnMٽ7SrNY9D鍄?O BaPd6DbQ8V-FcQv=HdR9$M'JeRd]/LfS9m7NgS}?PhT:%GRiTe6OTjU:VWVjP8%j_XlV;%gZmVeo\nW;w^oWO+& bqXf7drY'ry\fstz]>Wv{]wr'z}^g|~_;~_ *d@Ll% ,A5 Ð= Eđ,LC1'ǩkd-tz]>Wv{]wxr/2ɱz}^g|~_?~uI'8?P , Ll 5 Ð= Eđ,LBi*B<]eƑmuJC1ly!Ȓ,#L%ɒl'KH"H  d5`1x>}}px\>'rq7b/"A~G@sɧa2+7; Ʉ@8xhkb_-׿xQ"@]ޱ|~_?~_ , Ll *Al? ^E|CXZQjOG!| BqHH1@l  k%#D& h5":cA A(Ed80(cYBkF+J&P@5B4mG%IҔ-K5MӔ=O EQԕ-3 ª.|k>+`8r @FN>c@BJ'N<c(a 3E:%xLr |qxS%eeh ֕%ŵn[%q\"Qd= `QXr&@Oewqd'`E% &4 2fAMU )xp29 (mau ."T5h! gj_u6ꚮεű. "Y` b3^x?ޖ9UH*ɇ u CP=9 cZ`a=\,Kð tp(o!v\™㐄 пœ_#eJ] 51mbJHH)Zwx&egDHa5x D P.@PN AX- i#U$Po'up48ˆ&H+xPM)CFbd'B/ˡ /# TÆ0&!ʞa=. ȇb9N&M@!EA$q.B0&+؀ 8_pC XP@9\b @64j3 57$i9P@t̤R^K}/SbLY1DɘphAچ0k1ݐpt !Ę. %:²xBuc;D|+z,r8b͆ Q"Nk%b| CaP!"ŠIK1TΚSZmM9TS!3"͹Xxā1F, | 'f0ԌU9UUp ʋ퀲3ňb[v6hq t  (V -rbD<|+#jY[-eřVnY=gGʠʄEj"T:DlB@VA},($/qMАdq{B\>h@ _reκ6]Kv Lx`Hl B?H|p ̤ ~:G(_!(Sh^1pZHJZ#Ѯ2@b:N >xb]1Xc\m-#vZ;cx!\;N" 䠫-{>'}-BT8B1|*%E(.Be2 9fR *b аg8܂u@%(ih`=pz o9ޢ1FiTW&B`4L<@ YWåf@d}[bl]Flc']&@ ˜[C)Cs4<@o7BpnHG~Tk(y?h(/c16T {B`So8[x fxA#4hA ._~_GA}w#nPAx!4Ād&&!r O6! @AaA.~A`Z  #N.$! D H` #r"KA0A P 0 p . 8" n0x&P! ` 0/ai #" @2`.lMQ1UqY\k"ȱmq1uqy})b"f!11q-b)1q1SPd,qq1b%q 2 m} !r!!!"2%"f !R)#r9#=#A$2E$bU" ?O B`Ϙd*DbQ8V-FcQv=HdR9$M'JeRd]/LfS9m7NgS}?PhT:%GRiTe6OTjU:VWVkU_cgZmVeo\nW;w^oW`pX<& a+{ =erY'ǩkd-tz]>Wv{]wxr/2ɱz}^g|~_?~uI'8?P , Ll 5 Ð= Eđ,LBi*B<]eƑmuJC1ly!Ȓ,#L%ɒl'K{n^ y " e٘n%=Ie "JFp i%I[Ltv)J }B]c6*cQLъV( P8$L)a@749#{ 9$M'=@ s!&d46CEGRiTe6(dP8͂ᬩ̩((p9r9 OaIN@}w^oW`pX<& bqXf7drY'r{ p'+J0,2 k1B<ň= €?H*],  DD< {f+C=σ IHFEɋt般(<xG19\R"@po.e/309Γ;=ϓ? AД- CM5;T5̈́, ȫ`\GaAbQZzfi8 q> )ze3-<*Vz h;Dp=LKtp 8+ +@v ::AlE D%e @x!z㨨YB92 Fe~i30A xTy6`HC4'i@ m p,Cqi`H\ 8+-Ð!ZC,0N+)bjͷnrp<vIHK1$A øuADզȜ`רp. ,|(grn~r|@Q%wh4`<O8ux$^?ߞ!PF'nJ5l<:9DF,GhY{1fl՛W`Yqv4ZhCl_Ājd4CpMUe05@P70(ح(U`) q Hb h+rε P@I5@+x|<1D<.3[Zam+րT-) 11qq΃(Bo ș#&e 84fsxF+p܆6}ø0lxI9g0|okz@.)H]kwCTCHN HЀ ڨÃkŭֿ&Jc0p#=d@X]&kABRNJd@(1 j@3`Tgp!ٴ9{L$sQ0xP*2! ta$#Dn9pAd.S#&ڨ Ȃ~ebahHT]aUVZYEiUV[qU(jԈOF8f+/<h {3TOTI (3Bl)E6EX, $EsC-da,S0SI#AyV=1Nؑ4\1l-+%%[҄q{/px^! Ka coJʀ4Ua`O`B$AW @Mx[8@@e(XKd,53h.9" gh: !n0Qms p"WtAP\ )Ll>Ԁꯌ/$8V=cUvDSt80 d㜻D fFLb< : r(T !L7+U0UpY]a0epimpOJ/X55P! 
BxMA&=>da"TC!!6ʴv`c &AJ#@ u#`y4n N> A'!U\GN aD@0d$<+Bk I , AlBBᐧ aL,! ,n a M@ @@D a*>CjsBH=bLr' aaA@B<`$g`9m D(aQ걂;ʺ a A [!4ܥpè q q&Kd\ aP@È`&&Ja''`'.LBQ""@n>~!2 aY0 baj>f&$ % ^a( *: B! $@vn0 2.@( > J3Am3\x9`-=NhJg/C,F&d-vb+•0#`ei\R-# A2  K ]d qiqB P e78"=. !r+Zaaӧ32 %q BTK3s7`=13g>", %Nd:(J!NRׇS@TQ @ R$ёR%d6a"=-Rfb 88"`̀M4>jo03t$+#!ld5QuQQ!R5%Ru)R-RbPar 4j[*J!49`$@,L>"G NJ A! x SNaDıEz)"? 5z Aq24Ȱ;Wx  ÖU TJ%AD>hf UbL%Sp{ ! h$BBp&8Cr)UYը\ a:A>  cuv>D)!al aab6'bgoL 2# ` gNl6 vlO؄v$KY֗d6Gl\u,$Ar$oS>a6aOakkV//([ j@ 2eQ"S4 a.xUjtW%v !Ljb _e[[xe8sLSQ)gvkZuZCuod$r@ #U_S` `V EO @N{g86s(Zfr 3}_oBJcp7$vKde*WXuDzEw7CJ w!g3jw7j;bW?bX n("BC걮CkADN'hv`!g zJ.V;!$U19y 9dSBS~RE u!fC ο0@5 0DU7X ХEL y XA;8?)G6C5Hh*_ 5+ `8 >-WL090 9w @nwB  d|@H A|b c7pxYd*R Ke;W%t#Yx"|ق{<ؕG$$pbF@!/垙yeA"nKabYu>x"W3"e몠/[\,  3X"Xsxw@9ߞ"ç#evAbVY" vTAKnŒS> EBtEXl?Ud& p=" z6 j8It7|)+j9zz5Y=:?9oqNVH]s @+I_b˼zN$Sb>WY=y75 Z )eAAfۖ=v{Hٕwzyy밢&y r$ H!dTAEq(F!@"AA^$B ATAF|˼<|̼B/Y )ЀPx KTLX b|ɼ[dWzѸ[zlA JDeb[ID. ◙. ֱ&Jswh6#gGEq//\Vʿ/; f <[ѲAL(χS&aDbb%IPeD SW`:#f}5 !](M(-$ Xl X <=KF }:ݜbf n`ScTVqnVH,Q@\ny4% ڡS 5{aD5h&L,"ׄˠA`u_`a؁ I .n!6 -e\5oT-^y v;*r,+6F''(C!N o&=>b Γ·7Fݽá"7eiҠ^!|뼂jɰ^,"}hW+J@A+c;BM/߁^ }@}5DpH Y`Y@.oA9Wb-+: )\*"% c(O5er,c3^ `iP= @mJ!'CoD@(-A rN8@9@~(6#%("No40L\XQX4dIiVYkimyoqW/sOuo^ww)E3`"9rMKg!># FX& FAđDXgih`z9& 0q"'z>џ8cQ> D'/N k2K$9P M@K V`B<rxKQm,1` CJa$8`ny @Ta`<'H5PX*H*  4°3 ðCı8C' 08{6 tY͇`Z* c/@aA*^@z=?4 CL9Nu!I `z%`6xDh-ƹ[t<0X#⸾3?dt HS8B A}%x5Eh8*\fkjf0NDBK!e,>P5E`mHAH,`Pb#PV^sTLy7HO$=1*ozqSQLUZUs b=P$ d#'qdҀPnp`lE줔 e8(:@ M}?5rXY cSR`5&iDIT!i5-Ez%TWJa,e[Kyq.eԻ_K0fØ\sx)瘤qPz BLX`d Ϛ!bi 5ÜK|1t@.+OI"0!4y6)֜"%[|94`vsJp 28 HTSSTP Y*3V gC{ XE]6h ^A5AI.4 QYO0)FN"AC׌ 1``#4{>$yAJP,"$& $h;&Xu#Pn JR*Q6+(;١t9 > f0?pBa im& L P#&Lux6 IJT6̸xfQvsD(C-G,T*:%ċH)MG^E+[ H0@*b('\Blmqmp@7Ǎ:CN%3 BfV]W[$~^p/C6Q >DUՌL5wµ{O½AM똻ՙBEw>u~ { b,QGzzMqu4 B pG88/ʭV*\4)'s* ȯ? H rȅ ɉ MLG)*tM%VדM=\>M9Q _ 3!i @^U9_5#9Ϊ(ğ#D>9Y.vh;),o:(=@ ΄k9l!n__|cMŁЎޓK9 `wں_7缜~{:=p< Jj4/؏Wm$qc,r4ogLCv IW'rݻzs: p2Kw13La=?@ @7} 3@+zPg*0S9مXc-3L8Zaq`tAH`[`1J0%hIC$AA!qT$BL%B\&Bl'B|(B)B*B+B,$,(o0J!J)Ћn0k0/ûi')diBXA&@ JNbi0'(CqeC9ì"x RC+t]Wex''hZVl*u i]׵';'B> A, rbr@@A7 T.Nk#7$خGdZeٲh:|c)u=zܘ)V@>/ZrRVb,k ;Hr, `MDzYa &ʃxTˢ'D #@8l`R\@z/Q &<ǣxnb?^ mbC 2>Mn 2`lRM'JeRd]/Jh)(c?Jsf phOTdg#t5Z:ą0_ҕHX^eV&ӌd@%3Kh ,Vc\.W-fsYw=htZ=&MjuZf]lv[=mnw[}px\>$οO+:@٦p-J @x q3`;]|?"0Xh Ax~G'p,~%y¾A P\iPt(1A(g2XgHKik!0``t$`DC9m&AmD2(Ze 2rJ%# rjya(Jqc1'ILfXKi& kjPLb$b1z@}Jԏ!0 R41TUH-YhLz&@Z0v*@``sdN-uo)m1n2LVApuRӾ[')H 1vU Ɓ*xbc@Hd0(,Fğ'9o8( @h%(dM29As#&3Ɨ%h@v?G Dm¥ܑ s`dwHIBRlHk ^6)I/@wQ8$B F'Ev~cҴ dM!8UV"",u4`gڤ: j2+j>wgEr`~?GTr @_kd0A(0 :2S lm fcE t*>Is#`xHw@"`D(T?BsPP߈zY~#" kח &X]10:ݫ8Bnp&sNp9@=ș't $r$P/HC_ g3`".E 1?b]A”JHfaadE L-I:$%Co30Dۓ { ^ (;r@0G(*nxfu.8b2@dT YУTnQ=G!TRZMIE)TR]K1T̟vvfNY.8S5ŘI A*Ũidp?@&Fp0`&dq4C H**;(-7=h5 `_b2O(a(Q5@^вxX$rXYdP Cøz /U (ZPunuFZXR;0< uAC~0o?"7"qʈY)+xOӨ5AŶ;1pi_e 5'8d 0Zi"XQ-h,yZ@ Yd)6(ìаw7`P/G aqˍGȥ4T A;G<+rA:`ɠLϸthB7$tH.8- ajaс' Ɗ!$A8YĀ)C@//҈SdEtꈣ=HXΗ_@RLS ( Dx!0|P!\(Ce&1 3@P]B e`@)q~&&4Z @ zJ0b `F}|I@6)No? |ka-C"D`fPҞ v8 W~+  ZwaiI*H00i 0 p 0 p 4*l9#9'Jb졂|Q"6 :qaBmTf2cbLA>  "z<`^ ;>]M Y |-Nb!*!V@ 2bT/P" @$N,BNT("0T(C "g`f YAH f>J!^ @J*b16"bd`H`v$RtQ`JAdA +6td @A+%t `;2">1l;  | @P/Q|D0 L2"hBČ=@ ^a"A\& f=`$/<fl *2a:icDh`+l0 WQp(!` 0 IS0#vH qD1 " p<P@$F~\` #" $(2?x '"hO #TR8 gABXM@"} A6i B: `j ƱT HWAbxa2 TϜa(cŎ'!4 q$ "a> |336pY $X )M:E Ȫ"k`Z8!⇁r'T JV>s>F17~ F*J $)p".fs4#xt~/#Ը| 押(!B`hCt:bJOB!* `M=@:ijP"BehMS<)<< d"! 
)'n +;a;JXbr@X FP>W]Z$`8, 2Y.<";agB Yl#Yњ%Zu@;bS/!B/-ta>,@* $p9qhphhi6iviij6jvjj980̧\!LRX ,!`4|X@a>d Kdn@?B|'ðA`8)pH aqaA`$=`FH 5 ^@Yūl%|F& qB( )1~N#`X$kR md>d*/[e ^bx#@w&;roufrc,WuV%A&Q $(G+ 4%6w( `tZl[wPG%YxhpWF8G1C"@,,1!B"R 4BYWJa)] cxAG"@+pXK Mq+m;nbh ށvw`n.j !@L"Ee.!Gn&`އ(g|"vgMMgS|#A<`tGMs*(bXWAr<k Zހ B HdY,nxy1Y8*aƁ"h SbJ͠MTb&hetcm^oQVQFl\ of /B/~0Y%†5d86%Dr@!d-j`M"bCPvܢ#?n h"oRƋfL8Ya "L"t d(9#Y!q|G$ xi݋(bJ 2" @ `)N"^c`R D{Y;d ฆ f{H`gO:WX! 8 ^$'4|a,Q$ cBgMT-!7XZ)AtC {27!Qa d FJJ"'"z\ jͼѽ;ս{ٽݽ;{%pk0ʧpΥ%X!# N /2zON/*~d ) VWA!sE? 0F~; π0c#-!!K!baX{q(a 1QuH^adn1\yV a, +pޠZ `8@T\\OŚ!`pP'''V.K."V"aAYd2!(N],;E|y؀| v{U$y#" aQ"@e 0%<\)`Ϝ 2-qrK]>.*qz6]!Ym W q܂ x)Al$0 s&2ͬm &O"s1!qSYpmM8D%b0nzM nA#sFd~N;HPav :1lٶ$B/B/ b,`]O2/d a@f ?:pa&]u&:i!:Ar0b @=ϊK@^o< !2. H./b~`.D`zxWo@1੷9F-/ bPxaT~av4 ۼ+!:(e>Hs)>k9>GX%{4hf%5Pq+` U:@$b w@D4 ,Pl@ @a3@2@I%=D @ Vl# L! 0P&,D/nÁ y%y ,2]5#du[Um[u]}_ aV/cO)eo9gIi }V;EV !TV뙐R.BrA`m"-^kaf|\Pr(M#I骧]i"|N HwBY! A:@|knܪxyp\c`4=aӽ 9D !3h@Han@$-R  "0vɉj"Fd.aXB'Tf z0LV*0E $ $D,va.Z@"@~ M:e0gX7Ho%(`-Tj$ :uYEdmb(- Ly)$AڔT*oy= |dX@^" (' 鴄vFmP]M@j"4T# TQfBY `h g>0 kfᰡA;8z짆j&HK@`+PWU~&^2`y7~V/x|dBAbBgRт KID4J8* ?ҧGbtf,t,ՊP~*ġ8^z> pT@~:f'6Jo-pvCY44ǩN2à'΅Ѽ0>@4Iv@(" h@$| @eJp Cy<L[)1l@`Տ1t t"B zA 3n7d`CM@W$#nĨ[Sz<^x.Hۂ*a3@)Al5#ܪ:=d06F@{K`h֘%L8h(/ G #@} "Q! SQqX Re[MJS vmu0 @strw]}|Ñ2b  ^@0񹷴5S,iL^F#(~.: &)n >1G#ATW:?@ @@,@<@L@\@jo'` p r^[H0h-lxvf.Xy\b2 X0ic_h 9`z0 XHoD l (0}*- zz y<> (} 0^*`*рna 0ua[+ض0 !!n:3=`z`-zX$c}ypax hxr݇00n~5Xl9 4!x#$["DxF0te )Ъx~x 0rXZ^+ x iɆ[hR݋[@" (rDw*Qf-wH.Kfd{pt. Aq\X500u P$`+5\YbZ!y$*@u 0B9cƼyYqDdļXf)X Xˁ%))\\6} ~ %JyP(B)uT KLe"͂8av7q7vX=yF4FAaH7"LRi67dڗshʋ:"(bM \ _3h3IX" \0!I׃ih`j͔2ײ1G(9ׇJ4hk ЉMMw@Tiu<ʨTQW `TX̱J@h 'ĸɦ%= PܲP_=*W`OɄ! VavL<$C#Q KHu@a 1FQu( ` ho`2ʐHSp.M2z`#ԋ̏uITJTKTLTMTNTOTPU Q  wy51c ((}`PXP|c( Y %Z(~PUn5^h`{F hmj bֽ_A V ~˲VթbջUл^+Pei^@~}zTաhV yWXUa&\#3 ݇*I~WP|f@9Xhm@T)Z!\}PWWPw\0Ymt 劕͋Puz]U8໇|D@}̟ڢsZL ZݪU5XͣHYVZYWb E٭UU\-\=\M\]\m\}\ RS \\ ࠀl?XV8ȼ B#h08}]]]]]]]]ܨܸ^=\9G(ASUX!)TwZ^P_ __-_=_M_]_h^^ ^5______```*r߂w@UN`n`~` ` ` ` ` ` @_\aa.a>aNa^ana~`n  aaaaa b!b"a.&bn'b~(b)b*b+b,_x6^.b0c1c2c.3c>4cN5c]b8M{bV:rc>=xc@ ݀DcfEd^FdnGd~HdIdJT7 8 %HldNnQ, P8~UHtIɀY`%H~ Pikp9xyrbh6eHxpCJofpgqgrg.s'^-ܶ.ߞ9'Nqg=M?M&|a meh>Z尾z+87I*lL0(#ah]ri-P|,Nag>iiiia&KLMu :^h3 xyR"H@,i :ظ€`-/Y6o1*q5Y&݅(+8R KlNl^lnl~dtu'  ~!]ިj:^0l(Tl8hR U=hpe0K HRh=ipk8EXR*0l׊܋=a- P]GH^Xp)i!@j`>IXpinHna݂H+n950qc1qGX.;` &l…$88LXCX.vn 3[ 6Y:?)Vq r!r"r&ƞbrn &D#㠧l, ԇ3l(T X{# +Pa2nκPQ:V` QhhOE₸'8~,X T^Pa([,;+dMfKx{G6N︹k*!.0P%Xg0 %QN: 8@pq@H6,l[X>iX}Wp.j)hh"xx/x?b#$%]4c7 @}xȈl/rӃ~Ԉ`,!p$BT%ɂ왦sb14F8$ + *ʄ XĔd XR`89s:Ӽ@T B=DT]F}!HMc3Of8t&jIIRD˜$d績f "8ZbD `B) @Vd': xEh*ZPU Ar'`%VdNT.ж-B> O)G`'\lXj84X 9GWԨJ⦪RXf'!'N3'پqYyZ>Z^ҴL㪀Z kYѰT L8z<#m޽"]f/!Q r*`Hf$e[ Pb~5)BlH Kx{`Jxp8hqC_hJ>#y`{#`ay䳏B>翢!A 4$@d >A%<?O B`Ϙd*DbQ8M:@tEa. '@E` &3% &RtyT@࣌>)g$~1ao4d Ub(fGa&2+i>cWCO7A޶D-0ŶOw&Y9, ʠPOa4Wv{]wx|^?'z}^g|~_?~_ , Ll%   h졇*"r4u|LHEA\d1R&~ |`H~, ̑x8HLx|Ig\Gzˀ`De+j8D0C? AД- CMEєmG%IҔ-K5Mӏ,.(1g}Tuj#YTLӵ˸uŘS UP@u՝giږk͵mۖo qܗ+O(Fm.oø-(C; n-. Nn%5u؞5=E.MNUkӊ89veٮmu}.fdT>ꚮε[.ͳNյnݷ幾:&/ %-5'q/MOUo]|Хau}vNhvo7诓/)r~zE(3P@X D P->Pcb  d:/#*\AY}P E@H"G+XyAj |QJ!x`=7] !Hj @o4J()EB;$ h @/%|jx RBHY !DoA h<"|%E"hb3lG(S"$ᙀ`zrO#QPњ+X "R5$JG.XFFb PvEK)p/< Y,y+%.0O2 ['0U+eg_T \C cQchQlةp=d ]' gv@/@)$z^ini~j*jjjj꾰k*δG Xߪv4* l' ϱbC'R!bREUUb),ӀI2bX#JRp]@d2{Q``A\, 3eC%Đt:`AWR#D.HTT g$иLu?pa@c#h4iA30^2q8*AB$T*cdCA\v@ P:H5 e$ 1 -I{ 0!(NJQMjc>\I"[ tEi,vnn^A7G~/֘,аol k#'!p gA3Ho@1"|nO9O*|_9V77xc62YfF`` Q i zJ tک R7a xvn4b8^9h@i ,@$2j %Y`~ "e+i:dž~&dXk%R&. 
* 5B<]m^p~B6xO(0!;9@~ڲBanx;#f"g&DW{k8eo!;$1"(~R]K#p;@oC|+0;l)sMw"`$f p1 (Hh>357)܂! F" QH+u2xrQ|Q\p#` 3iWHg1PRR C)af`KHpgW OCPWma %2%yAvHp~pIr$5K8@H Nţz!wF;\ Р)!Q \!\u ! HuTدW'a "~_ph,A` )` p C cy%nYA fy:-(Hh玈鎨H" /'$z y$RXEyXrІH1pd N#p l$Њ:@cv@33 Px@M .P|7`O PJ1p6 IOP4d Ёt3K&jl& 6`s` @m6r0#Kzen2uC 4jvH$ ;P p9*ehb GWCW1 :+).WvPp} J{˜͜ϝ ѝ)ӝI՝g)e1>R9LQ(gVys'ǩkd-tz]>Wv{]wxr/2ɱ:@'8@C @,P , Ll$28.f40/LHtŀdNi}**N'ǐpB xi 6:(Ig@"d`L q&iat4\7^sBC<]P%7.gD`GUm[u]ו}_ (0CQeM`%69ѤlG 8QhI LRra *B3`R/8Hv+ 6,(h9:)s7P%e=Rn>@3@,KQL%BGEHoiڄ0MyלgY  K J<`/ OŠ AT. CN)E  x1BEffd}>cUf`$`O%G%&T`o:=* #'`O>kM !f@@# ҹ "& (q:DKQt_p>ߏdy"r>9[o=nakux/@28^hR>DApdA# Bp+ B3 Cp;CCDq$KDDSEqd[EcE($ .1`2#gaj(0z yP!2,I&EASR!: wEllB(v$6'@zd% wap6 `𴙄6@OX>Ab?q0Llp03D Тc| $`@TDe;6ش+8bP`:Nh2L6:)=ϳzPEFQԁm&f ._P =v;R hIUC| 1Ns+隊<xbW +gON4clU: -(dvdH.yB8`!;A@`sD6M9N VO&4!*|?}+Ku]b+湾sz.GWPTA2086ʯnq DrIr5"(9(0l[&ʹr׶wTC1 AC@f(NJC8}Hp8dZ @\`0Bxt>@`QɆsP,Ag cm#;$kєr!&DJ9Eb>u*;Sdڻ ѻ7+0pm< \TfƸx!@Q7XPhv;-8GWJa,e[Kyq.eԻ_K0fØr@H$4-4OꚄɰE(3RΙa@W Sa&KpjC` ]`pMAҲQN`nLKSHzTiLaP+ l{0-Cd30b @jl5!\As4/. D,™a `!>%pj$S@)[ d8c A^@ ҜXs{ϐ>裟ցPEj %a CкC^Q8tAL UG0>ADQ~ V4F'I4v4(@m@;G"cXz"8sP 9dl7~v 14~ S1)U IX-9fʣXjKbga@n8ї z O> 2NJOd TJ;K8@O es M394!d3~ e.DeݟvڠknfA  JQ (A A~ѝl5E+m66 ֺ]tua~\ Dx`V:Ǝ¡vb~qy;szCt~zQ Ly,SXnx[,b^^6oJ0p@*SfIBn8naTf0 "`^$ SPH64']ba@狀>Ql%i 7r:l.B[q@x<`7 x!<>( h3#@ KFPJXJ 6n( [/ۀޜ>$f ^<0C`%Xq])B+,,L.Jp2[ ;C7+G>>>7ip Y̕+x8I4*<"XQE@$"| RMJ$m9XbHfz#V5q!-ָ$=Jmz׼}?]~хi@:{ @X"RĂ[N#$ݮZZ[ [[-[=[HشY]ZEh0&uR`m;hBoRp؃k;np%]Zh/>Ԉ*j8)*e4 R͌!K2tyJ7ЂO-r!Ҏ<2&&K֚鯈fd(pB  E+} -lYPU 4\~\\L؁\΀Mπ.SIRd](=]}'QP'i5 _O7XU1}UM/)M'C8:c: 1C*\c%pFk0~ S<фZ@a46X6%ܵ_) ؝ [][)R(&)q bBb0c22c>4Xз J WE 7P}r_M]+h,\@ LX1D4DN @-]i]ndfDd3ewJq]y0㉂)!02 & bGh^ s#hdP'=(fif渁HE9Q9$# @_kW< ȃԵ)8 (n^`.cef`Py] ]JS4? ӵiNi^ini~iii[`pۂY( j(j=_+CM46X$d`X)4)B9Zj@P+P\z~aPbwE0ylu@7іʜ$d3~CE#(Vj.`k[ kte~U@%H@A)vUSI\RvlD`Gm-߀6PN${~ϴ JDGx[@K.~6kցp Gx6 !2p pDwp*`oZn61Roh.n{$ՅVYѮddRA3N.R8qJ;xWP9y'o%=8oNvfoƇ& 8 |;1VFW]qW]s у;P 0=(VXGR`)B9Φw?twOuw_vwowwxwy:N 6%xiDS jGPoM"fȱ)k؆δ ]>9)?=طp@f [;_8'*pxbm1M Xx: Ey'qw@_)tj}F֚fу؁^=QFYFJHcn_=r>8 cy*)w@[MWByGm>m)P%0> 40aNO`IϳP5{HZrK凸? K-G FHNxр$"T uY-86hz8|?WO$A8̨0!=? p@xL.p.Kep F:`&M0;(T%!;K@p+ B$a+:hp@If7>cbO-,%lK%2ҩ$*kKagkI4fy p3`JL5φC5&@\\=ku݆Sd"]*@7Š4# y n\F#6j  W@RO /{SA`.XD%@-@fGN6˜4&th ;DɃp BN<F[M!@T B=DT]F}!HT)JԽ1LT9NAPUIR=QTU]YV}aX@=iES@5eAuuX@Ebő?XoY-hC'o3#HNJ <%'Gl<2f19ú09` x@-*E,!C1LJ_^Q&QɎc&ʦ!.caN ozP\S,RjED䡵d}7_ ȡ:)ix{ ʰE*ۏ4Q{x^D{_5e8 NI@>D 1& bMslFfC!00ᬫ"0T8|V pbYk`x,-MX;[HHyTPۤS! `Ɓky޳|` ~y=,[t킐{9H (N ͹>]tԀVsk-lsPjI~) bT^0,ˀN4d^[O<=$*^%XXSy !Nbuc|2АMѼ%Jpq8!ax2\1(F lq( b~>Y HlcrHII, Z[0M @X Al3n\" Ð!y XX;Th\Ӝr?!$2p_0_r WtN:dӾxO<|O>PVA+@XC[@>B3De]Ɛ-A>f7t?)M` dK8~P5 gN#jҚWKi}-jCoN@];#֟2k}g#ЖFRڰ|YK_NVKOudպW'D椀0AE1O8zdQ:ʧ,uy% Ժk5Y0p];JK@5[[yo3$L-<5v þ-,p aL;pK`-8>@!V@M9[ma { j_~`&`CAh:*(ý<4LnGQJ-D R?e le5qrT@@YxJa ؾ|U_,e]0f;bW @Pǚfmٿ8)6hRx18>ͨ,&DU8f?Hi%ُ2σޗPjE&ڟTjUfڿX`24ں\ku`l&defi'hm_lm}Ml Sjtnպfۿxo%BDp'pL7„I~p%_q5w>8ܟrUg-ܿsT#O|su=t4P<=sgM?uU@tҺ]v'e':gݿwuOӿmx'){v'y%_r'ǩkd-tz]>Wv{]wxr/2ɱz}^g|~_?~uI'8?P , Ll 5 Ð= Eđ,LBi*B<]eƑmuJC1ly!Ȓ,#L%ɒl'K!DX^1F~A!HY4Q)Jta1LYq9NAPZ5IR=>B?60j. k _ΐF; b`>/ `7(R-?!d\_1fAh]6QjvaBN>tq! 
9W`NJfXzU*@e-G `PE ivւ{ .@:as.j yXe ^&qoG1wG H2"H4$I2T&It6I] c `"a&1 Sx`_"h- `^N , QȉX@Pu L0H vJUWVUu_XV5gZVo\W5w^Wo)6u+3 qaPLsft a f67fh Z xbd(F, І*8[Y`&, 8hsw:(F A.f`=C9og9wgh:h6i)Ux4NRf0(: _ @|KPqq#( n@ 'ݒVmR _.~pYq_.zt7Ou=WWuw_v=g(b=X>wŠYp ȬYa%E0B>SѠb%鸁*q?qdEH) @} D:~LHZ=NCx9wk`tF A8)`f A"ϝ-:d~XKIpm9 BZkdqv}@,d˒`!0Ty#`v¹ 4,0+'50Ѩ !6a 9C 2H9 !d4&EH#dt 6@EaAypF] A4Eo [Y h@8E/2$*"S ` DAԄcXk#u  +xYd(TTy#;gtyO9=g^I27%H3k, d H&>cRZ8:VtLe5bd$fIV"\ h@nAT%,b81-DͧOFU:UjWfHF =b ?g|B?#} θ֭; al5&Xclqf*c쵗fY9gl Y1dȍVZYkmu[:i"[yom\;qMm"۫snuϺF];unVu߼^;yo5Wd]woE_;}o;C/u`< p6:+B G a<)p B~bR'XevȃgƓ~ צiچm(ѻ }x$Y:y%(^p"̘'"F2hMZl;  nafa-x]`h#g!xyT@b)t0Ȁ f #z]f)'ދz]wފ@.^).u@6![j fZ{[)?ܓ~٩sz@=gP @X^E`<2P2AF8߃raHP_~@i?"0M6! [ 1p6 > ^?X'X* $Բ< B r< ^0>x-p( =Ff.]1X(a5N˹z[U@+cpՀ6 7@1$Y@^SiJVf:Vb!h>g0 .p9٠ @@ ÀF"c-f0@p"UNSi=&gT6 3M@pH$ c.  OCi1 cD:O[H0r (= P ֠aDC/:FAA*28 @eR\@дM4Bb7d#3/p34C@€S`I;V׶m Ks,`T;Zmet7X,@TNl,mšK@6Ű*BQ!KEOL -O1 4 v2 c4 9,~L8@bh- rT?Ô xBtvk-mDG/ DEW-# CxK Hu; .~N8G !Ë5~.Da`X>FK~!Pqip Vp(Ds L&\S{y7Pex?i 5 (fA$ M;' KКG=^2|$ FA3vL5w^ snݷ2x'$CD,2Z;3fݜ`=/}Ё`~h mEA "knm 4пw-X 5bczG2]ˆUrچChqʸΙ- ?B(U}8Rq@c0d *q 1FZ`:.W0pNe iU,BG7Xcy-vdO qkxfO9F;8.VjI~᢯WQqp8Gl͡")p|(8wqv/ g!|&V-M<-B@0tSG,m#D[M8k rWⳌڻwrOnw^Y DY+'e4u 10Ap!i:@N@39|jvNg{ϙ?v <˙xRe4+,'dTc Wm|i0 i@LM9(HXf֞ 3A)#u6 oU}]fwdpOE'E7*ESuBQVbdVa T7\8rtLsGOQ|ual FJ7&mFW` `a,,pD1: ?Qu]Xv'{Vz#IPItJJJA)JhgJj(kJn&aaӗa!7a @@5D BH)'YE')r`A/ Ed0 c  4XHMM7 0C0@Q YS!~&m5TPR & AT9 @rC"Rd)œ)-)̎a36p=!I'PM~`# p 4<$1?+SQ=Wq 0d*B b`P#ApZ` Ď *> 0cH0pX@a`,p $+ YDL).a gI)N@T3ih"M 3`-* 7 HS0 cW/p3b,p' ag6xvIz~Rä A hnrI6: ug0`x 3 mT5  Ny#Pq)+鞘a ;P1н AU(; i|`Ȓ 3_) @it4|y0O "p^X K@+`7(Pəـ p 'H90Ye@ ;}s3pAAv(0&:' 0PC'Y@›adC1k8(B'/C0 i*T%aC0' 0Pü:J 0Q U8pې,=P&Ow*YRS`&ZH6p  r#&٢BU:)* 2p%H0 %P⭊mcr0 ^sgs;sҨ  `,y(.Sq( @ $Q iY=`&Urn:)/$3 P>Y(P("'Vst3yP:@s֬P` VpH'8zX @`'8(& }+  ߒ/3ZY _F x +bg Y=ay P#7::P` P8qC0V@6` ؅Ij' \PN[s{%? A:sz"p[Y 2 P*>jZ_i螠I<Xr K IƜ̑љ"ҝDoKJ?;䡐+ΰpӰ Q[$0a н+Ի%wǽ^[sq3 a  0Za yfIl魗)tix '*iLE$I')[ #pC9gp1 :|=|?Oca:UzY`; \Uū9uuB;ut}1o"3 @p 1Qji0!So a 2!? |o'sI鞻zZ帉q` Ȼ9l< >;I KMLa2rHh˚Ӣ\ƭ+*_+ؠ C0 4a9SqaY| ~oSC@AF%~1ҩI8& j' Z]]3zٿ:Qy{D/N;@Wϕ@gRZ+&{68  ,3 N jz 1:<6|鿜|oiL]qpٻA{Kt{ [ Q˫ɑXX p*^PZ۰$ Tp(mjjD ruH(M(m,,а*5Q]*6=901c[g i?*ͬ@'ܦ?#Hq Q:-}Yܗ4 Ͱg',,T c;Q͑qZ[Ȑı 2 EjZ{&֍Ap\D J=P)S쫾aJ2g=`ʑ!:$Ap[Z'"%" ЬQb_UW ѢĊ  0XR7 <1iM \ipP͓D ;C0=1:9 P@cPIqH9e{p pL3P p@ 0pT !Bo!I*J9PG$?qtl U$??bE e@ÜLE`~~$[,P.3$ :P Z Zm7h47&(V0UMr͚ x59B 6N]Eʐ$)FJ$@Hc- (ohz&hDg@֒qhLv@Sd)g`őN$j CXӪ X4YJC̈5$]R0c` 4,^.@K3|\)0,]W@K`t/ XVsSuE)!#uPi(rdlSq]%{COӱh+/p`zCg}7Ao~خݗg6WR<ͳ@- 6 lL(ʲ̹ ՜,uLӹyW˸8w"C + xF hy6RY9{uUMКH $ D#h \h?E h )1ȞcQyP6XDI A)'$}?'s(7QAHi#RzQ\`QO(LÙSNGnFlb oRd4u/HP$hGp-"r\zp'rlx^ i5=^59\02TloD3|JSGʹaF80C Q+R`] @g,ힳR+Q;)еGqhIpY Yf-Iw/%No)"<<Ɂ2 !*¶WJa,eJpm!Z1HAVBEF+>E A1ZrN'JP@Пh$ШBe ;M#DZo2F$@`D_{~dx &"V["A6xyUXkElS6@y HhHI% r]B L,PՌaO5 =BCLxV@0ph?BD@1J?*: BQ '08W"UGXX2֗ܫVNuQi G< b?=XidV hzTLB@ߠ6BY0A4&> d! DH;K|. 
8< cr1B 10c~1r)E']!6$\<c`UhO ݵ@\> @ l[uۅە%K5HUCO\XpC״-ͬQL, DX-N*a%G4l D*ܝJ=K/(Ø8P.|?@7Q U@74-dASp|AŜ`LOP 8eC@Q@QR2SX@8+ _P#[XN-M픇4p@ 2)[ ۮ_JQ4=zA4 h%XZ_,ޠZPeY VYZ%-Y\:+m C9Нt]?[]5|fCfC)xDhKI5S\CXfYeRfSe=؉_hERqe%g2MNZAbd)d1bbL:Z IՠQpE0k8?طC74<ɽ F;`f (ca$YC JH1%Q&{ZfĒ[´`YIEI\M_-_=_M_] JHS UUy<2T6](#` ~D47P8}~ge^*~JR&Kbžcc̦UiD`vcv"#$$V%^'NHC` @2p' )ѝ Nkqgk|v"BC>D MMjr~h?EaИ̐CGԆF@ɝT_alUz)X3aĘcHCxl@,4* Ex6)cfOlN4 jeNj^}l)(hª(̪;"5 RրOlFTŀf j@H7)}-Tfh|m(ajavenLCaBaVd^)گBfClŀ nTd)l,ЅLk83 4eHdNP( (!Xh x@ߺ]vpF;5EGvv5A< ؞oE!- p,Hx䝆҃B.@ A-ht88T_/L,$3萁p] @`J0>@6ɍ$!=Cyz Θp<[kaV̍@h[`TsQe>D̀4\ K Q17D3-PA)Y@4@6 Ռf48p c_ F*0Jq&@unLJ~DX@0a)ppvpȬ0dĤED))Ld΁K"=` Z= 7r>qC.B$n!*-Dh~V̖nN7C7e XVP~C|4?\T2\dC[!r"r,27#À(T7B3bXƼS`Jʌ9\ 0sݝ6r**2FXArH,lk"5zW1)I4t (s3ĩd3Ep3Yf\9@hCX5ZD;ZsXn5l@H;$S9b uy2s^۲MQIdoJfPCcP.yBQ SK8Y@ 3iXNu=NATH q1XPr!q68* C@@15_ D2@d;Co?{9tn[S@%-&3R6g>:7RHGjԥm=6/rxC9v@Tܳ!2 I0@58f<+Dw7ʝJ7P|t E?CxeckD$+C$<[,)x8C @ԡB*H-,O O2Q WQQV 8$w.2O p03 (, <:(s3`My$/_9Gx>pC<36> "?V@\?1M ;2CUtT`k~4-(FPCÜ}x8y38C *XtuIgI?J7qۭ3`7lN_{J CfVq,ǼхDJ<cWC6:eB՘\`8 6{%]IK;4;'{=QJ.q/59t2𠇒%LTpp "5X; $XP. ϋK:)!X\6_ t2"`DV-U2UL3 p㜗')2,ˏ[T[^ٶkv4lDqLb3NTEa .Em~,4 5B.DdT%]#(˳NUC3)3kWzH7WS"oC0&XD`3dL&pGL @tSjrYO*ZT|2d<S[w'! 1( B,D1S2Lp#)kM ! Y 4Uj`!K^պWb +%hBl)Ӱb8-aQ%JteVO%5d%];r56CE2 B6>šUDZXҳN>RPɰ&`Yk/&^X0I)1F-c rv@ZbYg0~ gYgjGql4|hƁƩK{`/Iv؂ Q[4"lLF7XUR$1IfHdƕRɣT*9b]`)b0V},TA}ifFI-Yu\7֏Lݝ>[2T.ux4;{u|X_-6n2NF'4Jdޫ׫h=|pV\+tC:~4fwC'H6mZe~SiS2 &X6V3[SlmZv nUC8guN:1f wGlžR-+h v-߃_K֋_4sfs3)9lg(~0(2*1)$YqK_udMّvTa%ȪVtרDL&p;7:>p460T c -LB]!.RhgaE&HAD;<ɍ ` afPA4-ׁ.Exo&[Z= 49s.;]v4*Jof2&B":P'޿{|C]ۊ8&z*s@2q6GG$PZ~}[$gF}wDG4u0CyЮ[x[,ep7t_`bSD4fx567mCpZ kffmLvp|2#D}\06?g{a 2b48 P Q27!Iѥۄ(ikl(8h(҈((w(+0 q#43gpŢ 40'1@W}v·5[s/3MO3>'Z 4A_ _0MQ`-cc`ԃ=I$M_P3Spp %Td?*Ҽ p#~00::0 Z#qppv0`30#!s` z},@.` 0'#2/ 0 GoDĺkfSqS`}!c 00:o YxYHxU) 0qg@"0QS i=X-PUP#~ "0;g8@:2#L&~ary01CC=p fZ@q(1="0 Q_ !-[ p)$ Gic[   YbKc 1h@2uB/ 71@( )0u\{1-ГY7)9aB0 1[([g`A ؽHkI$kA;ZT/T0 T`As9Xl qO բJ&*q`?Ьz025 Z-5 Oh )v)Spj/:}*}J+) v?16&Cl C W?/ p@ߕ1qK#0 ZQTBY3DEugՒ|_xLf4MzU qsFHʬ$zؕBxStz |ꀨ ͨH0Rq$iU97p)5+Wr'v#'ǼrU0' 4 <0/ VPl8 @P`*/EA`ExP"ׯwEWlA3ٍC*h?wpAA#%$z+I[0Wa D0Wz`xP谋!~Qcy β铙Y9H 8Zq ; 4jџ0x L8' s0̽iν%(@,ƉLnBFlYn+sQ$ x yz0}Ct@ꟍߪ\A\0 ꮮu[ b:iPp@-K* +ݾkX9ɺ<ͺpV0 :w;q*بkgrVq@\Z%AtJ ml,!c˵I/B0<9|;?11Ri)ƙ8˗.`l'{@*6͏Ê \H@ L0%Q({4,<#4؞>RUU` Ɛ3G PZ/ t#Ƽmlo#BKE[0X feZ#P q8-luav툢*79;W ByЩLN,A1T/R21\g|\ }U P R31 {GQ)(C t`}Љ ˺{oZ#=\ ǹ~fl"(O }:-~$c<)2@] 0 HpA}PQp:  1Pd03`,ā P)q@Z~##P# gH2S NK .G'ʚs 3@ AiS'vny < '?qQBBig ܐsŕ!AAp?_''++EQGsqg1>P@] +$ Ps!"0D"`%ʹ۝<< ^.6AR)0 2exnn~~^_莊U䓃A1M 7s Y P T--P(N Τ7.@'q[k+Qr8! BOS^q PЩn_; k]n(oq0|fNfz q0K\kNY0AΟ&CP`X M6Z_s:vS` Ee jeq0D0h Y _A1uak^Y]A1bNf7_'Μ$yڹp!gPRO $P#O1@ b@"(_n`Ld0XAd:#(6JP Dp )Υebzeq)bz'@tBP)T" nPW` m>SjvPv6 u$@lD"ApB@1|_ )`FOBPR|KLEh$ȹ;# 4t5 3M 18z, B@4 553@Bo½d:fhSUPQ.pAPt% CPEQteGQv _`TXY(>cD-K@=,xOSB āhyD3y+3q';# E0 fZ)+j⺯+AӃp.@oӰ4 @,YZql#kzD+1=ܹ{gUW[/s<:; 0"ke${>m gBJ3 äjqUDŋ(?Wq-d".އ#d>aB)hD:nD)AmI) !D\,sUM @ġ2 Ցh<Rw$4II'%Ȃ^k{r Tك2f3x>ӋO(rP?(lj>N@ ` a:Dt($b-֢' d,?rZKQ٥raG@?04ѥÜ2G^퉲6f^;A8*N*a `tHBA$4'\읳!ε@>h(RzDHB῀ffsŊQfcx3 x$oL2TZrBӊQAjKȺZ=+KDF)U,Ժ u0nbS@ʂ08Rjjx hB] .>D+2JhmF:U=r4T~kV֛hbiryVeA+i֭۫ D]wu,׵>;SW齊WjgsjӏWrZ}=rJk~)#!Z80#< J*\3r.8^ vF_zۈ0uz^lb@a\?I"%8*lּWڬkm7Nv0y5ʺ;6zgb2W;|sJ(P3ƵҜh:# aXpt$}4椢R7PM .dSP#6:s̀'3S5dֺ\{bOr IT:!H-C;tۻ޻|,9[ѺGJl]vҘR YU1+h}g+tEC<|WDYC7@a@L~:c)Dp7ÍHOR5:Ȅ@B@789#T;Ð@q @=idƤVE< @Q~:#;#="K#F$jG$yGdu hL\nU#8V$X>I {dMp7ì@,̎.eQ$,?BvB1eUcZBb8ÀoHGzaAi7#r.@CBI)_$߶ Os < Aπ^\%@c0C80,lrJH X^u'X-bInwfI&jCjk\>z^vg xWO(l;xT gߎug{r'Aaǂ? 
ig6sEno@DY6Ű4]hh(ބF_6@Z"hvґh("1Ck|fEÒ(zab|檖'J(ދƎթvj:]-Ʃ}i'g2sDB†hn~ nRNdrEdJS?.iT/i^}dCxBxyUhʍF~)E*u*` fAf fejWԌ%;E44M CZ+8P ;.B5y\*DT+F@SU=Tc@>0Aa*m7PbfgFλ¼ڽs++9sxp2cN+Ƽ֮lBP\*hʌT ~;k,V+`5jC =ǚ- ъ DC5dV$|M@R!ŬbƬr|@ 30 "1(R+# )R5p(ƥ2\2z6 Ek @*[>(4*qP@D (&Bd=@XjTO`J6|~ "tjE*N`Fd0X^ْK@jq3 QnX3_B0j"-Y !7;ܮ9H` 7)Pt o(Jb\et]6Qπ/BL Kzh>?/??0?4 ߦ6 3!R@q`>c8+x DF 2< ̶ +P0K20.0<pY`#BLJO˂3$7Cp=ĖP8 d5ԃ_qh (&<>ec MMDt-26Y?E 0̘=Q 9MClO `4K6 `-pv?d1ަ 8V^Ԯj67>e0:CX1 sYy'E?xb0"AT 0N-+A q:@\B#Dluר?ɄpO*^M17(.zp&;  kŧX.ioѦP: $YHEܪ A!$PC(m!Ј*'E(\Mҡ[+"Br3 ̀/}>$ș)MQaT:X,anjʈ rdJXz!h]1zYjh4{xI僢 ӟΌZ$hS$ *UbO"gd<n2' ;TVs,$ޖ}1LEzw1"%HɅFj@D&TSrXC$F4lD6ۀpax]{/Je!H UI)HԲ)LOּX[m-/6k `6@«Gޛl} g/! نѹZ_ I(U^[K 1Gb`b!1ĉA>[(Ʈ:v"cĊ"@% /f/dpzzbj y<-*2I @ `H 5t @ (kk n @aiq*( 0*`(pRG"0 2`7 3&0@CPH "p1ZZ4:1$Q|]i"*{ (S 72 c&cuv&(V. NL-p 2\Hg$(j1q+p SLi(Pi (86;$h>ACe\&L AU*% [0$: d$%:c 2ApdgE"Dp %6 Lq& 2fEK`)*`43<s 0sA 6{"2v]*܏ƏkF<b 3X7_:ȡBBOQxIKMQBUr 0 3xX P +Q]=`=28r" P(p2 aH ZAo 9" $h) 3 Ph !#%p؂6E഍26`J"X '`/kd"%$PX0@  tAp &"45g{^*(R.?@816 2=S VLF)iF&UyFµx@X玘(Mȁз Yv0 (pU ;pښN PV%8b*JHƪ-C 2ixaHD1%qyJN>YE9H``cx50':0"` f$ WrX 0l s@aZ$1^K ykgoisYw$^ıs1x(HxF ZH': MK 10'01 RpH9S*hT`LV  2U,51* H-*Zό :zת8̢j( _!x!zzSTiY08hJWFF {!m9r* ؇CF[IjKP ŜyɋΝ ɝ0]3Ya%8H ("n)|J%ʶQ9ժ*ٌ ,d MT&g~ꬡ@/d.;ږ`%#q1zeZh h(+z=958, S*/ bVHYZΎ@ӎ|<\_A JĤTzɥ!{ʜeAAMp U!1 Ve&@:S:BHAZ$]g0Pmvp:"GP +\v0 P R?pB}! ЬSSpP]B2V YOO:` ځ.C&p 5) // @^wc = ]/ K0I>]@ @0[ҭ'50:0T!݁2,Q \`'`GBRyޭ:b9> p35pH͐﷠ZM X 2Ry,%8 pŐ`kv++^mq+[PgP(YHX'wELZ H\q9J\]yaY}'pZnBޝGS /U 3S~LxPeUq@Z#P  ]" `FMbsPe  C ݑݽ!9or$.tB瀦Q* 6(ڒ3s" z>&`@O y?ݎ  3 0`DFށN9{pE8w, 0E #` df*$Q^[^[qܞNa$u2A dt)W@/MϟPRTFp? 4S^f&ϐԌv[_a/er/x^|>ox?qn)K!#F&렳Rܝ˂YF%#^_7B.h jř @@vJ d+tU-0+ *P:! `tH)s(P _bu'5ޠ̼"DE`t`X-h(Y @(vZ5It4AG L֙Q I$2`0N?S@R3T I=5HpŖ͛14ltw}Nc 9gͥLTGdDZ@?S`W3*JXԽ1LN8"bD6 2O@ ]א{_Wm`Vbc( cX8 Pd7! r@PEH-MSMUվ3@#Ί äA$2`4@|4C@` bq#IKBԃUL1TMwe{7sE & \/c4H TD9jPb6IZš@9} *HSYFy q2 GA]1.C$ړcd@+A0TZ ڣ#91p]MIC<30`9L[ݙHYmt؍p!.p) C%)0]b(>+zЬ8D@<TbJ6:\f,X%:ioK "pSvLCn*džD?6C4Ƀ05@ M ~nꮰC.lH2lRn+>W TDACx9.&&~𭪔,f)Z.X7Ոaʆ1QdLy0>.QD.PnAT.=~\"(D]N8C-S.Ah2JJA>@:ނE[)AZ 5W˶ȓI E "@oUH $*N0+C p 6,O`0,C4eS^.?213339l}\ɥшJ WCDO_p I@08581: Z'+$?mAKx ШP4sK @ q=Y7, @Zt¬2ÕHλ9UDml&)&5NՀ1Q3G4|k槫IJn=0nZ =p]o;@oCnC@il6l71˩ I<-'x02,:o5~@,́P?:GO:W%B5C^SSCVI`?#d@ 9B)'Ip ^2nb23K*92DCHw=xqbpA63CL0 7izP@}LC(Xe}{ c#*B1q9q9m))OA)Ӿ$w.2RhhK ;Orϻ#28?vĹd.@f{LD[;k9g7<+^j%L:RvuCɚ<B+l=h/C Nbt<$OHB8OE\P#[ Kt)@k^A|@0Cс`]wk ɰ)D4X(11Jćٍ }^<{+;s#v@{{Dh4SFhȰ@UÌ7KPכq>J0Cњ7a\q=f~;"jNO8{,`}Dx 0CG+ tiAD9$}!i2(D\*U 4tRDB"l+v4A(mt [ P5 *V4j#H) w=b4:-IKC.Wb*Z{G%-%1K!X\6@v8Xm:㌎aX e)lp0`J73S!d_1Uf3p2lNB:~' HWk`s22hP/tt^/WG_?f#.ZqDOHPt9 4&mMs(` 5 g| Dא207\ FCpN + hp=t~dD~!ň>Imafo O`x⊢Ⱥ0(6/c< (K< B3NdK@{dʂ>@@x!jXϐ0L 0;TMb=M$i!+o  ZՊv-lCD$0En0NhMu?P, 7\m޸.+snA#>2ePUg=+O NN\ q`[-O )p.JZv)vj0%K3dN6 w8(wbh#5j;O +pI!̀.MܴUa ʻ.pbŰe0puVL 50% 򰨧|,!AДp/:Kt۠<&$"(&fmB/Ԧ'U}d5*- ؝zCb- nDOdxH SfJ@ I,(aHa;HR"y1VJad &^H0}-К2HIA*5 A(EBi. ?@#B#Yc>ʷ̈́.H .ɶc8!doRQƢ$?`gT`(ݸ*$M Z(rhJM#DIs.ܼ7L0& @ "tJb:ʥU^Jb΍0,T%`v 'Ŭ6! $:  ~.eC1`_'; P%B(e#[@ UyٌZZsy:'I(@u@#d>Rw-6$Bx)5(D`Dd^b^G8 bKh6hxPhFva 7k#@:Q f*Z0D{p& app0P;Pg#gEEd~UHB"LGCq G@mtF~` rr/w13zW6:/7,r0s'52tkxdKdOeSrCRje%m#>fg%Yp3`pfx a&0b\#Ԉ$ċ(MXQc'>*vghvl(4sa!*z|'ig1@FF +XԌ֌OHd=Pe莅хdm DQ4 s i fJX3b.@ VJ  q _6~q=_@J:!B[h WNH-Lh*l  S!.Vp'[\UJSEz_X  gB `+(+* zX4nlnx | &u[8! ?H1exk=L@)V f:WqLAw'Eœ uIT1* /+ þ(+1̃$24 B݀2 "3  :TH!AY>?)qYwE+;b<%ɠ&,B(>Xk Ʋ:ҵzpk(4IETYF*1nWGLD`&KKʇ QsVUb64GaɒZ ᰈ"P. 
= A&҇B0v" eYf\ g]M:N,iGG""j+RȄiW+HZ&!$Ԅϯp.rC1zr3By˓}f*(۷4|iqsDZ!Hs]Ԅݷ}柌|(@Tw!\Y|0 !C{Awp+ dضeݧmwH/{("Ur샟{o;pLZzP1gc%X6:#𴎗vA?q%ŀc̋ T˙6];'SRZYz-ݼ4 eZYd52UX'f`ղK%h؄0ݭsrQ9Cvґn%M*']_3o2B6(X" 3) XBa$,.Sx!7tKkOH9 ]tx/'RMy SAJ"/ժ <䜷Q:f2Q!LVG^Tܡ@`z/ewol<=#x~C2fb 8!B -2J3Oɠ#D  ,\*zZXJ_[ "U/!`A aD|>aR/G: "OR 6ÜG{iaAް{j?G.XA>% U4L 2"@] c&C*@I5ԓR۩ۖa}tBe`"Y2"yɉ,аoiOBQ T,UJ^Y!rV ĆЇ 6- :pRuJ ~Ѝ tA>@ Ml e(ÀZPc&-=M7dm"d\FIY/&dܝQ*dA2! "K.eܽ.v` !! #{+ e@>gW5?IVydG1 A@f"gɷL;}8 \UuE+:닸{ܿd׫ c?=.sDl.d hs ^<}݃}cV%@9V O{ͮtCgA˼م/"zc:DnbGO>"ooh/t/zn\"FO|" &Lm "N6 ͎,!0!Q0UpYʍ bmq0uplvzQoN l".h.U5 6}$i pHc 3 p 0 p AaH % 1q  Q Fjz) Iqgh0`&PUqY]a-,mqe1uqx:-!.X[&@2c@l`D3AYQ%'qq11qȽCpl`1qpq x & 1sQv#@zdcq%q)s #25#r9#?3?s??4ȓ"@4A4AtAA!B4%t TBt5Ct9C=CAD4EDsBBC4IEtYE]EaF4eFti%LP"SFyG}GH4HtHFG"#GtIIJ4JtJIIB!IKKL4LtLuKKBKMMN4NtNCMkMBMOOP5Pu G"!TPQQ!R5%RsP)Q"Qu)Su9S=SAT5E*U,"u0!5TuUUuYU]UaV1T%T?EUeW5uWuyW}W"h"5l!uqX5Y5YuYLU"ՏYZ[5[u?g+[\\]53f/]u^u^^)B2_6`6`v ` ~u4`a!b6%bv(6:<6b9c=cAd5bc3$Ee6UevYe]UH ?O B`Ϙd*DbQ8V-FcQv=HdR9$M'JeRd]/LfS9m7NgS}?PhT:%GRiTe6OTjU:VWVkU_cgZmVeo\nW;w^oW`pX<& a+{ =erY'ǩkd(ty>Wv{]wx|^?' u8?7~_ +DA~B- 5 Ð= E0XA"V$:'IƑmuǑ}%d%ɒl')ʒ+ ̕,˒/ 1̓,3 4͓l79Γ;\=ϓ? AД- T=EєmG%IҎJ5MӔ=O DMSMUUՕm]Wu-aY֕m[u]׋e^ aؖ-cdٖmgiڑ͗j͵mۖo ݯpܗ-sMuݎ[vyޗ{}ߗ X Nky؎%☮- !=EyMNU喆Ke普mAu}r 46nz|j߫h# ͱlյnݷۨց{o}phh%oY{v Ξ'р7OUՠ AT. CN)En9 GҘY-Ki5MS^xOEQU.MSNUUn]WeYUm[Y H=\p[{socO)eo9gIiU?O  `418LV-FcQv=HdR9$M'JeRd<S0 leӹ}?PhT:%GRiTe6OTjU:VWVkUv_XlV;%gZmVeoKpH-wL(`idN'X,f7drY'ry\g7tz]=]߰cxi.$iⷾ/g|~_?~_:Z휰( 8~fFxlX@'ǐpBC,pf`s` !Xt(Tf@9!rt1̓,3L5͓l7@(gX7hJ0gK SirbȚ %0)Y!yJɦ49YE(@f8@")ĩqp6T [M][W5k[i2 E$8^9@X4 Bt>!DX^1F~A!HY4Q)Jta1LYq9NAPZ5I4uPb-@1y1Y }6lI.lQ@EhX,@"h1f/B?*98r( E8n`_@H&#d2YH#9 < Ow7ozw|_~@0@4A)ʚ!ꂤ~&)`|+86*cV-3fD mfr  89d qԘ!+k F0hKѰx%3Cx?pGm5gh hJd\fLLW "9Ch2! 8P < q< H@v m!J~g st5q( 1\չı"Ϡ@ MHQ$=al:? \Ѷ$fv1~P3&,aI)   | )P_6 "` ct~ygmmvqn=ɹw3X6oY+W5e2-j"JE(D٦ڂ,`ZO@y#,N 8z,ZwGat+Yt 5 ?@B 8?oibB7ӭPE4HC0[:c -eA;7 ݳ1whFHCږTwp PGc DHaHA?1 }yٕ AD4Yo>"!{X]@K!,]@m 'z:GƅxnEKqѐо[ \@9C3|*J$!:Pn~~?7=C7Xw>F%׋QŸK H~AH?PY8#S71d t!@C P(r0hL@P![-8o6qejx+ڃ@(kRP,ATH$:هB8~3&(`n|(O40~?@1׆KۂHA (HaBo#< 躂2g ]($aaBz'BJE N(W`/SI0=3"3zAX 1 (> ,L( /*0CNՃLĐ3"80pcaكVhDƁF> @;K\wDŽxǔyǤzǴ{|}hxnmăPQ%y/2hxV H`DP| <X8e0j,`=A ;; DP5h);pPKr{BJ_'8z" A!|L ]t '<JI "0H-=I8'gəI и:(y`( &[zH L[ 35p4j@5JpSy0QskY~Te[\]^_NN.OeeZGmf>fvghijǶa#mk6Fpog&r6sFtVua )zjn|Fv{&66w!SV%ՌB]gfhV(mڎmࣨ=侮*^8in~i&1wԁXj1;H ;6eީ&< R S1~~붻ƼֽkEbn^Fh뎍l6v䎿lF{6FVfvmm&FVfp6ƽng?O BaPd6DbQ8V-FcQv=HdR9$M'JeRd]/LfS9m7NgS}?PhT:%GRiTe6OTjU:VWVjP8%j_XlV;%gZmVeo\nW;w^oWO+& bqXf7drY'ry\fstz]>Wv{]wr'z}^g|~_;~_ *d@Ll% ,A5 Ð= Eđ,LC1'ry\fstz]>Wv{]wr'z}^g|~_;~_ *d@Ll% ,A5 Ð= Eđ,LC1ELRY`gnu| &/8AKT]gqz !-8COZfr~ -;HUcq~ +:IXgw'7HYj{+=Oat 2FZn  % : O d y  ' = T j " 9 Q i  * C \ u & @ Z t .Id %A^z &Ca~1Om&Ed#Cc'Ij4Vx&IlAe@e Ek*Qw;c*R{Gp@j>i  A l !!H!u!!!"'"U"""# #8#f###$$M$|$$% %8%h%%%&'&W&&&''I'z''( (?(q(())8)k))**5*h**++6+i++,,9,n,,- -A-v--..L.../$/Z///050l0011J1112*2c223 3F3334+4e4455M555676r667$7`7788P8899B999:6:t::;-;k;;<' >`>>?!?a??@#@d@@A)AjAAB0BrBBC:C}CDDGDDEEUEEF"FgFFG5G{GHHKHHIIcIIJ7J}JK KSKKL*LrLMMJMMN%NnNOOIOOP'PqPQQPQQR1R|RSS_SSTBTTU(UuUVV\VVWDWWX/X}XYYiYZZVZZ[E[[\5\\]']x]^^l^__a_``W``aOaabIbbcCccd@dde=eef=ffg=ggh?hhiCiijHjjkOkklWlmm`mnnknooxop+ppq:qqrKrss]sttptu(uuv>vvwVwxxnxy*yyzFz{{c{|!||}A}~~b~#G k͂0WGrׇ;iΉ3dʋ0cʍ1fΏ6n֑?zM _ɖ4 uL$h՛BdҞ@iءG&vVǥ8nRĩ7u\ЭD-u`ֲK³8%yhYѹJº;.! 
nltk-3.7/nltk/test/images/twitter_app4.png
[binary PNG image data omitted]
ŭ&Z+/m.[C_pt^xi{6 T_‹Iz` ?ϲbHwn8S@p7|)OXn~fE5{6j;ܻN8i9nاΕ>M]^LO<\?>m/e̙ L̯o3ΐ3̾g/{ms%~:[g{$ng/L;oΖT0ȣ =ɾ>.gs֧ļuLq+5T}O?[=?9;)56rdz__SJ }.'͗MK|zF}xŞie?:#bÀW1k-O~bZ3]-xW+x4$^17aWkS9 DmDN)ԞmW>n Gcs־T;픓|{E}CyE業|ދA/zԗ^tzQ Iyz٨/W/>gŬa“yM&ygGb3g|_?5}_|'_~廗E]knZ\l6glr{5ȿr[laku/}>aiOȇٜsjk,љXd֍.qu5Xsm~qbYKݚ>[{>}uE ~&fǵ?s3}|)sWs:\hݧ;/{~M6s@#Lp9HuMNng;as}bNc1;ҹ6qtY}2xjjk^ٞRkNWD]jIǣN-x+DuG^&]S3?`}/}/ꇫsiƾ7xr;[6DynܑANO[gw*{h92r-]7Vs}Ιe\ڳ>NZgkWnOӆ?3ֵ_e=v[39X(?hrg|y6ݹ׭SVO^Ӧ)c?wgyǣzj\stx̓ʝ|l~]:22sg;3Eie,?׃C ;u{nO;gۡ}3G:6seQ]P[gY-YgkSuT>ޙc>UN;ִks`-[Ẉ^`sM\y0Cug}_swMA|;yo{t۟a/ⶮvg^jU?c˼OQ9Ss|2MەjSmPyvZyYY瞜9о|T嗶oԷzw[kCkʘr&_=L)tmgN"}]iO['oWk;_gx}R-SY=J g;oWni:k;}ʝ>]?ڞ1Lk,a_R-mxo_}[ӆݜZNC @ prgvM}Mx&5t,?oʝJ+oj{]L_;zԱ/?VmW:'w2l[3~̾5ٷY~|W~u.SZo9ԗiMk-?76mO̅s0}ntPZg-N{V9we|J[רC+c_fCueQ}yN;{m䫳oǾmykݏ?~s"_?unouo񬼵c97Vko5mLSƜvׯesޚ\m_ ׏iSc6*]˜6usr-?޾mq߭=Chڶ]SӎҞwcA;g꨼?{uZ\oEv].huoǎ+6UݗW,&_xUu{ߦ}iL_sU1N{XǬ/zkZ[Yj:fv97eQ=ScM;6momm[T~suYiچE+o_F2.r{:ޥ}MovsYiׯskgk.%ؽ [v)t>_0gJ?5] 0ϸρgO۾w;Ͽ:i?š6MSڎqk߳Eϗs6siT^OͳsJ9~cϰ'vOKks{ן'Q-ֹsv6ʨY{kXoڱ5G+2ςʪMsaS]rm:*:\Yj|1iU}ʘck|]v9k*oʜ۞zgl{4ڏl9tw_k=k+c}~?9N @ p;wIZA7Zor īYm\,?o\7071wz6O3oN7;tNiGov Y9w"Oݽ鼎s5󧜫׵XT{9~ΉTρ9c:sxϱS]h)쫾?0>Jy(e}/?V-k2n<[*Ce{[뜵ey-2:g;:}W:5w/z]ض<_:];;>ϵS1}:j{W9ou̵s<+5ut9fʹJZks멯ҹW[s]7L?n^VC}Z;okz_}Zcg*j=ܺʲ:7uν:KNώ:?ꙺoc\6}9o9dUcUޣm}8u 3\8oɔ=g9jywEu91cQݍS3ֵs-2esu)sz}n9k_ھemex=}W^3GC׭cMOQoڞϫyvEv1c-cmCc==uuM?zeuL}W:3om_,rqڶf<Ι}ckOkдaWϚo2?NJ['4zvΟkOM~_e:r]ۼ[tmٌu_o~y-i*}Sތz͚u7mxeo-:CmGZ]s*vul.sڇοh|.Ss @ @^Mh ojd=jE+le7;w򥳶h.]lO+o]I}nkOm|]9NjӚ6>xO:\8kkqE}^W{]_[o{v}{Ҏ{͟rOmcX:˶wΟe9ySoQ2I+,om:M6mR*V5{=<5Ȣm}n]7Ԧ]go(>]ΜV5_p*Ikך?vD;uD]^16pW~?J}Em3&=Ty9tsLz?VSNuvݚl9s\nS;eO:csޜS:9IRεcz~ws?*ߡorֲ&\|=PK:>6fֶm^{Ꞵy˴s&_:o>X]/㵶iʼLYwyδ}}] @ @v߮nP`m0XwC 6aCZڦhQ~G7[~ǿo_emϫwtڹ?r>u<ʯOѶ7珁4ivͱY}k~eosݱ}koߥϩ׽Zo)e>;ri{wǛcה]Yom1ܗQ}vlHiڞmlj3}>(c cuV?֥~˯ex聯 ϺǦ~Nо[ykr]uz޵quYkc~5ZUgܗ{j{~Zhz_ŧ2_rK|eWncoy[ oضKRgeR{s}s}O>ח,߾nuNkǮs-W?̾So,˰jyhޜZ.Ӟ's~~>?,q ͻ1˾ IOZ&V+y?Y]w,.u9qݟպ_ӡg:o+oݾkkTZNOcܗsQ?|}=Em}wρ竻4>73/o~󃷿}sMozӍϭmY}}]c}ٲy,>\tLVs?\ڏ5_[whܿ}yﯟzfo*]ݴqmǚ6( @ @e~I^<D7k7lo8e揁G0fKH}òW_Y6:SΫէuvϫuj!ʼCuqׯ}/n1e_&y~p.lxojuǻǾGm_W?׾^f^甿y!GuǦ67v].]cowv; ϻz6G~?{>kj }W^H4=_{_Hі~yֽվI뼚qZ{}=TZ}7gVҗngn{q_k]]֟mL~^!߿>jy^?_4s۾5?S_T{'1^)v];[fe:r9uJ;v)W~_.3/XꫯnC{s!O??-w蛞{͉}7=O8σv]W3T)|׍5RM:˛~9onءmڰѡu߆οv) @ @!ŇT{,MvuNjƚ{3|?T}]x<)g'y<}6~CiOxےay2pm:9J;*~\ν{cp ycP| NVbGSIas*}J;~_sscOC,N lNqu{Z?F(c?7Q븍6|,nd}vY Ϲ}*в|n7=7]ݍkddn3,cCZGv\?u2nK?NS; @ @nV@`*Ml@5 ]@ @e-3} AOPq-H-7[u @ @N}x|/׿ߝ~ߝ|A'N{-|km)m.gXjSs֙^  @ @ poX"aC]̛3!?ǐ 46C @8qQfK_=x練>]:V};0Eַ>|oĻy @ @Z]PK/0ߙ'tN+o*駟>_׿~wX|N3,HY>cӜ\|rI @ @81'6 o#SO='WL  @ @&0wG_<я~?|;?^iܷ^ @x~W_}V>`\e%xgߛ}@o< 4ZO3,f7, @ @ pm!pQe@S~ >"ұ @ @M`ſ/GG}LZ?{"޷k/ @@?\sAW+Oھ~قj;h{8O3,iK|Ѳ @ @ @lg8B?U#H @8Q{oo}0F>=Cw @<)~?ߗ 4,V??)3w*ai@Wnyv @ @X|nM`<_Khy߸}ⷽmC!@ @#yzA>@tAHCK+B @w~-g>/έ}kg ,ALW`pIZ}xv @ @}}Yz7h}skPq/2:R @xݹ kߝi?? fγS Km(a_:k߬  @ @ p|c? ?սg!@ @@G @ @ @ @ @ @ @  @ @ @ @ @ @ @'. H @ @ @ @ @ @ @C@`1A @ @ @ @ @ @ @i @ @ @ @ @ @ @c,>2 @ @ @ @ @ @ @ #@ @ @ @ @ @ @ p PT @ @ @ @ @ @ @X|y @ @ @ @ @ @ @!  @ @ @ @ @ @ @ pO|4 @ @ @ @ @ @ @1CQ @ @ @ @ @ @ @N\@` @ @ @ @ @ @ @8c(* @ @ @ @ @ @ @ ,>< @ @ @ @ @ @ @X| Ee @ @ @ @ @ @ @8q'>@G @ @ @ @ @ @ @  @ @ @ @ @ @ @'. H @ @ @ @ @ @ @C@`1A @ @ @ @ @ @ @i @ @ @ @ @ @ @c,>2 @ @ @ @ @ @ @ #@ @ @ @ @ @ @ p PT @ @ @ @ @ @ @X|y @ @ @ @ @ @ @!  @ @ @ @ @ @ @ pO|4 @ @ @ @ @ @ @1CQ @ @ @ @ @ @ @N\@` @ @ @ @ @ @ @8c(* @ @ @ @ @ @ @ ,>< @ @ @ @ @ @ @X| Ee @ @ @ @ @ @ @8q'>@G @ @ @ @ @ @ @  @ @ @ @ @ @ @'. H @ @ @ @ @ @ @C@`1A @ @ @ @ @ @ @i @ @ @ @ @ @ @c,>2 @ @ @ @ @ @ @ #@ @ @ @ @ @ @ p PT @ @ @ @ @ @ @X|y @ @ @ @ @ @ @!  
@ @ @ @ @ @ @ pO|4 @ @ @ @ @ @ @1CQ @ @ @ @ @ @ @N\@` @ @ @ @ @ @ @8c(* @ @ @ @ @ @ @ ,>< @ @ @ @ @ @ @X| Ee @ @ @ @ @ @ @8q'>@G @ @ @ @ @ @ @  @ @ @ @ @ @ @'. H @ @ @ @ @ @ @C@`1A @ @ @ @ @ @ @i @ @ @ @ @ @ @c,>2 @ @ @ @ @ @ @ #@ @ @ @ @ @ @ p PT @ @ @ @ @ @ @X|y @ @ @ @ @ @ @!  @ @ @ @ @ @ @ pO|4 @ @ @ @ @ @ @1CQ @ @ @ @ @ @ @N\@` @ @ @ @ @ @ @8c(* @ @ @ @ @ @ @ ,>< @ @ @ @ @ @ @X| Ee @ @?@IDAT @ @ @ @ @8q'>@G @ @ @ @ @ @ @  @ @ @ @ @ @ @'. H @ @ @ @ @ @ @C@`1A @ @ @ @ @ @ @i @ @ @ @ @ @ @c,>2 @ @ @ @ @ @ @ #@ @ @ @ @ @ @ p PT @ @ @ @ @ @ @X|y @ @ @ @ @ @ @!  @ @ @ @ @ @ @ pO|4 @ @ @ @ @ @ @1CQ @ @ @ @ @ @ @N\@` @ @ @ @ @ @ @8c(* @ @ @ @ @ @ @ ,>< @ @ @ @ @ @ @X| Ee @ @ @ @ @ @ @8q'>@G @ @ @ @ @ @ @  @ @ @ @ @ @ @'. H @ @ @ @ @ @ @C@`1A @ @ @ @ @ @ @i @ @ @ @ @ @ @c,>2 @ @ @ @ @ @ @ #@ @ @ @ @ @ @ p PT @ @ @ @ @ @ @X|y @ @ @ @ @ @ @!  @ @ @ @ @ @ @ pO|4 @ @ @ @ @ @ @1CQ @ @ @ @ @ @ @N\@` @ @ @ @ @ @ @8c(* @ @ @ @ @ @ @ ,>< @ @ @ @ @ @ @X| Ee @ @ @ @ @ @ @8q'>@G @ @ @ @ @ @ @  @ @ @ @ @ @ @'. H @ @ @ @ @ @ @COQ2\UGGZ- d @l8GW8D @ 4غJ{>) @ @ @<ѿžGcGc\<^$C@ oׇ:_-};)TzW5s9 @'$\w׈I xz}۶=??{KY/ @ @ @ pŗQrεbm׿} oxöNpɟwéw =~_n}Hv~*R:;h*  @!~l Z{w;uzĤ<.ܛF&}mL|5xq A @ @ @X|%\Kk>R[X>ч9oz  p/&W~S?3V3?W҇c>(|v @2iG;o~󛇿;:{a '@9Az&η% -`:D @ @ @zxos4\>ѱJ9[۝_e>6->[Z{υy&ϯ+o>l6eZEgytO'V5t~,{M}3~33f|纫]SͼJYssݮiwg-=Ӟu?>oiCc;@׵tVٞ:20VvzҦS9w9PZTڭ @b?Oovzuk˯?Uvu oxu׾1c,Tu,;?k^ty3s]G)5Oz4Gg-}Էu7vh_oy8]y|ˤ5Y:(ޥ7(0bfߤ3/f}XMi[e?]3{}W]tkU| @ @ @NWjԏUTj)@i/򖶯?D<1|\ 7i[wq<?\r'.D?3|wh2ꃶ=~<9󖷼e[e28l?h -?я>o_ T2\zkRsqט^_Q?,XG?і֧I/!|6͜O=z75ӟ>|=Ο)o}kNW^M=l{}^~Qn|g7c}3Ωt/>^94s9ַu[[ @ @ @ @ @ @'S@T9Gu>7>p} _؂:VpM>s`gyU~ӟ~@XpPǏ{JA'?ٞ1n iK{T77z\3Po}[^}-(/}K[ d~߳iOʵr ڸ6_׶g}v[7_*`כ_|?h}nK{)SԖTmG ?ym;O6֠k>-xAy{/~__dz|}2cav?}_mi}~(9S?ook˿Ͱ:ީo_㲝ϲ[e?.NA @ @ @ @ @tO f  +oZ 2* Z +ZRCVu}Zx|z8سgB߾Zaϗ5I~Cse ='.Z:w-=z\\j~z?^|Vwi/Z5qgӇks=& Ş5K[:st ܺJa /m=q?;;>!/y{-nk?Ksg7vwMke6GgUv,kiks鞩-^ձ_sm3`ۭ-2r^T}<h>ɪTj53R @ @ @ @ @ @Xx{WNM/8 7BNӤ5@ p)_9L`;|o#[&K I  @`'гg|[jϜ_ X|߾= Tbyc\]` 7Ơo9kE{5xzR8]yWh9Mm1FkԼNկ^ l̫^{|jK/6۷vן޵z~JMZQ;w,w-ϹZ/ +߂ ) _T --w][ZJOWM  T:0RL04 9P+c'`ӯ|+~mv~K_ڂ[ ,϶gTϛ:=:f@Ak g??"Qe5_פv^ד^j__li=6Os'=sF?|{ *__ەRS+[:A]y[?8 гӞ_PLPitZ]Ϛ 8O˷t%v5}F1GʮVF7-][S:S:ұc,S_gI^Z'?ܾlحӟpmX}8ñʨ߽Q[y|k-yKΣc˩=ͅw,,nތIƝx @ @ @ @ @SnW(UJk*-mS`KAaT76SJ'_@W j)ڵ@O Si'$гu[운gR{:ϱҎ['ug{(~U{ 1Zyk\jsm:|군+\?ؘvaV̦}طg_g~̟֘fͻ;՜jvkˤNu @ @ @ @ @ @w+ JnEkR<5m}󖷼e h)-x ɂ&إ zuNKU]7@d[j[A17휲۞#y۵ys6vv5MmW\Ov-mڱ1u]io-?L~?ӧUGXzhkή[묬”Y1 /niuSoi{]xtsgu6eTά5uv:ڞś*+-?KRG̪tt]936۽-mُ϶wLWcYmTг&wc (nOvVֱȦg.Θy3+o|My2m>\f vM[yMu6&vNsYUVkKR>ɦ1 =s_}vY-SLS_ƠמVǤ]?>O_J+.S|im|{tBL=ܾYY9R~}Vs7c:cq37+sgkk /vTF7S:Kyy|:  @ @ @ @ @ d,~2{Y\S0~_iy>w>SPԘ50T_}ނ&zo?  
)|uK<S͏/ r+ 6TZ >29廮*_m-5я~tKkWG][Kk{ׯQjciǴTw-W^yes;K4^Ny{Ζ52O~]_~mie}[4ie4>:_홴eQZ{:[gjWk:'9qi^vNˤsmmw]|c5XTg麂Ko}냏|#[ڜ2K-j͵?郗^ziwMug?w?[ݵenedV?oicW\̘o 9?ơ{~~(퓟΂ҙ+Ƨ&>4yY΋)3~z[7~=?[Wfѭ͇Xnt}(%iYc,*oҳ=om_,(_{66ϪuA3OE<5_ͳgUSuY=}\}=[ϫP{7@VU;pmc9uoyX9ԷiY/Lk}׬1)woZϺƸ9Գ|s7^Sy5IjN~kw~1?#TF׿kTfmnLʤT}ΎWO>SS_ r]rhZpVg홺'}9,U?^1m<ϼj.wNcWZjoib5[W__W|:hNYuGm@gpiÂ#|lOӝ[3nܝL]m_W߿??C5'˗mv;e^u>_7_Z3i>4FU>WZ_OtKskLKKKc5}۪'/i-0t(ӱ5w4_׶[&p澝;c^y}s3rjzfݗwoc1K{y3l~ӡ:<}>s_v~]Y~mE}vVYwkiR_Z[Jf^\!i:V3qGmvƾ{sq7=[ZQ`^ Y'_{ϠR}gOώ587OWkU;ޮkms|-Z3̖ևvޮ)Wsz+<{>Ol57/j @ @ @ @ @ @X|[;>3AXVPJX\ZSA2\f)PgUo/-[Uo˴}5hM X3z[a+|.z&g+?/=Y@Q`XYSjg'qJ~.߾|}Q:mXvM҂ϮTv7>k۟SNu鴳ve:3ќ+k:zjGkGƴkCL -mO_Z[uf}w*|m3WKWJg|<Ι~Q &,>׿|wnʨ}+~UGljK%ҹ;2jOaSkmm[zL[tqƵ p.U׌quϊꚟWGu6?˗֖[r-9ۼlL螭?-6.OmmkOm{kּˬ9=7Ƴ}Qys%4 @ @ @ @ @ @ \.wPӯ/PpL,kZRMA7kML`fDT]UysMm] ,,f =>MQun7][@NuMP`jӆ)ĉzjK8A?/k&*vر T&~W^mn^yS^ݵ ^aQcS\s^ڹky[Yo_?u;Ҭj{i+p}N+gٖo.VΔٱṢU~3A]ܮ_-*11QڱfUVs7seLrm3w b^-پm^M*m-v2l_[Ϣo,*K5fot2{lJƩ1/_[[*k\t}|O]gYu?kscmwu-.Ybϱֹ:xݵ_}mW].%6k3/sM;v\i,ә᷾s&˞33]͛ oOleOퟥ|sՖ-]7ƥy2wk3NS_i4+,J[K+㏺4F3vƢzfjzΠ1l+޷fXu6vݿ#5wUc\~\ͽqF_YO[~w׎7ݣy{iϿScZ/;gsͿ~˾U<6G>CU@sɬvZ;Z.N9_OھSfokugP`qm5y-0~t| pk /];| ׎y4WPW-yVg4 }l ,.R[Z;_fX;ۙn6Qݹu]޾|+VOǫ9ٻX˲'A=OsݞDQ$%0WL$8" ArPQB "(o,'vcܳݓ!gW}OsU֭,iݵk95%vZd }} ҟ ꃕu>6mq;,w9-?JJJJJJJJJJJJJJJJJJJJJJJ .D)D+&DCk|E)D3:&pFV p2'PgF&\>⦟u6~@D?~DA#GL~ϭ)P]DvY>XML~m2g֐uyF^Tn>R̓v>gl8IZ aa8} g`]my)W] 9̯1%%ڳ+ign-sᘹܓR֦ݦĆavփbVٵ,\+7vgY sI3gMƈ? s:k֜JӖDRnbG;'u/=ulތg|i?`LJy90+'g]z3s8%ƖL֖34%~Sb饴 sclm=ćZO[AS9nƈύ)qsb$D֜ѽUk^UܷS<6ΞQ\goA=ח1n־>gKJJJJJJJJJJJJJJJJJJJJJJ.?_~>=$ 4"D$b Q:L~vBM"&cߤ4_I(؄m$hbG@J}Y[繹9y`O"ZeGUbxZ;cY7g|v1 #u>W\x"0 $>1?#g.Gkgy_\#'|PffnO9|DȆk =Kƌ(漌_0E)p%O ⧝$Vcm6q06.>k"~ڋ^7q+MK<Ɩ+zmY5FWuJpvLɾďzwqr=qg=eqmߪ'΍q|s}쑮6^X}`rroId5V7Gl9m|Q'OJ|>CĞ]ZY[c׽Ĥ1q5Lg?F{141s(]'u3c%sY{M%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P' /OڪO"j"d! ," Q. l"*(_^%Ko^r-S覛nZp 06Ki"Ik'":#?? z06A=qꪫSʈxGtElF"~! JRc+9 'sZz#b"b1/:Zo}d뮻n*n8%<^S,ԭ:q5>~õ17RDiIS_W&Q~~d<[qqs1:~9{]gy洈W\N>ظk,['Js3z I]l1_x"Ŏl\na\;җ4bA+3}gZ5Z2>63ϳoc7x;<}]s5'`W~Om{7͓/uY[s+q9כ>9ә#".e\ء4=Uߑci+((((((((((((((((((((((؍1!@|Bt!! !DLFb؏X_D;FH#h'bb4Ÿ~C=4po|a|b z9!1ᛱ \+n)ň(ʼ6uG|e.v g7'.2u[{2\axI#'kJZ;[IY(]c%>mxg uXFXnyő`ʗjm,_b%nMT|mMu>#*7|Nv\O>LeJ3dꞳ%NX*]CLwvAi}ت[??nų55. 
~Ϙf8a1=ΎdsWk}ǦNH7eSb!"cuMX?BH"Iu<#4AYsx˙/qiĥg|HI}m{p|qQOأ?bïٯ_1u}Kk3rӞ0'_8R` kaxfW~os=e֞R7Q{9A?_/Yl3>ۍ33/nckos%ne|M曏ƒ1bbR$# .Lů݈v߸8o>67_ğAzf>}1_Z}\RhO@k\'֌?Fe;vOĂ=9_aM̕,٦M*;8J1b|2֓^]_bY]Ŕ#^_¸XW+r=kg.̼4zi<\Ľ_>d⏭J%ﹸkbi =K;퍭q\O2|< IvO޴v}JJJJJJJJJJJJJJJJJJJJJJJ6G{=8eDI022aL2װ:f8yṋ-eOR\kuӆ`)"%<""x"|"`cwLImWFn\k?zbc8|!\i>'MDk|>*MݳIKizگ|ƞħyv/f5%& ]Og09+=} [b+_Iz f?}.}=óB8U/VYk֔X;)7f\,ci_/cK={M|J}$"]>ψ`-3J=JVwM;~74߷ٯr6&kLǒ6eeYJv_:@IDATS3^֐+wެq^7gsޮ(((((((((((((((((((((((ˇ@ŗ//JQTd{D'0DL捐F=B g?;=_~) ) sDTrl6b+"c66{é7/@˛0vgyfPysbl-NV.ό&6ڌ+Wc,%m2yC8sX' ݼ-ۋ%יS"۶]K5}>ݔ,qꉁM}.=^6^TDzj 7t7ݲ;l/~H{ύ9N?b'Գ_ad~3Dl9_vi6Y21 N=1u:ays>|4_Zk K3 [m9 I8f+r%L]f7xگY1-nւҚٝ vr8c뮻nEf[/K5qʈմy'gO=udkMGX)naĆuUWM1N8[5Ž|dӽ;e~G]=#SmZ5zuۺ0aB\cg\s=713Y7όrsg5yDJka#֏ձ1؈Һo5\3ʼn8L*[Vgrnu8Y/`ܬOlOΗħ{p09q_|X+㥄loc߱E\O]'OJ0YQMlo}{Og&=U|LcMqw1?a~zWǙi)Dce~}~>vߜ~ч:v4| 36>~au'e`-O]\)&:9h~S @ &໤ϿM/'YuXϳ3ɟ1Ѧ2jMekG0t6=e|QRϣzVFeL`E㨌 ,h22Z&ܢqTF[4._FV6!fOkN]ȥqJJJJJJ.7 m]ρ DEQMxQW;e~y:6.'Ki.ic>@[m?rx-.Sol%} #\dguDM1L2~vO /W~kLb1x{>Fy|fbͫGD2k55lciKs@Z7!_צǏ&qE [Œ|lZJkOfcGֆ%s kc3)qo3f֡.ɿL>&Y6_btӼ֡oϔڒu08ͭ>?u3L֬}* 9{.@d5\\;[{br9~Sx6ciϯ>׏o`]}R?g%m}?ks[>Icuh㾳,xrޒKa=u((#Igg7==\n~8k._?ţc4=eet4oGo=#VFe|/;~tN=+2Z&ܢqTF[4hrQ-Xn8*e-G(4{hFh7F85@ @ @ @ @ @ q"J;N4֍OP#FIP\\a.;;2!QLb{ʈi\ci7?IZ/eٱ>ͱ3"T'Wa<%{ἴ8\g-vjG/12RWF쥜l߉ 1@z>tk/q~g_<' "LIlG' 8&+gαMF/m#kOFaO6s˵g){xs6˹dܜNɞ~sO_3VǷʜe)@pj46_۲y&~ډ{~3QIle3ٷYO2{VZ~-u1x!%F9m3~>Ⱦ/(;y.3`>l}Lݽ)m]4H\̽yKJX}_g>z駟^}?۟vu=?= ͦ).ɨ~;~2*#۶lȜ_FgXl62gje̙etŶZm#s~aVFȜ_FgXl62gj2?fO-H((((((8N*,>N>FaTn0XD>_}+_Y=ӛW_JJ3X.Mrl> ̅9Gt-K96FEvWLopW-ܲkGy׿~AawDt~_]Kgyfʩפֿ|7MN`=fgic-w bCDhm)2l-]wo3ۋ)|}ΐ^xay+j|i da=x.>+}ٻlg3qf㻧]|VX&BG#H/fkGi.sﰓy vo7%\{KY;a4F1p_sz'{ 7ܰs/_y#c_HmOsPwY>0\z泗_~yH.ߖo=g7|n*HMm4g]=+X_WNL$.qtpH}%P%PGϝ>?7-#ٮ|O5ϛ/p${WFg\ge[gV+38Gm<*3;~o.2ڻ\d7Gg"goqǦ2De2cUm^验6Q{tUFwFz]{4{h"e#JJJJJJJ8خ$6}ЄN;EfHNhSOMg7Mdg":_D6=ԧ>zWO<4? qQEl `;X@ɸl_җ&1k<aJ.@=ؔ#z;1 0}e=sC&cAH%ɓSV7MPIDJDv]w7OXpK]興/O ձ'<6./a&?M qed kOpe+kby3Fk[tMu7OY]+S__aCdSROz?SN%yU1$H_tMő?5㯉 VwOb@6a]gq=_o{V[՝w9FkڰzQ|g\v}\n7;ęekq0h?xG6Ή'O[__dwkq%hʬ!q-&8ǬC,(+as=ƶ̙+ =t.e=.;^mڱM?Wk|xZO9{x3Ϻ;wMg>m6C]7يu\ߙ-罬X#?ud.{}z"8?WĠ5;oOclr֣[Y[b/~|n^|m-Oڹ6n3~̚5\ʞ&kξe0pGydgqQ2JJg?gO__gkþl͞2Bמш6o~+2:E`~cUm^验6Q{tUFWF{yl*MT+<6]&*{^hh/MWed~d4{To5eSJJJJJJ*kD*&ɮ)B^PͽJ2~o,&1=DH/̔cWHiyeMK%BB-!ֈ" C,C+̼8o.vH80|5^Nm8ʽS ^m+ UdqNnKUssowZ AsA엸sb߽uO,,|vv.ee]Wk'e>MDr9 K}.c lKSYuGpcy~pؒM0.I~3'.o-`T 86?;>Ş`c o/uA,<%x>lIݯLd7kr֘G{[\sJמu={ViZxVmc+ۿƜDŽ!A} @ _KM_f=4~j?-gύah6fOVKq]FGQv43roeTFη~g emg'e2&O($eMQHl/h;<)^v6yRF!,l\?ڟGnvcSS @ @ @ @ @ @ /#2f.Q +bBB5#!!*"™ ԥL>W2^QO+*#ԊIIaSIDQrDJs|;A6wsNf2}2_ ,L̑ul) y;M[4cE쯔ϰk$LTZ7ٳr$Ż3']g<'uo+٥Vp&,z kNsc.ke2sϟEDY?5kls-ϛ]>36ķslȵO+/0zг<ߏ|;${7=Wٯ)ׅo/;cܷG6۔k6WeDN6øI=|[v-KJ.O??q>/?G[nNW&~l͞2Z2:귣鷞~+2rIv~?h?:Q ϐhY;=(fh2*#1 wGSFGsoGo~T%P%P%P%P%P%Pǁj@k\$@.FIB͏y#˺gqi\8QM]'l"W?}(~cJT=q*o&E!U1'Np3|1 b숓ԥ9?$7>G-f|"~]'gBGoRoϼ=yK/%?X9{%|n}{3"ona7 Ll;4-?0}~G]s-狹Ms$@?N{v~LP:٪A.d ĵ39^W]&wpN̺[7{K.Y{}s˘rέ<zeM-O]:'1^b^#]l{,=x؛|^s?9;S=7J>sf-ʬ-@;{Wo󽚺5<ї۔م}t-{>9rmoW%P%0>=?GFC/>Ol|aCy9ah6fO}ۆH>h~͞hDVFed 2ڟeTFSDFFi4{hZ8"Qt954{ʨg8o(((((( #@CBtBBEJ|C>QSO=zW& ι$AB" s"Yd<}}^}&IEL4+ϧ6|g'8P EH/wqi!>c/^z饩N?T㦍" VRI(9>e}8`Wqaf}E> NDlK_p|駧ҵ1cK~i3^;={=s>]9[x9rk1W>ڸOݗg{ォ[ne/H)ُ=h?:O{1˜JvƺG;c8W~wvb8Q$wxxx{ws;q/´Ouku.Epd xW_곟qvޙMl[Vo:~'duǡvϡG}t#LeکO<I/9;ikYĦbS矟Jk`8dC,y"śX7Y}{>1'vs4gc,:g3((KWԧ_뻾ku 7?:'XYl1=ͦ)-v{4fψq4MpYF*5 .h[edemvրl, Pn eڭ2Z᲌6@YUFk@6\(kh Ȇ2eí8fd4=eacV @ @ @ @ @ @ C{_w t :!!4!K&h׆0h-"W(eb9/$ǶbC`R? 
yIxhngI u.X'-o{J#'馛&aUId$lͽ)2MZ漳G:{L3?%8͙Knٷ+JE&Le<ϟ]:{"S'a/xD,l/aq7t%{3~UzfGy)o~ߚ'K֒5OsCq_;[K٪>J?a {V>t83ynMl'B1|ozc1.2zv مbzǝA{PiŻ6{5Y'QY2|_e6G>b<_|Yy2WI@}8;E)̦ؒƽ((ˋ6.w~oh~ߟ#~h~k>= ͦ)Ű1F~[oeTF|rF{hgh-4=etLݯ6Fq_y6Fqt&Vi=g1F~;h3GdtƺJJJJJJJ$PC] ɦaND_)#X9CW?$g^b[| M%\2s>3""7وYUDPJz}y.J8ibn9yl| ?*d=mc|wZtS}؝BԭK53zM9ͭPn}c^zlO9]zyE[D/%r)ۥ̕3n^.7]Xb?9Y>J=^8ٚri|lc7ܞ[l~)c1[g}iX$=.cgMmzJJOヘ ?_r˩9x;\fGi4{hn(Y~w609=etX9FqAi5B|6Bqr?='OGhG#TeGm/h;<QlkY%P%P%P%P%P%P# /G^5 {y%!7y :"mbgMo!׿zW127+x"ș"rPG?ɸg\NwSb'{3oCTN?M=ɚ-`֟J"J+Û݉#%地9-ü|O3q\_^؝Bխ_6{cf;{rD Cd{9;.${&s>Olzk/g5Ǵ0P^+'/K3l~.yt/+٤491K]l _dωĺxw_?ė[Ė1?>Ξ5>Ϝb}{ONi xżbXkx杯ڍ5M1]ꓽd>b^nJJ6}CgU9?}|Yw1=ͦ)ݢ{4fψq4M~+2BGAbﵧ8vFfJ4=etj/-o8ZSGh4GQrvcrmmU%P%P%P%P%P%PGף fL*l=E&-_q楗^DI--$f^"SO0_җVO>[}k_"zް馛Vx;V_*t9;'6DIƜԳN_>zGx'O\}ݫkvZ~mݏ=ؔ neAcz SYqfn~?zWkԭPz͝ވ!O=駟|u\s8CO<40+Y]h4'_~IV|ilЎ/[-2˘Yc]a^V_JÞ|3SZoyW&Ǵ.@ )ّRoMbH'1_+k~&.7܇bS[{Rc?dyfqOX6{Ň39|'J>ƞa>㞽"eߪC[%gx+QvxnIJzbXR(;s۾6$x뮻;7ok\kf5F9w{[U @ @ @ @ @ @ =}c%KIHHC\G4FB/~q>88NRFG?3$$.5& !"(#qH3"!В]%N$bt##<: =g"UR%`#aKXJmYCRP,`K %<&3;.nd'{%6=[O9G-Q%[?ۉ#˞2Uq&A_8ˏ>$\Xq{:/]XB٣xYgϙC$cb|%ۇ'~JX2}0S~s7FΪM;?GDq21b-,hn0;$C" p 8YCq~8Ɣu~}grs]n_Y+O8qExY'7UސsO\ڏ|Psuog9}\b$uIYuY[g=+_vX@c ۿ>sana}W)ۛJ϶>~KJ&?;h}Ozϳok?c?6}Zah6fOm4{Fmߖ[98Z&XFeԽv*磽{Fqt4~;~_ۈn((((((8z*,>z>GGxE#3D+j2Lk#|$t m"}iw\DBC#D~1n!=) tvf c{dsXl^)ֶ?"p<;qMg9Rm^OSWI˭L\N4-|ĢMNv?]ߙR5ϑ_wz]'&c̅x?3p}WJI{*)o-MΠfck}G̙~as!'|O*Ż=({r9yb*{6@\89&i9޹l~cW|x_?!c-1dminoXhkTnKJ.o G?:}Yc=W_[?κ.7FFi4{hHo3bhSʨ ,h22Z&ܢqTF[4hrQ-Xn8*e-GˌhQh7mU%P%P%P%P%P%PǓ{>}%"@qaDHz{  w r/oSz3CxCu,ܼ12`5?ڛvEٮM$ cm[12b*׷rt8R"ŀR2g>3m٥[ sU%;bRo뮻KGP07&Y?b&Ǘb1?؍mo*] 1guy3 l .v[/1Gͷd,~=e-gdv"ql=běJS?~[Jc_G|8gU֒yu\mb[&ic oc Ϛ~o]H|0o}z/1zx?lrV|sbG E(Qn:+1@:]VϾ#ჟo砟vmĬ9:x[.o>{6˸MҞ298;ΉM>'h#o'~r(7%ke/2[3|l5;K6{%P%Pog/~g-.w?WNߛ>'ϧ>=2MSFEh~͞hDVFeL`E㨌 ,h22Z&ܢqTF[4hrQ-Xn8ZfhFGh((((((8~N)?ߺs$@C#DBsĉ:b1"B"9kH`'"K=1v$ TDCT6m@ejigG8M7J=lPCD5LׄvYNXaޯʯò[zPS=BK%!R?bFOzkcCp֖+>/ЋSf/`nlJ|pǵD(KĘ7pT^ϭ8Sckw!Sxc3/煌u})簳ZEduf.ϓ{òl& f/d/_اJK=<{J>ugGƜAr68 ģeygd<ٵ!SgJ{댻߾5dU{[9^%P%px߿'?9}^Ї>4}~+1KʇDjbW̚Nյab%>oqg֍k9S1#[Wż=x$HYA%7Ig ?=S}O f_1^2f1{øƹ){mlz9{k5;s~{_"0}'SLŇb3No|>K,ۿ_>ĉW~Ʋosڷj[g}uN,_;S82͜bcNcF~~wOs_R>ŭ13}2&֦((C;//hw|Y?ohXh6fOm4{Fmߖ[22Z&ܢqTF[4hrQ-Xn8*e-GeL`Ehq͞2jFJJJJJJJx_pXt; !!!x!!"H!Q[huoFIo?e&ӗmcBhƖy~8˼֙1A8JPXIOBkO *?ܘ8:֎5_ z鹹_'$8 scb,||>VJf8nۼX|b~5*$J;ߵk\ųNl8sfGb<1t'W7칹|!~gb`G֢l 뾜}b!1ߘkS%^Ύdb|JDX?K2q|JLؕ8s%il~پgsP,u?\~k9K%{׹*xbҞ~s&fl1(7{K/k$7re)(|w~껿wZ_F?;=Fc4=e[1F~[oeTF[4hrQ-Xn8*e-GeL`E㨌 ,h2eFZi4{ʨq*((((((C R"YЛcovIoUvy;Ͻ9P ~d""oAsQ~8 3ol3צV)/Yk9>O7Lc]gY?KxsaW :"[~<_g^6b>sk=і~nAdž\_eJc[\~f_`͛3ld5!q lj6lڅ'%61ev7O|q+f؍x/=9e'|˯d?cg~MYe(8G3ⴾ7~ǿ9x>uE𪔌6'υAu𕹭/6>isx;~JlsGl:e^c9s$Zsc4"eiwX ;g~cspmŵb%{*¦5v+l_=8o̵mbK X_B,M4mLcYVnKJ._υٱ|ffhnQ?Fg8ѦmyQ-Xn8*e-GeL`E㨌 ,h22Z&ܢqTF[4i1)nڪJJJJJJ_v{m.T"#V!:qo-ܲ;V'O\yӽnmMozꮻw߽7\DH$EDC 4?vK}dczŶyf;ثE7¸D?lDGs=7ZaSDMKB!k&%z 7=on#vBƛoyZzϘ`񋒯"9SWZatM"&>~#x6b/%{B+"*e[M^JƲ~d?;7 s?zqe1Ku]?L+ijx-qv֒aĭl>\۹K<94?_G S|~5-͛v,[Su〹782%cW9xlO6v?ybػi]bV"v|C icFf<{#"1-Oo92{Vڲg kou^>6gSdX#% |i\oZ&.fV|oL1jok1((O +u1_ a|0=6MSF6=#ш6ohrQ-Xn8*e-GeL`E㨌 ,h22Z&ܢqH8fO5v#V%P%P%P%P%P%P%p<c]"@CCB #{$ap[TML(_"-7F$ɵD\im!`&%a [r>g }Yr|D;5moDHX9\\Sl`;6[S75vK>l!.Rz./3b+k2)L{DNRN~ƚ#):y= _.k:aie(@IDATɼ9O1$H҆-?gYDQu*1zaWtϺ,3#dNunٚ}bWpg2扏8uر+oŎϔb_)3۵1̛v[~Jfko>kp269C2o|\OVwﰒSyܹ&ۉ ܧaٺ8b}f;ثvYKg-oyTϜ4ydS2|+γqi_e Y\x|P[f7b&sqoW_}u3Ϭ^~)n#؉sm*(|z衇6.n}lcC9=7MSFm6=#ш6ohrQ-Xn8*e-GeL`E㨌 ,h22Z&ܢqH8fO5v#V%P%P%P%P%P%P%p8<%Ϫ+Bn_d"sю:a !.>8D2sFvŘp?i.ʽ Y27Q$,ۓ%r/YwSto[~sm\9+u6r?Os3_o+upeNiWÑR&Dْ{Q|}y/ʞcϙ3.7>#~e6k$n]+s+xsB𩌟=gIڟYq;b{<z7vc\DIfna썔pa(DTlM[}k6s/}}JJ'_/V߹x[ƒ챤l͞2-GhG#T-2*e-GeL`E㨌 ,h22Z&ܢqTF[4hr2#-F4=e8ڍ@[@ @ @ @ @ @ OF/v!z&I2SgN%P 
L&ƕ#W'N^vm}~FFi4{h]0Fg8ѦmyQ-Xn8*e-GeL`E㨌 ,h22Z&ܢqTF[4i1)nڪJJJJJJ._>+&Yȼ0o$,&/|gS{.) 4z>0Xί/cߤTocm O/O$R̼yc^.Q6 TosOx֒@ޘ9kA8F}~}饗kGgwyc9 9%l~ߴѺo,>۶J.O'W˿__X=ηhXh6fOm4{Fmߖ[22Z&ܢqTF[4hrQ-Xn8*e-GeL`Ehq͞2jFJJJJJJJxxۿ'  A UW]WW^y劸EB[/& Î mcGXD:MxOxͼ5s>6|2ᇧ%y\bߒ⯈ _x3<g?>zG'qq믿~u7id~8k߾W'O>՗C=W͕6"v, Xۧگ>O{^;ݙ)'΄{n*syctM[ouu7Ogi+(|%??>яNbl8=ͦ)݂|4fψq4M~+2Z&ܢqTF[4hrQ-Xn8*e-GeL`E㨌 ,h-3b4NSFU @ @ @ @ @ @  g]E @2BDLG 4/ buX'G|m&5prFk M`?[ʃOX,ox[~ 8o,6| (u/f=g7yiՋik:?\g<!{vb"c"`e)!K#2`n{6_S @ $??Ws;;NןzտaWFFi4{h]0Fg8ѦmyQ-Xn8*e-GeL`E㨌 ,h22Z&ܢqTF[4i1)nڪJJJJJJ /J#™D:7]<s?l_[k7gnV?gss4{e4Fv6=#ш6ohrQ-Xn8*e-GeL`E㨌 ,h22Z&ܢqH8fO5v#V%P%P%P%P%P%P%pTX|Ֆ@ @ qzé7SހJu]V^{fDDD+W]uՔ_Op8(RYQؚ?,/۷_s5Ӟ=ޮ^!*W_}$K]JJ7= =Y}~+XzhG;hXh6fO!Fh X^QhrQ-Xn8*e-GeL`E㨌 ,h22Z&ܢqH8fO5v#V%P%P%P%P%P%P%p }玮JJ'@8H/}W+2N]wݵ:y;Xz뭓гRXMĔD^}mEliްz-Lo4&xW %8}W bBE_Vw:u2{V]GL}{uoxtm_%P%Lcgҏﹷ_quĉM//| m9=lͦ)M|6=#ш6og;eN2:2Z'rudN9f~։}]Fg3YSFDξ.)u"g_L:l&wh8fj4=eyon @ @ @ @ @ @ 7U7w%P%P%P%P%P%P%Pǂg?zՍ7޸~{_Z}h?l|>9=lͦ).ɨ~;~2*~gsoO^MQHl/h;<)^v6yRF!,lBb{YFIɓbd?=ew#e#JJJJJJJr'G\oDjS @ @ @ @ @ @ @ N[''WwY_V/>׿=gemi4{9MSFq]FGQv43roeTFη~gCaTFhN8|X]uUSoVO?{ah6fOVKq]FGQv43roeTF7NqgGԳ2*#gȽq0gh5MSFʨĀt>EmfOͽVMu/mDF[S @ @ @ @ @ @ /w%P%P%P%P%P%P%Pdž>?'Nu{V76G>F4=et*2:귣鷞~+2:E~g\/\whNcs6s-92e~46h32\/\whNcs6s=WFSF"}S#g @ @ @ @ @ @ {e{ͮF6@ @ @ @ @ @ @ ɟ\}c[Շ>s~[qV>ꩧʭ?Vwu{ðl͞2:SeS\dTM\[^mh/MWe{eǦ2De2cUm^验6Q{tUFWF{yl:(#cg)ֿ>\^*((((((˟@X|+,((((((8&~~nOY}۷-*Π)}c[+fFi4{tuFsrour3NWVFќ)v:$Vh+42ڊ2:bk9NZ)hN?((Vh+Y뷏% aS߸JrdA ̞ 7`Θ   7 j*uU*}uYۜG>moӫo~ߠܺݲk @-˞? @x g7ی/ @ @< ŏBMg ;1}?c^rc2ob^9-'r'wn;IS3_[^wu}kY9l1lwzsc#}S?SwgEߣޘr\][Ƕwk5% @ @ @ @ @/ŏ|єpN+9ώdKl/FXbTe xEK޹vM>Vnek{{1w^mśu95&ٽZ6+::K^o횇mu @ @ @ @ @ py/oI Kbx,fr}% A $U~<ޱ)gJԫ޶wMkJӻ|G;|ɣ=?;Qɺ%.~g~ثns_ G\{W<\|׻u,|z~owK&,qwӟQg+o];AW5~/(7?obޖՒ;z/}(ꩧ~^?KЭZmK9w'ߥڗl߱*{ezs1?i;?wk53سg5ߩm-[kgy([~g箭Z> G @ @ @ @ @ @G& ?/)D%2G?O?]UU(Ą%)%ü};%=9*,DZ2 /'a O >U{ K:*+HL,QD5{vW^@ϒ;J+9s[FdND~60lMo?LL?j룟[%՗z/ٶg${.Pn?s5% 72{'{qJ,n9o֝$~ __XJl;1ǭo}^2y,ͷ1T\߇J2'>q\S=eIk{|Z=kszM[y t_{Kw{2y돏 @ @ @ @ @ @) 9騗PKK%THt(aIFK)9%˔ذ%AtJlYMx=PUkz'wI2wGJ"m9?~[3[WZr~4Tȶ@IDAT7Jv?o^MiZ/hg*_km{j%nJb(yzI=Z8N^\V)z훂dܯd\G>r~oj>Ohʶae[>cZMѰ?.Fuޚh쵆*^n toD밿u{c3p1%@ @ @ @ @ @c( 1sH%$ʒJ|,Ė~h]_bLe%-sN,.aI k~J鞒&)m7 p{+{?~雉{ٟ= k۷U_.T]{_侵|<Z9֜.Vku}[ʭg~/bWa~vM?{{&d]{:Z۷y>Ƹ1WXLO%wMo}4۲eGK\gXq= *Fe[w^YAIw׵;Z~V,wrZj-vz:mw~v" @ @ @ @ @ @$_(䝾p =}~gKz}wH\(AD%3T_2elS%EW?'r k@IP=K"з,Qj__>K~*9oO_6K"_z;sy{wKXm@k乯Wd1YUٶW%޽^[qߞ۷ 6{uco_x[}ɷ۷v$jɭ瞱wW'rv<ֹQlU;%6[ma+]}Kr57Ջ'?~b/sch,Ytsҗtg wnf,={kX`Q?|➍m׽[:Z>zz @ @ @ @ @ @U H,~U|oK)^8yJ)sP 1%@}<ʒcJj,JZk_ڑK|'$mW̒o|3c/%%54߻;ΑX\ٱ?GbJTĩ/%N?<JʒJ=U}=^9>JJmK\jɘgLqyak_>s?Z_Q%)T3J(,zcgjg9x-|3oɿ8ζ=sm!zɹ{t]n5c׹GG9\I3=<}wXeshVQs_=sGwj~#ѳ[e/o ^5ڪY_~>;jspJ @ @ @ @ @\˞7{}b/%М6ukdTǮ{ﵶI?ZO۶k @ @ @ @ @ p/{ސKXYrNI K)i$ JH(DʾD%/ai :^L7n+%joqżJhu۽F_cq.ѩ՗tnʵ]Y2ƽƶ8em,Wf^}׬~x񏍩qCey׶~^;Y[K\ͭݷksqύ{ۺ|}pXwm}>Q;kk}Vvyy{qs󮾭XnϬn}_{[3W+ۮک\7sՋ~{yn;_?SwKkk][ջ~sWc۵űq7K\絿~vڬ箩_{>J$[ڵ_ɟŶm1V_^V|_,)XZ[r~l\WeXmžg_ҹֽ۱W[}kpVmY_[c+[ w}߆8[e^kYVsX{JLg]\{k;޾mIsV}[ojmsx幭kVv^nkoV_Tָω߻gˍcy>^?sm,W(z %瞗Η6gr~i,9rT @ @ @ @ @.O@bq %,d^x_򑌵䍾wUe%tN2YrCJ躧zH)"oe $DtMIujk]ӷ^׷++Kj\Vu2{H2\Jc[BT|@Qݸګ}QnUo[1~[:CK[29n~iJVZm{ӜUg%T.>Sq>OGcoe}4YY|9TGPSrMsI6is׮Ϻ51ؚ//}~?ݛk}V=m?яIFtXgGtxWo̧~k8*S+|;dz^6j~闎>ZW=Yw*3j~Z73G ?[=smٵ&ƳYSum3%T|xk{U^>]y}k^bڭ-}Q=\nV6o*s>_q.[R\^-blG8;^][̋~{G?k^lm8JY\m\6ux6׾}{w1/>y47՛HV_}gY}8w~ǘz?Xim|V^o;Wm{[ngz{ToVN5_׏qVo<w(/bl.ػx{1];%ƛUmTϪa[7rW<>n=lMGӚӞmٳyK}ngxک,VoиoE7iݴWbڻ;:߳++wٻf{ѼWoϱ羲K?*kkw{3{~]vj|ɣ̦뚳~6vGS|}:ל^ @ @ @ @ @n7'.%J,($0J,.D>e%%.{/}K6?:}?jXK.5PSK 0!G 5%sǪoN[W*xK:scnW/9,ۯ|+G~r͡yS,_ߚ$ƕKT_KTYìhnE[׺ؼ7/_۽뷲q4ƶʴ}멶#ڮB\ :־sݷ|y>*7BsUgsQsz+j3Z %fv]k߮W~{+9YLw\}6O 
mks,޻zJ,zk6Wolnٳ{n_ܞo[P۶D[;{7gXqq4OuOZKͨv{ō}*)zcn^={7,z&[gO ;m*3߭@~y˴o߻}n}gkzs{8سj.gDq͸z&9eq浭tFZ+WY{mןh 3u5z13zO|G~\ZK1֎=P_U{&e\{69{{JmWmoe}\6]k5zvkraG6.c1|/XGȲ3krm1~[c͞~oy駏6{w~jվ ω @ @ @ @ @ H,~/P"B&%aPBBI%,92KRxm'][.v8귭}n_焙[%[{H%NMrշu}[qS?/⩝u~UDGJڨ⬞c%Wח$[BJ㨯%4%ꍳ{jzeXq7?wMNʮϺrq6֮{7l>6Zsk68_՗S 3m-oeWkl_K˶%m~^e5W/ƜG>ڭ^Zk|mn6{kqGl}5J>ۋm͌{6ڷ6w^etm[QTW6mgŹg9ˡze[}ume_ݷ5s sXk.K},/svvkZk:յ=_3wݮob2g?6b[k39y-~&qϯvk6^ {8gc>ƻwbDz۸*s{kA}>n^}kavP[GKCU޲/̣~rkV%f}wrʞ =3ǺWϣ>wϩn-3㕽ߪT_[ӹv<:Vs{sƖ z&o1eQY^:i/=yO~=o^5{xYkY;ɞ5;WYcoejF @ @ @ @ @'p7b$P@|J*(1%yj?S罾5<%ZtJ8'EllW_kk%kh]ӽrn/cN1N*䎎uL;vj|-]Skw7cώ/؎oؽ}]_ܽz6߹l:ͧm6ko9YfbE1lk5[cS|7bƱgkbX纾z~vn뻶vMqVW__#]ڻc[Ų*g󌋫~,-}zX6:W;]zmtv~4wnζ z9F1mYԱuc1mN7sL:6fb*|vgt_t~nw}9̳2}R\;^mm5d/][Y{h=wWu[W [k[Ջu{1,Flb\{q^5Wq̥2h۸um}Tvbb5^c]_Y_ec#@ @ @ @ @ @X|sD^@IK(١ĉ ;_RB JXXBM]* ou^Q%:}^߶7UQ/%JVlm}\Eҷo.QFǒLJ.XmmwX\-s9+z|{;_}i|6kWk61W.kv8mhb퍧Ջ6h{}7Yfmt1ku׾gou-~>l{voVu~XWDz+⮾0]6ʦ z67'94ꙮ|7GFm]+oWN[WekQŻqk\7?Qۻ$/\ek}i0=?ڜwY6Gm<>g狡y)>ƞito}Vwʮ+kr\̵ѳ;kxzc+^\T\״qtQlI5=wwc^liwm?{\17z׻=%z1n?O܉t@Ѹ(o ƾ5۱սu>wc)E4?%V%[|賱5o/}阳<^\[ջYh;=mgx{?_Q/nl~}Ge[f[ws^Rݗޚg?{zm$Nrh,[V9)/P[7zuOkkf<gaǻţk//emd}T/U{/v}NJzdW>Om1dWk_#>7jſy)X:ߜql.vroM4~Uf|qޘ g_[Ի~?Þbz[S@^۬g5PkEܽ ZO=Ա=՛bs|O캭F @ @ @ @ @ @@_ *J)dKfXEI K)Ѡ$Wu%bk_1,RNIZ}.YzI#%M,VѶWxbz駏vm'*c%딠QBF!%|TQ6K6YYq^+%.)ꫭd\WǴ oŲ1tw~wM{f]zǻaXWꋣ~Rͯٗę՜UoXk[y~:%t~7Teuk+[gkQ%*UoVI?%8uS/&u ڭ|JN̦ύi{aI`.!,[߹TεtKw}umӱb=o{kuQ %SMm/%fS󾭴zyXWvMNj]2[9?xuyu}&o]gc{OVoK3ˠ>jqtc(꧘zȠgD~]+pX=;}kszw*ظW̽뫘[Kmeȷ:(ȩugsSb~Xkl[WyWoŔosԜ_{vZs?ZQoZYlkα5Rϛߞb󬫷6ګbߎEo{t9Xu~tn{n/YvF @ @ @ @ @% JܱSA%'бr~ضPJ%&JXRH m*K(wmn5%[muڿ~fiI mW?K"xNJ1vcmi-JsxJ8Vݹ[tiޱn׷k}맲9m~(T} kjcKҩwnW\f^ֽS{Oc;׵o O߸[[۪oܮڨoųuc}.WG˹;8qeVVq/xtb]7wO}غ|m7_kv!ڭ3\uow9Vڸ>1rLj{;_ߕmwY_[2]_uSYlYifjb|=C;q|t~ch\v硹]ήݹkw,۫O[vgfʞ-榽vjzs] >Yۍb];]Sk~;;V۪ڎ-9WG}~G*s;*>޸g{K5ںb{BܵGعbw1 @ @ @ @ @\gj^D 抩ʶ%j,Q%s޷`RŶ%tMط\QM{,~\?K۷7U7a. 
o˸㛋b)ܱ8LƳ$Ko]_doY6q[1Ufks}[}V[cɲo,bl3,}eXK8^,]me^_\+뿭r슯uTQw2m&棾}eo~Ͻrho__^~y|]ogdP|5P8ZՋsٸ3\UC6}Cto^swm[\mU۝o[]]jF87bܫϙjK}՘ZƱ:{2;Qv޽bycU=<-ƭuh}Wonxqo5s{kQ;y{Ai\ {eVzhʭi{-\f}Kz}WJ=NmFc>W/-3_];j7Wdӱ>o.⯝>.η*EkgYC{.kGcZ o܊k^wI3j0/^Z}.|wXjcckk[t&l ϵ$@M=zx}wOmut<,/gepum{n[<>bĨ5r9F,+#Fx < у|u:z|mwH-F}̓|P; @ @M@b6xJP(١dJ(A7j[%kG/Q/!婧_#98Kl_xK>p\۸L#c Ei1VƟU Kt$-%Ѷ-|߿j/iNs[1u}.]bsN~KDX:?T^.6wOƭk/9eޜTvmv%uvw~wsgGN?Vi^N~h9zm꧹?'Z뭋ꍹ%۸j>:l,mZ8{mU6ѾuEZk7q?|Osg_2iz3fɈo|o^'<(8W5bҗt<Ǟxt8VFS l7kQg2y_߸Ϡq?s/ymZ uYXsջ>׷Z}<3G?yxߗ'?yݲO[~lf1=^y7z/xm޶xncL= ]]=#Fzz#F#k=XWXG,+#Fx < 5jd!߶xwZz<;A8O @77.q{'J(XBK<()^SL%PXvK(i/#J)f c%8q}/ce%aSL[I%o\gKȬ$-iJ+d,+;uSSisu_{$%4+1&^L]$tسo 4$ml\k+橹u}uձVsZ<_*͒9YXb^8[lƱ>뷵zuT\mk7b߳g+ݺyi6+d'x∡yw]c,I-J-A/&}g=%<%f{hAxu: ZSUZ廤A^Yk>j/s[]oysٹ7ʗ:~f_쭅ޛ{w2;1l,nqLmvG{cj3_Iź5:9_{:_l[cmtf<9[?GcϾ|<5\YӶmquX++ƻo,.Ag%\|I{~{flc,7W;G+9sﱽZ~']֚\17=g]_C}nϥ$m7gWW{ښinKfPgrDkcsݱGz=g{б,mk>r~4)(]{N{G4guOmvn?_s3XͬkX~ks=[:wϞSsgl^?{o{~Fm/؉Oqݶn[<]yy|1b~'s81Zܯ~2w3kq=j's81ZܯFnd#@ @o&o~LǺdJ9'iu'QEIK(w|IK(1dsc._H,q9b*Β_^54SRȒ|JYHȰ6x+;669I8r|6Xjc3uKԱй9i*c_絿9ܸ*k㼟o;r[}n3\5cWF槾J*IjKֺ>c{^V"j2KZkVV48ٻzOkǒ{m>28;zkq{%Yoks !YRܹ?Ycן=[9=ͭfx<έyuMǶVLz|qʗjckzk\g-<Ż1\LuOfmyϦemNkc>:݃F|.ֽ[Ys<9;Ofg˻k۳۶X=s}ٺ~/j{ޚˌS޹sg]?bĿvϿgk>w?+5^*/=^*/=^*/=^*/=vmk߶x=E=QN6 @ fe#^ oYXopJ X]%OM}s^I@%m kgsB ׿l-Ao+f1cرW/Xk67-_QvĹ}{]mWLŰo5}+ݧ>JR)o\X귽Xܷ7x駯>яtMQhk3KD꾮YrWDxgI;}c_cm޾%q|TF?W8[w}leqy*>o߸X[ߊX {Nſd{}WY{%5}dmy\5v*;=/}.m.99WzغmsqzY;{|+Eն}o5}av=Ջ):;|(΍}6OY,Z+gbo,ᵲn{}t޳͞-tϒʊ9i]1 [[M[tUcL\`AּK'Z'<"}xX|ͲF7ϳA&sXէ}꟏~S萳 /`eZ=KJggmO/z_+k.{Vlr99t'<뒗s(7% ȡCb CrĬ/Vwֶ߲0aK'N8y/=̧d;?Ž9M}rp!-l{ywbM>:@&.Z_>X@ Kv}J:gw:ݿ[/žΉw=7N^Fi4}S`o[\?FgDF#T^hceǦ2DeX验6Q;VF{yl*MT^hceǦ2DeX갌O_MԬY%P%P%P%P%P%P\fʳ% "*$S%"/A{%I2 yBBd D$ur$ ǴM%v{$A%(YS…qLdHn1OmcI1I'LSO=5%/iŷT$3}=O戅oqcZ6;e}bBk#a-q8iɟijZ2>s@IDAT#s$',IzbsӺOI{CXFkl$Ic3+띯 9 _'N&͸@G灾1I__6{$~%;S40C5L2TN>=mo&J!O'al}~%_\˦y?g>%1;z\C=4%Z^6B&fjga=uUs/vl~g)u]c> )ϳ m/ GGЧ7.W3-$S*_җs>?5Trg -=ŕv|ɑTLV|A- grK1Og=ȡVvM2aCd)`ٍ#[rg=n{~gZ{y玲e{%6OlecK _K?!oWOa-]ٟ}ܟs}7hPk4FӧV.ɨ~;~2*#[Jɝ2 mmg;e2&w($eMQHloh;)ޖv6s?=V]B]Ӄ'}9xٵI?ϒ~,ē>d '5`>d$fcKFt6gaJd{|ycI&NW^9y?z/^4kg7?B?7:$Jέg4}~ygs޹%9f/5=_9s>6gKW t';gZ1{FY1=*1@ cApM2pf9 B:{TG\6_c?lXv=ރ+u6zJpȏE|Oġ?˝C?c?6}' ~p>?>M).ɨ~;~2*#l8gDԽ2*9>kGtM2ZVFe$s.hk[Fdd@ @ @ @ @ @ @ŗ$T̓*@쑊HdP蛤mgI&KT"gD1r6oLIRt1} rTk- !x84OĢJb ìAFiJΙw{sU|ܤS]b;>k6I@tW#;kl[Km`N2k$֒*O}jJoF?>^0|vM$. a֋>|yo0~dҵ9J1g[A0S503`fls.t}S>b<Mm,"sX]ב%+y1*fUS+t;r,gaBŀs٥G ʜ},9Zsň؍leM<nKpԥ-eIveĞpW~|f.a6@u֎~K-8Zy{VOɍB}omzvN|>p1>TM)oFhG#T-2*#{GP8|-2Z'п"2gh4NSFʨ~i5G,qz)b(?9-%P%P%P%P%P%P%p؞1t[^v&Dy$=$I)9җ yOF7oV<ɱ1IxH"Dsó6=WG<ݒ\#HB7yꭤ(I)ɖ|ICT7MD\v_J۔ ?6v.#IrʞR%I$j]"{va}gQ8Ou VE>vIt2;p9GM܊]54L r#g/w)=gnzFqħᏩ->cX RsVq𛿙7lRγn5Vkz5׺h}c~7jWXwn.z9mX7 ܘg/dsH?v^H=V G>2}nO?ެWϭ;sh:Omm4}FuMWeޱ2cUmwtUF+<6]&*{h/MWeޱ2cUmwtUF+<6]&*F4>h}Ց(((((( \~)>6K^"aIb$HQ%Q\Hl|f[_WKz衇V=?n~uMII8[}<ck.Ud9r%H4z߼+W_껾V=}oquĉ5\3VaO>Ϯ~y*/~7~tQz&ruۙ~;]lbVkYKޕwUWu]7ʼnZ%/Yg[Ilw!p /yO?=~X_%Kpm^KDD ﮸)o;wܱ뮻Vz׻d< kI$$RfOY$a(~m9 ϒ牜9K YGv¯a ?O.O*stM'8qQڃt=؞ؼg|9ťDU{Þկ~u:Z?Ϯz}soęa}isrBn;isjJVRo-2}o]]}vₙ |#L_\ć߰d8w^ٖIK%{? 
S?Su??>}nǝЇCh:O-4a4ψq4N~+2y34kG;#GnM)3gA6>̽6>3rPo4O9so4ψq4Nۙ=WFȜ+((((((W'Ë* J$: x+($THhEؘ{GQ$H _}򓟜%JБ $dmz&AED>[֓^̓J5&Y#כy#)EB$';sunL,tǷA9IwIJ+hd$JD%>%я,km*lQMBL&!.IX,WR)tMٕbZI,[{Iw}(Q3%:/c߫y7%,d2:󑸰n~&hcbDyEVIAXTג%8yI,*qX%6.5Ȟҳ '{}`IbV̈_q-)rlO%c> X7*9IP5:ߥb{/*1vSu/s=w7MI=ο#؄m[}<'>Ts.g}!s90ۿۧbI,zX껒Z=U%{1YS>;h/%||Y!g,^&]?%ZƷ6K=yYaX<ڦdڌ]u^ 8wM=__tؿz';΅9ߥ?>tM)]"q 22*gmHj}Dhghѩb6>:u4Oqqi?;Fg8Q-;j{[FΈ[(((((( 4b>\K*$8%Q$H.$1' Or&Q)'w].#fM]D@$H(HrD;U b' k\%IRZMxFB򎿭6e?JOƖ$K]rXq?2a~"&%E$S,dDUW*FI":1[b`-&佴|#%kƾZB؛-mt-i#oĺsOVw^Bs)Ϻ~I|3}dOͫqsاWؑE~dh4K!e$nٛYϯ踋jNk[5Usؕj/8 w^ayU܋y\:|s[so[Z~ڜce7?~̚ 5vEi\ּ?~ =n}{s.ڇյK&}qO>~Aq`4}=NSFh~MhDVFe@~g;ŢkOqMFi4}^Zs4Oh)No8j!pG5ۍ5G>[uV @ @ @ @ @ @ ?M,>~>'BCAIF4nD$(I\I*yJi%pCJTI 9elJȮe~Ib-bj${@/I|YMB,U–7}0&c= %{xfi\[<[z311,b$ZīMH%wĴsFR<$56㙟Nk^X,t)y0>T+m]{bwbo3/Doגwf^Źg%BjG(AyŏZ>ԹƜ>sU$'l6y멷zVj)7OTs~?>"ZR]<{09z~gWixf?F^qLv{8dۥgisnY=<[<,֜o-%PǓ7F^zq&؏tFy#_+e>th0`4Fӧv 6>#ш:oʨG@aﵧXvFJ4>etj/-9FӧqAi5{qtQc}Gv*((((((G*j,@BF%xO?zꩧV/”кI7@T$H1ڒ^O+q#e1$FH$ XIX~IZwB5$d]w6nDOD7lՉ'V7x[nY\=Y}w}TR1|_*ZO\0JVOjK8Y՟ǹGJ: *%%٨W&n[p kv?~'/9̡}=gTܷo򗿼g?WgV_'d%J5'LyiqWzxi7;9n馉#j~' ]pT㱗1I931ys٧{=s=z'?)%oS{O3cMwq$ z6o?*6>q}ߓ{?>1>}e~ۅ5Œ>KWZFz&9~Io{Mp>&|N%)=9Z&Ÿ7%sa{GW<ٵ ę=.s~lϟ1Ksƴb|3w֙{uM|;s1k\s'Vr`B G/8_wM_V<[_l3x>ㇽMhnQ>Fg8QmyQ{幖22*S1п ;)h:OKKi-EЩm4}G3ehY(((((((GgTcI1$$QB[E$SHs$HzJnPݓB+tIDEJX, ۊ{I I|I~9G…ĜM8}yW/I_tM}=X|]wMl%L RIN^=C|>'NJO %P$/ GIZxy7ObI,wRkƚ6yG1.~;@3Ǟ%óM,fK_ocaXN<>ϙcj%XlbȈ e9n{/xostV%υ/җ4%/PCN uyw^m}zر'2b#u["qx%a,2Ĺ}EW$̩N|8}95~PbJb|-x}\dɎ~!{g> y59k1kI*<؞gn=d_{)Os _>}O JOs='67ٗQjm1WZטRr%c3#i>gS> :%c2gKbv>",AuyJ/Fl]3/*J 8G$Ǻğf9?[շ?ah:Om4}Fuߖ[22Z&_r9=Ggϻz Gb[wIڥg>\b7ՌM>ɕI4tJx\J]*|bnxIuv9觝kY'gk?蹮ks=ޜOu{汇+r3/|\/kħiP}ôاf,w&'^ #Gl@{깼u#> 9Oh?}CGkZ=yE,%޵qzwO>oLsE2Wivw,v)y|>`]yI}0bE&fKҽ39w0iǒ_%p>^x?t:7|:·?9GӇA4>e[؏1Fԩ~[oeTFg4hQ-X8*e3GeL`yF㨌 ,h2eFfi4}ʨq*((((((Kk.=kT A$ I)$:I3IJs_B6e=! '[4Ef,!A9 aMc L$׺7kx2$HJI:xt}m ΓL?vWEWV%׺g6_՟'[y|\cL)ş?bg-~x}յd-T5эX(EKWg{>b;6>C 9&7 gUPI]׉Rc}o33Yw*KR]ϳY}󭅝S8+agttCw/gvt\Jb9W<{Pk>*t9B. s!⍽3Z\;n.9:3 k)xuN5޲񋘶pqb/]+yfZ|=`-2XٓUg==L6+Yu8X5O?MJAױۺxҗk 86|ہ;*]u;9Cv;wM.;J~?OF( |\}SZK+2 ~>^{Fi4}ho3bSʨ ,h22Z&s/˓=tniu뭷.i]$|LZ$3+L`T}Aէ7_җV/_tmL[Hq{d5%3LU?v ؒ!'2OLkW]uT N__|o]ﭯ뮛ZqAflԏ1i*ND%ʯԺ[֩_8Z~la[nZ>N;J|[}qm_Rl+ ,~xOO÷2<'OA5sSzY+[(S٬`/1ػ/6JԜtN=cg׺o]{WwXAk[cza8l❟/Vbѓ'CϜ'g\:_Q#c5ɣK4Ǿ&Sݳ~ĉ)VXF_k֞[<[젏{a-cX|N_W&;O=1kG>ޟ&W_}X?#{ų65km:#1Gĕ??.bĹ|?t?BGȵ6=Ī _lك>T}r9x6zϟe7-]tUϥ՞W|O|iYqGOg\vǞ]Z[M|Ub"\}I`щޗ]v٤Ib*t0>/i-ƬcO=d}Es72yo]k9xf]k=;s ωM2ʞ&sn|mSK3BGg$ҁ޸d1񇅖_Փ'? s6MlѲ[:x6\1^Suo]%>!F|߿k!{d=k)8%>>O3Y3|~/>|$ףètM2-GhG#T-2*e3GeL`yF㨌 ,h22Z&e8ڍ@g@ @ @ @ @ @ Eܲ%.-VR%[HV#N+@RD 4$~_\=SrI/,$-l!y")dH)QuN $HF}%nĖ<$[Ų*{Z'.{^={֕G'`t4&QgHb\tpIz22wGId;.b[{Y-ޝ+qj9oXqƑ~u| _b9Ͼ=~> =b!W\19gI^bF,:_#Փ'?{W1Z*3к|dCYt-;scfğY[ysә<:|؁:fpa l3tߤ?]M_F 1$35<<?G/}EٰIi[%p|_{;){G>ah:Om4}Fuߖ[22Z&#tI9ّmrM_ ۞x "я~t?#?2Ҽs?>M)ݢ~4ψq4N~+2Z&^M|L>shGb9HӲ9kfG/;R2_3kUu;W={b)Av+Us=^n5ٶa]6U~߷v{\k 6: Q9WGy11s]>'+6gHZH,H:tQ{ʷs93d_~QBdӱ5z:ka.{7.y3{Q=eݎYBp03 ;:hNcogWΒզF-87>hȟSj9Gus4}5NSFEh~MhDVFeL`yF㨌 ,h22Z&Ifo2GlKֵb~_\7yZ'{QZCykUK''+:G/:Y ϩ:ܵsϸyH|YmyGQد*ɺ_|W2ə UČ *zs1W:5֌?u^:kW\GǯGΡMcӏR,鷜!8+0tc>s/NbXΎ^ Ydkd/oX [/$ş7{F_LL֤[SK{ĸȲǏ#ei[%yoT{8~GӇ4>eno3bSʨ ,h22Z&>R %*H @$Hx'D ʯʯL$URBRL!" 
ATHdw=%EHBdI*r˔Iv+$wHxV&FЅZZ]zԺ[IkɫUDws?$|31۱/<$vU>qT*y֚dmoZ%-Y듥otpWܓX\pkd" IĚG]}X`ȇ{oZ'NLgB(JNS<=F\'&EL<#;syt *{1y}:O<ȴcK"dzJtW_=ͥ 7p|pխ:9篸Њ| _غ_b/6IdV7tdSt+dy%=ΎnI(?SJQ+bu7hza\sؐO}ꪩ9}U)tr_#=1t+bLݏv(>|;&z91 ^㳜KtO+!7>O'3}c9#NJ}#5{}DɜSrS ᕯ?O^#zyV_~l␝AמOx|fwiMc L`C0<ۖ<˾dyuO3zV.vaGgSĂud>x, Ňς/\C볍p|!3|iMEIQ%p 8w}V?ǫmQFi4}h0Fg8QmyQ-X8*e3GeL`yF㨌 ,h22Z&$u]S;0=I"PU%)t %H< #IOpW>p"W_!aG2PXt]֧7_Zk=}(Z:WfbLt''%٘/1"$k]{NbXГX\gxGZzD/%m-~ flMNt $հLBYt8dUrxe}d5ߛʴqm?k"{ZtÑBnod3]ٟ3%\8y! Ls.],>&i_ayɞN~eoY'z]7{,9~pa{ٚuW'Gzkl_9NkbzG32#c.X9dba믿~:إsllo9x=Zk08MO'%})ٌ۵:B7&𖠈"vTMOk<3ޮf0g:xګb$lճe7A#8: _8ݜl,{|{gTcY%#gjbLE8zF9wg<161ך(Xsc'p'ΜGqKV͡UM%>uG3|<`K}ĉ"Z8? ZY5Yy6sxc *0JiJxŭ;SY/DM6hn;c4ψq4N~+2Z&Ξ37:z%`,<s r?]5vہdM~;9vE̱JXabiɉ\(OņIҋ\v6߅enKKy;Z౮gs6Y7{>ysؘͦg2?>3摟3͵qr2<ٕ2SkEب5I_v:Z,o{;1%YGFf2b2LnI.&)zu8=?OUvZl,9|۳:{ `c.^Uk֢GFߚK{ba3zq7zk#;[*^xaA'py[3Z]luU:xĉIG*S0g,+,$7㮽CϩvֲN>>t=HQkҁ骰Ml*>؏K?pr`d8b|6jmŒNΜk5??c ֵ=q"F‚ 썭l)i'L-udRblqm G8ґ-t'Ǐ.9a88>#lNtpmGQ|z*s6ۓbMmcd=+zz{4̾Ål75l>8s"gbHOܳp?B&O9iObkάS6/{O8'YbMt^Zz+ !zd觵=Xy&Vcxf]nJv^%֢uMW}~ܴ+S3t͹xXA{P7k!vGs=IGnSjw>g%s7t9Om;uKtX o3imz}m=vqМh:Og1Fԩ~;h3etŶ^m#sfΰ+mdΌze̙2:b[93^FgXl62g m2FxaWFh5NSF{TJJJJJJJb'w~ H. A 'N&Jj޴/їOď$HT@n$hH&sۊ{Bȴ UU֕ !\)Ъ֖%EdsӚXgė$秇 ߚHҊħwݓ֐"EN/Ge;[]dct #cڱ29@māa_mk (š1k] y̟3?U}zhcǺ\|s뭷N ;s=fI;Øb[nY%hd_dёZ/k+;cG=)vr.T#$UjO6ӗ5l(e9xN"ֲ,? )Bnt?zw 9wŊ5ϭCW>׏Ws^觤=(M!Fv)ɾlr~Hb܇ ;6Awm giMk[\ ώЃݒijؑSOM}ksz*H-v{ּ{⳰?O\{&;r0s}z/9}2QNE:q‡NI/qhEGd֝7U;c9KtP#zjtp YC|fdXݚу\7sXDl#; _R'OcF,U?kv[߹$c]9Ӭvgyfj}nEkalTf(OpMbMpgtO?<=vah:OFhG#T-2*e3GeL`yF㨌 ,h22Z&e8ڍ@g@ @ @ @ @ @ Cŗϛ$AH^|$ wha|X%+$C2CCMĮI E%ZH&%T<(ג(:e\'sTKΈ\ctѦ̟XZ2Iq?d1K?<;O\ Ȥgg5/Y[~K [P65k-53NrΫyt&n)~ƴc=vESq;lq"XŸvϚa&vϳs}*[oEGؤX#o~t`89zFyK8bȜ-sGяlgb`ꬠSbM&,c' \`ݬA|oG^Yj[O|F->]&vJngX9mYsd챭رNblO_d]c&{H]cVS!S˷sViUzy0r-ǃ:mK'?\/>E?Ob^տWY8qb9]cFi4}ho3bSʨ ,h22Z&Ƣ_|bAk=r16K؅sma#zs>g kG;祯zϒ>m6;|9i{s8̺AŚDg\s/69VbRgsn*voƣGl^y\lX7;%g}vzS$\g5\3X-鳉MJ#{'>s>L//oO}S{~ٙ0> M)i;,1Fg8Qmq wf7}VFeYӽCeTFX5M*2ڍht-POF[ehY(((((((E_N[ce$H^}z!JX_5. PR?/.UL8G$D^Gs"-/zϼh,ؘ|yz˰pxS?>ooȌxW_˧x[%PǓ{o{;qM7/=\Kӛw6ÆtM2-GhG#T-2*e3GeL`yF㨌 ,h22Z&e8ڍ@g@ @ @ @ @ @ G:;'oe%)xқe]v흒dw(Q$$)'orgp[TooPA/?ϓGnwj09nkIx͇I4;nvb 51&aQ}߸z[2Us'oHB;>g纯ϧɎ2_o^mo>'%!z_a71%I6V}o~sJjvhlQ~o:[򕯬y՗+Vwu~;R%pqW?}WW\qŞt?3?3}'t0cCtM2-GhG#T-2*#\Kh`Qjտ;h:OOFh43Fh4A :Q]\lg@ @ @ @ @ @ #7?;!CT *i:s/D5϶A0^9~Hyiw}%N ~I%]\3q6?ggL|XQYcn~Ƶǥdqc7;%KJgyfOLouOcIx8,qǷ|.?s>oTcd=m# Њ}k|p/NoHwvmSru?g$ӽJxf]i{Y}M}?p<3^X}|Ͼ{Gi4}ho3bSʨ}T ;ۙgXvFR4>etf?o8:(zo8:+Fh43FhG#TSze̙Ѯ((((((__\֚$ĊaaS"4&]"y~ΤjXBjy!+$U`Lo(N{1M{ }^gc3ΗZ2tW(3?G^zɞ*?gl~_zO}S+lhc4Fӧv6>#ш:o~9'hgh5NSFjh~MYo8Zףm4}Gf}t4ψq4N]Fh]^@ @ @ @ @ @ B'kG @ @ EA@>oP ZJJ`WY tg%Ij?ܻr|3YC?Cjx 7[Mhzoo3bSyGhNcs6sќ~m2-92e>ZFse|46h3hilf.2/\e4?An4Fӧ)((((((K*[}} jt(((`6SI͡o|WW\qo~u{@ɀy(X"Vߚ{o|c+L oxBo&˦DcI}k/ GGW=}ݫ3MozwwWvw߽mCtM2{GhG#TWhceǦ2DeX验6Q;VF{yl*MT^hceǦ2DeX验6Q;VF{yl*MTi4}M)#%P%P%P%P%P%P%p)h&ʥ\%P%P%0Iy37^y啫nmuww)7~7V_V9qo,ʍU&0O,vH*h,kY]{Sboo[=\7*W~z/cA??0V=Ц[FӇ4>e1 1Fԩ~۷ >$h}eɾ2ڇd@Co!7PF(}H >$h}eɾ2ڇd@Co!7PFlh6NSFVKJJJJJJ#K5JJJ`dC?G?GV~oo~sJ?ͨn%PHrDux#3FuxK|˷M*БUAsJzЇ>ׇ^;+>mlNO6N Fi4}Eh~MhDVFe#?w22*>kGGh2*N8j5nd~.2*8FL@ @ @ @ @ @ C[F4IQT%P%P%P瓀Dߝ%IԗmW\qo)(%#;3PA7ޘ4ꪫ$-KD{'`P>Zb>w>˫/}Kdo+CtM2Z2:鷞~+2r; 2:eTFs;}֎R4>e}H wQbvcbaR%P%P%P%P%P%PRrm,((86$]MoZ]~S7 ^oquĉ5\Ŀc*ZcHm~o|3nX|ͫniַu:[$@ ?3?zG>:8B>OGb3̇GӇr4>eZ-uOF[eQ9ߔ~g;? 
:Q!пC>kGtM2ZVFe$s.hk[Fdd@ @ @ @ @ @ @bRrm,((86$u;~[BbII,n߱qk-W׼5׿.leuWNgbKW]UΙگ|\]{w7W??hPr4FӧNAq]FǓQv<3roeTF46h3hilf.2/\e4_FGhNcs6sќ~m2=[Fdw)S}ùSg @ @ @ @ @ @ \eD3uN%P%P%P%p H84S՗$R%P!pHV1gK0D;$|Xo>sqׯ{O?G_|qGQnFӇb4>et:|Φ.ɨ~;~2*N];&*{h/MWeޱ2cUmwtUF+<6]&*{h/MWeޱ2ca1GӧFZ)gU @ @ @ @ @ @ \ǵJJJJJJJ!?gyf~oͿyIW_]MW_OMh=Y2:t80Nt8giwN9)S(tHlV4oi[;eetNmEsFFSF[ќQFQlV4oi[;ea?>et:NwHFs@ @ @ @ @ @ %Bo,D]3KJJJJJJ.n=''W=}{ߞsxk_oէ?}bॗ^Z{ォh:O-uOF[eQ9߶~gFxaWFȜ/3,h3etŶ^m#sfΰ+mdΌze̙2:b[oWFSFmvcSK @ @ @ @ @ @ \j >`^?8nթJJJJJJJJJJJJJJJJJJJJJJJJJJJJ#8*]KJJJJJJJJJJJJJJJJJJJJJJJJJJJJGŇ%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%p, 4XJ@ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ 4p:JJJJJJJJJJJJJJJJJJJJJJJJJJJJ%&KU((((((((((((((((((((((((((((8&Wg@ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ $c*]%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P#((((((((((((((((((((((((((((8X|,VKJJJJJJJJJJJJJJJJJJJJJJJJJJJJpX|8^]%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%Pǒ@۪t @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ǫKJJJJJJJJJJJJJJJJJJJJJJJJJJJJXhbt[.((((((((((((((((((((((((((((hbxuv @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ KM,>n%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%p8M,>.((((((((((((((((((((((((((((cImUJJJJJJJJJJJJJJJJJJJJJJJJJJJJGŇ%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%p, 4XJ@ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ ޝmę,O8TR)׽~۵jux^pSr'8d˒5KHqT 4tDR)Y<( ( ( ,ޟk+ ( ( ( ( ( ( ( ( ( ( ( ( ( (0eYhP@P@P@P@P@P@P@P@P@P@P@P@P@P@'``\[P@P@P@P@P@P@P@P@P@P@P@P@P@P@0x,B+ ( ( ( ( ( ( ( ( ( ( ( ( ( (? ( ( ( ( ( ( ( ( ( ( ( ( ( ( clZP@P@P@P@P@P@P@P@P@P@P@P@P@P@ X?/V@P@P@P@P@P@P@P@P@P@P@P@P@P@P`, ,f ( ( ( ( ( ( ( ( ( ( ( ( ( ( Oy ( ( ( ( ( ( ( ( ( ( ( ( ( (c)``X6V@P@P@P@P@P@P@P@P@P@P@P@P@P@P`˵P@P@P@P@P@P@P@P@P@P@P@P@P@P@KDz, ( ( ( ( ( ( ( ( ( ( ( ( ( (0x^ ( ( ( ( ( ( ( ( ( ( ( ( ( (X XDbQP@P@P@P@P@P@P@P@P@P@PpT<55\|8R( ( ( xuub{ ( ( ( ( ( ( ( ( ( ( (0"011MNzk/P@P@P xrH'׸yXe9P@P@P@P@P@P@P@P@P@P@P@.pȑ ( ( (a00!(5=z"( ( ( ( ( ( ( ( ( ( (G6%Q@P@P@ ,w&&&KZ( ( ( ( ( ( ( ( ( ( (  *9P@P@8ywrr#AP@P@P@P@P@P@P@P@P@P@PQP@P@P@q0xZ9bA97+ ( ( ( ( ( ( ( ( ( ( ( ( (g ( ( ( ( ( ( ( ( ( ( ( ( ( ( JQP@P@P@P@P@P@P@P@P@P@P@P@P@P@ X|@@7W@P@P@P@P@P@P@P@P@P@P@P@P@P@P` ,V ( ( ( ( ( ( ( ( ( ( ( ( ( ( P ( ( ( ( ( ( ( ( ( ( ( ( ( ( ``8eT@P@P@P@P@P@P@P@P@P@P@P@P@P@PP@P@P@P@P@P@P@P@P@P@P@P@P@P@ǡ, ( ( ( ( ( ( ( ( ( ( ( ( ( (0n ( ( ( ( ( ( ( ( ( ( ( ( ( (8X<dP@P@P@P@P@P@P@P@P@P@P@P@P@P@8tsP@P@P@P@P@P@P@P@P@P@P@P@P@P@Aqh%˨ ( ( ( ( ( ( ( ( ( ( ( ( ( ( ,> + ( ( ( ( ( ( ( ( ( ( ( ( ( (0C+YFP@P@P@P@P@P@P@P@P@P@P@P@P@P@(``\P@P@P@P@P@P@P@P@P@P@P@P@P@P@q0xZ2* ( ( ( ( ( ( ( ( ( ( ( ( ( (p@ ( ( ( ( ( ( ( ( ( ( ( ( ( ( JQP@P@P@P@P@P@P@P@P@P@P@P@P@P@ X|@@7W@P@P@P@P@P@P@P@P@P@P@P@P@P@P` ,V ( ( ( ( ( ( ( ( ( ( ( ( ( ( P ( ( ( ( ( ( ( ( ( ( ( ( ( ( ``8eT@P@P@P@P@P@P@P@P@P@P@P@P@P@PP@P@P@P@P@P@P@P@P@P@P@P@P@P@ǡ, ( ( ( ( ( ( ( ( ( ( ( ( ( (0n ( ( ( ( ( ( ( ( ( ( ( ( ( (8X<dP@P@P@P@P@P@P@P@P@P@P@P@P@P@8tsP@P@P@P@P@P@P@P@P@P@P@P@P@P@Aqh%˨ ( ( ( ( ( ( ( ( ( ( ( ( ( ( ,> + ( ( ( ( ( ( ( ( ( ( ( ( ( (0C+YFP@P@P@P@P@P@P@P@P@P@P@P@P@P@(``\P@P@P@P@P@P@P@P@P@P@P@P@P@P@q0xZ2* ( ( ( ( ( ( ( ( ( ( ( ( ( (p@ ( ( ( ( ( ( ( ( ( ( ( ( ( ( JQP@P@P@P@P@P@P@P@P@P@P@P@P@P@ X|@@7W@P@P@P@P@P@P@P@P@P@P@P@P@P@P` ,V ( ( ( ( ( ( ( ( ( ( ( ( ( ( P ( ( ( ( ( ( ( ( ( ( ( ( ( ( ``8eT@P@P@P@P@P@P@P@P@P@P@P@P@P@PP@P@P@P@P@P@P@P@P@P@P@P@P@P@ǡ, ( ( ( ( ( ( ( ( ( ( ( ( ( (0n ( ( ( ( ( ( ( ( ( ( ( ( ( (8X<dP@P@P@P@P@P@P@P@P@P@P@P@P@P@8tsP@P@P@P@P@P@P@P@P@P@P@P@P@P@Aqh%˨ ( ( ( ( ( ( ( ( ( ( ( ( ( ( ,> + ( ( ( 
( ( ( ( ( ( ( ( ( ( (0C+YFP@P@P@P@P@P@P@P@P@P@P@P@P@P@(0y\P@P?@@IDAT@P@P@P@xbtS#Gtmĉ{G~uNP@9qNwoM^P@P@P@P@P@8&4P@P@P@P@P@xP׻{)LNNvmJ1CM Q`77lGHP@~@0L-//s:^P<;;F( ( ( ( ( (??!Y) ( ( ( ( (2 TݻwF133MMM`a/sOy|yf6C|"` ( <- ,g0?9j~csGb}j7P@P@P@P@P@P X`MVP@P@P@P@P@⵵DWA*v x+dV ( xcc㜎y^{^MOP@P@P@P@P@x?}ӗ>EhX<䏆D<}brrR1O5Ox 0 jɅTwZǼGڙZƴ5AP@P@)R+)#5]5>qx 4O~ן<9Cijki+Ӥv<,f<޽{mJhj?#^S_Kǵ+\1^}ݻwc͏1W_mcq̙3#xQ:7^pa_ndO<_Urcx 拋g}֦];<Νk7UP@P@x^\|ޮY)kvU@7 s;??ߦ:'&ԩSmd<7~kS^ 8gggz{7/)>M[nu۔6_oS)Ϟ=ۦ޼0}/^F[۷۔wԝO>|ˢV/w(R NpٹT'wf2gt3}N>zM7)6Fot.6̏_4?HfwZl@ϵ)eOomSP@$P?ե_ݦ>65ٽwv{|6m/l7ToxlCs4>7n-ȸΛC- ( ( ( ( ( (@/`= sW\iAuUrܿ˿)Oƹ{<Ld8q℁O5Ly p67_~DM̔@zLI[6sN駟v/_n7Qp>72 `noK ( 7TOO횝k9~-繜.q1湙@VΏnH&|]ooړF}͛7 qMj7Sn'+4EK9]͍HHlho}P@P@༔'Fr߷rUvk)a -z Ĝs>yp}93\;ן|O lKl@L[)ڎkP;ڞ|qx7eL0Ư}*i/vs/˱cG}pls 8f9O ^JAW&(n2O\6:| ,S'7׺&'MmneNF\Γ *4Hn/')e,fLBOG-͝q (S pW'1s5ޣ98h/v\C6"y6A=uA9|sx(8( ( ( ( ( (O&c]"q5Bn0;ncdzX7W@P@P@x*8vz5|=QsT\Y ;VLk*ו6ێW:U@&k2eLصkL[ 0QW~|kUW`1a'ʲד ;:51pK01[NgZH &P;͌`~*eɤlh7Dϩ8H,W@@~)0sG# Ly19a>pymc$Kڠ9_|O ~{;$| ( ( ( ( ( ( ,~/bQ? Ô?G6`dn(cAxf GP@P`!kcǎ "@m@0\?~M&c F" ƌTVAƻrPqc3_O8юqYv|Զ:hQ(c}_SY5:`Cρ,>SuL) $k;x$ %T:W'(3mz4o=@`^ ֶb'x[[f{P<ܱ92) 2~)~) 74鸼t6 ˜q灜q/{9s搕( ( ( ( ( ( Xlxs sB96^>n&4&Fʚ>ǢO$Pmʹiu\iTP@P@)P:\yqp_?ŋ7|Iʕ+{ 2/\<|վ5 nq-kz7xX~F+N?wo>jߓ>} Wp1iH} ƍG}]t-'?lۘy &c$4_ ~)Asyp~?8;x0O*^Ӫ322|f~O%ݹw>/vᄒg7T{M( ;`<3yxE+W*Y>A1땅xT;NnlvǦ&wO8-j x/7O?"Ӌ_&mɞa~_uof;式;y9'$k>w:~$( ( ( ( ( (  r/Ɨ3ztek< 6}'q-w>O&@bJm-u7S2V7'Hw.e%x23mR&C(o)?@>p\Ώ02AP@P@P@P@P@Pp X|clKS 䏃s71)S?hvʯ{̏֝ }˴eɸװWlG~uXV3[/k,\oV5- O-}]Ԕjfi4}mJҷ+Kl7hvJ5z ( ( r x1S$X̵p] S,#5Nsϔau-kon36^uu'ˆ3n]OT]w{kaYHc'Ό{q|r-̗wӝ-a~*GQ5l?L^ǰMF0Lc44\iΓ>-c4U,a]mwz]e g|۳֫eU*'U2_mU4%aU---q{RFkOWei4YlCYkrWX<<*'Xe&>oZP1tN1n$*}yo#T+ilst+(-`u2ilfOem6㛗À-c#?> :'M1lRzzd'*=V&{쪮NϔPYwmeףSuN\0f)UoeAiJzA/pd#)4i1vb]`X,dy l3,i?/H{߳>Ȳz+C^g1߃$4ezʅM RZTԔrQF L5?XgF-ʓtWSYyah-=lH }z &հO7ڦYP۳;_=w7l~&_c.Oim/|hjޣu ܧMנ,ܯ`9OfA6* ( ( ( ( ( <`?rӧ;ws̳GptIܼ/ 33Ƀj|(Lnf_XrpnU7MGʗbĉm{7@ZS-*2fM:[7WT՗t+}i۷o) w<˰ǚ<ȗ<i+Oc$qyлʷS/2|j}hjS֣]HPٞ>A?~Q̗i4?nW_ަP&FIhW)O نrtU꫔:WIcoV+ǕP@P@7:\s9ik>5y?ܼ|{aa'sN{uN^>ip_Ly'Se|z:PFk, uPiRVꉣ,gYa4rka<=C]ott{^6ߓT!yr]5>VS!nyB]mǺ?0ZnUߟ0_-[0^b,ZW VO=ʆUe]O[P}wrڔ֩Se~\wu=<eed u/:mŧ5}$_Q6ځ\1}y}Pmq̙ڐ610ҟY>A0OX><.2ڌi>Ri_̙x>]Nյz7o˭S~dJz}۟?X3OUiǰ?֣l IuڞchLq}6/޾22 OOvfٔy Oe=(<-)?syc J< lz*.%n5]M0%X%|O;17զ-І<J gH-yƌZ`H'YחxtJ10ϓoaF^Swn[UbB<r~˓v!`wv;̐5i/innC,_O'K+wٗ[7F'='wWT♴홅mVKC}}ؠQlSOܡ-򣪙e O-6 &n=I`ˇzPyb^}e63wctj'ediLkx~G&!9ϓ3FXJ?^f<ť]$ǔgSe 33oftFdCƢy<4'e}*ԟT~7b8Q Be@9fevo9Ǝv5<Gŭ.:mhǵ}Ϸy﷿mwʕv7H|9ԩSYr|mdt &$k q^{6F6?tִN(7^t8s̓F:!]nc7vXp$aUwQ7?M_lMJ~Lȗzm7|̏G]d͏mkopR+OeqyRo[F'_W7Î;F_uW_5/ڋQ;ڍvy۔MywA'm_w۸FY[zjwW90OCOoU9F_Ц,)7?pb UzSWȼ ( (/r>̵Wޓԕp2p=c-皍s|F!y]yA.\h~_)i-5?zm@޻F8ߧ;LzU׽:k(ƺ5O12p͍믿ަ\Dmq_۴ P'cxb5 (.sܹ6 /Sg۟j ::}w^UڹI]cJ{w{kPޣoF[o.>k S*e5Lw #u]˵-ٳgNkJݷȷeg e~GCZ׮]kv #쯟~i+[y횞4~RjoBо}g_CGx]\S>Eۯ|쏌̳-WM;2}ܦs7o8|E$?~צ<۔ I>SwLR}W}bx^ ;ڂAʺh Ob[gu͏U$ok#lEp3 :lM =57st:LR̯$|O@JSYf۔@88˔`a Kݗ7c[e]'x3=:D_K=V=(dK|hi4JKkGr NT?&ap{ldڶcsj t“@eGK =}q6?^ ^Z{?}u#Tk mJOmSMy?~DiU:5H[{9g$߷Nε)e>,L31S,An&ˋmkr5帚~kc+x.&H2U $f CzXտсv_/ .nH{0Oc"?,| 6TdJgWڔo){}^͖Aӿw ܒ}?}?yA30řdaϋŕ)ϟ^}w;?,zp>/ņz@z;ŗ) 93eNAO}Bgkf~&ljkGI~|3E~ϐ7-O0cɏ|+Ͻlxcg3I/pP^>w?>p)?\|_m<:^>MTwrΔo92=鏋+p01DO3+ ' >wfN^~sgv1? 
Zy?RrӺ̳~ S&IY2?7RiZsC(7 2 t̓ixMLŀeMfp$]Mˍu6>ucՉ|YNyԷ?0A״=3e]n Ř0O>X30M꯼ʏΓ|H[NyRcͷS}ڴ!fqkJ_yڍ`[$ScU>7Ls8@b mn֣4{Un0}dYɴ1'T_|@)#SrSDY إ.V@P@P`,8e\kƺ\'rN^snε*gfkүzcۺ֨ߺF%M9Z>f5s QC];뚲t^4ɗkQ)Pʏ4:4:K^l\PO$_IYI} WpcvX0O*'ݭdH|p\,C{0ҭ6/[7|Ӽ`N^^Q&̰O}#FO>>i)VK^Li-I`LHiciUlK_%%M'~.d_` u>E0vԉɏa$~Gc՛jz>N)+u=h u2R穀DS ~p~{z#,%?>]&I#AB7uy>U r<!e#JЅ|V﷧fJ;gdʔta7ǝښ'Tp>>̱#f<{>N vI?F;-2e,_糀)iyF}(I :1u!Ϝ+9/~^̠r Jf{Lp#mFjk9nqٷyrt>cl>g }6RFX17grd]\Sy/ˏͤ?RՑ!n =|Rr M{}.m>v*Xy_YVrLˏPCE?hGdMOWukm[ pO2<ܗ9弮vns=Ώysi΁9'\xP@P@P@P@P@P@ȝ?dsCddc!c(ͻ?(Q?$-771}#vܨ k|j}̔`.@Z*v[bZe ^]Օ'{_7- /'ХLڬOؾ֯eUC0&ԟ(ϴImȋtx]y֍,<)Cqy1V<.O lv5mx);^ێjj/ X\`u?vD50_Ql~mj21Ty1oPZHF0_UWS2۱~CPԍl{ ( ( 6>iak,qm;lrr._#u9zm׳<aƺ|kޝGQf9R6gaxmr: z5 l_yT_e5QOmՓȻ^Q,K!U;V=*_zz1 ˆ y io ȟu)erT~9LX|ن4x]UQOcYeW_`yϺ:0 c}^W:Ų-~^Wɇed2W(wK_g]TVy3S򤌌(m >-8 5Y@XՂ(Ahn^佭6XNe%ЈVS.{-$N <hl{/m><wjB4Z)7yPDP 8#gqՔ;ZVC'GMk)X+Y{W#<oV1bN[12ϸ6 (u{-2^|V֌5]/Fҷy"1 lس>?@ꤛ=OGka.iM:-lSՆ|$5)'L)/coL ^Ok( DžxXe9e S˒E?zH۱0c՛v_}?_[ۖ'Y^?6q MӔ[Lc|vLQ2ΔvV8m5h{џhIy$9mnHcf^G1ېg(9:ǫKM9:wm* ( ( ( ( ( < k$V72f1nMnR?$7>OYn /Hcd=-HMr,çȗX z r2O,aL)-qYOa(J]ț?SOxPI)>lSOa91JHn޼nÁzssfv<^W]+ 'ҡ.G&QFˈr ڙti_ʗ-#C/LH0}<ɛ|kg͓V-2͍<˧%4`Ab@YN{SWGͪ+ g;֥-~vՆcL?Lj'b|Lɛ|oԃ)mNZLk$?g`2O3}).1ϔeUd%s‚k?Q@P@P%sr sO u?<皡\pQ#iqιyӓ22r9g,+\w>׸O6>n<ym0S&!eۻ3v3qi6Uȝ3'Rd-<'abxF zw%Oool}n_|)Ai#!l7yf LRߤWXGđfL}{umMVLt ylf+PX/bɿO?$0va<3{  :$ [:.8bڱ|ߟmؖ{4wPHrc9Odwrn;A }YT^ʼUa҂3Ǔ/wҮzm䩬,91uHRS< K).TӾS "͆C+Cڅǎ $`J ϲ9fd:>іG<ɛO/ٮ(y?/g}yZ9:,f{ ~E E?eYJxg`1MaG Z~>y$@7yr)bs˴>5irdJ&ti/WDyRm 3yo7&$pZfSٗʓϞshlO i~`qhSL[lmOv g/SӳXXCښs<{q:i*KM)cM9%mJ>g"1#]?9OȓSY;7:m/^|htveh+ڬ'yL~"[ɓDKd{<s>>:sMA~}U@P@P@P@P@P@x>?&WMx|MI 7V bϟoQr۝:uPF:_uK_#7ۿ[Inf|ϝ;FHvcaΞ= pM/_n7ןZ܌HԓyPRg:Ð0o#ӧOwئҨyO>iy&,LY?6e)E%̳~c8Êxb[tqIn ;Sn io.\_Hm;cNL'Mڈ:C֧g'exYxE5nne~cc9< VLj)n:Sڜ$-ޣ?48V0~N jz/6rvs+1qoō3q??ͻ9?2@98N=S]O|m}uc4~p^P@P@MR8f8;r.=vڀ\pv3};~_l_p-P!sMz]s~ϵ:#SMySf ?ip]<&ԙuӟZX^Qv=('ekm򗿴y=>*S.kG.]j˹:32,(/wZ25oѽ;m6ᚌRN{3ez/lSǝtހc̓5%``LiĢo~Bl5(#-gT({]Kʔs]]i1:Bz:0_Lɓ[g'})\[ӦYF.?bE?`[<g=-Mn]3OY6vԅ~G]y͔Hyrң,/^t&[߬2ԃXȀygjG,gha.1"ŴTHp.=عyf~q<Sv3}M Й}x?>H> ^0DR$( ]>i7"(8N ^6$g ʊ/w­{7}a..})ю}PFR~"mBu3 L;6?f?HwoXH9>S&Yc]}N^ys;džc fǻ9pLb]b?ڽy3gцT~/:Lp2K!,jf۔ L8G(]ɺ7yq6}>/RYCP:O~*}31y1 ꗢKy<я|^$ounP7cK=A׫@+ lDZ蓫ݿ|{_YhSڃ/L9Avy1?dDS0>+щϮ/y!mdz`=Ao9LYWhK+xyc(y!f1Fuaǜ\mvs_ڛ%}s-)g/XZ_chs>}c O8r3A\G8z"7+rwb16͎9IOC"7l/ڏgWlcgʓc/C1O=?6%w*|͵]rs̹ry)w`9_uP@P@P@P@P@P@xe|cc&P7q7 rC$:pȍb$kncdyhH:QC,YGZBn5Gʺir'7qY(GՁ ԧnc{`ݺAF)7MlOy͔ٖ<(Q^WxwUJ:5yXn2̌,gJ>ԋ" 5U=nܤε#\30 uƔ (d:k;^W^SFe*5'^O 1LBL&X=OQ۔$ )1OF=U-?,] jOMO]Ke|;:%8AO&ֶ<ىD<~bJp1] .-[HA)j qLMbwAr G(7o5ԉVڑAjK2n'xFȋ {lYbD_ߤz-ԟyJ@֗ }aS6TՔrW ݃| d-߳mpJ;ʴ;&%<`5~,8&T!ۗ)EhLMƓqo/2eSݹMgؿ.Nn=3fd$<[9#E mg=[L?sH|oϦ%3F^Oۆs<\Mr\l>d<. <}j?J٭C߯߂&̓ose{oOyJ i4Ǧ'F?/p]c$?<ǧG>/`a`17W>_mA[VP/}%S4O;2sX/˓'Ҟ})+iaT`1?j^^y_߳_'6S싕gA M"ӭ۔j鳗rO;aq,9^>g nW';,S6kem>}$O~糔s69מ]?N1 XʹK\ImD[1&C1A{aӑ rig~y6Sx9n%R[a[lZi i-uz<&,}cj}fzS2!_2W{tXo& Y7ˇmK9)#nr!k*'=I[[ |'}z$au~:ڌ>x ^OzX!pPiU;֔k`+5=ecMurs3#aQN1UvwP@P@P@8Os:si֮sr뚡Ra[a9gd=y{g:rT:f -i`=b2uȇkۺy}k*Ouk3 ԍzF\0Cڞ)٭UN)^V5)=)+vtʃt*?aʂAĭ@~l_eG}i[e`9d}oC: 5%ަ\%0rqaiM)#y32'e{ջzlScUl3jݚٛ6$Y Y7Qf-Ydk"E!-SYYYŢH"8ˋHU6rO8|pci(]|=:Fv6 eRz}3{bCR!!h>䞠*DOȌq>SGuB:ɚh:9q@BDZN-'\;uσ`7׊6'6m}tܜӤ92948PUQ4='q M#ͳm'lVS|#M? 
8 Vh) N.!7n= 9ג/]H뾔\ͩ2+b)9t_Z~XZشX9iT1-DP9q[~G/i"E\<8OeۃyG ovxV<9^=}s_u|o&#AA:IpMGvW)T^0z<)C~9rsn,}Pfeq.X/ Dζ2Sg):^e^ѧS2{Tp~%1NRޑge a^ݜ便lx4`{Dmƕݰ?k4;#h,H.-ZSzEѶTEk+y 良/g[/~_y=`x4ichA[ˉL_0.ms}ӭfy[㪈"pwi ]yNY?KǦjyV~߽'ka qNb3#l:_W]` 36Yq.+v*{LycWsЯ|hazة.[q{ûs<~~C\G#^C%?4AG> /s@sͬ!9ǿt:@G#t:@G#t:@G#XXN|gs R_@'ͦlfCFކ65m^b6Iȗ6$/!6ĉhedm$S]^ΨN -umA]e9,:5&lU?'^FɃս`́E>[JWh%.9Ȋxë2ʎHs^ Sa .:8TO)~w ثx)ym1["LGpQ^m٘vp_Y=uyʷ3#1A=|[^R=:My|%dW?}/Ō#t:@G# `^kmkN:'̑yԵj̿͟2'xRS$33ė,t˼~VRF GSٱD xF#2I]'%5GcV< L0қ^+2JIܓ:){DÀڒ>p ʻ'5`[A?=#^Sc6a|m)ִ9Xh@Z)np(C{\Z6aesk+[W2|}]mT 9vq>ȱBá9:s\)rH)۱*W&kCĜO)ҽPymӜԪѺvk隝)'A6zh: 'M%q*2#'G;`ƹûs[WEeЛ }%NƂe`O?zc\1(I=}{lZJ.'\cNܚP Ya\ =ɖ]T?/@!L,]kJ]{AyM9Y̋}YXǚ| _˜Ħ䵡SOUJ7V5Dy}Q2Ͼ/JKם9:Օm '/.S4/N`hn3P sz%,5ѣt" c)E8s6*w|rxBQMUigs-Z#;oxϦ3~w޲ *U[*[2O_݋G|ط|g߽a`42>m{ 2Q+S4u}Kmͩ~5̑g<W12]c`ӱyʹz Yi:{oO~3m8%r|Y;[ |G#t:@G#t:@G#t:@w,ԾM{lr믿n6}GmS6?ڜ D6_&׮]khxlsMo/!_:D){ms wҥ-ʣ1x,j*v6S/FkHwyӟk| |JN*f_Keݣ#U>,={v'/<`- _g =oFN1f6wDW6ÃN}w :!NC.: $O~I=2Keѣ3 o=O Ih7@G#t:9_ז8K+KGӼݜڜ?lry<Ν;7nӺԜ^Ț޼[y4k)ʣ+U<~i[G&keȪ|hy.9kFsyo}"?@(C"#}΂zdԷ Z) tP}& v[G.v)>)xGak6t#\ro6;ll uN?DyЅg|9@/6 [";~{8^<>͡Sg6&|)YO?f</Up *vޭxaگ9,ӕN3g>|z^QYujN d݇~<ޏ귛[&W*Lc;?UZ,$urs{'>~P?YxڝjgLi}Y2\:عc]h)98~PxJ9!SQK9uSt}]e95|h+w&X\EW<(G'mVi9>[?HYy:e*هQOט_~{aןH[Aߩ?#; ГcnN≖6Ѻn~,]ɦV;}S6o2|\BHtV~z8š-IWΗ~~~`3Fs`yXl坂~Ɲoj\\=26s@Z?b=tǪ-ix+kyu88_69>VyE} rb1E |['֛ExulF-֜HUýy79M{ ^ƆaL]mVNxzdTϏ6{75w?i;xzkՏ(qU}{kWc]{t]a=08~8n/vxO'/KېKߕګڪytԻݓ#&jv^?m/m`^lgN9yb@G#t:@G#t:@G#t:Xj#r0_g#w}76qܳPj8(D81fmVGӆ?t=l>K_ }hsMJmX󥥍j/6M*z6TMlk_X˱چ8:-͢<87_bŠ>6 Ѧ>e` PNB6hN1O>mx&-xG|m$?z(+oӧBBt--+^FKT'a(&2<ɮ~xʇWYpe0x&'B`7lp??䵱v`{o[ ;iX8 UOQ^yys._m&h-?}?T@DW%ڰH]ko:ol6>tE@CG#t:@GMG\<0kYۅV47709|O& -5'u~ZT]4HQ?$o>O?m=b=aϡ 5kh1׷>k}OO-o!o CP=8=?g9T@/ΛS5jm5P`]?TsR8x?F'kKv@q tև4eB+Iji+2Ys5Do֍Lad'>:5'4؃g8ȓ9H~s쏌03-m7[bSQnXыͲ؆>b":-8GqY9v/F/s| Hy5Uos,<'\9Mw/./rc8s(qGsX(g"g@g(N½SeҺUP\NvU^xZm^oj#9(gDPr]Dqo]UN۩͹ rLkGyv9r>[ESh9Yqrz+pxsEzMOr/8+ NԩjGu.}Xqn?6I(9d]9gqIo^rfG2E ́-%K}ٺdtr?y9wm;q*V͇+J|op甘6nF矝tSyl㺖9Ā>/T̍!~9߫][:\sۅ]w|o0Vn9zrD-jsN ((9oɗL/o>fˣȥByrYlBIll9~Tr, ٢<43OD5&N}='4ڤns6C^zLs|dyCoޥS¼wŴn z?9Q3~X_Nvozt[EN!"4Qg wۭWDNv6?Pqb ?ū5ns\|)Տ2/^“ik|InNNn<$[xcՏzy^ڼ{n9Y1Mof'2 ڄh71-z6flp_m&JC~ :ڄ= H~~=zIS'r+Mݡ'f>w ʊx.9#t:@G##9y9yuZRc?#u|ʹ' ]g=!WP )Dך]J)w r$#L['r>[)V(Nॏ>QcAէg,U'-?f9WKXmixCu\Ypapʷ6_WKA9]q|(y u U kY/|Sy9U`Rq]z38`lVÄfڃ-͵#w|d] LP7,8d?A3QL`>]8]y*< ~sgnM΃0aKc{Il=RάNsE {mVNO PXИ$ ie Ǫ~zdڰ+0EX?#p_'oәMba,%͗ѽsC8pg]6ϥ"~ٔ,>/Q͗6ჾ6 hutMC#soQ\To|荿͎6 f!~69y؅>dwN .;agR1N=_K2x:F=圤""|l0Lk"s3r+S'm0cM76P΢I_Yt6FNghjY;A+I{MrI}فgR62]>ZȉH?M4c3CG#t:@GMG<>k#yѬy4sxu]9=KlB訛y8ʚ[~&u?3xʓ!I{KŴ~S;|.؋~6W<奢 n;mr|9pP`ߝp{RZU9\qolo=_<qٌҲoV?G ΐqZa U9("ZĆ^8s*nAS( @x_~N6]v#_s6[{6u"Vٱ9LrU~Ɍ#b} Xr?Hss${;[D90wf(-slSNoK B Μ8I]OAy78>X47.5KAY4RsHD׿eRfWo9ʼnr g#e$^*l9Ƴu JsʫkN7Tegvu)ڱVES^;5*\Im(>[7Mͮ9g\Z**t\nÅ5^0d'.߭~X߿ܜ#?mKqMkq% WtKXX<{!~bCىoY$y<9u'=#_e F"y} +<ڸјg67Fl,--!wfBY4Ҝy+?ju{cx/lu^CLVFlT>EwDu@?')ˆ$GmzPz?lO̜}f7{t:@G#t:@G#t:@G#xtW F0:@jSFP|!hcMܓ=_"*Ey<\c6z.]HmDc6T]_bfa#0]ϝXaS$G##64HȀS`E<;t__z48|򰵙'/`_|ŶLx,[;8xt6rj˷zok׮5<a!ʓL0 yϴG6y睹_l^p dF#6l^RMi0ҧy>ĬLmillcfg0gdT^~~T=elG&z;wn3xh+m=`ʾ.'zf箿8&^"L`M Զf0G7c^l/ 'u⩣#uõ;8pTulWiH=;Si6g9 IL #YNJ9k\{29Y;)D٫7EدWx/apNۉYȪzVnɈן4֜L\"=Gh)Jz9lɥp7&qpݝTgzytAŃҝl?<1Q?Pr]"Pn 1t =bs&[=-].qjpzhC`:w|ur0k ڃ#nsc81rtc.t;RW[plUsjaQ29rp #)p2'8p[Ns._j6 {~Mnk㥶-pD~okrǭآT_8&ФOTjӳuOTg9-2sN'gS[60iyess0;%wVsMv7UJ6}TP1 Wkn5SCn"&NÆ<=g_ m{EG#Ua^gK;fx5V\+~ ) 92z={ì2/{t:@G#t:@G#t:@G#xEpW+}3lnyFYyϲiLx eRRlZLj'U}h[_>ژflPm=!l||6Li"9m7 tXGp-taS%|"+<>R=-A٠ٵU۴Z/{!ӥKL(kC/zs .6~֦'\{y.ʇǘ'~'|V y85o>VbSݴMوg꧟/kmVlcN6!Kc3MUrb{52'O# tǻ@G#t:yy<'Pk*ssDmcseO8&n;i~/[SYcXØ\ޚSyX6<:[W>٬kC=}kАZ-kQ^:)מ}嗭 T c4r 3p2>ͺ741N\^h8Ъ?`$y8ЗH8 
~Kw \mܩ:-]c/}'}[!^>i˜Fl$r̋l,}^d5.6=N6:kom,O[[GxpQj#(:tLEq;Ə??faM=>Q? fO>xFR2'ˑt9Rq`i'MIJ䄾y4O y` McsէNs*TN[b6 5 T(ʩz˱Rg)08 RnnUxxr|q.VodlG*(%'9:mO_?dx~c9(v^N*GV8QM+Xď3'ުwT9iϮޙrK\q|lCʮ8qnSNR:o=G99='ٿ*L3Y^Xj@IDATrn8rTF]LPIgVAC뽒YG~9-$ڔ3>'el߅r&i,/JX 9?~`9.K =|lh?yPJkvGQ?6[}iJ9KzƢj5Ԙw0;s/MG.OW֬~/>:[rt]uvz_ 8ָ02VfK av%z`Z2VsZT'T[-z?|x~PR'/{{~p¸18r;t7g yX,?gɃ@i2NЮqИ=7~^e769x#˻hz/ϐ(Vl9Wd jٳ7j9.+ї#C3rΧ뱕۫@ cWNKeC{6ߏurڔ5Ol7*!ا9_ֽIFVIG#t:@G#t:@G#t:+B`[HG0_/|hsmK2ÔS' i&6_>rlze(22Klt-d<d]]FE"BK՟a\.BCP_$M{d-䳉7F7tMD>a6؄f6)JL~^tKxM>ס'>\tHqnSe|픎iÛ>LӦ cymBK zyBlI[t6, 'ʫ'z*oNТ8zNܻ;@G#t:3Y3edmnݴS|X޵y}֊R̥ke3\9|hH-r:tG@xMh=H&oyfIRse'C8ɕk-CetrEei@G`*骍܏I C8,Үr7$?/ζc#xe tmZ޲e"3(xFOmB5Gv㉵6նʤ]C+ybt.v+*W%keʓo+24^2pObEj繼vV\PN˺:y(:s 316C43N~DUIgeK1 yoٟ Ffy쥧Lel.?A8"6al8=2h]=Dgrr:Ppd9[=-9ɖ5NUhE'.r>xpN9.G8Ǚ٤rTiN9}>]7 Ґ>Tw>Ve^i0my^{KQ̇ ~ݓ7MݓrdrplŞ㽬l]_7;N=^qxpvSWDj) 7jL[۷66ݭN/`_pʡӨ.l:B6S|N<^k."0V~XkLJۏ4_<} Sgjn6>Eo: 8=^=W)> հڋ]؆:>ܫ:)N&raZYuG#t:@G#t:@G#t:6ݱxYm믷O;F@l SͱF݂ŃpG6 Df/ц4mƺO1 xo6+>h;RasGu]gϞmNچ.hx'6wj'On'%hCv!Sm^&~t9/^l)=0IN|f7&.ISNM~Ԇ1O2!XFtc'艆:R햼p{ﵶsx*D ThdžN>ʻFsmm2.\hȏ&z?rDo:b6C;hM4*ʓLa7zyG#t:@GuFuu;(S0ŭԑk>moNm~mwߵ5g*2OYH3/z[PNZ?J/8@zO?O-uMvk1zdm@()Ѷڛ >{ $ drj6T|듵yN_,H&W7َ36 6['_9\/gS뫓u\"h6v'['_ݨF+m2T{J9~Hq'lY[IqzFo^ncЁ#?̜P\fB;vt:@^3O܃bmc,9?C.4֫KR<01>տrxac[_?>plu}*B-I'טɹF;&&;g&,ƣ2mEi>ύQtm{żwkA6\dsw͵)T=FoQ9D0j8eˏRl@pN%qQ˷RPz^wZ{ӘA-hޛu_h.x^=t:@G#t:@G#t:@G#?y>9~o<`aTRƵuvϚ-<`<^uul?idM:^E]Bc/eO^{'dЏ#ixsǡ8)z-`\g}nWdl!F0"T|m>onUxiw}ln1%y~SgǷzS8ܴu-m7gK$ :;)p9lRM9u9y\@wƠasēOb=>]ؚ8'aWtVG 2]_sK=&kX|:1Bᓓmr-G[oa3ǮK+)+鉟ld9~]Nd^'_LzP**uAѱrHBs.V~9tpt*F+}uI2uOqY{IZ,Aa͡>U2Qiv3ñzr(v.gĝ޿eʲek_Yb3`UVV '#'NmN~@|us1'l: NDEzik~QyQM;Olĸ4E1?4"G cAxI^yrצKh Ǵm6KEYmlae6ASQy| cfS2x'J1 3q}H:.[>y=4fĮl^UVfi+4ɫDm% /7S.O$>Ho:$O@G#t:yyhe2wOB7[e~:ϮaRVH=B3vJ\L5FY:q~ MS.Yx;n ~w?|_&DqLqq?~2}VA9gR Yۥm`6C6<`޸#؎ b/u~l6L{gD]}--}4a&~ʻ~qoYl\ʿ.8ltG:'.A\'g/wChL_j<]Fb:GEm՟2b$:EёNnpz H!i=MS8E9ɑ㱩ܡ‰38'>*G*aiCNg+5D49&Y8~xroԉɜoc'#Fų90Soէd ;lb5`r¹_כUlH'}:\Ip]8O-s_q YǎoYD?[TKcو 5^9h,p,}k-w[Qs,\nL]Qy!CSwoxT[Py|^Frޝ O2Ynx&)OVg?#y B.O޽w 6٦ +m`N$iyCpur?zLʢ^h2tz@G#t:?*GKg^yTȼ=+Z7=[&dx>9=:":d<87!:uƱ>dLkI^,[}2[k[g`|N?gqZ.7Y>'=B+A܉FS{cx>ǩgӎ~I`/BҔ7nDױ-\_ҏ˄~r/|ʿhq|Qc}ȁ(gK!oLi4Z<\i:{]/&ܧF?ZXR>W4gj'LIʉDu hoiח"xuϹ{Tn?z!z}8ƌ:b݉N]6hS\^-:}J$6b\[fym?2P Z R߽<NfJ l=cOYQ@|(3ϟo'+:B~N*RG}tr1<ot\K% OuϜ9Ӟfy^reg)P )zor7b#w/NioO< S]*_Ia),a^iSh\jw.Yъ}%ﴭ{E|;-, x ֨RzJ)):x{N-8N-' [dc} ih)_62FtakRL0=ʫ!A^`WBZq\PxDl+/ٟ626op7| }D&#}v_Nɑkp\.6u(dCA˩"ggMVtq)·EA2(r3c)R8(dhNUڠPO,G!A=:8 _׿X/ܓke[SʕNwH+6[4ےX|P|Y-,>pP.m~?#ΙӸ.6f7e?:pPͧ ?HpzꙓջAjl0]Ey<;ujZa 443 杢a@G#t:@G#t:@G#t:ݽ^]W/U>_ .]{KmŠͶRv 6,hӮ<'7-ۤh"6MumRmuM/)w'zƩW[nM>ӖQ??ZKl%Sr??[}?nlCtGF_W>`φ_>}E: l/_R`1 dYw1I6t/^dwI64ۦ?6M=v'`;fnzyzIh?>&O&z}_ȓÆ{-7|oAtyFM}02m]E%+47#t:@G#@֨t>0776wvZšMYks|sksSZzͺSsų޶斷>FzGj͢u Y~>PO2G7ޞ϶{ʣ87:>gP^tY` wk\9}n_b_C6 g=u֧舮_6e֏ڊ_bÎgC^UwNe/= oil= gqyKVhk܇>#`Dn))  l&ٟ|om\:+ѡ1.tul{|^z籵Qrk'V\nSn>##w3;(G )'{5&(N”rUBp“,{f G l9.!ϼ"8#~_Μph/+W{^opk 8͹u'gK3889>n Ηe)a_>xQ@3Í6g9v9vG{{*{3|[iCy<&B;3ub'$2'3Mrk:Į>9Hg~ʣFll6;+=:@G#t:07g667w˼ºBq4\ܼZ>3敦sLg.;#FzmCtAdf#MYG}PFB](hNm-B~f ꋂ{ze3uA>{oyu=yQ^|8VU?'+E_?B=} D8xyn_[h:Qup}VY}G6(/rkT9HKsboҺN-/49|rd϶y҉ū5$9et4ȀǢV\yi5=޴az'uRc}99~9 DXhX|Nol'W}N|ʁ6lG{)\9Ӊ::٨uncmr˦rڽ[+kPB2 %V\9X2 l]j&?/`$W6 88.+rNXu3UڢS 7Q yUz ~^ش~ʖN3r$=]'C· TӴJ[:A@fnz+ƨSjU3\(iG}0ƽ[=}e9u^{PT.⩙AB~OcJxOcݔ_64{k߽K,Gc#'Ͼ{˜_]Hs!*mj'N,>\kʣ3^s6?9枙::@G#t:@G#t:@G#t:p_$?MgĚ_QN{Mnhs,y6ڐB66)czPnʇoiS]b6&u1 +*sʌyی)r5SK:8xrtVn\){;E'J3TVlw_OıxryOvċRu<8{)(taI z. 
@ @ @ @ /E $@ @ @ @ @ @ @ p\@ @ @ @ @ @ @ @,/ X|) @ @ @ @ @ @ @qZ @ @ @ @ @ @ @ `]@ @ @ @ @ @ @ @q -[&ܺJ /ls?ٍ7 ߻woπ'OePq?|,>u @ @ @ @ @ @`UU{ U`o}_e-^N L_Fm:!=Ma>?lԹa(T-+Ol$x/@~-Dx/tv4zk 5 ?s}ZD>; -={?*So_ |&=gI%*3s6?mOK @ @ @ @ @ ӓXUkVWa- 1tWR l)3 .^ǯV[sut8~V)s5* Rv]Qm js>m_vz^'@b=_1W jLnWoj:wKN]xu_4Z~ @ @ @ @ @ @y&_-R@x~ KA+S0_~Y!BBp9:΄+ Zxi/T<]{ _TJA~MH`5|i{­-Wy^sϲ4N]W6?k/3Ok"@ @ @ @ @ @O@l-X~uBE?裳?p[?혗_~yU~=͛[^vM/~JqWM,Xs: ֫9=mW)xo >䓳wnsM 7[6?|ͧ6w }@{,.sh`q{k~U^wO?4w}۲/LiIyo @ @ @ @ @<)'%y -SpOG?U-yƍ-W^9~+xt`{キQe- u+,_P>z=Oa }ۯT6 ݹsg{ou}[\@_<^2 @ y3,nJ}RU'59l L<ߟF2 @ @ @ @ @ p5(\\8_+Xإ`T_.9!.iNs5 ۜN{h]Tj3 uͽw=xZܻBŧJ Yӗ3f=m, LzVks^Ot]]\kLd0ǴiWB @ @ @ @ @V,^W`Mo |ު(RH}.ܴ"-[fB@?T\l Gl> J/MpVw|g[v X_@=4ңzLhn}sO}\)a}6~߼ysϜy<2ϣdgÖV9ڮW^y7igyi1<}=om=3&$g3s"=Oz<ϖT}g|^oy}|IH5#@ @ @ @ @ @\N@rNQ )TA Fll.T4!/&F^m7MhiQ&tצ}Co9Ըޏ r뽵ٽog$@/ۗ+h3g4:yLz/zwES_20H @ @ @ @ @ @ ?+%P_}饗 QZ/ܖ4!Hݻw'l녂nܸ*emğ~_Wg}ٶY9'tͭjnA5OJq=Ը>+߯{z?gajo_SLNmNЯrv?<3/믿~smϘ3,g}ޟ7K?zL(^{jh*ꫯ[ eCu @ @ @ @ @ @( X,׼¹oJ,X`꯶ea&T ??-\֭[[x0eBD(_g?ۂ>Y@䜯?/v'`iE W[P;wq=_wknw/t_o~ uO{嗷~_FM\@aG[hDqz^+Te}ٟoϗBƧ|> V?O]4< * @ @ @ @ @ @]@*͞w)X8x~qTe ^q-kc~rA_,޿~_^΂J\ {- tu/bq7'?_af__h{6=R@גgC`B}V3+߯%?ϒm=ji>a7o~%_JnESm?G __1\WQ˳xSςtbv8:Vv|]I @ @ @ @ @\S.Ps=w[/;8/@IDATZ@  C53jrH:0rV/ޗ_~y?y;OsaNUt>zUo9띻_k.c ؟sڜfty5qmۿ:>4uyZV_xv_n\5NkzX8h_v^c^kݩ㪥۷Y?;;^ͳ:ܚ}]2O+k_gcgڮ öuz}ݝr_gb|k(Q('ype7ej|_XWճgty\8۟ó g۷PI (޻wo U);lBȽGݸqc c(t7Ӡ_|qvΝ~?Whj{Bϯ[ouonvDžj}`Y6OȰjsMU-jsN{mܾWv#5>WMks܇O\u^c}-^MObO?~+3ُm|u}_q9 .߽{w߸^w$.{;O>=KgŌjӇsOmo?S_wA< ~l UR=k}Su=\`Y z^m|w-236~f,/׹:n91m̲:zC,gn[v}iNs3dkaώ9% @ @ @ @ @8* X|T)&[@WX\ JkD>! ui WL ^tͅʪ 4mTKsS5wM?'xRs]c; \Q9[-Sr iܵT߲9v}_Eߍ=Q]{UsӓgYuaz6ڲzl}3Z6W랪g9cuѩe5qP8cV_N>{d4&;1vݟ痫u~LϾjۯ𼪮]cOu|8l'lϏs6m @ @ @ @ @ @,~g=&\Sp.zpES I( \klqs|tl!m}掙LbBpF2יYcѲ9gw, ttz]s}s/#ѹWHj?Ot:wTݍuL=um5ome[m_z7-;w7HW$k1U{~nnzS۳8y6?k9ﳝ;yTKIk/;ULSg˩/xٮ}vS2.;~ LՔYsٲ{O,moX}WԼ7}1qZgm7NgV猁Ε{A@Wg5o s%K}Ѳyc6}:շ} _{Qڕ\#62g\wk}Qk3o=Ϧ~ϫ94>qlg:c,g^}Ϯ\ώW-ǻL @ @ @ @ @ @`g%%[nmG}#)(T(0E>O>)\8 vL,&1Zn޼yvƍmn}N~O*Q--ݻw? 
3A/7u\O?~EypX-Br(\sOYOh͖4ܴ,vu muQ[GmO̳W[΂:Ե7Nk뚫pWu5kL9wLu]ӜwjuZ{e_9zzy|yk휝145TZj{g?suU?-ǿghO]߬_TgW-q޾n}lw,CcY}/:{OZ^vʴikncumMj6uVK덁:'suimןlαl}olYw-i?}8cyکqcWzq Y4[o?ĝ;w GZ6l9cuљkoW~^͵To޽{w{>Ys套^}gIϋqo6E}вyd۲C0Œ @ @ @ @ @<`=CNKx-QP0X?p Z~h*vۿ݂h RG?BU \ [hiPE!={Wv v\E`GSk?Oep?϶B:n2o_gn@ y׷PH:vB))Ʈkooɟ?ﵮ?a5ZV{}}~دls\??>ku5~mnZ3{ͭO8gP<뺷osM'ttpt{~:/˭uyںo:Wu'ͭpwڮ֙:d}[o?¥~w[4?郮֧TsrlW{}Rt۷o,8qESؽӳ__yeWk3&ښgښW]GG۲L]ǜh3~P{m<[V_˦1{|鞞g]˄>jsT[jm};irjz5uv=]C[7{y~/nqb}w㹹kkq{vyZg|˿˳??۞3/LіYtos@Yznd\-'?yQ4k۲ @ @ @ @ @ @?3 i,h؂8 ?쵎kޟZs_l]{z鼶;wkvv:'aBks3.>/y@\GijYt wǹڙqҸs/~- LKkKu̯y\QʧkfqP]o_]KcisG=}xu>mLl~}wi7ۮY8킘mw=w^WBxBW_5>~}}]8Zjz;45TkX~辤a'c7Ӷ0>V%=7mM[uWgώw-έ> _tN̡JsMg?movivu6OZ`vMW+ick[ˢc&w={v4nؖӞ7=/OWjcUVksSԳ{_o΢d|q˞5=znEZcowm9W[yE o @ @ @ @ @ @ _Vq@% 9նsg%Y 4܇":9]sj5t[SC]4uONw_O-Sm.yص~MW&h<Yv#]gtyU~󦫸jUX-{B9ݱM}PMyګ}~=?o<㺧y7K54f[4ݲki6avZkZg<ͽ.j닎ks=nS|zOνMUMͳ^_ھ9בz{oz#yNZ]cͽx?r7ϩYE<~ۖ]y>Id9n &v쁍#HrSiVoƨ5z[oSǃidw4M>/ڽgC6muö21?fia%>@ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ >=9MDXL@9_S#zbQZ_֕qM")>36=zWG͇~8HM'`?|Ν;oSyXSbݻ#ęo%V^'qx|!4n.q(bH^첵XlGs_9g9oܸ1Kݹp_3e=s5;g~P_&<{s/۟WŞ$]cb5~嗃… dg0Bݺ?qڝ,掏֯n:zy)g2+OeOO~0M'yْ%w͛7׿vFo8O|en>}zm s/}CFOE"_C_x%1u<1&:P|DC\aN>""N"3$;W>@̋{&lx|0DKA?ٵOvW~/~ꏟL,`iU~JoqxkN V!i9bbV];תGZMm'~;ps`[R3g>j'#vw=g((((((((((((((((((((((/ K 0%D4A@6b+dcN@a&+"8#vI&rLb?"9{."~/i˺U'n&BĤҞ~T5߲u9WsJz֫-Ys3]9N9ed]kq}U:}<9+)ӞucRs>pX_R; RL[_U_~6=szgseߌՖ>7փ/1 _wg֬47_g;]H,.}hWYx.kŕ֭7K./"cko}֚핺~lwe]ܙ}9c Y9gN}2O|a e][??$W?lƛs#oN~HY7Ju/{g-Ć֫.u4jR|V]21Q_HUzQ=g?2_Rb_|KَOusc믏:?__>!]/HxN&n'n_7?}ײ:a9nbΜUbqyDb"zUoM|VʉewJ9シ)eΣv_mZӺw'oXΞ(KAN}+m%{f~'Y{ӞNS%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P '˗[X#| ~!"X!T/0,b՟'srsԩuD Nw]DNBG `{O͋IOqbԱu/fVY֓{nڞߞ+ᅡ; 1Hl_bxcMJh,tw b?Ӟ+'_|v\tiϯOs`)SN.K? q޴7 Wq brPb8c.AgȹO泴-sj^KY3&rPr63gǽ] gʽgOԽoT%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%PanIJ a[=?@xBB"EHA(AAAlFFp%S$KdA Y~"E'"Hb"|插؆ȗsp?DG41 0|(s=zh5 Fq,)o魷u"bMYG{'vf1~hwDhŁ587zuvY3kO<4-gn_iO&_NXL8O&;SJkK2t_59Sp(=KlKtg|ǣ̞~e3˜˱΂e2oGJ_jduk_Ϋq氟Jw|Gԇ8S̑O?t?we__e_V7ƾῺvukg)B9'Lj+s3~.}5z"ň}9ٽ!w}pu'}ԗw}ڷJJJJJJJJJJJJJJJJJJJJJJ'PatD &4Ggr"$"X# ƈB{3rn1aG2FlD_&4#T#!$0Gxd˫u;sR &&4xԻU)Ĥxgaj̦6"#9[c?Yrv@fYcbcmğ:z|##ĥ9gT?mٳ'L1يckfXgs̬B06͝f6Ѷ.5{s)q> ~*y1οx`I(>ŬE}] [g{c ɾ}äio8˝c/bOS.Y쿨wG9g3)1Ū(6Sy.9Qsg;4QwOwz+ewVefݡ;;)n|n((((((((((((((((((((((@(}@B@PA&P&AY9"#Ј" "̡OO2qMGJǜY#Q*i׏(G& Ri<DZ͞?`~x& gp)Sq4"7g-&kס.VeidVw& rɳ;|㳻fvfsW¶.q<w~:GN<Ć93o9{'Mr}6խ K2WV>8g|t6Slm=!{9r~sߒ-Ms?[#.ws~}e&cgvۄY|un|;Ϲ띴v\ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ Cg/4EH':'OFAњADDl43&M_g)0vѣG[_} (_>|88&kwygc|Ngzdh*Z>!!1/n* J}M_Ė{9w龻~8XF715bT]19s16_[oһ&q7 o^_ ^ㇻi1I@ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ Wf]x0 pOF%G}'"t!j! Ş3o[Dz_~(g !! #! 
d(/l0 qg}Il>ָuagO?!4bOC}+m?s@Gi2e1 Oy{i#iWDqؙ?þyrOb#f,,6V?%ꛒ{K/[ON?e|ӾLemUe~s#X}^F˝ I|.;Y]%KΉsN<72l=saǹLNFaD< 0(B Gnr>)ݔ|&Vֱ5ľn{0g|O|on2Ŋ,i~O3KϻIeλ>ُq#_G/E?nqF^&Gl;sȿUcV=*;K~RSܻi}Aw|oo):޹KJJJJJJJJJJJJJJJJJJJJJJJ`7~Nͨ) %#dYHax/kߋMD8#m?d>=9涧!;%2loR?|%Qs^k֙U{w9[g*Ǒs3;|w:i6v3O@ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @K"}~*D)֥KH/—u8E"po~!va3bDHcb?z=Ý9sfmgn ;^b#l;wn~n߾=8Y0Ў0H ^>fF-&o,x8 dس1'N {ݏs8~9k3"ef;sNρq=w!1>=zuխut"oև~8JݙϻיQ{ݦs?m~wϝ[ۯJJJJJJJJJJJJJJJJJJJJJJx~>"R|P@XlJFAdABkx:w?1| C~7>*=l+tʒ%1,ݼ 3Oڌl{[wމEsois Ȼ9ּͽ7}|+6E0i/kvOŧ)3o˃!8?''I9x{S[bII/#ng|1g)kPJ{K6O~ fWM[(nƤ/,Uvr=_U)˵dss~cmĂOo޼9D[xL{X)-nܹ3'~R&ͣ;k6XG_=OμJk/[7>n:c;~q6R'}ʙV4x1Ζ=8_5ol9Ns|U|mY%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%Pǁ@a#!T!!V!zWYoN"T7J&z1ĉCvԩv }Z%%9 _W5Y}'&†~H=|~;CEhH%" x"1g~;睒8#2;}8Sɳnݺ5ΦΝMLoK̡ͻU)~YM^>~wNɹ;Yξ2ƇM_1u۾o; 'ql.JJ\R1{G\G, Us>}ĈL(퍬NT|ҥ?sٳgG~o~_X'?C>c>q?n9(cm|z dCbF%ݾIǞ8GZ&𢡊>h>7#s+￿{M9GۿJJJJJJJJJJJJJJJJJJJJJJ2+Ǝj 1g[UY)ŸՏ؊0-i?UF0Gv<)~G6:Y'[i_e2a,Z5m=x#HLc1'ţw;%}Ėqbmum@GfNΨR֓4̗9dik'6h uL5|WGjAj?g簴3k#;"^ w;_?)̛=Q^K)G %n|K&qK?Ob~?OD9'O kyZ~O Ҟ=ٞ!'kb[fgi_e/\oMYWl9>'ɓqL9$|g?/֑uץa)((((((((((((((((((((((J⣺sv"F;J$/g!QI "F$"4$$EUKVo?~umEϗ߿?Jof[j9 |q8-ʃNla69~aǯ0$n$U>Dk>YXx"뾔˽/^z嗷φDFk\ v{ \& & vvc~ʥlj΢lD9|Vi Ρ/Ϗ8[Voڙ@2#~|~<cn9?ƈ])99ohG:P @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ r {?-B97=ݤYt4n>=sbi\/c@A$##A"![sǶ"2o3;~3嫯u1~U#.us^Y/ٮ:c]'NmW^yedEbmD5~E(7[/_bHq'6^:Jq/n]6JI}y?e#ޛ#Tw;0Og}#;wl6n'@׶.\Y 1~Yc36^}aR~ш)vʮ$',.]K/m^R2y{WK\%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P/2*l^vmǞ@D-@&<ֿ "抰Ȑ1YnjY"clΈԉ!=әW/N Y ,cw?9~l~;zL%q,/^ES|?|䫯-1IN\LL+ؽ{WX/1t.dd} ^z8$}śqZɯ9UeeOs-ygy^IkY?k s}C"ܫ[oJX]/@;wD~fa+((((((((((((((((((((((F⣶cJ >N ##2 xkQYbugLu)u~}LǤuW{Dy~W=>Ʒ׏Q)meb 7ً-%Ƕ>bZѶLK?#!u!r60<>9}5z @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ *,~wk^#ΙKBɞVŮΜ#gkg7l8Ucvx[-".72Fv+bX,W_b;a?s^g_{|əJ ?x\JL_3>üxfy].+˷ZW־n~dٗ f7][_}5Uz泯 #Fh/m#Ea䫬t)~ZQiȾ)~~zK?ui'^}=?JY䜄\\;ٗrcW^L*QbN\ɉ?%swI;3O+w̥rax>jm+((((((((((((((((((((((@/.v ;'ZΈd) 8z^Kxodum 9bu~8izѣGCͻUI;їHz&$֚m:bbL"2K,4S?k5Z=cU<_珵c2";矒+񛏄|=W_}u?:{5nӬm>nXXљ3g*.ܹu΋Swfķ2vzUɇZֶ@ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ Be' D$C&GA!N\8#Tq9bpQMV% s|\5m&2|B©zD< 5{-NSly湽uF_Έ&3w|mc ]^?Yl攭/("Q~uol9޼|}N ˗_~yg~'<RWSۺn?pw@gW^{nwJJb1f.ɦ5?ͥ\%x;>{Ou"}Ncμ~9OK? ¯\2WY&0řgBK.37 f΄x_\ 1uc9m+*O˱I|=*Պ嶩JJJJJJJJJJJJJJJJJJJJJJ^?/;YDAXx[[-6 o"2Ӷ.OBb?׍k!@$HE%G8ɺn>(->zmlO).ٲn)sy4A3~)s,!ǟG^~u^y啭^{mϝ1= 79-aO<@Y_ba"Ҍc_}3kuIlOKu{Ȏc3\M`?9J sii_=έƳgeum'n|ײnvccW.@ġ<]˞s6%SWYc((((((((((((((((((((((xP"k) p`CmDDpC J[4Gl'_ َPh'My̛s{r؎ ^bgӞ}UweRf$>'eŹs_&}ȓ#x˜ꙏ|6lpVv~Z{xΙצu;e]ٯgAw 붞 a1Ѥe՜WaCi4RlS w*sĽ?s'泖1-L p&TI{(@.Ġ#1M[wSw'Kl?}PKJJJJJJJJJJJJJJJJJJJJJJI=z`lӈEY®?~<ډˆs7L;D5ID>ܺs+bo+_}˻"x7 ,:8m)^)ř=?-??j#e;%%u; Y}&kSwGG #+6Rޯqsʴ\{}Z&˶O7^~&2|6^*6As9s߳,vXcĉ| Y9)-qf]iago%3pG]X}biO'4߈7a9᧤-19/6ĚO7qfncKy^Ue<ͣ9 ܔ7{ql̕2'%7ͱqk웵#,Na>|;78˴ωMY<,e9+Gy^%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P/: _V ;y[o5ľwߺyx6Ȇ(O>zɶ[Ν7sfѣG[>>6ܹuFlGo1D@||׆꫒x'@e) q8ӧO!zE|绱z ^V=,ʰaeAgwygŶine[&} G,6k|0_j7/rm+/ϲnbpI'\#B?w՗)gŋC51s#ɾݸqc뿆ċGV7yׯ_mo{n0??س&4s؏Yok׮s鋾N5λLf~ |Qj?<)%vl,b/Rv]Ծ.e"tT;?uǯdեM>g.7L|~#x9qĈM^[77ΠXw6…! lc8?B'Ňu./"aB<"6"$㈇pF_aO5yXn{cP׾⃋}WWba.{D`o0ieq!*dl;-vA ج5.R/8FT9^N<,zf}rƈq3?q c9o',6 ֮_Xs|8{zǖ>ڵIΤ{XX_bmse1ӃK.l/.|w}G.>d3Ë]}${]/oiOÄѾ*c걏[h-Wu{}ʕmf*Sc@L]^e9?ۣ|bE{{A]w[O?Kν3.q_z7Ge:r=}.((((((((((((((((((((((@  !0##0+)0%gcEfJuB"5M2=%yNƘX_CJ V%|7O|m՘_y|4>Qsw[vuknvi2FKV;h;1Ӧ!䙏Y#XG^Jb I[җ2Ģ>W2V-uqn<2~C)8u?|_a)cNo.|6wR;_gqƵ'9~S6|~_KX7L,Ƨ?+xwl7_ӟ@:obŚ;l5&N S;>3R\7OۿQ=^&nQ҆oǞ'%.ΖXo=SlyThY%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%Pǁ@a &e`̢I"CdGV'!# #. 
Rf,![2Ky'Gؓݶ#D$ODխK=kP7v?0J9?+NdCfJ{x$FJX߭▨,1d4P d>E8.[Gޫ9Y<9sL2gs;姤uXul)1֜J~f #-Km>̝_63 ui&euce~ņ2skggpRmJYd"Κ޸pw3n捽،OqB/'^b\S쿺*.~>vnZO {_z>{=Rs/c훘a3~ׯro'kٰ |LJjzKJJJJJJJJJJJJJJJJJJJJJJlu: |! #f#^zO_vFHC A ёDCs1d[ݻmcO~>W_dV%DxȆz?^%?$&67:.hDENl=*qe C{˹C)&;N2ߖMsG8>2%f}W)/?g|^ٓ~/ߌ1'u%9Ʈ|ȗc;qfmΆso6w=+cqHdk/]~wL|w>^UgRİtad}U~Mɞl=5cܹ3J>➘&v&{ qcG|WFJg?$3wx's|Gᾚ}nJJJJJJJJJJJJJJJJJJJJJJ ێw 10ap2W\"4͛$l#8#MyƍwY qNDo;} 1tqx㍭wyg… B6J|0z4_VO>dilD|nغt҈Ts+cK:&ĹīkĻgc; gX]D̦UU2yd$ka기a۷o;gΜiÆ6CAoA2'œĶjby]G%Ok_2bDXWx)lN"mU~]gT\-#7m&`O0"{x~7roq;߾M׮]wT.}g]Ėãr_ߖ%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%p ]]tW5[c1D7g4ºX h̳A>L?%eKcn>adnk'¿QׇHV+{x>,}*6b ~'͞L\+Osi;I)v&[3yc;"Vg;EȆX23?9)+K8p7xK1/ǜHGFݳq}n'3 =JԺoL:M毬oO.=I 쉺JcAk-ͯoB]+R_f3ٴN6sY}c1nv:sVm -ԚyLl3UhqίOu#~Y*?k왧i'9Wkb7U8 s]e~_m%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%p\Tr\vmjebfjd⼴H޳R__,ig|6'_&IhĞ>*6 6>̂6YSlթ {V[ _&, Z|/% y%6ib&Sr7yV&~2u:t )6wm)~M)+&3γSJZ+_7wLl_%a)gnQn}'W eͧ:W^DX8sʩ+ݔӲK̇Hi~?xX]4OēJ.dٸAf~֕Iཾ ~Ӻ<5fIzWι?9̖=Pl諔%lls>{=]f-b?9QZ"| }W6}j=m+((((((((((((((((((((((@/vM{&@Ct1ȐÇ.=#4R'6'#ڒwUޘ7%ҙ3gbcSOs8 닚KK{>G8{DEs<,[=n}s"Hf3´ #?ZUOELO|2߲rcJMZyNiٛ˹w5I],]{Bj^U)ΚeÙWwݻoثF0%gw泏J~ϏSN =}_|N~Z~Ά<4,)#}֎+?e䰵ᗻT=2~*شΝzg7>x'%M6q<}<8>ٿ{Er`k6E 3l%L1unĮxM_0OR-KJJJJJJJJJJJJJJJJJJJJJJ88z#Dp'No~3x6=$%_|-%!5 kk"dІ… Cao!0N>{(?=}u!rʰovڵko62{pXDw/{Q'R}:O<ٺuֈAD2aFu![cP{{'1??d/"v]b#[&ߏRaN4_oKMZĪ+杵'r~/$ěY|WJ~1>3n;3?^p|9,W8K.m~ZGZW \x#ž9DkwիWϤ)Oںݿ#IJ84$99s9lw9G=,(((((((((((((((((((((()^<6]1" A! %>̢`#&ušv5KDy`-֕s_>|Ӿ!Z+[\E8ŇX==qg2ȼְkqAd%7sc3iX,Rd6O/|y&Wzvnm|͹7iws6ϙH Kܚ=[eSwwQg@쪟󞚇$usO(gD%|H{gk~C@;wFL`i.ic_Sc.~ =TZ짺dg)uQ{Sr۟U~c}1{ly7Ҿ+{b_IΚ:A}~VVG 'lͱoyĻasS%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%PǔU`@ ~IYzD^aE2:??,BJ\Q$Qэ2Y_Dv̶:.GKJk0'[]WiX2"SWOvoa nl*ӏĆ_Jڍ7{U7fU<: .~sesߧϜp9()^ԗI?ɯMi~oήu'ōghS%´0sl*Jωޙy|/1ǦuZv}ϲ_3/$iP&W|Ol+ AVsfSMcbW]ag6OubS]9y3)8$^/YkڔڬX)~Ŧgqvs]u>zKO~l) SZ=N-:Ebobc9 Oa=ǃ%cAC/vų9_J#a'KasxMoo^FY_־iAs^%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P;S/= 8w *! PP_\ *8Su'0/uJ-1uEhf-}gD)G 4aNv%B~*!1$r)}l,/|c^X߅ Fi驯Jc߉"vⷯz%_|AYiaIx8qbݚ#L<'kRqf-b _'/?a1.H.{OާqH#V)Jm\=ʧJy>G/YY+qZ̽;ΜXR:ߜu%ɩy[lb꧿_vvmX %Rd쩳mዃM>{/n$/Č9au>i~%ySA&}1?$θ}W^^`.~%ų==Lbow{(v-6΁]X<;ed8 I`_oe'&$"TμRO߃,G y'!X5{2O^D|OsKHPebȼĭ&߲ gb;>y[~>gf~ǏR;sFHsOY|̹ ޳e̒9ls^|R7n|'әqsdܺd^߲+V:2ƸV|yϟcØKJ)q[lM,_;aʞe칬uou{hrاĞ~ZyZOΖw9>Wnu=wU;qsg~K$=O !F98%Ϗ=}g~?RȽ>JJJJJJJJJJJJJJJJJJJJJJx%f,0YDVq`. %~?YW"! ›`)S!_TEӇDGq&?Wdu]R1pCT%'uH{JrEX~K\([?u~ќ)ng{cuYL&3qA{\"3;!bMs'RN~.OmYC΃ "X;cmO~ğvs#47w8_uId|n/f~cږ~zTā]-kvI+{~UR=)~R\N%<1C=[yV2?﹘c1>,,y9wvc$*ķ1n=}͉R0W:%wۉel$I\Λܯ((Pdk}Q$ M/vM . 
Dq6kw=2iʬ]D @"D @"D @"D @"D @"D9z>gNc|X^A+#}IxG tH⛏OcG,W?ן:?Хw=917^Is\p]o3dL|Y⧤Nizc>Dgkƙ qYY>ۯN{IlO?}vA=sPģ_[>|3=.'?7͓s]ol7*_wq__;q?6$[Szzdߏ1&R"D @"D @"D @"D @"D @"D5 Si)죢}#u|,S棕)^.E s3A֐㭥%U;1CC9{XC'_eue~n퓟yn{+g?^;'ϱ\ ڐ#15EVu2hE"D @"D @"D @"D @"D @"|kN|䣗??ypp;"I׼޿l9o>I>oU#{%9?*mڮ1:$Y歹Sn]O,959JY"yDl>,^Lx{MOߎ9ޯojkf댼@"D @"D @"D @"D @"D @"J޺wk޺C #!c|XWkk8׌>e<\{XOx}utuucO+߫|<Oseyq,O}ǟuLc~N~z,akH,ED @"D @"D @"D @"D @"D x]_oڿp5{^ק%9~(|v5{ N?c{fiWmsޔ54D @"D @"D @"D @"D @"D @nM[VD @"D @"D @"D @"D @"D @"D @"DaIE @"D @"D @"D @"D @"D @"D @"DHSD @"D @"D @"D @"D @"D @"D @"#Ї7&yD @"D @"D @"D @"D @"D @"D @"#Ma"D @"D @"D @"D @"D @"D @"D @@ߘQ"D @"D @"D @"D @"D @"D @"D @nNo4@"D @"D @"D @"D @"D @"D @"D ?}X|cG@"D @"D @"D @"D @"D @"D @"D 9>,9F @"D @"D ѹ`@IDAT@"D @"D @"D @"D @"DaIE @"D @"D @"D @"D @"D @"D @"DHSD @"D @"D @"D @"D @"D @"D @"#Ї7&yD @"D @"D @"D @"D @"D @"D @"#Ma"D @"D @"D @"D @"D @"D @"D @@ߘQ"D @"D @"D @"D @"D @"D @"D @nNo4@"D @"D @"D @"D @"D @"D @"D ?}X|cG@"D @"D @"D @"D @"D @"D @"D 9>,9F @"D @"D @"D @"D @"D @"D @"DaIE @"D @"D @"D @"D @"D @"D @"DHSD @"D @"D @"D @"D @"D @"D @"#Ї7&yD @"D @"D @"D @"D @"D @"D @"#Ma"D @"D @"D @"D @"D @"D @"D @@ߘQ"D @"D @"D @"D @"D @"D @"D @nNo4@"D @"D @"D @"D @"D @"D @"D ?}X|cG@"D @"D @"D @"D @"D @"D @"D 9>,9F @"D @"D @"D @"D @"D @"D @"DaIE @"D @"D @"D @"D @"D @"D @"DHSD @"D @"D @"D @"D @"D @"D @"#Ї7&yD @"D @"D @"D @"D @"D @"D @"#Ma"D @"D @"D @"D @"D @"D @"D @@ߘQ"D @"D @"D @"D @"D @"D @"D @nNo4@"D @"D @"D @"D @"D @"D @"D ?}X|cG@"D @"D @"D @"D @"D @"D @"D 9>,9F @"D @"D @"D @"D @"D @"D @"DaIE @"D @"D @"D @"D @"D @"D @"DHSD @"D @"D @"D @"D @"D @"D @"#Ї7&yD @"D @"D @"D @"D @"D @"D @"#Ma"D @"D @"D @"D @"D @"D @"D @@ߘQ"D @"D @"D @"D @"D @"D @"D @nNo4@"D @"D @"D @"D @"D @"D @"D ?}X|cG@"D @"D @"D @"D @"D @"D @"D 9>,9F @"D @"D @"D @"D @"D @"D @"DaIE @"D @"D @"D @"D @"D @"D @"DHSD @"D @"D @"D @"D @"D @"D @"#Ї7&yD @"D @"D @"D @"D @"D @"D @"#Ma"D @"D @"D @"D @"D @"D @"D @@ߘQ"D @"D @"D @"D @"D @"D @"D @nNo4@"D @"D @"D @"D @"D @"D @"D ?}X|cG@"D @"D @"D @"D @"D @"D @"D 9>,9F @"D @"D @"D @"D @"D @"D @"DaIE @"D @"D @"D @"D @"D @"D @"DHSD @"D @"D @"D @"D @"D @"D @"#Ї7&yD @"D @"D @"D @"D @"D @"D @"#Ma"D @"D @"D @"D @"D @"D @"D @@ߘQ"D @"D @"D @"D @"D @"D @"D @nNo4@"D @"D @"D @"D @"D @"D @"D ?}X|cG@"D @"D @"D @"D @"D @"D @"D 9>,9F @"D @"D @"D @"D @"D @"D @"DaIE @"D @"D @"D @"D @"D @"D @"DHSD @"D @"D @"D @"D @"D @"D @"#Ї7&yD @"D @"D @"D @"D @"D @"D @"#Ma"D @"D @"D @"D @"D @"D @"D @@ߘQ"D @"D @"D @"D @"D @"D @"D @nNo4@"D @"D @"D @"D @"D @"D @"D ?}X|cG@"D @"D @"D @"D @"D @"D @"D 9>,9F @"D @"D @"D @"D @"D @"D @"DaIE @"D @"D @"D @"D @"D @"D @"DHSD @"D @"D @"D @"D @"D @"D @"#RE @"D @"D @"D @"D$;@"D @"D @"D I_n#D @"D @"D @"D @"poMͷo%h$#D @"D @"D 5>,5E @"D @"D @"D @"D kM@"D @"D @"D 7's7WD @"D @"D @"D @"D @"D @"D @"D^,S?rSr?gϿ!q$~Ɓ<3{˭m,E Lo}#Nu,{=ܿ_FSGXc+3Ͻw|j< X?Xk\Kw>j&սJ~< {v4emTݽ?XNZ_v;6 ׽aV{ _󛞽xLbIm=c䪋 kFruD @"5?9~%ks2/x=OUYgG>o97xլ=tn^C?e^ɷM˙z<۝o寇.רϕknkZl_\D 8t\_ksm y?̮{5!%WbpG=;ust6W9@vWi/_!+/_C&Cxl?O~ 絗ȕ*>Dͤc~Dnd}58{{??'9jﹸA],׽G0dlm^v_oտW0͍1T5?t˻*"?$;}-|M}tlwr,|w[47կ~ͯ뇲OIď} cy^Vޞo&]g=;>mثľ~{pJz/a :.E @"gy_{{;[z[{V&/7yȞ<zc~y3s~o9Bj/Krnޚܺo??}ɼz`#sϹ^JBVŊ!Z7ڬK;8瓹g͜ɛ3[3qqqb:ZK>ߋ~|=@"ur4yuFm5{p?SCto#?x7_~nk ^}޳9yX{YY^kw %vw}1.@n}Xuѻ@+yY7Vr~7c퇤y?4k$^/ǘ^ُ}^V"/~ܳXֺLk^춏U0#fwGb@"؏$~P޽s/^ǻFu^:vi]㧪[kTy/㮗ww?Ögw7K:|;3~#`e3;>&ozp?$`G[?9:ܵ5tD @.>}/;ynQo@yF_=8Ee_~ri1^C!?x\+>o_^?ό{oݮ.cʢD }w@IgǸEѻ5}e?󙝽s>?؞XY9'_1Cf][D @"ؽ罓iXZog^ᝉ+KUo_s+Kd[bܺ"9{bcƲx"DViM'rvxez/uQ̿LKد^6-<x9>CW={Or8@^}X[Ez`vcF|U~7zc:5RPQi`};^~l͋|uKJC`k3€1 O7쳻ں֏(Aķf˱DZkK/bkg{s^:+E @"=z_3a wʞo.9-g3n߻^g/vR|ybE{&{seS$H}e|=bZAYbQhQkw_ͅ=_ž 
yR"Dk$k{ٱ{IYrN4?wǑqKدjm_$?gFam{3={<D N~v?Mn~6`0y7Znɻ^rWMnCADzԤԽo-~(KlMhhLk~%`탿oPYެ[;*Ҿv;f[-+Az?xٸP޺kX>yܳZ =ov{}8ack׿{TĮm-o!kvHy'}~׹W)"e#`cIs<~bp@D Xg>\Z>n"D L`D/yY0~܃lʞ۾g}~/̼g۽DKx?rɽZ]Y]Gw>^%8w}=GsO7?ȍ5_z ek'?6ٚE`̉~w091֘kbkþb[W[1|@"|'ƳcrJk\Kqȯ&/ezMFW;^Ǐ~w${s|'7y}{ αu۳빬}:rG Jԑ0fCKɍ~s67y87[X;Λ{htiG;?]v馓?зw_DX{q^Z*[7K}Jy`KJF X_^<'ܻH֓{!9{)"\s]sdAVµzpٵVל]Ee=("{{&yA\>n5K}|0\gPOb쒲Vk]|I1)dzdϓbcO]D~uy^﷎Օ"D N`D/y?J<}{ܺw#gWϼgH?{ j{g`͙3{ϡ~ ~y9~m$r叅sV|?fxWGe:7/ W3>Ҟv2|slݼޯ}ĺ}FjMZ[_BX{"||oۛ{EM6.O},ŵ= %qzn\k{!%WN~VO=-?O.>z^u>}m,:@" #p3؏@4}ڏnvcFQe? OtkM^kwr6Ƨw}mI;HexDP8=Bq,RLe{'"p3[_vy-]w|l<[ʮߎKxk?>^#֨>ynkDt͖]NW,Msx&P칀ze@nvivs9|vw,YW;cD @"!iISKxLw\k<.nsfF$3Y<|žqj1 @{^|?>;߂h;_Y== "S:essM]Gl_>@'ȭw]D Nq\xsS,|޵wF2#8/iz<ש}c}Qw ]^ͤĦsK:D _>,Fyl\9O.)/ۃ)yNyNK39zGU>gNjer7~tWn t跗ꦇ>;O~MW[|Ս:ί*A={ 7m}^*6^^iO:s錑=Icst~vWnNW/S:|,Cs^~5@cU~H?}йOW>;5NǏD?|cy?՟?'ڙg,r>ugq:9scsY |,kwfȯ~:Y>}^;V~OV]ʳyynzNNб՝wP6^gσ36]=v||y޲qd9;?Ư/gs<眸:7gckcGJc6y2s17w՝4=f{6O;&^n:O{Wɿ1]n-M9Pb:'ox&cc_?-VRڜؼPwՎ7NcyΓt=q]yD.96&%c|v)uoX'Oug:N?ϽK|g 'Se>9auKt,s]؍=|Y~Io3bvNݙ$#[L?{ڸXx՟rqu+iy~~>'ϸؒq=ӏig1i7kY^~,鳹;>}u_^ 5xS278isk:Iz^dyqKNwMWVl/~\lz؜X[z\/;aui6y~q{v~>8G[k:QX6^].i>vglCy^ӎVu״Ɣ=>sc̋՞Ŧ7g8xsƎ q]1HiL&p^Of:=999>өOm>/OO?g^ecϥSb_3Dt8)]'ι|EKtK6VY>mVKJbb%űcs)_\:.`^ky)t\b^?yՇ? }Kg9Nk;y}9Ǝ;_;v{8} >igzmc6?<>cc9]c~;/>Y+vrJ}]cRSrX_+k-ge1qx:<>^ӤXZcO%e֐_kٲ^P^:˫{ɗӦuMby4Xn .R_#閴?ӵvti{Ͼu;wL'yv]}XPk;|s)3ݱ򻲛wZg:V,&:ee~ \ssoxGy:is1scK7?zO?v|qΫ=%7q4׹ss>e>c9W:>8I;~3й3[sγ8Jg1ڟs{uGYjzWo\UԻ6{\&)9Xϸsu8{.!޵-]Uڝ柺ON.19;gRQ9IϿ31Cʧ?[ :^gʸmc̏BxMbt,^1~+Y{#ksTw/Rv]X_RZSkv|wab/ W~L?u}Ϲ?YuڤzzW=l){.;xkmxkvKߩym郲Փt^c}Ĺ_}Wl}Ls|mnZKʸM'뇮SLX{u;xev/6V7I|\y?m<>g{s];f&O}:6u&l>l'#C7~Vs] 0 /%t0Q#Kl9\p4=m_h:}t|)u1duCY6~0w0|e>]mhwƱ~*o{9#/~o8y'$ّWvS@J,;|7Sm^J +bV'o{qinv|av;ۿۇXիۜ;JJlƦv.W?(?cGٟٛcm,!fvſq~cJ. XgvvcP'W6Obܞ}Ɖ"}ˋis[:>l'ϾxՉ%%mlƩk%~?r,Y?{m?kL&.Ѕk{\ߺ\ɗ<5ܿp8on2tdz9jLdes:q<76/`Ց&ʛ3y4/1>^mk~8fzk`8too:c_v =ˎ3?6G1oKʲ^\l혯g{gޢ*nqC:/?6ۯݎ͛vo9Ɔ3Xy|{~IoIXn(O.~cí=p>9/S.9ol[R^Zycs}o|\SysK2~|8νjsai7wEh??0e:A:0vdG%yc*KG~m͑|!]kImI%{}`?Ex$nG|ǁV8M89dLڨү/cv}779g~,֕?91dN<ˋakKJm׹>H'KcQx*/üoVƏ>o=skFYCkvg~?lBnn) gMg2~hKJa~{\?o86)yfͻsΏuڦg)qS7{+qkg|p`^:Fy>﹊|jΘ޶1de={OtR>b.>95XvN2?Ԙ{f{)rrc{tq榲>bZ{W=T '}i9cL k|be~XoHYʋ~.9wyl>$waDOsJJ]c+[kwk;wOy,y;iWy|1Vsb봯XH6t%uPϮuҸ>cu|}zQ6'νظVGLn|NBy$nvyجM.ɿqv,ѻy{+_ĺގ)~r<_v{Nҁӧw`JwJlٔ,Sӗ_b#W%PB8vV#q`V6\ 6z|`}6]7//v77+?Jb%R9S>7[sɸ?6Fb>m8+^:؎/?]űy8/:AWzdkĘMӆ>݆b'esk)П>ܯ谞Øj?~i w9 V>WD~|`%:ru1Gч߻XևHEJS^ +_mokvsoqHl2}uq>~|ކ_P?ח+oǫ#Jc$i ν.8b=O~ Ɇ8ߵ7}s,:lgevkγ9!E׬6u}f'ÏGL6o#ʻM]ҭ~]pv{]qwb8U8y·?lzŽ1'.R>ݰq;?t)ɿFG~߆۔ QюZmw#į)1!m(qеDƊtg|O'녙.e7쾔7fc(nvE/{`׍le~CW?|pn3L'ӯ솎Rss:qIqY?\6X*QVOJK|yX<0y#={b{>)l nLR1c~+6i{3'i_Odz3I3esdNM|"sŨ6[I_>)c?(/AO7|36Xocs3=c#0bGb_g>9` 5?j}쏺6՞c؛ >7һy5 yLq^aOW2t''Ƙ(ok6yrlGslL':nESV~'*ܗ+c6ϳ7{?e;jN8'+#|׭/~|hbo똏|yce%tc=JbioŴdi+ig}>n9(IpCykcu?gSYY?ok1]]p8_#=R~왟_>H>+냗ze}0X <דIC]5ĪL捵xM?:V^s{*1|_[Ə4Ƕ>#{=)1H o:٘+{i.^cyc% ;V,i;ؘvKtb(m} ,7^]{szn|=}s ioެ}g.ȱq}=~:v:oeR+C$]^} [f1&xbw|/?dn_py`5yi?a.}S>6gC:b<}ػ橄E }Aݸt񛓲4a~Mc~a2;z%Ÿ{e{h+Q&Wy7ƾ=ANc9XwC>K?/}[ i &\.R{{͑Ց[,7C:/ޞ86㆑b#kt=p[>msyJ75>|Ўrq9~6[[;s7>8, ?Kܛ86ޮb7%ǹs~㥟>I~[54?{٥{ez'|ٽsƁ7w{~ksܘsvA'=6wG|׾.Iw-X;ulLcϕc_8[$~mcʖc{1?+c,vX}(ox*K$~aߵ;]nݘ70s_mnmc|&XSL?>й9-[[*19lϾem}O;96|`?:˻-v֏tc1n|gWM|Ƈl>\%e>llHvĆ)9%̞W援x#㧿uߔMe>|۞gŹX?//ܦǜXn ع?>7_ۼVN`C2fعغVX8=cn+K1sKžccu8o$X;Kĺ=s0Xl=;5)oNh9u%~/%~,/νslOp(oCټ^p#][1#sI-)ۛlY*IrkI:a~D~IN'f]|ӆ}WyW^<6(Z>j;ĽC=.c3+lcF'[ûow#{h;>Zhfl̼ƼvO}.އ]矲,v`{!z%eY҇=mj#kķZ߳y>e}Ⳟw lΞf;a}iޗ(s~V_5 k]UR"p>ᮽ?;"q]&wzS\tw[e}va\.Rk7ں8,L:]\]gǹǒz:O}ZJIS6?|cl\GbpnIkۋ"mkqc͍-md1cz͖2%i]]PϦUfS:Ż2`Ϧv;>fSlʋ=e0NRT͎~-N}[}ĩ-yھ˱gkcm-sJ~oCQxcN cH(o?g/8~y>q^s+5^|zB:7~G~vq3>l!N[Ο:qŽlxՖ3^IصYb(؜/ciN_bgŪxC?3v-\;~&n>w&i9bwR:׀2[ Z#K8Rk֏ 
s76Ý?o_z]{>8om8-un VtͶ&̖&.?>kw1YW[|KH~(kL7Ϟc(KOCB֎X.(KKglǕd\<>2>ct]?9tw,a{l<9.ƗNי)VezgTuMI맽I|y|7[s,;/沲=7%tT6AZ5OEk#i`~=Cl<9$_q2L\Α8ksoŒ.^c?CbvLײXݘ)S>)S>zkfe~hǶĆ=̞XXgIH^W/3K+8-c,)?ٗ.Ta <ct9k/cƂՑlA8?[}ڹe~6؞;v~mQXHIsioczݧ]]˰DC6>m|9c*.X;wCI|O~̇ yec9W~*gva=|>ұ6y]ɭW-i\976aoܹV~O'_#q+[2ߓ/t*-Na>so 91?p,!vƟԞuϣmK}Cs|&qI<2Sc6oqyόg[e:CjOz-C7>t;yH[|w~N'ñ^f[/w??7^hc~w3gq/}X lu/k͞?)i8Oktm~?D-|ՑI}'>3[l+O;y6f Ҹnn)Oǵ2vZAj;f|tw^ڜG?KO&'YNfc6 ;'4?'||xƟv81"},͞6;l#/xKcu䕝{o:?Y,x^ǛWʛC殾Kc:SyByvs~]M!Sӱ68)Kڭ=)[i;ܛݧ٭C\z$V8Dh\7]doa6\Hv19m<;nvdc+.ө4&F4?OvSx73⩿sg_~jsiv[YJ%k+A;>g;&%>(/ImzN;G.?惶.8if:G/վ6josF❜M-Y^{fsX&WsC36JJWM#)_֎r~:oXz~1L.cXYZRճ~gu/\[%ԯNq[?gwRW|"_{6{m'/><|=iwN[ϱm֎k>$ѽXٔ1g~lϘ-9gӦv>i:)ϗ=:<ӱĎ8w[Sj3QMjg1:`c}s1{>\"b~.Ӧ>yͫ'ownsscnNǭXjTծ|oinJؔ];G[l:?/m9my}n!>9֗RV4}<<[xL-=1l^3_~}h+8k8~<؜q~iqM^RW1xv\iopN͛ST7>{f]uh'>*uqWyqN8|ڜNb\Zyr-9'ז='crfC3>k;ڟ}v}cuOVf//eiNgǒeNǵ~Gښ;'[XI?W?q~s#X;WΏ~XRx;;VG|~5> ^d㚴Wfcv;*/ 9{ԗ߳sl̶L:Yiv&g]?UƋ-%1OkxyׇkZ[q#GtEŽu7Fb~ߘN;釲4{7Vc\!-Ƈvϲ%KcrO;w9OɑSh ;i~t|=#gGz2`o*;hؕ_>g;&ͻ2yc6H}}8.cza>\m[//)ϧ隭6|Cci1k+=U8'cyu?k֤^?xW?}5϶v9;7έ?~g}_9{ܶפֿw#nM?@ƞ&Vϱď2[gpL}q[MX[[Imym'S:7lٹ{HyVKz73'6/̭67U?X̖k'6c889Ź8[ccyqSKKn,8ʒcycߟ;Ov+q޷<ǀfOYL6;OڬtjT?~hG{cvez߸hDncm%l8[)ƃGc:l}nIVINKIzyʬ.#q֊DqoVL&g#ka Fy|X?ZCs|?ۖ?զw+̰ëV^8l]N厇jp zq8q:Cwg91I|}K<ؤ\[ >ߺ,fk,FZsOgeh\_Rٛc¦x۾k_d8jxdžbꓰC Ncqq0[=)wRk}7Wn8l]ܿg(w.|gwu>|XOԫ_թp!وaáԈ8ʞ/bd|3/? W~_ܤZwՇagiΗ"X5}Żg96ZF0V"c{N'<]}clţֶYWsZl{5xa*Oejn_=  ;oŤqF\y%Z_ij\i͙߳@| \bŋ&lug;x=R^Ɂ`x,Kg9j~/LrV[HxdMpظ͋eO:nj`|5'04.ӟ>Tklz =RsE/߃IinbWCv_{ֺЇ?_rۙa=ߝg^jOAî|v5t9{N^t{rOyñ'jp+k1^b{z \{Wݍ3r[o-_vWުAlQ-zŭ3O-;8osԖd8٫!|Zww9<|1WtK-ؘ'rXS-k|):^\jPNbg3SY|_-=|9<%zwz ~m^ |'Cŀa=B1 Ǵ@'v8tWO#WNʋ566v%[WgW^(Lfr5sxnEru#7Ⱦ=cߙkq778%>"&yO?&|kĝ8[;?׸vK:OZx<_gNj?K؝)NXwwdwG%[ߙ2fjŃSWP˟`Ϗ6vմ>}K`hîC;yQ vr.7l'X˅Og@ÅgO=>_/5s85vfSAx{ \`U;ros/}q?̩9Ziмqso=&j,ߗA;0qkO_։ϞxNhj OgQ_rS|}_+-_~`ǘੑ婟/{}`{-.݋Cd[jVK<5 6č^MΘ6Vqrx?q,]w&;~Gա{sh )Xp6|Sxe뀻u|g7^}'x?bm$j{M~:(v}c581l >\vekN!{QcW= $9SԸ\t73X ٝeN4}X0S=:sgsʞ mk9zgSϤÜ$j;i"n1' -)oxOuK|oشo4۹w-rLl{kjW &;VZ<:5x򏯵bٳeM8ι>6 ֻ_azxyir /m}QEK1}y_ً4^jVqawG~%C님5/U/a}6w?\xO~)j^}q8K{e󟯱j>Dl_:hy<﷿';y˳_h'Gb~_???INc/-vuK~n?UX4~9孯9c=}{يyK/ lŤqXLw73x˻2oӚjMW`7יw/1>ɓ.O{*<_Wg9w][~rrFqbc-δ:U+el߈՚ ?Vby&s 80/: rǶ{#[]h :Qr3׳ڻSH{No֞kxw--78kƸnYN#[71~y;>BΚX\,gqaGr'Vbc4}{f~;h]N޶t#<M] 6pԑ#{nYN<βo_N"FTܞb98'jXa}N.8Iw?-Sp;b >ZCc9CSW|/9륍xpֳcO. 
w:Oo5C{!{i4?>=oq]4EhsjSVvxW~NcNGsWhf?}C!jRˇ܊8;$6YX?/gywN&z\80;$8ynx?W>Xtv:{Eغ{nx>ëro޺u\)o5QR==ZBMOq1ȋ=D8zՓFXk,F~a#&f9W9 zg>pUGU;./=;y9SQWqO+?-j+sgAGӯ޽qV7wHoyw_կ;ur>^ڱC_\~va[}TK8@M i$&;[y1~G)L~?r3L:sΪ>^tu}̾b|u嬈͟惓a|љ{l8 l-1;ppҚ{ZU{qWK$β~(&]ܟ9г0KS[4'=Ė>qqUsx+:k?W!h|kJ=w;ĸ0xMͶHGw^m,gc}qIiuSx޿S{M=g̙sfkVwֈs3Йq^:;홽06.{CoGy~/>[a&ZY>j;GpkabZ| ?|<&'N},e{b7o9P} tKWoy$954}[l9< ~76eܚ~"Kx|p=},˵{,oOWCuG o_]ΙxW{S9s߿s:7qqܼ3Ҟl.W΍~pg`ʩQs7_}Oi==Zt#ܬݒή:~yhԨ}k5Vx}!rQ{ z..u5E :{'[g[TʙLykΣ~ԽcuՔOF|̽\FcCo/h~?qq3{?Oi?b }R}'|ܪ;u~ߓV+k \ͫ7ogo^|y߭f}rq^kaohixP팄ß\u?ՙ+oU S}Έ>~ޞ'} ζV+S<휱=q YG5YꧯNj?;énbyX{.fELRCZLn˛VOW4)0韂'ܻ6_R=ɹ:{W'Fp>zq@׹ї~wA-ΗggkW{{/a+WHZ:qV 9Zr}ߧ9:΋t=;:w=k6oo`0n曃Wnb'?]?sG9G6j'?M_8[3ݿ;ˡuZl=뻇؞{̷gy683sҙYw;/;8|4Nuqg+S7W zQ8'[=sicbL{f[<{>?+^gW|{o.rH/l>zoe)6[| ^}2]𬙫YlZ2S_,/\_: 9EXa"bG|Tj,}\ŧH؜-w>}a/O5>jc\WO}ͳՇE~z<WL_&gLxl¨6-gڜ:N$V_~^^n/^匰I⨶pu:WZnڊo5C,ݑafÏ ?ҏXZi1Ł6F֝1 ME#l?~X [\>˫gԁUaKQx֪{ ݝӷtkz~/}'0/V&YY]Dł[LSu}}?LLz>giӞp~_ϟ9}B!)7x'qæ50WAu9//zuՂFXZvKش_|kT_|r_yhoorR^q:xVL;5|>Zk?bzVn/n?׊+> }"n~cgq>nyJG]9'?kx,3ȧک>wwnɛbY+O瓶+Kliٷ{d'f9'ZN4 p:'<[Q_͚ZByonmM?sj:յX۽ӯ姙So]Liyx߸=|k aڟx࢕+[; ˳f.L9uGaSr4g!xٙW79b_Ϸ}ooω|ӏk'W Gj[{XY?| wIlx[\?ΞQ X[V|1 ~NUg<`!˿8+c댷gpae9^F+O{,xX_=s?|=s&tR>|s`cWC#xקWKWagp˿}O˽K/miqˁ b+^5p [-_zsOjr0\C{ .]gOx 3`X'3&͉'{7?Nšac/9Zp|{y:o1neM ~ ~'w"%j](w6ϒr{ CI57OyNCã&l;!6[X9e;_1]_1)/%|i+ʵ8Ϋ>|Iyǩ9[NbO_SW%8&n=ϹppOuVݾwNYšZo^m~żFܷ_. {_닥n퇸έ|Gbn.q3\Y8yΚocsE.='=gԚ:؏ZgEqV{s0˓x8C]ą\qOrV=3\9ӯfoaMkW/ľpG'-QA:W<+.^_ (X]gkxyڋjͦ:~-_݉=шDKal17k3e8}.#}뻎qwWz⡏ Z+OK;gLӈ\Ƌ!z9Ψ|7Ř8x.$͇'ycO#=w4Ʒ5gj5X٦6b~u';I\T☃k63V;O5د{[xD:x[׶o5k _vVbʙ\n/hq{,WX0ťZCl1a\mo8VN_]k>$p1 !x{_@?K jնiM0Xx Ø 8;{[qNxGR|kxޫYs墯k{SK dk`ܗ}R=QD+F\i}X;+pN yuG̅χ.VժqÕOwV ~{]6W_ٳէI/FsդbXY&])j՞xXhVocx\5g׸|äo-wsg /9kqѵ =5$bXX?mMΧ&>8)XlO\1O鹚o:8/~fLr ;f[oZ;kWƥ5'qĭY?[4>'ĩqhm_ۇu>Hx7]}k&O j[}¦y/oGTDߒ||N~CؾZ|~|CWlO|#.598yM%مE{g sr0r[S s{A/|]7|a{8x[j[q5O`Wţ:W8Z/ll1˧zT3k:'}=G)U[veo|i=Cr+?CMʛnUi1KeN_>ק;Z^jVu sfw'4-?[}68],V-Z[oY]i?[g 8TrTsvƀ^5WlkS=58ųbT+}6نISz+.|b\ň>.q+&04_L 6aя^սx .okɎlһwٞ{Jb$ܰ͵lnϸTjf)x _kӰ.׬wq{㬏8bjq=Ѭ41q/vӧ_7yr 2^y6}|ngh=p2V~WCc+1OW3ْ87Fs^Z_rXp(Fyol\ ^kIح7-~#;:Zv^K/^>i>F;7&pʷ\͛ko\@սx^4.nխC~Hk|a=R]ʏ>zU?${9w훵ͯݧ8m+"޶]+NgjOD_<< ˅{<~g˽8S W'0jpil'3[=k-]=&ٳNgZpUۭS:+~s,ʧsGI#qwi<VVO}xҵbOǼ#^Kؙk}/8%hjxT3og}Ϛ֬[G¢J6}vXT/>-sÏK<_rV8~b1$I+>Հ&;04rwEſ_-ILgj0_nM+|W 0nmŕ]}%&△;3xk57.&.6/F6N<ϯ,)𫙃 dN||͕GxV>78(yXa %^]{M~kl~-!g9t˭|؇$\9v^p9kmgOgO[\srӈZWr~KIR̅.j!>,|4q5^r0ή1n|w#0Veŵ?:{6ڣz[[}rk.lSH_+|ՄNkIqSz`sm`9>|4jzg>|ږ=z|emgfpJ|I6qj]#ɼ&eQ8&jVbjD4m =cjg W]I{M|`#֍7xj!svqKxR_>:;y-\8I;woZ}65/ #UsF֊8H>QntM8bXo"ՀpU󅯖Y5էfu¥H1_o(Vjٹ1gs8l6OdmW V-t6p-Fч9`NW[Mnlx>?}qŝLcx3gH ߾ijZܾW~`|^e>_8vValT吿}`؃9_Ŧѽ/Ɲy60#K\Xa yKί~usC,x!$~㿾+?p!B5kfM>ՁMw?LE՘.0ʕZfGank>UמF~j5l:|<{s/ߧrg9(zcarjO[:{z|mp[R C:?Ɩ]-Ou&yqv._C< C->Zk닯N 8i"1pi b] s+Jˇ M_W\4qIXaQ+Նw+ŵ#xmr!=N>;=S?qa`g˺-]C^M{6T h|ăCպwjoXɼSW| ᦫi~P}صᤗ<-EbΑ{aw:}gl Gt8ѹ.|;xkͺƗvc_߾zrՓ/hp{SaMɜ-EC\B[93VWy1<5'ᚷ{gj.Ill >܏rN_?S|W{&:} ~k[5YoU?%lqZ9ɳZ33D;>%p☇ů}(G<Z=/^Sbk헹O5ĻNjН S^wkŤz$/qDq%G6南~^3c>;ⲧOs|Ǯ8ϼq9|#ڸ^iqop`/3bv>h`{}?U 'm &aOF\{Q|rxG|Ľ<$r.Jξxˋ׺~R-d~hFkg= g7' \ꙎK'i'.- gO$T+~|^/6E`ʼnFg#p:z&aôֹopkGxO5Dz;k]|W$>M'k̗[|.W|sDMjuy>1OF`ՇX?~|[Lk͜>[0yUoY'N1nw./09W{l\xKl{`W/k&5VE<{]|Xk֊\FZջ7?K~τl#p}†Z]F 1ZN?|M|?[ӧWyF ҇A`i?"b^,y]+^_4cG+F/;x/ ^ ^^%F/^Bb?br{݋_;'g7?%M\/m r`^ó>,OcÖjbVK8nuw5É ^񔯾:OnQm'bڇR|aFp?>0 jy@gmo?^ouKVbs'\á\h'GNq].r_ "O>݇hqr~]g?\q,֞m{&.[<<ڏv)G^kM04BKL|ͻ?|Qq^#:3|Ň+OOs&+{}b5VI<}umN;H5ǥX꣯vBfqml}$kp51^1jݾ$&?LO6LgQS8rRG?O*uSY^^8Qֲ:y<3_ȯV>wF -m GklTbyoo#4ao,[w=}>x=+bp,{xΌg pASwy-_&&=|!r'U`y9b[Nx{jr_y=S-%.`>qY=V8G?>E#ߞCr']|!0{?"/g[c9VCp<;-X_W;֘Mu_x~Ҙ.~6c)r{}5LtV'y/ W~kɟ->3?M4:]z/. 
]s o vԔu{m}wuw[[>C^iyL1|CvO83έ٫_\E<>hDlypM[~χ=O;a[_}q/{WxUr5Ӈ~ W᩟F6l}/0h1A#z.~ȏT+|—{ʅ=ռz?;8DjQ?Sc[Հ0Į_5G]ek䏫G\eqφsS}.|ݽpΖ#iK-VՋOMi{ܞp˾n_>'R/pu3jQ~컳΋<,7b?WJp!]]En߹OHwwO7jOkuL{=GW=wu`wr=K7 8ݺ?;r,pÊK;4߳8LNXǹtcFgoxa]<|xZ}5j1W O6|߅oQNx˭aTbU;ڸZ zI1=UKn!̞?ӾzwD<8Zww|p`y)8aj~Ʋl _qrS5՗Eyyw.;J5{ao%Z\9ƩQKMQg#_W+;-l9]%wy:x꫷FO\:Tky K=SM>exuҸwρxoN{?#Η;[D~:r+_?u(wVg͞c6f4fG\8bl[{Gwq.>Yv:6o'>"lj=;qn=5K'{>ìrÙg~c ^ä5\lmc\_}SԆgk|ئ>\ ><{N[3_LpOΗ熾zno|ֹg΅O@}ĉ}&lΨ\"c#o;S5 gڋw?1~"lͷ?V"K~4>rOɁ=9~/ /196O83xru;KgXl8;K9v;%Ly۹ˏ&-M԰ZD_ճ=57l=plxԎAm3QQlטS#G9g7Fp؉0'/ۼ>L9և?x|zW[Z<+瞛c.;<U}խmNvԁp;vjn슡v~'ӳ^=Ŷ|}O>\ܲ)vc}yK[k,wwM6~/&͎+{а=7o f)Wj|pޟ9wO041auwM1r5#? K%ͽKI4}7sV=uoY2{+U+%a߃T_ybi1zQKŋK5c//8mCۼ`ug>;:^ْ~1%.w}^^8zѫmy9SU[ 7f#/?PBZ^~7/>T8Q^Ͼpә҇C_V9ț-vՂơsOZ_s-f_Θ/5 W5Kjzn4\a΃';姯~֫[_-FkĭԽ3NЄßxN+a{kXlv{yqәfđ󁫹 ƶ=#YذYlYrk^&֘Mp|HkYQ9'{QmE)&;޳ӹR.bq}޹ıKyhVc< ƭ{Owbs5rg,/v1_3bk&l668ޡ>z1aܵ_wW\Mܾ7,5lWc6Xٵ},fniXd1jˁMUܳeΊ=.i%xLhD Ūb,v܏&|[8/|/LڐVvq/N>sSV˧\>~|w|!oy7Ժ ŵTc~fo/g t97qp't}'.C~ҝ75+1-]tkU`Õ;ز{w9Oևw}=!%(Oi6[qwr~Ag?:iΜ1/^ckU4}9['=8sgf1ƒ_8l9{57M,ݳJ?'6eb `w'+Kb5nzjTy߰ *e>󬩓=Q_̕ua8k=׹Yoݞ V<\;Qs YOj)~sCg?)VGK yV󾰩oߊG=K#$jK,6[/1~`oLĐ/繁6w8Sٔk[SΌscLď Ԉq;g80&W7_\Kk#\`Zgo+^/[$e㼣WKO///^F-C/}qb9b5XTNpss۹ذ$l7eXpݳc]3=%|!Ъ{ ǕؼxqrK=KL{Iv$,ny!kr>O5&EzW[~vNOn1_Vx⬜17N>zpԷů.[\9y6\.Zakl.ޛ,O"Bĸ3M?LY.[.+gF[]7G8 tkxtyv[%_6ڲOˡZi/]N-~~aOGˣ㈻:oS{9Z6jF1}wp9}󭖟};5sOI[/h-^{`ĵf3`~Y,5{x%wOXi\b}I] }GW'goX^`}QR|rK×):'Z_>G~>Ōx_C57~ͯ\K;3}םM3c_;wM9o-_c(U_13Ho}oى]?<>|V'ۤpM̓b87ַ?7^taI͞%s՞-pOٹWSʏG`K_qZqΊaYgsmF1ٚOgZȉ1/k.\l-=[ASjzs?ƋMtˇ]F̷.׳o1i,-ϖku1Opˏ>EjL|˽z5|vgZ.L=i_l]_"pǏuHx O?^3suHW#Z> ܋Wu:t{Ro,\Kvŭv'?b}))n|ۻ*6r_:|3N_;MH}`l^VaGŲq$;DŤ5M/5#)N=oQ݋ΩũYWgkX|~]}x?u>5?-^_qxZҧ8/aWh>VMkֳe(iCPͭɣSwNkT_\ϒG Nˡzmp.l/o&Se?]ޘ}14X{۲(5=_{44@&"2WMfӦ{reT7.ŭ;0W^>|ywˡz,Ki{t4^WZžM>5q6ޟS?ߥ`w뫯YSj H0hck}qŚFgOlahqj:gk xWA;X4Ys<}歿'3;m oq{3fW|}Հv.vճ>;r&tamOj?Ws&_5I0s'_s|Sgk-_C_mS7!zF+/Yٖqd3uغ:7Ɂql£A]84pՏ3Z℥Oѿ>efup6?VoyU'03CxǔKMĔWz^/.rl3,x«߸xt>O~1v\V95{WŲuWkr*GOX*vgի_qyNs\3ZV['}MNǛfD^u1W>5-4\1g0fﮑ0IQN3䪏k)rOz'_U_~Yǜsr)^3vvԈ&}uU'lU#sl/[ǃ_sSI<O^:x2_g7V | iE?T ^{zx@Jࡡ5GŅù/FϰM,A9yzf!̎6Nɷ/TYٶ&ҺOǕf\#< H᯾kߟ=%g\H9WWP|+|{/p_,bbӰ%lj֚W8m\=Hљh>?XpÞW>a1j#rκ jMwfA.#4{g<6̞/x<.kjXD>h̳OЧ:aO3N&^]Ga_?,${G :A}C I5fYyQ1:8Zgܝ7I]_|5s01c7[hλ( r5Zb_g\>¦gS'd~Sa>_81"~'Њ1lZXD|*se~܏F3NtN`39sܺν?k;J̕Kg>ڔ˥5|j0:sT= :5sk{ :L^kg,& _srP?o;j%[w.l묗ۄl5_\0E bL"q2Wi58~0è<>4/Zȡ<Ƀ0 ҝ9:_mGpP0+?E'"m>5fZڋ۹)w]Hkmj>USc9l` [[ՄOZ ­|kSw揯i5}>>¤֚ym%d$nbws8 UtǓb[kY#gQY)bȷO‡ن<]-O╫xrʡ<fX]/yF5M~Üu|mr| i@ͺ\ĀMh q:k>_v;xzxNNbQodC,x<rV8h3&[cK[3nsW,mUb?}4ws[ᵞ+?g&>06_(T_.KӳfVQ;1`Uډ)8niIkD;p Z9/4ڑj&ݼ5?z&a X8yև}_oΩxr3G9oxP|ޢ/@7r7C")xjV\:W%L}iã$?_|Ňu8pyH/|r#>;b`3&_?xhZG?׿W`m!8kd- @IDAT[l:LL<ʇZ&gM/b~g?:Whbkrꫯ\h /s$z..;J]Ĝp2{uV_惗/('{7uWtjCvd׿u{/WuW/G1n'y̽FRÚVgS>xgOkgՌ _sDr5oL8ư:q6?pvvM8̨Z+<:xƟFBfۙ:zf;6x(.g{uD ?yl€x6xY4b(yl1hras{єj/FS;xxQ灍>;6FY0{;ҹŀnMUL.9s?i˝ nL{sT=!žZm v`5dc\}X篶vtIyt֗<ù>ꢖj×\Ub?.xuϠ}95<\/r7}QGv|ku~:wi}'<8G8y؟rງ:Kj׻>fO S>pibj9xp'_!l]s?ôt 3ִ9vfcל0~$)5Yj=es9 uv0}X:g5 C0g׏WYuʉj=`<7iO⣏޿ʣs68:[ WU~F˵fq8یO5pQZD{J\gs {,;|jGG.F)o,cO'|fzj['4sYN?b8mTgpۋ su\-ɋmɯ np#F#L΄n&O{O-0aYju|_\~;B!9ٴgskΑ\SXKGQ}@%3sG,-'opY^DgX͉rgCy?=PZr=%--Z;sgk2iP y F+8Os>\>[jjL`?K`}נݙՀV+םzQ^oU<ղ:W.{^oŴ[{&W{:cgg6ZtajY]đkN8;4=1>j?PϾ;^y!~hC.ճ..Q Sn?m7pp7WyG Ρj/`>qkY/yj*Y\"VW Xuמb^&5M_xqϨFv|T r;{T.y)3S>ZgrƁM_Ywխ~Fyv}^c ]q񓯆o,3O} >jKgbCpqx-=ͮſ91ăknJ6|q`j8MGhts|3}s0 \uW7M͸}0(^|!0hj$Nulpc`j ~k\L{k1{|ė.0V'0fjמ÷<;'o|XI0|[ϰ 5✍#lZ9૕j1zW36#<ܜmG,6ʼn'˚8Q/wOw{L/>ՕO ;]g/^s +ëهZbEhSǃq\#?%g1oG#lԍ,&0]c|;3jVńўuOfm#`j~tl&>/:ՏMyN>pV]99Ə`Kx͸ʥg\u$1\9 8h3:bVΫy"p Mv=8֪'GRXAcwMW̛߰ëA}M6]d[.;wfL&?uVx3ɵuvVWC܏a~; ӟRmwg?=kgKf>+~|ik']?Z#bN9R?cU]\5Lx?~ai؋Uf&pS.NO+j+j#;3<]3vn׸vjwf|5#g~g){v]׏stg'Lx$<5ɁI ފan Μck_;ciqG 
W]_<^=LvՂODvUG$=~YIc9Z]-3pk%3Ϯoǣ{>[>t6wA M`Y+vR- y-;7͋Ec8V'bYckΙb>Q|:x4F;Jq _Z̅<WkSv#{[jևX>%f.^t\iuvc^u+<]tu1GX?GXL-1' k4Q])1.=CxQf>gbk-j<*o>5?6-H]|uEm#+T]a%p;c{p{R>j|EZbχ>b^}p/ <ÞKմ:T[q>>'rgWpks-=Gk}?9y;bwyw/Fx|c::yˁ*a?|Ƃ OSu#wث 6_3?5V} 6{7kMvbc1st`W{Sߞ=5=L5x%0q?)D767>C癿ts|=uwݷ: \du=96_>C[{7?s3^ү >`1ջ봜ٕg+/XծqןS;CL³^sV쌲ƖͼGUq/?bYí&-W>b׭Vɥ{8Śy:dN?v84{\soV t8 [u7] ?sgM_!M4vC㪎.@7o>M&8 >5g^M|iڙszW??`_=܊UlnQvR/.즰S.*8eL=l#awv8tÜsdzxSs4}3}̩w9Tlsk~ 㜾o[{Չzn\v_3y nwgy.Ϊ:{{ߍX&jo)ףqx[;NtUok.?ܹÈw1# ~I{x%c_q-ү/t{>szT~Ws_|/}Mibvã'ޚW>$d^՜}hݕ881ֺWG1M3 #>_Ȝ98xh7^nn c~o-X8zb@6M|t_erYZ,2c6W ;G<5m6t^VkZ/Qb6-|?=}C:⩩G=[fŁ΍>ͦ#1gm1¢;~5qğܙ7cEIxWgzm'<] ] 5]G{4Ӻu(:xǾ;1*zU_1ZOsn'==_d?ׯ%[)ni׏kRp_ulz썫٣awwp˱bԺ9cσ{~ pG\rkӪ_/_;r1dMo/ÛW3{Xr:ž>> j;Y9L[O!șTxsVNiҝZ~F1$תOlݓ+| šq>"6p%v1Wq~aWm\[7 wՓÌ7gwJ9VSN®\/7.6ZX+ѻBsgq0WaucX_Z5\KţS1W ٨#)Y_kye~Ɵ}kg8}d)bԯ6:ѳ?qkMK{w ?}?Wi=m}ɖNw;G_5^q^?jŏ]1gzHcw3_NNk\Ə0}*˗^T-G68֮/)1];b9; N1q7ƿG1˞g8}Zy̾;>?{s֮fb5g6Z ;Qʾą&iY;3֎dz{dd/l^>b> sγ[Ro5ʵsNb͸eY\8+:La`ܱ W3vOawx'nŢIgڵ_3S7Z>g+-_k:a6vχc\fAS2qO<|VMgo#m8amo8x?N]PH.̇Ha=Ȏg}1Ga֭˖~$kM^;Wp-_k3׈|*Zu/\˟O98m]f/<3g1fLxs:Niv:̤۫Nÿ\3O[_pN ?ia{zR<0``N_?{sǕ[}񋳧0&?B;G&>{5GKpEqgg┷q-#/75sil 5:7`I G">L\ Wxߍ]L~ru V8|}|>g fZ8;q&]GIgG8'Տ&oM͑WG|˓&+3‡O–㞘Lů{3+|+)fOf_Hgʯk-|Y{Wmǽ|gZ]\1#3YpV / pg|^&漧=ZRm Hf/riq3BZ~qsp|v{q7u~o >/y&Hʭw/H[(VrP79˷wru'ƫG |1V/}qդvG8i65}kb< GygX+Vڦ6kn>pPK{2#lWB̈́_-pÀYh6=HoNxڼgOw(71WCXi='O,׽7'ńq3NӸ/|j/ܵhwf&.{xyw6lw>/qj3x_ᛃk?¯~d;紶k[m^35G0*Gg||gk>;Y/&m|GjόIqWi?,Cq؅9~Kj_ʁ/iO~۫Wsָ3`ecwfal>iG~a7:#+rMqU M%^gg~tvRyiH>ˋm|VG\ղ3.$cxyw;3wuw{8'KWs?&fIvǦ:_ϧ>ZS~nלx‡<ă T~ zj?|MHLG5f_1a~ 8=3ROfuj_JK#|_aWbcq+'\ۯ=\v5gw$f\+^̱n{j"7cgW=C53Y\cp# Tm݇4?Y˼9IMǿ4y:΀}5khW;wwNVUt/|R§Mg6`H?#./#Ϙ9\rg;7iWϨ[_̣(:8"^{i#uXwΌܫb=8+l!qpκy/N8ld e65mh/?Mp|Nj|BŘ9ˆa77<w>hJ̰` pf'FGQePƒ6%ʯw^>ګUpԣc'5|ғ~<+dwݑο{K:c4˴XjGki"'wd85'UHՎۊi^!^W1ՊN}~ܮqJw.U 'gߞdHU=Fy7NƵlt 9}%|[W?,_7Q6?FsiKۅE_k<`z0dH`z178a@kqGlo~W^t6_7[D<lXz)QC5Zxɗ>H?n ߯zb}g.Nugzk8K|i,&mM9X >5o>-.M:c>[~s] [|bjqK>{ƾpV7'8M/?l;C<檇Qc 熽8O}#\48p ,/b71Kus7|8lԏ&׊9OjƮ5gG_l 65Uo9Lq)6hН׌q$\ ΄ڊ'欑/gMzV5q_j/gxj'Gn0漺q`\Ѱ=r,x띍6.3 /_~7W|}8҈y3'4~rΔkƄ]%U_.~Z Me1Wcǡb:y`Ǚ!&^0g] ŖȻܫZoQ];KsS&c_Ow쁟O5m,ֺb1½bYٚ'p?`^_>lowMçp7I^}Mg۽2a.r{$8c{h/G<ùkύIXѼ Fj_-gqT56zIrCLYOP^3Oua_im_ó |5>Kռz~H>G_=|jI8ONEžgqX}&w'4,jV=bvo͵WrqoUM}y9q+9l^\uhoG[.zq/IzClq;^̓W 8|ˇ[b?5Z;1_"=X;܌n|ݽW\uoGX{kzu{~bFjd:0y&`Os5!>r{T MԨG8k~'l&>4HOWhKw嬵/oߵ`hp;NT96ݯֺ}C\ctα=' 0ϑ95l>[#|W,W1h"?>?y$j̤f;/9=-dńg׺s=#ʃv8pNe+&qcOr=j"ySvٿ둈hN-qT'mgNY/>Wky?BN]Z{}L}I4{u],;GWgKh|>4}t_3j YYpŀ~MƝ3.pa}d}gOO5s${~t{G1ZjoNS+-ZNX <1t{!j 8b!xiqҧӵh]s]wp>F]buÌuW9 rQ`Ub3[8+;{py^?m3-8Ɲis1k+ݐ| r h}"s&{GPCjnohv~;7>~{<[5vA~{6٩u_~l_cwυ_UKv/pqř Ӝ&78+ath7|yWsk0W\Ao)/{ 6'|8-&yD_0jjϒ秸ŶjF7gJoph~>bIk[ '{KĻb)Ǿ]+ ?5Knv<4/R^8]Y9WQmx_8 5ga;WjG /jbCqwtg/V#{~q%ڕ|CxӰZoL: ߕőW|q>*&3u57ry9X]Y$|5#9f#9iHW-`>vWon~1a`uF:ocڒ^>gx&,6bϳWC|Ĭnx~x>Dĵ5a?5Ik ә3^ՊOמ܍IZ/I9lNn539l ʿ+VawqoImV <'UM.2^>H=0<<{0zHPrCʇ+>,\=hcv~ExzMnoZ޸'859k{žyqiyTBͭ@[< ԗpS_w'q2OY3j^HqKLզ7r;`U XwwX8Yc޺v?WKq;bi?a7<q l"ar.1t&hq¡56b˅6ݼa$5w1ñϘدyg<`3 ?{bQgQWCrO*սMӿ=_.krϻZüFh㏛og0{uaZ;=V ]K40Q܄TU_aV|1{ W|8-vk=y{6sZ,ħg͜E1؆a=27ckD3{4H"l:j/OGOΆq{0b`oV/1be~IF#grԟ\j ֑'3]}يI<|Z6qxSKIۏCWdGn#iz^'.i+oא1~j,={ q3V}x~NմP}3.Gk|O׊WbJ1:Gsǡ3jU{DLX0ՙgϗ 9ۋhX)&?9c}g|ꍣ{Z)Osas_6 j7g=^CGElw̼'8N|qxL/}~K>Tզ}7_ nxc7bZO;޿׾ n$05+7Kq;k-Y ׮ޟYCsu&տsi_{kpYYtn;GxbNy5} !xb隐[ybyl9op㪄 ϹN~Ĩ&]&`.qzΠWM9~wX5!;nhf=!\!pΥy]vΎ:$H|<Њo^8ܼ\8.\8;)WE>^/HYcrw:4Ҹه(o/qNkZgO :" Lu. 
zhwxɳ9-z^\uΪÕ\U+X,h5{'-[5砳׎xwCgPܙ_gg[>VL.Lq̩bG]jo޹~;_͜ώ]_鳳Ʀ3f^[5~N* >Xȍ~^ny݀`*'88'F야s=ZΫl>-w{uϟ}8Cuu/KT7^jVk^"bIizϴo!j)6-6Gy=#'>p7za5_IW\'+<}5S΢]g`Ϟ#F ?w^02W꫕_Jڧjf~Ԟ9/dgݯbi1Ӟv)n5~C1{~λ֙[k#:z//G yuÕ Gz6j5+?9U7}$Pjg%bfθq穵WG1߇#.\p޶CL1>wdx㬉<5ң YϞm/쁽5~W?~ji\;#r&O;6OǷW6j3"btqʳzl:'Wg3?*VOS4ixEUc6pyPAiŌOǡe!<@'=˜ g j_|_BBHlk4`0VR?|ˆZlR-^ ~+^^;|xѯ&Hg!{| Z|xalxYrfܙ˚1<1|7G6xl v ty_?w DZ<ӗ<+]ǸwݗO&f=̮>Gb筳9M#c53ѓw9Y<5՞ơڄxO6k`wWXu<>Uq;GӮՑOt瀍|]gXGk.|KxƝ_\XSb=\N0VƇ }UpKerO5'Xb8^c-'}"n̚7N:tvW5jJ_źb7o1p=^1$\WąIkò5Du=s_tċ&kՠǑoO`?yꛛfzí_t޷'߫xgvឧ?yG 55ڟ40PxeƳe_pJ?;nuHo6<=Yy4nn/uZxvǷa?Y?ϖѳs#knI/=o?f[OT`^\?{yI&~3Z3{l=Op&8$j\>kS@1m׆1`&⬭r_99ϻ)WpK0 ϚǮvSƧx]czj;;®;ygN?=+>vձ8Lяox<^Y%~~j?g긇=qf N:ÓՇ3ϑjǿ=4U.].aFWGo_tn&Av[\s4򉃵l;]&ቱbN]\x33%/mxq-xT=bVi7cC_yNh3~#b׫„LZ˞m}Gu^+s_s:۸53ˑGI{Ϳ4خ]přuvߺn|G`t}WsI1{ӓ]G7bN[I{ƭ,-N19WXQt Ze{|ݫ\Zl*8x߿C{bx~ɚgA0p\u3Ys#bmk:q硹mqqbg#ڔ)߹,Oz_591`˻k{+Nvũ_r7߼L;yIXK`̯:ķH`Orψc ;|:;"Gxx݌_Q]ޚ)lILg xMN磞3wb8=8k4wE]8]3ʳ6;[<{G yO?^WI5oHjGT~o}2c9سvoŽ:Jd{+MU.w슳ƪN&;y9[XCwVfߜ뾵096xQ3_/rq>ÈCpf+8_<{^q;cr'jwN/ݬX]+mکɷՏoRXĩOWG} lk^3fyVCu=|X&67y[>m`6[уk!7zakib yGyMY1g·;m6FF7?`Ts?Lc~\ÒW#Z?f%߸9]-W-y4.-w_iS>4j^欫MuNӾ#/\dcn:1^ ¤˩s.׸@IDAT ?`iӠN] n_vzS-şf^ջ} #_l`^/r(|aOB;.nxMuyc͑ffO]S9CX= 9b?1[t~3d7536ţbY1&G~who\^fc]kA #\Xxmr|TbGC?ĸ܊;Bv5r!]?d,ʟՀĖ\QP|4ղ?knl:.|fSjY\sWĬWK`~e'k~8wX0)~|#M}<ģkhbNkSVQg헓y>Zp. ggr)l:kx{uϮsbn6yx9ǝesiT0ֹŭNVxAiT'q}ϱ3a;3wǃ^ᠦwgןA_әO1fKqrX݇=Xp "wغ8b}9u 5kjZ3qp7 ɚ~5WSc 3ܩ5>4w4_~:890_e]4۸?]S[s{w5{9ebʉ\e^3욙KϘϚ׌홆+ Kkfb1_+A#q5{$:t\ĭ; 6 d)v9L;Z1'fvM7aM?%83&Za3hu->hr*_i:'3mG8bTaŕcZP+¦LW>w4W`ixؒ|p}/^xixvlP>lnNN⽭U+߯ٿU/B|? zÇ_T=Hh̀{φ"grg>{>Kk^,cۃ:{q//˭^ҧw})οaw6\e?ݜ\Yn4^`'/~XgaH{ۿ#f/O)V:G}{Sqp6g}x^fkO1۾l>#8[l &,uo:]S]\M~{(3^iZwgt㕘QƏm?8=8jj!:#<_|W[B8#fgrgS>,{*C`hg"nmc>,IfF>.x3,3ÚU=)0i/vV5ї?5}ru6_VsuF3gWQ3X٫gqW` H`tnyokzK|Md5'Q]bǾ妶g=w3 8ޡ%+ vdygCy {f/u_WN`L&;6=;^:f[g6-K7m)=ŲW0yjf >jC5G_%b+}{ 37 :O|wo_$Glԧ%;ڙ#'soQt1obb=׿qpSy]S35ؓg|ǃ&8vZ=W5rJy1:|7.~?~ZԠz |w}]X5 ?X-Is}QD\׺ Q}[:?ī{||WG?NgcoZU{|N/%=7.!Si}Q~Qw"nMwvW465J >4Q3qf6w[|9 8G|M}_G:ߴ8"o\VjH{?όP;q{5A4xp+/X}X %)}|Ui)׫xs\G׾uׂ=o٩zwmݙQ8/]3#ii¢[f~G3)'6]C^?SxRI^|5}Ztn'e~=$jͺk$b߾ew67>|<4s9>̩Pl\ d{MK;xj><|YcngOn+7~jaO]Sc+_>DUl~qִ<[jԜ&j {V_>xVW45\LnQ6G gL򢫟fQ\=Cqػ;΍Ӻ?|_嫏w9:v=/bătfՎyWyϲ.IUq?za^R W}/epKKKNjK/Ko K6ݗeB/ɯr/%1ß\Nlᖿ .΄ `p_lp^~1VKO1CP/hXZ2Ago5*kZWpfW#ma]ԯϏ#՞-O,/jVg'VwZjaaCk5{:޸ǥgt8cM=#rּrK3qŀ =W_m<;#*qkoY{~5Y5^儫<ե֙:S;5~=oo4|:o|VuG9Lv֏<8f/`hv6̋_{DqM2C)vFZNƜz?/ѳ,ڋ9Wso !gWo_PO_MYYpa,y侁Qx*|kD#?P>Ɲ&G!ܪU8|im/?Zyǭ8]C3t#j)d6t׻/~oUZ?1pgK`ջwn C-tO =Y{ȹi?P[!/ZψF]^⪅.a]\JW~||ͯ6>`gog#8aTs 7s.`]#-|t#}^㨂<_={3ۻzC̱?Nl5 qYxZ˙O8x3FNï}陛oNķB~SCKe 4}wis=Cj#qj}jҗ{w:6ي]Fa% 8};>x)o~򐫺z˞LJlnj.3/\Iz<ჿayqN)7b+<58rS{vpͼF.啿= K1AK]Gy#_hX0'^>1\Uw&%8[gsNx+u5VC_սɅ6]Zl͋fl/}|uatuG\Vb\_w~>;OY}_q96bٯ^q>mNgs nͻEߜ|_UCo ݧX=%b=C9Y_ A9y&&wvNA>sx/g'=g; b \qMɹ~`^-Ljw 3=]<{3ΖMZk]55aUkp|io3ךּ'vus4/u{ر{9pV+xx EW 1j=YM3zW=="/xǿK pҋ(ePFh)7[c޹UҬ{FwF ʁÆ{8kxN5緧ٙ/JXO:ѳrQ#A<y5k3K C>]6=:$;ށcmO {檛>̿;5w~g3wVU?kkQ:bVY[c} 'xW-"jQjϪ'>wx'|+i:h 3 պxJmc~›kpn+#^֋'>|'Κ|u-שLK;~qwkzhjpSi{d ]\Q<6d3;|dw{}/#b}A>d6jљ;N0`Q/~5a~0}QCqIO_]!*N1gfr+=3swu3|g*;395oj}I_]U\s"2mnsƧΑ>)6 |ĽW;w)~:l5GI1W-:'t<jD[[4'~u^\x+Kۋ{L=|\]s^=KHuЇܿm0i֊V4}*o8qy'{>ғ/yY;kOU.:qK[-==/ oXy:6O)j71q6r6Ov3{f4>1Ujqo^٪v.fgΛ;~<;qItyXp3<p];{:tYknr<^~E~|e_}+⪉s/?gl#uďw\fM3Xՠzܭ§qJSxڝ)[_|kh; ڸs?O4y'/~3c˺ᐉ϶53޴ׯF˗ÄHV.Ƭq9v'֑ͥoۗ=!Ո_";y#s|xO9e|5NWorP?{Kpn-bju:[89#3~UZϯX|5c31^Z&YIÚ&G`ngmmO11!Թy4c5WGF;K3rޏkghkゟ>`nv6qj*~y/G|ؖ[6M[sgVMacv=Ql1MK`ȋ~'0_/U_%H{k}`1K$3X٬^BzMlW kr'^[_gfm=%'?ou׼M_ꦭ8Sn؝<Ӧ=6=~rgn/zZu& ?ENM ?|y-dzpz'?9皞1fߙjFZ2O;/\uOl[Njb'8t70Μ[.a/j߹.o<'M_1Hf<ܫk6G S79ZuW;}WH[/b{9O?]5at.S8t;;'l^ڨ|Cڿa&jT~i2Ϡ8Ji&j9q̑<],K5 Sņz{z}kŌ*@T5|՛7^9[}sJkf?9W e^Iӟ9,^fT[پQ<հq< 
VZj}=1_r8)#g̯&TajwT!μ1Ž]1ųZw8W#H䗞8xkYnxͩ:%kqsO.n\g}iK#W!W_<}~֓L kfkk4; ' .qk޸|عCqkBp<N>Z<݇艏8ѳu?ܰ5gmsŸ6.vu8՞mK:{D~=si4}MM;Hj ,ޙ{63u~m3YS995.64u/H?X՞͌:t!6gMrm?[ ^d_-Gmh9׾YS]ř MʛĜ5?u}xG'q6xG9Q>ŝ_=yu88/3/wڳ>6x1sلĮyYFʻ| 3=C/a&f|ŶtV} cbꯍV_|`?=m՟~^gf3 \Q٦}{йpƝ?yLχ}8S~q0Yq'ͻ0׹lMg9Ś\X1X^ (+ocw 7vs]:>%ˍm볙"uڟ٭|k}{]W[6iZD֒jчL[sǡ)_68u;[E:#w6{'9^0WXjwJ/adtm3!6gr/#-OQOo:#.<HxCY?|b=qx x].}57am9I^lu6_5luV95~jOʅ^u}W#Zb\LEAԢ^}V-3D/ vanZ^=qV9;ӫki95eqԗGi/-|:b&;s7:>8}6m._68g$ĥᗧEO} 5[{x<:ǹȿ<3l޶^k">~N[{aj+ ``~ap5Խ,⋫q\y9>l_F9C[3`˿:ffj.?pĕ?{NGLw&KWx˱ Slpr.O D_^; 7Wz{bbSswZ{ÞU e7ԟ9n}oWhO:7|̕+ߕVmFyaG>8j~3ikb _ʍ#vl:Sqj'0Yͽ㛰{^?9ǧ{'U8ի-n:6%{֜X8W4ٷv띵8O!g}9#Ύ|Vԅ~3ػ;CtЫq4KM[}Gwk m^}H'&_uP3'o׎~F5Fij?޸}>^P\{rW<:k`\ΏL1p<9wbU}~5}s8xx\IwG#lC}ѝ?y(jbIѹw/'|٩c,qG8O'L~|q̙ъ9+V?ssS58q)|@3]cMy³ls09~O;9;j/NcG1;xx&NWb:oxǬ7.8[ٵoA#yeVlz_fXˏ&j=9Q~ YSw4:D_#O8wH{⎬}î<b\>>ً|&G'u.|z.,jD Z^S#x37C.o` ӞiD}NipW-0'}geL3ǯ3$& /4ؕ{\xO\;{t9 տ!O]M<9z{6l{pQc={J;3Ίy6p{>oqKV6yĀ'&n4Αu{zvtxCc[9rcOۻKKŗNfѼ6&=ȌgߚԬ#l^H1ĭ1ͩ]wjhwWۯ=3O-wg9_x׸:A$[GXp5[ Uq䊿466\sߊ=.|qo?>b s;]GU+M|R]u7ƌ;poj_b=վӟ6 Y˟7G0/ϴƿZ:w 6בּ1}̧hp໛a푿xvV5V⨡&~ϑCov_!Ekphu/;XΩO~{2N)>ꎋhOtOnOtWtQ}^Gu;kߪݪۛbᕰ-~j{Cgxq%?F,bUy3k_k8jf!Xlk4ckr֪[qELuEX7}yh*Nl¯xWˏN˫z5}ԧZg[o^-=v:K\'}9u8Ϊ3C]mUb˾/qя/iko)qSӝ[|5|<8է'MbUg]wv?ƩPk>rv[<{9#Y;?;|{Q'Wߙc9/xyчɗ6׬'p=lļش::~yV7pᩅq7T ׺xxdW?` ٹgs=>Z:#sڙu{SÞyWGgA-g+5m  v=GE\0f_1h{ǾZeݾAl*qZ8\>WNمAyK˯|.G\ZyFY(f?p7I2K<<3*ng{pippOW S9\,7}\W,w50Y1ZX8ى/gOP&<>jmͩ)uPq'wbWo6I┽ՙ:ɓ7;y9~6?%vk_l\L[{zt᫕{3I-^Hr.xpy6.0`CpVb;ܪӯ6wJQW>X̫fjwl{q⒉z@^.ߞa磚قQ.4~5)yTIW׼wQ WOcy]z[ ~=;l 9Ugkb;Pqa ˳o^l]3!' l05FaS|c<8#;GΆg@IDAT3Jvg giT+}gD.j/z8Н)w3ly~5C=_7POA.}f^M冏>'H{N|>[#şOշ/zVb'xOq}W򇇳z =e޳ ӹr;;و]=:G?︾c71pa{S٣>`{&|=3cC_Lڞɧ/15Ӽ}Gc--Z儿>ա5\< }rN媵g\N퉜plr쾲簝'3Ӟ֬k#}:?g,|=KUxzwVkyh~ZCSgZ4_7JU-ȃ?%߇rFOkV=G>j%y5xߖۙ)c{|nę&ֈ|s4–MI?E[{!T}Zkns3(7u=8j=k8xԟͿγG9f]վ}pa@4N1F/G r;yUJJ\pĠga+xy>lMN$|p/&Xr^$Sl[bO f|{Hv\4w %8O4.#K/}(y6ֺwsj۞rW=,Vqp_8|}>0ĂG_g,pk## z~ȿUis=4oJvs><18vNKm՝鹖Lsr2s^|GG~{8u0:o9mM⚫b߹»; 0a=8;ĨC_f$֞{~#\k;L5<ڏYre_~{νߪ%Nu 'zߙո;y(5>Έk?{w;s= Fp0kxSouy^=9.]bNNly.mN+^?_E^*R*70^*Rw׺g\9?cxQI^zkF/u$۩/~ͳ2/t6 gunkFg>sû|&y>$Ikgan5U_/ÜZ,M|Oqe<۹ޗk|〧/;})Iq4G"v7LD/ףGk>.k,W8ꟕ5ꉛ}2cokL~ߘX;oث[󏄾ċG뙣8 0[[Qf7b|$(O9g\}#)?s<w$l\Nӹ:qcÞX'ko\}OML;u {Z5*[׶bGxgŕYpYm{6\Qx. j._?y:9 sqi;ix+}X܏k{5紿%isތ?\|*喾6ηV88clV3ƃOsgl:q?4mW|س~u_wow-VgWkդgҾX3q2N0W9tۿgۿbۓ:wl)[tiq n9ZS՟_gN~zÜZ]yO>яqΙft;9\ZUϼw_κfN:mZ9{Y[mb¹MOk?¡=a'vg0o<=sqUӾk}8n5GWޓ]15*.᧟5-=ַ^t?lKq;yM 6<ҹ^̽9q)Vr/=sňպ;j!kvwT|΋){T{uH69_S=6WG zi\Y'_r7?6Zr\ǣ|gaMx̣5XMeO>[8eW sh<,h6y#+-շ>Y/*'ŮލSz5gA#^'g:էqub_fil>D}Mؽ fչ(=#njĶ:՟vś8^ױ议OkQL)bfL~9gtzj{ule1WjLtƒŷbf:O>3~}}s=ſ$#|}ں>[>8̼=p=_~ojŞ4[l0ڣ/8_wK*+Xs5ug>{wbЌwjO2_I0JimměmޞƾL{FMj-v$?}SvZU5*皷1\{%XZh݉~O7xqa^#S?Mv-o8}g?:} +VQ&F}b,w9髅ZOAAOy9jRß*0jT Z{{8Ѱɇ?+3Xlf=p|.rSB;m+\ڭg.YYk9{Lh\m#F툱pa {fٶ0)1ţJ5 qpͷVSG"x融.yKW8 tq K^ҥOC٥[=D/^x /yA3.q0oukág/<|s_kh񭱛//[V"N mM\aa+ذ',)h$'8+f ƭ6ʯŸZ|@-{ϷS[ٞǁ}91OW;9ɥ3UCϬ>|¯vgؚW9 +8=t .j)b?'ok|`Ob~ĔI&>Ӟih.yZWЃݭ48[:>Z|\{BX~3ik=M&&y _L}M۽~AL87gGɻOSș3mN{Z3-Vxi5Zas\WkWI*s1'zG\›ա;?6a!aȑlp=iߋˇ܂lkŭ4|vi`W/rXyav%Qܛu~ò{{.L6yGoziq9ٝ ȾS c5?%|Af)홹KOyqg\}¶.~n^']Lq6sU үfuTa5\Ju$jTw~xOK?Xa8guuY\]log,ZYS)tZ|1[+9g }kl \_l'q&>a'`Rq~I9`f_?:G݁0=bjV +OMg{VL} +QIܪǯ~ڜx;~:{tZ83fpfq &Z6ν= ]|`8Y,sG6>_dk~Oqǟ,KЙkl)rOƭMO֧isϧX_^װͩf [ϴ?<˝t؊=0ztsmV]<>l#UjFjⲮm&5a 1glkkF`D!'SNZ8YXrL)G G 7go[|MSɋ;֭7ݾ7o?#=mq/iiSuWeΘzܲa)Vҷ❱bV̚Ƨs¯a?kogof6qq:sCQ\v^ ·8\˿pM8eF)qozqgb:W/9>%~{̷u^'nm'g=`b1Yod'l1}Kq x0Vn'. 
[˧5~{ƿ}b53ǁ$3V39_]~S?ǧuUXWW믿~;8l :Z_LcB)WĹfưZSӚyqġbZ޽Kw7|հ_|r'˛r^=CM_l"5ue/Ր⚛Ҹc\{ϧXw}拫>Lkl9=)xy^hiLQo'>`5ig# ZAN8~jF;39R-,Fκ=Muv6_\\՜n~bf[37W1W{j VX3gshu4G?g͎6)}9ǽsjZKԈV_m^`T24,1Fbc<{ L`I=x's=]I;\\`To>0y9<]Wu\9垆S[~)B?ό3P] 5v; M߼y'bY'7~=[în\{æ8pefڛ#ū3kqΚf\3{g=jlBpvZcotfƺ>,v97> .U;1g㪦8{ڬ^yb\}q?.eh8rWhM犎Q,9c=7\dϘ>g(twso_0[ N視'fw\ԃN5q03/nى}؋&Rc߼FC\b;]u!o<ļky+~b]_ wݚ9"0ʹP>?1Wj[)GŅk"ٙ2 _)Wc\w>#xM}K0;3,Z~;M,/vp/y{ ٸCW=g#;o~Ƴ:Uq=xX/VK#-*.&%mLkf bXMl;dr72Y55kAW}\&Ӹpfp/W]i/ōShRm&_+`“|űd3t̷>֜B^ă]ϖ{@,qU|9Y`Nb9Wn|;_lq/r\-okJ}mۚv:c@:cቯyDpjH{ab_|bޙȮ~;];5xxवo'GzW-%K1ۯ^k0zD_~C9waٻ-\ Mq0Ԍtw7ƫ~5꫟ܭ?XnǮ!v$ņyb+taR}sw;ru6ռXa2/3qx󙂯Usq`ĦqͱNwxsJܪMrmi~G33c [q%⋋+-k?a+k;>pq"}yvcmh'GLΜ\=cwW{F=/pwwJ=S=@\yAk+77vʵ8.S:>u5|S@#l7rA? ǥW`ZKҝ\!w5&'[cLjjKa{ԊY.10%螅=UbAO0ա8ա1?v0Wcy?//x\_^ (3vwR1`z&> /X`~p{IfK\$Ŧ>,Z#˗Guq{mO ӤoF7|g_~,g@?a7Ny0ᩕ&<'~fyn?=_ؓrUQ./U쭗}Gٗ>I6_}?Oڔ/M +w?o57O?ts[-'jM)j߿o{Ce_B}cM.q(],F#N&rW+ʻ{?ժŬyկ~;6{R {{1ը}gh,uϔZ \0XxSv(ΜW%?Oysx}Ǜ6:.w>'k=@ǯ=˷j;s}sc{†<:Gsd/cyNxgwF͹Zc}Ouq^LBm4>o=u{5~7{a^0==KbߵbhbԊ'wq:;biOa[wrǵ\O }_-7wxxZe;xGy)¯Ŷ?=Ǟi:^ōN~_Κ==la <{f?tb.!:8}ܜ ;3>>s.İO&w||LiS8 Ύo3/{Rok睝Ʈz;G_VL1_;4qy<g<u<;)9J_Q;]so{nϲC-ۘn` b|)&a$oބWge{+8WZ++yfGu9bgӚ=+ӝ s⤃FyƦz+.6>*yx ׽fC&6gOÞqW>:ޟ[;{@v&f{{3kshx;j3=Ť,\?!Ǯ헼_rXS*vy}W:bIZ9s ;_dߺY{ϞUt}p{qs<ryܹVΜ_\ #}\wȊ]gÙP9ڝ 6ġZ6<75e[;~P#9e-rfc +쬰Agٲ^;Sbg S༘]b';ywMk'+W[żɋiYo]rXLUٟع;}+WlΗ{kk r'+3fWXX5 97b~{Zss(>>_oily??".WobӞQ~{̭9حrZg`ghglsė9ų y}gg.Aoᅮs&\SC/ek{l,">{"f?SRv.bNO;{K|+gdgM,Lj>83~_ r7]/NeW2ꮩ+l*qF|TY|,qG]!>il޽8dž|lOg>ރ?s)9?{W|,u+O9|hĸg(]v&=O?d¦~Ǐ)/װ8t|]mL|??c(]~̦=~ٞ=ϊĶ38>ۋ.Mf痾¶{=vlZc>oiq^/bP^yah~Ly.ۺxWढ़=pff }_ruw,0y0{~28wNIJipCG_¿[W3[+:KL&Z+Y_w޴kQsW98ؙuc{9Zq4Μ]7|VkWlZ\Cl(|yGGC0!?e>7ϧٿ{6OxLOeϙp ;|s o6v鮰)/"/r>vy߃ unŮ'{] ZEl{O0vmn1SsInF o#ޙ|&vwpC ]=Ãa=pnYnN,fm]^mr|rnmكl^X+lwƾ568uR3{^Hp^bE~4,w6"/-:^ANg|r鋋?0[b^lbP/U2믥bXKzQšUcUam~®Xr䭱I;l//wFOdŹ7Ջ=}lr6'2x6.Zq[ m}ygؘ`}~v1,G9g;OGZ2dl?63br&aYrt1ⓧX67WOUȩgXx^V-<Km>=!K ;GU΅ּ}s>w w7ȑ1''~CˎӮ}mlاb{Ӛٜ{1TZ[S6|jSkm,<=Ur^>ݼI5zY1ܷw8r,/wvlov: zwo..2N}~s8pm'{ 5G,m<b7|9|tboyjv}ϦB_!3{ۧgl35Xy\]g뼆o]oeqX{xŷVbص0{}=㵸NְKg U螕3Q'OdvcdȋcE>8m9*Ś>ŤUyΘxԽ~SLk<>9}[/9ٌtϩ^߮O1-3Rϕw'v^oٜ5;GBknrMl?E9<ޙ]˞8ݵ5vvm3b!k1 +];Zkg!8Df6șdN9vv-vYcc/*f~̮![}kZy.||Oβ<`퉾#ǯ絢>}L=suNFϼ>ڝ-8=:X.O?mbڽy,N2\%wUb:>Mζ}a-̛Ŀ3$d/.|8Xx76hXunsg:u0y?𷼝 v򧌅kɬW;;ӞuzÊxwk gwƌ9=6cv*0첧?:R{%Faz;}{]0Ew6Uvp<[c/=oNђ;b۸;rbkϟ2gbstw v:k>dߌ_xX#/n_2_K}bogs.ŲBn1b9egzϧЧK'=v.2ڍϸb{9>U+tX16SL첥Xڲ??q׭E=❞\R۞cnٕ^wnoO2iczo˝{M!F}eqk7xİkؼt>N~SCngS6κ9;žֶܻޮlŠNwcgn՚\Ώ/b{2lvg>ObT_kI{*^7[sˁkEu\#:;μh;鬝s3,l.So9o}gǽћMwe}`*tݛ3GKmv.asVr`O/9~G+sڊa׮Og"W{} =[s&KbWx!wgαx3/v-iw#Ķ7;ŵwss|+Oig>,|fW&q{./7yeq? 
|F)bd+엾y>ZvF݋Ǖ|?dyg/r–|9}*cU_;݇ƦO6'?+|5}%b\aqܯV6c6yvw g}΂M~'wڡC%E8%'+Ve 0y3 XYŴ^ɟ@cy1ONbaSyov|#?,W:+{?d*l;{v?܊=q׊6<Ķ3=1md7F} vLc@?ngn=x/)M{ kmڟ8ٵ9c>{-Yq?c>POslS(No>\ٟ?v|n Zbg5f[#7ŷvd]luƫz[+.^'w,G<?Xέ\6]gbQ狽rvsm'[|,FkOŰ{ýs}q}syI*g}>m忳HFY;s_l\&O)Ƴ[8R8lY#lOV š+'?t27l|`ͳQ ,{/]kn'YX;'Žn.u烾|1w>VḘ;go3f='/]?mW^.=_7VꩫO}Ta[whyU9u=kqo?jodq,#Ư/+Z2+dw'g/vă U/V:*SVs쳵>\v詊+bccXkUNy/gor3\bK},9Yڏ,|ϯ3u[}L]Ψuk;_nƛ#'u{gzg{'?6`<?e1Nf>ekŰs/s}rg;bE9 ט쐛q''Ns31sԪ>L1j|->rfc9{fOs|=?~ul,O1UsϹh{cyZ'Z[ C~Ϳg67Mbذ7 ď&ۿ1x_ؠa=,.Uk-^=ſ=|Lt5ӆU1,VkKb|vcmڟuM){o׊nnl#?|?ct<,6WVY{ϑaEXqSoo<Y:5cEomvHǹ\1Kϼzmsi9ɰXS7am6NqO>gsOx|ÏŰ~N qݳٛ,dMVK2aon6_l־,bZ\1#.?8kŠ*#X3dW|9m|֧?YWi ?S T긐UrڹzL6ozZ՝S-".rV5S>?`/!:G~:.0$;{vO;['{쐛[ٷr[sb=9M<=^ټ`#cJY>űV̽BȨ95{>uwcvo?'ŷ*}~>}kˮ߸VuV+Eoߖ)Kfc:xX=o96S_6r}^Ao9wx>k@IDAT~V-vc8P[vNgCkm}Vl/Cۘי-_\^زa;-/)l30|˹WƆ>}=[.kl|^b)0>zy|Ź*]3XtfX]}5|=rUy^sullMף5_VY'&/װ?ٰe~匁3q=u6j?g;|Ͽu~D o&Dl&?nnW<׺*v 6eÀM.P;{GSZ+}_.t;YL2{|n{t\U-cez[J^/ Ƌo>|oy󉷾ŏ5%&c?EN0O={oJ]>ζ>>k9[eX)qOf:-v;͍XUssB`M5Y[^|Y9#糝Oy[qMg~v5}{X:glc~_yuyrK?_2Μ2=M 9Ou;9QԳvWS/"~U bhN*COTKy:Vv޼.qTE>[#/GU\w7ktg)ӷ_gW d&?swM-~y1XK?Z:3i>vl_|[q"`7?w6?&ȰGmkgV}_W]Yw9s'2Xdymflb__=쉾sFfV#לg kxd-5ǿQĤFw}ָss"F%嚗wKj|?{۟i_w͹NؕgncAxd;;t5k g{鳵X'o|^ |p\1;gw:k-.}9t6v~\r3~=Y |\[1~Ĭa/#o/>xϸ2 <wx|kc/ ]yXcˏCԞ&錟yOv?.&Uٮ}oq2VҘٔ{cd̋Cz#󮰵>?g{k1ǖz5'.Qo{oY>cs}͹';)Ϟa^{c63v7b[Wc6'Y1)l^f]r)zS-Wk9~y#>?t΅3]Z޻.d,F!Vs\~꺾pN l5bԺ6=_?]|X=93ʹ5Α&8t.6sdb׷Cvobgs׌kּU}꽼n77Cwf2!+l7ֻ>7ӧU@_M:g+r%=w~Mrq>1;w_TȜ6Ų<=Jrd?Ģvx}f cy-E+9ml;9ʹ'tdԞ-frΊs.28cdY;Z{Jc2=_v~,jUz;ӯ9U!CYt6'SW1k7b}9Ÿ8g">>U)9{9=uʇkgyZ3}žŤ:O6a@F,r>=]U/vo3:t̆ɀG[ż$dz!>%Ö=ウgS{>'OT=,o{E5}JfO^^ gPI.Nɜ!kOˎn<}ϫ:7;3W<vg>0[gˮnΉȗb^?)Zq`cyХ'G6To-rn= GXÏ GF9eTeOl؃b%ON>{HNlV{6ܹ` sljU6Url>|YS96Sn|Ǐl{6|~,NLf-'sy2/f'U+,g{>Wć\( '?4;c׏Syұw]/߳φŋ;"˾x=VsycY6^:+t̓ٺ*=guKfrDi0\sWpMZ@c؍d7:cv{xx0raavF?aSlO1ŋEnPZ Ϟ$ſV=_XTs*sr^ZU<_3>;~WiŤst+qcL9k*rRddg__syx[ɛ}{?]铌68Oo1ˁMƮۍɚkڱ*"a!s!ݙZ9S'_o~y]ϖXqk- X3NWvyMgy/Nc>ļ8<[qc f俸S^~͋ ]!8Z<';Wau-֒Ρ2yqO):K~ԒyzxSu=Y~d1Oyo|^cp>VܹފS1yI.mOŭ/e՝M2W6_2g!o]?Vb9=.^sel]\+Yfc>gn~Z2a,apݞrXwk}XhMz\wX{~}K9sx}98vr|z a e8}w., 8;tf- X_~ļOt(z~-l;#uť]l[a>? 
}Ug~So1?bv|[&o?Lڝ=>o{Xp8϶ nL:;dUl!~>w&x1_ͦ!gX[8g׈".]cv,=S~y,3d s3{s>wbÙWUs繛o2yMA+{.퇜?/{~TN;Ĵ19TŠy?;NlOyh:[r瓿'u}l_xİ-}ݗb +U~k93xj's3q^I5VŽsu{s:V\*ڝsۘїs\3V3q q:罀=k,&6:^.{gW;G_kS, wOtw+q3fSUEw sl{326Źsϟ.z|i1Xseiٛ֌'?w=ݞ/˃G/{,N,qv3qmo^ZչYte\̆8xuk{毾7>}܊i9ysyy Zf^_[*?ž*b;y䝛3UvUuLvIytU}w2=y_1ere2Q= Qj-=0w>?/:'k\EU.LvY+ܕΝ1_|..v?}bΒV枷uygUE+ȈkX+gs<ɐ58  =*g瓵9c5{0M}[,&*l;x1LaۜgO^O}q@o+o1ͧV oa؝ ,t9yOχ"A?}{hngbl}y㭯bm}̷'yпb{,gbc6יsg{lUWyx}2;{seroymþ_eo9|soBwe{=_bϵ1l<7Eh7qv.w6͋9KǮ1ڕ?* =YܯykӮIpٽC_~-WWs3vl;oO>W[iU9zb:31?\y[Պřdloپ7'-]Osr56|c?}}fkǿ˲_4ֲܼ` [7gjl^o31žzSGnp!gnLͽ@ML,d~1q/&q.ŪU*y_y_o挽@OuLӧ lb1T l㹹]w>7{-˝w6X=cOgϱb:7{DžM/r+ndZ/8L:޼V͓lo|m/Us1;=?}񴷘ϼ7w#+:ɽg{kNKf+;SWn<}2k^=1[罄u٦sais̎yEqL=]l}`8o}Llst+rM_z [[q53tv֚?oJvֱ;c0̛rgs&,s|^$O{dȟ嚇1vgk)Vtw~<7>LlS]Sk}8s"ε=Ÿ:V kpojhy>-r˦{*Ks곱9ϸ-x^|yKbbg-_铿{>Ob|vw=|1^FۗuUy:5zx.9yX8~̧=ooݿ{h_ÇMrݚy3v\Yۣ+sύok{<9>U|ίb_N:}Lt6'+vϼissK =_3l]}zrMvtjm͋o{s!ml9Y7wM:ߋi2m_Ck,ȩ@s_,_yI9[Sa׋[|wwX9Y:|q!3{Ӿ,v}yx>x^[dwAXO?yh=Gpկ~:^n0Yns<̟>{Td"/c_g8Yq͋ly޲y+rO;{}*x[W^ǹr{s?]O,غ/I ݍϞG;ϵb:Fﵳd{ΖXl+{y͗^/|ٔVYqrg;-wx4ړڕ{=}c5,N# z|ݘh'շsl:b/)O؋KyL'y({؋Q߼:=;>_u7&9Ks닾#?^>17V_;Yr^3욛_ck/EW\gSZ9>s ?7w{bK{[;fq[c8/v7U2/Cog5nNq_z[kOǃX\ ᚇ\?֭9`O-^o)b8m[+gߜs3eo!o^s=?^/DyugIV6OYX+ښ❍k;ﹽֺ]<&󒖽s?rҺ@gB\XyMaw1L?y鏵دNWVX|[>ce .ϓ1;lsoy{j,.ׄ}׸ٽx_yK 33gsln]3o19;'Z{uΑwFzoɇܯړi<Ư);}Ιk躷Ǔ˕d_|bov^ڞg6?f_+ޏX[?+뿶\ٻ5c5&v^}eeGvu:/sQ+NO{z/!}&o|FZ>?sl1{i칮 aݘxkl9m&g_рp„=L7.y͹f|^ߺw5SSjsdnݛәG{ӾX "s=sʹ!<.glϿܶ/8}tM U[bce[cފ\'OXy9,v9=sqkl=g{y람,c>Ň'v]?e:xkɟ>7=k[NYy[ݲ9yɿzagov.ҿz?r>tȿTu,\}CJ9m>է{=78=e-,^j{l^wO=l:;|q{9ڙ;}Y칮sݘƏ=oawwou}_k|\[kvsb0<ͽ7=ӿvO}:rRE/ҙ>?/N%x=m!p}E O/~0C\{)Z:g1[=Yk5t~kײ/ϽL_,xsoǪ,8s]+Yy^A;}o?W{`ƋZۏO?gf{3/Mll]b\+N\Z|%gӵggէs>~{O4k_<s(7[qչgSc#wX>?ck޿d~?r|y|Xs-:>^?џkTti~*8pKQkwN^”X6~p6=?~wT2!yx*ψy1Ogi|+kΟ?etLl7dqՕb>dɞy}UL^uޫ{b&?U_'a٬k|_ÏUg_㶖?i)ZgjϱRϺ8>|_w2w:_;k]/g\g-~ǁMw֞~=juĵ==;|*qV|m9~Ž<4r7i쿇l1cɟΌߧ֟Sg]m^ǧS?j{^/| CՆ<=59g<3\Yz.;oٷ-w`#9yj{su#bsOsg9ٻ>/>-{ls͕u9m<NNO8NֿuX8ϥz^(\/u~&>h9'F_*??s{1rٿ˓>~Ϝ>'9[n{}טqhVbz͚#x&ֱw?~tk]o~77?!_Wz;OgGgho۫cw}~دė>_7ל_zFnټūqo|{[|G:.^ǭNWN/{]1}Cй?ӕu|ݏ+znKz{y]u?Ąŕk~Gk^-'ߑu#]m^O 殱e,3}%toa,iC8ןk3,3ZWۖ~ܳr^׌A}Y}_}<_wibqelOv1o>g;5ק)b@?K?^o&x#E#9{=[[~E X~q~x=s_secz_=61|8^%xf}X5b?OIwtO~q4z{^\w} xk|^s{[,nͽ1|OOz6b.}}>lX?8~|[q|ڟ߭k^/gkk1|5}߁5W{6"|@{{[ߗ~k <.Ɠw %O/{b &}-Qsۯ`]swo]÷3﫭:uY{S>"ڳ:ϕ{L;tſo~sEb7]⏈V9^1[Nbwڍb@"D @"D @ޝA?>9ԏy?.x]pχ>"@"D @"|Wޗ^j#D @"\ U+lwdw"^b{N"D @"D @"m \H eµ}n_EIq8W_|ؼZW<9D @"D @!ཉx} I ޫ+@"D @~uK\Ww~RϿ9*D A/lF @"D @"D ߆?S??}A|_2 ~$ $xqo/C??şٟ=˿şɟ*@"D @"D / U>>T}MI}=#zh'= W@O4@"D /~;oۇߟ~ƾDw޴xC?ZD /D @"D @"D @j_,*X/o!xO}7ͮusZ׹k׿׾}ؼ?(𞻒D @"D χ&ѻ!Xzx;A?М'~>L#D @"s&w(w?w&GdZc?9,D %_K.D @"D @"DGC>\}G?"?/VY{~xٿnݵG8@"D @"D@~ۖoދ$U7a{dzS@"D @~*V}ƻX}^NyJ"@_,~#D @"D @"oN}YXxxqײk5CKGOo9@"D @"j%__uk]ekm5?6G @"D @"S'}?xW-z?*@"D @~w"e//ú1}/|Ww )D3|w~/M%o>@>"D @"D @">}X[Li}ՇO8%^V|Юk#D @"D k FާP7GKKyo6@"D Osߝ]~:?5~w@>@cg_"D @"D @"|t @S@"D @"D _Av}S^WD>#D @" +7J"|w@_,d๋@"D @"D @"D @"D @"D @"D @"D W_A=@"D @"D @"D @"D @"D @"D @"D d}.@"D @"D @"D @"D @"D @"D @"D _A/|F @"D @"D @"D @"D @"D @"D @"D OD @"D @"D @"D @"D @"D @"D @"|XD @"D @"D @"D @"D @"D @"D @"O&?x"D @"D @"D @"D @"D @"D @"D @"bWPg"D @"D @"D @"D @"D @"D @"D @>@_,d๋@"D @"D @"D @"D @"D @"D @"D W_A=@"D @"D @"D @"D @"D @"D @"D d}.@"D @"D @"D @"D @"D @"D @"D _A/|F @"D @"D @"D @"D @"D @"D @"D OD @"D @"D @"D @"D @"D @"D @"|XD @"D @"D @"D @"D @"D @"D @"O&?x"D @"D @"D @"D @"D @"D @"D @"bWPg"D @"D @"D @"D @"D @"D @"D @>@_,d๋@"D @"D @"D @"D @"D @"D @"D W_A=@"D @"D @"D @"D @"D @"D @"D d}.@"D @"D @"D @"D @"D @"D @"D _A/|F @"D @"D @"D @"D @"D @"D @"D OD @"D @"D @"D @"D @"D @"D @"|XD @"D @"D @"D @"D @"D @"D @"O&?x"D @"D @"D @"D @"D @"D @"D @"bWPg"D @"D @"D @"D @"D @"D 
@"D @>@_,d๋@"D @"D @"D @"D @"D @"D @"D W_A=@"D @"D @"D @"D @"D @"D @"D d}.@"D @"D @"D @"D @"D @"D @"D _A/|F @"D @"D @"D @"D @"D @"D @"D OD @"D @"D @"D @"D @"D @"D @"|XD @"D @"D @"D @"-@IDATD @"D @"D @"O&?x"D @"D @"D @"D @"D @"D @"D @"bWPg"D @"D @"D @"D @"D @"D @"D @>@_,d๋@"D @"D @"D @"D @"D @"D @"D W_A=@"D @"D @"D @"D @"D @"D @"D d}.@"D @"D @"D @"D @"D @"D @"D _A/|F @"D @"D @"D @"D @"D @"D @"D OD @"D @"D @"D @"D @"D @"D @"|XD @"D @"D @"D @"D @"D @"D @"O&?x"D @"D @"D @"D @"D @"D @"D @"bWPg"D @"D @"D @"D @"D @"D @"D @>@_,d๋@"D @"D @"D @"D @"D @"D @"D W_A=@"D @"D @"D @"D @"D @"D @"D d}.@"D @"D @"D @"D @"D @"D @"D _A/|F @"D @"D @"D @"D @"D @"D @"D OD @"D @"D @"D @"D @"D @"D @"|XD @"D @"D @"D @"D @"D @"D @"O&?x"D @"D @"D @"D @"D @"D @"D @"bWPg"D @"D @"D @"D @"D @"D @"D @>@_,d๋@"D @"D @"D @"D @"D @"D @"D W_A=@"D @"D @"D @"D @"D @"D @"D d}.@"D @"D @"D @"D @"D @"D @"D _A/|F @"D @"D @"D @"D @"D @"D @"D OD @"D @"D @"D @"D @"D @"D @"|XD @"D @"D @"D @"D @"D @"D @"O&?x"D @"D @"D @"D @"D @"D @"D @"bWPg"D @"D @"D @"D @"D @"D @"D @>@_,d๋@"D @"D @"D @"D @"D @"D @"D W_A=@"D @"D @"D @"D @"D @"D @"D d}.@"D @"D @"D @"D @"D @"D @"D _A/|F @"D @"D @"D @"D @"D @"D @"D OD @"D @"D @"D @"D @"D @"D @"|XD @"D @"D @"D ޹jvu|#r)t}N諾RbcC0~࣑4A$A2 jHQC41x~\:[ X緆9}gzgΙs֜[ɚky~Y/)$  H@$  H@$  H@$ %&xkN$  H@$  H@$  H@$  H@$  H@$  H@$  H@A@arPצ$  H@$  H@$  H@$  H@$  H@$  H@$  H@%9 H@$  H@$  H@$  H@$  H@$  H@$  H@$  ,A]$  H@$  H@$  H@$  H@$  H@$  H@$  H@Xb $  H@$  H@$  H@$  H@$  H@$  H@$  H@$/umJ@$  H@$  H@$  H@$  H@$  H@$  H@$  H` (,^b$  H@$  H@$  H@$  H@$  H@$  H@$  H@rPXԵ) H@$  H@$  H@$  H@$  H@$  H@$  H@$ %&xkN$  H@$  H@$  H@$  H@$  H@$  H@$  H@A@arPצ$  H@$  H@$  H@$  H@$  H@$  H@$  H@%9 H@$  H@$  H@$  H@$  H@$  H@$  H@$  ,A]$  H@$  H@$  H@$  H@$  H@$  H@$  H@Xb $  H@$  H@$  H@$  H@$  H@$  H@$  H@$/umJ@$  H@$  H@$  H@$  H@$  H@$  H@$  H` (,^b$  H@$  H@$  H@$  H@$  H@$  H@$  H@rPXԵ) H@$  H@$  H@$  H@$  H@$  H@$  H@$ %&xkN$  H@$  H@$  H@$  H@$  H@$  H@$  H@A@arPצ$  H@$  H@$  H@$  H@$  H@$  H@$  H@%9 H@$  H@$  H@$  H@$  H@$  H@$  H@$  ,A]$  H@$  H@$  H@$  H@$  H@$  H@$  H@Xb $  H@$  H@$  H@$  H@$  H@$  H@$  H@$/umJ@$  H@$  H@$  H@$  H@$  H@$  H@$  H` (,^b$  H@$  H@$  H@$  H@$  H@$  H@$  H@rPXԵ) H@$  H@$  H@$  H@$  H@$  H@$  H@$ %&xkN$  H@$  H@$  H@$  H@$  H@$  H@$  H@A@arPצ$  H@$  H@$  H@$  H@$  H@$  H@$  H@%9 H@$  H@$  H@$  H@$  H@$  H@$  H@$  ,A]$  H@$  H@$  H@$  H@$  H@$  H@$  H@Xb $  H@$  H@$  H@$  H@$  H@$  H@$  H@$/umJ@$  H@$  H@$  H@$  H@$  H@$  H@$  H` (,^b$  H@$  H@$  H@$  H@$  H@$  H@$  H@rPXԵ) H@$  H@$  H@$  H@$  H@$  H@$  H@$ %&xkN$  H@$  H@$  H@$  H@$  H@$  H@$  H@A@arPצ$  H@$  H@$  H@$  H@$  H@$  H@$  H@%9 H@$  H@$  H@$  H@$  H@$  H@$  H@$  ,A]$  H@$  H@$  H@$  H@$  H@$  H@$  H@Xb $  H@$  H@$  H@$  H@$  H@$  H@$  H@$/umJ@$  H@$  H@$  H@$  H@$  H@$  H@$  H` (,^b$  H@$  H@$  H@$  H@$  H@$  H@$  H@rPXԵ) H@$  H@$  H@$  H@$  H@$  H@$  H@$ %&xkN$  H@$  H@$  H@$  H@$  H@$  H@$  H@A aT+~?aGKu{]?ҧ/\S~Ra)Bc o({ {soKqk6>o,>Ŧ$  H@$  H@$  H@$  H@$  H@$  H@$ J@aj]s7Bw}[h#"~[ҽoMo 6J"n޽ݮ]8"Ήulv׬Yر)ƍ‡#.~ݽ[ň[o-bDF qN H@$  H@$  H@$  H@$  H@$  H@$  H@PX|:,BNj_~+;VĖG^^p"(ŗ\rI9x}UN '/RkuJgϞr}OOiӦ"e?Rr?"p#GK/[~}l9\$  H@$  H@$  H@$  H@$  H@$  H@$0O@a< {@Q!/"bĜZL$c oRD.#H\7BJ ۡvܱ)^Ww2od??$  H@$  H@$  H@$  H@$  H@$  H@$  ,7˽+>Bň #RZNTE\ȳ #8\a8 g@(Os7"~[:' FH>B/ԾwY𶷽 64~}%  H@$  H@$  H@$  H@$  H@$  H@$  H`PXrvY"C(:T#t?S?UĈRݻA[۾}I e 5MS/\Zkǎ+-޽;7xccǎ.*{ڟ>b]vYi&G H@$  H@$  H@$  H@$  H@$  H@$  H@X޵?'X~ 6r*YSRsr8@ԋ/"H 9#Z_zѣE X,]vNPHxH|}ZK@$  H@$  H@$  H@$  H@$  H@$  H@Xu_s+HNhDb´9A5"GNr2ܣLMxG9irsH9ivc3>̝ʜҦ޴pۊySǎC뷉{c~|{K{kl2{1}yF^f 
R88yq0ÛbkM'>cַq܃P-5މoi;iyNLgz 3ΚڔZϗb_|-p Ic`_iS~  $  H@$  H@$  H@$  H@$  H@$  H@$ Pz[ CG!`D#^|E8/q} ͂/cW_}wSl!/K?bB"̼ @3~Ł>b?pFD=.!.{y_h:n2-1–6~th3gf.+bĎMozD1%f~3'+kjb&47va@Kʖ-[F\طo_Ή3MJum?|ֱsMa֏߾ow{)-qyoڵݭ[v۶m+]LV$  H@$  H@$  H@$  H@$  H@$  H@X+;f[D"D ؔ$'#"En 뿃!;杵ď #l=v؜>gO ba.DwڌGPiT(}0WN* "5k zΤnYXQ <#\$GChJ.c?vg - +sAnô@~"\&^Zk_ɼ>[a=h A7 vyF= Y$  H@$  H@$  H@$  H@$  H@$  H@$  ,9ި@Ab=$)潴 1/"=h!߷Ml?vy|3'rp(ƽe3);Rf}pL㌉{bP*}bb=x?˸Zl2s`'!0e\晰pqlȼu^?>3bd}=Y#s`=Y>?n\\cn͒7#N'ҒƓZN*9s0?-0TӇ/3?m̗AND< H@$  yn9T5|\wԃ6? Di͟}yT6˔kS,~k?d$#r`ZhSd$:HFu摌## GG20V.#"k!揌5OY?#k\$  H@$(,^i+ C/0#CD(.OEh}*b>Mqo8}T!"g󶷽my/=0Of2>bEl6L(?C#~_rb)(}h؎`17Us\lGK=Əǝ8Ys 碋.cϜLގ)NYa)ڴ5>p&/O da ;ey-M7֘u}cJBr Ke>9kyme-s?c5q*w H@$  s=]w}򓟜^h+_iOOŭC̭Ԛ?2経OFn~FMF2mZ;4:HFu摌## GG20dT'Pa-#figQh6FpH@$  H@V׷Ν;ү-O!xۿwر^(C21B/馛K.?~1A("bDtVG1O<۷{S\qKnݺ.+qj#DA%FA"b>ѣG'N3tv>\|bq;;߸qcaڵEDp0A1C0 8Dp9 `ȑ#_|1kĎT7fq 6#5M=P{09Oݼyse˖R7m40A\M.#D\[e#bb?u6QD7''?xD"W" H@$p~{キ|Y?|_[cԚ?2経OFn~FMF2mR;$2e4bROFߗ׿;'Cpnv1Wo喎z7A*N>c==ݞ={{CP`[!GX<"/kI뭷VGքyR,lPO%WLijDW_}uZ#R?y朠k}JN{;}}#Ɨ4dľ`_žP=Je.g~X$  H@$('>?4Nwgr~;x=?9/КNd4L~yԢO}?G{ws[F2Wg#h>sF{(+ Q'QCW22zOFc!_8 $DEJ켏Q5spMaAD+'"ΦA57s{]e^2#~9YG+>#$^N$F'>Fd<#.<#F+g\|glSlS5Wև< s]ybq"`g?4]rDa?8P)Wm{g(?ƥKN,]gHoPxθr8QN~!~`JNM912kB~[*~$18t$1dOx8%CL~3X9{r)r H@$ ?[?5^x;|WՑ|/?3ȃԚ?23)yںOZucU7>2cJFCTFhЕޓ(+ Q'QCW22zOF揌2:?nY_7ɈϷVF䉌Bbr+lDF!1d6y"h2yڌp50%?csHL޵Zɇ9dACN嚼͎;wsaМr8j۬%뉨S}~_zvO= jYCRsg5!/1!~ '=m?p9{>&l|R->Is0gORY lr͞|:,$  H@"}cCϸZ& g}Cl['[5dTk\sdl8LWFz&# 3?C~'gZ>Қ?2_B~k5dt~5\7}Zd~H@$  H@V׷Ν;ү-O",{;"7NEaMsG qO'ɻw;V#C[}"C@kD\sM#C0?i >?=XY xO$a_FIeW^Yě"b~A$' 6!2 #*`k׮Rqe )"TF}\KvD0k|!cAY/"LMf >>2&_8ģpym2gX+qbHnb~@^ 6D\#ENO,gr!l7"gwCN6W[ a5ց}(\$73?{*#Eڷo_ $ cɱ6yQ|Nr,q߾o{[~a0 2-ĘyNf$  H@|#w7~߿I|jUӺ hmZ\!K"C̋P|G-s"d_rF\sMs#D7$=ڈ9anݺ+(uڵe.}|K%&ݸqcm۶n͚5Ewq~iܴiSaÆr/w_Gd '(ȑ#DK=vX=tP ڼyse˖"Xߜ @IDAT'NyhYl3KDbMq#ܦ"fN͚Ws#~|'/en×]vJ[0g.r3d/r8ubuX>x"'/>KX]h~ g3i?'`{xGͬ%6<{^NP'gN| π䗜 Q"tDX!\DhZ!ʛ&B Q3IDwtv[v2q#NED^r%EJL viC޽o{]^uUsBk–Î"O<=~r9PbIAN/" A'o|>`>ٻ_"~v DjGE>Ož'8s^ 8&g1'~*"nj>cV$  H@|/-|_,?K =wg/}K=zr-ݟFkwk>揌f֭5Ẓ}rMF2GlX>#[JԚ?2:jnc2֭5# oG+5ms{vGf6GI@$  H@?*Fōn D !CȊ:N,F@!QU#CTs"Cpʜl#l]}>(-qz0bH&Gag)BYbGp=" 樉{Ԉ #FdG 7*G. ތ>k[lP\"d 5"L5>>g,~7RX8/c[uȚ1"zfq#cl2lb&1fqC0- K*}ryډŰ~>lO9 8X-8`Z |Y/Xf‘?yyY|11>s@'G8uc<N$~^12&SC}0kO`i\c3'O%?ia+8-$  H@g K> {'ggGO]55ZGF]0ںOyԢO='2 ɭ&VF䉌Bbr+lDF!1d6y"h2JuGn|gE=s\s\s7e,sP>ܟ>6Ceh̹a~9BEE#M6i=n^q\Yg~?g5:Ylsۜo0nk H@$  L@#9C>{w<駟'>?u3њ?ޚO#uk͟E\~ -Qd^;|4>#[JԚ?2:jnc2֭5̣3\sukQkGH@$  H@? Ͽ5[V!QNdRGhI-# *~l0__= :rX{:r?% m-V!~,Ϥ"ȥ"D 99:'bs ڵkuuV)7n,{:/4`/y)8n߾}VNZsblrr+9A_gvN8QN<[5;$7s"@g/ϗgϞooNsngs55ZGFezk֚?-Q>n&# GG20dT'PaɨN>.bٰaC+(ψqB#&f cy"*=YcNJ'{@K\~36o^3bfLIbSJ۷Xk6mk֔C?tPGD|\sM3_qySpˍ7X\ia穧*[ba;Xyǻjb=-|?TXS]w]Y~Wa $  H@f%kO>y+_;v~~|/; SkhomZ<3bDkZGFl% H@$  H@X}T#5_PFq+#Жܫ"E ĝ!C\H=SsخM?mvSJĞ!b0_Xѧ TDIA J?F$vA+Ҳ-;:bP֌9}Ħh\3މ<sލM5߷;e bUA=J6̒E]hԾ0=ZʐLXRc`Z؉M>%v &:'µV9awfy.9bdr\8r"P(XRkbj6|N\}ح Bc|g} ==həU}{%  H@$p>tyڳ{;yŸњ?ԚO#2uk͟E\~## GG20dT'PaɨN>a\ skD9փurr5kЗ8(/Rwrk_r2q*؜e?;i?0? q{- H@$ V yߥƞ!|j͖[kG-dT'PaɨN>2fnZ|3?EM=X ƐSc@x׮]g kgϞg۶m#mce#Gp E|\bl Zr:dMd=֍}So|699m>?:wv9>ݻ]l>O>dٓˬ/cr~&[n)BfbaL-?'ymbz|?Ғ0\%  H@$p>G?=gi~c+YgCLԚ?2-[[i1Zu7ɨN>B̝X5`,?bN^YNiЊ$kOUg=S G$ Ӄ#eX5W nb!!S"<%sb1sc+뒘[oYwra˚Q3,12&w39ozx5FsZ1y@DLa5ApOe߱8osӊiCv>b$  H@Z'wS;:fqSkhomZ<3bDkZGFl% H@$  H@Xu>(!(8Bψ aM|w "L#"LE\G˽g،#,x݈̃8öoHg !dD-Ϙ#C\<'N|>= >X(b̓E͈~.qL<&!y܈sB|p{{;&a=Gn>7kz0O%?Xs/@~?;e.J?~?̘'ū$  H@8}뮿>رcs55ZGF֭5Ẓ}rMF20dT'PaɨN>BoD{r!">0&Re9%s\;<1(?MZZ5d4&hmZ<3bDkZGFl% H@$  H@X^b;w|IHA4"Cu{-h!?D֯_vmW\]pE3!ˆ bE*ZWboڵELtE菹2bB@y*`rex>Yˬ)>eY ? 
҇' M6} H@$  }sݓO>9MK?wy155ZGF֭5Ẓ}rMF20dT'PaɨN>00>6I [ށSy'br~~8ʚsoK K\Cyb+}؋͌{ '@ ˬk Q)}r7b\<`Nxa+/*>N@.#Fp7b}&;brهCg!W90m?"VOٛgH@$  H%կ~{ᇫ:55ZGFe}k֚?-Q>n&# GG20dT'PaɨN># Pc?޸0"ʴ2sxq"u\ɻ617y# Qx87`6_bq>ĚwiσPɘ3;f~}b 9uboruTs"#^roR? 37_W-N&Sy0gOܼ~~汕$  H@@?/|afW?ϗ3p[[5d4[nbVo2Q@}y$:HFu摌## GG20dT'Pa15N##h6$  H@$   WZUPvNi}ge/1œ΋!>GȊo3M9K..¹w;=z;rH9Y" r ,~÷ .أIN/NYeTlO]D  b`Nbkp]tQ^l>|x$v^bZ%Nl-1 d9=Cq'vrrlNEF9>'vYl2_N؍>wlWl"Sx6Y{<_~HNpal^0~跬;ƒ|dQج1be>3A {<#Y x:m?sqsE$  H@gֽ # ]zȽ\twyg~|/{{mݺuт[`Z5d4[ʷnbVo2Q@}y$:HFu摌## GG20dT'Pa15N##h6$  H@$  .o1ܝ;w_;ZmQ+=Sݣ>ZNE܇sݺuE"?5Zb/bA"(}GL`# ,>Bك*"IGxk׮g#}gq̅Mĥ&"e^Ĉ0BNsr/i#f_ˈw]l">v"䝜 K W^ye"%*Kv<dA|Iܴ_߼ys}neaN~NgN0\ֈY?҈%J>"Fg - ̉J6COn>j\$  H@ߙcTrww}w9?y4}t:C(Ԛ?2-[[i1Zu7ɨN>>b{Wr]-5BCE'%n >0FD'>;;1 5-9<ؘ 澓bT$6أW|b:6#X7>çf8wM`#؍0܃Gַp%F51xE)s#%w&/냘_봬*>+0ߙy/ş'( ߙ&s%I!ϙ$  H@@/t<Ȉ|g>S~P+hd csk1JkSk>揌f֭5Ẓ}rMF20dT'PaɨN>-'p'sDXb]ĭ1c?15"JbO/yxIȉ#dnx܃12gT'~?ω93]ٟs1_;n3eK\0e7YJrTqU mK~D=++lEӒ;'`Mȯ|~= ̕39-_YC֏gF>x?I3'k{p;|&Ix_$  H@__ow^oV^ڿ__vǏYC Ԛ?2-[[i1Zu7#]w}y}FZ5d4Z[1e֭5̣\kmZ<=Z[i1ZuSz2Df~潳' H@$  H@X9fS-q;=xn."=N-A+,"C<"Ys[<! hIm۶"`] {̱}2O棥nܸ؋%ibJDNEX{au'<ئ;C扝ss֓>E2W/}Gw>w>~nAUbcwͼ5d<'L,|ff\؄ 9n&#}Tm~/vEk|ji֭5̣i3ukh>WZ[1e֭5Ẓ}rԤ&"yI@$  H@V+k=%i)"0D-bt-1/6"!!AĠC؈O"C{}kĻR"21!"6y;}_8>bF c?bqpx gD&ȍm|@Js09O8>d3cyXb%_osb K򼿏b=wffP` sZ$  H@$0 ??z衑||y믿{'Nx{4ߝϦ1SkhLomZ26Bލa>Fć#Ԥ/g`1$# "mcyw#Ex9MT_{GHJ 7-~1w-`~i>p/y9"6s1l.-܉^̐~O7~g{mEk|je7Mo[[1Onc%S[kG'O[[i1Zuˎh2揌&e֭5Ẓ}rFЕޓ(+ Q'QCW22zOF揌NWޑ$  H@$  FN͸is ri*ji $"ň49ٔ>m |eJc;~BKay9}SW980TPI]fjsZxH.}xDVH$  H@$X{GO~#]ʯJaÆ!O=φn>Skٻې;OMMҘII$L3i''ŊT$*kB)h!V*8T_ H#EDR$55I34{}9,Xg=w}Z{3^znkgh1ikF"n3dw]k]kѮȵی5hWmFךa+r6kMv0vѵ&{\Z=vEoim񤶶c^ @ p H,Z$R@Y$R\Y7gKڜg.[S2|S}K"~K^r1/f$Ny-C)w\+1s;:qfM @S7{vwg{6;r)Ŵx׌HgyFo7F|g xa4F#qxa4F#qxa4̑eat01L @ @vx+C\clb:_e?g͈aʚsQ攽\8>~<Y?׺{YM @nG؏_ȏt鵯}橧<×WއSkim0:ޯv;vlGڏm}vkΖ @ 5w: @ @H7/Ex?''M~~hWr6/-ſҾ)Ŵxm654nnۍoM]q޿s#F r?XgIkm1-Fǿ1bThz6E~mm0:ϱγ݌Fƛ @ VVG @ @_Ϳ7}ۼfxK_k=~6??)ȵŴx]׌HgyFo7F\wuF]{-53h~^FK,2Zj_geRc:.˽Ԩ{gJOߴ8F=m1ڧrySe}[\Ǿ-FT.ctc}*1oT=ӾLeI[ @ @ۿՐ @C>lKo[Y'M_WrqhmbZ[<.nftAsrF.VΡ<#/be]\0B;E8 FW8 4]P\at +\`tAqpAu~_[<.atAs @ pxcЪI @7??7q_l6^l^mܦ7<䓛?fmbZ[ @ @ @ @ @ @ @R@bY6  @ @ @ @ @ @ @& 4/g @ @ @ @ @ @ @8Kgl&@ @ @ @ @ @ @ pӼM @ @ @ @ @ @ @,$e  @ @ @ @ @ @ @iOr6 @ @ @ @ @ @ @X|&h @ @o@IDAT @ @ @ @ @ H,> @ @ @ @ @ @ @R@bY6  @ @ @ @ @ @ @& 4/g @ @ @ @ @ @ @8Kgl&@ @ @ @ @ @ @ pӼM @ @ @ @ @ @ @,$e  @ @ @ @ @ @ @iOr6 @ @ @ @ @ @ @X|&h @ @ @ @ @ @ @ H,> @ @ @ @ @ @ @R@bY6  @ @ @ @ @ @ @& 4/g @ @ @ @ @ @ @8Kgl&@ @ @ @ @ @ @ pӼM @ @ @ @ @ @ @,$e  @ @ @ @ @ @ @iOr6 @ @ @ @ @ @ @X|&h @ @ @ @ @ @ @ H,> @ @ @ @ @ @ @R@bY6  @ @ @ @ @ @ @& 4/g @ @ @ @ @ @ @8Kgl&@ @ @ @ @ @ @ pӼM @ @ @ @ @ @ @,$e  @ @ @ @ @ @ @iOr6 @ @ @ @ @ @ @X|&h @ @ @ @ @ @ @ H,> @ @ @ @ @ @ @R@bY6  @ @ @ @ @ @ @& 4/g @ @ @ @ @ @ @8Kgl&@ @ @ @ @ @ @ pӼM @ @ @ @ @ @ @,$e  @ @ @ @ @ @ @iOr6 @ @ @ @ @ @ @xYF-o7ojy #@ @ @ @ @ @9 ?lo_vYYIg/}K7/yK.IH; -KHw+Jo]wݵ]6gDe_sWb;T\%#0mزv׾]ַK$_x׫Eok峽~^'s[une\I}~V˦oi[ @ @ @ @ @ @62t*Pool>l__|c.K&*Moz{ٮx׿~;n"p%Y>裛~x\\2]I.ouJ}ի^yTc oxEr񭈡;_W6Ї6mrqI&&ފ4ߓZqmKKey,i MҗM.o-KxldWo/T(r|oz-9_G.۾Somo&M򭲷*$Gzmݿ/n^Wo;cl'ϛoU<{%~\3gz}.ODyCqIޱ㼑/6iW$N2i:j!ILo<;Ρ^b$@ @ @ @ @ @sn^rPIB%5`.D%J*Zic $~}~z}}2۾Sej_YCuoLKIV{36p{^ @ @ @ @ @ @X69J*IDКK2덖%uE/zv9/| s F38g_^tUzv|w$M-3%hw+7Comy $P_vW^z+^˳Xܹݣ}Pz}1k܎5uo?g/PyڻZ @ @ @ @ @ @3X|& u.aNO<җ}Sqo,dǒJ"*_vڼ5&-J828'%[Y~vz׾o&?Ko9V=8B>vٳRϙ @ @ @ @ @ @n[o|Ǖ0% L6I-J*y;fI&,P"doa$x(a]'kiz_Wm:J.7Uv,$۹LhoM],x"y~[vgxIͭ>g{7<߾}gKms}Wq]v]m @x'!͚ >w殻_[<yil7ƈQϷM]O1F ?C?bt\Џ8~~3c]at01L @ @NX|'-k!ġ% MP˒&1rBYYX4unFkꚒN*[ݯgY.Kzk֋a3o2'?ۓ\VlWVq}i9rʛ˺WnY?ħ=ˮ)CeG:o:VfǛceVFR}˛1MZf\<- 
ޝ:6g5K)cj|M4m8r[QשeWa˙i333cWߟz{s]}[v~}չcMW յyYS<գ,ƣf復x:d[|`RzcqKﮛ[ߝz|47}k|;zev|z.i97u9-w۴}3MֱVS?k9u~G\G %7s3f_2{onvf>7 @OO=?o=Lv5e?H,^[2+o9I}m7[{gOe~6^IlX-/\ɴٴ\71z47^X˛5eT۵,ƞ!>vk,LƵ7];㡾sK_Ҷ_zz[ߟz㡸&q'mb/~q֫߷}۷]$w߽[fW~^ŝ{hnd[svji&aoz~כ2<__xscQeٲ\KO~Tf+m7r }mԿ4f?o\ӲXm2/[=mn}bXӭg|Ώ?vngR/sk;}^To{m+ @S5?ة;k.K/m뻾b)Ŵx׌HgyFo7Fz|g;$su?UCk\š5FdgtC2W3jqh!]8vF]mэd"@ @wjO'|7nu&jW'M4xɹ}{˒#{+u;^~n"X')nJ\ꗨ[/I[o*榒JZl%Mr$~m/\Stɵͭ7}n^Y%Ur*z=ka[ߝ2X|d,qkZvnֲ*u{vn{۹i,1$?omIJIpg)w~w^$-=Vf۹=)om[1[~I(-{V40߱Sw󩼞SnYNyLmnlZ>_׷zrٞurY֬OL|My߲qh9ϗgo}[mzon//n>mקWO<]6}Ƕbm?GsrG1`13suM[n#?#ץLj?kat׌HgyFo7F \]š5FdgtC2W3jqh!]8ZZctH~FW-1:$suF]ӾjLի @ @_WR߇zO_a_JT*DJ?]v$tKm_Xo"-h:/6h4 px̰d۲iJ`+ފXbە۾;ecZJଌސ=K6,rb(!I4Sۇ,zG^Z εKյr[NUIWw~z0յzMݯ2'%UXW[f[lu>VfdeLݖ˒͝ST"dSFWsm4I%!6~{7nLȥyXi_s[D?[4,}:Awޞu{Qw}$n^ŕQOճ~ۮÌ, zyPv=278{ݿ+ݧԫDݖ9y?%gO=3n͚u0B3.%r[߾]طuQXY[<F7ft, @ @;CjF؝Q_BrBrN%MrX@$Ĩ75IN{,v畠TRU U%ZMYJ+jJ6YMu%>MZ.IlUյ:w$u]MݷĹ)1ڲ&8vY3 m&_%uM.e3Wn%oՆ_y%MZo⬼mI_Ӷ Ygs,+ʜr'eeVceWvlo՜_nۂ]Eckvg,,xsn|ur<F3ŗLӆYo9Ͽ2v1mRMy_=ڽX>CԻ~VY\s_q==ka׌HgyFo7Fz;H^2:l3G%6sH^2:l3G%6sH^2:l3G%6su~_[<F7f @ |CX|6}UtzK*idD}UBQ Rf:J*ĢJ.:6u>Ol{m,77+IN[{goE,Y%|}wdݧ{sIN/ [{?Om6uW=&ɭr*e,v8U%uX[V%lܔU쭰mkzkfo,b긬$޶ݷ6+i$tя~tYמu}ӛt1o0=Zvm`$:P][=}ʬ^եXSoo?Tڹe%>W(v(Ѯ:zc?#շ\_?s~G[-kU~U7uߙ:~{ [gQSsux=CbϪPBg6v/ )vXqdWY4I%Ol|tXkbf⮼]J]kbNѾ\jh_Sul6d?AS5OquzuʬUf^LsϺlP_^_<3Ӧ,k>zv>{,r3/gWG.Ω4^;?ۮɹ_ô[sPo޾YQ~X~UfھZgm3ws]뇞{V @e>k?{E?3ϗלgGϿox6w)Ŵx׌HgyFo7Fz5vz2Εc- yEcmk'ŴxƈQ}|<ǚv;v3o& @ p'H,ZֱJ*%2T[mW B]S"T`T"US Z%%@5IQ%}CQe%$9XrfX= L%~ V2_eUw \+sKn?h]RW fյ9Lbqa%V^%V^I+Y9y?[/ y%,w(Wfn%HZ[UfvmVIUvp2$+wW攑okY7#Yl~l*0 Huխ\n}q"g͙5PZ?([?FL]w#%S6MG>r1+roeow^S{}>[J_y&rl<4i,4sslC-XգI˱8u5mdd~yscd9&G̹2^7}Oc2JnnթSnVLۗXV݊6gVL.3OycǧOT~YXe֯EϷ=_Z֯6QY~%&lDe_v\y fehWGWDYxRZ[LkkwH?ҏ8>ݎcZm0܌}L @ @vx^A%0Tb$U$ldI2+yl,4i%YuѾ:J<+jLرΩJ+y^S畸U}J*q605*--70uo*!_nP\dSVN=J>W?Ogg]\աkr3ݕ3WSge5Mg5w]+Γ$Yr[nyʮsNWViuSb8T9 u\meiIF4vYx7wmU?[S]~}y(Y8T?Z&vcf}0}sbT~seէeƿnjɖ%Ig߱1tȭ-Q*s<͵gNlS:SeX͹v9eزji[_nLinm]طh}.{bO>F=m1ڧr9-at @܉Vu.DI*yK Z&^Pwq,ʙk[o9PS K%*M",as&I{Sו85QSYVvmYٝ;ev9Ssr<.w9󋥹$Tgi=)sc?f>e9qWf1t|Yn粽fg/´s5K9sOm>e͘}}diu3MIʪ]f|L?Zkvuz=7u]vlʚX];8uY+4O y?u^Sǹe݌i'řA} ibɣگyK]&E/'!ϲ{훦yVN5Ml5Ǭضt{Wvrk߽Ʀʙxƌ@IDAT;'S6O`>Scb[]˹ccyh|tʝyy}_ֻw){̧89,:;p??rgËcs7lrmbZ[W9?\olYo=/| zj>_~˭^ỉiWlԿCxYp顩5,bh\皋gqS~`չ4cs׋Pl>獿gdGo /nG|ӟ<#v}K_xCq{2e?•gDŗc}"1\Ƕ}ɿo}pދ>o>=6;gvO~r6˾ҁX[< !Ǚ1#qel=#+ϋc~t]9v[[<~4;~4_m-gcn3/#k4,  @ @(۱Rtk& RJL*J(*{.'Y$,T}ݷM׾M`-17ouM][Q/ז_ZחXWIgoy[6y{%į֗ \V%9Mrtɒ%͗mT k߽]/A;;76YiTW-eq?%Ko1C6MsKz^VInx;r>-rsv,K@󱧓[ĽڴȡsĖr}*vSyɴOչvjڮI-{UonXu%浬xĊP匇%o&Vڵݩy7@W?ˮ{=mR̵{ύWw^-{Mg_ک>ѵ=3Z$jxַnɤx_cxhθ.ݩ4%q9V,]?}bsK.y߾wc9㷸+jYzf\We"hXz(l_-^Nݷ2>Om˙gUI=㊭zcwpe >/Msd]ji}[ol<we=,Λb^ͭזź,wywN7UNkYNݻgeֆߌhoZlY]2iL7mkytnSD<;}w}m{ g~fw?y׻uݱx{m1-F7ֻnkgh1i||gb{=#OJkim02v[[<ѱtmmGQ{~ݮHַo׾#7ۜE @OrJ衇t(_0Jp*Y~~aUQ C%8oMQ"T G%/% PTXIF%'U^E%XvJ&ݒ~闶S%CI=߳M,bhY2ZeTvYݷidIDQ,[$We̛+[/ɷ$)d2aZfEe6pUZ Z]UvXcOs*7늻yS}sJ%e\%D&Ǵs^,Yxuˡeu$o2v))~V[Y{79.]WY[ա\ny$v>z6[o'g?aS%kɑ-k껜hXm۲8C+bj.aέw{NTq=Aڽjb|޶QkdT{4f<%6iT{}ퟲR<~ͭ7 O˒4c2]zZBsy]&g3*/vO-9}s0GZv\խrkO:T_Sս>QYSn}f>=.޶71+e[/ϗ<*gzu~R9,3NӦFsMտ3)S19>yKSl[VGW3ۛxV}3vntoZ|Skim0v[[rv+k{F-ate,wmxc=я_?ҏn~o۷k^m"@ @'p3D(_R)Yi&5-7[v9r^sΝinw9甼T$dUWE֗sIXW]?s a:SeUI+-N9S,;}m))lI@3--+;b]m9Kusnq>eK8uԡetʹU4OB9$ XZuL*sA|lkʜ6O簜_Y77>N7yLON*KƖdر)OÔ3}{mO|O}[ߍklʟ0ex1r~ޱ9so󌃹s*k硲'SXkyl{ڦ'ΖWKڶv6oiܧw}6qilE>P=gmUo汴Xzg_]לuvCԡO{>ϳi❺|X-ͼ[6wlr7y\'@N/oȾO×/PΧ71O^[<ſkkų~ƘƈQ]NqAF+}Gǂ!xxRZ[Lkѕtߵя+nkG?:~γ݌FQ; @,]ρ@IJ%' VR$ձJdD{ɲe eDJ >]{9keX%+LX5]?4MbS]%g@_ u1KxID+}5ח}O}L>m9S9SϫlaOi~:\.kφL{-F7=ڮo}ƾ>Uct\\} }_~'<#5?g7~Lv-갶kkų~ƘƈqgG ?C?bt\Џ8~~3#F7ꌵ9-Fэ 8 @ܙ3XczOsUW [%$Xr?mbXIL%O}_T.D:X 
e%]_2ZɅ%8VWI%u5Ub$uN1U~Vzի6]ߵy[ߺ뮻.,tIY*{w}mJXST//o>OlՊʭݣDpIxqW_m$W,9M"p1]mSWss}X>sƿv}$*ynrJϾڥc__ 8 VR$eT;h[ݿ{Wn>趞b}eveQ"we8v?^zeYڞ]=Zoy[6-AέwbmWjzwN In٘asf>Q_뾍f'ZN6|#JPm6=jΙy\mR_kCe7wN}ϮWuy\󥶩ݩ}w1 }NdTrv^ڳs+[3vktWβ5x֗{&]۷=C-4 @^>}'?yMa}{>;l-O{c٬-겶kkų~ƘƈqgG ?C?bt\Џ8~~3#F7ꌵ9-Fэ 8 @yWz3(_2.ѩI*٫wNIMԹ˩:6?etT2$luNǖTFeT]vN˙xv+˙GI`˺UN)euJx@XYsesMյ{5;biJ+Ʃ߷OL׽yS,tNeTnzɈܲ%M\̱guo^?Iy_lY̓8[feײ{ٮ)srMj\;U\gTǙWmxi]u~Y9yWNǡsKR( t]}o:}v,;ogHe<6ԥPܙƾ:M?Ϩ72=溩֗7r=gڸ19e,{ͷDܖ6-y:Eg$ګvn_(zgI̳>}5oc9uoĶj9Sԥw{wSeLθ*f{^*ω7H׵Rʮ36Za|쫟} @7_}{I.~GNCv?S}atc~mx؏v;>1:.p qgGW,KrC'v CҊr PU !pKPA+ ԻJj{DA"a8$8gH}ͳwֲgi2;=Yk;ٳ.~z~8iks|ysr;y׿f~ƦHϜg}}8s?Vl9kk9홗s猹mqڹ˶&ټT$ڸ~sr~DZS\Nکn;,3Ϲ3w?N8 ;www|o/8~OcZL[2kxG[ɼo ! ! ! ! ! ! !֍:ckN[< @ @%pwUP"''moXUA;oZ*jGUXozӛ|>);ϧ~zUzmPzσgsx?t|j;]s^8u?~Pǖe 9vylioas̱z^4ݸ{ oxl[{>tN7g2˹+hNҶe9WbNqL~N3smKR,; @~^~o ROZO۪p/}W*i>U-g?N5EZq]1dxmLEhm+n ?ߖGq4?oy[yeONT[-㯰mo{o/=)2]Y[%>ǭ6 w﷍yg1ܔL^LN0x{3eѡ]wmvG>`)n8jo9e> pӱlyñ6kt2Km/󙷊{{~sX-+YnXkGc1PAsϬ5]|O|~'P1.kNkVTA{̫;W1qϸ3ʦ,N=G׾X}Oy{v{{y\w[1m-byZ<[̣-d7FϐGϐGϐGϐGϐGϐGϐGF5H]LY @ @XVF@EJ˷K[^;^Rkfڊ A+)sǻv0BeTasbYk*~|qe 2}gڹMq^Ło,o+뭣m=о8Ѯ)-gEo,sש> Wpw>7¬Hfweڙ8SwMr<ǜuϋ/ @[[N7_u_sϹ/OZL[ŲkxG[ɼo ! ! ! ! ! ! !֍:ckN[< @ @!+)UVoYioMS>/r={9?<<}_d;Er;fU|ZYd}?K}W^oͬxS7͵ͼV(77BƊ+h\N^uY Ŵ,;sBZ}.e./q7fV6yV5wJ9R~MMk9o>On{euƴ65DψG5}:=s:1}ֱ_ccX诂[z;e`ol-wε}kEb):mT4lw,~cjlڛ4kXs꿸o `wBc&΋_o\b))&k[cSGL!amMi>׽nXN!rVhܘ뫵gEmU{rkb-ec<7a;Wsl{7\W; s?,sX۝Wi'皏규;=7s[Nν0U[L;ײcZ^ȿ7wOL<{>m[_vN}^ȠPȼ9km|N,c:cB\/+qB}O si )079)Ϋ- Mk1{g?sxTŔ[}6ζ]{MNjl3ss\.xx_LM-Ff9?-;Voo ˹qs9]۶w+'6s'3Y~tG?o2oLG^3]ʳyh:޵v9wkg9xP6kgƚ>ss9sZ;6~sU;}s\YYۿ۷͵=igm;y8jVT~OSzBZۯB +X}VWUoShU{lu*^][1,[gNjqTVQ6V\XksY AVvjY1VD7UPZOs}MT (nm8}c{[ /g썱[Ϲfޒ՘rkۼVhui߷m9v+P1=쳢R3 {rnҼ/94Sclk\t}4O}{g@_q% {Vܶ?VY?=sBvkrYN3n}>1Z{ⶹ*ڽP!kAc{[͸rh:mˑ>Wxbi;k4_=+n[i95y4v'ß/8}|i\ í7Ry/W(zK?W~W{OcZL[2~kxG[ɼo ! ! ! ! ! ! !֍:ckN[< @ @!pz؍DV5EZUT~ +@y]3V`U1TNQWEYSٱb);\Yf];㫈Bߊ*2ml>kc>_tV|Vqunٵ駸Pb_Vc æhq5`~qT8E5}W;5=gEg{ZMs)2lfm.ƒa>lf3ܮɓɑSmgy5s4WkҜ4Y{-MS\,,c8&'&׶Ų\ֹ/&iiϾ{e~a</e4l6vgǺo{f/ڜ.ɳɵ6\|tm_N̹g|~\[{,-_[jlyw-?=/<ɟ>s ĭS[ik0Xom޶h1u3u3u3u3u3u3u3ѺQglik0Gp @nW?c6һ B) t YwԊګ嶓«bi *8StUS0He::OQ]c8hTk)>2j_|v~1xgScGj=k5]\3S-m;õNg}5ɓXg3q.9϶o[_l('Esv毘ڟ!ȿXX}7mY81NN=UIsv1m~{o}\R[r]Zo!@^ٟ#'}_3<}qV-~o.TUӼyr>e[k{oqNJzqo93G"-:'yLaq毵|{b;gY3ork+ږWϗe^CC픓m|Gݻٷ_ y94ŚmůQk׼ϠY:3 @=~n|w|w}{Ox[pamkl1y[1ZX?C1ZX?C1ZX?C1ZX?C1ZX?C1ZX?C1ZX?Cu֜#yt1g @ @O*?~+b }*xb)j{˲𨶦B**gA;ϵӹ-UUQ[ۮY'?EKSV֮ڮ)*wU9WgmLf}â㉥wqO xpK5[QeY輊[.]^3vb췝9y.gc(8gk;k}:ǖ݄/7ʉɋDZ~6G: k|Dz>o-';vj~Λme}U1v6ŴveƾOkkrs}~螹זrYs|,gMf٩K @WBWWw;o)Ŵx],÷6o[gyŘƈQ᲋}X>UZ_[}VC]*<'> dEHU[!رoy[1]IU W<__xnZQ[~ۧzjO>+;W`W[ܷW|*-*RXPm*,'|r}hWm[y}!Uv]Kݣ>ZحmjyWc楂 jb*{YGp_\y^{ӛ޴ڭ@·Ϋ݊ȴ>`fc]}M6z}*(l|9{>1-alwl\yKyRSp{~y}_bjgs׶m3Lˋbu?5G>cU4OE}#׷m|+gf'xb} E_E/h&_V\Kʳڝ1w?=Y9}~bjəbx~RMkNk۸?Ks|tO/ R맟 }|d^2zOus09??KtW=.&2m;޶8*⨯Ws?Cʑּ9rέ0s~R=Z3oC,nۜ-vj8j?˶i}7w44'QV-O3ڮ66WwS|(/&7۵+Yj 5ټ{fX}o~~ʹ^Ōn 'boG6O\7ߍ*O󋥼,@y~P~~cC9_ywM/,n,5r1+SsIFV rϔeyrk`)^nw+'Gw]ryE#@ @W[4(Pa]sM+tB bk>jȨSWRWTqVqu/M%ZkEa]VPX_u}WLVASt|9R[Vk o "g[[qU16lk=f[?عS/)^\w[nL~U]5<~gZϨgJo;svߜͽ^/m93qyN92默/KS}Z_˼ra ٴ6ß/ߍܪy^fϻu;ضcXN5OWs11? 
@뿾q·~q;[_]Oo-bYyZ<[̣-d7F~y;k3rk䴵nm޶<:?ۭG)o6o[G?mkl1y;ifF- @.zkPyA5_ b zChEfU TaTo֝ۗ"*lXE}lS+p,OaWX1WT XWEL)9cBޢfRcOg꯸[+~NW3qo0vwXH5_S ]WAVmw^ ڱk>gNomt}MVX뫢>7}bhL9xky}NN}>ڮ7?}6^W18++^Mo{99ŶÎR3']2'f>r\G=c]?V7KmGcێ˱<6*C˯WϤ6`kK=Y{]sPG}5sZ!j[::95Vm[~9<_ yq)p-e t]-_5ɫ;oeƼ̫r_uNcƵڪŽh?:Vq,N^Q?;X*~7]7]6wOcoŰ @w<++w=߳{|kxG[ɼ}btL1Fg=}btL1Fg=}btL1Fg=}btL1Fg=}btL1Fg=}btL1Fg=}btL1Fg=}btLc[sZYU 6KWMQAnq /FW:+7*2k §eWVU[NUV粯n-bsL[VTUw ~[8s[][ Om4{g'b[klƻW_cjsw_}t$/f>>geW_˺¾Rˍ>˱4rSH[\S<3קkY0'Z3U/bU-X>̳9`bGydocc]7 Y|,|Jv]qLd9S1D9RQ}ܓbmLض˥̮8{yv/ˋն6nXj.ɡǿrn~d89U[8G}R-ϻ•򫘚׌گۯryMokcom7'=繇613K98[,^vk,*XgJ9Xj6cߎ7c?GHնl(Sc|[ߺOs\ɖagNwNt˾l~~x꫾j;w>o-bZL[ѱLmkl1y:>۟eb*+^ZS9m̶kۯ zuΌmK ۦ*9l~껵zٱlc Q3ml}gnͺ_vea^Ǧ}/,bYs>/L&'ܺ6YrXXU Y }>fl>{nk]F}MOmmuU1/jjuNxOαSC_v>w?WNPQpȲйǖ<wmy>u2jio,U^ύsݲyk=W@15oʶ%Y꫟q,g6l[ ՜-;6?Ӧ嶱u/8e~}ǖOc~)AUX}X5eܵs2cgcLL| m&}Cw}w雾tbmk}:z?Z<ft3͜7ycĨ;[O 6 8etfa4N7FiH2:m30[Fm曫0Gы΋G; @ pȨ{o|!wo$:)p֊٦8BeU`UXTXEx_!UoP!ZM_qN!WEW \ua>WYl:s*:b°8\Ue?f}[f5猽Yc6,7jc>O3go,u^17_[_{l;}6޼}9eGcY~ww3m[e^ϵMnm?1m}.fjm[ߍkXlSsێ-su<~ww?dǨsk㰏/j~i-6bRW[][ mǡm1MN\͋S߳61Θ&OCqɁOgf-'yKx]t^ϻ{m. \?;smON|.\1L~dױoBܝ@ɏ}|ϙ?.>?{'8۵}jZ<Źft3͜7ycĨR8atO2bg[YxZL[k]Dɣ{5ܾWs{em!@ @*a/1Ɗeb\颡T@4+j5NA\u]EJNQW3)jw3έ+Z=q/ONX-Ybwbqxcز+bڞuiݱ⫯)>c>Nİ6js9w=S}}-S}.1mc\,嘖91.?},rfbUѶqWF׵uq7K3<'7=q-;c^l=u?L_☵>vڙqkМ͘k~aɛ];z81~>b8f۹~a?&{ߙjb]ʍ֮[Yfg c(}},j=X1l[ @~~sϝ߽nFۿ}w>ЙSp[ik042o7s<#o-~g{et΋1b?C̓Zy}NW1ƛEn6{qi܍mE]uNkc4S$|l[^-;rնqLl?)ޥcÓ%4 "_g}7E􏂼=9znkbZ<^L42o7s<#^_-53:.ˣwY}F?o-F/fe[ @ @8Wp=7_t} Lq|MaAa]2EY˂Ms*g]eʵ21󸕘*ɕlǡqwNcy3[o6ym~qg;2:9ݼpj3[_#˙?gP| @EȏO ɻ_7ͻ'|rL}>{voo}x lk1m-Fws,L#v33r}1:+'S9{YcS9{YcS9{YcS9{YcS9{YcS9{Yc֨6glo/z{O @ @|5od@SSvY51i{9>'^u [S,LX*P6본,/ L:g+c`[sqQߎx \Z^wZܾ>;{܇1\gUIjr {gg?C?tiwkǖ})ŴxJ[;yͭ`dnM׭0o1Eskы~g'w['w['w['w['w['wu쿵xJ[;m-[;njn}i @'/OU]b>w)r}ǼǮe_m_Ngr[ @{'?3~_}e>|g~?w;M?kkw[`a׌ny>o|;n[ctJqF-N1:%s8}mS{N>ũ=FdngtE~kZ<d!@ @&+_^ Dm2 @ @ @ @ @ @ @u . @ @ @ @ @ @ @PX  @ @ @ @ @ @ @kPX|m&@ @ @ @ @ @ @ ۙ  @ @ @ @ @ @ @6Fa @ @ @ @ @ @ @PX  @ @ @ @ @ @ @kPX|m&@ @ @ @ @ @ @ ۙ  @ @ @ @ @ @ @6Fa @ @ @ @ @ @ @PX  @ @ @ @ @ @ @kPX|m&@ @ @ @ @ @ @ ۙ  @ @ @ @ @ @ @6Fa @ @ @ @ @ @ @PX  @ @ @ @ @ @ @kPX|m&@ @ @ @ @ @ @ ۙ  @ @ @ @ @ @ @6Fa @ @ @ @ @ @ @PX  @ @ @ @ @ @ @kPX|m&@ @ @ @ @ @ @ ۙ  @ @ @ @ @ @ @6Fa @ @ @ @ @ @ @PX  @ @ @ @ @ @ @kPX|m&@ @ @ @ @ @ @ ۙ  @ @ @ @ @ @ @6Fa @ @ @ @ @ @ @PX  @ @ @ @ @ @ @kPX|m&@ @ @ @ @ @ @ ۙ  @ @ @ @ @ @ @6Fa @ @ @ @ @ @ @PX  @ @ @ @ @ @ @kPX|m&@ @ @ @ @ @ @ ۙ  @ @ @ @ @ @ @6Fa @ @ @ @ @ @ @PX  @ @ @ @ @ @ @kPX|m&@ @ @ @ @ @ @ ۙ  @ @ @ @ @ @ @6Fa @ @ @ @ @ @ @PX  @ @ @ @ @ @ @kPX|m&@ @ @ @ @ @ @ ۙ  @ @ @ @ @ @ @6Fa @ @ @ @ @ @ @PX  @ @ @ @ @ @ @kPX|m&@ @ @ @ @ @ @ ۙ  @ @ @ @ @ @ @6Fa @ @ @ @ @ @ @PX  @ @ @ @ @ @ @kPX|m&@ @ @ @ @ @ @ ۙ  @ @ @ @ @ @ @6\[g>O3c1 @ @ @ @ @\|ʧ\u#@ @(,{  @ @ @ @ @ @ **~ի^S\| @ @_;ۯ\|R @ @ @ @ @\_ݧ}ڧZ @ u[W8Z O  @ @ @ @ @ؔ@o)nwś @ pD@a rZ @ @ @ @ @ @඀g} @PX9zE#A^W4 @ @ @ @ @ @`k˿sC @CŇ">!׼fvKZw8@ @ @ @ @ @}.  
@7a^_jo+~@ @ @ @ @ @l[b  @n0Kp^I= @ @ @ @ @ @ @YK @ @ @ @ @ @ @MPX|fI @ @ @ @ @ @ @.).'@ @ @ @ @ @ @ p߄Y# @ @ @ @ @ @ @K (,$  @ @ @ @ @ @ @7aH @ @ @ @ @ @ @ / r @ @ @ @ @ @ @7A@aM%1 @ @ @ @ @ @ @K @ @ @ @ @ @ @MPX|fI @ @ @ @ @ @ @.).'@ @ @ @ @ @ @ p߄Y# @ @ @ @ @ @ @K (,$  @ @ @ @ @ @ @7aH @ @ @ @ @ @ @ / r @ @ @ @ @ @ @7A@aM%1 @ @ @ @ @ @ @K @ @ @ @ @ @ @MPX|fI @ @ @ @ @ @ @.).'@ @ @ @ @ @ @ p߄Y# @ @ @ @ @ @ @K (,$  @ @ @ @ @ @ @7aH @ @ @ @ @ @ @ / r @ @ @ @ @ @ @7A@aM%1 @ @ @ @ @ @ @K @ @ @ @ @ @ @MPX|fI @ @ @ @ @ @ @.).'@ @ @ @ @ @ @ p߄Y# @ @ @ @ @ @ @K (,$  @ @ @ @ @ @ @7aH @ @ @ @ @ @ @ / r @ @ @ @ @ @ @7A@aM%1 @ @ @ @ @ @ @K @ @ @ @ @ @ @MPX|fI @ @ @ @ @ @ @.).'@ @ @ @ @ @ @ p߄Y# @ @ @ @ @ @ @K (,$  @ @ @ @ @ @ @7aH @ @ @ @ @ @ @ / r @ @ @ @ @ @ @7A@aM%1 @ @ @ @ @ @ @K @ @ @ @ @ @ @MPX|fI @ @ @ @ @ @ @.).'@ @ @ @ @ @ @ p߄Y# @ @ @ @ @ @ @K (,$  @ @ @ @ @ @ @7aH @ @ @ @ @ @ @ / r @ @ @ @ @ @ @7A@aM%1 @ @ @ @ @ @ @K @ @ @ @ @ @ @MPX|fI @ @ @ @ @ @ @.)K^rw?vm?Oam_W?g׼5OOXa7WzYYTnvyWu<7渹OT @@n˿]Οg\nR?kg>ZZ~]s\??43h>>mgͲmwm֟1veyf۶t]3qM{ @ @ @ @Y@a5o_{oŻ??=ӻ^~-wg~KmK{;ޱ-7[9?ON\k_~K??4=~MO;8>>w K⾈WɒF s,9(RWq)[/Wr~L9$g(W@MUWW=˷OO>uԩQ\z{\}>JshoyQ(~Q%/#f[=1ÇO9\~~j_4~>;JJJJJJJJJJJJJJJJJ'зضgOA s/yR??W>ν[/^u믿>^2~ uZ/5Aۮ=AR+H/TT*io0.]uvyRw^P{{k} @ @ @ u^0R??==uMˋo2\LKx^du셺7~ O_uǏċƾE?ȑ#}x=Ηl?/f󟏹t}E~ߌS/euy?'d˳cǎT7JJJJJJJJJJJJJJJJJ6#з6^OA Aszł|^:xX/YT@:@vPO5h97y\``cIJy|nq?%P%P%P%P/ƞ޽{c_nϒnbe^Sz {y}ay{M?{|ͱz4+}&_\?^5eA;%esgfGx{9?m?נUיkLע?oW@ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ lN/oΪ=7 09wxT'NNp*ڥJJJJJJJ`d`)/ ۷ej^5O㼈/AzQKvty'?ɨ/߶(1<3*8A]voݪH{^B<ٗ= ob_,1f:KJJJJJJJJJJJJJJJJ&_q}u þttbs:4 cl<^#'N"L?"yj^*~xu^;vؓzz|/I|:e L2شms΍=W+|7[_r1y~C3__(((((((((((((((((i}xVrb8v TƲ 8.(((((((EN/+IlYo,+צ=yΐVyl{N"G}=>8zl_I?2.:^DIl7D=/6Jk^L`+,A:|#6(:ZGyvݜqsI? ߳W*Kt^ŧw:&>fǢy3me.:搿1ֳք9Qgv9M̙2\#>_ُ>_{wt*ɻw޸.t!~E.=tٟ~l%2v>ғ>c@ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ 27q$ fEE/ o;|_b7}G[s #C& Kzz$K;vlQ h2RP;w`>}̙@Lzܧ<щ Iy~wm|_|1bMҧѣGGSN+J|(/8W>G'Ϝ93bN'?v:1{'zqӬw_uڵ|^eEvͼ[t}xY7>YЯgϞ}2@Xc-OV5eXf~駣t}hK0RӧGl՛JJJJJ`?p'{>~A)igs/C:^bQ9q؍7 ~7ߌ cۧ7GVۏ.{yߎ V1y  Bɓ'GUJqy>Oo~;|Z;/^|u'N}NՕ+W:bf?;bc̋AVϾV>|xde?15OyYV]sٰy~FCÇs|6kzҥa[d?ժ==O͙[nz x&kg'wı.{z .Z}GYXeut˗/x{k.#ɲ&G:cO~ [cΝuF0UsgYJ=:6}GgS @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ &зw nؖ,U&bH%q1 G)v' 'pUX]V?IM٢ʖtf̲2 +hR% B'x>/bM Y9?A2/O٣wA;G{|0ӯWP(QkI]#GY'=ae:K;:?Y#ӬCf rMn޼lkM*]939ӂŽ-lrĊ?dZ{֪CdjLe~cj끟d9lTZCF涹JJJJ${a{klɳPwonjX^e׻36{=*ԧny{rR;9IQ.z}acAG|Ɨ.%[uqEJs=]e6D[{+u`hÒ/bJ{%V6%ȳ3Oyc\t>Y8鵶mǓ-2^kA e#;P‰UkgMȥ7ve)fVio޼9xG2?:5ք9OOO֦Ċ^ĵ}1v-緍 6kң//ӗ HC֔O2N:gK=%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P@_,ޟBJ]" T' &xW 9Y;}'.AvMR\X,NfLO2ug#CV6;'Kd 7}2Kplqru/_'h vEPgg韶M %2ɓiڣmR+fNi lqÈ}' 6SϜe07\`I߬sIYVב<&cMRzSZJKN_s}Ϛd.Y7^0&9T%P%P%P%P{UyS)[wW~^i_r=z糿}qλwlb}y{eKFn|9i8O]=0+`OA; 㧶|Dwt_Pc+|3_1f=3{o eVG,,5 ay;cK|ZО>>GNxi[ў9阝Jm-xyZ6 b"#91>G'{cs)U'S6o=-.2WlPosP'?2JFSv/ԕO>gl*((((((((((((((((b]P* i^.;'y_ɋyaq/16rȑ# Ǝ #^|I%d+'|9:%A}c}d*1&_nו'-K$gFqBO_##2cF%ZoPs>?ֲ嚠m뫟Rݾc3&h((((x GwO܏Kr}>#~={ӓ}{va޾yMu<'2U|ԞMc˒>α-O7.l$ zwug?I?ޅLiRtDp'Ė/`1s/€LǞR-YtvY"<٣VXMg.=ͽ}-9@Fi`h7NqktZG/t3ҿ\k3vaBRe>ewuAdM"o =ƅaFz W&YCumѝoA]҇03̷u}w>uL]!upٟqXT_6(7@ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ o}x{ 7o 9iV oc䔂?ϟ?uFG=ǏsBA ,SzO<9$(P D׮]u++[W5l䛀@AwAyt]|y/z"OuAyԕ3;lQ~GϘHof>:d[ |wFi\!%g8Jov%W朏]) .uznݺuƍ' cuPn7͑N}C5=N6쇔`Q%(n$en:=CO8qbٳnN|%Y[Y㧵6ֿ|sz:2Xvc= 7yu6׃c=P;Xw|u|cn1J͓5bMe#m[ {7POvpo]e)*-&62Ay.u{a+2]ly/i`'su{6uՕtWHt=Т;qL/]VOUy z %{;$lf:Km|;^ڰ?4Н~弾)Z.[#b< l2k I.ʯ߉#SE^Os؜~,e;6F)[O2ӓXOorZđ=&ut*r]wIm|7>)rΑşdc#[YRw,|s`D2sgƲ)]+<#c9 (((((((((((((((((}M/}%-) 'pNPRvg1ߌ_ d'%WAH3AdEo(~MHi#G`J&euH][(,GsG5/ebMH}McɧSȑ<]tF/tM΢Nv:G_yI^N|7n։a:1kY=k)ɮ[2%2Yy ueusGfS~lvkt7OuLѿJJJJ:~MR;C;g 
Krz'wo9)6D/%4>{ md;e^#8)}}@ 7m`/ lȾTt9C6?gp,џ= `EO읆]C_M)K6 K7~)D1.Ŏؒy5L֢97d$i&ٱh~䓍7;J ُg<[%>doΧCF7oe֘$8臁W9}CqѕNYEdxI,s=:`GxkϘU~֐d K҇\zOA֑<YCh} gݱEE^M%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%PXj{  EB9m c7!xn2~ΌߒG)XYp #4>`g^3݃Ofٱfݓ^ܽ=Rdd}wO }ዯ:Vw/$|rerQfᘍx#>Й:Yw(0 3r!#8v SOѣGGߴߺukb-|ل%Xݜ$?m%s8K2Jײ%8}lq>kZX)g:2.%;Hև<1l9^<_4&ֈ=2WO[mxG>nGRʳuc59W"jwQxz%c3޸UN tY:z6Yd9205[ɊJKƑY~G_~(s "SW=W|Ƿ%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P@_,޿sk N%so޼u&Hܹs#XM@[#IE?~ |)@… Ct K3ܹ3~;[=Dih En{^J'ۣ\6[=~ {Goŋ!?z@QWlb NR he=|{ʕ+O \r?=ejv2{e{=}{q+$_|y9۞T^ q񂭺uD/y gL] dN81!|}LƄKm<|UsY=~ӣ.N}YæV?==#$g' ?ξd[+9eK?ßON:5y9իc%UeS|W 7zc6kWxfc G_Փ03u̩g@\%7fXHƒ6cc6}-ӧG|O/|ڵz6/yص>y2=I9c=P 6[J"kLu2g2a cAneuqS#\u؃AR'oy:T%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%? 9U ,`B Kg fs^? J_蒢1 K SAPUY[duiGR*''1]([ UuWadJ}pR%r)C9:H:Sȡ[Rfsl4Fm"|%R`gu N6OdmlWI5}'s"xe\&͡";ר9p}`5:ZI?#y>52  u+F|9sF :[̵A֟c~NJy#]X\0%P%P%P%P%um>Yw,wv4X_{1>Uݞ9/RKe)5Hsr_t.w%z:!{&kCF_n"1dWl26ِS+یa_4|3ewե,:nQ?'K3Y]bOcR>doɞ8K2d 8a.`5}㟹d%ٜʑM̆LIbc3=SW(i~!) )]C>a[җCN>ȱFQs)4$\6ѸuY϶,sY]o)G\6*ٝ5H5 pҺgL J.2JJJJJJJJJJJJJJJJJ/Xy&L L(m夔'.%8Nt*C@%xqQ^+^ڽܗ;^Ltg={o9cܣˑ\Llvf_={:'=dgny 5} 'uq;[crCv&Kր>YwevgR.e.YCa8>"v8L}^c::!:3Y+68N͜v,}~OО})O^H+:Iγ9e }k =@B y fŋt}UpQO|V,[)(tlE6s:}Q8|Y9zcz 2u_3v /ʘN_):Wydϲx V8(Yֈu9\}^'dYz zq,vZ2[)$c_=Ǯ _P#sɔ 3kF\M%P%P%P%P% vh }2 :&9仇w{i2Mcs^LM=3t!:! %066_%zdjSg:L]/[ٶib_wm*YaHlΜ#F${p{|{f7n',6Y֥W.f̙uJ:&3أ-6cs>:f3|ؒsc=9RT9GO< g;'63L&]<[clr^TW>M%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P@_,ޟk^ 0ЙHyT `}Ҫ=Ad5F,uAΓſ{:Al `K #wb4>:s)(# 8;3'W^e|w.y1xSlɒ,;63nں{8գG39yIֆV:$J_Nr7 +߿_,kduQsu'$Ƙ#GHa=Hb+셲a_P.СC[׸YǍ7F-{yo#eOMeW3=Тߋ|ۇ 껑g&у_lPg>x\i ~M,|]/~rXfX;k5c36+om3Ԭ?lͺ̔t9o括%zE2YK;>w<յgMٳ٪n ̿Ɵ8qb؞9̱q'~o C9:yP󕹦CXߗ%anM`&9N>?9M%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P@_,s bl&KϜ& P08A*~. xI6A^R%_0?%%ё7  KR􏃅?/pO#;7)Џ2}'z2 7AJv,uzb:Nإo ]6'9/ѷ1O-ͩyƖ~ң*MK.pʖslNo:cv6uZPqգcI_َѵP>-92oͷ+W[[#: 5N6}I֊9'ZNcs,c y3xkl?Aʬ u%c|K.ɐOٕqc?PVMwS @ @ @ @ usݓcO^={9eɽv}';^6uL~¾Nv9{ uX~J{ҮNV^Lν{x6~#cG?osJ{p{,1^VOʱsׯ_}hy!1fWuY/^&>?Tq @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ 3}xMn#O`H <)y>"n3& m (@1ӕi25yjMJc}t TɆ2 Sh)|쐌!UA=AM0EO} 紛Md:_eΩ/KC.N\m9\:x?$!u-~ܪrufNFcuan"K yΑ|R֒[ X&|c!OV7N)KoD]%Y|v~vo'KJJJJe&=SK݃7Jvo,ee+*_^z+{xsߝ'CIv)<dWz$21=~2=$՝~~^Šsٕcd.îM|~xm{SK̕^(U!6E5,[C3'eyqnf/MIr6vm"o>off-JӜquo9u95d^#+9Gr|s%c˰ 7% ǘe a'lҕLn9eKS @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ $[z j) }8.@P`vlȐ!O $:}Σ,jvˆN~8&gܪ2W\/T//I8_Q7;͙q.0ќ&XQv}t%A?$Y5n_a*%T_G0fɼ\tS%2$s<%7I'`-, d5o}$Ui]$;Gr1z7O7]ѩ,C ֔zS @ @ @ @ uk~;Wܳo\߷'7vrs~>δߪ2>٧Y%[ٻjz|S:/ѓqy[1iST>K?wYKilO6/>=]ioҟ}sx:6/ܘ?ͣ2~SG?25] ? 
p|msYSJΏ)mKY:nVaT%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%P%jů<޾}Nv &8MmOpJGf術.Yvp h$g锣_DW6Dwl?zt'`sXc{ҰM>)#\p{מ 6O"K2օNKŷzkӣGOt;wnY Nvc =t_zwF)rɆ"k$s'Oud4o ǧNs:Nj߇2'Y@Eqo1k@ssmn͗>GO&yYucCo;dL2*Ma;y=_%P%P%P%P/3^"OO<9>\r/nfұqO继v>޽{cO\QsOI_/W:]ҏO?*/~ot:v>._ΙL0sݼysdu6=xx`hNidmZ_Mm/\N(]3SeI5dm7uk8חgKJ:`>}sv$ ~ծYW5}6^d7Ai%K|6'1ۥ˔J ou9JJJJJJJJJJJJJJJJJlylr;jl?~<i&@MÇO$ r @.PN9z֩'Nb|r&O-JЋL$u Pdc 0%`e^{ks$Ԛ|gNc _=Ay8s04' 4G}N%CT1T?r&s5Wy nN&CYɎ_F&;oގ?>l39tg/C摼omA]}MJWݵ`6l"}JJJJJe$ֽnou'/̮}{S|}kמ3Ϟ?/fc cd'_gOelOB.dϗ|uΝu.'7/+el"|Jטsgy>ʌgL׳Yc_=sO?tO>ϔ̇ ̗>_=51Op#tL.݂MٴL'}tir*Ygm-f2K~꘳U[W_F?[WusJԱRFA2=Ik9:9^,HuLyGCI7=-ƚx_#&顃((((({׻=}^F֖%Ƚ]Ld)Fb/{gO:YyyL68O,)sa;yaYd42_v|{=/dgG/|ViSfOyJ$/c7냽22e=>dٮ$Ogίab֔\V$ 2ɉo^)JJJJJJJJJJJJJJJJJ`wnw3|s|eƗf |wF)SܦV$MAu:nܸuQN_ z7F@>t{ARv^$/t 8$C__M-wAӧGI`J>&H lX* +_}(qѭcǎ:Nl7`٭'N ;3&} CN@:]tYj%l+[n~ou~;Ȳu\&ox~Z_qf}a"r߰0Oڲ~y7}ԓ׈<ׇ9uAȣ_ ls]?~xǏtX/9rdC '[kur̥kEW…Q:.Y{օz3>kk*((((w?p=~u{岯={|Mų tk-9f=3{vګ䙀=:;-ݳGH|8cؤ9.cB& W~Jv+C{=vf=CbsFVNjw]iOKvɻó5_l3l7֡culM7)cɑIݚS~Y{X|,u=RfvvצdK乓6}$r񕝳>s2rWوA ׏Κ;/+r<3{Ԇ(((((((((((((((((%w羓&(QТ@A~Q*O>A^PJt^x̷qSЬ@}*%Vi,?B3;~6Bt.Ilxx̙s?K9?GwW.:6J޽{c޴>N} +%]VBK#u~X[wuܴWY#+E rS}/N,VOcj2ցk#~X2\xqWr~4dMɺRg%ڔLi]l2sovrPl]߬}RN1lt }~ooz֖qdȳ((((c{۹??{Z۾"/ #?y Ev^~?S7L{^XiQ{|c?F2<%/K7aLX^6Iem7muٯo"iijV$ۇJX{[S#u #0aߔy/"9͡-*oE9S/% wOz|a-5\K^\@1k0k){n9#WqsϽu5\Gf/0o kKli*((((((((((((((((؟bJĘ ɂ$ ԕKd x.IA - ӇluAnLv7^(Wqz{O' _Ayl>gb- hKbcS=J*ɤO3W%EC ubA6ed^{^ NM0.7gb8`Q׫u"i[5gO&NE2Er,O[\g)q:X+gCKdؒJv" .Ӷ((((xU koAC7~Gs/akrÖٲNӞccۋgogEl )+_̚ʼCLJ}VYv2V޴/z͉<|VZc7oN!Wu-2ޱpXw,:rmIX37ubC(((((((((((((((((M/{! bR09'(0q)g fiXXe- h&^ ;8s.6臍'C[Йѯ> Lc!_=;?4>~ B?t?lgL^l2LW$25N?RFrѹOS[sNgU~^=J8&9}r=H߹I&mLHg\6їqƲ<}飯51źqֽ,M-Ki!~Yiz @ @ @ @ {wuo"cҼ%ׇE|}KG&ERCU~}b_*1ytrG-_eN\jg#{>9|O9h=_9o6oBiFm-6/Ne7"};=2~9Va$> v31. ἎqeMͶD_(((((((((((((((((G/9Q 69Z%) 4+H8%Λk5[_e7dG]fa}kF9Ip}u7c"+I92}ad\_ U^A|bR_S?CyMWP77=ˆG ?gy$HIW8e~rp[2cO9˙'ml|2fQZ8G{JWٻ(yL̎M8nm-L[o ++sƆئNwߍql>\cJ2*am|Q֖D!6u4%㬁|\]ַsd%lo}~u{-JG?>oCz#ǸطCdwaR={%OeCv3kXum"ݻO Xkg ;Yy쳔GN&a+JH߿?^u.fY?FO|ͳ\d-٧{/\0^ZW^Ys:\fww~M_ukٵO[u.Jh y8ևȴnk)׸Ok 0ΛVgݵ׺gn{g.fƦ(((((((((((((((((M`Ebx&hQ,U@6o೔ ,Lp!Ye9'hN6$1/&@OМ/ ˃IR'K)oJ|Cd|2ԵM>~8'p,ǒ6lѫPc87'6a,?,IY2,96`qaNdJv>_Y?)ܲA]3%Ց=DWeFgcwK2߰_~rM3_Gis}ѯ۔3L)%vYџ?]#!H|XIڲ9/3|5޽{N<^2"(((((K{u9{\쵽ha)gV~5daE2{qf|+i8Oeumi63+;UOY_Qξ{93^V2_t=kgf͛gO|9?>3 ؝yfcg 9tIl>:.> r֬qYo}X~QDC<p2e.c2%߰냕vI[⊓qyg I?1.2seA̟1rS @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ '? >Ș 3n%W =y9| ;Cv' x%AxM]?>M2rl~C'xRL~R|St͹[&0|ɠW.lt,C~ dC捍8l~2Uu  d #Est[ad-K:,IXZ_b;Nb2[n춎7a' ճ&UC:`اnؘuby8^06?en':×ة}9~`ܙkE.٘?#;vU66Ms%d#/}fu2nاns>|Q?>['9oyd8gTJ_D܄knq˵J1z8z&cO֑k$ps'|(((((((((((((((((A/yѬ& "(M@ Wh#2R@ 8t;lՒ)&ã`7MdC*&Z-((((W{e?g=ͳϽ|흌%K{| :ܗܞ^;_~Xs'g@y ڵI٧dV 쯵U~dWx矤A[n ةOJȸ(ÇJd ].?+'oIOc$۾ 3u,Qj'ȑ#c̏Ǽ8M|ao|$n$kŵy ,wz擬.cnmf筕Wb6=ˈ;\ֹcks}uڭkXLgx'K{{u)((((((((((((((((bw@`Z=|pի'7}=:DFǙ3gF0Zv׮]%)t`[/o޼9+W hya>m ӧO6:^|y{cxҥ> O:)gOāԝd/r(:7͓;12'm`ƿ˿z뭷k,6Gf֓Y_l}…ձ 0]S%.o/{7yu2焍Y^v,S0)wB/?˿e:r='ׇxun(uX|ZP]"KdnjcP$yO:5ֈu=2ft[7uiuB6_Y7l!3f8ɓ'G]`E;{\%P%P%P%P3۫8"{!XW#O^L߼?? 
y\V>~W{~!찏?b.9lꫯnltgDbٳeA|mXׯzx)8|Jn9q?Xbv֯Q/OXz ϟƒ^*ҭ[nșac0[0| y;ށ0>^+9JE*Q`1 HDTVN[mŲ{6ӟol?_ֹSww>){kն;8{ݳwM֏]qmW?OgwfUjOmYv߶owwdؾt>wK< @ @ @ @>}fb}5Io-kk֚n"ZZ[*:ŧMJlro {ItMRMFlf4 x[u]D-d-]]Iv_{DܱɎMXlB]M,w,_f$m{۫Is&}Wg׶rtNפyu}孜_&)Oϝut7{mUmno+m_Vf\z/P֮^?UgUn5Quu{c}}u~77{}s~i/a{u㌺?:eꫣ=Vfűs}VX{agӹ^WuYoCeTOZV;?2֎ҵɶ;g3_ו @ =X` vnc5\y 0g5:6^=/+Q]׹iԹnQXxuV"682j_0늡Z =KWwm<1Mq̫cJK鎵;Q֮Ͽ|jG{q}_M[1xWKoL[2o#uتg}gzqӱx^jG<*th뽻lv[Xt^{'kp]>}w9u}{v-ߧ}e[_kKQUfmjڳIлc[ޯ +Ÿk^o&5M__3_߯WgU&ήyvmhy}[mg}C)s޿կmZ ^xϗ~/WGmܪVߒʩoB缲_9>oۇv^dJ/Cg*[}u{{} @%|=#ڳyΕnqgjg{oآ_}myvLcqbe79v涎WVFΗ:lU=#&Os-^w1ՏW8}rc>Xg{]W~Ssut?Ֆ\rj__lb}>:{qu{e+Q9uǏyw.,w?an(GKw]+kwϖŽ9loWv]Ԇ9kkwl[;66鹾.]]^8x@Une~8nsu.6v[[7)ֱ֎J~qGcTKo\^{kb.i,?+ھ'j/]c):vu컏Ҷ^ҹkvmŹvcs(b޽ھ28>e\J*9w=Mﷸy5~܂ԹvU3:Y9ݻ1Tfj}vS}^őbsdzs-mű}n޾zT WGVO3D^W^mVٳ(8}zrVܤWzXݵ{;_}:cug}O|Bpw\8Ĺ&ӵ''ԽMk޹& 68Iry_x\XjW[k~Qm:vxVg>r[n>cL;w =hmѽڱ{M}[ @;ul^g-,~scB}Xqq]q,-n|Xe>Vk+XziKϲq[Y->.~6Eyl<^vԇ_E ʼ@-,57}~vu5 @ @ @ @C?F3Mĺ;~muWת[ԙs{}zh{eV϶QɎMWpO\# @Kq2;v1;tS~UzzvMǥsλ;fr|}l1cVؽStVT @ @ @ @|k u +rM ]`IəkV"ȶ޹űj<;-<گ<l_ϟuG|>Nh  @ @ @ @-R}q /B"oQX^m|{ |{] @>ӼƘ-oQ[Y>w!9:kWo2%Q>R- @ @ @ @ }C9 @ @ @ @ @ @ @W,,*% @ @ @ @ @ @ @ XX|: @ @ @ @ @ @ @_ @ @ @ @ @ @ @,,q  @ @ @ @ @ @ @U JG @ @ @ @ @ @ @߸N @ @ @ @ @ @ @઀W#@ @ @ @ @ @ @ pc oyB'@ @ @ @ @ @ @ pUR @ @ @ @ @ @ @7< @ @ @ @ @ @ @*`aU) @ @ @ @ @ @ @Xw  @ @ @ @ @ @ @\| @ @ @ @ @ @ @n,`a;O @ @ @ @ @ @ @ XX|UJ> @ @ @ @ @ @ @7Ɲ't @ @ @ @ @ @ @W,,*% @ @ @ @ @ @ @ XX|: @ @ @ @ @ @ @_ @ @ @ @ @ @ @,,q  @ @ @ @ @ @ @U JG @ @ @ @ @ @ @߸N @ @ @ @ @ @ @઀W#@ @ @ @ @ @ @ pc oyB'@ @ @ @ @ @ @ pUR @ @ @ @ @ @ @7< @ @ @ @ @ @ @*`aU) @ @ @ @ @ @ @Xw  @ @ @ @ @ @ @\| @ @ @ @ @ @ @n,`a;O @ @ @ @ @ @ @ XX|UJ> @ @ @ @ @ @ @7Ɲ't @ @ @ @ @ @ @W,,*% @ @ @ @ @ @ @ XX|: @ @ @ @ @ @ @_ @ @ @ @ @ @ @,,q  @ @ @ @ @ @ @U JG @ @ @ @ @ @ @߸N @ @ @ @ @ @ @઀W#@ @ @ @ @ @ @ pc oyB'@ @ @ @ @ @ @ pmU)T IDATUR @ @ @ @ @ @ @7< @ @ @ @ @ @ @*`aU) @ @ @ @ @ @ @Xw  @ @ @ @ @ @ @\| @ @ @ @ @ @ @n,`a;O @ @ @ @ @ @ @ XX|UJ> @ @ @ @ @ @ @7Ɲ't @ @ @ @ @ @ @W,,*% @ @ @ @ @ @ @ XX|: @ @ @ @ @ @ @ua\m| @ @ @ @ @ @ @|Y.,/. @ @ @ @ @ @ @WG @ @ @ @ @ @ @ / B @ @ @ @ @ @ @߽O @ @ @ @ @ @ @d!@ @ @ @ @ @ @ pw ރ'@ @ @ @ @ @ @ pA H @ @ @ @ @ @ @wA @ @ @ @ @ @ @ `a$Y @ @ @ @ @ @ @]  @ @ @ @ @ @ @\, @ @ @ @ @ @ @.`a{P @ @ @ @ @ @ @.XX|I @ @ @ @ @ @ @w=(~ @ @ @ @ @ @ @,,$  @ @ @ @ @ @ @ XX|? @ @ @ @ @ @ @ _@ @ @ @ @ @ @ @,,{ @ @ @ @ @ @ @ / B @ @ @ @ @ @ @߽O @ @ @ @ @ @ @d!@ @ @ @ @ @ @ pw ރ'@ @ @ @ @ @ @ pA H @ @ @ @ @ @ @wA @ @ @ @ @ @ @ `a$Y @ @ @ @ @ @ @]  @ @ @ @ @ @ @\, @ @ @ @ @ @ @.`a{P @ @ @ @ @ @ @.XX|I @ @ @ @ @ @ @w=(~ @ @ @ @ @ @ @,,$  @ @ @ @ @ @ @ XX|? @ @ @ @ @ @ @ _@ @ @ @ @ @ @ @,,{ @ @ @ @ @ @ @ / B @ @ @ @ @ @ @߽O @ @ @ @ @ @ @d!@ @ @ @ @ @ @ pwru7|IENDB`nltk-3.7/nltk/test/index.doctest000066400000000000000000000050511420073152400167570ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. For license information, see LICENSE.TXT .. _align howto: align.html .. _ccg howto: ccg.html .. _chat80 howto: chat80.html .. _childes howto: childes.html .. _chunk howto: chunk.html .. _classify howto: classify.html .. _collocations howto: collocations.html .. _compat howto: compat.html .. _corpus howto: corpus.html .. _data howto: data.html .. _dependency howto: dependency.html .. _discourse howto: discourse.html .. _drt howto: drt.html .. _featgram howto: featgram.html .. _featstruct howto: featstruct.html .. _framenet howto: framenet.html .. _generate howto: generate.html .. _gluesemantics howto: gluesemantics.html .. _gluesemantics_malt howto: gluesemantics_malt.html .. 
_grammar howto: grammar.html .. _grammartestsuites howto: grammartestsuites.html .. _index howto: index.html .. _inference howto: inference.html .. _internals howto: internals.html .. _japanese howto: japanese.html .. _logic howto: logic.html .. _metrics howto: metrics.html .. _misc howto: misc.html .. _nonmonotonic howto: nonmonotonic.html .. _parse howto: parse.html .. _portuguese_en howto: portuguese_en.html .. _probability howto: probability.html .. _propbank howto: propbank.html .. _relextract howto: relextract.html .. _resolution howto: resolution.html .. _semantics howto: semantics.html .. _simple howto: simple.html .. _stem howto: stem.html .. _tag howto: tag.html .. _tokenize howto: tokenize.html .. _toolbox howto: toolbox.html .. _tree howto: tree.html .. _treetransforms howto: treetransforms.html .. _util howto: util.html .. _wordnet howto: wordnet.html .. _wordnet_lch howto: wordnet_lch.html =========== NLTK HOWTOs =========== * `align HOWTO`_ * `ccg HOWTO`_ * `chat80 HOWTO`_ * `childes HOWTO`_ * `chunk HOWTO`_ * `classify HOWTO`_ * `collocations HOWTO`_ * `compat HOWTO`_ * `corpus HOWTO`_ * `data HOWTO`_ * `dependency HOWTO`_ * `discourse HOWTO`_ * `drt HOWTO`_ * `featgram HOWTO`_ * `featstruct HOWTO`_ * `framenet HOWTO`_ * `generate HOWTO`_ * `gluesemantics HOWTO`_ * `gluesemantics_malt HOWTO`_ * `grammar HOWTO`_ * `grammartestsuites HOWTO`_ * `index HOWTO`_ * `inference HOWTO`_ * `internals HOWTO`_ * `japanese HOWTO`_ * `logic HOWTO`_ * `metrics HOWTO`_ * `misc HOWTO`_ * `nonmonotonic HOWTO`_ * `parse HOWTO`_ * `portuguese_en HOWTO`_ * `probability HOWTO`_ * `propbank HOWTO`_ * `relextract HOWTO`_ * `resolution HOWTO`_ * `semantics HOWTO`_ * `simple HOWTO`_ * `stem HOWTO`_ * `tag HOWTO`_ * `tokenize HOWTO`_ * `toolbox HOWTO`_ * `tree HOWTO`_ * `treetransforms HOWTO`_ * `util HOWTO`_ * `wordnet HOWTO`_ * `wordnet_lch HOWTO`_ nltk-3.7/nltk/test/inference.doctest000066400000000000000000000426421420073152400176150ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. For license information, see LICENSE.TXT ==================================== Logical Inference and Model Building ==================================== >>> from nltk.test.inference_fixt import setup_module >>> setup_module() >>> from nltk import * >>> from nltk.sem.drt import DrtParser >>> from nltk.sem import logic >>> logic._counter._value = 0 ------------ Introduction ------------ Within the area of automated reasoning, first order theorem proving and model building (or model generation) have both received much attention, and have given rise to highly sophisticated techniques. We focus therefore on providing an NLTK interface to third party tools for these tasks. In particular, the module ``nltk.inference`` can be used to access both theorem provers and model builders. --------------------------------- NLTK Interface to Theorem Provers --------------------------------- The main class used to interface with a theorem prover is the ``Prover`` class, found in ``nltk.api``. The ``prove()`` method takes three optional arguments: a goal, a list of assumptions, and a ``verbose`` boolean to indicate whether the proof should be printed to the console. The proof goal and any assumptions need to be instances of the ``Expression`` class specified by ``nltk.sem.logic``. There are currently three theorem provers included with NLTK: ``Prover9``, ``TableauProver``, and ``ResolutionProver``. 
The first is an off-the-shelf prover, while the other two are written in Python and included in the ``nltk.inference`` package. >>> from nltk.sem import Expression >>> read_expr = Expression.fromstring >>> p1 = read_expr('man(socrates)') >>> p2 = read_expr('all x.(man(x) -> mortal(x))') >>> c = read_expr('mortal(socrates)') >>> Prover9().prove(c, [p1,p2]) True >>> TableauProver().prove(c, [p1,p2]) True >>> ResolutionProver().prove(c, [p1,p2], verbose=True) [1] {-mortal(socrates)} A [2] {man(socrates)} A [3] {-man(z2), mortal(z2)} A [4] {-man(socrates)} (1, 3) [5] {mortal(socrates)} (2, 3) [6] {} (1, 5) True --------------------- The ``ProverCommand`` --------------------- A ``ProverCommand`` is a stateful holder for a theorem prover. The command stores a theorem prover instance (of type ``Prover``), a goal, a list of assumptions, the result of the proof, and a string version of the entire proof. Corresponding to the three included ``Prover`` implementations, there are three ``ProverCommand`` implementations: ``Prover9Command``, ``TableauProverCommand``, and ``ResolutionProverCommand``. The ``ProverCommand``'s constructor takes its goal and assumptions. The ``prove()`` command executes the ``Prover`` and ``proof()`` returns a String form of the proof If the ``prove()`` method has not been called, then the prover command will be unable to display a proof. >>> prover = ResolutionProverCommand(c, [p1,p2]) >>> print(prover.proof()) Traceback (most recent call last): File "...", line 1212, in __run compileflags, 1) in test.globs File "", line 1, in File "...", line ..., in proof raise LookupError("You have to call prove() first to get a proof!") LookupError: You have to call prove() first to get a proof! >>> prover.prove() True >>> print(prover.proof()) [1] {-mortal(socrates)} A [2] {man(socrates)} A [3] {-man(z4), mortal(z4)} A [4] {-man(socrates)} (1, 3) [5] {mortal(socrates)} (2, 3) [6] {} (1, 5) The prover command stores the result of proving so that if ``prove()`` is called again, then the command can return the result without executing the prover again. This allows the user to access the result of the proof without wasting time re-computing what it already knows. >>> prover.prove() True >>> prover.prove() True The assumptions and goal may be accessed using the ``assumptions()`` and ``goal()`` methods, respectively. >>> prover.assumptions() [, mortal(x))>] >>> prover.goal() The assumptions list may be modified using the ``add_assumptions()`` and ``retract_assumptions()`` methods. Both methods take a list of ``Expression`` objects. Since adding or removing assumptions may change the result of the proof, the stored result is cleared when either of these methods are called. That means that ``proof()`` will be unavailable until ``prove()`` is called and a call to ``prove()`` will execute the theorem prover. >>> prover.retract_assumptions([read_expr('man(socrates)')]) >>> print(prover.proof()) Traceback (most recent call last): File "...", line 1212, in __run compileflags, 1) in test.globs File "", line 1, in File "...", line ..., in proof raise LookupError("You have to call prove() first to get a proof!") LookupError: You have to call prove() first to get a proof! 
>>> prover.prove() False >>> print(prover.proof()) [1] {-mortal(socrates)} A [2] {-man(z6), mortal(z6)} A [3] {-man(socrates)} (1, 2) >>> prover.add_assumptions([read_expr('man(socrates)')]) >>> prover.prove() True ------- Prover9 ------- Prover9 Installation ~~~~~~~~~~~~~~~~~~~~ You can download Prover9 from https://www.cs.unm.edu/~mccune/prover9/. Extract the source code into a suitable directory and follow the instructions in the Prover9 ``README.make`` file to compile the executables. Install these into an appropriate location; the ``prover9_search`` variable is currently configured to look in the following locations: >>> p = Prover9() >>> p.binary_locations() ['/usr/local/bin/prover9', '/usr/local/bin/prover9/bin', '/usr/local/bin', '/usr/bin', '/usr/local/prover9', '/usr/local/share/prover9'] Alternatively, the environment variable ``PROVER9HOME`` may be configured with the binary's location. The path to the correct directory can be set manually in the following manner: >>> config_prover9(path='/usr/local/bin') # doctest: +SKIP [Found prover9: /usr/local/bin/prover9] If the executables cannot be found, ``Prover9`` will issue a warning message: >>> p.prove() # doctest: +SKIP Traceback (most recent call last): ... LookupError: =========================================================================== NLTK was unable to find the prover9 executable! Use config_prover9() or set the PROVER9HOME environment variable. >> config_prover9('/path/to/prover9') For more information, on prover9, see: =========================================================================== Using Prover9 ~~~~~~~~~~~~~ The general case in theorem proving is to determine whether ``S |- g`` holds, where ``S`` is a possibly empty set of assumptions, and ``g`` is a proof goal. As mentioned earlier, NLTK input to ``Prover9`` must be ``Expression``\ s of ``nltk.sem.logic``. A ``Prover9`` instance is initialized with a proof goal and, possibly, some assumptions. The ``prove()`` method attempts to find a proof of the goal, given the list of assumptions (in this case, none). >>> goal = read_expr('(man(x) <-> --man(x))') >>> prover = Prover9Command(goal) >>> prover.prove() True Given a ``ProverCommand`` instance ``prover``, the method ``prover.proof()`` will return a String of the extensive proof information provided by Prover9, shown in abbreviated form here:: ============================== Prover9 =============================== Prover9 (32) version ... Process ... was started by ... on ... ... The command was ".../prover9 -f ...". ============================== end of head =========================== ============================== INPUT ================================= % Reading from file /var/... formulas(goals). (all x (man(x) -> man(x))). end_of_list. ... ============================== end of search ========================= THEOREM PROVED Exiting with 1 proof. Process 6317 exit (max_proofs) Mon Jan 21 15:23:28 2008 As mentioned earlier, we may want to list some assumptions for the proof, as shown here. 
>>> g = read_expr('mortal(socrates)') >>> a1 = read_expr('all x.(man(x) -> mortal(x))') >>> prover = Prover9Command(g, assumptions=[a1]) >>> prover.print_assumptions() all x.(man(x) -> mortal(x)) However, the assumptions are not sufficient to derive the goal: >>> print(prover.prove()) False So let's add another assumption: >>> a2 = read_expr('man(socrates)') >>> prover.add_assumptions([a2]) >>> prover.print_assumptions() all x.(man(x) -> mortal(x)) man(socrates) >>> print(prover.prove()) True We can also show the assumptions in ``Prover9`` format. >>> prover.print_assumptions(output_format='Prover9') all x (man(x) -> mortal(x)) man(socrates) >>> prover.print_assumptions(output_format='Spass') Traceback (most recent call last): . . . NameError: Unrecognized value for 'output_format': Spass Assumptions can be retracted from the list of assumptions. >>> prover.retract_assumptions([a1]) >>> prover.print_assumptions() man(socrates) >>> prover.retract_assumptions([a1]) Statements can be loaded from a file and parsed. We can then add these statements as new assumptions. >>> g = read_expr('all x.(boxer(x) -> -boxerdog(x))') >>> prover = Prover9Command(g) >>> prover.prove() False >>> import nltk.data >>> new = nltk.data.load('grammars/sample_grammars/background0.fol') >>> for a in new: ... print(a) all x.(boxerdog(x) -> dog(x)) all x.(boxer(x) -> person(x)) all x.-(dog(x) & person(x)) exists x.boxer(x) exists x.boxerdog(x) >>> prover.add_assumptions(new) >>> print(prover.prove()) True >>> print(prover.proof()) ============================== prooftrans ============================ Prover9 (...) version ... Process ... was started by ... on ... ... The command was ".../prover9". ============================== end of head =========================== ============================== end of input ========================== ============================== PROOF ================================= % -------- Comments from original proof -------- % Proof 1 at ... seconds. % Length of proof is 13. % Level of proof is 4. % Maximum clause weight is 0. % Given clauses 0. 1 (all x (boxerdog(x) -> dog(x))). [assumption]. 2 (all x (boxer(x) -> person(x))). [assumption]. 3 (all x -(dog(x) & person(x))). [assumption]. 6 (all x (boxer(x) -> -boxerdog(x))). [goal]. 8 -boxerdog(x) | dog(x). [clausify(1)]. 9 boxerdog(c3). [deny(6)]. 11 -boxer(x) | person(x). [clausify(2)]. 12 boxer(c3). [deny(6)]. 14 -dog(x) | -person(x). [clausify(3)]. 15 dog(c3). [resolve(9,a,8,a)]. 18 person(c3). [resolve(12,a,11,a)]. 19 -person(c3). [resolve(15,a,14,a)]. 20 $F. [resolve(19,a,18,a)]. ============================== end of proof ========================== ---------------------- The equiv() method ---------------------- One application of the theorem prover functionality is to check if two Expressions have the same meaning. The ``equiv()`` method calls a theorem prover to determine whether two Expressions are logically equivalent. >>> a = read_expr(r'exists x.(man(x) & walks(x))') >>> b = read_expr(r'exists x.(walks(x) & man(x))') >>> print(a.equiv(b)) True The same method can be used on Discourse Representation Structures (DRSs). In this case, each DRS is converted to a first order logic form, and then passed to the theorem prover. >>> dp = DrtParser() >>> a = dp.parse(r'([x],[man(x), walks(x)])') >>> b = dp.parse(r'([x],[walks(x), man(x)])') >>> print(a.equiv(b)) True -------------------------------- NLTK Interface to Model Builders -------------------------------- The top-level to model builders is parallel to that for theorem-provers. 
The ``ModelBuilder`` interface is located in ``nltk.inference.api``. It is currently only implemented by ``Mace``, which interfaces with the Mace4 model builder. Typically we use a model builder to show that some set of formulas has a model, and is therefore consistent. One way of doing this is by treating our candidate set of sentences as assumptions, and leaving the goal unspecified. Thus, the following interaction shows how both ``{a, c1}`` and ``{a, c2}`` are consistent sets, since Mace succeeds in a building a model for each of them, while ``{c1, c2}`` is inconsistent. >>> a3 = read_expr('exists x.(man(x) and walks(x))') >>> c1 = read_expr('mortal(socrates)') >>> c2 = read_expr('-mortal(socrates)') >>> mace = Mace() >>> print(mace.build_model(None, [a3, c1])) True >>> print(mace.build_model(None, [a3, c2])) True We can also use the model builder as an adjunct to theorem prover. Let's suppose we are trying to prove ``S |- g``, i.e. that ``g`` is logically entailed by assumptions ``S = {s1, s2, ..., sn}``. We can this same input to Mace4, and the model builder will try to find a counterexample, that is, to show that ``g`` does *not* follow from ``S``. So, given this input, Mace4 will try to find a model for the set ``S' = {s1, s2, ..., sn, (not g)}``. If ``g`` fails to follow from ``S``, then Mace4 may well return with a counterexample faster than Prover9 concludes that it cannot find the required proof. Conversely, if ``g`` *is* provable from ``S``, Mace4 may take a long time unsuccessfully trying to find a counter model, and will eventually give up. In the following example, we see that the model builder does succeed in building a model of the assumptions together with the negation of the goal. That is, it succeeds in finding a model where there is a woman that every man loves; Adam is a man; Eve is a woman; but Adam does not love Eve. >>> a4 = read_expr('exists y. (woman(y) & all x. (man(x) -> love(x,y)))') >>> a5 = read_expr('man(adam)') >>> a6 = read_expr('woman(eve)') >>> g = read_expr('love(adam,eve)') >>> print(mace.build_model(g, [a4, a5, a6])) True The Model Builder will fail to find a model if the assumptions do entail the goal. Mace will continue to look for models of ever-increasing sizes until the end_size number is reached. By default, end_size is 500, but it can be set manually for quicker response time. >>> a7 = read_expr('all x.(man(x) -> mortal(x))') >>> a8 = read_expr('man(socrates)') >>> g2 = read_expr('mortal(socrates)') >>> print(Mace(end_size=50).build_model(g2, [a7, a8])) False There is also a ``ModelBuilderCommand`` class that, like ``ProverCommand``, stores a ``ModelBuilder``, a goal, assumptions, a result, and a model. The only implementation in NLTK is ``MaceCommand``. ----- Mace4 ----- Mace4 Installation ~~~~~~~~~~~~~~~~~~ Mace4 is packaged with Prover9, and can be downloaded from the same source, namely https://www.cs.unm.edu/~mccune/prover9/. It is installed in the same manner as Prover9. Using Mace4 ~~~~~~~~~~~ Check whether Mace4 can find a model. >>> a = read_expr('(see(mary,john) & -(mary = john))') >>> mb = MaceCommand(assumptions=[a]) >>> mb.build_model() True Show the model in 'tabular' format. >>> print(mb.model(format='tabular')) % number = 1 % seconds = 0 % Interpretation of size 2 john : 0 mary : 1 see : | 0 1 ---+---- 0 | 0 0 1 | 1 0 Show the model in 'tabular' format. >>> print(mb.model(format='cooked')) % number = 1 % seconds = 0 % Interpretation of size 2 john = 0. mary = 1. - see(0,0). - see(0,1). see(1,0). - see(1,1). 
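Stepping back for a moment: the earlier point about using Mace4 as an adjunct to Prover9 can be wrapped up in a few lines of ordinary Python. The snippet below is a rough sketch rather than part of the original test suite and, like the doctests above, it assumes the ``prover9`` and ``mace4`` binaries are installed and on the configured search path; it tries the prover first and only consults the model builder when no proof is found. Because it is a plain literal block rather than a doctest, it is not executed by the test runner::

    # Sketch only: decide whether `goal` follows from `assumptions`, falling
    # back to Mace4 to look for a countermodel when Prover9 finds no proof.
    from nltk.sem import Expression
    from nltk.inference import Mace, Prover9

    read_expr = Expression.fromstring
    assumptions = [read_expr('all x.(man(x) -> mortal(x))'),
                   read_expr('man(socrates)')]
    goal = read_expr('mortal(socrates)')

    if Prover9().prove(goal, assumptions):
        print('proved: the goal follows from the assumptions')
    elif Mace(end_size=50).build_model(goal, assumptions):
        # build_model() searches for a model of the assumptions plus the
        # negated goal, i.e. a countermodel to the claimed entailment.
        print('countermodel found: the goal does not follow')
    else:
        print('undecided within the search bounds')

Trying the prover first is just one possible ordering; the two searches can equally well be run side by side, with whichever tool finishes first settling the question.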
The property ``valuation`` accesses the stored ``Valuation``. >>> print(mb.valuation) {'john': 'a', 'mary': 'b', 'see': {('b', 'a')}} We can return to our earlier example and inspect the model: >>> mb = MaceCommand(g, assumptions=[a4, a5, a6]) >>> m = mb.build_model() >>> print(mb.model(format='cooked')) % number = 1 % seconds = 0 % Interpretation of size 2 adam = 0. eve = 0. c1 = 1. man(0). - man(1). woman(0). woman(1). - love(0,0). love(0,1). - love(1,0). - love(1,1). Here, we can see that ``adam`` and ``eve`` have been assigned the same individual, namely ``0`` as value; ``0`` is both a man and a woman; a second individual ``1`` is also a woman; and ``0`` loves ``1``. Thus, this is an interpretation in which there is a woman that every man loves but Adam doesn't love Eve. Mace can also be used with propositional logic. >>> p = read_expr('P') >>> q = read_expr('Q') >>> mb = MaceCommand(q, [p, p>-q]) >>> mb.build_model() True >>> mb.valuation['P'] True >>> mb.valuation['Q'] False nltk-3.7/nltk/test/inference_fixt.py000066400000000000000000000003741420073152400176260ustar00rootroot00000000000000def setup_module(): import pytest from nltk.inference.mace import Mace try: m = Mace() m._find_binary("mace4") except LookupError: pytest.skip("Mace4/Prover9 is not available so inference.doctest was skipped") nltk-3.7/nltk/test/internals.doctest000066400000000000000000000100321420073152400176420ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. For license information, see LICENSE.TXT ========================================== Unit tests for the nltk.utilities module ========================================== overridden() ~~~~~~~~~~~~ >>> from nltk.internals import overridden The typical use case is in defining methods for an interface or abstract base class, in such a way that subclasses don't have to implement all of the methods: >>> class EaterI(object): ... '''Subclass must define eat() or batch_eat().''' ... def eat(self, food): ... if overridden(self.batch_eat): ... return self.batch_eat([food])[0] ... else: ... raise NotImplementedError() ... def batch_eat(self, foods): ... return [self.eat(food) for food in foods] As long as a subclass implements one method, it will be used to perform the other method: >>> class GoodEater1(EaterI): ... def eat(self, food): ... return 'yum' >>> GoodEater1().eat('steak') 'yum' >>> GoodEater1().batch_eat(['steak', 'peas']) ['yum', 'yum'] >>> class GoodEater2(EaterI): ... def batch_eat(self, foods): ... return ['yum' for food in foods] >>> GoodEater2().eat('steak') 'yum' >>> GoodEater2().batch_eat(['steak', 'peas']) ['yum', 'yum'] But if a subclass doesn't implement either one, then they'll get an error when they try to call them. (nb this is better than infinite recursion): >>> class BadEater1(EaterI): ... pass >>> BadEater1().eat('steak') Traceback (most recent call last): . . . NotImplementedError >>> BadEater1().batch_eat(['steak', 'peas']) Traceback (most recent call last): . . . NotImplementedError Trying to use the abstract base class itself will also result in an error: >>> class EaterI(EaterI): ... pass >>> EaterI().eat('steak') Traceback (most recent call last): . . . NotImplementedError >>> EaterI().batch_eat(['steak', 'peas']) Traceback (most recent call last): . . . NotImplementedError It's ok to use intermediate abstract classes: >>> class AbstractEater(EaterI): ... pass >>> class GoodEater3(AbstractEater): ... def eat(self, food): ... return 'yum' ... 
>>> GoodEater3().eat('steak') 'yum' >>> GoodEater3().batch_eat(['steak', 'peas']) ['yum', 'yum'] >>> class GoodEater4(AbstractEater): ... def batch_eat(self, foods): ... return ['yum' for food in foods] >>> GoodEater4().eat('steak') 'yum' >>> GoodEater4().batch_eat(['steak', 'peas']) ['yum', 'yum'] >>> class BadEater2(AbstractEater): ... pass >>> BadEater2().eat('steak') Traceback (most recent call last): . . . NotImplementedError >>> BadEater2().batch_eat(['steak', 'peas']) Traceback (most recent call last): . . . NotImplementedError Here's some extra tests: >>> class A(object): ... def f(x): pass >>> class B(A): ... def f(x): pass >>> class C(A): pass >>> class D(B): pass >>> overridden(A().f) False >>> overridden(B().f) True >>> overridden(C().f) False >>> overridden(D().f) True It works for classic classes, too: >>> class A: ... def f(x): pass >>> class B(A): ... def f(x): pass >>> class C(A): pass >>> class D(B): pass >>> overridden(A().f) False >>> overridden(B().f) True >>> overridden(C().f) False >>> overridden(D().f) True read_str() ~~~~~~~~~~~~ >>> from nltk.internals import read_str Test valid scenarios >>> read_str("'valid string'", 0) ('valid string', 14) Now test invalid scenarios >>> read_str("should error", 0) Traceback (most recent call last): ... nltk.internals.ReadError: Expected open quote at 0 >>> read_str("'should error", 0) Traceback (most recent call last): ... nltk.internals.ReadError: Expected close quote at 1 nltk-3.7/nltk/test/japanese.doctest000066400000000000000000000020251420073152400174340ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. For license information, see LICENSE.TXT ============================ Japanese Language Processing ============================ >>> from nltk import * ------------- Corpus Access ------------- KNB Corpus ---------- >>> from nltk.corpus import knbc Access the words: this should produce a list of strings: >>> type(knbc.words()[0]) is not bytes True Access the sentences: this should produce a list of lists of strings: >>> type(knbc.sents()[0][0]) is not bytes True Access the tagged words: this should produce a list of word, tag pairs: >>> type(knbc.tagged_words()[0]) <... 'tuple'> Access the tagged sentences: this should produce a list of lists of word, tag pairs: >>> type(knbc.tagged_sents()[0][0]) <... 'tuple'> JEITA Corpus ------------ >>> from nltk.corpus import jeita Access the tagged words: this should produce a list of word, tag pairs, where a tag is a string: >>> type(jeita.tagged_words()[0][1]) is not bytes True nltk-3.7/nltk/test/lm.doctest000066400000000000000000000073501420073152400162640ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. For license information, see LICENSE.TXT .. -*- coding: utf-8 -*- Regression Tests ================ Issue 167 --------- https://github.com/nltk/nltk/issues/167 >>> from nltk.corpus import brown >>> from nltk.lm.preprocessing import padded_everygram_pipeline >>> ngram_order = 3 >>> train_data, vocab_data = padded_everygram_pipeline( ... ngram_order, ... brown.sents(categories="news") ... ) >>> from nltk.lm import WittenBellInterpolated >>> lm = WittenBellInterpolated(ngram_order) >>> lm.fit(train_data, vocab_data) Sentence containing an unseen word should result in infinite entropy because Witten-Bell is based ultimately on MLE, which cannot handle unseen ngrams. Crucially, it shouldn't raise any exceptions for unseen words. 
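The doctest just below verifies exactly this behaviour. As an aside (a sketch, not part of the original regression test, and not executed by the test runner), the fitted model's vocabulary can be asked directly which tokens it will treat as unseen; items outside the vocabulary are mapped to the ``<UNK>`` label::

    # Assumes `lm` is the WittenBellInterpolated model fitted above.
    words = "This is a sentence with the word aaddvark".split()
    print(lm.vocab.lookup(words))
    # 'aaddvark' (and any other unseen token) comes back as '<UNK>'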
>>> from nltk.util import ngrams >>> sent = ngrams("This is a sentence with the word aaddvark".split(), 3) >>> lm.entropy(sent) inf If we remove all unseen ngrams from the sentence, we'll get a non-infinite value for the entropy. >>> sent = ngrams("This is a sentence".split(), 3) >>> round(lm.entropy(sent), 14) 10.23701322869105 Issue 367 --------- https://github.com/nltk/nltk/issues/367 Reproducing Dan Blanchard's example: https://github.com/nltk/nltk/issues/367#issuecomment-14646110 >>> from nltk.lm import Lidstone, Vocabulary >>> word_seq = list('aaaababaaccbacb') >>> ngram_order = 2 >>> from nltk.util import everygrams >>> train_data = [everygrams(word_seq, max_len=ngram_order)] >>> V = Vocabulary(['a', 'b', 'c', '']) >>> lm = Lidstone(0.2, ngram_order, vocabulary=V) >>> lm.fit(train_data) For doctest to work we have to sort the vocabulary keys. >>> V_keys = sorted(V) >>> round(sum(lm.score(w, ("b",)) for w in V_keys), 6) 1.0 >>> round(sum(lm.score(w, ("a",)) for w in V_keys), 6) 1.0 >>> [lm.score(w, ("b",)) for w in V_keys] [0.05, 0.05, 0.8, 0.05, 0.05] >>> [round(lm.score(w, ("a",)), 4) for w in V_keys] [0.0222, 0.0222, 0.4667, 0.2444, 0.2444] Here's reproducing @afourney's comment: https://github.com/nltk/nltk/issues/367#issuecomment-15686289 >>> sent = ['foo', 'foo', 'foo', 'foo', 'bar', 'baz'] >>> ngram_order = 3 >>> from nltk.lm.preprocessing import padded_everygram_pipeline >>> train_data, vocab_data = padded_everygram_pipeline(ngram_order, [sent]) >>> from nltk.lm import Lidstone >>> lm = Lidstone(0.2, ngram_order) >>> lm.fit(train_data, vocab_data) The vocabulary includes the "UNK" symbol as well as two padding symbols. >>> len(lm.vocab) 6 >>> word = "foo" >>> context = ("bar", "baz") The raw counts. >>> lm.context_counts(context)[word] 0 >>> lm.context_counts(context).N() 1 Counts with Lidstone smoothing. >>> lm.context_counts(context)[word] + lm.gamma 0.2 >>> lm.context_counts(context).N() + len(lm.vocab) * lm.gamma 2.2 Without any backoff, just using Lidstone smoothing, P("foo" | "bar", "baz") should be: 0.2 / 2.2 ~= 0.090909 >>> round(lm.score(word, context), 6) 0.090909 Issue 380 --------- https://github.com/nltk/nltk/issues/380 Reproducing setup akin to this comment: https://github.com/nltk/nltk/issues/380#issue-12879030 For speed take only the first 100 sentences of reuters. Shouldn't affect the test. >>> from nltk.corpus import reuters >>> sents = reuters.sents()[:100] >>> ngram_order = 3 >>> from nltk.lm.preprocessing import padded_everygram_pipeline >>> train_data, vocab_data = padded_everygram_pipeline(ngram_order, sents) >>> from nltk.lm import Lidstone >>> lm = Lidstone(0.2, ngram_order) >>> lm.fit(train_data, vocab_data) >>> lm.score("said", ("",)) < 1 True nltk-3.7/nltk/test/logic.doctest000066400000000000000000001024471420073152400167540ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. For license information, see LICENSE.TXT ======================= Logic & Lambda Calculus ======================= The `nltk.logic` package allows expressions of First-Order Logic (FOL) to be parsed into ``Expression`` objects. In addition to FOL, the parser handles lambda-abstraction with variables of higher order. 
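As a quick taste before the systematic overview (a sketch only; everything shown here is exercised in detail by the doctests that follow), an expression is parsed with ``Expression.fromstring`` and beta-reduced with ``simplify()``::

    from nltk.sem.logic import Expression

    expr = Expression.fromstring(r'\x.man(x)(john)')
    print(expr)              # \x.man(x)(john)
    print(expr.simplify())   # man(john)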
-------- Overview -------- >>> from nltk.sem.logic import * The default inventory of logical constants is the following: >>> boolean_ops() negation - conjunction & disjunction | implication -> equivalence <-> >>> equality_preds() equality = inequality != >>> binding_ops() existential exists universal all lambda \ ---------------- Regression Tests ---------------- Untyped Logic +++++++++++++ Process logical expressions conveniently: >>> read_expr = Expression.fromstring Test for equality under alpha-conversion ======================================== >>> e1 = read_expr('exists x.P(x)') >>> print(e1) exists x.P(x) >>> e2 = e1.alpha_convert(Variable('z')) >>> print(e2) exists z.P(z) >>> e1 == e2 True >>> l = read_expr(r'\X.\X.X(X)(1)').simplify() >>> id = read_expr(r'\X.X(X)') >>> l == id True Test numerals ============= >>> zero = read_expr(r'\F x.x') >>> one = read_expr(r'\F x.F(x)') >>> two = read_expr(r'\F x.F(F(x))') >>> three = read_expr(r'\F x.F(F(F(x)))') >>> four = read_expr(r'\F x.F(F(F(F(x))))') >>> succ = read_expr(r'\N F x.F(N(F,x))') >>> plus = read_expr(r'\M N F x.M(F,N(F,x))') >>> mult = read_expr(r'\M N F.M(N(F))') >>> pred = read_expr(r'\N F x.(N(\G H.H(G(F)))(\u.x)(\u.u))') >>> v1 = ApplicationExpression(succ, zero).simplify() >>> v1 == one True >>> v2 = ApplicationExpression(succ, v1).simplify() >>> v2 == two True >>> v3 = ApplicationExpression(ApplicationExpression(plus, v1), v2).simplify() >>> v3 == three True >>> v4 = ApplicationExpression(ApplicationExpression(mult, v2), v2).simplify() >>> v4 == four True >>> v5 = ApplicationExpression(pred, ApplicationExpression(pred, v4)).simplify() >>> v5 == two True Overloaded operators also exist, for convenience. >>> print(succ(zero).simplify() == one) True >>> print(plus(one,two).simplify() == three) True >>> print(mult(two,two).simplify() == four) True >>> print(pred(pred(four)).simplify() == two) True >>> john = read_expr(r'john') >>> man = read_expr(r'\x.man(x)') >>> walk = read_expr(r'\x.walk(x)') >>> man(john).simplify() >>> print(-walk(john).simplify()) -walk(john) >>> print((man(john) & walk(john)).simplify()) (man(john) & walk(john)) >>> print((man(john) | walk(john)).simplify()) (man(john) | walk(john)) >>> print((man(john) > walk(john)).simplify()) (man(john) -> walk(john)) >>> print((man(john) < walk(john)).simplify()) (man(john) <-> walk(john)) Python's built-in lambda operator can also be used with Expressions >>> john = VariableExpression(Variable('john')) >>> run_var = VariableExpression(Variable('run')) >>> run = lambda x: run_var(x) >>> run(john) ``betaConversionTestSuite.pl`` ------------------------------ Tests based on Blackburn & Bos' book, *Representation and Inference for Natural Language*. 
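Each pair below follows the same pattern: parse both terms, beta-reduce them with ``simplify()``, and check that the results are equal. Spelled out for the first case (a sketch, not an additional test, and not executed by the test runner)::

    from nltk.sem.logic import Expression

    read_expr = Expression.fromstring
    # Applying \P.P(mia) to \x.walk(x) substitutes the predicate for P,
    # leaving (\x.walk(x))(mia), which beta-reduces to walk(mia).
    lhs = read_expr(r'\P.P(mia)(\x.walk(x))').simplify()
    rhs = read_expr('walk(mia)').simplify()
    assert lhs == rhs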
>>> x1 = read_expr(r'\P.P(mia)(\x.walk(x))').simplify() >>> x2 = read_expr(r'walk(mia)').simplify() >>> x1 == x2 True >>> x1 = read_expr(r'exists x.(man(x) & ((\P.exists x.(woman(x) & P(x)))(\y.love(x,y))))').simplify() >>> x2 = read_expr(r'exists x.(man(x) & exists y.(woman(y) & love(x,y)))').simplify() >>> x1 == x2 True >>> x1 = read_expr(r'\a.sleep(a)(mia)').simplify() >>> x2 = read_expr(r'sleep(mia)').simplify() >>> x1 == x2 True >>> x1 = read_expr(r'\a.\b.like(b,a)(mia)').simplify() >>> x2 = read_expr(r'\b.like(b,mia)').simplify() >>> x1 == x2 True >>> x1 = read_expr(r'\a.(\b.like(b,a)(vincent))').simplify() >>> x2 = read_expr(r'\a.like(vincent,a)').simplify() >>> x1 == x2 True >>> x1 = read_expr(r'\a.((\b.like(b,a)(vincent)) & sleep(a))').simplify() >>> x2 = read_expr(r'\a.(like(vincent,a) & sleep(a))').simplify() >>> x1 == x2 True >>> x1 = read_expr(r'(\a.\b.like(b,a)(mia)(vincent))').simplify() >>> x2 = read_expr(r'like(vincent,mia)').simplify() >>> x1 == x2 True >>> x1 = read_expr(r'P((\a.sleep(a)(vincent)))').simplify() >>> x2 = read_expr(r'P(sleep(vincent))').simplify() >>> x1 == x2 True >>> x1 = read_expr(r'\A.A((\b.sleep(b)(vincent)))').simplify() >>> x2 = read_expr(r'\A.A(sleep(vincent))').simplify() >>> x1 == x2 True >>> x1 = read_expr(r'\A.A(sleep(vincent))').simplify() >>> x2 = read_expr(r'\A.A(sleep(vincent))').simplify() >>> x1 == x2 True >>> x1 = read_expr(r'(\A.A(vincent)(\b.sleep(b)))').simplify() >>> x2 = read_expr(r'sleep(vincent)').simplify() >>> x1 == x2 True >>> x1 = read_expr(r'\A.believe(mia,A(vincent))(\b.sleep(b))').simplify() >>> x2 = read_expr(r'believe(mia,sleep(vincent))').simplify() >>> x1 == x2 True >>> x1 = read_expr(r'(\A.(A(vincent) & A(mia)))(\b.sleep(b))').simplify() >>> x2 = read_expr(r'(sleep(vincent) & sleep(mia))').simplify() >>> x1 == x2 True >>> x1 = read_expr(r'\A.\B.(\C.C(A(vincent))(\d.probably(d)) & (\C.C(B(mia))(\d.improbably(d))))(\f.walk(f))(\f.talk(f))').simplify() >>> x2 = read_expr(r'(probably(walk(vincent)) & improbably(talk(mia)))').simplify() >>> x1 == x2 True >>> x1 = read_expr(r'(\a.\b.(\C.C(a,b)(\d.\f.love(d,f))))(jules)(mia)').simplify() >>> x2 = read_expr(r'love(jules,mia)').simplify() >>> x1 == x2 True >>> x1 = read_expr(r'(\A.\B.exists c.(A(c) & B(c)))(\d.boxer(d),\d.sleep(d))').simplify() >>> x2 = read_expr(r'exists c.(boxer(c) & sleep(c))').simplify() >>> x1 == x2 True >>> x1 = read_expr(r'\A.Z(A)(\c.\a.like(a,c))').simplify() >>> x2 = read_expr(r'Z(\c.\a.like(a,c))').simplify() >>> x1 == x2 True >>> x1 = read_expr(r'\A.\b.A(b)(\c.\b.like(b,c))').simplify() >>> x2 = read_expr(r'\b.(\c.\b.like(b,c)(b))').simplify() >>> x1 == x2 True >>> x1 = read_expr(r'(\a.\b.(\C.C(a,b)(\b.\a.loves(b,a))))(jules)(mia)').simplify() >>> x2 = read_expr(r'loves(jules,mia)').simplify() >>> x1 == x2 True >>> x1 = read_expr(r'(\A.\b.(exists b.A(b) & A(b)))(\c.boxer(c))(vincent)').simplify() >>> x2 = read_expr(r'((exists b.boxer(b)) & boxer(vincent))').simplify() >>> x1 == x2 True Test Parser =========== >>> print(read_expr(r'john')) john >>> print(read_expr(r'x')) x >>> print(read_expr(r'-man(x)')) -man(x) >>> print(read_expr(r'--man(x)')) --man(x) >>> print(read_expr(r'(man(x))')) man(x) >>> print(read_expr(r'((man(x)))')) man(x) >>> print(read_expr(r'man(x) <-> tall(x)')) (man(x) <-> tall(x)) >>> print(read_expr(r'(man(x) <-> tall(x))')) (man(x) <-> tall(x)) >>> print(read_expr(r'(man(x) & tall(x) & walks(x))')) (man(x) & tall(x) & walks(x)) >>> print(read_expr(r'(man(x) & tall(x) & walks(x))').first) (man(x) & tall(x)) >>> 
print(read_expr(r'man(x) | tall(x) & walks(x)')) (man(x) | (tall(x) & walks(x))) >>> print(read_expr(r'((man(x) & tall(x)) | walks(x))')) ((man(x) & tall(x)) | walks(x)) >>> print(read_expr(r'man(x) & (tall(x) | walks(x))')) (man(x) & (tall(x) | walks(x))) >>> print(read_expr(r'(man(x) & (tall(x) | walks(x)))')) (man(x) & (tall(x) | walks(x))) >>> print(read_expr(r'P(x) -> Q(x) <-> R(x) | S(x) & T(x)')) ((P(x) -> Q(x)) <-> (R(x) | (S(x) & T(x)))) >>> print(read_expr(r'exists x.man(x)')) exists x.man(x) >>> print(read_expr(r'exists x.(man(x) & tall(x))')) exists x.(man(x) & tall(x)) >>> print(read_expr(r'exists x.(man(x) & tall(x) & walks(x))')) exists x.(man(x) & tall(x) & walks(x)) >>> print(read_expr(r'-P(x) & Q(x)')) (-P(x) & Q(x)) >>> read_expr(r'-P(x) & Q(x)') == read_expr(r'(-P(x)) & Q(x)') True >>> print(read_expr(r'\x.man(x)')) \x.man(x) >>> print(read_expr(r'\x.man(x)(john)')) \x.man(x)(john) >>> print(read_expr(r'\x.man(x)(john) & tall(x)')) (\x.man(x)(john) & tall(x)) >>> print(read_expr(r'\x.\y.sees(x,y)')) \x y.sees(x,y) >>> print(read_expr(r'\x y.sees(x,y)')) \x y.sees(x,y) >>> print(read_expr(r'\x.\y.sees(x,y)(a)')) (\x y.sees(x,y))(a) >>> print(read_expr(r'\x y.sees(x,y)(a)')) (\x y.sees(x,y))(a) >>> print(read_expr(r'\x.\y.sees(x,y)(a)(b)')) ((\x y.sees(x,y))(a))(b) >>> print(read_expr(r'\x y.sees(x,y)(a)(b)')) ((\x y.sees(x,y))(a))(b) >>> print(read_expr(r'\x.\y.sees(x,y)(a,b)')) ((\x y.sees(x,y))(a))(b) >>> print(read_expr(r'\x y.sees(x,y)(a,b)')) ((\x y.sees(x,y))(a))(b) >>> print(read_expr(r'((\x.\y.sees(x,y))(a))(b)')) ((\x y.sees(x,y))(a))(b) >>> print(read_expr(r'P(x)(y)(z)')) P(x,y,z) >>> print(read_expr(r'P(Q)')) P(Q) >>> print(read_expr(r'P(Q(x))')) P(Q(x)) >>> print(read_expr(r'(\x.exists y.walks(x,y))(x)')) (\x.exists y.walks(x,y))(x) >>> print(read_expr(r'exists x.(x = john)')) exists x.(x = john) >>> print(read_expr(r'((\P.\Q.exists x.(P(x) & Q(x)))(\x.dog(x)))(\x.bark(x))')) ((\P Q.exists x.(P(x) & Q(x)))(\x.dog(x)))(\x.bark(x)) >>> a = read_expr(r'exists c.exists b.A(b,c) & A(b,c)') >>> b = read_expr(r'(exists c.(exists b.A(b,c))) & A(b,c)') >>> print(a == b) True >>> a = read_expr(r'exists c.(exists b.A(b,c) & A(b,c))') >>> b = read_expr(r'exists c.((exists b.A(b,c)) & A(b,c))') >>> print(a == b) True >>> print(read_expr(r'exists x.x = y')) exists x.(x = y) >>> print(read_expr('A(B)(C)')) A(B,C) >>> print(read_expr('(A(B))(C)')) A(B,C) >>> print(read_expr('A((B)(C))')) A(B(C)) >>> print(read_expr('A(B(C))')) A(B(C)) >>> print(read_expr('(A)(B(C))')) A(B(C)) >>> print(read_expr('(((A)))(((B))(((C))))')) A(B(C)) >>> print(read_expr(r'A != B')) -(A = B) >>> print(read_expr('P(x) & x=y & P(y)')) (P(x) & (x = y) & P(y)) >>> try: print(read_expr(r'\walk.walk(x)')) ... except LogicalExpressionException as e: print(e) 'walk' is an illegal variable name. Constants may not be abstracted. \walk.walk(x) ^ >>> try: print(read_expr(r'all walk.walk(john)')) ... except LogicalExpressionException as e: print(e) 'walk' is an illegal variable name. Constants may not be quantified. all walk.walk(john) ^ >>> try: print(read_expr(r'x(john)')) ... except LogicalExpressionException as e: print(e) 'x' is an illegal predicate name. Individual variables may not be used as predicates. 
x(john) ^ >>> from nltk.sem.logic import LogicParser # hack to give access to custom quote chars >>> lpq = LogicParser() >>> lpq.quote_chars = [("'", "'", "\\", False)] >>> print(lpq.parse(r"(man(x) & 'tall\'s,' (x) & walks (x) )")) (man(x) & tall's,(x) & walks(x)) >>> lpq.quote_chars = [("'", "'", "\\", True)] >>> print(lpq.parse(r"'tall\'s,'")) 'tall\'s,' >>> print(lpq.parse(r"'spaced name(x)'")) 'spaced name(x)' >>> print(lpq.parse(r"-'tall\'s,'(x)")) -'tall\'s,'(x) >>> print(lpq.parse(r"(man(x) & 'tall\'s,' (x) & walks (x) )")) (man(x) & 'tall\'s,'(x) & walks(x)) Simplify ======== >>> print(read_expr(r'\x.man(x)(john)').simplify()) man(john) >>> print(read_expr(r'\x.((man(x)))(john)').simplify()) man(john) >>> print(read_expr(r'\x.\y.sees(x,y)(john, mary)').simplify()) sees(john,mary) >>> print(read_expr(r'\x y.sees(x,y)(john, mary)').simplify()) sees(john,mary) >>> print(read_expr(r'\x.\y.sees(x,y)(john)(mary)').simplify()) sees(john,mary) >>> print(read_expr(r'\x y.sees(x,y)(john)(mary)').simplify()) sees(john,mary) >>> print(read_expr(r'\x.\y.sees(x,y)(john)').simplify()) \y.sees(john,y) >>> print(read_expr(r'\x y.sees(x,y)(john)').simplify()) \y.sees(john,y) >>> print(read_expr(r'(\x.\y.sees(x,y)(john))(mary)').simplify()) sees(john,mary) >>> print(read_expr(r'(\x y.sees(x,y)(john))(mary)').simplify()) sees(john,mary) >>> print(read_expr(r'exists x.(man(x) & (\x.exists y.walks(x,y))(x))').simplify()) exists x.(man(x) & exists y.walks(x,y)) >>> e1 = read_expr(r'exists x.(man(x) & (\x.exists y.walks(x,y))(y))').simplify() >>> e2 = read_expr(r'exists x.(man(x) & exists z1.walks(y,z1))') >>> e1 == e2 True >>> print(read_expr(r'(\P Q.exists x.(P(x) & Q(x)))(\x.dog(x))').simplify()) \Q.exists x.(dog(x) & Q(x)) >>> print(read_expr(r'((\P.\Q.exists x.(P(x) & Q(x)))(\x.dog(x)))(\x.bark(x))').simplify()) exists x.(dog(x) & bark(x)) >>> print(read_expr(r'\P.(P(x)(y))(\a b.Q(a,b))').simplify()) Q(x,y) Replace ======= >>> a = read_expr(r'a') >>> x = read_expr(r'x') >>> y = read_expr(r'y') >>> z = read_expr(r'z') >>> print(read_expr(r'man(x)').replace(x.variable, a, False)) man(a) >>> print(read_expr(r'(man(x) & tall(x))').replace(x.variable, a, False)) (man(a) & tall(a)) >>> print(read_expr(r'exists x.man(x)').replace(x.variable, a, False)) exists x.man(x) >>> print(read_expr(r'exists x.man(x)').replace(x.variable, a, True)) exists a.man(a) >>> print(read_expr(r'exists x.give(x,y,z)').replace(y.variable, a, False)) exists x.give(x,a,z) >>> print(read_expr(r'exists x.give(x,y,z)').replace(y.variable, a, True)) exists x.give(x,a,z) >>> e1 = read_expr(r'exists x.give(x,y,z)').replace(y.variable, x, False) >>> e2 = read_expr(r'exists z1.give(z1,x,z)') >>> e1 == e2 True >>> e1 = read_expr(r'exists x.give(x,y,z)').replace(y.variable, x, True) >>> e2 = read_expr(r'exists z1.give(z1,x,z)') >>> e1 == e2 True >>> print(read_expr(r'\x y z.give(x,y,z)').replace(y.variable, a, False)) \x y z.give(x,y,z) >>> print(read_expr(r'\x y z.give(x,y,z)').replace(y.variable, a, True)) \x a z.give(x,a,z) >>> print(read_expr(r'\x.\y.give(x,y,z)').replace(z.variable, a, False)) \x y.give(x,y,a) >>> print(read_expr(r'\x.\y.give(x,y,z)').replace(z.variable, a, True)) \x y.give(x,y,a) >>> e1 = read_expr(r'\x.\y.give(x,y,z)').replace(z.variable, x, False) >>> e2 = read_expr(r'\z1.\y.give(z1,y,x)') >>> e1 == e2 True >>> e1 = read_expr(r'\x.\y.give(x,y,z)').replace(z.variable, x, True) >>> e2 = read_expr(r'\z1.\y.give(z1,y,x)') >>> e1 == e2 True >>> print(read_expr(r'\x.give(x,y,z)').replace(z.variable, y, False)) 
\x.give(x,y,y) >>> print(read_expr(r'\x.give(x,y,z)').replace(z.variable, y, True)) \x.give(x,y,y) >>> from nltk.sem import logic >>> logic._counter._value = 0 >>> e1 = read_expr('e1') >>> e2 = read_expr('e2') >>> print(read_expr('exists e1 e2.(walk(e1) & talk(e2))').replace(e1.variable, e2, True)) exists e2 e01.(walk(e2) & talk(e01)) Variables / Free ================ >>> examples = [r'walk(john)', ... r'walk(x)', ... r'?vp(?np)', ... r'see(john,mary)', ... r'exists x.walk(x)', ... r'\x.see(john,x)', ... r'\x.see(john,x)(mary)', ... r'P(x)', ... r'\P.P(x)', ... r'aa(x,bb(y),cc(z),P(w),u)', ... r'bo(?det(?n),@x)'] >>> examples = [read_expr(e) for e in examples] >>> for e in examples: ... print('%-25s' % e, sorted(e.free())) walk(john) [] walk(x) [Variable('x')] ?vp(?np) [] see(john,mary) [] exists x.walk(x) [] \x.see(john,x) [] (\x.see(john,x))(mary) [] P(x) [Variable('P'), Variable('x')] \P.P(x) [Variable('x')] aa(x,bb(y),cc(z),P(w),u) [Variable('P'), Variable('u'), Variable('w'), Variable('x'), Variable('y'), Variable('z')] bo(?det(?n),@x) [] >>> for e in examples: ... print('%-25s' % e, sorted(e.constants())) walk(john) [Variable('john')] walk(x) [] ?vp(?np) [Variable('?np')] see(john,mary) [Variable('john'), Variable('mary')] exists x.walk(x) [] \x.see(john,x) [Variable('john')] (\x.see(john,x))(mary) [Variable('john'), Variable('mary')] P(x) [] \P.P(x) [] aa(x,bb(y),cc(z),P(w),u) [] bo(?det(?n),@x) [Variable('?n'), Variable('@x')] >>> for e in examples: ... print('%-25s' % e, sorted(e.predicates())) walk(john) [Variable('walk')] walk(x) [Variable('walk')] ?vp(?np) [Variable('?vp')] see(john,mary) [Variable('see')] exists x.walk(x) [Variable('walk')] \x.see(john,x) [Variable('see')] (\x.see(john,x))(mary) [Variable('see')] P(x) [] \P.P(x) [] aa(x,bb(y),cc(z),P(w),u) [Variable('aa'), Variable('bb'), Variable('cc')] bo(?det(?n),@x) [Variable('?det'), Variable('bo')] >>> for e in examples: ... print('%-25s' % e, sorted(e.variables())) walk(john) [] walk(x) [Variable('x')] ?vp(?np) [Variable('?np'), Variable('?vp')] see(john,mary) [] exists x.walk(x) [] \x.see(john,x) [] (\x.see(john,x))(mary) [] P(x) [Variable('P'), Variable('x')] \P.P(x) [Variable('x')] aa(x,bb(y),cc(z),P(w),u) [Variable('P'), Variable('u'), Variable('w'), Variable('x'), Variable('y'), Variable('z')] bo(?det(?n),@x) [Variable('?det'), Variable('?n'), Variable('@x')] `normalize` >>> print(read_expr(r'\e083.(walk(e083, z472) & talk(e092, z938))').normalize()) \e01.(walk(e01,z3) & talk(e02,z4)) Typed Logic +++++++++++ >>> from nltk.sem.logic import LogicParser >>> tlp = LogicParser(True) >>> print(tlp.parse(r'man(x)').type) ? >>> print(tlp.parse(r'walk(angus)').type) ? >>> print(tlp.parse(r'-man(x)').type) t >>> print(tlp.parse(r'(man(x) <-> tall(x))').type) t >>> print(tlp.parse(r'exists x.(man(x) & tall(x))').type) t >>> print(tlp.parse(r'\x.man(x)').type) >>> print(tlp.parse(r'john').type) e >>> print(tlp.parse(r'\x y.sees(x,y)').type) > >>> print(tlp.parse(r'\x.man(x)(john)').type) ? >>> print(tlp.parse(r'\x.\y.sees(x,y)(john)').type) >>> print(tlp.parse(r'\x.\y.sees(x,y)(john)(mary)').type) ? >>> print(tlp.parse(r'\P.\Q.exists x.(P(x) & Q(x))').type) <,<,t>> >>> print(tlp.parse(r'\x.y').type) >>> print(tlp.parse(r'\P.P(x)').type) <,?> >>> parsed = tlp.parse('see(john,mary)') >>> print(parsed.type) ? 
>>> print(parsed.function) see(john) >>> print(parsed.function.type) >>> print(parsed.function.function) see >>> print(parsed.function.function.type) > >>> parsed = tlp.parse('P(x,y)') >>> print(parsed) P(x,y) >>> print(parsed.type) ? >>> print(parsed.function) P(x) >>> print(parsed.function.type) >>> print(parsed.function.function) P >>> print(parsed.function.function.type) > >>> print(tlp.parse(r'P').type) ? >>> print(tlp.parse(r'P', {'P': 't'}).type) t >>> a = tlp.parse(r'P(x)') >>> print(a.type) ? >>> print(a.function.type) >>> print(a.argument.type) e >>> a = tlp.parse(r'-P(x)') >>> print(a.type) t >>> print(a.term.type) t >>> print(a.term.function.type) >>> print(a.term.argument.type) e >>> a = tlp.parse(r'P & Q') >>> print(a.type) t >>> print(a.first.type) t >>> print(a.second.type) t >>> a = tlp.parse(r'(P(x) & Q(x))') >>> print(a.type) t >>> print(a.first.type) t >>> print(a.first.function.type) >>> print(a.first.argument.type) e >>> print(a.second.type) t >>> print(a.second.function.type) >>> print(a.second.argument.type) e >>> a = tlp.parse(r'\x.P(x)') >>> print(a.type) >>> print(a.term.function.type) >>> print(a.term.argument.type) e >>> a = tlp.parse(r'\P.P(x)') >>> print(a.type) <,?> >>> print(a.term.function.type) >>> print(a.term.argument.type) e >>> a = tlp.parse(r'(\x.P(x)(john)) & Q(x)') >>> print(a.type) t >>> print(a.first.type) t >>> print(a.first.function.type) >>> print(a.first.function.term.function.type) >>> print(a.first.function.term.argument.type) e >>> print(a.first.argument.type) e >>> a = tlp.parse(r'\x y.P(x,y)(john)(mary) & Q(x)') >>> print(a.type) t >>> print(a.first.type) t >>> print(a.first.function.type) >>> print(a.first.function.function.type) > >>> a = tlp.parse(r'--P') >>> print(a.type) t >>> print(a.term.type) t >>> print(a.term.term.type) t >>> tlp.parse(r'\x y.P(x,y)').type > >>> tlp.parse(r'\x y.P(x,y)', {'P': '>'}).type > >>> a = tlp.parse(r'\P y.P(john,y)(\x y.see(x,y))') >>> a.type >>> a.function.type <>,> >>> a.function.term.term.function.function.type > >>> a.argument.type > >>> a = tlp.parse(r'exists c f.(father(c) = f)') >>> a.type t >>> a.term.term.type t >>> a.term.term.first.type e >>> a.term.term.first.function.type >>> a.term.term.second.type e typecheck() >>> a = tlp.parse('P(x)') >>> b = tlp.parse('Q(x)') >>> a.type ? >>> c = a & b >>> c.first.type ? >>> c.typecheck() {...} >>> c.first.type t >>> a = tlp.parse('P(x)') >>> b = tlp.parse('P(x) & Q(x)') >>> a.type ? >>> typecheck([a,b]) {...} >>> a.type t >>> e = tlp.parse(r'man(x)') >>> print(dict((k,str(v)) for k,v in e.typecheck().items()) == {'x': 'e', 'man': ''}) True >>> sig = {'man': ''} >>> e = tlp.parse(r'man(x)', sig) >>> print(e.function.type) >>> print(dict((k,str(v)) for k,v in e.typecheck().items()) == {'x': 'e', 'man': ''}) True >>> print(e.function.type) >>> print(dict((k,str(v)) for k,v in e.typecheck(sig).items()) == {'x': 'e', 'man': ''}) True findtype() >>> print(tlp.parse(r'man(x)').findtype(Variable('man'))) >>> print(tlp.parse(r'see(x,y)').findtype(Variable('see'))) > >>> print(tlp.parse(r'P(Q(R(x)))').findtype(Variable('Q'))) ? 
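``findtype()`` reports the type that the surrounding expression imposes on a
particular variable, and the lookup works the same way whether the arguments
are variables or individual constants. A small sketch along the lines of the
examples above (the names are illustrative only):

    >>> sketch = tlp.parse(r'love(josh, sandy)')
    >>> print(sketch.findtype(Variable('love')))
    <e,<e,t>>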
reading types from strings >>> Type.fromstring('e') e >>> Type.fromstring('') >>> Type.fromstring('<,>') <,> >>> Type.fromstring('<,?>') <,?> alternative type format >>> Type.fromstring('e').str() 'IND' >>> Type.fromstring('').str() '(IND -> ANY)' >>> Type.fromstring('<,t>').str() '((IND -> BOOL) -> BOOL)' Type.__eq__() >>> from nltk.sem.logic import * >>> e = ENTITY_TYPE >>> t = TRUTH_TYPE >>> a = ANY_TYPE >>> et = ComplexType(e,t) >>> eet = ComplexType(e,ComplexType(e,t)) >>> at = ComplexType(a,t) >>> ea = ComplexType(e,a) >>> aa = ComplexType(a,a) >>> e == e True >>> t == t True >>> e == t False >>> a == t False >>> t == a False >>> a == a True >>> et == et True >>> a == et False >>> et == a False >>> a == ComplexType(a,aa) True >>> ComplexType(a,aa) == a True matches() >>> e.matches(t) False >>> a.matches(t) True >>> t.matches(a) True >>> a.matches(et) True >>> et.matches(a) True >>> ea.matches(eet) True >>> eet.matches(ea) True >>> aa.matches(et) True >>> aa.matches(t) True Type error during parsing ========================= >>> try: print(tlp.parse(r'exists x y.(P(x) & P(x,y))')) ... except InconsistentTypeHierarchyException as e: print(e) The variable 'P' was found in multiple places with different types. >>> try: tlp.parse(r'\x y.see(x,y)(\x.man(x))') ... except TypeException as e: print(e) The function '\x y.see(x,y)' is of type '>' and cannot be applied to '\x.man(x)' of type ''. Its argument must match type 'e'. >>> try: tlp.parse(r'\P x y.-P(x,y)(\x.-man(x))') ... except TypeException as e: print(e) The function '\P x y.-P(x,y)' is of type '<>,>>' and cannot be applied to '\x.-man(x)' of type ''. Its argument must match type '>'. >>> a = tlp.parse(r'-talk(x)') >>> signature = a.typecheck() >>> try: print(tlp.parse(r'-talk(x,y)', signature)) ... except InconsistentTypeHierarchyException as e: print(e) The variable 'talk' was found in multiple places with different types. >>> a = tlp.parse(r'-P(x)') >>> b = tlp.parse(r'-P(x,y)') >>> a.typecheck() {...} >>> b.typecheck() {...} >>> try: typecheck([a,b]) ... except InconsistentTypeHierarchyException as e: print(e) The variable 'P' was found in multiple places with different types. >>> a = tlp.parse(r'P(x)') >>> b = tlp.parse(r'P(x,y)') >>> signature = {'P': ''} >>> a.typecheck(signature) {...} >>> try: typecheck([a,b], signature) ... except InconsistentTypeHierarchyException as e: print(e) The variable 'P' was found in multiple places with different types. Parse errors ============ >>> try: read_expr(r'') ... except LogicalExpressionException as e: print(e) End of input found. Expression expected. ^ >>> try: read_expr(r'(') ... except LogicalExpressionException as e: print(e) End of input found. Expression expected. ( ^ >>> try: read_expr(r')') ... except LogicalExpressionException as e: print(e) Unexpected token: ')'. Expression expected. ) ^ >>> try: read_expr(r'()') ... except LogicalExpressionException as e: print(e) Unexpected token: ')'. Expression expected. () ^ >>> try: read_expr(r'(P(x) & Q(x)') ... except LogicalExpressionException as e: print(e) End of input found. Expected token ')'. (P(x) & Q(x) ^ >>> try: read_expr(r'(P(x) &') ... except LogicalExpressionException as e: print(e) End of input found. Expression expected. (P(x) & ^ >>> try: read_expr(r'(P(x) | )') ... except LogicalExpressionException as e: print(e) Unexpected token: ')'. Expression expected. (P(x) | ) ^ >>> try: read_expr(r'P(x) ->') ... except LogicalExpressionException as e: print(e) End of input found. Expression expected. 
P(x) -> ^ >>> try: read_expr(r'P(x') ... except LogicalExpressionException as e: print(e) End of input found. Expected token ')'. P(x ^ >>> try: read_expr(r'P(x,') ... except LogicalExpressionException as e: print(e) End of input found. Expression expected. P(x, ^ >>> try: read_expr(r'P(x,)') ... except LogicalExpressionException as e: print(e) Unexpected token: ')'. Expression expected. P(x,) ^ >>> try: read_expr(r'exists') ... except LogicalExpressionException as e: print(e) End of input found. Variable and Expression expected following quantifier 'exists'. exists ^ >>> try: read_expr(r'exists x') ... except LogicalExpressionException as e: print(e) End of input found. Expression expected. exists x ^ >>> try: read_expr(r'exists x.') ... except LogicalExpressionException as e: print(e) End of input found. Expression expected. exists x. ^ >>> try: read_expr(r'\ ') ... except LogicalExpressionException as e: print(e) End of input found. Variable and Expression expected following lambda operator. \ ^ >>> try: read_expr(r'\ x') ... except LogicalExpressionException as e: print(e) End of input found. Expression expected. \ x ^ >>> try: read_expr(r'\ x y') ... except LogicalExpressionException as e: print(e) End of input found. Expression expected. \ x y ^ >>> try: read_expr(r'\ x.') ... except LogicalExpressionException as e: print(e) End of input found. Expression expected. \ x. ^ >>> try: read_expr(r'P(x)Q(x)') ... except LogicalExpressionException as e: print(e) Unexpected token: 'Q'. P(x)Q(x) ^ >>> try: read_expr(r'(P(x)Q(x)') ... except LogicalExpressionException as e: print(e) Unexpected token: 'Q'. Expected token ')'. (P(x)Q(x) ^ >>> try: read_expr(r'exists x y') ... except LogicalExpressionException as e: print(e) End of input found. Expression expected. exists x y ^ >>> try: read_expr(r'exists x y.') ... except LogicalExpressionException as e: print(e) End of input found. Expression expected. exists x y. ^ >>> try: read_expr(r'exists x -> y') ... except LogicalExpressionException as e: print(e) Unexpected token: '->'. Expression expected. exists x -> y ^ >>> try: read_expr(r'A -> ((P(x) & Q(x)) -> Z') ... except LogicalExpressionException as e: print(e) End of input found. Expected token ')'. A -> ((P(x) & Q(x)) -> Z ^ >>> try: read_expr(r'A -> ((P(x) &) -> Z') ... except LogicalExpressionException as e: print(e) Unexpected token: ')'. Expression expected. A -> ((P(x) &) -> Z ^ >>> try: read_expr(r'A -> ((P(x) | )) -> Z') ... except LogicalExpressionException as e: print(e) Unexpected token: ')'. Expression expected. A -> ((P(x) | )) -> Z ^ >>> try: read_expr(r'A -> (P(x) ->) -> Z') ... except LogicalExpressionException as e: print(e) Unexpected token: ')'. Expression expected. A -> (P(x) ->) -> Z ^ >>> try: read_expr(r'A -> (P(x) -> Z') ... except LogicalExpressionException as e: print(e) End of input found. Expected token ')'. A -> (P(x) -> Z ^ >>> try: read_expr(r'A -> (P(x,) -> Z') ... except LogicalExpressionException as e: print(e) Unexpected token: ')'. Expression expected. A -> (P(x,) -> Z ^ >>> try: read_expr(r'A -> (P(x,)) -> Z') ... except LogicalExpressionException as e: print(e) Unexpected token: ')'. Expression expected. A -> (P(x,)) -> Z ^ >>> try: read_expr(r'A -> (exists) -> Z') ... except LogicalExpressionException as e: print(e) ')' is an illegal variable name. Constants may not be quantified. A -> (exists) -> Z ^ >>> try: read_expr(r'A -> (exists x) -> Z') ... except LogicalExpressionException as e: print(e) Unexpected token: ')'. Expression expected. 
A -> (exists x) -> Z ^ >>> try: read_expr(r'A -> (exists x.) -> Z') ... except LogicalExpressionException as e: print(e) Unexpected token: ')'. Expression expected. A -> (exists x.) -> Z ^ >>> try: read_expr(r'A -> (\ ) -> Z') ... except LogicalExpressionException as e: print(e) ')' is an illegal variable name. Constants may not be abstracted. A -> (\ ) -> Z ^ >>> try: read_expr(r'A -> (\ x) -> Z') ... except LogicalExpressionException as e: print(e) Unexpected token: ')'. Expression expected. A -> (\ x) -> Z ^ >>> try: read_expr(r'A -> (\ x y) -> Z') ... except LogicalExpressionException as e: print(e) Unexpected token: ')'. Expression expected. A -> (\ x y) -> Z ^ >>> try: read_expr(r'A -> (\ x.) -> Z') ... except LogicalExpressionException as e: print(e) Unexpected token: ')'. Expression expected. A -> (\ x.) -> Z ^ >>> try: read_expr(r'A -> (P(x)Q(x)) -> Z') ... except LogicalExpressionException as e: print(e) Unexpected token: 'Q'. Expected token ')'. A -> (P(x)Q(x)) -> Z ^ >>> try: read_expr(r'A -> ((P(x)Q(x)) -> Z') ... except LogicalExpressionException as e: print(e) Unexpected token: 'Q'. Expected token ')'. A -> ((P(x)Q(x)) -> Z ^ >>> try: read_expr(r'A -> (all x y) -> Z') ... except LogicalExpressionException as e: print(e) Unexpected token: ')'. Expression expected. A -> (all x y) -> Z ^ >>> try: read_expr(r'A -> (exists x y.) -> Z') ... except LogicalExpressionException as e: print(e) Unexpected token: ')'. Expression expected. A -> (exists x y.) -> Z ^ >>> try: read_expr(r'A -> (exists x -> y) -> Z') ... except LogicalExpressionException as e: print(e) Unexpected token: '->'. Expression expected. A -> (exists x -> y) -> Z ^ nltk-3.7/nltk/test/meteor.doctest000066400000000000000000000026751420073152400171540ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. For license information, see LICENSE.TXT .. -*- coding: utf-8 -*- ============= METEOR tests ============= No Alignment test ------------------ >>> from nltk.translate import meteor >>> from nltk import word_tokenize If the candidate has no alignment to any of the references, the METEOR score is 0. >>> round(meteor( ... [word_tokenize('The candidate has no alignment to any of the references')], ... word_tokenize('John loves Mary') ... ), 4) 0.0 Tests based on wikipedia examples --------------------------------- Testing on `wikipedia examples `_ >>> same_res = round(meteor( ... [word_tokenize('The cat sat on the mat')], ... word_tokenize('The cat sat on the mat') ... ), 4) >>> abs(same_res - 0.9977) < 1e-2 True >>> meteor( ... [word_tokenize('The cat sat on the mat')], ... word_tokenize('on the mat sat the cat') ... ) 0.5 >>> round(meteor( ... [word_tokenize('The cat sat on the mat')], ... word_tokenize('The cat was sat on the mat') ... ), 4) 0.9654 Test corresponding to issue #2751, where METEOR score > 1 >>> round(meteor( ... [word_tokenize('create or update a vm set')], ... word_tokenize('creates or updates a virtual machine scale set') ... ), 4) 0.7806 nltk-3.7/nltk/test/metrics.doctest000066400000000000000000000253221420073152400173210ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. For license information, see LICENSE.TXT ======= Metrics ======= ----- Setup ----- >>> import pytest >>> _ = pytest.importorskip("numpy") The `nltk.metrics` package provides a variety of *evaluation measures* which can be used for a wide variety of NLP tasks. 
>>> from nltk.metrics import * ------------------ Standard IR Scores ------------------ We can use standard scores from information retrieval to test the performance of taggers, chunkers, etc. >>> reference = 'DET NN VB DET JJ NN NN IN DET NN'.split() >>> test = 'DET VB VB DET NN NN NN IN DET NN'.split() >>> print(accuracy(reference, test)) 0.8 The following measures apply to sets: >>> reference_set = set(reference) >>> test_set = set(test) >>> precision(reference_set, test_set) 1.0 >>> print(recall(reference_set, test_set)) 0.8 >>> print(f_measure(reference_set, test_set)) 0.88888888888... Measuring the likelihood of the data, given probability distributions: >>> from nltk import FreqDist, MLEProbDist >>> pdist1 = MLEProbDist(FreqDist("aldjfalskfjaldsf")) >>> pdist2 = MLEProbDist(FreqDist("aldjfalssjjlldss")) >>> print(log_likelihood(['a', 'd'], [pdist1, pdist2])) -2.7075187496... ---------------- Distance Metrics ---------------- String edit distance (Levenshtein): >>> edit_distance("rain", "shine") 3 >>> edit_distance_align("shine", "shine") [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)] >>> edit_distance_align("rain", "brainy") [(0, 0), (1, 1), (1, 2), (2, 3), (3, 4), (4, 5), (4, 6)] >>> edit_distance_align("", "brainy") [(0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6)] >>> edit_distance_align("", "") [(0, 0)] Other distance measures: >>> s1 = set([1,2,3,4]) >>> s2 = set([3,4,5]) >>> binary_distance(s1, s2) 1.0 >>> print(jaccard_distance(s1, s2)) 0.6 >>> print(masi_distance(s1, s2)) 0.868 ---------------------- Miscellaneous Measures ---------------------- Rank Correlation works with two dictionaries mapping keys to ranks. The dictionaries should have the same set of keys. >>> spearman_correlation({'e':1, 't':2, 'a':3}, {'e':1, 'a':2, 't':3}) 0.5 Windowdiff uses a sliding window in comparing two segmentations of the same input (e.g. tokenizations, chunkings). Segmentations are represented using strings of zeros and ones. >>> s1 = "000100000010" >>> s2 = "000010000100" >>> s3 = "100000010000" >>> s4 = "000000000000" >>> s5 = "111111111111" >>> windowdiff(s1, s1, 3) 0.0 >>> abs(windowdiff(s1, s2, 3) - 0.3) < 1e-6 # windowdiff(s1, s2, 3) == 0.3 True >>> abs(windowdiff(s2, s3, 3) - 0.8) < 1e-6 # windowdiff(s2, s3, 3) == 0.8 True >>> windowdiff(s1, s4, 3) 0.5 >>> windowdiff(s1, s5, 3) 1.0 ---------------- Confusion Matrix ---------------- >>> reference = 'This is the reference data. Testing 123. aoaeoeoe' >>> test = 'Thos iz_the rifirenci data. Testeng 123. aoaeoeoe' >>> print(ConfusionMatrix(reference, test)) | . 1 2 3 T _ a c d e f g h i n o r s t z | --+-------------------------------------------+ |<8>. . . . . 1 . . . . . . . . . . . . . . | . | .<2>. . . . . . . . . . . . . . . . . . . | 1 | . .<1>. . . . . . . . . . . . . . . . . . | 2 | . . .<1>. . . . . . . . . . . . . . . . . | 3 | . . . .<1>. . . . . . . . . . . . . . . . | T | . . . . .<2>. . . . . . . . . . . . . . . | _ | . . . . . .<.>. . . . . . . . . . . . . . | a | . . . . . . .<4>. . . . . . . . . . . . . | c | . . . . . . . .<1>. . . . . . . . . . . . | d | . . . . . . . . .<1>. . . . . . . . . . . | e | . . . . . . . . . .<6>. . . 3 . . . . . . | f | . . . . . . . . . . .<1>. . . . . . . . . | g | . . . . . . . . . . . .<1>. . . . . . . . | h | . . . . . . . . . . . . .<2>. . . . . . . | i | . . . . . . . . . . 1 . . .<1>. 1 . . . . | n | . . . . . . . . . . . . . . .<2>. . . . . | o | . . . . . . . . . . . . . . . .<3>. . . . | r | . . . . . . . . . . . . . . . . .<2>. . . | s | . . . . . . . . . . . . . . . . 
. .<2>. 1 | t | . . . . . . . . . . . . . . . . . . .<3>. | z | . . . . . . . . . . . . . . . . . . . .<.>| --+-------------------------------------------+ (row = reference; col = test) >>> cm = ConfusionMatrix(reference, test) >>> print(cm.pretty_format(sort_by_count=True)) | e a i o s t . T h n r 1 2 3 c d f g _ z | --+-------------------------------------------+ |<8>. . . . . . . . . . . . . . . . . . 1 . | e | .<6>. 3 . . . . . . . . . . . . . . . . . | a | . .<4>. . . . . . . . . . . . . . . . . . | i | . 1 .<1>1 . . . . . . . . . . . . . . . . | o | . . . .<3>. . . . . . . . . . . . . . . . | s | . . . . .<2>. . . . . . . . . . . . . . 1 | t | . . . . . .<3>. . . . . . . . . . . . . . | . | . . . . . . .<2>. . . . . . . . . . . . . | T | . . . . . . . .<2>. . . . . . . . . . . . | h | . . . . . . . . .<2>. . . . . . . . . . . | n | . . . . . . . . . .<2>. . . . . . . . . . | r | . . . . . . . . . . .<2>. . . . . . . . . | 1 | . . . . . . . . . . . .<1>. . . . . . . . | 2 | . . . . . . . . . . . . .<1>. . . . . . . | 3 | . . . . . . . . . . . . . .<1>. . . . . . | c | . . . . . . . . . . . . . . .<1>. . . . . | d | . . . . . . . . . . . . . . . .<1>. . . . | f | . . . . . . . . . . . . . . . . .<1>. . . | g | . . . . . . . . . . . . . . . . . .<1>. . | _ | . . . . . . . . . . . . . . . . . . .<.>. | z | . . . . . . . . . . . . . . . . . . . .<.>| --+-------------------------------------------+ (row = reference; col = test) >>> print(cm.pretty_format(sort_by_count=True, truncate=10)) | e a i o s t . T h | --+---------------------+ |<8>. . . . . . . . . | e | .<6>. 3 . . . . . . | a | . .<4>. . . . . . . | i | . 1 .<1>1 . . . . . | o | . . . .<3>. . . . . | s | . . . . .<2>. . . . | t | . . . . . .<3>. . . | . | . . . . . . .<2>. . | T | . . . . . . . .<2>. | h | . . . . . . . . .<2>| --+---------------------+ (row = reference; col = test) >>> print(cm.pretty_format(sort_by_count=True, truncate=10, values_in_chart=False)) | 1 | | 1 2 3 4 5 6 7 8 9 0 | ---+---------------------+ 1 |<8>. . . . . . . . . | 2 | .<6>. 3 . . . . . . | 3 | . .<4>. . . . . . . | 4 | . 1 .<1>1 . . . . . | 5 | . . . .<3>. . . . . | 6 | . . . . .<2>. . . . | 7 | . . . . . .<3>. . . | 8 | . . . . . . .<2>. . | 9 | . . . . . . . .<2>. | 10 | . . . . . . . . .<2>| ---+---------------------+ (row = reference; col = test) Value key: 1: 2: e 3: a 4: i 5: o 6: s 7: t 8: . 9: T 10: h For "e", the number of true positives should be 6, while the number of false negatives is 3. So, the recall ought to be 6 / (6 + 3): >>> cm.recall("e") # doctest: +ELLIPSIS 0.666666... For "e", the false positive is just 1, so the precision should be 6 / (6 + 1): >>> cm.precision("e") # doctest: +ELLIPSIS 0.857142... The f-measure with default value of ``alpha = 0.5`` should then be: * *1/(alpha/p + (1-alpha)/r) =* * *1/(0.5/p + 0.5/r) =* * *2pr / (p + r) =* * *2 * 0.857142... * 0.666666... / (0.857142... + 0.666666...) =* * *0.749999...* >>> cm.f_measure("e") # doctest: +ELLIPSIS 0.749999... -------------------- Association measures -------------------- These measures are useful to determine whether the coocurrence of two random events is meaningful. They are used, for instance, to distinguish collocations from other pairs of adjacent words. We bring some examples of bigram association calculations from Manning and Schutze's SNLP, 2nd Ed. chapter 5. >>> n_new_companies, n_new, n_companies, N = 8, 15828, 4675, 14307668 >>> bam = BigramAssocMeasures >>> bam.raw_freq(20, (42, 20), N) == 20. 
/ N True >>> bam.student_t(n_new_companies, (n_new, n_companies), N) 0.999... >>> bam.chi_sq(n_new_companies, (n_new, n_companies), N) 1.54... >>> bam.likelihood_ratio(150, (12593, 932), N) 1291... For other associations, we ensure the ordering of the measures: >>> bam.mi_like(20, (42, 20), N) > bam.mi_like(20, (41, 27), N) True >>> bam.pmi(20, (42, 20), N) > bam.pmi(20, (41, 27), N) True >>> bam.phi_sq(20, (42, 20), N) > bam.phi_sq(20, (41, 27), N) True >>> bam.poisson_stirling(20, (42, 20), N) > bam.poisson_stirling(20, (41, 27), N) True >>> bam.jaccard(20, (42, 20), N) > bam.jaccard(20, (41, 27), N) True >>> bam.dice(20, (42, 20), N) > bam.dice(20, (41, 27), N) True >>> bam.fisher(20, (42, 20), N) > bam.fisher(20, (41, 27), N) # doctest: +SKIP False For trigrams, we have to provide more count information: >>> n_w1_w2_w3 = 20 >>> n_w1_w2, n_w1_w3, n_w2_w3 = 35, 60, 40 >>> pair_counts = (n_w1_w2, n_w1_w3, n_w2_w3) >>> n_w1, n_w2, n_w3 = 100, 200, 300 >>> uni_counts = (n_w1, n_w2, n_w3) >>> N = 14307668 >>> tam = TrigramAssocMeasures >>> tam.raw_freq(n_w1_w2_w3, pair_counts, uni_counts, N) == 1. * n_w1_w2_w3 / N True >>> uni_counts2 = (n_w1, n_w2, 100) >>> tam.student_t(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.student_t(n_w1_w2_w3, pair_counts, uni_counts, N) True >>> tam.chi_sq(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.chi_sq(n_w1_w2_w3, pair_counts, uni_counts, N) True >>> tam.mi_like(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.mi_like(n_w1_w2_w3, pair_counts, uni_counts, N) True >>> tam.pmi(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.pmi(n_w1_w2_w3, pair_counts, uni_counts, N) True >>> tam.likelihood_ratio(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.likelihood_ratio(n_w1_w2_w3, pair_counts, uni_counts, N) True >>> tam.poisson_stirling(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.poisson_stirling(n_w1_w2_w3, pair_counts, uni_counts, N) True >>> tam.jaccard(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.jaccard(n_w1_w2_w3, pair_counts, uni_counts, N) True For fourgrams, we have to provide more count information: >>> n_w1_w2_w3_w4 = 5 >>> n_w1_w2, n_w1_w3, n_w2_w3 = 35, 60, 40 >>> n_w1_w2_w3, n_w2_w3_w4 = 20, 10 >>> pair_counts = (n_w1_w2, n_w1_w3, n_w2_w3) >>> triplet_counts = (n_w1_w2_w3, n_w2_w3_w4) >>> n_w1, n_w2, n_w3, n_w4 = 100, 200, 300, 400 >>> uni_counts = (n_w1, n_w2, n_w3, n_w4) >>> N = 14307668 >>> qam = QuadgramAssocMeasures >>> qam.raw_freq(n_w1_w2_w3_w4, pair_counts, triplet_counts, uni_counts, N) == 1. * n_w1_w2_w3_w4 / N True nltk-3.7/nltk/test/misc.doctest000066400000000000000000000064221420073152400166060ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. For license information, see LICENSE.TXT -------------------------------------------------------------------------------- Unit tests for the miscellaneous sort functions. -------------------------------------------------------------------------------- >>> from copy import deepcopy >>> from nltk.misc.sort import * A (very) small list of unsorted integers. >>> test_data = [12, 67, 7, 28, 92, 56, 53, 720, 91, 57, 20, 20] Test each sorting method - each method returns the number of operations required to sort the data, and sorts in-place (desctructively - hence the need for multiple copies). 
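A minimal sketch of that in-place behaviour, showing why each run starts from
a fresh copy:

    >>> from nltk.misc.sort import quick
    >>> scratch = [3, 1, 2]
    >>> ops = quick(scratch)   # the return value is the operation count, not the list
    >>> scratch                # the list itself has been reordered in place
    [1, 2, 3]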
>>> sorted_data = deepcopy(test_data) >>> selection(sorted_data) 66 >>> sorted_data [7, 12, 20, 20, 28, 53, 56, 57, 67, 91, 92, 720] >>> sorted_data = deepcopy(test_data) >>> bubble(sorted_data) 30 >>> sorted_data [7, 12, 20, 20, 28, 53, 56, 57, 67, 91, 92, 720] >>> sorted_data = deepcopy(test_data) >>> merge(sorted_data) 30 >>> sorted_data [7, 12, 20, 20, 28, 53, 56, 57, 67, 91, 92, 720] >>> sorted_data = deepcopy(test_data) >>> quick(sorted_data) 13 >>> sorted_data [7, 12, 20, 20, 28, 53, 56, 57, 67, 91, 92, 720] -------------------------------------------------------------------------------- Unit tests for Wordfinder class -------------------------------------------------------------------------------- >>> import random >>> # The following is not enough for reproducibility under Python 2/3 >>> # (see https://bugs.python.org/issue9025) so this test is skipped. >>> random.seed(12345) >>> from nltk.misc import wordfinder >>> wordfinder.word_finder() # doctest: +SKIP Word Finder J V L A I R O T A T I S I V O D E R E T H U U B E A R O E P O C S O R E T N E P A D A U Z E E S R A P P A L L M E N T R C X A D Q S Z T P E O R S N G P J A D E I G Y K K T I A A R G F I D T E L C N S R E C N B H T R L T N N B W N T A O A I A Y I L O E I A M E I A A Y U R P L L D G L T V S T S F E A D I P H D O O H N I R L S E C I N I L R N N M E C G R U E A A A Y G I C E N L L E O I G Q R T A E L M R C E T I S T A E T L L E U A E N R L O U O T A S E E C S O O N H Y P A T G Y E M H O M M D R E S F P U L T H C F N V L A C A I M A M A N L B R U T E D O M I O R I L N E E E E E U A R S C R Y L I P H T R K E S N N M S I L A S R E V I N U T X T A A O U T K S E T A R R E S I B J A E D L E L J I F O O R P E L K N I R W K H A I D E Q O P R I C K T I M B E R P Z K D O O H G N I H T U R V E Y D R O P 1: INTERCHANGER 2: TEARLESSNESS 3: UNIVERSALISM 4: DESENSITIZER 5: INTERMENTION 6: TRICHOCYSTIC 7: EXTRAMURALLY 8: VEGETOALKALI 9: PALMELLACEAE 10: AESTHETICISM 11: PETROGRAPHER 12: VISITATORIAL 13: OLEOMARGARIC 14: WRINKLEPROOF 15: PRICKTIMBER 16: PRESIDIALLY 17: SCITAMINEAE 18: ENTEROSCOPE 19: APPALLMENT 20: TURVEYDROP 21: THINGHOOD 22: BISERRATE 23: GREENLAND 24: BRUTEDOM 25: POLONIAN 26: ACOLHUAN 27: LAPORTEA 28: TENDING 29: TEREDO 30: MESOLE 31: UNLIMP 32: OSTARA 33: PILY 34: DUNT 35: ONYX 36: KATH 37: JUNE nltk-3.7/nltk/test/nonmonotonic.doctest000066400000000000000000000235351420073152400203770ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. For license information, see LICENSE.TXT ====================== Nonmonotonic Reasoning ====================== >>> from nltk.test.nonmonotonic_fixt import setup_module >>> setup_module() >>> from nltk import * >>> from nltk.inference.nonmonotonic import * >>> from nltk.sem import logic >>> logic._counter._value = 0 >>> read_expr = logic.Expression.fromstring ------------------------ Closed Domain Assumption ------------------------ The only entities in the domain are those found in the assumptions or goal. If the domain only contains "A" and "B", then the expression "exists x.P(x)" can be replaced with "P(A) | P(B)" and an expression "all x.P(x)" can be replaced with "P(A) & P(B)". 
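In practice the assumption is applied by wrapping an ordinary prover command
in a ``ClosedDomainProver``, which rewrites the quantified assumptions and the
goal over the closed domain before proving. A compact sketch of that wrapping
pattern, condensed from the cases that follow:

    >>> from nltk.inference import Prover9Command
    >>> from nltk.inference.nonmonotonic import ClosedDomainProver
    >>> sketch_premises = [read_expr(r'exists x.walk(x)'), read_expr(r'man(Socrates)')]
    >>> sketch_goal = read_expr(r'walk(Socrates)')
    >>> Prover9Command(sketch_goal, sketch_premises).prove()
    False
    >>> ClosedDomainProver(Prover9Command(sketch_goal, sketch_premises)).prove()
    True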
>>> p1 = read_expr(r'all x.(man(x) -> mortal(x))') >>> p2 = read_expr(r'man(Socrates)') >>> c = read_expr(r'mortal(Socrates)') >>> prover = Prover9Command(c, [p1,p2]) >>> prover.prove() True >>> cdp = ClosedDomainProver(prover) >>> for a in cdp.assumptions(): print(a) # doctest: +SKIP (man(Socrates) -> mortal(Socrates)) man(Socrates) >>> cdp.prove() True >>> p1 = read_expr(r'exists x.walk(x)') >>> p2 = read_expr(r'man(Socrates)') >>> c = read_expr(r'walk(Socrates)') >>> prover = Prover9Command(c, [p1,p2]) >>> prover.prove() False >>> cdp = ClosedDomainProver(prover) >>> for a in cdp.assumptions(): print(a) # doctest: +SKIP walk(Socrates) man(Socrates) >>> cdp.prove() True >>> p1 = read_expr(r'exists x.walk(x)') >>> p2 = read_expr(r'man(Socrates)') >>> p3 = read_expr(r'-walk(Bill)') >>> c = read_expr(r'walk(Socrates)') >>> prover = Prover9Command(c, [p1,p2,p3]) >>> prover.prove() False >>> cdp = ClosedDomainProver(prover) >>> for a in cdp.assumptions(): print(a) # doctest: +SKIP (walk(Socrates) | walk(Bill)) man(Socrates) -walk(Bill) >>> cdp.prove() True >>> p1 = read_expr(r'walk(Socrates)') >>> p2 = read_expr(r'walk(Bill)') >>> c = read_expr(r'all x.walk(x)') >>> prover = Prover9Command(c, [p1,p2]) >>> prover.prove() False >>> cdp = ClosedDomainProver(prover) >>> for a in cdp.assumptions(): print(a) # doctest: +SKIP walk(Socrates) walk(Bill) >>> print(cdp.goal()) # doctest: +SKIP (walk(Socrates) & walk(Bill)) >>> cdp.prove() True >>> p1 = read_expr(r'girl(mary)') >>> p2 = read_expr(r'dog(rover)') >>> p3 = read_expr(r'all x.(girl(x) -> -dog(x))') >>> p4 = read_expr(r'all x.(dog(x) -> -girl(x))') >>> p5 = read_expr(r'chase(mary, rover)') >>> c = read_expr(r'exists y.(dog(y) & all x.(girl(x) -> chase(x,y)))') >>> prover = Prover9Command(c, [p1,p2,p3,p4,p5]) >>> print(prover.prove()) False >>> cdp = ClosedDomainProver(prover) >>> for a in cdp.assumptions(): print(a) # doctest: +SKIP girl(mary) dog(rover) ((girl(rover) -> -dog(rover)) & (girl(mary) -> -dog(mary))) ((dog(rover) -> -girl(rover)) & (dog(mary) -> -girl(mary))) chase(mary,rover) >>> print(cdp.goal()) # doctest: +SKIP ((dog(rover) & (girl(rover) -> chase(rover,rover)) & (girl(mary) -> chase(mary,rover))) | (dog(mary) & (girl(rover) -> chase(rover,mary)) & (girl(mary) -> chase(mary,mary)))) >>> print(cdp.prove()) True ----------------------- Unique Names Assumption ----------------------- No two entities in the domain represent the same entity unless it can be explicitly proven that they do. Therefore, if the domain contains "A" and "B", then add the assumption "-(A = B)" if it is not the case that " \|- (A = B)". 
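The ``UniqueNamesProver`` wrapper adds exactly those inequalities to the
assumption list, which can be inspected through ``assumptions()``. A compact
sketch, condensed from the first case below:

    >>> from nltk.inference import Prover9Command
    >>> from nltk.inference.nonmonotonic import UniqueNamesProver
    >>> sketch_cmd = Prover9Command(read_expr(r'exists x.exists y.-(x = y)'),
    ...     [read_expr(r'man(Socrates)'), read_expr(r'man(Bill)')])
    >>> sketch_unp = UniqueNamesProver(sketch_cmd)
    >>> for a in sketch_unp.assumptions(): print(a)    # doctest: +SKIP
    man(Socrates)
    man(Bill)
    -(Socrates = Bill)
    >>> sketch_unp.prove()
    True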
>>> p1 = read_expr(r'man(Socrates)') >>> p2 = read_expr(r'man(Bill)') >>> c = read_expr(r'exists x.exists y.-(x = y)') >>> prover = Prover9Command(c, [p1,p2]) >>> prover.prove() False >>> unp = UniqueNamesProver(prover) >>> for a in unp.assumptions(): print(a) # doctest: +SKIP man(Socrates) man(Bill) -(Socrates = Bill) >>> unp.prove() True >>> p1 = read_expr(r'all x.(walk(x) -> (x = Socrates))') >>> p2 = read_expr(r'Bill = William') >>> p3 = read_expr(r'Bill = Billy') >>> c = read_expr(r'-walk(William)') >>> prover = Prover9Command(c, [p1,p2,p3]) >>> prover.prove() False >>> unp = UniqueNamesProver(prover) >>> for a in unp.assumptions(): print(a) # doctest: +SKIP all x.(walk(x) -> (x = Socrates)) (Bill = William) (Bill = Billy) -(William = Socrates) -(Billy = Socrates) -(Socrates = Bill) >>> unp.prove() True ----------------------- Closed World Assumption ----------------------- The only entities that have certain properties are those that is it stated have the properties. We accomplish this assumption by "completing" predicates. If the assumptions contain "P(A)", then "all x.(P(x) -> (x=A))" is the completion of "P". If the assumptions contain "all x.(ostrich(x) -> bird(x))", then "all x.(bird(x) -> ostrich(x))" is the completion of "bird". If the assumptions don't contain anything that are "P", then "all x.-P(x)" is the completion of "P". >>> p1 = read_expr(r'walk(Socrates)') >>> p2 = read_expr(r'-(Socrates = Bill)') >>> c = read_expr(r'-walk(Bill)') >>> prover = Prover9Command(c, [p1,p2]) >>> prover.prove() False >>> cwp = ClosedWorldProver(prover) >>> for a in cwp.assumptions(): print(a) # doctest: +SKIP walk(Socrates) -(Socrates = Bill) all z1.(walk(z1) -> (z1 = Socrates)) >>> cwp.prove() True >>> p1 = read_expr(r'see(Socrates, John)') >>> p2 = read_expr(r'see(John, Mary)') >>> p3 = read_expr(r'-(Socrates = John)') >>> p4 = read_expr(r'-(John = Mary)') >>> c = read_expr(r'-see(Socrates, Mary)') >>> prover = Prover9Command(c, [p1,p2,p3,p4]) >>> prover.prove() False >>> cwp = ClosedWorldProver(prover) >>> for a in cwp.assumptions(): print(a) # doctest: +SKIP see(Socrates,John) see(John,Mary) -(Socrates = John) -(John = Mary) all z3 z4.(see(z3,z4) -> (((z3 = Socrates) & (z4 = John)) | ((z3 = John) & (z4 = Mary)))) >>> cwp.prove() True >>> p1 = read_expr(r'all x.(ostrich(x) -> bird(x))') >>> p2 = read_expr(r'bird(Tweety)') >>> p3 = read_expr(r'-ostrich(Sam)') >>> p4 = read_expr(r'Sam != Tweety') >>> c = read_expr(r'-bird(Sam)') >>> prover = Prover9Command(c, [p1,p2,p3,p4]) >>> prover.prove() False >>> cwp = ClosedWorldProver(prover) >>> for a in cwp.assumptions(): print(a) # doctest: +SKIP all x.(ostrich(x) -> bird(x)) bird(Tweety) -ostrich(Sam) -(Sam = Tweety) all z7.-ostrich(z7) all z8.(bird(z8) -> ((z8 = Tweety) | ostrich(z8))) >>> print(cwp.prove()) True ----------------------- Multi-Decorator Example ----------------------- Decorators can be nested to utilize multiple assumptions. 
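This works because each of the provers above is a ``ProverCommandDecorator``:
it wraps a prover command and is itself a prover command, so the output of one
can serve as the input to another. A quick sketch of that relationship (the
three decorators are already in scope from the import at the top of this file):

    >>> from nltk.inference.api import ProverCommandDecorator
    >>> all(issubclass(cls, ProverCommandDecorator)
    ...     for cls in (ClosedDomainProver, UniqueNamesProver, ClosedWorldProver))
    True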
>>> p1 = read_expr(r'see(Socrates, John)') >>> p2 = read_expr(r'see(John, Mary)') >>> c = read_expr(r'-see(Socrates, Mary)') >>> prover = Prover9Command(c, [p1,p2]) >>> print(prover.prove()) False >>> cmd = ClosedDomainProver(UniqueNamesProver(ClosedWorldProver(prover))) >>> print(cmd.prove()) True ----------------- Default Reasoning ----------------- >>> logic._counter._value = 0 >>> premises = [] define the taxonomy >>> premises.append(read_expr(r'all x.(elephant(x) -> animal(x))')) >>> premises.append(read_expr(r'all x.(bird(x) -> animal(x))')) >>> premises.append(read_expr(r'all x.(dove(x) -> bird(x))')) >>> premises.append(read_expr(r'all x.(ostrich(x) -> bird(x))')) >>> premises.append(read_expr(r'all x.(flying_ostrich(x) -> ostrich(x))')) default the properties using abnormalities >>> premises.append(read_expr(r'all x.((animal(x) & -Ab1(x)) -> -fly(x))')) #normal animals don't fly >>> premises.append(read_expr(r'all x.((bird(x) & -Ab2(x)) -> fly(x))')) #normal birds fly >>> premises.append(read_expr(r'all x.((ostrich(x) & -Ab3(x)) -> -fly(x))')) #normal ostriches don't fly specify abnormal entities >>> premises.append(read_expr(r'all x.(bird(x) -> Ab1(x))')) #flight >>> premises.append(read_expr(r'all x.(ostrich(x) -> Ab2(x))')) #non-flying bird >>> premises.append(read_expr(r'all x.(flying_ostrich(x) -> Ab3(x))')) #flying ostrich define entities >>> premises.append(read_expr(r'elephant(el)')) >>> premises.append(read_expr(r'dove(do)')) >>> premises.append(read_expr(r'ostrich(os)')) print the augmented assumptions list >>> prover = Prover9Command(None, premises) >>> command = UniqueNamesProver(ClosedWorldProver(prover)) >>> for a in command.assumptions(): print(a) # doctest: +SKIP all x.(elephant(x) -> animal(x)) all x.(bird(x) -> animal(x)) all x.(dove(x) -> bird(x)) all x.(ostrich(x) -> bird(x)) all x.(flying_ostrich(x) -> ostrich(x)) all x.((animal(x) & -Ab1(x)) -> -fly(x)) all x.((bird(x) & -Ab2(x)) -> fly(x)) all x.((ostrich(x) & -Ab3(x)) -> -fly(x)) all x.(bird(x) -> Ab1(x)) all x.(ostrich(x) -> Ab2(x)) all x.(flying_ostrich(x) -> Ab3(x)) elephant(el) dove(do) ostrich(os) all z1.(animal(z1) -> (elephant(z1) | bird(z1))) all z2.(Ab1(z2) -> bird(z2)) all z3.(bird(z3) -> (dove(z3) | ostrich(z3))) all z4.(dove(z4) -> (z4 = do)) all z5.(Ab2(z5) -> ostrich(z5)) all z6.(Ab3(z6) -> flying_ostrich(z6)) all z7.(ostrich(z7) -> ((z7 = os) | flying_ostrich(z7))) all z8.-flying_ostrich(z8) all z9.(elephant(z9) -> (z9 = el)) -(el = os) -(el = do) -(os = do) >>> UniqueNamesProver(ClosedWorldProver(Prover9Command(read_expr('-fly(el)'), premises))).prove() True >>> UniqueNamesProver(ClosedWorldProver(Prover9Command(read_expr('fly(do)'), premises))).prove() True >>> UniqueNamesProver(ClosedWorldProver(Prover9Command(read_expr('-fly(os)'), premises))).prove() True nltk-3.7/nltk/test/nonmonotonic_fixt.py000066400000000000000000000004321420073152400204030ustar00rootroot00000000000000def setup_module(): import pytest from nltk.inference.mace import Mace try: m = Mace() m._find_binary("mace4") except LookupError as e: pytest.skip( "Mace4/Prover9 is not available so nonmonotonic.doctest was skipped" ) nltk-3.7/nltk/test/onto1.fol000066400000000000000000000003561420073152400160260ustar00rootroot00000000000000all x. ((boxer2 x) implies (dog x)) all x. ((boxer1 x) implies (person x)) all x. (not ((dog x) and (person x))) all x. (not ((kitchen x) and (garden x))) all x. ((kitchen x) implies (location x)) all x. 
((garden x) implies (location x)) nltk-3.7/nltk/test/paice.doctest000066400000000000000000000023261420073152400167330ustar00rootroot00000000000000 ===================================================== PAICE's evaluation statistics for stemming algorithms ===================================================== Given a list of words with their real lemmas and stems according to stemming algorithm under evaluation, counts Understemming Index (UI), Overstemming Index (OI), Stemming Weight (SW) and Error-rate relative to truncation (ERRT). >>> from nltk.metrics import Paice ------------------------------------- Understemming and Overstemming values ------------------------------------- >>> lemmas = {'kneel': ['kneel', 'knelt'], ... 'range': ['range', 'ranged'], ... 'ring': ['ring', 'rang', 'rung']} >>> stems = {'kneel': ['kneel'], ... 'knelt': ['knelt'], ... 'rang': ['rang', 'range', 'ranged'], ... 'ring': ['ring'], ... 'rung': ['rung']} >>> p = Paice(lemmas, stems) >>> p.gumt, p.gdmt, p.gwmt, p.gdnt (4.0, 5.0, 2.0, 16.0) >>> p.ui, p.oi, p.sw (0.8..., 0.125..., 0.15625...) >>> p.errt 1.0 >>> [('{0:.3f}'.format(a), '{0:.3f}'.format(b)) for a, b in p.coords] [('0.000', '1.000'), ('0.000', '0.375'), ('0.600', '0.125'), ('0.800', '0.125')] nltk-3.7/nltk/test/parse.doctest000066400000000000000000001023231420073152400167620ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. For license information, see LICENSE.TXT ========= Parsing ========= Unit tests for the Context Free Grammar class --------------------------------------------- >>> import pickle >>> import subprocess >>> import sys >>> from nltk import Nonterminal, nonterminals, Production, CFG >>> nt1 = Nonterminal('NP') >>> nt2 = Nonterminal('VP') >>> nt1.symbol() 'NP' >>> nt1 == Nonterminal('NP') True >>> nt1 == nt2 False >>> S, NP, VP, PP = nonterminals('S, NP, VP, PP') >>> N, V, P, DT = nonterminals('N, V, P, DT') >>> prod1 = Production(S, [NP, VP]) >>> prod2 = Production(NP, [DT, NP]) >>> prod1.lhs() S >>> prod1.rhs() (NP, VP) >>> prod1 == Production(S, [NP, VP]) True >>> prod1 == prod2 False >>> grammar = CFG.fromstring(""" ... S -> NP VP ... PP -> P NP ... NP -> 'the' N | N PP | 'the' N PP ... VP -> V NP | V PP | V NP PP ... N -> 'cat' ... N -> 'dog' ... N -> 'rug' ... V -> 'chased' ... V -> 'sat' ... P -> 'in' ... P -> 'on' ... """) >>> cmd = """import pickle ... from nltk import Production ... p = Production('S', ['NP', 'VP']) ... print(pickle.dumps(p)) ... """ >>> # Start a subprocess to simulate pickling in another process >>> proc = subprocess.run([sys.executable, '-c', cmd], stdout=subprocess.PIPE) >>> p1 = pickle.loads(eval(proc.stdout)) >>> p2 = Production('S', ['NP', 'VP']) >>> print(hash(p1) == hash(p2)) True Unit tests for the rd (Recursive Descent Parser) class ------------------------------------------------------ Create and run a recursive descent parser over both a syntactically ambiguous and unambiguous sentence. >>> from nltk.parse import RecursiveDescentParser >>> rd = RecursiveDescentParser(grammar) >>> sentence1 = 'the cat chased the dog'.split() >>> sentence2 = 'the cat chased the dog on the rug'.split() >>> for t in rd.parse(sentence1): ... print(t) (S (NP the (N cat)) (VP (V chased) (NP the (N dog)))) >>> for t in rd.parse(sentence2): ... 
print(t) (S (NP the (N cat)) (VP (V chased) (NP the (N dog) (PP (P on) (NP the (N rug)))))) (S (NP the (N cat)) (VP (V chased) (NP the (N dog)) (PP (P on) (NP the (N rug))))) (dolist (expr doctest-font-lock-keywords) (add-to-list 'font-lock-keywords expr)) font-lock-keywords (add-to-list 'font-lock-keywords (car doctest-font-lock-keywords)) Unit tests for the sr (Shift Reduce Parser) class ------------------------------------------------- Create and run a shift reduce parser over both a syntactically ambiguous and unambiguous sentence. Note that unlike the recursive descent parser, one and only one parse is ever returned. >>> from nltk.parse import ShiftReduceParser >>> sr = ShiftReduceParser(grammar) >>> sentence1 = 'the cat chased the dog'.split() >>> sentence2 = 'the cat chased the dog on the rug'.split() >>> for t in sr.parse(sentence1): ... print(t) (S (NP the (N cat)) (VP (V chased) (NP the (N dog)))) The shift reduce parser uses heuristics to decide what to do when there are multiple possible shift or reduce operations available - for the supplied grammar clearly the wrong operation is selected. >>> for t in sr.parse(sentence2): ... print(t) Unit tests for the Chart Parser class ------------------------------------- We use the demo() function for testing. We must turn off showing of times. >>> import nltk First we test tracing with a short sentence >>> nltk.parse.chart.demo(2, print_times=False, trace=1, ... sent='I saw a dog', numparses=1) * Sentence: I saw a dog ['I', 'saw', 'a', 'dog'] * Strategy: Bottom-up |. I . saw . a . dog .| |[---------] . . .| [0:1] 'I' |. [---------] . .| [1:2] 'saw' |. . [---------] .| [2:3] 'a' |. . . [---------]| [3:4] 'dog' |> . . . .| [0:0] NP -> * 'I' |[---------] . . .| [0:1] NP -> 'I' * |> . . . .| [0:0] S -> * NP VP |> . . . .| [0:0] NP -> * NP PP |[---------> . . .| [0:1] S -> NP * VP |[---------> . . .| [0:1] NP -> NP * PP |. > . . .| [1:1] Verb -> * 'saw' |. [---------] . .| [1:2] Verb -> 'saw' * |. > . . .| [1:1] VP -> * Verb NP |. > . . .| [1:1] VP -> * Verb |. [---------> . .| [1:2] VP -> Verb * NP |. [---------] . .| [1:2] VP -> Verb * |. > . . .| [1:1] VP -> * VP PP |[-------------------] . .| [0:2] S -> NP VP * |. [---------> . .| [1:2] VP -> VP * PP |. . > . .| [2:2] Det -> * 'a' |. . [---------] .| [2:3] Det -> 'a' * |. . > . .| [2:2] NP -> * Det Noun |. . [---------> .| [2:3] NP -> Det * Noun |. . . > .| [3:3] Noun -> * 'dog' |. . . [---------]| [3:4] Noun -> 'dog' * |. . [-------------------]| [2:4] NP -> Det Noun * |. . > . .| [2:2] S -> * NP VP |. . > . .| [2:2] NP -> * NP PP |. [-----------------------------]| [1:4] VP -> Verb NP * |. . [------------------->| [2:4] S -> NP * VP |. . [------------------->| [2:4] NP -> NP * PP |[=======================================]| [0:4] S -> NP VP * |. [----------------------------->| [1:4] VP -> VP * PP Nr edges in chart: 33 (S (NP I) (VP (Verb saw) (NP (Det a) (Noun dog)))) Then we test the different parsing Strategies. Note that the number of edges differ between the strategies. Top-down >>> nltk.parse.chart.demo(1, print_times=False, trace=0, ... sent='I saw John with a dog', numparses=2) * Sentence: I saw John with a dog ['I', 'saw', 'John', 'with', 'a', 'dog'] * Strategy: Top-down Nr edges in chart: 48 (S (NP I) (VP (Verb saw) (NP (NP John) (PP with (NP (Det a) (Noun dog)))))) (S (NP I) (VP (VP (Verb saw) (NP John)) (PP with (NP (Det a) (Noun dog))))) Bottom-up >>> nltk.parse.chart.demo(2, print_times=False, trace=0, ... 
sent='I saw John with a dog', numparses=2) * Sentence: I saw John with a dog ['I', 'saw', 'John', 'with', 'a', 'dog'] * Strategy: Bottom-up Nr edges in chart: 53 (S (NP I) (VP (VP (Verb saw) (NP John)) (PP with (NP (Det a) (Noun dog))))) (S (NP I) (VP (Verb saw) (NP (NP John) (PP with (NP (Det a) (Noun dog)))))) Bottom-up Left-Corner >>> nltk.parse.chart.demo(3, print_times=False, trace=0, ... sent='I saw John with a dog', numparses=2) * Sentence: I saw John with a dog ['I', 'saw', 'John', 'with', 'a', 'dog'] * Strategy: Bottom-up left-corner Nr edges in chart: 36 (S (NP I) (VP (VP (Verb saw) (NP John)) (PP with (NP (Det a) (Noun dog))))) (S (NP I) (VP (Verb saw) (NP (NP John) (PP with (NP (Det a) (Noun dog)))))) Left-Corner with Bottom-Up Filter >>> nltk.parse.chart.demo(4, print_times=False, trace=0, ... sent='I saw John with a dog', numparses=2) * Sentence: I saw John with a dog ['I', 'saw', 'John', 'with', 'a', 'dog'] * Strategy: Filtered left-corner Nr edges in chart: 28 (S (NP I) (VP (VP (Verb saw) (NP John)) (PP with (NP (Det a) (Noun dog))))) (S (NP I) (VP (Verb saw) (NP (NP John) (PP with (NP (Det a) (Noun dog)))))) The stepping chart parser >>> nltk.parse.chart.demo(5, print_times=False, trace=1, ... sent='I saw John with a dog', numparses=2) * Sentence: I saw John with a dog ['I', 'saw', 'John', 'with', 'a', 'dog'] * Strategy: Stepping (top-down vs bottom-up) *** SWITCH TO TOP DOWN |[------] . . . . .| [0:1] 'I' |. [------] . . . .| [1:2] 'saw' |. . [------] . . .| [2:3] 'John' |. . . [------] . .| [3:4] 'with' |. . . . [------] .| [4:5] 'a' |. . . . . [------]| [5:6] 'dog' |> . . . . . .| [0:0] S -> * NP VP |> . . . . . .| [0:0] NP -> * NP PP |> . . . . . .| [0:0] NP -> * Det Noun |> . . . . . .| [0:0] NP -> * 'I' |[------] . . . . .| [0:1] NP -> 'I' * |[------> . . . . .| [0:1] S -> NP * VP |[------> . . . . .| [0:1] NP -> NP * PP |. > . . . . .| [1:1] VP -> * VP PP |. > . . . . .| [1:1] VP -> * Verb NP |. > . . . . .| [1:1] VP -> * Verb |. > . . . . .| [1:1] Verb -> * 'saw' |. [------] . . . .| [1:2] Verb -> 'saw' * |. [------> . . . .| [1:2] VP -> Verb * NP |. [------] . . . .| [1:2] VP -> Verb * |[-------------] . . . .| [0:2] S -> NP VP * |. [------> . . . .| [1:2] VP -> VP * PP *** SWITCH TO BOTTOM UP |. . > . . . .| [2:2] NP -> * 'John' |. . . > . . .| [3:3] PP -> * 'with' NP |. . . > . . .| [3:3] Prep -> * 'with' |. . . . > . .| [4:4] Det -> * 'a' |. . . . . > .| [5:5] Noun -> * 'dog' |. . [------] . . .| [2:3] NP -> 'John' * |. . . [------> . .| [3:4] PP -> 'with' * NP |. . . [------] . .| [3:4] Prep -> 'with' * |. . . . [------] .| [4:5] Det -> 'a' * |. . . . . [------]| [5:6] Noun -> 'dog' * |. [-------------] . . .| [1:3] VP -> Verb NP * |[--------------------] . . .| [0:3] S -> NP VP * |. [-------------> . . .| [1:3] VP -> VP * PP |. . > . . . .| [2:2] S -> * NP VP |. . > . . . .| [2:2] NP -> * NP PP |. . . . > . .| [4:4] NP -> * Det Noun |. . [------> . . .| [2:3] S -> NP * VP |. . [------> . . .| [2:3] NP -> NP * PP |. . . . [------> .| [4:5] NP -> Det * Noun |. . . . [-------------]| [4:6] NP -> Det Noun * |. . . [--------------------]| [3:6] PP -> 'with' NP * |. [----------------------------------]| [1:6] VP -> VP PP * *** SWITCH TO TOP DOWN |. . > . . . .| [2:2] NP -> * Det Noun |. . . . > . .| [4:4] NP -> * NP PP |. . . > . . .| [3:3] VP -> * VP PP |. . . > . . .| [3:3] VP -> * Verb NP |. . . > . . .| [3:3] VP -> * Verb |[=========================================]| [0:6] S -> NP VP * |. [---------------------------------->| [1:6] VP -> VP * PP |. . 
[---------------------------]| [2:6] NP -> NP PP * |. . . . [------------->| [4:6] NP -> NP * PP |. [----------------------------------]| [1:6] VP -> Verb NP * |. . [--------------------------->| [2:6] S -> NP * VP |. . [--------------------------->| [2:6] NP -> NP * PP |[=========================================]| [0:6] S -> NP VP * |. [---------------------------------->| [1:6] VP -> VP * PP |. . . . . . >| [6:6] VP -> * VP PP |. . . . . . >| [6:6] VP -> * Verb NP |. . . . . . >| [6:6] VP -> * Verb *** SWITCH TO BOTTOM UP |. . . . > . .| [4:4] S -> * NP VP |. . . . [------------->| [4:6] S -> NP * VP *** SWITCH TO TOP DOWN *** SWITCH TO BOTTOM UP *** SWITCH TO TOP DOWN *** SWITCH TO BOTTOM UP *** SWITCH TO TOP DOWN *** SWITCH TO BOTTOM UP Nr edges in chart: 61 (S (NP I) (VP (VP (Verb saw) (NP John)) (PP with (NP (Det a) (Noun dog))))) (S (NP I) (VP (Verb saw) (NP (NP John) (PP with (NP (Det a) (Noun dog)))))) Unit tests for the Incremental Chart Parser class ------------------------------------------------- The incremental chart parsers are defined in earleychart.py. We use the demo() function for testing. We must turn off showing of times. >>> import nltk Earley Chart Parser >>> nltk.parse.earleychart.demo(print_times=False, trace=1, ... sent='I saw John with a dog', numparses=2) * Sentence: I saw John with a dog ['I', 'saw', 'John', 'with', 'a', 'dog'] |. I . saw . John . with . a . dog .| |[------] . . . . .| [0:1] 'I' |. [------] . . . .| [1:2] 'saw' |. . [------] . . .| [2:3] 'John' |. . . [------] . .| [3:4] 'with' |. . . . [------] .| [4:5] 'a' |. . . . . [------]| [5:6] 'dog' |> . . . . . .| [0:0] S -> * NP VP |> . . . . . .| [0:0] NP -> * NP PP |> . . . . . .| [0:0] NP -> * Det Noun |> . . . . . .| [0:0] NP -> * 'I' |[------] . . . . .| [0:1] NP -> 'I' * |[------> . . . . .| [0:1] S -> NP * VP |[------> . . . . .| [0:1] NP -> NP * PP |. > . . . . .| [1:1] VP -> * VP PP |. > . . . . .| [1:1] VP -> * Verb NP |. > . . . . .| [1:1] VP -> * Verb |. > . . . . .| [1:1] Verb -> * 'saw' |. [------] . . . .| [1:2] Verb -> 'saw' * |. [------> . . . .| [1:2] VP -> Verb * NP |. [------] . . . .| [1:2] VP -> Verb * |[-------------] . . . .| [0:2] S -> NP VP * |. [------> . . . .| [1:2] VP -> VP * PP |. . > . . . .| [2:2] NP -> * NP PP |. . > . . . .| [2:2] NP -> * Det Noun |. . > . . . .| [2:2] NP -> * 'John' |. . [------] . . .| [2:3] NP -> 'John' * |. [-------------] . . .| [1:3] VP -> Verb NP * |. . [------> . . .| [2:3] NP -> NP * PP |. . . > . . .| [3:3] PP -> * 'with' NP |[--------------------] . . .| [0:3] S -> NP VP * |. [-------------> . . .| [1:3] VP -> VP * PP |. . . [------> . .| [3:4] PP -> 'with' * NP |. . . . > . .| [4:4] NP -> * NP PP |. . . . > . .| [4:4] NP -> * Det Noun |. . . . > . .| [4:4] Det -> * 'a' |. . . . [------] .| [4:5] Det -> 'a' * |. . . . [------> .| [4:5] NP -> Det * Noun |. . . . . > .| [5:5] Noun -> * 'dog' |. . . . . [------]| [5:6] Noun -> 'dog' * |. . . . [-------------]| [4:6] NP -> Det Noun * |. . . [--------------------]| [3:6] PP -> 'with' NP * |. . . . [------------->| [4:6] NP -> NP * PP |. . [---------------------------]| [2:6] NP -> NP PP * |. [----------------------------------]| [1:6] VP -> VP PP * |[=========================================]| [0:6] S -> NP VP * |. [---------------------------------->| [1:6] VP -> VP * PP |. [----------------------------------]| [1:6] VP -> Verb NP * |. . [--------------------------->| [2:6] NP -> NP * PP |[=========================================]| [0:6] S -> NP VP * |. 
[---------------------------------->| [1:6] VP -> VP * PP (S (NP I) (VP (VP (Verb saw) (NP John)) (PP with (NP (Det a) (Noun dog))))) (S (NP I) (VP (Verb saw) (NP (NP John) (PP with (NP (Det a) (Noun dog)))))) Unit tests for LARGE context-free grammars ------------------------------------------ Reading the ATIS grammar. >>> grammar = nltk.data.load('grammars/large_grammars/atis.cfg') >>> grammar Reading the test sentences. >>> sentences = nltk.data.load('grammars/large_grammars/atis_sentences.txt') >>> sentences = nltk.parse.util.extract_test_sentences(sentences) >>> len(sentences) 98 >>> testsentence = sentences[22] >>> testsentence[0] ['show', 'me', 'northwest', 'flights', 'to', 'detroit', '.'] >>> testsentence[1] 17 >>> sentence = testsentence[0] Now we test all different parsing strategies. Note that the number of edges differ between the strategies. Bottom-up parsing. >>> parser = nltk.parse.BottomUpChartParser(grammar) >>> chart = parser.chart_parse(sentence) >>> print((chart.num_edges())) 7661 >>> print((len(list(chart.parses(grammar.start()))))) 17 Bottom-up Left-corner parsing. >>> parser = nltk.parse.BottomUpLeftCornerChartParser(grammar) >>> chart = parser.chart_parse(sentence) >>> print((chart.num_edges())) 4986 >>> print((len(list(chart.parses(grammar.start()))))) 17 Left-corner parsing with bottom-up filter. >>> parser = nltk.parse.LeftCornerChartParser(grammar) >>> chart = parser.chart_parse(sentence) >>> print((chart.num_edges())) 1342 >>> print((len(list(chart.parses(grammar.start()))))) 17 Top-down parsing. >>> parser = nltk.parse.TopDownChartParser(grammar) >>> chart = parser.chart_parse(sentence) >>> print((chart.num_edges())) 28352 >>> print((len(list(chart.parses(grammar.start()))))) 17 Incremental Bottom-up parsing. >>> parser = nltk.parse.IncrementalBottomUpChartParser(grammar) >>> chart = parser.chart_parse(sentence) >>> print((chart.num_edges())) 7661 >>> print((len(list(chart.parses(grammar.start()))))) 17 Incremental Bottom-up Left-corner parsing. >>> parser = nltk.parse.IncrementalBottomUpLeftCornerChartParser(grammar) >>> chart = parser.chart_parse(sentence) >>> print((chart.num_edges())) 4986 >>> print((len(list(chart.parses(grammar.start()))))) 17 Incremental Left-corner parsing with bottom-up filter. >>> parser = nltk.parse.IncrementalLeftCornerChartParser(grammar) >>> chart = parser.chart_parse(sentence) >>> print((chart.num_edges())) 1342 >>> print((len(list(chart.parses(grammar.start()))))) 17 Incremental Top-down parsing. >>> parser = nltk.parse.IncrementalTopDownChartParser(grammar) >>> chart = parser.chart_parse(sentence) >>> print((chart.num_edges())) 28352 >>> print((len(list(chart.parses(grammar.start()))))) 17 Earley parsing. This is similar to the incremental top-down algorithm. >>> parser = nltk.parse.EarleyChartParser(grammar) >>> chart = parser.chart_parse(sentence) >>> print((chart.num_edges())) 28352 >>> print((len(list(chart.parses(grammar.start()))))) 17 Unit tests for the Probabilistic CFG class ------------------------------------------ >>> from nltk.corpus import treebank >>> from itertools import islice >>> from nltk.grammar import PCFG, induce_pcfg >>> toy_pcfg1 = PCFG.fromstring(""" ... S -> NP VP [1.0] ... NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15] ... Det -> 'the' [0.8] | 'my' [0.2] ... N -> 'man' [0.5] | 'telescope' [0.5] ... VP -> VP PP [0.1] | V NP [0.7] | V [0.2] ... V -> 'ate' [0.35] | 'saw' [0.65] ... PP -> P NP [1.0] ... P -> 'with' [0.61] | 'under' [0.39] ... """) >>> toy_pcfg2 = PCFG.fromstring(""" ... 
S -> NP VP [1.0] ... VP -> V NP [.59] ... VP -> V [.40] ... VP -> VP PP [.01] ... NP -> Det N [.41] ... NP -> Name [.28] ... NP -> NP PP [.31] ... PP -> P NP [1.0] ... V -> 'saw' [.21] ... V -> 'ate' [.51] ... V -> 'ran' [.28] ... N -> 'boy' [.11] ... N -> 'cookie' [.12] ... N -> 'table' [.13] ... N -> 'telescope' [.14] ... N -> 'hill' [.5] ... Name -> 'Jack' [.52] ... Name -> 'Bob' [.48] ... P -> 'with' [.61] ... P -> 'under' [.39] ... Det -> 'the' [.41] ... Det -> 'a' [.31] ... Det -> 'my' [.28] ... """) Create a set of PCFG productions. >>> grammar = PCFG.fromstring(""" ... A -> B B [.3] | C B C [.7] ... B -> B D [.5] | C [.5] ... C -> 'a' [.1] | 'b' [0.9] ... D -> 'b' [1.0] ... """) >>> prod = grammar.productions()[0] >>> prod A -> B B [0.3] >>> prod.lhs() A >>> prod.rhs() (B, B) >>> print((prod.prob())) 0.3 >>> grammar.start() A >>> grammar.productions() [A -> B B [0.3], A -> C B C [0.7], B -> B D [0.5], B -> C [0.5], C -> 'a' [0.1], C -> 'b' [0.9], D -> 'b' [1.0]] Induce some productions using parsed Treebank data. >>> productions = [] >>> for fileid in treebank.fileids()[:2]: ... for t in treebank.parsed_sents(fileid): ... productions += t.productions() >>> grammar = induce_pcfg(S, productions) >>> grammar >>> sorted(grammar.productions(lhs=Nonterminal('PP')))[:2] [PP -> IN NP [1.0]] >>> sorted(grammar.productions(lhs=Nonterminal('NNP')))[:2] [NNP -> 'Agnew' [0.0714286], NNP -> 'Consolidated' [0.0714286]] >>> sorted(grammar.productions(lhs=Nonterminal('JJ')))[:2] [JJ -> 'British' [0.142857], JJ -> 'former' [0.142857]] >>> sorted(grammar.productions(lhs=Nonterminal('NP')))[:2] [NP -> CD NNS [0.133333], NP -> DT JJ JJ NN [0.0666667]] Unit tests for the Probabilistic Chart Parse classes ---------------------------------------------------- >>> tokens = "Jack saw Bob with my cookie".split() >>> grammar = toy_pcfg2 >>> print(grammar) Grammar with 23 productions (start state = S) S -> NP VP [1.0] VP -> V NP [0.59] VP -> V [0.4] VP -> VP PP [0.01] NP -> Det N [0.41] NP -> Name [0.28] NP -> NP PP [0.31] PP -> P NP [1.0] V -> 'saw' [0.21] V -> 'ate' [0.51] V -> 'ran' [0.28] N -> 'boy' [0.11] N -> 'cookie' [0.12] N -> 'table' [0.13] N -> 'telescope' [0.14] N -> 'hill' [0.5] Name -> 'Jack' [0.52] Name -> 'Bob' [0.48] P -> 'with' [0.61] P -> 'under' [0.39] Det -> 'the' [0.41] Det -> 'a' [0.31] Det -> 'my' [0.28] Create several parsers using different queuing strategies and show the resulting parses. >>> from nltk.parse import pchart >>> parser = pchart.InsideChartParser(grammar) >>> for t in parser.parse(tokens): ... print(t) (S (NP (Name Jack)) (VP (V saw) (NP (NP (Name Bob)) (PP (P with) (NP (Det my) (N cookie)))))) (p=6.31607e-06) (S (NP (Name Jack)) (VP (VP (V saw) (NP (Name Bob))) (PP (P with) (NP (Det my) (N cookie))))) (p=2.03744e-07) >>> parser = pchart.RandomChartParser(grammar) >>> for t in parser.parse(tokens): ... print(t) (S (NP (Name Jack)) (VP (V saw) (NP (NP (Name Bob)) (PP (P with) (NP (Det my) (N cookie)))))) (p=6.31607e-06) (S (NP (Name Jack)) (VP (VP (V saw) (NP (Name Bob))) (PP (P with) (NP (Det my) (N cookie))))) (p=2.03744e-07) >>> parser = pchart.UnsortedChartParser(grammar) >>> for t in parser.parse(tokens): ... print(t) (S (NP (Name Jack)) (VP (V saw) (NP (NP (Name Bob)) (PP (P with) (NP (Det my) (N cookie)))))) (p=6.31607e-06) (S (NP (Name Jack)) (VP (VP (V saw) (NP (Name Bob))) (PP (P with) (NP (Det my) (N cookie))))) (p=2.03744e-07) >>> parser = pchart.LongestChartParser(grammar) >>> for t in parser.parse(tokens): ... 
print(t) (S (NP (Name Jack)) (VP (V saw) (NP (NP (Name Bob)) (PP (P with) (NP (Det my) (N cookie)))))) (p=6.31607e-06) (S (NP (Name Jack)) (VP (VP (V saw) (NP (Name Bob))) (PP (P with) (NP (Det my) (N cookie))))) (p=2.03744e-07) >>> parser = pchart.InsideChartParser(grammar, beam_size = len(tokens)+1) >>> for t in parser.parse(tokens): ... print(t) Unit tests for the Viterbi Parse classes ---------------------------------------- >>> from nltk.parse import ViterbiParser >>> tokens = "Jack saw Bob with my cookie".split() >>> grammar = toy_pcfg2 Parse the tokenized sentence. >>> parser = ViterbiParser(grammar) >>> for t in parser.parse(tokens): ... print(t) (S (NP (Name Jack)) (VP (V saw) (NP (NP (Name Bob)) (PP (P with) (NP (Det my) (N cookie)))))) (p=6.31607e-06) Unit tests for the FeatStructNonterminal class ---------------------------------------------- >>> from nltk.grammar import FeatStructNonterminal >>> FeatStructNonterminal( ... pos='n', agr=FeatStructNonterminal(number='pl', gender='f')) [agr=[gender='f', number='pl'], pos='n'] >>> FeatStructNonterminal('VP[+fin]/NP[+pl]') VP[+fin]/NP[+pl] Tracing the Feature Chart Parser -------------------------------- We use the featurechart.demo() function for tracing the Feature Chart Parser. >>> nltk.parse.featurechart.demo(print_times=False, ... print_grammar=True, ... parser=nltk.parse.featurechart.FeatureChartParser, ... sent='I saw John with a dog') Grammar with 18 productions (start state = S[]) S[] -> NP[] VP[] PP[] -> Prep[] NP[] NP[] -> NP[] PP[] VP[] -> VP[] PP[] VP[] -> Verb[] NP[] VP[] -> Verb[] NP[] -> Det[pl=?x] Noun[pl=?x] NP[] -> 'John' NP[] -> 'I' Det[] -> 'the' Det[] -> 'my' Det[-pl] -> 'a' Noun[-pl] -> 'dog' Noun[-pl] -> 'cookie' Verb[] -> 'ate' Verb[] -> 'saw' Prep[] -> 'with' Prep[] -> 'under' * FeatureChartParser Sentence: I saw John with a dog |.I.s.J.w.a.d.| |[-] . . . . .| [0:1] 'I' |. [-] . . . .| [1:2] 'saw' |. . [-] . . .| [2:3] 'John' |. . . [-] . .| [3:4] 'with' |. . . . [-] .| [4:5] 'a' |. . . . . [-]| [5:6] 'dog' |[-] . . . . .| [0:1] NP[] -> 'I' * |[-> . . . . .| [0:1] S[] -> NP[] * VP[] {} |[-> . . . . .| [0:1] NP[] -> NP[] * PP[] {} |. [-] . . . .| [1:2] Verb[] -> 'saw' * |. [-> . . . .| [1:2] VP[] -> Verb[] * NP[] {} |. [-] . . . .| [1:2] VP[] -> Verb[] * |. [-> . . . .| [1:2] VP[] -> VP[] * PP[] {} |[---] . . . .| [0:2] S[] -> NP[] VP[] * |. . [-] . . .| [2:3] NP[] -> 'John' * |. . [-> . . .| [2:3] S[] -> NP[] * VP[] {} |. . [-> . . .| [2:3] NP[] -> NP[] * PP[] {} |. [---] . . .| [1:3] VP[] -> Verb[] NP[] * |. [---> . . .| [1:3] VP[] -> VP[] * PP[] {} |[-----] . . .| [0:3] S[] -> NP[] VP[] * |. . . [-] . .| [3:4] Prep[] -> 'with' * |. . . [-> . .| [3:4] PP[] -> Prep[] * NP[] {} |. . . . [-] .| [4:5] Det[-pl] -> 'a' * |. . . . [-> .| [4:5] NP[] -> Det[pl=?x] * Noun[pl=?x] {?x: False} |. . . . . [-]| [5:6] Noun[-pl] -> 'dog' * |. . . . [---]| [4:6] NP[] -> Det[-pl] Noun[-pl] * |. . . . [--->| [4:6] S[] -> NP[] * VP[] {} |. . . . [--->| [4:6] NP[] -> NP[] * PP[] {} |. . . [-----]| [3:6] PP[] -> Prep[] NP[] * |. . [-------]| [2:6] NP[] -> NP[] PP[] * |. [---------]| [1:6] VP[] -> VP[] PP[] * |. [--------->| [1:6] VP[] -> VP[] * PP[] {} |[===========]| [0:6] S[] -> NP[] VP[] * |. . [------->| [2:6] S[] -> NP[] * VP[] {} |. . [------->| [2:6] NP[] -> NP[] * PP[] {} |. [---------]| [1:6] VP[] -> Verb[] NP[] * |. 
[--------->| [1:6] VP[] -> VP[] * PP[] {} |[===========]| [0:6] S[] -> NP[] VP[] * (S[] (NP[] I) (VP[] (VP[] (Verb[] saw) (NP[] John)) (PP[] (Prep[] with) (NP[] (Det[-pl] a) (Noun[-pl] dog))))) (S[] (NP[] I) (VP[] (Verb[] saw) (NP[] (NP[] John) (PP[] (Prep[] with) (NP[] (Det[-pl] a) (Noun[-pl] dog)))))) Unit tests for the Feature Chart Parser classes ----------------------------------------------- The list of parsers we want to test. >>> parsers = [nltk.parse.featurechart.FeatureChartParser, ... nltk.parse.featurechart.FeatureTopDownChartParser, ... nltk.parse.featurechart.FeatureBottomUpChartParser, ... nltk.parse.featurechart.FeatureBottomUpLeftCornerChartParser, ... nltk.parse.earleychart.FeatureIncrementalChartParser, ... nltk.parse.earleychart.FeatureEarleyChartParser, ... nltk.parse.earleychart.FeatureIncrementalTopDownChartParser, ... nltk.parse.earleychart.FeatureIncrementalBottomUpChartParser, ... nltk.parse.earleychart.FeatureIncrementalBottomUpLeftCornerChartParser, ... ] A helper function that tests each parser on the given grammar and sentence. We check that the number of trees are correct, and that all parsers return the same trees. Otherwise an error is printed. >>> def unittest(grammar, sentence, nr_trees): ... sentence = sentence.split() ... trees = None ... for P in parsers: ... result = P(grammar).parse(sentence) ... result = set(tree.freeze() for tree in result) ... if len(result) != nr_trees: ... print("Wrong nr of trees:", len(result)) ... elif trees is None: ... trees = result ... elif result != trees: ... print("Trees differ for parser:", P.__name__) The demo grammar from before, with an ambiguous sentence. >>> isawjohn = nltk.parse.featurechart.demo_grammar() >>> unittest(isawjohn, "I saw John with a dog with my cookie", 5) This grammar tests that variables in different grammar rules are renamed before unification. (The problematic variable is in this case ?X). >>> whatwasthat = nltk.grammar.FeatureGrammar.fromstring(''' ... S[] -> NP[num=?N] VP[num=?N, slash=?X] ... NP[num=?X] -> "what" ... NP[num=?X] -> "that" ... VP[num=?P, slash=none] -> V[num=?P] NP[] ... V[num=sg] -> "was" ... ''') >>> unittest(whatwasthat, "what was that", 1) This grammar tests that the same rule can be used in different places in another rule, and that the variables are properly renamed. >>> thislovesthat = nltk.grammar.FeatureGrammar.fromstring(''' ... S[] -> NP[case=nom] V[] NP[case=acc] ... NP[case=?X] -> Pron[case=?X] ... Pron[] -> "this" ... Pron[] -> "that" ... V[] -> "loves" ... ''') >>> unittest(thislovesthat, "this loves that", 1) Tests for loading feature grammar files --------------------------------------- Alternative 1: first load the grammar, then create the parser. >>> fcfg = nltk.data.load('grammars/book_grammars/feat0.fcfg') >>> fcp1 = nltk.parse.FeatureChartParser(fcfg) >>> print((type(fcp1))) Alternative 2: directly load the parser. >>> fcp2 = nltk.parse.load_parser('grammars/book_grammars/feat0.fcfg') >>> print((type(fcp2))) nltk-3.7/nltk/test/portuguese.doctest_latin1000066400000000000000000000273411420073152400213300ustar00rootroot00000000000000========================================== Examplos para o processamento do português ========================================== >>> import nltk (NB. Este material parte do pressuposto de que o leitor esteja familiarizado com o livro do NLTK, disponível em ``http://nltk.org/index.php/Book``). 
Utilizando o Corpus MacMorpho Tagged ------------------------------------ O NLTK inclui o corpus de notícias para o português brasileiro com tags de partes do discurso MAC-MORPHO, que conta com mais de um milhão de palavras de textos jornalísticos extraídos de dez seções do jornal diário *Folha de São Paulo*, do ano de 1994. Podemos utilizar este corpus como uma seqüência de palavras ou de palavras com tags da seguinte maneira: >>> nltk.corpus.mac_morpho.words() ['Jersei', 'atinge', 'm\xe9dia', 'de', 'Cr$', '1,4', ...] >>> nltk.corpus.mac_morpho.sents() [['Jersei', 'atinge', 'm\xe9dia', 'de', 'Cr$', '1,4', 'milh\xe3o', 'em', 'a', 'venda', 'de', 'a', 'Pinhal', 'em', 'S\xe3o', 'Paulo'], ['Programe', 'sua', 'viagem', 'a', 'a', 'Exposi\xe7\xe3o', 'Nacional', 'do', 'Zebu', ',', 'que', 'come\xe7a', 'dia', '25'], ...] >>> nltk.corpus.mac_morpho.tagged_words() [('Jersei', 'N'), ('atinge', 'V'), ('m\xe9dia', 'N'), ...] Também é possível utilizá-lo em chunks de frases. >>> nltk.corpus.mac_morpho.tagged_sents() [[('Jersei', 'N'), ('atinge', 'V'), ('m\xe9dia', 'N'), ('de', 'PREP'), ('Cr$', 'CUR'), ('1,4', 'NUM'), ('milh\xe3o', 'N'), ('em', 'PREP|+'), ('a', 'ART'), ('venda', 'N'), ('de', 'PREP|+'), ('a', 'ART'), ('Pinhal', 'NPROP'), ('em', 'PREP'), ('S\xe3o', 'NPROP'), ('Paulo', 'NPROP')], [('Programe', 'V'), ('sua', 'PROADJ'), ('viagem', 'N'), ('a', 'PREP|+'), ('a', 'ART'), ('Exposi\xe7\xe3o', 'NPROP'), ('Nacional', 'NPROP'), ('do', 'NPROP'), ('Zebu', 'NPROP'), (',', ','), ('que', 'PRO-KS-REL'), ('come\xe7a', 'V'), ('dia', 'N'), ('25', 'N|AP')], ...] Estes dados podem ser utilizados para efetuar o treinamento de taggers (como nos exemplos abaixo para o Floresta treebank). Utilizando o Floresta Portuguese Treebank ----------------------------------------- A distribuição de dados do NLTK inclui o "Floresta Sinta(c)tica Corpus" na versão 7.4, disponível em ``http://www.linguateca.pt/Floresta/``. Como para a amostra do Penn Treebank, é possível utilizar o conteúdo deste corpus como uma seqüência de palavras com informações de tags, da seguinte maneira: >>> from nltk.corpus import floresta >>> floresta.words() ['Um', 'revivalismo', 'refrescante', 'O', '7_e_Meio', ...] >>> floresta.tagged_words() [('Um', '>N+art'), ('revivalismo', 'H+n'), ...] As tags são constituídas por certas informações sintáticas, seguidas por um sinal de mais, seguido por tag costumeira de parte do discurso (part-of-speech). Vamos remover o conteúdo que antecede o sinal de mais: >>> def simplify_tag(t): ... if "+" in t: ... return t[t.index("+")+1:] ... else: ... 
return t >>> twords = nltk.corpus.floresta.tagged_words() >>> twords = [(w.lower(),simplify_tag(t)) for (w,t) in twords] >>> twords[:10] [('um', 'art'), ('revivalismo', 'n'), ('refrescante', 'adj'), ('o', 'art'), ('7_e_meio', 'prop'), ('\xe9', 'v-fin'), ('um', 'art'), ('ex-libris', 'n'), ('de', 'prp'), ('a', 'art')] E exibir de maneira mais apropriada as palavras com informações de tags: >>> print ' '.join(word + '/' + tag for (word, tag) in twords[:10]) um/art revivalismo/n refrescante/adj o/art 7_e_meio/prop ?/v-fin um/art ex-libris/n de/prp a/art Em seguida, vamos contar o número de tokens de palavras e tipos, além de determinar qual a palavra mais comum: >>> words = floresta.words() >>> len(words) 211870 >>> fd = nltk.FreqDist(words) >>> len(fd) 29425 >>> fd.max() 'de' Podemos também listar as 20 tags mais freqüentes, em ordem decrescente de freqüência: >>> tags = [simplify_tag(tag) for (word,tag) in floresta.tagged_words()] >>> fd = nltk.FreqDist(tags) >>> fd.sorted()[:20] ['n', 'prp', 'art', 'v-fin', ',', 'prop', 'adj', 'adv', '.', 'conj-c', 'v-inf', 'pron-det', 'v-pcp', 'num', 'pron-indp', 'pron-pers', '\xab', '\xbb', 'conj-s', '}'] Também podemos ler o corpus agrupado por enunciados: >>> floresta.sents() [['Um', 'revivalismo', 'refrescante'], ['O', '7_e_Meio', '\xe9', 'um', 'ex-libris', 'de', 'a', 'noite', 'algarvia', '.'], ...] >>> floresta.tagged_sents() [[('Um', '>N+art'), ('revivalismo', 'H+n'), ('refrescante', 'N<+adj')], [('O', '>N+art'), ('7_e_Meio', 'H+prop'), ('\xe9', 'P+v-fin'), ('um', '>N+art'), ('ex-libris', 'H+n'), ('de', 'H+prp'), ('a', '>N+art'), ('noite', 'H+n'), ('algarvia', 'N<+adj'), ('.', '.')], ...] >>> floresta.parsed_sents() [Tree('UTT+np', [Tree('>N+art', ['Um']), Tree('H+n', ['revivalismo']), Tree('N<+adj', ['refrescante'])]), Tree('STA+fcl', [Tree('SUBJ+np', [Tree('>N+art', ['O']), Tree('H+prop', ['7_e_Meio'])]), Tree('P+v-fin', ['\xe9']), Tree('SC+np', [Tree('>N+art', ['um']), Tree('H+n', ['ex-libris']), Tree('N<+pp', [Tree('H+prp', ['de']), Tree('P<+np', [Tree('>N+art', ['a']), Tree('H+n', ['noite']), Tree('N<+adj', ['algarvia'])])])]), Tree('.', ['.'])]), ...] Para ver uma árvore de análise sintática, podemos utilizar o método ``draw()``, como no exemplo: >>> psents = floresta.parsed_sents() >>> psents[5].draw() # doctest: +SKIP Concordância simples -------------------- A seguir, apresentamos uma função que recebe uma palavra e uma quantidade determinada de contexto (medido em caracteres) e gera uma concordância para a mesma. >>> def concordance(word, context=30): ... for sent in floresta.sents(): ... if word in sent: ... pos = sent.index(word) ... left = ' '.join(sent[:pos]) ... right = ' '.join(sent[pos+1:]) ... print '%*s %s %-*s' %\ ... (context, left[-context:], word, context, right[:context]) >>> concordance("dar") # doctest: +SKIP anduru , foi o suficiente para dar a volta a o resultado . 1. O P?BLICO veio dar a a imprensa di?ria portuguesa A fartura de pensamento pode dar maus resultados e n?s n?o quer Come?a a dar resultados a pol?tica de a Uni ial come?ar a incorporar- lo e dar forma a um ' site ' que tem se r com Constantino para ele lhe dar tamb?m os pap?is assinados . va a brincar , pois n?o lhe ia dar procura??o nenhuma enquanto n? ?rica como o ant?doto capaz de dar sentido a o seu enorme poder . . . . >>> concordance("vender") # doctest: +SKIP er recebido uma encomenda para vender 4000 blindados a o Iraque . m?rico_Amorim caso conseguisse vender o lote de ac??es de o empres?r mpre ter jovens simp?ticos a ? vender ? chega ! 
} Disse que o governo vai vender ? desde autom?vel at? particip ndiciou ontem duas pessoas por vender carro com ?gio . A inten??o de Fleury ? vender as a??es para equilibrar as fi Tagging de partes do discurso ----------------------------- Vamos começar obtendo os dados dos enunciados marcados com tags e simplificando estas últimas como descrito anteriormente. >>> from nltk.corpus import floresta >>> tsents = floresta.tagged_sents() >>> tsents = [[(w.lower(),simplify_tag(t)) for (w,t) in sent] for sent in tsents if sent] >>> train = tsents[100:] >>> test = tsents[:100] Já sabemos que ``n`` é a tag mais comum; desta forma, podemos criar um tagger por default que marque toda palavra como substantivo e, em seguida, avaliar seu desempenho: >>> tagger0 = nltk.DefaultTagger('n') >>> nltk.tag.accuracy(tagger0, test) 0.17690941385435169 Como pode-se deduzir facilmente, uma em cada seis palavras é um substantivo. Vamos aperfeiçoar estes resultados treinando um tagger unigrama: >>> tagger1 = nltk.UnigramTagger(train, backoff=tagger0) >>> nltk.tag.accuracy(tagger1, test) 0.85115452930728241 E, em seguida, um tagger bigrama: >>> tagger2 = nltk.BigramTagger(train, backoff=tagger1) >>> nltk.tag.accuracy(tagger2, test) 0.86856127886323264 Segmentação de frases --------------------- O Punkt é uma ferramenta para segmentação de frases lingüisticamente independente, o qual requer um treinamento em texto puro. O texto de origem (obtido do Floresta Portuguese Treebank) contém uma frase por linha. Podemos ler o texto, dividi-lo em função de suas linhas e então agrupar estas linhas utilizando espaços. Desta forma as informações sobre quebras de frases terão sido descartadas; podemos então dividir este material em dados para treinamento e para verificação: >>> text = open('floresta.txt').read() >>> lines = text.split('\n') >>> train = ' '.join(lines[10:]) >>> test = ' '.join(lines[:10]) É agora possível treinar o segmentador de frases (ou tokenizador de frases) e utilizá-lo em nossas frases de verificação. (Para exibir o texto em uma forma legível, pode ser necessário converter o texto para o UTF-8, utilizando ``print sent.decode('latin-1').encode('utf-8')``.) >>> stok = nltk.PunktSentenceTokenizer(train) >>> for sent in stok.tokenize(test): ... print sent As versões do NLTK a partir da 0.9b1 incluem um modelo treinado para a segmentação de frases em português, o qual pode ser carregado pela maneira a seguir. É mais rápido carregar um modelo já treinado do que repetir o treinamento do mesmo. >>> stok = nltk.data.load('tokenizers/punkt/portuguese.pickle') Stemming -------- O NLTK inclui o stemmer para o português RSLP. Vamos demonstrar sua utilização para algumas palavras em português: >>> stemmer = nltk.stem.RSLPStemmer() >>> stemmer.stem("copiar") u'copi' >>> stemmer.stem("paisagem") u'pais' Stopwords --------- O NLTK inclui stopword ("palavras limite") para o português: >>> stopwords = nltk.corpus.stopwords.words('portuguese') >>> stopwords[:10] ['a', 'ao', 'aos', 'aquela', 'aquelas', 'aquele', 'aqueles', 'aquilo', 'as', 'at\xe9'] A esta altura, é possível utilizá-las para filtrar textos. Vamos encontrar as palavras mais comuns (à exceção das stopwords) e listá-las em ordem decrescente de freqüência: >>> fd = nltk.FreqDist(w.lower() for w in floresta.words() if w not in stopwords) >>> for word in fd.sorted()[:20]: ... print word, fd[word] , 13444 . 7725 ? 2369 ? 2310 ? 
1137 o 1086 } 1047 { 1044 a 897 ; 633 em 516 ser 466 sobre 349 os 313 anos 301 ontem 292 ainda 279 segundo 256 ter 249 dois 231 Codificações de caracteres -------------------------- O Python é capaz de lidar com todas a codificações de caracteres mais utilizada para o português, a ISO 8859-1 (ISO Latin 1). >>> text = open('floresta.txt').read() >>> text[:60] 'O 7 e Meio \xe9 um ex-libris da noite algarvia.\n\xc9 uma das mais ' >>> print text[:60] O 7 e Meio ? um ex-libris da noite algarvia. ? uma das mais >>> text[:60].decode('latin-1') u'O 7 e Meio \xe9 um ex-libris da noite algarvia.\n\xc9 uma das mais ' >>> text[:60].decode('latin-1').encode('utf-8') 'O 7 e Meio \xc3\xa9 um ex-libris da noite algarvia.\n\xc3\x89 uma das mais ' >>> text[:60].decode('latin-1').encode('utf-8') 'O 7 e Meio \xc3\xa9 um ex-libris da noite algarvia.\n\xc3\x89 uma das mais ' >>> text[:60].decode('latin-1').encode('utf-16') '\xff\xfeO\x00 \x007\x00 \x00e\x00 \x00M\x00e\x00i\x00o\x00 \x00\xe9\x00 \x00u\x00m\x00 \x00e\x00x\x00-\x00l\x00i\x00b\x00r\x00i\x00s\x00 \x00d\x00a\x00 \x00n\x00o\x00i\x00t\x00e\x00 \x00a\x00l\x00g\x00a\x00r\x00v\x00i\x00a\x00.\x00\n\x00\xc9\x00 \x00u\x00m\x00a\x00 \x00d\x00a\x00s\x00 \x00m\x00a\x00i\x00s\x00 \x00' nltk-3.7/nltk/test/portuguese_en.doctest000066400000000000000000000540311420073152400205360ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. For license information, see LICENSE.TXT ================================== Examples for Portuguese Processing ================================== This HOWTO contains a variety of examples relating to the Portuguese language. It is intended to be read in conjunction with the NLTK book (``https://www.nltk.org/book/``). For instructions on running the Python interpreter, please see the section *Getting Started with Python*, in Chapter 1. -------------------------------------------- Python Programming, with Portuguese Examples -------------------------------------------- Chapter 1 of the NLTK book contains many elementary programming examples, all with English texts. In this section, we'll see some corresponding examples using Portuguese. Please refer to the chapter for full discussion. *Vamos!* >>> from nltk.test.portuguese_en_fixt import setup_module >>> setup_module() >>> from nltk.examples.pt import * *** Introductory Examples for the NLTK Book *** Loading ptext1, ... and psent1, ... Type the name of the text or sentence to view it. Type: 'texts()' or 'sents()' to list the materials. ptext1: Memórias Póstumas de Brás Cubas (1881) ptext2: Dom Casmurro (1899) ptext3: Gênesis ptext4: Folha de Sao Paulo (1994) Any time we want to find out about these texts, we just have to enter their names at the Python prompt: >>> ptext2 Searching Text -------------- A concordance permits us to see words in context. >>> ptext1.concordance('olhos') Building index... Displaying 25 of 138 matches: De pé , à cabeceira da cama , com os olhos estúpidos , a boca entreaberta , a t orelhas . Pela minha parte fechei os olhos e deixei - me ir à ventura . Já agor xões de cérebro enfermo . Como ia de olhos fechados , não via o caminho ; lembr gelos eternos . Com efeito , abri os olhos e vi que o meu animal galopava numa me apareceu então , fitando - me uns olhos rutilantes como o sol . Tudo nessa f mim mesmo . Então , encarei - a com olhos súplices , e pedi mais alguns anos . ... For a given word, we can find words with a similar text distribution: >>> ptext1.similar('chegar') Building word-context index... 
acabada acudir aludir avistar bramanismo casamento cheguei com contar contrário corpo dali deixei desferirem dizer fazer filhos já leitor lhe >>> ptext3.similar('chegar') Building word-context index... achar alumiar arrombar destruir governar guardar ir lavrar passar que toda tomar ver vir We can search for the statistically significant collocations in a text: >>> ptext1.collocations() Building collocations list Quincas Borba; Lobo Neves; alguma coisa; Brás Cubas; meu pai; dia seguinte; não sei; Meu pai; alguns instantes; outra vez; outra coisa; por exemplo; mim mesmo; coisa nenhuma; mesma coisa; não era; dias depois; Passeio Público; olhar para; das coisas We can search for words in context, with the help of *regular expressions*, e.g.: >>> ptext1.findall(" (<.*>)") estúpidos; e; fechados; rutilantes; súplices; a; do; babavam; na; moles; se; da; umas; espraiavam; chamejantes; espetados; ... We can automatically generate random text based on a given text, e.g.: >>> ptext3.generate() # doctest: +SKIP No princípio , criou Deus os abençoou , dizendo : Onde { estão } e até à ave dos céus , { que } será . Disse mais Abrão : Dá - me a mulher que tomaste ; porque daquele poço Eseque , { tinha .} E disse : Não poderemos descer ; mas , do campo ainda não estava na casa do teu pescoço . E viveu Serugue , depois Simeão e Levi { são } estes ? E o varão , porque habitava na terra de Node , da mão de Esaú : Jeús , Jalão e Corá Texts as List of Words ---------------------- A few sentences have been defined for you. >>> psent1 ['o', 'amor', 'da', 'gl\xf3ria', 'era', 'a', 'coisa', 'mais', 'verdadeiramente', 'humana', 'que', 'h\xe1', 'no', 'homem', ',', 'e', ',', 'conseq\xfcentemente', ',', 'a', 'sua', 'mais', 'genu\xedna', 'fei\xe7\xe3o', '.'] >>> Notice that the sentence has been *tokenized*. Each token is represented as a string, represented using quotes, e.g. ``'coisa'``. Some strings contain special characters, e.g. ``\xf3``, the internal representation for ó. The tokens are combined in the form of a *list*. How long is this list? >>> len(psent1) 25 >>> What is the vocabulary of this sentence? >>> sorted(set(psent1)) [',', '.', 'a', 'amor', 'coisa', 'conseqüentemente', 'da', 'e', 'era', 'feição', 'genuína', 'glória', 'homem', 'humana', 'há', 'mais', 'no', 'o', 'que', 'sua', 'verdadeiramente'] >>> Let's iterate over each item in ``psent2``, and print information for each: >>> for w in psent2: ... print(w, len(w), w[-1]) ... Não 3 o consultes 9 s dicionários 11 s . 1 . Observe how we make a human-readable version of a string, using ``decode()``. Also notice that we accessed the last character of a string ``w`` using ``w[-1]``. We just saw a ``for`` loop above. Another useful control structure is a *list comprehension*. 
>>> [w.upper() for w in psent2] ['N\xc3O', 'CONSULTES', 'DICION\xc1RIOS', '.'] >>> [w for w in psent1 if w.endswith('a')] ['da', 'gl\xf3ria', 'era', 'a', 'coisa', 'humana', 'a', 'sua', 'genu\xedna'] >>> [w for w in ptext4 if len(w) > 15] ['norte-irlandeses', 'pan-nacionalismo', 'predominatemente', 'primeiro-ministro', 'primeiro-ministro', 'irlandesa-americana', 'responsabilidades', 'significativamente'] We can examine the relative frequency of words in a text, using ``FreqDist``: >>> fd1 = FreqDist(ptext1) >>> fd1 >>> fd1['olhos'] 137 >>> fd1.max() ',' >>> fd1.samples()[:100] [',', '.', 'a', 'que', 'de', 'e', '-', 'o', ';', 'me', 'um', 'n\xe3o', '\x97', 'se', 'do', 'da', 'uma', 'com', 'os', '\xe9', 'era', 'as', 'eu', 'lhe', 'ao', 'em', 'para', 'mas', '...', '!', '\xe0', 'na', 'mais', '?', 'no', 'como', 'por', 'N\xe3o', 'dos', 'o', 'ele', ':', 'Virg\xedlia', 'me', 'disse', 'minha', 'das', 'O', '/', 'A', 'CAP\xcdTULO', 'muito', 'depois', 'coisa', 'foi', 'sem', 'olhos', 'ela', 'nos', 'tinha', 'nem', 'E', 'outro', 'vida', 'nada', 'tempo', 'menos', 'outra', 'casa', 'homem', 'porque', 'quando', 'mim', 'mesmo', 'ser', 'pouco', 'estava', 'dia', 't\xe3o', 'tudo', 'Mas', 'at\xe9', 'D', 'ainda', 's\xf3', 'alguma', 'la', 'vez', 'anos', 'h\xe1', 'Era', 'pai', 'esse', 'lo', 'dizer', 'assim', 'ent\xe3o', 'dizia', 'aos', 'Borba'] --------------- Reading Corpora --------------- Accessing the Machado Text Corpus --------------------------------- NLTK includes the complete works of Machado de Assis. >>> from nltk.corpus import machado >>> machado.fileids() ['contos/macn001.txt', 'contos/macn002.txt', 'contos/macn003.txt', ...] Each file corresponds to one of the works of Machado de Assis. To see a complete list of works, you can look at the corpus README file: ``print machado.readme()``. Let's access the text of the *Posthumous Memories of Brás Cubas*. We can access the text as a list of characters, and access 200 characters starting from position 10,000. >>> raw_text = machado.raw('romance/marm05.txt') >>> raw_text[10000:10200] u', primou no\nEstado, e foi um dos amigos particulares do vice-rei Conde da Cunha.\n\nComo este apelido de Cubas lhe\ncheirasse excessivamente a tanoaria, alegava meu pai, bisneto de Dami\xe3o, que o\ndito ape' However, this is not a very useful way to work with a text. We generally think of a text as a sequence of words and punctuation, not characters: >>> text1 = machado.words('romance/marm05.txt') >>> text1 ['Romance', ',', 'Mem\xf3rias', 'P\xf3stumas', 'de', ...] >>> len(text1) 77098 >>> len(set(text1)) 10848 Here's a program that finds the most common ngrams that contain a particular target word. >>> from nltk import ngrams, FreqDist >>> target_word = 'olhos' >>> fd = FreqDist(ng ... for ng in ngrams(text1, 5) ... if target_word in ng) >>> for hit in fd.samples(): ... print(' '.join(hit)) ... , com os olhos no com os olhos no ar com os olhos no chão e todos com os olhos me estar com os olhos os olhos estúpidos , a os olhos na costura , os olhos no ar , , com os olhos espetados , com os olhos estúpidos , com os olhos fitos , com os olhos naquele , com os olhos para Accessing the MacMorpho Tagged Corpus ------------------------------------- NLTK includes the MAC-MORPHO Brazilian Portuguese POS-tagged news text, with over a million words of journalistic texts extracted from ten sections of the daily newspaper *Folha de Sao Paulo*, 1994. 
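If the corpus is not already installed, it can usually be fetched with the NLTK downloader first (a minimal sketch; the package identifier ``mac_morpho`` is assumed here, and the call is skipped in the doctest):

    >>> import nltk
    >>> nltk.download('mac_morpho')  # doctest: +SKIP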
We can access this corpus as a sequence of words or tagged words as follows: >>> import nltk.corpus >>> nltk.corpus.mac_morpho.words() ['Jersei', 'atinge', 'm\xe9dia', 'de', 'Cr$', '1,4', ...] >>> nltk.corpus.mac_morpho.sents() [['Jersei', 'atinge', 'm\xe9dia', 'de', 'Cr$', '1,4', 'milh\xe3o', 'em', 'a', 'venda', 'de', 'a', 'Pinhal', 'em', 'S\xe3o', 'Paulo'], ['Programe', 'sua', 'viagem', 'a', 'a', 'Exposi\xe7\xe3o', 'Nacional', 'do', 'Zeb', ',', 'que', 'come\xe7a', 'dia', '25'], ...] >>> nltk.corpus.mac_morpho.tagged_words() [('Jersei', 'N'), ('atinge', 'V'), ('m\xe9dia', 'N'), ...] We can also access it in sentence chunks. >>> nltk.corpus.mac_morpho.tagged_sents() [[('Jersei', 'N'), ('atinge', 'V'), ('m\xe9dia', 'N'), ('de', 'PREP'), ('Cr$', 'CUR'), ('1,4', 'NUM'), ('milh\xe3o', 'N'), ('em', 'PREP|+'), ('a', 'ART'), ('venda', 'N'), ('de', 'PREP|+'), ('a', 'ART'), ('Pinhal', 'NPROP'), ('em', 'PREP'), ('S\xe3o', 'NPROP'), ('Paulo', 'NPROP')], [('Programe', 'V'), ('sua', 'PROADJ'), ('viagem', 'N'), ('a', 'PREP|+'), ('a', 'ART'), ('Exposi\xe7\xe3o', 'NPROP'), ('Nacional', 'NPROP'), ('do', 'NPROP'), ('Zeb', 'NPROP'), (',', ','), ('que', 'PRO-KS-REL'), ('come\xe7a', 'V'), ('dia', 'N'), ('25', 'N|AP')], ...] This data can be used to train taggers (examples below for the Floresta treebank). Accessing the Floresta Portuguese Treebank ------------------------------------------ The NLTK data distribution includes the "Floresta Sinta(c)tica Corpus" version 7.4, available from ``https://www.linguateca.pt/Floresta/``. We can access this corpus as a sequence of words or tagged words as follows: >>> from nltk.corpus import floresta >>> floresta.words() ['Um', 'revivalismo', 'refrescante', 'O', '7_e_Meio', ...] >>> floresta.tagged_words() [('Um', '>N+art'), ('revivalismo', 'H+n'), ...] The tags consist of some syntactic information, followed by a plus sign, followed by a conventional part-of-speech tag. Let's strip off the material before the plus sign: >>> def simplify_tag(t): ... if "+" in t: ... return t[t.index("+")+1:] ... else: ... return t >>> twords = floresta.tagged_words() >>> twords = [(w.lower(), simplify_tag(t)) for (w,t) in twords] >>> twords[:10] [('um', 'art'), ('revivalismo', 'n'), ('refrescante', 'adj'), ('o', 'art'), ('7_e_meio', 'prop'), ('\xe9', 'v-fin'), ('um', 'art'), ('ex-libris', 'n'), ('de', 'prp'), ('a', 'art')] Pretty printing the tagged words: >>> print(' '.join(word + '/' + tag for (word, tag) in twords[:10])) um/art revivalismo/n refrescante/adj o/art 7_e_meio/prop é/v-fin um/art ex-libris/n de/prp a/art Count the word tokens and types, and determine the most common word: >>> words = floresta.words() >>> len(words) 211852 >>> fd = nltk.FreqDist(words) >>> len(fd) 29421 >>> fd.max() 'de' List the 20 most frequent tags, in order of decreasing frequency: >>> tags = [simplify_tag(tag) for (word,tag) in floresta.tagged_words()] >>> fd = nltk.FreqDist(tags) >>> fd.keys()[:20] ['n', 'prp', 'art', 'v-fin', ',', 'prop', 'adj', 'adv', '.', 'conj-c', 'v-inf', 'pron-det', 'v-pcp', 'num', 'pron-indp', 'pron-pers', '\xab', '\xbb', 'conj-s', '}'] We can also access the corpus grouped by sentence: >>> floresta.sents() [['Um', 'revivalismo', 'refrescante'], ['O', '7_e_Meio', '\xe9', 'um', 'ex-libris', 'de', 'a', 'noite', 'algarvia', '.'], ...] 
>>> floresta.tagged_sents() [[('Um', '>N+art'), ('revivalismo', 'H+n'), ('refrescante', 'N<+adj')], [('O', '>N+art'), ('7_e_Meio', 'H+prop'), ('\xe9', 'P+v-fin'), ('um', '>N+art'), ('ex-libris', 'H+n'), ('de', 'H+prp'), ('a', '>N+art'), ('noite', 'H+n'), ('algarvia', 'N<+adj'), ('.', '.')], ...] >>> floresta.parsed_sents() [Tree('UTT+np', [Tree('>N+art', ['Um']), Tree('H+n', ['revivalismo']), Tree('N<+adj', ['refrescante'])]), Tree('STA+fcl', [Tree('SUBJ+np', [Tree('>N+art', ['O']), Tree('H+prop', ['7_e_Meio'])]), Tree('P+v-fin', ['\xe9']), Tree('SC+np', [Tree('>N+art', ['um']), Tree('H+n', ['ex-libris']), Tree('N<+pp', [Tree('H+prp', ['de']), Tree('P<+np', [Tree('>N+art', ['a']), Tree('H+n', ['noite']), Tree('N<+adj', ['algarvia'])])])]), Tree('.', ['.'])]), ...] To view a parse tree, use the ``draw()`` method, e.g.: >>> psents = floresta.parsed_sents() >>> psents[5].draw() # doctest: +SKIP Character Encodings ------------------- Python understands the common character encoding used for Portuguese, ISO 8859-1 (ISO Latin 1). >>> import os, nltk.test >>> testdir = os.path.split(nltk.test.__file__)[0] >>> text = open(os.path.join(testdir, 'floresta.txt'), 'rb').read().decode('ISO 8859-1') >>> text[:60] 'O 7 e Meio \xe9 um ex-libris da noite algarvia.\n\xc9 uma das mais ' >>> print(text[:60]) O 7 e Meio é um ex-libris da noite algarvia. É uma das mais For more information about character encodings and Python, please see section 3.3 of the book. ---------------- Processing Tasks ---------------- Simple Concordancing -------------------- Here's a function that takes a word and a specified amount of context (measured in characters), and generates a concordance for that word. >>> def concordance(word, context=30): ... for sent in floresta.sents(): ... if word in sent: ... pos = sent.index(word) ... left = ' '.join(sent[:pos]) ... right = ' '.join(sent[pos+1:]) ... print('%*s %s %-*s' % ... (context, left[-context:], word, context, right[:context])) >>> concordance("dar") # doctest: +SKIP anduru , foi o suficiente para dar a volta a o resultado . 1. O P?BLICO veio dar a a imprensa di?ria portuguesa A fartura de pensamento pode dar maus resultados e n?s n?o quer Come?a a dar resultados a pol?tica de a Uni ial come?ar a incorporar- lo e dar forma a um ' site ' que tem se r com Constantino para ele lhe dar tamb?m os pap?is assinados . va a brincar , pois n?o lhe ia dar procura??o nenhuma enquanto n? ?rica como o ant?doto capaz de dar sentido a o seu enorme poder . . . . >>> concordance("vender") # doctest: +SKIP er recebido uma encomenda para vender 4000 blindados a o Iraque . m?rico_Amorim caso conseguisse vender o lote de ac??es de o empres?r mpre ter jovens simp?ticos a ? vender ? chega ! } Disse que o governo vai vender ? desde autom?vel at? particip ndiciou ontem duas pessoas por vender carro com ?gio . A inten??o de Fleury ? vender as a??es para equilibrar as fi Part-of-Speech Tagging ---------------------- Let's begin by getting the tagged sentence data, and simplifying the tags as described earlier. >>> from nltk.corpus import floresta >>> tsents = floresta.tagged_sents() >>> tsents = [[(w.lower(),simplify_tag(t)) for (w,t) in sent] for sent in tsents if sent] >>> train = tsents[100:] >>> test = tsents[:100] We already know that ``n`` is the most common tag, so we can set up a default tagger that tags every word as a noun, and see how well it does: >>> tagger0 = nltk.DefaultTagger('n') >>> nltk.tag.accuracy(tagger0, test) 0.17697228144989338 Evidently, about one in every six words is a noun. 
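As a quick illustration of what this baseline actually does, we can tag a made-up token list by hand; every token simply receives the ``n`` tag (the example sentence is invented for illustration):

    >>> tagger0.tag('as meninas correram'.split())
    [('as', 'n'), ('meninas', 'n'), ('correram', 'n')]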
Let's improve on this by training a unigram tagger: >>> tagger1 = nltk.UnigramTagger(train, backoff=tagger0) >>> nltk.tag.accuracy(tagger1, test) 0.87029140014214645 Next a bigram tagger: >>> tagger2 = nltk.BigramTagger(train, backoff=tagger1) >>> nltk.tag.accuracy(tagger2, test) 0.89019189765458417 Sentence Segmentation --------------------- Punkt is a language-neutral sentence segmentation tool. We >>> sent_tokenizer=nltk.data.load('tokenizers/punkt/portuguese.pickle') >>> raw_text = machado.raw('romance/marm05.txt') >>> sentences = sent_tokenizer.tokenize(raw_text) >>> for sent in sentences[1000:1005]: ... print("<<", sent, ">>") ... << Em verdade, parecia ainda mais mulher do que era; seria criança nos seus folgares de moça; mas assim quieta, impassível, tinha a compostura da mulher casada. >> << Talvez essa circunstância lhe diminuía um pouco da graça virginal. >> << Depressa nos familiarizamos; a mãe fazia-lhe grandes elogios, eu escutava-os de boa sombra, e ela sorria com os olhos fúlgidos, como se lá dentro do cérebro lhe estivesse a voar uma borboletinha de asas de ouro e olhos de diamante... >> << Digo lá dentro, porque cá fora o que esvoaçou foi uma borboleta preta, que subitamente penetrou na varanda, e começou a bater as asas em derredor de D. Eusébia. >> << D. Eusébia deu um grito, levantou-se, praguejou umas palavras soltas: - T'esconjuro!... >> The sentence tokenizer can be trained and evaluated on other text. The source text (from the Floresta Portuguese Treebank) contains one sentence per line. We read the text, split it into its lines, and then join these lines together using spaces. Now the information about sentence breaks has been discarded. We split this material into training and testing data: >>> import os, nltk.test >>> testdir = os.path.split(nltk.test.__file__)[0] >>> text = open(os.path.join(testdir, 'floresta.txt'), 'rb').read().decode('ISO-8859-1') >>> lines = text.split('\n') >>> train = ' '.join(lines[10:]) >>> test = ' '.join(lines[:10]) Now we train the sentence segmenter (or sentence tokenizer) and use it on our test sentences: >>> stok = nltk.PunktSentenceTokenizer(train) >>> print(stok.tokenize(test)) ['O 7 e Meio \xe9 um ex-libris da noite algarvia.', '\xc9 uma das mais antigas discotecas do Algarve, situada em Albufeira, que continua a manter os tra\xe7os decorativos e as clientelas de sempre.', '\xc9 um pouco a vers\xe3o de uma esp\xe9cie de \xaboutro lado\xbb da noite, a meio caminho entre os devaneios de uma fauna perif\xe9rica, seja de Lisboa, Londres, Dublin ou Faro e Portim\xe3o, e a postura circunspecta dos fi\xe9is da casa, que dela esperam a m\xfasica \xabgeracionista\xbb dos 60 ou dos 70.', 'N\xe3o deixa de ser, nos tempos que correm, um certo \xabvery typical\xbb algarvio, cabe\xe7a de cartaz para os que querem fugir a algumas movimenta\xe7\xf5es nocturnas j\xe1 a caminho da ritualiza\xe7\xe3o de massas, do g\xe9nero \xabvamos todos ao Calypso e encontramo-nos na Locomia\xbb.', 'E assim, aos 2,5 milh\xf5es que o Minist\xe9rio do Planeamento e Administra\xe7\xe3o do Territ\xf3rio j\xe1 gasta no pagamento do pessoal afecto a estes organismos, v\xeam juntar-se os montantes das obras propriamente ditas, que os munic\xedpios, j\xe1 com projectos na m\xe3o, v\xeam reivindicar junto do Executivo, como salienta aquele membro do Governo.', 'E o dinheiro \xabn\xe3o falta s\xf3 \xe0s c\xe2maras\xbb, lembra o secret\xe1rio de Estado, que considera que a solu\xe7\xe3o para as autarquias \xe9 \xabespecializarem-se em fundos comunit\xe1rios\xbb.', 'Mas 
como, se muitas n\xe3o disp\xf5em, nos seus quadros, dos t\xe9cnicos necess\xe1rios?', '\xabEncomendem-nos a projectistas de fora\xbb porque, se as obras vierem a ser financiadas, eles at\xe9 saem de gra\xe7a, j\xe1 que, nesse caso, \xabos fundos comunit\xe1rios pagam os projectos, o mesmo n\xe3o acontecendo quando eles s\xe3o feitos pelos GAT\xbb, dado serem organismos do Estado.', 'Essa poder\xe1 vir a ser uma hip\xf3tese, at\xe9 porque, no terreno, a capacidade dos GAT est\xe1 cada vez mais enfraquecida.', 'Alguns at\xe9 j\xe1 desapareceram, como o de Castro Verde, e outros t\xeam vindo a perder quadros.'] NLTK's data collection includes a trained model for Portuguese sentence segmentation, which can be loaded as follows. It is faster to load a trained model than to retrain it. >>> stok = nltk.data.load('tokenizers/punkt/portuguese.pickle') Stemming -------- NLTK includes the RSLP Portuguese stemmer. Here we use it to stem some Portuguese text: >>> stemmer = nltk.stem.RSLPStemmer() >>> stemmer.stem("copiar") 'copi' >>> stemmer.stem("paisagem") 'pais' Stopwords --------- NLTK includes Portuguese stopwords: >>> stopwords = nltk.corpus.stopwords.words('portuguese') >>> stopwords[:10] ['a', 'ao', 'aos', 'aquela', 'aquelas', 'aquele', 'aqueles', 'aquilo', 'as', 'at\xe9'] Now we can use these to filter text. Let's find the most frequent words (other than stopwords) and print them in descending order of frequency: >>> fd = nltk.FreqDist(w.lower() for w in floresta.words() if w not in stopwords) >>> for word in list(fd.keys())[:20]: ... print(word, fd[word]) , 13444 . 7725 « 2369 » 2310 é 1305 o 1086 } 1047 { 1044 a 897 ; 633 em 516 ser 466 sobre 349 os 313 anos 301 ontem 292 ainda 279 segundo 256 ter 249 dois 231 nltk-3.7/nltk/test/portuguese_en_fixt.py000066400000000000000000000001761420073152400205540ustar00rootroot00000000000000def setup_module(): import pytest pytest.skip("portuguese_en.doctest imports nltk.examples.pt which doesn't exist!") nltk-3.7/nltk/test/probability.doctest000066400000000000000000000213521420073152400201720ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. For license information, see LICENSE.TXT =========== Probability =========== >>> from nltk.test.probability_fixt import setup_module >>> setup_module() >>> import nltk >>> from nltk.probability import * FreqDist -------- >>> text1 = ['no', 'good', 'fish', 'goes', 'anywhere', 'without', 'a', 'porpoise', '!'] >>> text2 = ['no', 'good', 'porpoise', 'likes', 'to', 'fish', 'fish', 'anywhere', '.'] >>> fd1 = nltk.FreqDist(text1) >>> fd1 == nltk.FreqDist(text1) True Note that items are sorted in order of decreasing frequency; two items of the same frequency appear in indeterminate order. 
>>> import itertools >>> both = nltk.FreqDist(text1 + text2) >>> both_most_common = both.most_common() >>> list(itertools.chain(*(sorted(ys) for k, ys in itertools.groupby(both_most_common, key=lambda t: t[1])))) [('fish', 3), ('anywhere', 2), ('good', 2), ('no', 2), ('porpoise', 2), ('!', 1), ('.', 1), ('a', 1), ('goes', 1), ('likes', 1), ('to', 1), ('without', 1)] >>> both == fd1 + nltk.FreqDist(text2) True >>> fd1 == nltk.FreqDist(text1) # But fd1 is unchanged True >>> fd2 = nltk.FreqDist(text2) >>> fd1.update(fd2) >>> fd1 == both True >>> fd1 = nltk.FreqDist(text1) >>> fd1.update(text2) >>> fd1 == both True >>> fd1 = nltk.FreqDist(text1) >>> fd2 = nltk.FreqDist(fd1) >>> fd2 == fd1 True ``nltk.FreqDist`` can be pickled: >>> import pickle >>> fd1 = nltk.FreqDist(text1) >>> pickled = pickle.dumps(fd1) >>> fd1 == pickle.loads(pickled) True Mathematical operations: >>> FreqDist('abbb') + FreqDist('bcc') FreqDist({'b': 4, 'c': 2, 'a': 1}) >>> FreqDist('abbbc') - FreqDist('bccd') FreqDist({'b': 2, 'a': 1}) >>> FreqDist('abbb') | FreqDist('bcc') FreqDist({'b': 3, 'c': 2, 'a': 1}) >>> FreqDist('abbb') & FreqDist('bcc') FreqDist({'b': 1}) ConditionalFreqDist ------------------- >>> cfd1 = ConditionalFreqDist() >>> cfd1[1] = FreqDist('abbbb') >>> cfd1[2] = FreqDist('xxxxyy') >>> cfd1 >>> cfd2 = ConditionalFreqDist() >>> cfd2[1] = FreqDist('bbccc') >>> cfd2[2] = FreqDist('xxxyyyzz') >>> cfd2[3] = FreqDist('m') >>> cfd2 >>> r = cfd1 + cfd2 >>> [(i,r[i]) for i in r.conditions()] [(1, FreqDist({'b': 6, 'c': 3, 'a': 1})), (2, FreqDist({'x': 7, 'y': 5, 'z': 2})), (3, FreqDist({'m': 1}))] >>> r = cfd1 - cfd2 >>> [(i,r[i]) for i in r.conditions()] [(1, FreqDist({'b': 2, 'a': 1})), (2, FreqDist({'x': 1}))] >>> r = cfd1 | cfd2 >>> [(i,r[i]) for i in r.conditions()] [(1, FreqDist({'b': 4, 'c': 3, 'a': 1})), (2, FreqDist({'x': 4, 'y': 3, 'z': 2})), (3, FreqDist({'m': 1}))] >>> r = cfd1 & cfd2 >>> [(i,r[i]) for i in r.conditions()] [(1, FreqDist({'b': 2})), (2, FreqDist({'x': 3, 'y': 2}))] Testing some HMM estimators --------------------------- We extract a small part (500 sentences) of the Brown corpus >>> corpus = nltk.corpus.brown.tagged_sents(categories='adventure')[:500] >>> print(len(corpus)) 500 We create a HMM trainer - note that we need the tags and symbols from the whole corpus, not just the training corpus >>> from nltk.util import unique_list >>> tag_set = unique_list(tag for sent in corpus for (word,tag) in sent) >>> print(len(tag_set)) 92 >>> symbols = unique_list(word for sent in corpus for (word,tag) in sent) >>> print(len(symbols)) 1464 >>> trainer = nltk.tag.HiddenMarkovModelTrainer(tag_set, symbols) We divide the corpus into 90% training and 10% testing >>> train_corpus = [] >>> test_corpus = [] >>> for i in range(len(corpus)): ... if i % 10: ... train_corpus += [corpus[i]] ... else: ... test_corpus += [corpus[i]] >>> print(len(train_corpus)) 450 >>> print(len(test_corpus)) 50 And now we can test the estimators >>> def train_and_test(est): ... hmm = trainer.train_supervised(train_corpus, estimator=est) ... 
print('%.2f%%' % (100 * hmm.accuracy(test_corpus))) Maximum Likelihood Estimation ----------------------------- - this resulted in an initialization error before r7209 >>> mle = lambda fd, bins: MLEProbDist(fd) >>> train_and_test(mle) 22.75% Laplace (= Lidstone with gamma==1) >>> train_and_test(LaplaceProbDist) 66.04% Expected Likelihood Estimation (= Lidstone with gamma==0.5) >>> train_and_test(ELEProbDist) 73.01% Lidstone Estimation, for gamma==0.1, 0.5 and 1 (the later two should be exactly equal to MLE and ELE above) >>> def lidstone(gamma): ... return lambda fd, bins: LidstoneProbDist(fd, gamma, bins) >>> train_and_test(lidstone(0.1)) 82.51% >>> train_and_test(lidstone(0.5)) 73.01% >>> train_and_test(lidstone(1.0)) 66.04% Witten Bell Estimation ---------------------- - This resulted in ZeroDivisionError before r7209 >>> train_and_test(WittenBellProbDist) 88.12% Good Turing Estimation >>> gt = lambda fd, bins: SimpleGoodTuringProbDist(fd, bins=1e5) >>> train_and_test(gt) 86.93% Kneser Ney Estimation --------------------- Since the Kneser-Ney distribution is best suited for trigrams, we must adjust our testing accordingly. >>> corpus = [[((x[0],y[0],z[0]),(x[1],y[1],z[1])) ... for x, y, z in nltk.trigrams(sent)] ... for sent in corpus[:100]] We will then need to redefine the rest of the training/testing variables >>> tag_set = unique_list(tag for sent in corpus for (word,tag) in sent) >>> len(tag_set) 906 >>> symbols = unique_list(word for sent in corpus for (word,tag) in sent) >>> len(symbols) 1341 >>> trainer = nltk.tag.HiddenMarkovModelTrainer(tag_set, symbols) >>> train_corpus = [] >>> test_corpus = [] >>> for i in range(len(corpus)): ... if i % 10: ... train_corpus += [corpus[i]] ... else: ... test_corpus += [corpus[i]] >>> len(train_corpus) 90 >>> len(test_corpus) 10 >>> kn = lambda fd, bins: KneserNeyProbDist(fd) >>> train_and_test(kn) 0.86% Remains to be added: - Tests for HeldoutProbDist, CrossValidationProbDist and MutableProbDist Squashed bugs ------------- Issue 511: override pop and popitem to invalidate the cache >>> fd = nltk.FreqDist('a') >>> list(fd.keys()) ['a'] >>> fd.pop('a') 1 >>> list(fd.keys()) [] Issue 533: access cumulative frequencies with no arguments >>> fd = nltk.FreqDist('aab') >>> list(fd._cumulative_frequencies(['a'])) [2.0] >>> list(fd._cumulative_frequencies(['a', 'b'])) [2.0, 3.0] Issue 579: override clear to reset some variables >>> fd = FreqDist('aab') >>> fd.clear() >>> fd.N() 0 Issue 351: fix fileids method of CategorizedCorpusReader to inadvertently add errant categories >>> from nltk.corpus import brown >>> brown.fileids('blah') Traceback (most recent call last): ... ValueError: Category blah not found >>> brown.categories() ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction'] Issue 175: add the unseen bin to SimpleGoodTuringProbDist by default otherwise any unseen events get a probability of zero, i.e., they don't get smoothed >>> from nltk import SimpleGoodTuringProbDist, FreqDist >>> fd = FreqDist({'a':1, 'b':1, 'c': 2, 'd': 3, 'e': 4, 'f': 4, 'g': 4, 'h': 5, 'i': 5, 'j': 6, 'k': 6, 'l': 6, 'm': 7, 'n': 7, 'o': 8, 'p': 9, 'q': 10}) >>> p = SimpleGoodTuringProbDist(fd) >>> p.prob('a') 0.017649766667026317... >>> p.prob('o') 0.08433050215340411... >>> p.prob('z') 0.022727272727272728... >>> p.prob('foobar') 0.022727272727272728... 
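Both unseen samples above fall into the same unseen bin, so they receive exactly the same smoothed, non-zero probability; this follows directly from the two outputs shown:

    >>> p.prob('z') == p.prob('foobar')
    True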
``MLEProbDist``, ``ConditionalProbDist'', ``DictionaryConditionalProbDist`` and ``ConditionalFreqDist`` can be pickled: >>> import pickle >>> pd = MLEProbDist(fd) >>> sorted(pd.samples()) == sorted(pickle.loads(pickle.dumps(pd)).samples()) True >>> dpd = DictionaryConditionalProbDist({'x': pd}) >>> unpickled = pickle.loads(pickle.dumps(dpd)) >>> dpd['x'].prob('a') 0.011363636... >>> dpd['x'].prob('a') == unpickled['x'].prob('a') True >>> cfd = nltk.probability.ConditionalFreqDist() >>> cfd['foo']['hello'] += 1 >>> cfd['foo']['hello'] += 1 >>> cfd['bar']['hello'] += 1 >>> cfd2 = pickle.loads(pickle.dumps(cfd)) >>> cfd2 == cfd True >>> cpd = ConditionalProbDist(cfd, SimpleGoodTuringProbDist) >>> cpd2 = pickle.loads(pickle.dumps(cpd)) >>> cpd['foo'].prob('hello') == cpd2['foo'].prob('hello') True nltk-3.7/nltk/test/probability_fixt.py000066400000000000000000000002641420073152400202060ustar00rootroot00000000000000# probability.doctest uses HMM which requires numpy; # skip probability.doctest if numpy is not available def setup_module(): import pytest pytest.importorskip("numpy") nltk-3.7/nltk/test/propbank.doctest000066400000000000000000000145661420073152400174770ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. For license information, see LICENSE.TXT ======== PropBank ======== The PropBank Corpus provides predicate-argument annotation for the entire Penn Treebank. Each verb in the treebank is annotated by a single instance in PropBank, containing information about the location of the verb, and the location and identity of its arguments: >>> from nltk.corpus import propbank >>> pb_instances = propbank.instances() >>> print(pb_instances) [, , ...] Each propbank instance defines the following member variables: - Location information: `fileid`, `sentnum`, `wordnum` - Annotator information: `tagger` - Inflection information: `inflection` - Roleset identifier: `roleset` - Verb (aka predicate) location: `predicate` - Argument locations and types: `arguments` The following examples show the types of these arguments: >>> inst = pb_instances[103] >>> (inst.fileid, inst.sentnum, inst.wordnum) ('wsj_0004.mrg', 8, 16) >>> inst.tagger 'gold' >>> inst.inflection >>> infl = inst.inflection >>> infl.form, infl.tense, infl.aspect, infl.person, infl.voice ('v', 'p', '-', '-', 'a') >>> inst.roleset 'rise.01' >>> inst.predicate PropbankTreePointer(16, 0) >>> inst.arguments ((PropbankTreePointer(0, 2), 'ARG1'), (PropbankTreePointer(13, 1), 'ARGM-DIS'), (PropbankTreePointer(17, 1), 'ARG4-to'), (PropbankTreePointer(20, 1), 'ARG3-from')) The location of the predicate and of the arguments are encoded using `PropbankTreePointer` objects, as well as `PropbankChainTreePointer` objects and `PropbankSplitTreePointer` objects. A `PropbankTreePointer` consists of a `wordnum` and a `height`: >>> print(inst.predicate.wordnum, inst.predicate.height) 16 0 This identifies the tree constituent that is headed by the word that is the `wordnum`\ 'th token in the sentence, and whose span is found by going `height` nodes up in the tree. This type of pointer is only useful if we also have the corresponding tree structure, since it includes empty elements such as traces in the word number count. The trees for 10% of the standard PropBank Corpus are contained in the `treebank` corpus: >>> tree = inst.tree >>> from nltk.corpus import treebank >>> assert tree == treebank.parsed_sents(inst.fileid)[inst.sentnum] >>> inst.predicate.select(tree) Tree('VBD', ['rose']) >>> for (argloc, argid) in inst.arguments: ... 
print('%-10s %s' % (argid, argloc.select(tree).pformat(500)[:50])) ARG1 (NP-SBJ (NP (DT The) (NN yield)) (PP (IN on) (NP ( ARGM-DIS (PP (IN for) (NP (NN example))) ARG4-to (PP-DIR (TO to) (NP (CD 8.04) (NN %))) ARG3-from (PP-DIR (IN from) (NP (CD 7.90) (NN %))) Propbank tree pointers can be converted to standard tree locations, which are usually easier to work with, using the `treepos()` method: >>> treepos = inst.predicate.treepos(tree) >>> print (treepos, tree[treepos]) (4, 0) (VBD rose) In some cases, argument locations will be encoded using `PropbankChainTreePointer`\ s (for trace chains) or `PropbankSplitTreePointer`\ s (for discontinuous constituents). Both of these objects contain a single member variable, `pieces`, containing a list of the constituent pieces. They also define the method `select()`, which will return a tree containing all the elements of the argument. (A new head node is created, labeled "*CHAIN*" or "*SPLIT*", since the argument is not a single constituent in the original tree). Sentence #6 contains an example of an argument that is both discontinuous and contains a chain: >>> inst = pb_instances[6] >>> inst.roleset 'expose.01' >>> argloc, argid = inst.arguments[2] >>> argloc >>> argloc.pieces [, PropbankTreePointer(27, 0)] >>> argloc.pieces[0].pieces ... [PropbankTreePointer(22, 1), PropbankTreePointer(24, 0), PropbankTreePointer(25, 1)] >>> print(argloc.select(inst.tree)) (*CHAIN* (*SPLIT* (NP (DT a) (NN group)) (IN of) (NP (NNS workers))) (-NONE- *)) The PropBank Corpus also provides access to the frameset files, which define the argument labels used by the annotations, on a per-verb basis. Each frameset file contains one or more predicates, such as 'turn' or 'turn_on', each of which is divided into coarse-grained word senses called rolesets. For each roleset, the frameset file provides descriptions of the argument roles, along with examples. >>> expose_01 = propbank.roleset('expose.01') >>> turn_01 = propbank.roleset('turn.01') >>> print(turn_01) >>> for role in turn_01.findall("roles/role"): ... print(role.attrib['n'], role.attrib['descr']) 0 turner 1 thing turning m direction, location >>> from xml.etree import ElementTree >>> print(ElementTree.tostring(turn_01.find('example')).decode('utf8').strip()) John turned the key in the lock. John turned the key in the lock Note that the standard corpus distribution only contains 10% of the treebank, so the parse trees are not available for instances starting at 9353: >>> inst = pb_instances[9352] >>> inst.fileid 'wsj_0199.mrg' >>> print(inst.tree) (S (NP-SBJ (NNP Trinity)) (VP (VBD said) (SBAR (-NONE- 0) ...)) >>> print(inst.predicate.select(inst.tree)) (VB begin) >>> inst = pb_instances[9353] >>> inst.fileid 'wsj_0200.mrg' >>> print(inst.tree) None >>> print(inst.predicate.select(inst.tree)) Traceback (most recent call last): . . . ValueError: Parse tree not available However, if you supply your own version of the treebank corpus (by putting it before the nltk-provided version on `nltk.data.path`, or by creating a `ptb` directory as described above and using the `propbank_ptb` module), then you can access the trees for all instances. A list of the verb lemmas contained in PropBank is returned by the `propbank.verbs()` method: >>> propbank.verbs() ['abandon', 'abate', 'abdicate', 'abet', 'abide', ...] 
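A common follow-up task is to see which rolesets are most frequent among the annotated instances. Here is a minimal sketch using the instances loaded above (the actual counts depend on the installed corpus, so the output is skipped):

    >>> from collections import Counter
    >>> roleset_counts = Counter(inst.roleset for inst in pb_instances)
    >>> roleset_counts.most_common(3)  # doctest: +SKIP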
nltk-3.7/nltk/test/pytest.ini000066400000000000000000000011601420073152400163070ustar00rootroot00000000000000[pytest] doctest_optionflags=ELLIPSIS NORMALIZE_WHITESPACE IGNORE_EXCEPTION_DETAIL # --doctest-continue-on-failure allows the test to always # get to the teardown, if it exists addopts = --doctest-glob *.doctest --doctest-continue-on-failure --color=yes --code-highlight=yes # Other options for creating valid teardowns for doctests: # 1. Turn doctests into unittest tests # - https://docs.python.org/3/library/doctest.html#unittest-api # 2. Use sphinx doctests # - https://www.sphinx-doc.org/en/master/usage/extensions/doctest.html # 3. Convert the tests that require setup/teardown into pytest tests with fixtures nltk-3.7/nltk/test/relextract.doctest000066400000000000000000000220571420073152400200320ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. For license information, see LICENSE.TXT ====================== Information Extraction ====================== Information Extraction standardly consists of three subtasks: #. Named Entity Recognition #. Relation Extraction #. Template Filling Named Entities ~~~~~~~~~~~~~~ The IEER corpus is marked up for a variety of Named Entities. A `Named Entity`:dt: (more strictly, a Named Entity mention) is a name of an entity belonging to a specified class. For example, the Named Entity classes in IEER include PERSON, LOCATION, ORGANIZATION, DATE and so on. Within NLTK, Named Entities are represented as subtrees within a chunk structure: the class name is treated as node label, while the entity mention itself appears as the leaves of the subtree. This is illustrated below, where we have show an extract of the chunk representation of document NYT_19980315.064: >>> from nltk.corpus import ieer >>> docs = ieer.parsed_docs('NYT_19980315') >>> tree = docs[1].text >>> print(tree) (DOCUMENT ... ``It's a chance to think about first-level questions,'' said Ms. (PERSON Cohn) , a partner in the (ORGANIZATION McGlashan & Sarrail) firm in (LOCATION San Mateo) , (LOCATION Calif.) ...) Thus, the Named Entity mentions in this example are *Cohn*, *McGlashan & Sarrail*, *San Mateo* and *Calif.*. The CoNLL2002 Dutch and Spanish data is treated similarly, although in this case, the strings are also POS tagged. >>> from nltk.corpus import conll2002 >>> for doc in conll2002.chunked_sents('ned.train')[27]: ... print(doc) ('Het', 'Art') (ORG Hof/N van/Prep Cassatie/N) ('verbrak', 'V') ('het', 'Art') ('arrest', 'N') ('zodat', 'Conj') ('het', 'Pron') ('moest', 'V') ('worden', 'V') ('overgedaan', 'V') ('door', 'Prep') ('het', 'Art') ('hof', 'N') ('van', 'Prep') ('beroep', 'N') ('van', 'Prep') (LOC Antwerpen/N) ('.', 'Punc') Relation Extraction ~~~~~~~~~~~~~~~~~~~ Relation Extraction standardly consists of identifying specified relations between Named Entities. For example, assuming that we can recognize ORGANIZATIONs and LOCATIONs in text, we might want to also recognize pairs *(o, l)* of these kinds of entities such that *o* is located in *l*. The `sem.relextract` module provides some tools to help carry out a simple version of this task. The `tree2semi_rel()` function splits a chunk document into a list of two-member lists, each of which consists of a (possibly empty) string followed by a `Tree` (i.e., a Named Entity): >>> from nltk.sem import relextract >>> pairs = relextract.tree2semi_rel(tree) >>> for s, tree in pairs[18:22]: ... 
print('("...%s", %s)' % (" ".join(s[-5:]),tree)) ("...about first-level questions,'' said Ms.", (PERSON Cohn)) ("..., a partner in the", (ORGANIZATION McGlashan & Sarrail)) ("...firm in", (LOCATION San Mateo)) ("...,", (LOCATION Calif.)) The function `semi_rel2reldict()` processes triples of these pairs, i.e., pairs of the form ``((string1, Tree1), (string2, Tree2), (string3, Tree3))`` and outputs a dictionary (a `reldict`) in which ``Tree1`` is the subject of the relation, ``string2`` is the filler and ``Tree3`` is the object of the relation. ``string1`` and ``string3`` are stored as left and right context respectively. >>> reldicts = relextract.semi_rel2reldict(pairs) >>> for k, v in sorted(reldicts[0].items()): ... print(k, '=>', v) filler => of messages to their own ``Cyberia'' ... lcon => transactions.'' Each week, they post objclass => ORGANIZATION objsym => white_house objtext => White House rcon => for access to its planned subjclass => CARDINAL subjsym => hundreds subjtext => hundreds untagged_filler => of messages to their own ``Cyberia'' ... The next example shows some of the values for two `reldict`\ s corresponding to the ``'NYT_19980315'`` text extract shown earlier. >>> for r in reldicts[18:20]: ... print('=' * 20) ... print(r['subjtext']) ... print(r['filler']) ... print(r['objtext']) ==================== Cohn , a partner in the McGlashan & Sarrail ==================== McGlashan & Sarrail firm in San Mateo The function `relextract()` allows us to filter the `reldict`\ s according to the classes of the subject and object named entities. In addition, we can specify that the filler text has to match a given regular expression, as illustrated in the next example. Here, we are looking for pairs of entities in the IN relation, where IN has signature . >>> import re >>> IN = re.compile(r'.*\bin\b(?!\b.+ing\b)') >>> for fileid in ieer.fileids(): ... for doc in ieer.parsed_docs(fileid): ... for rel in relextract.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern = IN): ... print(relextract.rtuple(rel)) [ORG: 'Christian Democrats'] ', the leading political forces in' [LOC: 'Italy'] [ORG: 'AP'] ') _ Lebanese guerrillas attacked Israeli forces in southern' [LOC: 'Lebanon'] [ORG: 'Security Council'] 'adopted Resolution 425. Huge yellow banners hung across intersections in' [LOC: 'Beirut'] [ORG: 'U.N.'] 'failures in' [LOC: 'Africa'] [ORG: 'U.N.'] 'peacekeeping operation in' [LOC: 'Somalia'] [ORG: 'U.N.'] 'partners on a more effective role in' [LOC: 'Africa'] [ORG: 'AP'] ') _ A bomb exploded in a mosque in central' [LOC: 'San`a'] [ORG: 'Krasnoye Sormovo'] 'shipyard in the Soviet city of' [LOC: 'Gorky'] [ORG: 'Kelab Golf Darul Ridzuan'] 'in' [LOC: 'Perak'] [ORG: 'U.N.'] 'peacekeeping operation in' [LOC: 'Somalia'] [ORG: 'WHYY'] 'in' [LOC: 'Philadelphia'] [ORG: 'McGlashan & Sarrail'] 'firm in' [LOC: 'San Mateo'] [ORG: 'Freedom Forum'] 'in' [LOC: 'Arlington'] [ORG: 'Brookings Institution'] ', the research group in' [LOC: 'Washington'] [ORG: 'Idealab'] ', a self-described business incubator based in' [LOC: 'Los Angeles'] [ORG: 'Open Text'] ', based in' [LOC: 'Waterloo'] ... The next example illustrates a case where the pattern is a disjunction of roles that a PERSON can occupy in an ORGANIZATION. >>> roles = r""" ... (.*( ... analyst| ... chair(wo)?man| ... commissioner| ... counsel| ... director| ... economist| ... editor| ... executive| ... foreman| ... governor| ... head| ... lawyer| ... leader| ... librarian).*)| ... manager| ... partner| ... president| ... producer| ... 
professor| ... researcher| ... spokes(wo)?man| ... writer| ... ,\sof\sthe?\s* # "X, of (the) Y" ... """ >>> ROLES = re.compile(roles, re.VERBOSE) >>> for fileid in ieer.fileids(): ... for doc in ieer.parsed_docs(fileid): ... for rel in relextract.extract_rels('PER', 'ORG', doc, corpus='ieer', pattern=ROLES): ... print(relextract.rtuple(rel)) [PER: 'Kivutha Kibwana'] ', of the' [ORG: 'National Convention Assembly'] [PER: 'Boban Boskovic'] ', chief executive of the' [ORG: 'Plastika'] [PER: 'Annan'] ', the first sub-Saharan African to head the' [ORG: 'United Nations'] [PER: 'Kiriyenko'] 'became a foreman at the' [ORG: 'Krasnoye Sormovo'] [PER: 'Annan'] ', the first sub-Saharan African to head the' [ORG: 'United Nations'] [PER: 'Mike Godwin'] ', chief counsel for the' [ORG: 'Electronic Frontier Foundation'] ... In the case of the CoNLL2002 data, we can include POS tags in the query pattern. This example also illustrates how the output can be presented as something that looks more like a clause in a logical language. >>> de = """ ... .* ... ( ... de/SP| ... del/SP ... ) ... """ >>> DE = re.compile(de, re.VERBOSE) >>> rels = [rel for doc in conll2002.chunked_sents('esp.train') ... for rel in relextract.extract_rels('ORG', 'LOC', doc, corpus='conll2002', pattern = DE)] >>> for r in rels[:10]: ... print(relextract.clause(r, relsym='DE')) DE('tribunal_supremo', 'victoria') DE('museo_de_arte', 'alcorc\xf3n') DE('museo_de_bellas_artes', 'a_coru\xf1a') DE('siria', 'l\xedbano') DE('uni\xf3n_europea', 'pek\xedn') DE('ej\xe9rcito', 'rogberi') DE('juzgado_de_instrucci\xf3n_n\xfamero_1', 'san_sebasti\xe1n') DE('psoe', 'villanueva_de_la_serena') DE('ej\xe9rcito', 'l\xedbano') DE('juzgado_de_lo_penal_n\xfamero_2', 'ceuta') >>> vnv = """ ... ( ... is/V| ... was/V| ... werd/V| ... wordt/V ... ) ... .* ... van/Prep ... """ >>> VAN = re.compile(vnv, re.VERBOSE) >>> for doc in conll2002.chunked_sents('ned.train'): ... for r in relextract.extract_rels('PER', 'ORG', doc, corpus='conll2002', pattern=VAN): ... print(relextract.clause(r, relsym="VAN")) VAN("cornet_d'elzius", 'buitenlandse_handel') VAN('johan_rottiers', 'kardinaal_van_roey_instituut') VAN('annie_lennox', 'eurythmics') nltk-3.7/nltk/test/resolution.doctest000066400000000000000000000171541420073152400200620ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. 
For license information, see LICENSE.TXT ========================= Resolution Theorem Prover ========================= >>> from nltk.inference.resolution import * >>> from nltk.sem import logic >>> from nltk.sem.logic import * >>> logic._counter._value = 0 >>> read_expr = logic.Expression.fromstring >>> P = read_expr('P') >>> Q = read_expr('Q') >>> R = read_expr('R') >>> A = read_expr('A') >>> B = read_expr('B') >>> x = read_expr('x') >>> y = read_expr('y') >>> z = read_expr('z') ------------------------------- Test most_general_unification() ------------------------------- >>> print(most_general_unification(x, x)) {} >>> print(most_general_unification(A, A)) {} >>> print(most_general_unification(A, x)) {x: A} >>> print(most_general_unification(x, A)) {x: A} >>> print(most_general_unification(x, y)) {x: y} >>> print(most_general_unification(P(x), P(A))) {x: A} >>> print(most_general_unification(P(x,B), P(A,y))) {x: A, y: B} >>> print(most_general_unification(P(x,B), P(B,x))) {x: B} >>> print(most_general_unification(P(x,y), P(A,x))) {x: A, y: x} >>> print(most_general_unification(P(Q(x)), P(y))) {y: Q(x)} ------------ Test unify() ------------ >>> print(Clause([]).unify(Clause([]))) [] >>> print(Clause([P(x)]).unify(Clause([-P(A)]))) [{}] >>> print(Clause([P(A), Q(x)]).unify(Clause([-P(x), R(x)]))) [{R(A), Q(A)}] >>> print(Clause([P(A), Q(x), R(x,y)]).unify(Clause([-P(x), Q(y)]))) [{Q(y), Q(A), R(A,y)}] >>> print(Clause([P(A), -Q(y)]).unify(Clause([-P(x), Q(B)]))) [{}] >>> print(Clause([P(x), Q(x)]).unify(Clause([-P(A), -Q(B)]))) [{-Q(B), Q(A)}, {-P(A), P(B)}] >>> print(Clause([P(x,x), Q(x), R(x)]).unify(Clause([-P(A,z), -Q(B)]))) [{-Q(B), Q(A), R(A)}, {-P(A,z), R(B), P(B,B)}] >>> a = clausify(read_expr('P(A)')) >>> b = clausify(read_expr('A=B')) >>> print(a[0].unify(b[0])) [{P(B)}] ------------------------- Test is_tautology() ------------------------- >>> print(Clause([P(A), -P(A)]).is_tautology()) True >>> print(Clause([-P(A), P(A)]).is_tautology()) True >>> print(Clause([P(x), -P(A)]).is_tautology()) False >>> print(Clause([Q(B), -P(A), P(A)]).is_tautology()) True >>> print(Clause([-Q(A), P(R(A)), -P(R(A)), Q(x), -R(y)]).is_tautology()) True >>> print(Clause([P(x), -Q(A)]).is_tautology()) False ------------------------- Test subsumes() ------------------------- >>> print(Clause([P(A), Q(B)]).subsumes(Clause([P(A), Q(B)]))) True >>> print(Clause([-P(A)]).subsumes(Clause([P(A)]))) False >>> print(Clause([P(A), Q(B)]).subsumes(Clause([Q(B), P(A)]))) True >>> print(Clause([P(A), Q(B)]).subsumes(Clause([Q(B), R(A), P(A)]))) True >>> print(Clause([P(A), R(A), Q(B)]).subsumes(Clause([Q(B), P(A)]))) False >>> print(Clause([P(x)]).subsumes(Clause([P(A)]))) True >>> print(Clause([P(A)]).subsumes(Clause([P(x)]))) True ------------ Test prove() ------------ >>> print(ResolutionProverCommand(read_expr('man(x)')).prove()) False >>> print(ResolutionProverCommand(read_expr('(man(x) -> man(x))')).prove()) True >>> print(ResolutionProverCommand(read_expr('(man(x) -> --man(x))')).prove()) True >>> print(ResolutionProverCommand(read_expr('-(man(x) & -man(x))')).prove()) True >>> print(ResolutionProverCommand(read_expr('(man(x) | -man(x))')).prove()) True >>> print(ResolutionProverCommand(read_expr('(man(x) -> man(x))')).prove()) True >>> print(ResolutionProverCommand(read_expr('-(man(x) & -man(x))')).prove()) True >>> print(ResolutionProverCommand(read_expr('(man(x) | -man(x))')).prove()) True >>> print(ResolutionProverCommand(read_expr('(man(x) -> man(x))')).prove()) True >>> 
print(ResolutionProverCommand(read_expr('(man(x) <-> man(x))')).prove()) True >>> print(ResolutionProverCommand(read_expr('-(man(x) <-> -man(x))')).prove()) True >>> print(ResolutionProverCommand(read_expr('all x.man(x)')).prove()) False >>> print(ResolutionProverCommand(read_expr('-all x.some y.F(x,y) & some x.all y.(-F(x,y))')).prove()) False >>> print(ResolutionProverCommand(read_expr('some x.all y.sees(x,y)')).prove()) False >>> p1 = read_expr('all x.(man(x) -> mortal(x))') >>> p2 = read_expr('man(Socrates)') >>> c = read_expr('mortal(Socrates)') >>> ResolutionProverCommand(c, [p1,p2]).prove() True >>> p1 = read_expr('all x.(man(x) -> walks(x))') >>> p2 = read_expr('man(John)') >>> c = read_expr('some y.walks(y)') >>> ResolutionProverCommand(c, [p1,p2]).prove() True >>> p = read_expr('some e1.some e2.(believe(e1,john,e2) & walk(e2,mary))') >>> c = read_expr('some e0.walk(e0,mary)') >>> ResolutionProverCommand(c, [p]).prove() True ------------ Test proof() ------------ >>> p1 = read_expr('all x.(man(x) -> mortal(x))') >>> p2 = read_expr('man(Socrates)') >>> c = read_expr('mortal(Socrates)') >>> logic._counter._value = 0 >>> tp = ResolutionProverCommand(c, [p1,p2]) >>> tp.prove() True >>> print(tp.proof()) [1] {-mortal(Socrates)} A [2] {-man(z2), mortal(z2)} A [3] {man(Socrates)} A [4] {-man(Socrates)} (1, 2) [5] {mortal(Socrates)} (2, 3) [6] {} (1, 5) ------------------ Question Answering ------------------ One answer >>> p1 = read_expr('father_of(art,john)') >>> p2 = read_expr('father_of(bob,kim)') >>> p3 = read_expr('all x.all y.(father_of(x,y) -> parent_of(x,y))') >>> c = read_expr('all x.(parent_of(x,john) -> ANSWER(x))') >>> logic._counter._value = 0 >>> tp = ResolutionProverCommand(None, [p1,p2,p3,c]) >>> sorted(tp.find_answers()) [] >>> print(tp.proof()) # doctest: +SKIP [1] {father_of(art,john)} A [2] {father_of(bob,kim)} A [3] {-father_of(z3,z4), parent_of(z3,z4)} A [4] {-parent_of(z6,john), ANSWER(z6)} A [5] {parent_of(art,john)} (1, 3) [6] {parent_of(bob,kim)} (2, 3) [7] {ANSWER(z6), -father_of(z6,john)} (3, 4) [8] {ANSWER(art)} (1, 7) [9] {ANSWER(art)} (4, 5) Multiple answers >>> p1 = read_expr('father_of(art,john)') >>> p2 = read_expr('mother_of(ann,john)') >>> p3 = read_expr('all x.all y.(father_of(x,y) -> parent_of(x,y))') >>> p4 = read_expr('all x.all y.(mother_of(x,y) -> parent_of(x,y))') >>> c = read_expr('all x.(parent_of(x,john) -> ANSWER(x))') >>> logic._counter._value = 0 >>> tp = ResolutionProverCommand(None, [p1,p2,p3,p4,c]) >>> sorted(tp.find_answers()) [, ] >>> print(tp.proof()) # doctest: +SKIP [ 1] {father_of(art,john)} A [ 2] {mother_of(ann,john)} A [ 3] {-father_of(z3,z4), parent_of(z3,z4)} A [ 4] {-mother_of(z7,z8), parent_of(z7,z8)} A [ 5] {-parent_of(z10,john), ANSWER(z10)} A [ 6] {parent_of(art,john)} (1, 3) [ 7] {parent_of(ann,john)} (2, 4) [ 8] {ANSWER(z10), -father_of(z10,john)} (3, 5) [ 9] {ANSWER(art)} (1, 8) [10] {ANSWER(z10), -mother_of(z10,john)} (4, 5) [11] {ANSWER(ann)} (2, 10) [12] {ANSWER(art)} (5, 6) [13] {ANSWER(ann)} (5, 7) nltk-3.7/nltk/test/sem3.cfg000066400000000000000000000006561420073152400156170ustar00rootroot00000000000000####################################### # sem1.cfg ####################################### # Minimal feature-based grammar with determiner semantics. 
% start S S[sem=?vp] -> NP[sem=?np] VP[subj=?np, sem=?vp] VP[sem=?v, subj=?np] -> IV[sem=?v, subj=?np] NP[sem=[index='k',name='kim']] -> 'Kim' IV[sem=[rel='bark', arg=?i], subj=[sem=[index=?i]]] -> 'barks' #IV[fsem=[rel='bark', arg=(1)[]], subj=[fsem=[index->(1)]]] -> 'barks' nltk-3.7/nltk/test/semantics.doctest000066400000000000000000000577131420073152400176520ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. For license information, see LICENSE.TXT ========= Semantics ========= >>> # Setup tests by setting the counter to 0 >>> from nltk.sem import logic >>> logic._counter._value = 0 >>> import nltk >>> from nltk.sem import Valuation, Model >>> v = [('adam', 'b1'), ('betty', 'g1'), ('fido', 'd1'), ... ('girl', set(['g1', 'g2'])), ('boy', set(['b1', 'b2'])), ... ('dog', set(['d1'])), ... ('love', set([('b1', 'g1'), ('b2', 'g2'), ('g1', 'b1'), ('g2', 'b1')]))] >>> val = Valuation(v) >>> dom = val.domain >>> m = Model(dom, val) Evaluation ---------- The top-level method of a ``Model`` instance is ``evaluate()``, which assigns a semantic value to expressions of the ``logic`` module, under an assignment ``g``: >>> dom = val.domain >>> g = nltk.sem.Assignment(dom) >>> m.evaluate('all x.(boy(x) -> - girl(x))', g) True ``evaluate()`` calls a recursive function ``satisfy()``, which in turn calls a function ``i()`` to interpret non-logical constants and individual variables. ``i()`` delegates the interpretation of these to the the model's ``Valuation`` and the variable assignment ``g`` respectively. Any atomic expression which cannot be assigned a value by ``i`` raises an ``Undefined`` exception; this is caught by ``evaluate``, which returns the string ``'Undefined'``. >>> m.evaluate('walk(adam)', g, trace=2) 'walk(adam)' is undefined under M, g 'Undefined' Batch Processing ---------------- The utility functions ``interpret_sents()`` and ``evaluate_sents()`` are intended to help with processing multiple sentences. Here's an example of the first of these: >>> sents = ['Mary walks'] >>> results = nltk.sem.util.interpret_sents(sents, 'grammars/sample_grammars/sem2.fcfg') >>> for result in results: ... for (synrep, semrep) in result: ... print(synrep) (S[SEM=] (NP[-LOC, NUM='sg', SEM=<\P.P(mary)>] (PropN[-LOC, NUM='sg', SEM=<\P.P(mary)>] Mary)) (VP[NUM='sg', SEM=<\x.walk(x)>] (IV[NUM='sg', SEM=<\x.walk(x)>, TNS='pres'] walks))) In order to provide backwards compatibility with 'legacy' grammars where the semantics value is specified with a lowercase ``sem`` feature, the relevant feature name can be passed to the function using the ``semkey`` parameter, as shown here: >>> sents = ['raining'] >>> g = nltk.grammar.FeatureGrammar.fromstring(""" ... % start S ... S[sem=] -> 'raining' ... """) >>> results = nltk.sem.util.interpret_sents(sents, g, semkey='sem') >>> for result in results: ... for (synrep, semrep) in result: ... print(semrep) raining The function ``evaluate_sents()`` works in a similar manner, but also needs to be passed a ``Model`` against which the semantic representations are evaluated. Unit Tests ========== Unit tests for relations and valuations --------------------------------------- >>> from nltk.sem import * Relations are sets of tuples, all of the same length. >>> s1 = set([('d1', 'd2'), ('d1', 'd1'), ('d2', 'd1')]) >>> is_rel(s1) True >>> s2 = set([('d1', 'd2'), ('d1', 'd2'), ('d1',)]) >>> is_rel(s2) Traceback (most recent call last): . . . 
ValueError: Set set([('d1', 'd2'), ('d1',)]) contains sequences of different lengths >>> s3 = set(['d1', 'd2']) >>> is_rel(s3) Traceback (most recent call last): . . . ValueError: Set set(['d2', 'd1']) contains sequences of different lengths >>> s4 = set2rel(s3) >>> is_rel(s4) True >>> is_rel(set()) True >>> null_binary_rel = set([(None, None)]) >>> is_rel(null_binary_rel) True Sets of entities are converted into sets of singleton tuples (containing strings). >>> sorted(set2rel(s3)) [('d1',), ('d2',)] >>> sorted(set2rel(set([1,3,5,]))) ['1', '3', '5'] >>> set2rel(set()) == set() True >>> set2rel(set2rel(s3)) == set2rel(s3) True Predication is evaluated by set membership. >>> ('d1', 'd2') in s1 True >>> ('d2', 'd2') in s1 False >>> ('d1',) in s1 False >>> 'd2' in s1 False >>> ('d1',) in s4 True >>> ('d1',) in set() False >>> 'd1' in null_binary_rel False >>> val = Valuation([('Fido', 'd1'), ('dog', set(['d1', 'd2'])), ('walk', set())]) >>> sorted(val['dog']) [('d1',), ('d2',)] >>> val.domain == set(['d1', 'd2']) True >>> print(val.symbols) ['Fido', 'dog', 'walk'] Parse a valuation from a string. >>> v = """ ... john => b1 ... mary => g1 ... suzie => g2 ... fido => d1 ... tess => d2 ... noosa => n ... girl => {g1, g2} ... boy => {b1, b2} ... dog => {d1, d2} ... bark => {d1, d2} ... walk => {b1, g2, d1} ... chase => {(b1, g1), (b2, g1), (g1, d1), (g2, d2)} ... see => {(b1, g1), (b2, d2), (g1, b1),(d2, b1), (g2, n)} ... in => {(b1, n), (b2, n), (d2, n)} ... with => {(b1, g1), (g1, b1), (d1, b1), (b1, d1)} ... """ >>> val = Valuation.fromstring(v) >>> print(val) # doctest: +SKIP {'bark': set([('d1',), ('d2',)]), 'boy': set([('b1',), ('b2',)]), 'chase': set([('b1', 'g1'), ('g2', 'd2'), ('g1', 'd1'), ('b2', 'g1')]), 'dog': set([('d1',), ('d2',)]), 'fido': 'd1', 'girl': set([('g2',), ('g1',)]), 'in': set([('d2', 'n'), ('b1', 'n'), ('b2', 'n')]), 'john': 'b1', 'mary': 'g1', 'noosa': 'n', 'see': set([('b1', 'g1'), ('b2', 'd2'), ('d2', 'b1'), ('g2', 'n'), ('g1', 'b1')]), 'suzie': 'g2', 'tess': 'd2', 'walk': set([('d1',), ('b1',), ('g2',)]), 'with': set([('b1', 'g1'), ('d1', 'b1'), ('b1', 'd1'), ('g1', 'b1')])} Unit tests for function argument application in a Model ------------------------------------------------------- >>> v = [('adam', 'b1'), ('betty', 'g1'), ('fido', 'd1'),\ ... ('girl', set(['g1', 'g2'])), ('boy', set(['b1', 'b2'])), ('dog', set(['d1'])), ... ('love', set([('b1', 'g1'), ('b2', 'g2'), ('g1', 'b1'), ('g2', 'b1')])), ... ('kiss', null_binary_rel)] >>> val = Valuation(v) >>> dom = val.domain >>> m = Model(dom, val) >>> g = Assignment(dom) >>> sorted(val['boy']) [('b1',), ('b2',)] >>> ('b1',) in val['boy'] True >>> ('g1',) in val['boy'] False >>> ('foo',) in val['boy'] False >>> ('b1', 'g1') in val['love'] True >>> ('b1', 'b1') in val['kiss'] False >>> sorted(val.domain) ['b1', 'b2', 'd1', 'g1', 'g2'] Model Tests =========== Extension of Lambda expressions >>> v0 = [('adam', 'b1'), ('betty', 'g1'), ('fido', 'd1'),\ ... ('girl', set(['g1', 'g2'])), ('boy', set(['b1', 'b2'])), ... ('dog', set(['d1'])), ... ('love', set([('b1', 'g1'), ('b2', 'g2'), ('g1', 'b1'), ('g2', 'b1')]))] >>> val0 = Valuation(v0) >>> dom0 = val0.domain >>> m0 = Model(dom0, val0) >>> g0 = Assignment(dom0) >>> print(m0.evaluate(r'\x. \y. 
love(x, y)', g0) == {'g2': {'g2': False, 'b2': False, 'b1': True, 'g1': False, 'd1': False}, 'b2': {'g2': True, 'b2': False, 'b1': False, 'g1': False, 'd1': False}, 'b1': {'g2': False, 'b2': False, 'b1': False, 'g1': True, 'd1': False}, 'g1': {'g2': False, 'b2': False, 'b1': True, 'g1': False, 'd1': False}, 'd1': {'g2': False, 'b2': False, 'b1': False, 'g1': False, 'd1': False}}) True >>> print(m0.evaluate(r'\x. dog(x) (adam)', g0)) False >>> print(m0.evaluate(r'\x. (dog(x) | boy(x)) (adam)', g0)) True >>> print(m0.evaluate(r'\x. \y. love(x, y)(fido)', g0) == {'g2': False, 'b2': False, 'b1': False, 'g1': False, 'd1': False}) True >>> print(m0.evaluate(r'\x. \y. love(x, y)(adam)', g0) == {'g2': False, 'b2': False, 'b1': False, 'g1': True, 'd1': False}) True >>> print(m0.evaluate(r'\x. \y. love(x, y)(betty)', g0) == {'g2': False, 'b2': False, 'b1': True, 'g1': False, 'd1': False}) True >>> print(m0.evaluate(r'\x. \y. love(x, y)(betty)(adam)', g0)) True >>> print(m0.evaluate(r'\x. \y. love(x, y)(betty, adam)', g0)) True >>> print(m0.evaluate(r'\y. \x. love(x, y)(fido)(adam)', g0)) False >>> print(m0.evaluate(r'\y. \x. love(x, y)(betty, adam)', g0)) True >>> print(m0.evaluate(r'\x. exists y. love(x, y)', g0) == {'g2': True, 'b2': True, 'b1': True, 'g1': True, 'd1': False}) True >>> print(m0.evaluate(r'\z. adam', g0) == {'g2': 'b1', 'b2': 'b1', 'b1': 'b1', 'g1': 'b1', 'd1': 'b1'}) True >>> print(m0.evaluate(r'\z. love(x, y)', g0) == {'g2': False, 'b2': False, 'b1': False, 'g1': False, 'd1': False}) True Propositional Model Test ------------------------ >>> tests = [ ... ('P & Q', True), ... ('P & R', False), ... ('- P', False), ... ('- R', True), ... ('- - P', True), ... ('- (P & R)', True), ... ('P | R', True), ... ('R | P', True), ... ('R | R', False), ... ('- P | R', False), ... ('P | - P', True), ... ('P -> Q', True), ... ('P -> R', False), ... ('R -> P', True), ... ('P <-> P', True), ... ('R <-> R', True), ... ('P <-> R', False), ... ] >>> val1 = Valuation([('P', True), ('Q', True), ('R', False)]) >>> dom = set([]) >>> m = Model(dom, val1) >>> g = Assignment(dom) >>> for (sent, testvalue) in tests: ... semvalue = m.evaluate(sent, g) ... if semvalue == testvalue: ... print('*', end=' ') * * * * * * * * * * * * * * * * * Test of i Function ------------------ >>> from nltk.sem import Expression >>> v = [('adam', 'b1'), ('betty', 'g1'), ('fido', 'd1'), ... ('girl', set(['g1', 'g2'])), ('boy', set(['b1', 'b2'])), ('dog', set(['d1'])), ... ('love', set([('b1', 'g1'), ('b2', 'g2'), ('g1', 'b1'), ('g2', 'b1')]))] >>> val = Valuation(v) >>> dom = val.domain >>> m = Model(dom, val) >>> g = Assignment(dom, [('x', 'b1'), ('y', 'g2')]) >>> exprs = ['adam', 'girl', 'love', 'walks', 'x', 'y', 'z'] >>> parsed_exprs = [Expression.fromstring(e) for e in exprs] >>> sorted_set = lambda x: sorted(x) if isinstance(x, set) else x >>> for parsed in parsed_exprs: ... try: ... print("'%s' gets value %s" % (parsed, sorted_set(m.i(parsed, g)))) ... except Undefined: ... print("'%s' is Undefined" % parsed) 'adam' gets value b1 'girl' gets value [('g1',), ('g2',)] 'love' gets value [('b1', 'g1'), ('b2', 'g2'), ('g1', 'b1'), ('g2', 'b1')] 'walks' is Undefined 'x' gets value b1 'y' gets value g2 'z' is Undefined Test for formulas in Model -------------------------- >>> tests = [ ... ('love(adam, betty)', True), ... ('love(adam, sue)', 'Undefined'), ... ('dog(fido)', True), ... ('- dog(fido)', False), ... ('- - dog(fido)', True), ... ('- dog(sue)', 'Undefined'), ... ('dog(fido) & boy(adam)', True), ... 
('- (dog(fido) & boy(adam))', False), ... ('- dog(fido) & boy(adam)', False), ... ('dog(fido) | boy(adam)', True), ... ('- (dog(fido) | boy(adam))', False), ... ('- dog(fido) | boy(adam)', True), ... ('- dog(fido) | - boy(adam)', False), ... ('dog(fido) -> boy(adam)', True), ... ('- (dog(fido) -> boy(adam))', False), ... ('- dog(fido) -> boy(adam)', True), ... ('exists x . love(adam, x)', True), ... ('all x . love(adam, x)', False), ... ('fido = fido', True), ... ('exists x . all y. love(x, y)', False), ... ('exists x . (x = fido)', True), ... ('all x . (dog(x) | - dog(x))', True), ... ('adam = mia', 'Undefined'), ... ('\\x. (boy(x) | girl(x))', {'g2': True, 'b2': True, 'b1': True, 'g1': True, 'd1': False}), ... ('\\x. exists y. (boy(x) & love(x, y))', {'g2': False, 'b2': True, 'b1': True, 'g1': False, 'd1': False}), ... ('exists z1. boy(z1)', True), ... ('exists x. (boy(x) & - (x = adam))', True), ... ('exists x. (boy(x) & all y. love(y, x))', False), ... ('all x. (boy(x) | girl(x))', False), ... ('all x. (girl(x) -> exists y. boy(y) & love(x, y))', False), ... ('exists x. (boy(x) & all y. (girl(y) -> love(y, x)))', True), ... ('exists x. (boy(x) & all y. (girl(y) -> love(x, y)))', False), ... ('all x. (dog(x) -> - girl(x))', True), ... ('exists x. exists y. (love(x, y) & love(x, y))', True), ... ] >>> for (sent, testvalue) in tests: ... semvalue = m.evaluate(sent, g) ... if semvalue == testvalue: ... print('*', end=' ') ... else: ... print(sent, semvalue) * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * Satisfier Tests --------------- >>> formulas = [ ... 'boy(x)', ... '(x = x)', ... '(boy(x) | girl(x))', ... '(boy(x) & girl(x))', ... 'love(adam, x)', ... 'love(x, adam)', ... '- (x = adam)', ... 'exists z22. love(x, z22)', ... 'exists y. love(y, x)', ... 'all y. (girl(y) -> love(x, y))', ... 'all y. (girl(y) -> love(y, x))', ... 'all y. (girl(y) -> (boy(x) & love(y, x)))', ... 'boy(x) & all y. (girl(y) -> love(x, y))', ... 'boy(x) & all y. (girl(y) -> love(y, x))', ... 'boy(x) & exists y. (girl(y) & love(y, x))', ... 'girl(x) -> dog(x)', ... 'all y. (dog(y) -> (x = y))', ... '- exists y. love(y, x)', ... 'exists y. (love(adam, y) & love(y, x))' ... ] >>> g.purge() >>> g.add('x', 'b1') {'x': 'b1'} >>> for f in formulas: ... try: ... print("'%s' gets value: %s" % (f, m.evaluate(f, g))) ... except Undefined: ... print("'%s' is Undefined" % f) 'boy(x)' gets value: True '(x = x)' gets value: True '(boy(x) | girl(x))' gets value: True '(boy(x) & girl(x))' gets value: False 'love(adam, x)' gets value: False 'love(x, adam)' gets value: False '- (x = adam)' gets value: False 'exists z22. love(x, z22)' gets value: True 'exists y. love(y, x)' gets value: True 'all y. (girl(y) -> love(x, y))' gets value: False 'all y. (girl(y) -> love(y, x))' gets value: True 'all y. (girl(y) -> (boy(x) & love(y, x)))' gets value: True 'boy(x) & all y. (girl(y) -> love(x, y))' gets value: False 'boy(x) & all y. (girl(y) -> love(y, x))' gets value: True 'boy(x) & exists y. (girl(y) & love(y, x))' gets value: True 'girl(x) -> dog(x)' gets value: True 'all y. (dog(y) -> (x = y))' gets value: False '- exists y. love(y, x)' gets value: False 'exists y. (love(adam, y) & love(y, x))' gets value: True >>> from nltk.sem import Expression >>> for fmla in formulas: ... p = Expression.fromstring(fmla) ... g.purge() ... 
print("Satisfiers of '%s':\n\t%s" % (p, sorted(m.satisfiers(p, 'x', g)))) Satisfiers of 'boy(x)': ['b1', 'b2'] Satisfiers of '(x = x)': ['b1', 'b2', 'd1', 'g1', 'g2'] Satisfiers of '(boy(x) | girl(x))': ['b1', 'b2', 'g1', 'g2'] Satisfiers of '(boy(x) & girl(x))': [] Satisfiers of 'love(adam,x)': ['g1'] Satisfiers of 'love(x,adam)': ['g1', 'g2'] Satisfiers of '-(x = adam)': ['b2', 'd1', 'g1', 'g2'] Satisfiers of 'exists z22.love(x,z22)': ['b1', 'b2', 'g1', 'g2'] Satisfiers of 'exists y.love(y,x)': ['b1', 'g1', 'g2'] Satisfiers of 'all y.(girl(y) -> love(x,y))': [] Satisfiers of 'all y.(girl(y) -> love(y,x))': ['b1'] Satisfiers of 'all y.(girl(y) -> (boy(x) & love(y,x)))': ['b1'] Satisfiers of '(boy(x) & all y.(girl(y) -> love(x,y)))': [] Satisfiers of '(boy(x) & all y.(girl(y) -> love(y,x)))': ['b1'] Satisfiers of '(boy(x) & exists y.(girl(y) & love(y,x)))': ['b1'] Satisfiers of '(girl(x) -> dog(x))': ['b1', 'b2', 'd1'] Satisfiers of 'all y.(dog(y) -> (x = y))': ['d1'] Satisfiers of '-exists y.love(y,x)': ['b2', 'd1'] Satisfiers of 'exists y.(love(adam,y) & love(y,x))': ['b1'] Tests based on the Blackburn & Bos testsuite -------------------------------------------- >>> v1 = [('jules', 'd1'), ('vincent', 'd2'), ('pumpkin', 'd3'), ... ('honey_bunny', 'd4'), ('yolanda', 'd5'), ... ('customer', set(['d1', 'd2'])), ... ('robber', set(['d3', 'd4'])), ... ('love', set([('d3', 'd4')]))] >>> val1 = Valuation(v1) >>> dom1 = val1.domain >>> m1 = Model(dom1, val1) >>> g1 = Assignment(dom1) >>> v2 = [('jules', 'd1'), ('vincent', 'd2'), ('pumpkin', 'd3'), ... ('honey_bunny', 'd4'), ('yolanda', 'd4'), ... ('customer', set(['d1', 'd2', 'd5', 'd6'])), ... ('robber', set(['d3', 'd4'])), ... ('love', set([(None, None)]))] >>> val2 = Valuation(v2) >>> dom2 = set(['d1', 'd2', 'd3', 'd4', 'd5', 'd6']) >>> m2 = Model(dom2, val2) >>> g2 = Assignment(dom2) >>> g21 = Assignment(dom2) >>> g21.add('y', 'd3') {'y': 'd3'} >>> v3 = [('mia', 'd1'), ('jody', 'd2'), ('jules', 'd3'), ... ('vincent', 'd4'), ... ('woman', set(['d1', 'd2'])), ('man', set(['d3', 'd4'])), ... ('joke', set(['d5', 'd6'])), ('episode', set(['d7', 'd8'])), ... ('in', set([('d5', 'd7'), ('d5', 'd8')])), ... ('tell', set([('d1', 'd5'), ('d2', 'd6')]))] >>> val3 = Valuation(v3) >>> dom3 = set(['d1', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8']) >>> m3 = Model(dom3, val3) >>> g3 = Assignment(dom3) >>> tests = [ ... ('exists x. robber(x)', m1, g1, True), ... ('exists x. exists y. love(y, x)', m1, g1, True), ... ('exists x0. exists x1. love(x1, x0)', m2, g2, False), ... ('all x. all y. love(y, x)', m2, g2, False), ... ('- (all x. all y. love(y, x))', m2, g2, True), ... ('all x. all y. - love(y, x)', m2, g2, True), ... ('yolanda = honey_bunny', m2, g2, True), ... ('mia = honey_bunny', m2, g2, 'Undefined'), ... ('- (yolanda = honey_bunny)', m2, g2, False), ... ('- (mia = honey_bunny)', m2, g2, 'Undefined'), ... ('all x. (robber(x) | customer(x))', m2, g2, True), ... ('- (all x. (robber(x) | customer(x)))', m2, g2, False), ... ('(robber(x) | customer(x))', m2, g2, 'Undefined'), ... ('(robber(y) | customer(y))', m2, g21, True), ... ('exists x. (man(x) & exists x. woman(x))', m3, g3, True), ... ('exists x. (man(x) & exists x. woman(x))', m3, g3, True), ... ('- exists x. woman(x)', m3, g3, False), ... ('exists x. (tasty(x) & burger(x))', m3, g3, 'Undefined'), ... ('- exists x. (tasty(x) & burger(x))', m3, g3, 'Undefined'), ... ('exists x. (man(x) & - exists y. woman(y))', m3, g3, False), ... ('exists x. (man(x) & - exists x. woman(x))', m3, g3, False), ... 
('exists x. (woman(x) & - exists x. customer(x))', m2, g2, 'Undefined'), ... ] >>> for item in tests: ... sentence, model, g, testvalue = item ... semvalue = model.evaluate(sentence, g) ... if semvalue == testvalue: ... print('*', end=' ') ... g.purge() * * * * * * * * * * * * * * * * * * * * * * Tests for mapping from syntax to semantics ------------------------------------------ Load a valuation from a file. >>> import nltk.data >>> from nltk.sem.util import parse_sents >>> val = nltk.data.load('grammars/sample_grammars/valuation1.val') >>> dom = val.domain >>> m = Model(dom, val) >>> g = Assignment(dom) >>> gramfile = 'grammars/sample_grammars/sem2.fcfg' >>> inputs = ['John sees a girl', 'every dog barks'] >>> parses = parse_sents(inputs, gramfile) >>> for sent, trees in zip(inputs, parses): ... print() ... print("Sentence: %s" % sent) ... for tree in trees: ... print("Parse:\n %s" %tree) ... print("Semantics: %s" % root_semrep(tree)) Sentence: John sees a girl Parse: (S[SEM=] (NP[-LOC, NUM='sg', SEM=<\P.P(john)>] (PropN[-LOC, NUM='sg', SEM=<\P.P(john)>] John)) (VP[NUM='sg', SEM=<\y.exists x.(girl(x) & see(y,x))>] (TV[NUM='sg', SEM=<\X y.X(\x.see(y,x))>, TNS='pres'] sees) (NP[NUM='sg', SEM=<\Q.exists x.(girl(x) & Q(x))>] (Det[NUM='sg', SEM=<\P Q.exists x.(P(x) & Q(x))>] a) (Nom[NUM='sg', SEM=<\x.girl(x)>] (N[NUM='sg', SEM=<\x.girl(x)>] girl))))) Semantics: exists x.(girl(x) & see(john,x)) Sentence: every dog barks Parse: (S[SEM= bark(x))>] (NP[NUM='sg', SEM=<\Q.all x.(dog(x) -> Q(x))>] (Det[NUM='sg', SEM=<\P Q.all x.(P(x) -> Q(x))>] every) (Nom[NUM='sg', SEM=<\x.dog(x)>] (N[NUM='sg', SEM=<\x.dog(x)>] dog))) (VP[NUM='sg', SEM=<\x.bark(x)>] (IV[NUM='sg', SEM=<\x.bark(x)>, TNS='pres'] barks))) Semantics: all x.(dog(x) -> bark(x)) >>> sent = "every dog barks" >>> result = nltk.sem.util.interpret_sents([sent], gramfile)[0] >>> for (syntree, semrep) in result: ... print(syntree) ... print() ... print(semrep) (S[SEM= bark(x))>] (NP[NUM='sg', SEM=<\Q.all x.(dog(x) -> Q(x))>] (Det[NUM='sg', SEM=<\P Q.all x.(P(x) -> Q(x))>] every) (Nom[NUM='sg', SEM=<\x.dog(x)>] (N[NUM='sg', SEM=<\x.dog(x)>] dog))) (VP[NUM='sg', SEM=<\x.bark(x)>] (IV[NUM='sg', SEM=<\x.bark(x)>, TNS='pres'] barks))) all x.(dog(x) -> bark(x)) >>> result = nltk.sem.util.evaluate_sents([sent], gramfile, m, g)[0] >>> for (syntree, semrel, value) in result: ... print(syntree) ... print() ... print(semrep) ... print() ... print(value) (S[SEM= bark(x))>] (NP[NUM='sg', SEM=<\Q.all x.(dog(x) -> Q(x))>] (Det[NUM='sg', SEM=<\P Q.all x.(P(x) -> Q(x))>] every) (Nom[NUM='sg', SEM=<\x.dog(x)>] (N[NUM='sg', SEM=<\x.dog(x)>] dog))) (VP[NUM='sg', SEM=<\x.bark(x)>] (IV[NUM='sg', SEM=<\x.bark(x)>, TNS='pres'] barks))) all x.(dog(x) -> bark(x)) True >>> sents = ['Mary walks', 'John sees a dog'] >>> results = nltk.sem.util.interpret_sents(sents, 'grammars/sample_grammars/sem2.fcfg') >>> for result in results: ... for (synrep, semrep) in result: ... 
print(synrep) (S[SEM=] (NP[-LOC, NUM='sg', SEM=<\P.P(mary)>] (PropN[-LOC, NUM='sg', SEM=<\P.P(mary)>] Mary)) (VP[NUM='sg', SEM=<\x.walk(x)>] (IV[NUM='sg', SEM=<\x.walk(x)>, TNS='pres'] walks))) (S[SEM=] (NP[-LOC, NUM='sg', SEM=<\P.P(john)>] (PropN[-LOC, NUM='sg', SEM=<\P.P(john)>] John)) (VP[NUM='sg', SEM=<\y.exists x.(dog(x) & see(y,x))>] (TV[NUM='sg', SEM=<\X y.X(\x.see(y,x))>, TNS='pres'] sees) (NP[NUM='sg', SEM=<\Q.exists x.(dog(x) & Q(x))>] (Det[NUM='sg', SEM=<\P Q.exists x.(P(x) & Q(x))>] a) (Nom[NUM='sg', SEM=<\x.dog(x)>] (N[NUM='sg', SEM=<\x.dog(x)>] dog))))) Cooper Storage -------------- >>> from nltk.sem import cooper_storage as cs >>> sentence = 'every girl chases a dog' >>> trees = cs.parse_with_bindops(sentence, grammar='grammars/book_grammars/storage.fcfg') >>> semrep = trees[0].label()['SEM'] >>> cs_semrep = cs.CooperStore(semrep) >>> print(cs_semrep.core) chase(z2,z4) >>> for bo in cs_semrep.store: ... print(bo) bo(\P.all x.(girl(x) -> P(x)),z2) bo(\P.exists x.(dog(x) & P(x)),z4) >>> cs_semrep.s_retrieve(trace=True) Permutation 1 (\P.all x.(girl(x) -> P(x)))(\z2.chase(z2,z4)) (\P.exists x.(dog(x) & P(x)))(\z4.all x.(girl(x) -> chase(x,z4))) Permutation 2 (\P.exists x.(dog(x) & P(x)))(\z4.chase(z2,z4)) (\P.all x.(girl(x) -> P(x)))(\z2.exists x.(dog(x) & chase(z2,x))) >>> for reading in cs_semrep.readings: ... print(reading) exists x.(dog(x) & all z3.(girl(z3) -> chase(z3,x))) all x.(girl(x) -> exists z4.(dog(z4) & chase(x,z4))) nltk-3.7/nltk/test/sentiment.doctest000066400000000000000000000273311420073152400176630ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. For license information, see LICENSE.TXT =================== Sentiment Analysis =================== >>> from nltk.classify import NaiveBayesClassifier >>> from nltk.corpus import subjectivity >>> from nltk.sentiment import SentimentAnalyzer >>> from nltk.sentiment.util import * >>> n_instances = 100 >>> subj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n_instances]] >>> obj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n_instances]] >>> len(subj_docs), len(obj_docs) (100, 100) Each document is represented by a tuple (sentence, label). The sentence is tokenized, so it is represented by a list of strings: >>> subj_docs[0] (['smart', 'and', 'alert', ',', 'thirteen', 'conversations', 'about', 'one', 'thing', 'is', 'a', 'small', 'gem', '.'], 'subj') We separately split subjective and objective instances to keep a balanced uniform class distribution in both train and test sets. 
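The split itself is constructed in the next block. As a quick sanity check (a
sketch only, marked as skipped so it is not executed as part of this doctest;
it assumes the ``training_docs`` list that is built just below), the label
balance of the training set can be inspected with ``collections.Counter``,
which should report 80 documents per label:

    >>> from collections import Counter  # doctest: +SKIP
    >>> Counter(label for (sent, label) in training_docs)  # doctest: +SKIP
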
>>> train_subj_docs = subj_docs[:80] >>> test_subj_docs = subj_docs[80:100] >>> train_obj_docs = obj_docs[:80] >>> test_obj_docs = obj_docs[80:100] >>> training_docs = train_subj_docs+train_obj_docs >>> testing_docs = test_subj_docs+test_obj_docs >>> sentim_analyzer = SentimentAnalyzer() >>> all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs]) We use simple unigram word features, handling negation: >>> unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4) >>> len(unigram_feats) 83 >>> sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats) We apply features to obtain a feature-value representation of our datasets: >>> training_set = sentim_analyzer.apply_features(training_docs) >>> test_set = sentim_analyzer.apply_features(testing_docs) We can now train our classifier on the training set, and subsequently output the evaluation results: >>> trainer = NaiveBayesClassifier.train >>> classifier = sentim_analyzer.train(trainer, training_set) Training classifier >>> for key,value in sorted(sentim_analyzer.evaluate(test_set).items()): ... print('{0}: {1}'.format(key, value)) Evaluating NaiveBayesClassifier results... Accuracy: 0.8 F-measure [obj]: 0.8 F-measure [subj]: 0.8 Precision [obj]: 0.8 Precision [subj]: 0.8 Recall [obj]: 0.8 Recall [subj]: 0.8 Vader ------ >>> from nltk.sentiment.vader import SentimentIntensityAnalyzer >>> sentences = ["VADER is smart, handsome, and funny.", # positive sentence example ... "VADER is smart, handsome, and funny!", # punctuation emphasis handled correctly (sentiment intensity adjusted) ... "VADER is very smart, handsome, and funny.", # booster words handled correctly (sentiment intensity adjusted) ... "VADER is VERY SMART, handsome, and FUNNY.", # emphasis for ALLCAPS handled ... "VADER is VERY SMART, handsome, and FUNNY!!!",# combination of signals - VADER appropriately adjusts intensity ... "VADER is VERY SMART, really handsome, and INCREDIBLY FUNNY!!!",# booster words & punctuation make this close to ceiling for score ... "The book was good.", # positive sentence ... "The book was kind of good.", # qualified positive sentence is handled correctly (intensity adjusted) ... "The plot was good, but the characters are uncompelling and the dialog is not great.", # mixed negation sentence ... "A really bad, horrible book.", # negative sentence with booster words ... "At least it isn't a horrible book.", # negated negative sentence with contraction ... ":) and :D", # emoticons handled ... "", # an empty string is correctly handled ... "Today sux", # negative slang handled ... "Today sux!", # negative slang with punctuation emphasis handled ... "Today SUX!", # negative slang with capitalization emphasis ... "Today kinda sux! But I'll get by, lol" # mixed sentiment example with slang and constrastive conjunction "but" ... ] >>> paragraph = "It was one of the worst movies I've seen, despite good reviews. \ ... Unbelievably bad acting!! Poor direction. VERY poor production. \ ... The movie was bad. Very bad movie. VERY bad movie. VERY BAD movie. VERY BAD movie!" >>> from nltk import tokenize >>> lines_list = tokenize.sent_tokenize(paragraph) >>> sentences.extend(lines_list) >>> tricky_sentences = [ ... "Most automated sentiment analysis tools are shit.", ... "VADER sentiment analysis is the shit.", ... "Sentiment analysis has never been good.", ... "Sentiment analysis with VADER has never been this good.", ... "Warren Beatty has never been so entertaining.", ... 
"I won't say that the movie is astounding and I wouldn't claim that \ ... the movie is too banal either.", ... "I like to hate Michael Bay films, but I couldn't fault this one", ... "I like to hate Michael Bay films, BUT I couldn't help but fault this one", ... "It's one thing to watch an Uwe Boll film, but another thing entirely \ ... to pay for it", ... "The movie was too good", ... "This movie was actually neither that funny, nor super witty.", ... "This movie doesn't care about cleverness, wit or any other kind of \ ... intelligent humor.", ... "Those who find ugly meanings in beautiful things are corrupt without \ ... being charming.", ... "There are slow and repetitive parts, BUT it has just enough spice to \ ... keep it interesting.", ... "The script is not fantastic, but the acting is decent and the cinematography \ ... is EXCELLENT!", ... "Roger Dodger is one of the most compelling variations on this theme.", ... "Roger Dodger is one of the least compelling variations on this theme.", ... "Roger Dodger is at least compelling as a variation on the theme.", ... "they fall in love with the product", ... "but then it breaks", ... "usually around the time the 90 day warranty expires", ... "the twin towers collapsed today", ... "However, Mr. Carter solemnly argues, his client carried out the kidnapping \ ... under orders and in the ''least offensive way possible.''" ... ] >>> sentences.extend(tricky_sentences) >>> for sentence in sentences: ... sid = SentimentIntensityAnalyzer() ... print(sentence) ... ss = sid.polarity_scores(sentence) ... for k in sorted(ss): ... print('{0}: {1}, '.format(k, ss[k]), end='') ... print() VADER is smart, handsome, and funny. compound: 0.8316, neg: 0.0, neu: 0.254, pos: 0.746, VADER is smart, handsome, and funny! compound: 0.8439, neg: 0.0, neu: 0.248, pos: 0.752, VADER is very smart, handsome, and funny. compound: 0.8545, neg: 0.0, neu: 0.299, pos: 0.701, VADER is VERY SMART, handsome, and FUNNY. compound: 0.9227, neg: 0.0, neu: 0.246, pos: 0.754, VADER is VERY SMART, handsome, and FUNNY!!! compound: 0.9342, neg: 0.0, neu: 0.233, pos: 0.767, VADER is VERY SMART, really handsome, and INCREDIBLY FUNNY!!! compound: 0.9469, neg: 0.0, neu: 0.294, pos: 0.706, The book was good. compound: 0.4404, neg: 0.0, neu: 0.508, pos: 0.492, The book was kind of good. compound: 0.3832, neg: 0.0, neu: 0.657, pos: 0.343, The plot was good, but the characters are uncompelling and the dialog is not great. compound: -0.7042, neg: 0.327, neu: 0.579, pos: 0.094, A really bad, horrible book. compound: -0.8211, neg: 0.791, neu: 0.209, pos: 0.0, At least it isn't a horrible book. compound: 0.431, neg: 0.0, neu: 0.637, pos: 0.363, :) and :D compound: 0.7925, neg: 0.0, neu: 0.124, pos: 0.876, compound: 0.0, neg: 0.0, neu: 0.0, pos: 0.0, Today sux compound: -0.3612, neg: 0.714, neu: 0.286, pos: 0.0, Today sux! compound: -0.4199, neg: 0.736, neu: 0.264, pos: 0.0, Today SUX! compound: -0.5461, neg: 0.779, neu: 0.221, pos: 0.0, Today kinda sux! But I'll get by, lol compound: 0.5249, neg: 0.138, neu: 0.517, pos: 0.344, It was one of the worst movies I've seen, despite good reviews. compound: -0.7584, neg: 0.394, neu: 0.606, pos: 0.0, Unbelievably bad acting!! compound: -0.6572, neg: 0.686, neu: 0.314, pos: 0.0, Poor direction. compound: -0.4767, neg: 0.756, neu: 0.244, pos: 0.0, VERY poor production. compound: -0.6281, neg: 0.674, neu: 0.326, pos: 0.0, The movie was bad. compound: -0.5423, neg: 0.538, neu: 0.462, pos: 0.0, Very bad movie. 
compound: -0.5849, neg: 0.655, neu: 0.345, pos: 0.0, VERY bad movie. compound: -0.6732, neg: 0.694, neu: 0.306, pos: 0.0, VERY BAD movie. compound: -0.7398, neg: 0.724, neu: 0.276, pos: 0.0, VERY BAD movie! compound: -0.7616, neg: 0.735, neu: 0.265, pos: 0.0, Most automated sentiment analysis tools are shit. compound: -0.5574, neg: 0.375, neu: 0.625, pos: 0.0, VADER sentiment analysis is the shit. compound: 0.6124, neg: 0.0, neu: 0.556, pos: 0.444, Sentiment analysis has never been good. compound: -0.3412, neg: 0.325, neu: 0.675, pos: 0.0, Sentiment analysis with VADER has never been this good. compound: 0.5228, neg: 0.0, neu: 0.703, pos: 0.297, Warren Beatty has never been so entertaining. compound: 0.5777, neg: 0.0, neu: 0.616, pos: 0.384, I won't say that the movie is astounding and I wouldn't claim that the movie is too banal either. compound: 0.4215, neg: 0.0, neu: 0.851, pos: 0.149, I like to hate Michael Bay films, but I couldn't fault this one compound: 0.3153, neg: 0.157, neu: 0.534, pos: 0.309, I like to hate Michael Bay films, BUT I couldn't help but fault this one compound: -0.1531, neg: 0.277, neu: 0.477, pos: 0.246, It's one thing to watch an Uwe Boll film, but another thing entirely to pay for it compound: -0.2541, neg: 0.112, neu: 0.888, pos: 0.0, The movie was too good compound: 0.4404, neg: 0.0, neu: 0.58, pos: 0.42, This movie was actually neither that funny, nor super witty. compound: -0.6759, neg: 0.41, neu: 0.59, pos: 0.0, This movie doesn't care about cleverness, wit or any other kind of intelligent humor. compound: -0.1338, neg: 0.265, neu: 0.497, pos: 0.239, Those who find ugly meanings in beautiful things are corrupt without being charming. compound: -0.3553, neg: 0.314, neu: 0.493, pos: 0.192, There are slow and repetitive parts, BUT it has just enough spice to keep it interesting. compound: 0.4678, neg: 0.079, neu: 0.735, pos: 0.186, The script is not fantastic, but the acting is decent and the cinematography is EXCELLENT! compound: 0.7565, neg: 0.092, neu: 0.607, pos: 0.301, Roger Dodger is one of the most compelling variations on this theme. compound: 0.2944, neg: 0.0, neu: 0.834, pos: 0.166, Roger Dodger is one of the least compelling variations on this theme. compound: -0.1695, neg: 0.132, neu: 0.868, pos: 0.0, Roger Dodger is at least compelling as a variation on the theme. compound: 0.2263, neg: 0.0, neu: 0.84, pos: 0.16, they fall in love with the product compound: 0.6369, neg: 0.0, neu: 0.588, pos: 0.412, but then it breaks compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, usually around the time the 90 day warranty expires compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, the twin towers collapsed today compound: -0.2732, neg: 0.344, neu: 0.656, pos: 0.0, However, Mr. Carter solemnly argues, his client carried out the kidnapping under orders and in the ''least offensive way possible.'' compound: -0.5859, neg: 0.23, neu: 0.697, pos: 0.074, nltk-3.7/nltk/test/sentiwordnet.doctest000066400000000000000000000017621420073152400204020ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. 
For license information, see LICENSE.TXT ====================== SentiWordNet Interface ====================== SentiWordNet can be imported like this: >>> from nltk.corpus import sentiwordnet as swn ------------ SentiSynsets ------------ >>> breakdown = swn.senti_synset('breakdown.n.03') >>> print(breakdown) >>> breakdown.pos_score() 0.0 >>> breakdown.neg_score() 0.25 >>> breakdown.obj_score() 0.75 ------ Lookup ------ >>> list(swn.senti_synsets('slow')) [SentiSynset('decelerate.v.01'), SentiSynset('slow.v.02'), SentiSynset('slow.v.03'), SentiSynset('slow.a.01'), SentiSynset('slow.a.02'), SentiSynset('dense.s.04'), SentiSynset('slow.a.04'), SentiSynset('boring.s.01'), SentiSynset('dull.s.08'), SentiSynset('slowly.r.01'), SentiSynset('behind.r.03')] >>> happy = swn.senti_synsets('happy', 'a') >>> all = swn.all_senti_synsets() nltk-3.7/nltk/test/simple.doctest000066400000000000000000000044241420073152400171440ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. For license information, see LICENSE.TXT ================= EasyInstall Tests ================= This file contains some simple tests that will be run by EasyInstall in order to test the installation when NLTK-Data is absent. ------------ Tokenization ------------ >>> from nltk.tokenize import wordpunct_tokenize >>> s = ("Good muffins cost $3.88\nin New York. Please buy me\n" ... "two of them.\n\nThanks.") >>> wordpunct_tokenize(s) ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.'] ------- Metrics ------- >>> from nltk.metrics import precision, recall, f_measure >>> reference = 'DET NN VB DET JJ NN NN IN DET NN'.split() >>> test = 'DET VB VB DET NN NN NN IN DET NN'.split() >>> reference_set = set(reference) >>> test_set = set(test) >>> precision(reference_set, test_set) 1.0 >>> print(recall(reference_set, test_set)) 0.8 >>> print(f_measure(reference_set, test_set)) 0.88888888888... ------------------ Feature Structures ------------------ >>> from nltk import FeatStruct >>> fs1 = FeatStruct(PER=3, NUM='pl', GND='fem') >>> fs2 = FeatStruct(POS='N', AGR=fs1) >>> print(fs2) [ [ GND = 'fem' ] ] [ AGR = [ NUM = 'pl' ] ] [ [ PER = 3 ] ] [ ] [ POS = 'N' ] >>> print(fs2['AGR']) [ GND = 'fem' ] [ NUM = 'pl' ] [ PER = 3 ] >>> print(fs2['AGR']['PER']) 3 ------- Parsing ------- >>> from nltk.parse.recursivedescent import RecursiveDescentParser >>> from nltk.grammar import CFG >>> grammar = CFG.fromstring(""" ... S -> NP VP ... PP -> P NP ... NP -> 'the' N | N PP | 'the' N PP ... VP -> V NP | V PP | V NP PP ... N -> 'cat' | 'dog' | 'rug' ... V -> 'chased' ... P -> 'on' ... """) >>> rd = RecursiveDescentParser(grammar) >>> sent = 'the cat chased the dog on the rug'.split() >>> for t in rd.parse(sent): ... print(t) (S (NP the (N cat)) (VP (V chased) (NP the (N dog) (PP (P on) (NP the (N rug)))))) (S (NP the (N cat)) (VP (V chased) (NP the (N dog)) (PP (P on) (NP the (N rug))))) nltk-3.7/nltk/test/stem.doctest000066400000000000000000000046171420073152400166270ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. For license information, see LICENSE.TXT ========== Stemmers ========== Overview ~~~~~~~~ Stemmers remove morphological affixes from words, leaving only the word stem. >>> from nltk.stem import * Unit tests for the Porter stemmer ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ >>> from nltk.stem.porter import * Create a new Porter stemmer. >>> stemmer = PorterStemmer() Test the stemmer on various pluralised words. 
>>> plurals = ['caresses', 'flies', 'dies', 'mules', 'denied', ... 'died', 'agreed', 'owned', 'humbled', 'sized', ... 'meeting', 'stating', 'siezing', 'itemization', ... 'sensational', 'traditional', 'reference', 'colonizer', ... 'plotted'] >>> singles = [stemmer.stem(plural) for plural in plurals] >>> print(' '.join(singles)) caress fli die mule deni die agre own humbl size meet state siez item sensat tradit refer colon plot Unit tests for Snowball stemmer ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ >>> from nltk.stem.snowball import SnowballStemmer See which languages are supported. >>> print(" ".join(SnowballStemmer.languages)) arabic danish dutch english finnish french german hungarian italian norwegian porter portuguese romanian russian spanish swedish Create a new instance of a language specific subclass. >>> stemmer = SnowballStemmer("english") Stem a word. >>> print(stemmer.stem("running")) run Decide not to stem stopwords. >>> stemmer2 = SnowballStemmer("english", ignore_stopwords=True) >>> print(stemmer.stem("having")) have >>> print(stemmer2.stem("having")) having The 'english' stemmer is better than the original 'porter' stemmer. >>> print(SnowballStemmer("english").stem("generously")) generous >>> print(SnowballStemmer("porter").stem("generously")) gener .. note:: Extra stemmer tests can be found in `nltk.test.unit.test_stem`. Unit tests for ARLSTem Stemmer ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ >>> from nltk.stem.arlstem import ARLSTem Create a Stemmer instance. >>> stemmer = ARLSTem() Stem a word. >>> stemmer.stem('يعمل') 'عمل' Unit tests for ARLSTem2 Stemmer ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ >>> from nltk.stem.arlstem2 import ARLSTem2 Create a Stemmer instance. >>> stemmer = ARLSTem2() Stem a word. >>> stemmer.stem('يعمل') 'عمل' nltk-3.7/nltk/test/tag.doctest000066400000000000000000001015321420073152400164240ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. For license information, see LICENSE.TXT Evaluation of Taggers ===================== Evaluating the standard NLTK PerceptronTagger using Accuracy, Precision, Recall and F-measure for each of the tags. >>> from nltk.tag import PerceptronTagger >>> from nltk.corpus import treebank >>> tagger = PerceptronTagger() >>> gold_data = treebank.tagged_sents()[10:20] >>> print(tagger.accuracy(gold_data)) # doctest: +ELLIPSIS 0.885931... >>> print(tagger.evaluate_per_tag(gold_data)) Tag | Prec. | Recall | F-measure -------+--------+--------+----------- '' | 1.0000 | 1.0000 | 1.0000 , | 1.0000 | 1.0000 | 1.0000 -NONE- | 0.0000 | 0.0000 | 0.0000 . | 1.0000 | 1.0000 | 1.0000 : | 1.0000 | 1.0000 | 1.0000 CC | 1.0000 | 1.0000 | 1.0000 CD | 0.7647 | 1.0000 | 0.8667 DT | 1.0000 | 1.0000 | 1.0000 IN | 1.0000 | 1.0000 | 1.0000 JJ | 0.5882 | 0.8333 | 0.6897 JJR | 1.0000 | 1.0000 | 1.0000 JJS | 1.0000 | 1.0000 | 1.0000 NN | 0.7647 | 0.9630 | 0.8525 NNP | 0.8929 | 1.0000 | 0.9434 NNS | 1.0000 | 1.0000 | 1.0000 POS | 1.0000 | 1.0000 | 1.0000 PRP | 1.0000 | 1.0000 | 1.0000 RB | 0.8000 | 1.0000 | 0.8889 RBR | 0.0000 | 0.0000 | 0.0000 TO | 1.0000 | 1.0000 | 1.0000 VB | 1.0000 | 1.0000 | 1.0000 VBD | 0.8571 | 0.9231 | 0.8889 VBG | 1.0000 | 1.0000 | 1.0000 VBN | 0.8333 | 0.5556 | 0.6667 VBP | 0.5714 | 0.8000 | 0.6667 VBZ | 1.0000 | 1.0000 | 1.0000 WP | 1.0000 | 1.0000 | 1.0000 `` | 1.0000 | 1.0000 | 1.0000 List only the 10 most common tags: >>> print(tagger.evaluate_per_tag(gold_data, truncate=10, sort_by_count=True)) Tag | Prec. 
| Recall | F-measure -------+--------+--------+----------- IN | 1.0000 | 1.0000 | 1.0000 DT | 1.0000 | 1.0000 | 1.0000 NN | 0.7647 | 0.9630 | 0.8525 NNP | 0.8929 | 1.0000 | 0.9434 NNS | 1.0000 | 1.0000 | 1.0000 -NONE- | 0.0000 | 0.0000 | 0.0000 CD | 0.7647 | 1.0000 | 0.8667 VBD | 0.8571 | 0.9231 | 0.8889 JJ | 0.5882 | 0.8333 | 0.6897 , | 1.0000 | 1.0000 | 1.0000 Similarly, we can display the confusion matrix for this tagger. >>> print(tagger.confusion(gold_data)) | - | | N | | O | | N J J N N P P R V V V V V | | ' E C C D I J J J N N N O R R B T V B B B B B W ` | | ' , - . : C D T N J R S N P S S P B R O B D G N P Z P ` | -------+-------------------------------------------------------------------------------------+ '' | <3> . . . . . . . . . . . . . . . . . . . . . . . . . . . | , | .<11> . . . . . . . . . . . . . . . . . . . . . . . . . . | -NONE- | . . <.> . . . 4 . . 4 . . 7 2 . . . 1 . . . . . . 3 . . . | . | . . .<10> . . . . . . . . . . . . . . . . . . . . . . . . | : | . . . . <1> . . . . . . . . . . . . . . . . . . . . . . . | CC | . . . . . <5> . . . . . . . . . . . . . . . . . . . . . . | CD | . . . . . .<13> . . . . . . . . . . . . . . . . . . . . . | DT | . . . . . . .<28> . . . . . . . . . . . . . . . . . . . . | IN | . . . . . . . .<34> . . . . . . . . . . . . . . . . . . . | JJ | . . . . . . . . .<10> . . . 1 . . . . 1 . . . . . . . . . | JJR | . . . . . . . . . . <1> . . . . . . . . . . . . . . . . . | JJS | . . . . . . . . . . . <1> . . . . . . . . . . . . . . . . | NN | . . . . . . . . . 1 . .<26> . . . . . . . . . . . . . . . | NNP | . . . . . . . . . . . . .<25> . . . . . . . . . . . . . . | NNS | . . . . . . . . . . . . . .<22> . . . . . . . . . . . . . | POS | . . . . . . . . . . . . . . . <1> . . . . . . . . . . . . | PRP | . . . . . . . . . . . . . . . . <3> . . . . . . . . . . . | RB | . . . . . . . . . . . . . . . . . <4> . . . . . . . . . . | RBR | . . . . . . . . . . . . . . . . . . <.> . . . . . . . . . | TO | . . . . . . . . . . . . . . . . . . . <2> . . . . . . . . | VB | . . . . . . . . . . . . . . . . . . . . <1> . . . . . . . | VBD | . . . . . . . . . . . . . . . . . . . . .<12> . 1 . . . . | VBG | . . . . . . . . . . . . . . . . . . . . . . <3> . . . . . | VBN | . . . . . . . . . 2 . . . . . . . . . . . 2 . <5> . . . . | VBP | . . . . . . . . . . . . 1 . . . . . . . . . . . <4> . . . | VBZ | . . . . . . . . . . . . . . . . . . . . . . . . . <2> . . | WP | . . . . . . . . . . . . . . . . . . . . . . . . . . <3> . | `` | . . . . . . . . . . . . . . . . . . . . . . . . . . . <3>| -------+-------------------------------------------------------------------------------------+ (row = reference; col = test) Brill Trainer with evaluation ============================= >>> # Perform the relevant imports. >>> from nltk.tbl.template import Template >>> from nltk.tag.brill import Pos, Word >>> from nltk.tag import untag, RegexpTagger, BrillTaggerTrainer, UnigramTagger >>> # Load some data >>> from nltk.corpus import treebank >>> training_data = treebank.tagged_sents()[:100] >>> baseline_data = treebank.tagged_sents()[100:200] >>> gold_data = treebank.tagged_sents()[200:300] >>> testing_data = [untag(s) for s in gold_data] >>> backoff = RegexpTagger([ ... (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers ... (r'(The|the|A|a|An|an)$', 'AT'), # articles ... (r'.*able$', 'JJ'), # adjectives ... (r'.*ness$', 'NN'), # nouns formed from adjectives ... (r'.*ly$', 'RB'), # adverbs ... (r'.*s$', 'NNS'), # plural nouns ... (r'.*ing$', 'VBG'), # gerunds ... 
(r'.*ed$', 'VBD'), # past tense verbs ... (r'.*', 'NN') # nouns (default) ... ]) We've now created a simple ``RegexpTagger``, which tags according to the regular expression rules it has been supplied. This tagger in and of itself does not have a great accuracy. >>> backoff.accuracy(gold_data) #doctest: +ELLIPSIS 0.245014... Neither does a simple ``UnigramTagger``. This tagger is trained on some data, and will then first try to match unigrams (i.e. tokens) of the sentence it has to tag to the learned data. >>> unigram_tagger = UnigramTagger(baseline_data) >>> unigram_tagger.accuracy(gold_data) #doctest: +ELLIPSIS 0.581196... The lackluster accuracy here can be explained with the following example: >>> unigram_tagger.tag(["I", "would", "like", "this", "sentence", "to", "be", "tagged"]) [('I', 'NNP'), ('would', 'MD'), ('like', None), ('this', 'DT'), ('sentence', None), ('to', 'TO'), ('be', 'VB'), ('tagged', None)] As you can see, many tokens are tagged as ``None``, as these tokens are OOV (out of vocabulary). The ``UnigramTagger`` has never seen them, and as a result they are not in its database of known terms. In practice, a ``UnigramTagger`` is exclusively used in conjunction with a *backoff*. Our real baseline which will use such a backoff. We'll create a ``UnigramTagger`` like before, but now the ``RegexpTagger`` will be used as a backoff for the situations where the ``UnigramTagger`` encounters an OOV token. >>> baseline = UnigramTagger(baseline_data, backoff=backoff) >>> baseline.accuracy(gold_data) #doctest: +ELLIPSIS 0.7537647... That is already much better. We can investigate the performance further by running ``evaluate_per_tag``. This method will output the *Precision*, *Recall* and *F-measure* of each tag. >>> print(baseline.evaluate_per_tag(gold_data, sort_by_count=True)) Tag | Prec. | Recall | F-measure -------+--------+--------+----------- NNP | 0.9674 | 0.2738 | 0.4269 NN | 0.4111 | 0.9136 | 0.5670 IN | 0.9383 | 0.9580 | 0.9480 DT | 0.9819 | 0.8859 | 0.9314 JJ | 0.8167 | 0.2970 | 0.4356 NNS | 0.7393 | 0.9630 | 0.8365 -NONE- | 1.0000 | 0.8345 | 0.9098 , | 1.0000 | 1.0000 | 1.0000 . | 1.0000 | 1.0000 | 1.0000 VBD | 0.6429 | 0.8804 | 0.7431 CD | 1.0000 | 0.9872 | 0.9935 CC | 1.0000 | 0.9355 | 0.9667 VB | 0.7778 | 0.3684 | 0.5000 VBN | 0.9375 | 0.3000 | 0.4545 RB | 0.7778 | 0.7447 | 0.7609 TO | 1.0000 | 1.0000 | 1.0000 VBZ | 0.9643 | 0.6429 | 0.7714 VBG | 0.6415 | 0.9444 | 0.7640 PRP$ | 1.0000 | 1.0000 | 1.0000 PRP | 1.0000 | 0.5556 | 0.7143 MD | 1.0000 | 1.0000 | 1.0000 VBP | 0.6471 | 0.5789 | 0.6111 POS | 1.0000 | 1.0000 | 1.0000 $ | 1.0000 | 0.8182 | 0.9000 '' | 1.0000 | 1.0000 | 1.0000 : | 1.0000 | 1.0000 | 1.0000 WDT | 0.4000 | 0.2000 | 0.2667 `` | 1.0000 | 1.0000 | 1.0000 JJR | 1.0000 | 0.5000 | 0.6667 NNPS | 0.0000 | 0.0000 | 0.0000 RBR | 1.0000 | 1.0000 | 1.0000 -LRB- | 0.0000 | 0.0000 | 0.0000 -RRB- | 0.0000 | 0.0000 | 0.0000 RP | 0.6667 | 0.6667 | 0.6667 EX | 0.5000 | 0.5000 | 0.5000 JJS | 0.0000 | 0.0000 | 0.0000 WP | 1.0000 | 1.0000 | 1.0000 PDT | 0.0000 | 0.0000 | 0.0000 AT | 0.0000 | 0.0000 | 0.0000 It's clear that although the precision of tagging `"NNP"` is high, the recall is very low. With other words, we're missing a lot of cases where the true label is `"NNP"`. We can see a similar effect with `"JJ"`. We can also see a very expected result: The precision of `"NN"` is low, while the recall is high. If a term is OOV (i.e. ``UnigramTagger`` defers it to ``RegexpTagger``) and ``RegexpTagger`` doesn't have a good rule for it, then it will be tagged as `"NN"`. 
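For instance, here is a minimal sketch (kept outside the doctest run) of how the backoff chain behaves on an unseen token; the token itself is invented, and ``baseline_data`` and ``backoff`` are the objects constructed above::

    oov_tagger = UnigramTagger(baseline_data, backoff=backoff)
    # "flibbertigibbet" never occurs in the training data and matches none of
    # the suffix rules, so the regexp backoff falls through to its
    # catch-all (r'.*', 'NN') rule.
    oov_tagger.tag(["flibbertigibbet"])
    # -> [('flibbertigibbet', 'NN')]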
So, we catch almost all tokens that are truly labeled as `"NN"`, but we also tag as `"NN"` for many tokens that shouldn't be `"NN"`. This method gives us some insight in what parts of the tagger needs more attention, and why. However, it doesn't tell us what the terms with true label `"NNP"` or `"JJ"` are actually tagged as. To help that, we can create a confusion matrix. >>> print(baseline.confusion(gold_data)) | - | | - N - | | L O R N P | | R N R J J N N N P P P R R V V V V V W | | ' B E B A C C D E I J J J M N N P N D O R P R B R T V B B B B B D W ` | | $ ' , - - - . : T C D T X N J R S D N P S S T S P $ B R P O B D G N P Z T P ` | -------+-------------------------------------------------------------------------------------------------------------------------------------------------------------+ $ | <9> . . . . . . . . . . . . . . . . . 2 . . . . . . . . . . . . . . . . . . . . | '' | . <10> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | , | . .<115> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | -LRB- | . . . <.> . . . . . . . . . . . . . . 3 . . . . . . . . . . . . . . . . . . . . | -NONE- | . . . .<121> . . . . . . . . . . . . . 24 . . . . . . . . . . . . . . . . . . . . | -RRB- | . . . . . <.> . . . . . . . . . . . . 3 . . . . . . . . . . . . . . . . . . . . | . | . . . . . .<100> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | : | . . . . . . . <10> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | AT | . . . . . . . . <.> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | CC | . . . . . . . . . <58> . . . . . . . . 4 . . . . . . . . . . . . . . . . . . . . | CD | . . . . . . . . . . <77> . . . . . . . 1 . . . . . . . . . . . . . . . . . . . . | DT | . . . . . . . . 1 . .<163> . 4 . . . . 13 . . . . . . . . . . . . . . . . . 3 . . | EX | . . . . . . . . . . . . <1> . . . . . 1 . . . . . . . . . . . . . . . . . . . . | IN | . . . . . . . . . . . . .<228> . . . . 8 . . . . . . . . . . . . . 2 . . . . . . | JJ | . . . . . . . . . . . . . . <49> . . . 86 2 . 4 . . . . 6 . . . . 12 3 . 3 . . . . | JJR | . . . . . . . . . . . . . . . <3> . . 3 . . . . . . . . . . . . . . . . . . . . | JJS | . . . . . . . . . . . . . . . . <.> . 2 . . . . . . . . . . . . . . . . . . . . | MD | . . . . . . . . . . . . . . . . . <19> . . . . . . . . . . . . . . . . . . . . . | NN | . . . . . . . . . . . . . . 9 . . .<296> . . 5 . . . . . . . . 5 . 9 . . . . . . | NNP | . . . . . . . . . . . 2 . . . . . . 199 <89> . 26 . . . . 2 . . . . 2 5 . . . . . . | NNPS | . . . . . . . . . . . . . . . . . . . 1 <.> 3 . . . . . . . . . . . . . . . . . | NNS | . . . . . . . . . . . . . . . . . . 5 . .<156> . . . . . . . . . . . . . 1 . . . | PDT | . . . . . . . . . . . 1 . . . . . . . . . . <.> . . . . . . . . . . . . . . . . | POS | . . . . . . . . . . . . . . . . . . . . . . . <14> . . . . . . . . . . . . . . . | PRP | . . . . . . . . . . . . . . . . . . 10 . . 2 . . <15> . . . . . . . . . . . . . . | PRP$ | . . . . . . . . . . . . . . . . . . . . . . . . . <28> . . . . . . . . . . . . . | RB | . . . . . . . . . . . . 1 4 . . . . 6 . . . . . . . <35> . 1 . . . . . . . . . . | RBR | . . . . . . . . . . . . . . . . . . . . . . . . . . . <4> . . . . . . . . . . . | RP | . . . . . . . . . . . . . . . . . . . . . . . . . . 1 . <2> . . . . . . . . . . | TO | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . <47> . . . . . . . . . | VB | . . . . . . . . . . . . . . 2 . . . 30 . . . . . . . 1 . . . <21> . . . 3 . . . . 
| VBD | . . . . . . . . . . . . . . . . . . 10 . . . . . . . . . . . . <81> . 1 . . . . . | VBG | . . . . . . . . . . . . . . . . . . 2 . . . . . . . . . . . . . <34> . . . . . . | VBN | . . . . . . . . . . . . . . . . . . 4 . . . . . . . . . . . . 31 . <15> . . . . . | VBP | . . . . . . . . . . . . . . . . . . 7 . . . . . . . . . . . 1 . . . <11> . . . . | VBZ | . . . . . . . . . . . . . . . . . . . . . 15 . . . . . . . . . . . . . <27> . . . | WDT | . . . . . . . . . . . . . 7 . . . . 1 . . . . . . . . . . . . . . . . . <2> . . | WP | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . <2> . | `` | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . <10>| -------+-------------------------------------------------------------------------------------------------------------------------------------------------------------+ (row = reference; col = test) Once again we can see that `"NN"` is the default if the tagger isn't sure. Beyond that, we can see why the recall for `"NNP"` is so low: these tokens are often tagged as `"NN"`. This effect can also be seen for `"JJ"`, where the majority of tokens that ought to be tagged as `"JJ"` are actually tagged as `"NN"` by our tagger. This tagger will only serve as a baseline for the ``BrillTaggerTrainer``, which uses templates to attempt to improve the performance of the tagger. >>> # Set up templates >>> Template._cleartemplates() #clear any templates created in earlier tests >>> templates = [Template(Pos([-1])), Template(Pos([-1]), Word([0]))] >>> # Construct a BrillTaggerTrainer >>> tt = BrillTaggerTrainer(baseline, templates, trace=3) >>> tagger1 = tt.train(training_data, max_rules=10) TBL train (fast) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: None) Finding initial useful rules... Found 618 useful rules. B | S F r O | Score = Fixed - Broken c i o t | R Fixed = num tags changed incorrect -> correct o x k h | u Broken = num tags changed correct -> incorrect r e e e | l Other = num tags changed incorrect -> incorrect e d n r | e ------------------+------------------------------------------------------- 13 14 1 4 | NN->VB if Pos:TO@[-1] 8 8 0 0 | NN->VB if Pos:MD@[-1] 7 10 3 22 | NN->IN if Pos:NNS@[-1] 5 5 0 0 | NN->VBP if Pos:PRP@[-1] 5 5 0 0 | VBD->VBN if Pos:VBZ@[-1] 5 5 0 0 | NNS->NN if Pos:IN@[-1] & Word:asbestos@[0] 4 4 0 0 | NN->-NONE- if Pos:WP@[-1] 4 4 0 3 | NN->NNP if Pos:-NONE-@[-1] 4 6 2 2 | NN->NNP if Pos:NNP@[-1] 4 4 0 0 | NNS->VBZ if Pos:PRP@[-1] >>> tagger1.rules()[1:3] (Rule('000', 'NN', 'VB', [(Pos([-1]),'MD')]), Rule('000', 'NN', 'IN', [(Pos([-1]),'NNS')])) >>> tagger1.print_template_statistics(printunused=False) TEMPLATE STATISTICS (TRAIN) 2 templates, 10 rules) TRAIN ( 2417 tokens) initial 555 0.7704 final: 496 0.7948 #ID | Score (train) | #Rules | Template -------------------------------------------- 000 | 54 0.915 | 9 0.900 | Template(Pos([-1])) 001 | 5 0.085 | 1 0.100 | Template(Pos([-1]),Word([0])) >>> tagger1.accuracy(gold_data) # doctest: +ELLIPSIS 0.769230... >>> print(tagger1.evaluate_per_tag(gold_data, sort_by_count=True)) Tag | Prec. | Recall | F-measure -------+--------+--------+----------- NNP | 0.8298 | 0.3600 | 0.5021 NN | 0.4435 | 0.8364 | 0.5797 IN | 0.8476 | 0.9580 | 0.8994 DT | 0.9819 | 0.8859 | 0.9314 JJ | 0.8167 | 0.2970 | 0.4356 NNS | 0.7464 | 0.9630 | 0.8410 -NONE- | 1.0000 | 0.8414 | 0.9139 , | 1.0000 | 1.0000 | 1.0000 . 
| 1.0000 | 1.0000 | 1.0000 VBD | 0.6723 | 0.8696 | 0.7583 CD | 1.0000 | 0.9872 | 0.9935 CC | 1.0000 | 0.9355 | 0.9667 VB | 0.8103 | 0.8246 | 0.8174 VBN | 0.9130 | 0.4200 | 0.5753 RB | 0.7778 | 0.7447 | 0.7609 TO | 1.0000 | 1.0000 | 1.0000 VBZ | 0.9667 | 0.6905 | 0.8056 VBG | 0.6415 | 0.9444 | 0.7640 PRP$ | 1.0000 | 1.0000 | 1.0000 PRP | 1.0000 | 0.5556 | 0.7143 MD | 1.0000 | 1.0000 | 1.0000 VBP | 0.6316 | 0.6316 | 0.6316 POS | 1.0000 | 1.0000 | 1.0000 $ | 1.0000 | 0.8182 | 0.9000 '' | 1.0000 | 1.0000 | 1.0000 : | 1.0000 | 1.0000 | 1.0000 WDT | 0.4000 | 0.2000 | 0.2667 `` | 1.0000 | 1.0000 | 1.0000 JJR | 1.0000 | 0.5000 | 0.6667 NNPS | 0.0000 | 0.0000 | 0.0000 RBR | 1.0000 | 1.0000 | 1.0000 -LRB- | 0.0000 | 0.0000 | 0.0000 -RRB- | 0.0000 | 0.0000 | 0.0000 RP | 0.6667 | 0.6667 | 0.6667 EX | 0.5000 | 0.5000 | 0.5000 JJS | 0.0000 | 0.0000 | 0.0000 WP | 1.0000 | 1.0000 | 1.0000 PDT | 0.0000 | 0.0000 | 0.0000 AT | 0.0000 | 0.0000 | 0.0000 >>> print(tagger1.confusion(gold_data)) | - | | - N - | | L O R N P | | R N R J J N N N P P P R R V V V V V W | | ' B E B A C C D E I J J J M N N P N D O R P R B R T V B B B B B D W ` | | $ ' , - - - . : T C D T X N J R S D N P S S T S P $ B R P O B D G N P Z T P ` | -------+-------------------------------------------------------------------------------------------------------------------------------------------------------------+ $ | <9> . . . . . . . . . . . . . . . . . 1 . . . . . . . . . . . 1 . . . . . . . . | '' | . <10> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | , | . .<115> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | -LRB- | . . . <.> . . . . . . . . . 1 . . . . 2 . . . . . . . . . . . . . . . . . . . . | -NONE- | . . . .<122> . . . . . . . . 1 . . . . 22 . . . . . . . . . . . . . . . . . . . . | -RRB- | . . . . . <.> . . . . . . . . . . . . 2 1 . . . . . . . . . . . . . . . . . . . | . | . . . . . .<100> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | : | . . . . . . . <10> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | AT | . . . . . . . . <.> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | CC | . . . . . . . . . <58> . . . . . . . . 2 1 . . . . . . . . . . . . . . 1 . . . . | CD | . . . . . . . . . . <77> . . . . . . . 1 . . . . . . . . . . . . . . . . . . . . | DT | . . . . . . . . 1 . .<163> . 5 . . . . 12 . . . . . . . . . . . . . . . . . 3 . . | EX | . . . . . . . . . . . . <1> . . . . . 1 . . . . . . . . . . . . . . . . . . . . | IN | . . . . . . . . . . . . .<228> . . . . 8 . . . . . . . . . . . . . 2 . . . . . . | JJ | . . . . . . . . . . . . . 4 <49> . . . 79 4 . 4 . . . . 6 . . . 1 12 3 . 3 . . . . | JJR | . . . . . . . . . . . . . 2 . <3> . . 1 . . . . . . . . . . . . . . . . . . . . | JJS | . . . . . . . . . . . . . . . . <.> . 2 . . . . . . . . . . . . . . . . . . . . | MD | . . . . . . . . . . . . . . . . . <19> . . . . . . . . . . . . . . . . . . . . . | NN | . . . . . . . . . . . . . 7 9 . . .<271> 16 . 5 . . . . . . . . 7 . 9 . . . . . . | NNP | . . . . . . . . . . . 2 . 7 . . . . 163<117> . 26 . . . . 2 . . . 1 2 5 . . . . . . | NNPS | . . . . . . . . . . . . . . . . . . . 1 <.> 3 . . . . . . . . . . . . . . . . . | NNS | . . . . . . . . . . . . . . . . . . 5 . .<156> . . . . . . . . . . . . . 1 . . . | PDT | . . . . . . . . . . . 1 . . . . . . . . . . <.> . . . . . . . . . . . . . . . . | POS | . . . . . . . . . . . . . . . . . . . . . . . <14> . . . . . . . . . . . . . . . | PRP | . . . . . . . . . . . . . . . . . . 10 . 
. 2 . . <15> . . . . . . . . . . . . . . | PRP$ | . . . . . . . . . . . . . . . . . . . . . . . . . <28> . . . . . . . . . . . . . | RB | . . . . . . . . . . . . 1 4 . . . . 6 . . . . . . . <35> . 1 . . . . . . . . . . | RBR | . . . . . . . . . . . . . . . . . . . . . . . . . . . <4> . . . . . . . . . . . | RP | . . . . . . . . . . . . . . . . . . . . . . . . . . 1 . <2> . . . . . . . . . . | TO | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . <47> . . . . . . . . . | VB | . . . . . . . . . . . . . . 2 . . . 4 . . . . . . . 1 . . . <47> . . . 3 . . . . | VBD | . . . . . . . . . . . . . 1 . . . . 8 1 . . . . . . . . . . . <80> . 2 . . . . . | VBG | . . . . . . . . . . . . . . . . . . 2 . . . . . . . . . . . . . <34> . . . . . . | VBN | . . . . . . . . . . . . . . . . . . 4 . . . . . . . . . . . . 25 . <21> . . . . . | VBP | . . . . . . . . . . . . . 2 . . . . 4 . . . . . . . . . . . 1 . . . <12> . . . . | VBZ | . . . . . . . . . . . . . . . . . . . . . 13 . . . . . . . . . . . . . <29> . . . | WDT | . . . . . . . . . . . . . 7 . . . . 1 . . . . . . . . . . . . . . . . . <2> . . | WP | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . <2> . | `` | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . <10>| -------+-------------------------------------------------------------------------------------------------------------------------------------------------------------+ (row = reference; col = test) >>> tagged, test_stats = tagger1.batch_tag_incremental(testing_data, gold_data) >>> tagged[33][12:] [('foreign', 'NN'), ('debt', 'NN'), ('of', 'IN'), ('$', '$'), ('64', 'CD'), ('billion', 'CD'), ('*U*', '-NONE-'), ('--', ':'), ('the', 'DT'), ('third-highest', 'NN'), ('in', 'IN'), ('the', 'DT'), ('developing', 'VBG'), ('world', 'NN'), ('.', '.')] Regression Tests ~~~~~~~~~~~~~~~~ Sequential Taggers ------------------ Add tests for: - make sure backoff is being done correctly. - make sure ngram taggers don't use previous sentences for context. - make sure ngram taggers see 'beginning of the sentence' as a unique context - make sure regexp tagger's regexps are tried in order - train on some simple examples, & make sure that the size & the generated models are correct. - make sure cutoff works as intended - make sure that ngram models only exclude contexts covered by the backoff tagger if the backoff tagger gets that context correct at *all* locations. Regression Testing for issue #1025 ================================== We want to ensure that a RegexpTagger can be created with more than 100 patterns and does not fail with: "AssertionError: sorry, but this version only supports 100 named groups" >>> from nltk.tag import RegexpTagger >>> patterns = [(str(i), 'NNP',) for i in range(200)] >>> tagger = RegexpTagger(patterns) Regression Testing for issue #2483 ================================== Ensure that tagging with pos_tag (PerceptronTagger) does not throw an IndexError when attempting tagging an empty string. What it must return instead is not strictly defined. >>> from nltk.tag import pos_tag >>> pos_tag(['', 'is', 'a', 'beautiful', 'day']) [...] nltk-3.7/nltk/test/tokenize.doctest000066400000000000000000000464071420073152400175120ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. For license information, see LICENSE.TXT >>> from nltk.tokenize import * Regression Tests: NLTKWordTokenizer ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Tokenizing some test strings. 
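As a quick orientation before the regression cases, here is a minimal usage sketch (kept outside the doctest run); the tokens shown in the comment are what the default ``word_tokenize`` is expected to produce::

    from nltk.tokenize import word_tokenize
    word_tokenize("Good muffins cost $3.88 in New York.")
    # -> ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.']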
>>> s1 = "On a $50,000 mortgage of 30 years at 8 percent, the monthly payment would be $366.88." >>> word_tokenize(s1) ['On', 'a', '$', '50,000', 'mortgage', 'of', '30', 'years', 'at', '8', 'percent', ',', 'the', 'monthly', 'payment', 'would', 'be', '$', '366.88', '.'] >>> s2 = "\"We beat some pretty good teams to get here,\" Slocum said." >>> word_tokenize(s2) ['``', 'We', 'beat', 'some', 'pretty', 'good', 'teams', 'to', 'get', 'here', ',', "''", 'Slocum', 'said', '.'] >>> s3 = "Well, we couldn't have this predictable, cliche-ridden, \"Touched by an Angel\" (a show creator John Masius worked on) wanna-be if she didn't." >>> word_tokenize(s3) ['Well', ',', 'we', 'could', "n't", 'have', 'this', 'predictable', ',', 'cliche-ridden', ',', '``', 'Touched', 'by', 'an', 'Angel', "''", '(', 'a', 'show', 'creator', 'John', 'Masius', 'worked', 'on', ')', 'wanna-be', 'if', 'she', 'did', "n't", '.'] >>> s4 = "I cannot cannot work under these conditions!" >>> word_tokenize(s4) ['I', 'can', 'not', 'can', 'not', 'work', 'under', 'these', 'conditions', '!'] >>> s5 = "The company spent $30,000,000 last year." >>> word_tokenize(s5) ['The', 'company', 'spent', '$', '30,000,000', 'last', 'year', '.'] >>> s6 = "The company spent 40.75% of its income last year." >>> word_tokenize(s6) ['The', 'company', 'spent', '40.75', '%', 'of', 'its', 'income', 'last', 'year', '.'] >>> s7 = "He arrived at 3:00 pm." >>> word_tokenize(s7) ['He', 'arrived', 'at', '3:00', 'pm', '.'] >>> s8 = "I bought these items: books, pencils, and pens." >>> word_tokenize(s8) ['I', 'bought', 'these', 'items', ':', 'books', ',', 'pencils', ',', 'and', 'pens', '.'] >>> s9 = "Though there were 150, 100 of them were old." >>> word_tokenize(s9) ['Though', 'there', 'were', '150', ',', '100', 'of', 'them', 'were', 'old', '.'] >>> s10 = "There were 300,000, but that wasn't enough." >>> word_tokenize(s10) ['There', 'were', '300,000', ',', 'but', 'that', 'was', "n't", 'enough', '.'] >>> s11 = "It's more'n enough." >>> word_tokenize(s11) ['It', "'s", 'more', "'n", 'enough', '.'] Gathering the spans of the tokenized strings. >>> s = '''Good muffins cost $3.88\nin New (York). Please (buy) me\ntwo of them.\n(Thanks).''' >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23), ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38), ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59), ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)] >>> list(NLTKWordTokenizer().span_tokenize(s)) == expected True >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in', ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')', ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.'] >>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected True >>> s = '''I said, "I'd like to buy some ''good muffins" which cost $3.88\n each in New (York)."''' >>> expected = [(0, 1), (2, 6), (6, 7), (8, 9), (9, 10), (10, 12), ... (13, 17), (18, 20), (21, 24), (25, 29), (30, 32), (32, 36), ... (37, 44), (44, 45), (46, 51), (52, 56), (57, 58), (58, 62), ... (64, 68), (69, 71), (72, 75), (76, 77), (77, 81), (81, 82), ... (82, 83), (83, 84)] >>> list(NLTKWordTokenizer().span_tokenize(s)) == expected True >>> expected = ['I', 'said', ',', '"', 'I', "'d", 'like', 'to', ... 'buy', 'some', "''", "good", 'muffins', '"', 'which', 'cost', ... 
'$', '3.88', 'each', 'in', 'New', '(', 'York', ')', '.', '"'] >>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected True Testing improvement made to the TreebankWordTokenizer >>> sx1 = '\xabNow that I can do.\xbb' >>> expected = ['\xab', 'Now', 'that', 'I', 'can', 'do', '.', '\xbb'] >>> word_tokenize(sx1) == expected True >>> sx2 = 'The unicode 201C and 201D \u201cLEFT(RIGHT) DOUBLE QUOTATION MARK\u201d is also OPEN_PUNCT and CLOSE_PUNCT.' >>> expected = ['The', 'unicode', '201C', 'and', '201D', '\u201c', 'LEFT', '(', 'RIGHT', ')', 'DOUBLE', 'QUOTATION', 'MARK', '\u201d', 'is', 'also', 'OPEN_PUNCT', 'and', 'CLOSE_PUNCT', '.'] >>> word_tokenize(sx2) == expected True Testing treebank's detokenizer >>> from nltk.tokenize.treebank import TreebankWordDetokenizer >>> detokenizer = TreebankWordDetokenizer() >>> s = "On a $50,000 mortgage of 30 years at 8 percent, the monthly payment would be $366.88." >>> detokenizer.detokenize(word_tokenize(s)) 'On a $50,000 mortgage of 30 years at 8 percent, the monthly payment would be $366.88.' >>> s = "\"We beat some pretty good teams to get here,\" Slocum said." >>> detokenizer.detokenize(word_tokenize(s)) '"We beat some pretty good teams to get here," Slocum said.' >>> s = "Well, we couldn't have this predictable, cliche-ridden, \"Touched by an Angel\" (a show creator John Masius worked on) wanna-be if she didn't." >>> detokenizer.detokenize(word_tokenize(s)) 'Well, we couldn\'t have this predictable, cliche-ridden, "Touched by an Angel" (a show creator John Masius worked on) wanna-be if she didn\'t.' >>> s = "I cannot cannot work under these conditions!" >>> detokenizer.detokenize(word_tokenize(s)) 'I cannot cannot work under these conditions!' >>> s = "The company spent $30,000,000 last year." >>> detokenizer.detokenize(word_tokenize(s)) 'The company spent $30,000,000 last year.' >>> s = "The company spent 40.75% of its income last year." >>> detokenizer.detokenize(word_tokenize(s)) 'The company spent 40.75% of its income last year.' >>> s = "He arrived at 3:00 pm." >>> detokenizer.detokenize(word_tokenize(s)) 'He arrived at 3:00 pm.' >>> s = "I bought these items: books, pencils, and pens." >>> detokenizer.detokenize(word_tokenize(s)) 'I bought these items: books, pencils, and pens.' >>> s = "Though there were 150, 100 of them were old." >>> detokenizer.detokenize(word_tokenize(s)) 'Though there were 150, 100 of them were old.' >>> s = "There were 300,000, but that wasn't enough." >>> detokenizer.detokenize(word_tokenize(s)) "There were 300,000, but that wasn't enough." >>> s = 'How "are" you?' >>> detokenizer.detokenize(word_tokenize(s)) 'How "are" you?' >>> s = "Hello (world)" >>> detokenizer.detokenize(word_tokenize(s)) 'Hello (world)' >>> s = ' with (many) [kinds] of {parentheses}. "Sometimes it\'s inside (quotes)". ("Sometimes the otherway around").' >>> detokenizer.detokenize(word_tokenize(s)) ' with (many) [kinds] of {parentheses}. "Sometimes it\'s inside (quotes)". ("Sometimes the otherway around").' >>> s = "Sentence ending with (parentheses)" >>> detokenizer.detokenize(word_tokenize(s)) 'Sentence ending with (parentheses)' >>> s = "(Sentence) starting with parentheses." >>> detokenizer.detokenize(word_tokenize(s)) '(Sentence) starting with parentheses.' Sentence tokenization in word_tokenize: >>> s11 = "I called Dr. Jones. I called Dr. Jones." >>> word_tokenize(s11) ['I', 'called', 'Dr.', 'Jones', '.', 'I', 'called', 'Dr.', 'Jones', '.'] >>> s12 = ("Ich muss unbedingt daran denken, Mehl, usw. fur einen " ... 
"Kuchen einzukaufen. Ich muss.") >>> word_tokenize(s12) ['Ich', 'muss', 'unbedingt', 'daran', 'denken', ',', 'Mehl', ',', 'usw', '.', 'fur', 'einen', 'Kuchen', 'einzukaufen', '.', 'Ich', 'muss', '.'] >>> word_tokenize(s12, 'german') ['Ich', 'muss', 'unbedingt', 'daran', 'denken', ',', 'Mehl', ',', 'usw.', 'fur', 'einen', 'Kuchen', 'einzukaufen', '.', 'Ich', 'muss', '.'] Regression Tests: Regexp Tokenizer ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Some additional test strings. >>> s = ("Good muffins cost $3.88\nin New York. Please buy me\n" ... "two of them.\n\nThanks.") >>> s2 = ("Alas, it has not rained today. When, do you think, " ... "will it rain again?") >>> s3 = ("

    <p>Although this is <b>not</b> the case here, we must " ... "not relax our vigilance!</p>
    ") >>> regexp_tokenize(s2, r'[,\.\?!"]\s*', gaps=False) [', ', '. ', ', ', ', ', '?'] >>> regexp_tokenize(s2, r'[,\.\?!"]\s*', gaps=True) ['Alas', 'it has not rained today', 'When', 'do you think', 'will it rain again'] Take care to avoid using capturing groups: >>> regexp_tokenize(s3, r'', gaps=False) ['

    ', '', '', '

    '] >>> regexp_tokenize(s3, r'', gaps=False) ['

    ', '', '', '

    '] >>> regexp_tokenize(s3, r'', gaps=True) ['Although this is ', 'not', ' the case here, we must not relax our vigilance!'] Named groups are capturing groups, and confuse the tokenizer: >>> regexp_tokenize(s3, r'b|p)>', gaps=False) ['p', 'b', 'b', 'p'] >>> regexp_tokenize(s3, r'b|p)>', gaps=True) ['p', 'Although this is ', 'b', 'not', 'b', ' the case here, we must not relax our vigilance!', 'p'] Make sure that nested groups don't confuse the tokenizer: >>> regexp_tokenize(s2, r'(?:h|r|l)a(?:s|(?:i|n0))', gaps=False) ['las', 'has', 'rai', 'rai'] >>> regexp_tokenize(s2, r'(?:h|r|l)a(?:s|(?:i|n0))', gaps=True) ['A', ', it ', ' not ', 'ned today. When, do you think, will it ', 'n again?'] Back-references require capturing groups, and these are not supported: >>> regexp_tokenize("aabbbcccc", r'(.)\1') ['a', 'b', 'c', 'c'] A simple sentence tokenizer '\.(\s+|$)' >>> regexp_tokenize(s, pattern=r'\.(?:\s+|$)', gaps=True) ['Good muffins cost $3.88\nin New York', 'Please buy me\ntwo of them', 'Thanks'] Regression Tests: TweetTokenizer ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ TweetTokenizer is a tokenizer specifically designed for micro-blogging tokenization tasks. >>> from nltk.tokenize import TweetTokenizer >>> tknzr = TweetTokenizer() >>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--" >>> tknzr.tokenize(s0) ['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--'] >>> s1 = "@Joyster2012 @CathStaincliffe Good for you, girl!! Best wishes :-)" >>> tknzr.tokenize(s1) ['@Joyster2012', '@CathStaincliffe', 'Good', 'for', 'you', ',', 'girl', '!', '!', 'Best', 'wishes', ':-)'] >>> s2 = "3Points for #DreamTeam Gooo BAILEY! :) #PBB737Gold @PBBabscbn" >>> tknzr.tokenize(s2) ['3Points', 'for', '#DreamTeam', 'Gooo', 'BAILEY', '!', ':)', '#PBB737Gold', '@PBBabscbn'] >>> s3 = "@Insanomania They do... Their mentality doesn't :(" >>> tknzr.tokenize(s3) ['@Insanomania', 'They', 'do', '...', 'Their', 'mentality', "doesn't", ':('] >>> s4 = "RT @facugambande: Ya por arrancar a grabar !!! #TirenTirenTiren vamoo !!" >>> tknzr.tokenize(s4) ['RT', '@facugambande', ':', 'Ya', 'por', 'arrancar', 'a', 'grabar', '!', '!', '!', '#TirenTirenTiren', 'vamoo', '!', '!'] >>> tknzr = TweetTokenizer(reduce_len=True) >>> s5 = "@crushinghes the summer holidays are great but I'm so bored already :(" >>> tknzr.tokenize(s5) ['@crushinghes', 'the', 'summer', 'holidays', 'are', 'great', 'but', "I'm", 'so', 'bored', 'already', ':('] It is possible to specify `strip_handles` and `reduce_len` parameters for a TweetTokenizer instance. Setting `strip_handles` to True, the tokenizer will remove Twitter handles (e.g. usernames). Setting `reduce_len` to True, repeated character sequences of length 3 or greater will be replaced with sequences of length 3. >>> tknzr = TweetTokenizer(strip_handles=True, reduce_len=True) >>> s6 = '@remy: This is waaaaayyyy too much for you!!!!!!' >>> tknzr.tokenize(s6) [':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!'] >>> s7 = '@_willy65: No place for @chuck tonight. Sorry.' >>> tknzr.tokenize(s7) [':', 'No', 'place', 'for', 'tonight', '.', 'Sorry', '.'] >>> s8 = '@mar_tin is a great developer. Contact him at mar_tin@email.com.' >>> tknzr.tokenize(s8) ['is', 'a', 'great', 'developer', '.', 'Contact', 'him', 'at', 'mar_tin@email.com', '.'] The `preserve_case` parameter (default: True) allows to convert uppercase tokens to lowercase tokens. 
Emoticons are not affected: >>> tknzr = TweetTokenizer(preserve_case=False) >>> s9 = "@jrmy: I'm REALLY HAPPYYY about that! NICEEEE :D :P" >>> tknzr.tokenize(s9) ['@jrmy', ':', "i'm", 'really', 'happyyy', 'about', 'that', '!', 'niceeee', ':D', ':P'] It should not hang on long sequences of the same punctuation character. >>> tknzr = TweetTokenizer() >>> s10 = "Photo: Aujourd'hui sur http://t.co/0gebOFDUzn Projet... http://t.co/bKfIUbydz2.............................. http://fb.me/3b6uXpz0L" >>> tknzr.tokenize(s10) ['Photo', ':', "Aujourd'hui", 'sur', 'http://t.co/0gebOFDUzn', 'Projet', '...', 'http://t.co/bKfIUbydz2', '...', 'http://fb.me/3b6uXpz0L'] Tokenizing multiple sentences at once: >>> tknzr = TweetTokenizer() >>> sentences = [ ... "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--", ... "@jrmy: I'm REALLY HAPPYYY about that! NICEEEE :D :P", ... "@_willy65: No place for @chuck tonight. Sorry." ... ] >>> tknzr.tokenize_sents(sentences) # doctest: +NORMALIZE_WHITESPACE [['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--'], ['@jrmy', ':', "I'm", 'REALLY', 'HAPPYYY', 'about', 'that', '!', 'NICEEEE', ':D', ':P'], ['@_willy65', ':', 'No', 'place', 'for', '@chuck', 'tonight', '.', 'Sorry', '.']] Regression Tests: PunktSentenceTokenizer ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The sentence splitter should remove whitespace following the sentence boundary. >>> pst = PunktSentenceTokenizer() >>> pst.tokenize('See Section 3). Or Section 2). ') ['See Section 3).', 'Or Section 2).'] >>> pst.tokenize('See Section 3.) Or Section 2.) ') ['See Section 3.)', 'Or Section 2.)'] >>> pst.tokenize('See Section 3.) Or Section 2.) ', realign_boundaries=False) ['See Section 3.', ') Or Section 2.', ')'] Two instances of PunktSentenceTokenizer should not share PunktParameters. >>> pst = PunktSentenceTokenizer() >>> pst2 = PunktSentenceTokenizer() >>> pst._params is pst2._params False Testing mutable default arguments for https://github.com/nltk/nltk/pull/2067 >>> from nltk.tokenize.punkt import PunktBaseClass, PunktTrainer, PunktSentenceTokenizer >>> from nltk.tokenize.punkt import PunktLanguageVars, PunktParameters >>> pbc = PunktBaseClass(lang_vars=None, params=None) >>> type(pbc._params) >>> type(pbc._lang_vars) >>> pt = PunktTrainer(lang_vars=None) >>> type(pt._lang_vars) >>> pst = PunktSentenceTokenizer(lang_vars=None) >>> type(pst._lang_vars) Testing that inputs can start with dots. >>> pst = PunktSentenceTokenizer(lang_vars=None) >>> pst.tokenize(". This input starts with a dot. This used to cause issues.") ['.', 'This input starts with a dot.', 'This used to cause issues.'] Regression Tests: align_tokens ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Post-hoc alignment of tokens with a source string >>> from nltk.tokenize.util import align_tokens >>> list(align_tokens([''], "")) [(0, 0)] >>> list(align_tokens([''], " ")) [(0, 0)] >>> list(align_tokens([], "")) [] >>> list(align_tokens([], " ")) [] >>> list(align_tokens(['a'], "a")) [(0, 1)] >>> list(align_tokens(['abc', 'def'], "abcdef")) [(0, 3), (3, 6)] >>> list(align_tokens(['abc', 'def'], "abc def")) [(0, 3), (4, 7)] >>> list(align_tokens(['ab', 'cd'], "ab cd ef")) [(0, 2), (3, 5)] >>> list(align_tokens(['ab', 'cd', 'ef'], "ab cd ef")) [(0, 2), (3, 5), (6, 8)] >>> list(align_tokens(['ab', 'cd', 'efg'], "ab cd ef")) Traceback (most recent call last): .... 
ValueError: substring "efg" not found in "ab cd ef" >>> list(align_tokens(['ab', 'cd', 'ef', 'gh'], "ab cd ef")) Traceback (most recent call last): .... ValueError: substring "gh" not found in "ab cd ef" >>> list(align_tokens(['The', 'plane', ',', 'bound', 'for', 'St', 'Petersburg', ',', 'crashed', 'in', 'Egypt', "'s", 'Sinai', 'desert', 'just', '23', 'minutes', 'after', 'take-off', 'from', 'Sharm', 'el-Sheikh', 'on', 'Saturday', '.'], "The plane, bound for St Petersburg, crashed in Egypt's Sinai desert just 23 minutes after take-off from Sharm el-Sheikh on Saturday.")) [(0, 3), (4, 9), (9, 10), (11, 16), (17, 20), (21, 23), (24, 34), (34, 35), (36, 43), (44, 46), (47, 52), (52, 54), (55, 60), (61, 67), (68, 72), (73, 75), (76, 83), (84, 89), (90, 98), (99, 103), (104, 109), (110, 119), (120, 122), (123, 131), (131, 132)] Regression Tests: MWETokenizer ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Pickle an MWETokenizer >>> from nltk.tokenize import MWETokenizer >>> import pickle >>> tokenizer = MWETokenizer([('hors', "d'oeuvre")], separator='+') >>> p = pickle.dumps(tokenizer) >>> unpickeled = pickle.loads(p) >>> unpickeled.tokenize("An hors d'oeuvre tonight, sir?".split()) ['An', "hors+d'oeuvre", 'tonight,', 'sir?'] Regression Tests: TextTilingTokenizer ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ TextTilingTokenizer tokenizes text into coherent subtopic chunks based upon Hearst's TextTiling algorithm. >>> from nltk.tokenize import TextTilingTokenizer >>> from nltk.corpus import brown >>> tt = TextTilingTokenizer() >>> tt.tokenize(brown.raw()[0:1000]) ["\n\n\tThe/at Fulton/np-tl County/nn-tl Grand/jj-tl Jury/nn-tl said/vbd Friday/nr an/at investigation/nn of/in Atlanta's/np$ recent/jj primary/nn election/nn produced/vbd ``/`` no/at evidence/nn ''/'' that/cs any/dti irregularities/nns took/vbd place/nn ./.\n\n\n\tThe/at jury/nn further/rbr said/vbd in/in term-end/nn presentments/nns that/cs the/at City/nn-tl Executive/jj-tl Committee/nn-tl ,/, which/wdt had/hvd over-all/jj charge/nn of/in the/at election/nn ,/, ``/`` deserves/vbz the/at praise/nn and/cc thanks/nns of/in the/at City/nn-tl of/in-tl Atlanta/np-tl ''/'' for/in the/at manner/nn in/in which/wdt the/at election/nn was/bedz conducted/vbn ./.\n\n\n\tThe/at September-October/np term/nn jury/nn had/hvd been/ben charged/vbn by/in Fulton/np-tl Superior/jj-tl Court/nn-tl Judge/nn-tl Durwood/np Pye/np to/to investigate/vb reports/nns of/in possible/jj ``/`` irregularities/nns ''/'' in/in the/at hard-fought/jj primary/nn which/wdt was/bedz won/vbn by/in Mayor-nominate/nn-tl Ivan/np Allen/np Jr./"] Test that `ValueError` exceptions are raised when illegal arguments are used. >>> TextTilingTokenizer(similarity_method='foo').tokenize(brown.raw()[0:1000]) Traceback (most recent call last): ... ValueError: Similarity method foo not recognized >>> TextTilingTokenizer(smoothing_method='bar').tokenize(brown.raw()[0:1000]) Traceback (most recent call last): ... ValueError: Smoothing method bar not recognized nltk-3.7/nltk/test/toolbox.doctest000066400000000000000000000234411420073152400173410ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. 
For license information, see LICENSE.TXT =============================== Unit test cases for ``toolbox`` =============================== >>> from nltk import toolbox -------------------------- ``toolbox.StandardFormat`` -------------------------- >>> f = toolbox.StandardFormat() ``toolbox.StandardFormat.open()`` --------------------------------- >>> import os, tempfile >>> (fd, fname) = tempfile.mkstemp() >>> tf = os.fdopen(fd, "w") >>> _ = tf.write('\\lx a value\n\\lx another value\n') >>> tf.close() >>> f = toolbox.StandardFormat() >>> f.open(fname) >>> list(f.fields()) [('lx', 'a value'), ('lx', 'another value')] >>> f.close() >>> os.unlink(fname) ``toolbox.StandardFormat.open_string()`` ---------------------------------------- >>> f = toolbox.StandardFormat() >>> f.open_string('\\lx a value\n\\lx another value\n') >>> list(f.fields()) [('lx', 'a value'), ('lx', 'another value')] >>> f.close() ``toolbox.StandardFormat.close()`` ---------------------------------- >>> f = toolbox.StandardFormat() >>> f.open_string('\\lx a value\n\\lx another value\n') >>> list(f.fields()) [('lx', 'a value'), ('lx', 'another value')] >>> f.close() ``toolbox.StandardFormat.line_num`` --------------------------------------- ``StandardFormat.line_num`` contains the line number of the last line returned: >>> f = toolbox.StandardFormat() >>> f.open_string('\\lx a value\n\\lx another value\n\\lx a third value\n') >>> line_nums = [] >>> for l in f.raw_fields(): ... line_nums.append(f.line_num) >>> line_nums [1, 2, 3] ``StandardFormat.line_num`` contains the line number of the last line returned: >>> f = toolbox.StandardFormat() >>> f.open_string('\\lx two\nlines\n\\lx three\nlines\n\n\\lx two\nlines\n') >>> line_nums = [] >>> for l in f.raw_fields(): ... line_nums.append(f.line_num) >>> line_nums [2, 5, 7] ``StandardFormat.line_num`` doesn't exist before opening or after closing a file or string: >>> f = toolbox.StandardFormat() >>> f.line_num Traceback (most recent call last): ... AttributeError: 'StandardFormat' object has no attribute 'line_num' >>> f.open_string('\\lx two\nlines\n\\lx three\nlines\n\n\\lx two\nlines\n') >>> line_nums = [] >>> for l in f.raw_fields(): ... line_nums.append(f.line_num) >>> line_nums [2, 5, 7] >>> f.close() >>> f.line_num Traceback (most recent call last): ... AttributeError: 'StandardFormat' object has no attribute 'line_num' ``toolbox.StandardFormat.raw_fields()`` --------------------------------------- ``raw_fields()`` returns an iterator over tuples of two strings representing the marker and its value. 
The marker is given without the backslash and the value without its trailing newline: >>> f = toolbox.StandardFormat() >>> f.open_string('\\lx a value\n\\lx another value\n') >>> list(f.raw_fields()) [('lx', 'a value'), ('lx', 'another value')] an empty file returns nothing: >>> f = toolbox.StandardFormat() >>> f.open_string('') >>> list(f.raw_fields()) [] file with only a newline returns WHAT SHOULD IT RETURN???: >>> f = toolbox.StandardFormat() >>> f.open_string('\n') >>> list(f.raw_fields()) [(None, '')] file with only one field should be parsed ok: >>> f = toolbox.StandardFormat() >>> f.open_string('\\lx one value\n') >>> list(f.raw_fields()) [('lx', 'one value')] file without a trailing newline should be parsed ok: >>> f = toolbox.StandardFormat() >>> f.open_string('\\lx a value\n\\lx another value') >>> list(f.raw_fields()) [('lx', 'a value'), ('lx', 'another value')] trailing white space is preserved except for the final newline: >>> f = toolbox.StandardFormat() >>> f.open_string('\\lx trailing space \n\\lx trailing tab\t\n\\lx extra newline\n\n') >>> list(f.raw_fields()) [('lx', 'trailing space '), ('lx', 'trailing tab\t'), ('lx', 'extra newline\n')] line wrapping is preserved: >>> f = toolbox.StandardFormat() >>> f.open_string('\\lx a value\nmore of the value\nand still more\n\\lc another val\n') >>> list(f.raw_fields()) [('lx', 'a value\nmore of the value\nand still more'), ('lc', 'another val')] file beginning with a multiline record should be parsed ok: >>> f = toolbox.StandardFormat() >>> f.open_string('\\lx a value\nmore of the value\nand still more\n\\lc another val\n') >>> list(f.raw_fields()) [('lx', 'a value\nmore of the value\nand still more'), ('lc', 'another val')] file ending with a multiline record should be parsed ok: >>> f = toolbox.StandardFormat() >>> f.open_string('\\lc a value\n\\lx another value\nmore of the value\nand still more\n') >>> list(f.raw_fields()) [('lc', 'a value'), ('lx', 'another value\nmore of the value\nand still more')] file beginning with a BOM should be parsed ok: >>> f = toolbox.StandardFormat() >>> f.open_string('\xef\xbb\xbf\\lx a value\n\\lx another value\n') >>> list(f.raw_fields()) [('lx', 'a value'), ('lx', 'another value')] file beginning with two BOMs should ignore only the first one: >>> f = toolbox.StandardFormat() >>> f.open_string('\xef\xbb\xbf\xef\xbb\xbf\\lx a value\n\\lx another value\n') >>> list(f.raw_fields()) [(None, '\xef\xbb\xbf\\lx a value'), ('lx', 'another value')] should not ignore a BOM not at the beginning of the file: >>> f = toolbox.StandardFormat() >>> f.open_string('\\lx a value\n\xef\xbb\xbf\\lx another value\n') >>> list(f.raw_fields()) [('lx', 'a value\n\xef\xbb\xbf\\lx another value')] ``toolbox.StandardFormat.fields()`` ----------------------------------- trailing white space is not preserved: >>> f = toolbox.StandardFormat() >>> f.open_string('\\lx trailing space \n\\lx trailing tab\t\n\\lx extra newline\n\n') >>> list(f.fields()) [('lx', 'trailing space'), ('lx', 'trailing tab'), ('lx', 'extra newline')] multiline fields are unwrapped: >>> f = toolbox.StandardFormat() >>> f.open_string('\\lx a value\nmore of the value\nand still more\n\\lc another val\n') >>> list(f.fields()) [('lx', 'a value more of the value and still more'), ('lc', 'another val')] markers ------- A backslash in the first position on a new line indicates the start of a marker. 
The backslash is not part of the marker: >>> f = toolbox.StandardFormat() >>> f.open_string('\\mk a value\n') >>> list(f.fields()) [('mk', 'a value')] If the backslash occurs later in the line it does not indicate the start of a marker: >>> f = toolbox.StandardFormat() >>> f.open_string('\\mk a value\n \\mk another one\n') >>> list(f.raw_fields()) [('mk', 'a value\n \\mk another one')] There is no specific limit to the length of a marker: >>> f = toolbox.StandardFormat() >>> f.open_string('\\this_is_an_extremely_long_marker value\n') >>> list(f.fields()) [('this_is_an_extremely_long_marker', 'value')] A marker can contain any non white space character: >>> f = toolbox.StandardFormat() >>> f.open_string('\\`~!@#$%^&*()_-=+[{]}\\|,<.>/?;:"0123456789 value\n') >>> list(f.fields()) [('`~!@#$%^&*()_-=+[{]}\\|,<.>/?;:"0123456789', 'value')] A marker is terminated by any white space character: >>> f = toolbox.StandardFormat() >>> f.open_string('\\mk a value\n\\mk\tanother one\n\\mk\rthird one\n\\mk\ffourth one') >>> list(f.fields()) [('mk', 'a value'), ('mk', 'another one'), ('mk', 'third one'), ('mk', 'fourth one')] Consecutive whitespace characters (except newline) are treated the same as one: >>> f = toolbox.StandardFormat() >>> f.open_string('\\mk \t\r\fa value\n') >>> list(f.fields()) [('mk', 'a value')] ----------------------- ``toolbox.ToolboxData`` ----------------------- >>> db = toolbox.ToolboxData() ``toolbox.ToolboxData.parse()`` ------------------------------- check that normal parsing works: >>> from xml.etree import ElementTree >>> td = toolbox.ToolboxData() >>> s = """\\_sh v3.0 400 Rotokas Dictionary ... \\_DateStampHasFourDigitYear ... ... \\lx kaa ... \\ps V.A ... \\ge gag ... \\gp nek i pas ... ... \\lx kaa ... \\ps V.B ... \\ge strangle ... \\gp pasim nek ... """ >>> td.open_string(s) >>> tree = td.parse(key='lx') >>> tree.tag 'toolbox_data' >>> ElementTree.tostring(list(tree)[0]).decode('utf8') '
    <header><_sh>v3.0 400 Rotokas Dictionary</_sh><_DateStampHasFourDigitYear /></header>
    ' >>> ElementTree.tostring(list(tree)[1]).decode('utf8') '<record><lx>kaa</lx><ps>V.A</ps><ge>gag</ge><gp>nek i pas</gp></record>' >>> ElementTree.tostring(list(tree)[2]).decode('utf8') '<record><lx>kaa</lx><ps>V.B</ps><ge>strangle</ge><gp>pasim nek</gp></record>' check that guessing the key marker works: >>> from xml.etree import ElementTree >>> td = toolbox.ToolboxData() >>> s = """\\_sh v3.0 400 Rotokas Dictionary ... \\_DateStampHasFourDigitYear ... ... \\lx kaa ... \\ps V.A ... \\ge gag ... \\gp nek i pas ... ... \\lx kaa ... \\ps V.B ... \\ge strangle ... \\gp pasim nek ... """ >>> td.open_string(s) >>> tree = td.parse() >>> ElementTree.tostring(list(tree)[0]).decode('utf8') '
    <header><_sh>v3.0 400 Rotokas Dictionary</_sh><_DateStampHasFourDigitYear /></header>
    ' >>> ElementTree.tostring(list(tree)[1]).decode('utf8') 'kaaV.Agagnek i pas' >>> ElementTree.tostring(list(tree)[2]).decode('utf8') 'kaaV.Bstranglepasim nek' ----------------------- ``toolbox`` functions ----------------------- ``toolbox.to_sfm_string()`` ------------------------------- nltk-3.7/nltk/test/toy.cfg000066400000000000000000000002121420073152400155470ustar00rootroot00000000000000S -> NP VP PP -> P NP NP -> Det N | NP PP VP -> V NP | VP PP Det -> 'a' | 'the' N -> 'dog' | 'cat' V -> 'chased' | 'sat' P -> 'on' | 'in' nltk-3.7/nltk/test/translate.doctest000066400000000000000000000177341420073152400176600ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. For license information, see LICENSE.TXT .. -*- coding: utf-8 -*- ========= Alignment ========= Corpus Reader ------------- >>> from nltk.corpus import comtrans >>> words = comtrans.words('alignment-en-fr.txt') >>> for word in words[:6]: ... print(word) Resumption of the session I declare >>> als = comtrans.aligned_sents('alignment-en-fr.txt')[0] >>> als AlignedSent(['Resumption', 'of', 'the', 'session'], ['Reprise', 'de', 'la', 'session'], Alignment([(0, 0), (1, 1), (2, 2), (3, 3)])) Alignment Objects ----------------- Aligned sentences are simply a mapping between words in a sentence: >>> print(" ".join(als.words)) Resumption of the session >>> print(" ".join(als.mots)) Reprise de la session >>> als.alignment Alignment([(0, 0), (1, 1), (2, 2), (3, 3)]) Usually we look at them from the perspective of a source to a target language, but they are easily inverted: >>> als.invert() AlignedSent(['Reprise', 'de', 'la', 'session'], ['Resumption', 'of', 'the', 'session'], Alignment([(0, 0), (1, 1), (2, 2), (3, 3)])) We can create new alignments, but these need to be in the correct range of the corresponding sentences: >>> from nltk.translate import Alignment, AlignedSent >>> als = AlignedSent(['Reprise', 'de', 'la', 'session'], ... ['Resumption', 'of', 'the', 'session'], ... Alignment([(0, 0), (1, 4), (2, 1), (3, 3)])) Traceback (most recent call last): ... IndexError: Alignment is outside boundary of mots You can set alignments with any sequence of tuples, so long as the first two indexes of the tuple are the alignment indices: >>> als.alignment = Alignment([(0, 0), (1, 1), (2, 2, "boat"), (3, 3, False, (1,2))]) >>> Alignment([(0, 0), (1, 1), (2, 2, "boat"), (3, 3, False, (1,2))]) Alignment([(0, 0), (1, 1), (2, 2, 'boat'), (3, 3, False, (1, 2))]) Alignment Algorithms -------------------- EM for IBM Model 1 ~~~~~~~~~~~~~~~~~~ Here is an example from Koehn, 2010: >>> from nltk.translate import IBMModel1 >>> corpus = [AlignedSent(['the', 'house'], ['das', 'Haus']), ... AlignedSent(['the', 'book'], ['das', 'Buch']), ... 
AlignedSent(['a', 'book'], ['ein', 'Buch'])] >>> em_ibm1 = IBMModel1(corpus, 20) >>> print(round(em_ibm1.translation_table['the']['das'], 1)) 1.0 >>> print(round(em_ibm1.translation_table['book']['das'], 1)) 0.0 >>> print(round(em_ibm1.translation_table['house']['das'], 1)) 0.0 >>> print(round(em_ibm1.translation_table['the']['Buch'], 1)) 0.0 >>> print(round(em_ibm1.translation_table['book']['Buch'], 1)) 1.0 >>> print(round(em_ibm1.translation_table['a']['Buch'], 1)) 0.0 >>> print(round(em_ibm1.translation_table['book']['ein'], 1)) 0.0 >>> print(round(em_ibm1.translation_table['a']['ein'], 1)) 1.0 >>> print(round(em_ibm1.translation_table['the']['Haus'], 1)) 0.0 >>> print(round(em_ibm1.translation_table['house']['Haus'], 1)) 1.0 >>> print(round(em_ibm1.translation_table['book'][None], 1)) 0.5 And using an NLTK corpus. We train on only 10 sentences, since it is so slow: >>> from nltk.corpus import comtrans >>> com_ibm1 = IBMModel1(comtrans.aligned_sents()[:10], 20) >>> print(round(com_ibm1.translation_table['bitte']['Please'], 1)) 0.2 >>> print(round(com_ibm1.translation_table['Sitzungsperiode']['session'], 1)) 1.0 Evaluation ---------- The evaluation metrics for alignments are usually not interested in the contents of alignments but more often the comparison to a "gold standard" alignment that has been been constructed by human experts. For this reason we often want to work just with raw set operations against the alignment points. This then gives us a very clean form for defining our evaluation metrics. .. Note:: The AlignedSent class has no distinction of "possible" or "sure" alignments. Thus all alignments are treated as "sure". Consider the following aligned sentence for evaluation: >>> my_als = AlignedSent(['Resumption', 'of', 'the', 'session'], ... ['Reprise', 'de', 'la', 'session'], ... Alignment([(0, 0), (3, 3), (1, 2), (1, 1), (1, 3)])) Precision ~~~~~~~~~ ``precision = |A∩P| / |A|`` **Precision** is probably the most well known evaluation metric and it is implemented in `nltk.metrics.scores.precision`_. Since precision is simply interested in the proportion of correct alignments, we calculate the ratio of the number of our test alignments (*A*) that match a possible alignment (*P*), over the number of test alignments provided. There is no penalty for missing a possible alignment in our test alignments. An easy way to game this metric is to provide just one test alignment that is in *P* [OCH2000]_. Here are some examples: >>> from nltk.metrics import precision >>> als.alignment = Alignment([(0,0), (1,1), (2,2), (3,3)]) >>> precision(Alignment([]), als.alignment) 0.0 >>> precision(Alignment([(0,0), (1,1), (2,2), (3,3)]), als.alignment) 1.0 >>> precision(Alignment([(0,0), (3,3)]), als.alignment) 0.5 >>> precision(Alignment.fromstring('0-0 3-3'), als.alignment) 0.5 >>> precision(Alignment([(0,0), (1,1), (2,2), (3,3), (1,2), (2,1)]), als.alignment) 1.0 >>> precision(als.alignment, my_als.alignment) 0.6 .. _nltk.metrics.scores.precision: https://www.nltk.org/api/nltk.metrics.html#nltk.metrics.scores.precision Recall ~~~~~~ ``recall = |A∩S| / |S|`` **Recall** is another well known evaluation metric that has a set based implementation in NLTK as `nltk.metrics.scores.recall`_. Since recall is simply interested in the proportion of found alignments, we calculate the ratio of the number of our test alignments (*A*) that match a sure alignment (*S*) over the number of sure alignments. There is no penalty for producing a lot of test alignments. 
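The following sketch (kept outside the doctest run, with made-up alignment sets) illustrates that property: padding the test alignment with extra links leaves recall unchanged::

    from nltk.metrics import recall
    from nltk.translate import Alignment

    sure = Alignment([(0, 0), (1, 1), (2, 2)])
    test = Alignment([(0, 0), (1, 1), (2, 2), (0, 2), (2, 0)])
    # recall = |A ∩ S| / |S| = 3 / 3, despite the two spurious links in A
    recall(sure, test)
    # -> 1.0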
An easy way to game this metric is to include every possible alignment in our test alignments, regardless if they are correct or not [OCH2000]_. Here are some examples: >>> from nltk.metrics import recall >>> print(recall(Alignment([]), als.alignment)) None >>> recall(Alignment([(0,0), (1,1), (2,2), (3,3)]), als.alignment) 1.0 >>> recall(Alignment.fromstring('0-0 3-3'), als.alignment) 1.0 >>> recall(Alignment([(0,0), (3,3)]), als.alignment) 1.0 >>> recall(Alignment([(0,0), (1,1), (2,2), (3,3), (1,2), (2,1)]), als.alignment) 0.66666... >>> recall(als.alignment, my_als.alignment) 0.75 .. _nltk.metrics.scores.recall: https://www.nltk.org/api/nltk.metrics.html#nltk.metrics.scores.recall Alignment Error Rate (AER) ~~~~~~~~~~~~~~~~~~~~~~~~~~ ``AER = 1 - (|A∩S| + |A∩P|) / (|A| + |S|)`` **Alignment Error Rate** is commonly used metric for assessing sentence alignments. It combines precision and recall metrics together such that a perfect alignment must have all of the sure alignments and may have some possible alignments [MIHALCEA2003]_ [KOEHN2010]_. .. Note:: [KOEHN2010]_ defines the AER as ``AER = (|A∩S| + |A∩P|) / (|A| + |S|)`` in his book, but corrects it to the above in his online errata. This is in line with [MIHALCEA2003]_. Here are some examples: >>> from nltk.translate import alignment_error_rate >>> alignment_error_rate(Alignment([]), als.alignment) 1.0 >>> alignment_error_rate(Alignment([(0,0), (1,1), (2,2), (3,3)]), als.alignment) 0.0 >>> alignment_error_rate(als.alignment, my_als.alignment) 0.333333... >>> alignment_error_rate(als.alignment, my_als.alignment, ... als.alignment | Alignment([(1,2), (2,1)])) 0.222222... .. [OCH2000] Och, F. and Ney, H. (2000) *Statistical Machine Translation*, EAMT Workshop .. [MIHALCEA2003] Mihalcea, R. and Pedersen, T. (2003) *An evaluation exercise for word alignment*, HLT-NAACL 2003 .. [KOEHN2010] Koehn, P. (2010) *Statistical Machine Translation*, Cambridge University Press nltk-3.7/nltk/test/tree.doctest000066400000000000000000001317421420073152400166160ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. For license information, see LICENSE.TXT =============================== Unit tests for nltk.tree.Tree =============================== >>> from nltk.tree import * Some trees to run tests on: >>> dp1 = Tree('dp', [Tree('d', ['the']), Tree('np', ['dog'])]) >>> dp2 = Tree('dp', [Tree('d', ['the']), Tree('np', ['cat'])]) >>> vp = Tree('vp', [Tree('v', ['chased']), dp2]) >>> tree = Tree('s', [dp1, vp]) >>> print(tree) (s (dp (d the) (np dog)) (vp (v chased) (dp (d the) (np cat)))) The node label is accessed using the `label()` method: >>> dp1.label(), dp2.label(), vp.label(), tree.label() ('dp', 'dp', 'vp', 's') >>> print(tree[1,1,1,0]) cat The `treepositions` method returns a list of the tree positions of subtrees and leaves in a tree. 
By default, it gives the position of every tree, subtree, and leaf, in prefix order: >>> print(tree.treepositions()) [(), (0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0), (1,), (1, 0), (1, 0, 0), (1, 1), (1, 1, 0), (1, 1, 0, 0), (1, 1, 1), (1, 1, 1, 0)] In addition to `str` and `repr`, several methods exist to convert a tree object to one of several standard tree encodings: >>> print(tree.pformat_latex_qtree()) \Tree [.s [.dp [.d the ] [.np dog ] ] [.vp [.v chased ] [.dp [.d the ] [.np cat ] ] ] ] There is also a fancy ASCII art representation: >>> tree.pretty_print() s ________|_____ | vp | _____|___ dp | dp ___|___ | ___|___ d np v d np | | | | | the dog chased the cat >>> tree.pretty_print(unicodelines=True, nodedist=4) s ┌──────────────┴────────┐ │ vp │ ┌────────┴──────┐ dp │ dp ┌──────┴──────┐ │ ┌──────┴──────┐ d np v d np │ │ │ │ │ the dog chased the cat Trees can be initialized from treebank strings: >>> tree2 = Tree.fromstring('(S (NP I) (VP (V enjoyed) (NP my cookie)))') >>> print(tree2) (S (NP I) (VP (V enjoyed) (NP my cookie))) Trees can be compared for equality: >>> tree == Tree.fromstring(str(tree)) True >>> tree2 == Tree.fromstring(str(tree2)) True >>> tree == tree2 False >>> tree == Tree.fromstring(str(tree2)) False >>> tree2 == Tree.fromstring(str(tree)) False >>> tree != Tree.fromstring(str(tree)) False >>> tree2 != Tree.fromstring(str(tree2)) False >>> tree != tree2 True >>> tree != Tree.fromstring(str(tree2)) True >>> tree2 != Tree.fromstring(str(tree)) True >>> tree < tree2 or tree > tree2 True Tree Parsing ============ The class method `Tree.fromstring()` can be used to parse trees, and it provides some additional options. >>> tree = Tree.fromstring('(S (NP I) (VP (V enjoyed) (NP my cookie)))') >>> print(tree) (S (NP I) (VP (V enjoyed) (NP my cookie))) When called on a subclass of `Tree`, it will create trees of that type: >>> tree = ImmutableTree.fromstring('(VP (V enjoyed) (NP my cookie))') >>> print(tree) (VP (V enjoyed) (NP my cookie)) >>> print(type(tree)) >>> tree[1] = 'x' Traceback (most recent call last): . . . ValueError: ImmutableTree may not be modified >>> del tree[0] Traceback (most recent call last): . . . ValueError: ImmutableTree may not be modified The ``brackets`` parameter can be used to specify two characters that should be used as brackets: >>> print(Tree.fromstring('[S [NP I] [VP [V enjoyed] [NP my cookie]]]', ... brackets='[]')) (S (NP I) (VP (V enjoyed) (NP my cookie))) >>> print(Tree.fromstring(' >>', ... brackets='<>')) (S (NP I) (VP (V enjoyed) (NP my cookie))) If ``brackets`` is not a string, or is not exactly two characters, then `Tree.fromstring` raises an exception: >>> Tree.fromstring(' >', brackets='') Traceback (most recent call last): . . . TypeError: brackets must be a length-2 string >>> Tree.fromstring(' >', brackets='<<>>') Traceback (most recent call last): . . . TypeError: brackets must be a length-2 string >>> Tree.fromstring(' >', brackets=12) Traceback (most recent call last): . . . TypeError: brackets must be a length-2 string >>> Tree.fromstring('<>', brackets=('<<','>>')) Traceback (most recent call last): . . . TypeError: brackets must be a length-2 string (We may add support for multi-character brackets in the future, in which case the ``brackets=('<<','>>')`` example would start working.) Whitespace brackets are not permitted: >>> Tree.fromstring('(NP my cookie\n', brackets='(\n') Traceback (most recent call last): . . . 
TypeError: whitespace brackets not allowed If an invalid tree is given to Tree.fromstring, then it raises a ValueError, with a description of the problem: >>> Tree.fromstring('(NP my cookie) (NP my milk)') Traceback (most recent call last): . . . ValueError: Tree.fromstring(): expected 'end-of-string' but got '(NP' at index 15. "...y cookie) (NP my mil..." ^ >>> Tree.fromstring(')NP my cookie(') Traceback (most recent call last): . . . ValueError: Tree.fromstring(): expected '(' but got ')' at index 0. ")NP my coo..." ^ >>> Tree.fromstring('(NP my cookie))') Traceback (most recent call last): . . . ValueError: Tree.fromstring(): expected 'end-of-string' but got ')' at index 14. "...my cookie))" ^ >>> Tree.fromstring('my cookie)') Traceback (most recent call last): . . . ValueError: Tree.fromstring(): expected '(' but got 'my' at index 0. "my cookie)" ^ >>> Tree.fromstring('(NP my cookie') Traceback (most recent call last): . . . ValueError: Tree.fromstring(): expected ')' but got 'end-of-string' at index 13. "... my cookie" ^ >>> Tree.fromstring('') Traceback (most recent call last): . . . ValueError: Tree.fromstring(): expected '(' but got 'end-of-string' at index 0. "" ^ Trees with no children are supported: >>> print(Tree.fromstring('(S)')) (S ) >>> print(Tree.fromstring('(X (Y) (Z))')) (X (Y ) (Z )) Trees with an empty node label and no children are supported: >>> print(Tree.fromstring('()')) ( ) >>> print(Tree.fromstring('(X () ())')) (X ( ) ( )) Trees with an empty node label and children are supported, but only if the first child is not a leaf (otherwise, it will be treated as the node label). >>> print(Tree.fromstring('((A) (B) (C))')) ( (A ) (B ) (C )) >>> print(Tree.fromstring('((A) leaf)')) ( (A ) leaf) >>> print(Tree.fromstring('(((())))')) ( ( ( ( )))) The optional arguments `read_node` and `read_leaf` may be used to transform the string values of nodes or leaves. >>> print(Tree.fromstring('(A b (C d e) (F (G h i)))', ... read_node=lambda s: '<%s>' % s, ... read_leaf=lambda s: '"%s"' % s)) (
    "b" ( "d" "e") ( ( "h" "i"))) These transformation functions are typically used when the node or leaf labels should be parsed to a non-string value (such as a feature structure). If node and leaf labels need to be able to include whitespace, then you must also use the optional `node_pattern` and `leaf_pattern` arguments. >>> from nltk.featstruct import FeatStruct >>> tree = Tree.fromstring('([cat=NP] [lex=the] [lex=dog])', ... read_node=FeatStruct, read_leaf=FeatStruct) >>> tree.set_label(tree.label().unify(FeatStruct('[num=singular]'))) >>> print(tree) ([cat='NP', num='singular'] [lex='the'] [lex='dog']) The optional argument ``remove_empty_top_bracketing`` can be used to remove any top-level empty bracketing that occurs. >>> print(Tree.fromstring('((S (NP I) (VP (V enjoyed) (NP my cookie))))', ... remove_empty_top_bracketing=True)) (S (NP I) (VP (V enjoyed) (NP my cookie))) It will not remove a top-level empty bracketing with multiple children: >>> print(Tree.fromstring('((A a) (B b))')) ( (A a) (B b)) Tree.fromlist() --------------- The class method `Tree.fromlist()` can be used to parse trees that are expressed as nested lists, such as those produced by the tree() function from the wordnet module. >>> from nltk.corpus import wordnet as wn >>> t=Tree.fromlist(wn.synset('dog.n.01').tree(lambda s:s.hypernyms())) >>> print(t.height()) 14 >>> print(t.leaves()) ["Synset('entity.n.01')", "Synset('entity.n.01')"] >>> t.pretty_print() Synset('dog.n.01') _________________|__________________ Synset('canine.n. | 02') | | | Synset('carnivor | e.n.01') | | | Synset('placenta | l.n.01') | | | Synset('mammal.n. | 01') | | | Synset('vertebra | te.n.01') | | | Synset('chordate. Synset('domestic n.01') _animal.n.01') | | Synset('animal.n. Synset('animal.n. 01') 01') | | Synset('organism. Synset('organism. n.01') n.01') | | Synset('living_t Synset('living_t hing.n.01') hing.n.01') | | Synset('whole.n. Synset('whole.n. 02') 02') | | Synset('object.n. Synset('object.n. 01') 01') | | Synset('physical Synset('physical _entity.n.01') _entity.n.01') | | Synset('entity.n. Synset('entity.n. 01') 01') Parented Trees ============== `ParentedTree` is a subclass of `Tree` that automatically maintains parent pointers for single-parented trees. Parented trees can be created directly from a node label and a list of children: >>> ptree = ( ... ParentedTree('VP', [ ... ParentedTree('VERB', ['saw']), ... ParentedTree('NP', [ ... ParentedTree('DET', ['the']), ... ParentedTree('NOUN', ['dog'])])])) >>> print(ptree) (VP (VERB saw) (NP (DET the) (NOUN dog))) Parented trees can be created from strings using the classmethod `ParentedTree.fromstring`: >>> ptree = ParentedTree.fromstring('(VP (VERB saw) (NP (DET the) (NOUN dog)))') >>> print(ptree) (VP (VERB saw) (NP (DET the) (NOUN dog))) >>> print(type(ptree)) Parented trees can also be created by using the classmethod `ParentedTree.convert` to convert another type of tree to a parented tree: >>> tree = Tree.fromstring('(VP (VERB saw) (NP (DET the) (NOUN dog)))') >>> ptree = ParentedTree.convert(tree) >>> print(ptree) (VP (VERB saw) (NP (DET the) (NOUN dog))) >>> print(type(ptree)) .. clean-up: >>> del tree `ParentedTree`\ s should never be used in the same tree as `Tree`\ s or `MultiParentedTree`\ s. Mixing tree implementations may result in incorrect parent pointers and in `TypeError` exceptions: >>> # Inserting a Tree in a ParentedTree gives an exception: >>> ParentedTree('NP', [ ... Tree('DET', ['the']), Tree('NOUN', ['dog'])]) Traceback (most recent call last): . . . 
TypeError: Can not insert a non-ParentedTree into a ParentedTree >>> # inserting a ParentedTree in a Tree gives incorrect parent pointers: >>> broken_tree = Tree('NP', [ ... ParentedTree('DET', ['the']), ParentedTree('NOUN', ['dog'])]) >>> print(broken_tree[0].parent()) None Parented Tree Methods ------------------------ In addition to all the methods defined by the `Tree` class, the `ParentedTree` class adds six new methods whose values are automatically updated whenever a parented tree is modified: `parent()`, `parent_index()`, `left_sibling()`, `right_sibling()`, `root()`, and `treeposition()`. The `parent()` method contains a `ParentedTree`\ 's parent, if it has one; and ``None`` otherwise. `ParentedTree`\ s that do not have parents are known as "root trees." >>> for subtree in ptree.subtrees(): ... print(subtree) ... print(' Parent = %s' % subtree.parent()) (VP (VERB saw) (NP (DET the) (NOUN dog))) Parent = None (VERB saw) Parent = (VP (VERB saw) (NP (DET the) (NOUN dog))) (NP (DET the) (NOUN dog)) Parent = (VP (VERB saw) (NP (DET the) (NOUN dog))) (DET the) Parent = (NP (DET the) (NOUN dog)) (NOUN dog) Parent = (NP (DET the) (NOUN dog)) The `parent_index()` method stores the index of a tree in its parent's child list. If a tree does not have a parent, then its `parent_index` is ``None``. >>> for subtree in ptree.subtrees(): ... print(subtree) ... print(' Parent Index = %s' % subtree.parent_index()) ... assert (subtree.parent() is None or ... subtree.parent()[subtree.parent_index()] is subtree) (VP (VERB saw) (NP (DET the) (NOUN dog))) Parent Index = None (VERB saw) Parent Index = 0 (NP (DET the) (NOUN dog)) Parent Index = 1 (DET the) Parent Index = 0 (NOUN dog) Parent Index = 1 Note that ``ptree.parent().index(ptree)`` is *not* equivalent to ``ptree.parent_index()``. In particular, ``ptree.parent().index(ptree)`` will return the index of the first child of ``ptree.parent()`` that is equal to ``ptree`` (using ``==``); and that child may not be ``ptree``: >>> on_and_on = ParentedTree('CONJP', [ ... ParentedTree('PREP', ['on']), ... ParentedTree('COJN', ['and']), ... ParentedTree('PREP', ['on'])]) >>> second_on = on_and_on[2] >>> print(second_on.parent_index()) 2 >>> print(second_on.parent().index(second_on)) 0 The methods `left_sibling()` and `right_sibling()` can be used to get a parented tree's siblings. If a tree does not have a left or right sibling, then the corresponding method's value is ``None``: >>> for subtree in ptree.subtrees(): ... print(subtree) ... print(' Left Sibling = %s' % subtree.left_sibling()) ... print(' Right Sibling = %s' % subtree.right_sibling()) (VP (VERB saw) (NP (DET the) (NOUN dog))) Left Sibling = None Right Sibling = None (VERB saw) Left Sibling = None Right Sibling = (NP (DET the) (NOUN dog)) (NP (DET the) (NOUN dog)) Left Sibling = (VERB saw) Right Sibling = None (DET the) Left Sibling = None Right Sibling = (NOUN dog) (NOUN dog) Left Sibling = (DET the) Right Sibling = None A parented tree's root tree can be accessed using the `root()` method. This method follows the tree's parent pointers until it finds a tree without a parent. If a tree does not have a parent, then it is its own root: >>> for subtree in ptree.subtrees(): ... print(subtree) ... 
print(' Root = %s' % subtree.root()) (VP (VERB saw) (NP (DET the) (NOUN dog))) Root = (VP (VERB saw) (NP (DET the) (NOUN dog))) (VERB saw) Root = (VP (VERB saw) (NP (DET the) (NOUN dog))) (NP (DET the) (NOUN dog)) Root = (VP (VERB saw) (NP (DET the) (NOUN dog))) (DET the) Root = (VP (VERB saw) (NP (DET the) (NOUN dog))) (NOUN dog) Root = (VP (VERB saw) (NP (DET the) (NOUN dog))) The `treeposition()` method can be used to find a tree's treeposition relative to its root: >>> for subtree in ptree.subtrees(): ... print(subtree) ... print(' Tree Position = %s' % (subtree.treeposition(),)) ... assert subtree.root()[subtree.treeposition()] is subtree (VP (VERB saw) (NP (DET the) (NOUN dog))) Tree Position = () (VERB saw) Tree Position = (0,) (NP (DET the) (NOUN dog)) Tree Position = (1,) (DET the) Tree Position = (1, 0) (NOUN dog) Tree Position = (1, 1) Whenever a parented tree is modified, all of the methods described above (`parent()`, `parent_index()`, `left_sibling()`, `right_sibling()`, `root()`, and `treeposition()`) are automatically updated. For example, if we replace ``ptree``\ 's subtree for the word "dog" with a new subtree for "cat," the method values for both the "dog" subtree and the "cat" subtree get automatically updated: >>> # Replace the dog with a cat >>> dog = ptree[1,1] >>> cat = ParentedTree('NOUN', ['cat']) >>> ptree[1,1] = cat >>> # the noun phrase is no longer the dog's parent: >>> print(dog.parent(), dog.parent_index(), dog.left_sibling()) None None None >>> # dog is now its own root. >>> print(dog.root()) (NOUN dog) >>> print(dog.treeposition()) () >>> # the cat's parent is now the noun phrase: >>> print(cat.parent()) (NP (DET the) (NOUN cat)) >>> print(cat.parent_index()) 1 >>> print(cat.left_sibling()) (DET the) >>> print(cat.root()) (VP (VERB saw) (NP (DET the) (NOUN cat))) >>> print(cat.treeposition()) (1, 1) ParentedTree Regression Tests ----------------------------- Keep track of all trees that we create (including subtrees) using this variable: >>> all_ptrees = [] Define a helper function to create new parented trees: >>> def make_ptree(s): ... ptree = ParentedTree.convert(Tree.fromstring(s)) ... all_ptrees.extend(t for t in ptree.subtrees() ... if isinstance(t, Tree)) ... return ptree Define a test function that examines every subtree in all_ptrees; and checks that all six of its methods are defined correctly. If any ptrees are passed as arguments, then they are printed. >>> def pcheck(*print_ptrees): ... for ptree in all_ptrees: ... # Check ptree's methods. ... if ptree.parent() is not None: ... i = ptree.parent_index() ... assert ptree.parent()[i] is ptree ... if i > 0: ... assert ptree.left_sibling() is ptree.parent()[i-1] ... if i < (len(ptree.parent())-1): ... assert ptree.right_sibling() is ptree.parent()[i+1] ... assert len(ptree.treeposition()) > 0 ... assert (ptree.treeposition() == ... ptree.parent().treeposition() + (ptree.parent_index(),)) ... assert ptree.root() is not ptree ... assert ptree.root() is not None ... assert ptree.root() is ptree.parent().root() ... assert ptree.root()[ptree.treeposition()] is ptree ... else: ... assert ptree.parent_index() is None ... assert ptree.left_sibling() is None ... assert ptree.right_sibling() is None ... assert ptree.root() is ptree ... assert ptree.treeposition() == () ... # Check ptree's children's methods: ... for i, child in enumerate(ptree): ... if isinstance(child, Tree): ... # pcheck parent() & parent_index() methods ... assert child.parent() is ptree ... assert child.parent_index() == i ... 
# pcheck sibling methods ... if i == 0: ... assert child.left_sibling() is None ... else: ... assert child.left_sibling() is ptree[i-1] ... if i == len(ptree)-1: ... assert child.right_sibling() is None ... else: ... assert child.right_sibling() is ptree[i+1] ... if print_ptrees: ... print('ok!', end=' ') ... for ptree in print_ptrees: print(ptree) ... else: ... print('ok!') Run our test function on a variety of newly-created trees: >>> pcheck(make_ptree('(A)')) ok! (A ) >>> pcheck(make_ptree('(A (B (C (D) (E f)) g) h)')) ok! (A (B (C (D ) (E f)) g) h) >>> pcheck(make_ptree('(A (B) (C c) (D d d) (E e e e))')) ok! (A (B ) (C c) (D d d) (E e e e)) >>> pcheck(make_ptree('(A (B) (C (c)) (D (d) (d)) (E (e) (e) (e)))')) ok! (A (B ) (C (c )) (D (d ) (d )) (E (e ) (e ) (e ))) Run our test function after performing various tree-modification operations: **__delitem__()** >>> ptree = make_ptree('(A (B (C (D) (E f) (Q p)) g) h)') >>> e = ptree[0,0,1] >>> del ptree[0,0,1]; pcheck(ptree); pcheck(e) ok! (A (B (C (D ) (Q p)) g) h) ok! (E f) >>> del ptree[0,0,0]; pcheck(ptree) ok! (A (B (C (Q p)) g) h) >>> del ptree[0,1]; pcheck(ptree) ok! (A (B (C (Q p))) h) >>> del ptree[-1]; pcheck(ptree) ok! (A (B (C (Q p)))) >>> del ptree[-100] Traceback (most recent call last): . . . IndexError: index out of range >>> del ptree[()] Traceback (most recent call last): . . . IndexError: The tree position () may not be deleted. >>> # With slices: >>> ptree = make_ptree('(A (B c) (D e) f g (H i) j (K l))') >>> b = ptree[0] >>> del ptree[0:0]; pcheck(ptree) ok! (A (B c) (D e) f g (H i) j (K l)) >>> del ptree[:1]; pcheck(ptree); pcheck(b) ok! (A (D e) f g (H i) j (K l)) ok! (B c) >>> del ptree[-2:]; pcheck(ptree) ok! (A (D e) f g (H i)) >>> del ptree[1:3]; pcheck(ptree) ok! (A (D e) (H i)) >>> ptree = make_ptree('(A (B c) (D e) f g (H i) j (K l))') >>> del ptree[5:1000]; pcheck(ptree) ok! (A (B c) (D e) f g (H i)) >>> del ptree[-2:1000]; pcheck(ptree) ok! (A (B c) (D e) f) >>> del ptree[-100:1]; pcheck(ptree) ok! (A (D e) f) >>> ptree = make_ptree('(A (B c) (D e) f g (H i) j (K l))') >>> del ptree[1:-2:2]; pcheck(ptree) ok! (A (B c) f (H i) j (K l)) **__setitem__()** >>> ptree = make_ptree('(A (B (C (D) (E f) (Q p)) g) h)') >>> d, e, q = ptree[0,0] >>> ptree[0,0,0] = 'x'; pcheck(ptree); pcheck(d) ok! (A (B (C x (E f) (Q p)) g) h) ok! (D ) >>> ptree[0,0,1] = make_ptree('(X (Y z))'); pcheck(ptree); pcheck(e) ok! (A (B (C x (X (Y z)) (Q p)) g) h) ok! (E f) >>> ptree[1] = d; pcheck(ptree) ok! (A (B (C x (X (Y z)) (Q p)) g) (D )) >>> ptree[-1] = 'x'; pcheck(ptree) ok! (A (B (C x (X (Y z)) (Q p)) g) x) >>> ptree[-100] = 'y' Traceback (most recent call last): . . . IndexError: index out of range >>> ptree[()] = make_ptree('(X y)') Traceback (most recent call last): . . . IndexError: The tree position () may not be assigned to. >>> # With slices: >>> ptree = make_ptree('(A (B c) (D e) f g (H i) j (K l))') >>> b = ptree[0] >>> ptree[0:0] = ('x', make_ptree('(Y)')); pcheck(ptree) ok! (A x (Y ) (B c) (D e) f g (H i) j (K l)) >>> ptree[2:6] = (); pcheck(ptree); pcheck(b) ok! (A x (Y ) (H i) j (K l)) ok! (B c) >>> ptree[-2:] = ('z', 'p'); pcheck(ptree) ok! (A x (Y ) (H i) z p) >>> ptree[1:3] = [make_ptree('(X)') for x in range(10)]; pcheck(ptree) ok! (A x (X ) (X ) (X ) (X ) (X ) (X ) (X ) (X ) (X ) (X ) z p) >>> ptree[5:1000] = []; pcheck(ptree) ok! (A x (X ) (X ) (X ) (X )) >>> ptree[-2:1000] = ['n']; pcheck(ptree) ok! (A x (X ) (X ) n) >>> ptree[-100:1] = [make_ptree('(U v)')]; pcheck(ptree) ok! 
(A (U v) (X ) (X ) n) >>> ptree[-1:] = (make_ptree('(X)') for x in range(3)); pcheck(ptree) ok! (A (U v) (X ) (X ) (X ) (X ) (X )) >>> ptree[1:-2:2] = ['x', 'y']; pcheck(ptree) ok! (A (U v) x (X ) y (X ) (X )) **append()** >>> ptree = make_ptree('(A (B (C (D) (E f) (Q p)) g) h)') >>> ptree.append('x'); pcheck(ptree) ok! (A (B (C (D ) (E f) (Q p)) g) h x) >>> ptree.append(make_ptree('(X (Y z))')); pcheck(ptree) ok! (A (B (C (D ) (E f) (Q p)) g) h x (X (Y z))) **extend()** >>> ptree = make_ptree('(A (B (C (D) (E f) (Q p)) g) h)') >>> ptree.extend(['x', 'y', make_ptree('(X (Y z))')]); pcheck(ptree) ok! (A (B (C (D ) (E f) (Q p)) g) h x y (X (Y z))) >>> ptree.extend([]); pcheck(ptree) ok! (A (B (C (D ) (E f) (Q p)) g) h x y (X (Y z))) >>> ptree.extend(make_ptree('(X)') for x in range(3)); pcheck(ptree) ok! (A (B (C (D ) (E f) (Q p)) g) h x y (X (Y z)) (X ) (X ) (X )) **insert()** >>> ptree = make_ptree('(A (B (C (D) (E f) (Q p)) g) h)') >>> ptree.insert(0, make_ptree('(X (Y z))')); pcheck(ptree) ok! (A (X (Y z)) (B (C (D ) (E f) (Q p)) g) h) >>> ptree.insert(-1, make_ptree('(X (Y z))')); pcheck(ptree) ok! (A (X (Y z)) (B (C (D ) (E f) (Q p)) g) (X (Y z)) h) >>> ptree.insert(-4, make_ptree('(X (Y z))')); pcheck(ptree) ok! (A (X (Y z)) (X (Y z)) (B (C (D ) (E f) (Q p)) g) (X (Y z)) h) >>> # Note: as with ``list``, inserting at a negative index that >>> # gives a position before the start of the list does *not* >>> # raise an IndexError exception; it just inserts at 0. >>> ptree.insert(-400, make_ptree('(X (Y z))')); pcheck(ptree) ok! (A (X (Y z)) (X (Y z)) (X (Y z)) (B (C (D ) (E f) (Q p)) g) (X (Y z)) h) **pop()** >>> ptree = make_ptree('(A (B (C (D) (E f) (Q p)) g) h)') >>> ptree[0,0].pop(1); pcheck(ptree) ParentedTree('E', ['f']) ok! (A (B (C (D ) (Q p)) g) h) >>> ptree[0].pop(-1); pcheck(ptree) 'g' ok! (A (B (C (D ) (Q p))) h) >>> ptree.pop(); pcheck(ptree) 'h' ok! (A (B (C (D ) (Q p)))) >>> ptree.pop(-100) Traceback (most recent call last): . . . IndexError: index out of range **remove()** >>> ptree = make_ptree('(A (B (C (D) (E f) (Q p)) g) h)') >>> e = ptree[0,0,1] >>> ptree[0,0].remove(ptree[0,0,1]); pcheck(ptree); pcheck(e) ok! (A (B (C (D ) (Q p)) g) h) ok! (E f) >>> ptree[0,0].remove(make_ptree('(Q p)')); pcheck(ptree) ok! (A (B (C (D )) g) h) >>> ptree[0,0].remove(make_ptree('(Q p)')) Traceback (most recent call last): . . . ValueError: ParentedTree('Q', ['p']) is not in list >>> ptree.remove('h'); pcheck(ptree) ok! (A (B (C (D )) g)) >>> ptree.remove('h'); Traceback (most recent call last): . . . ValueError: 'h' is not in list >>> # remove() removes the first subtree that is equal (==) to the >>> # given tree, which may not be the identical tree we give it: >>> ptree = make_ptree('(A (X x) (Y y) (X x))') >>> x1, y, x2 = ptree >>> ptree.remove(ptree[-1]); pcheck(ptree) ok! (A (Y y) (X x)) >>> print(x1.parent()); pcheck(x1) None ok! (X x) >>> print(x2.parent()) (A (Y y) (X x)) Test that a tree can not be given multiple parents: >>> ptree = make_ptree('(A (X x) (Y y) (Z z))') >>> ptree[0] = ptree[1] Traceback (most recent call last): . . . ValueError: Can not insert a subtree that already has a parent. >>> pcheck() ok! [more to be written] Shallow copying can be tricky for Tree and several of its subclasses. For shallow copies of Tree, only the root node is reconstructed, while all the children are shared between the two trees. Modify the children of one tree - and the shallowly copied tree will also update. 
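Since ``Tree`` is a subclass of the built-in ``list``, this follows the usual
Python distinction between shallow and deep copies.  As a minimal sketch of
the sharing, using the ``deep`` flag of ``Tree.copy()`` (the names ``child``
and ``original`` are just local illustration variables):

    >>> child = Tree('NP', ['dog'])
    >>> original = Tree('S', [child])
    >>> original.copy()[0] is child           # shallow copy: children are shared
    True
    >>> original.copy(deep=True)[0] is child  # deep copy: children are rebuilt
    False
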
>>> from nltk.tree import Tree, ParentedTree, MultiParentedTree >>> tree = Tree.fromstring("(TOP (S (NP (NNP Bell,)) (NP (NP (DT a) (NN company)) (SBAR (WHNP (WDT which)) (S (VP (VBZ is) (VP (VBN based) (PP (IN in) (NP (NNP LA,)))))))) (VP (VBZ makes) (CC and) (VBZ distributes) (NP (NN computer))) (. products.)))") >>> copy_tree = tree.copy(deep=False) >>> tree == copy_tree # Ensure identical labels and nodes True >>> id(copy_tree[0]) == id(tree[0]) # Ensure shallow copy - the children are the same objects in memory True For ParentedTree objects, this behaviour is not possible. With a shallow copy, the children of the root node would be reused for both the original and the shallow copy. For this to be possible, some children would need to have multiple parents. As this is forbidden for ParentedTree objects, attempting to make a shallow copy will cause a warning, and a deep copy is made instead. >>> ptree = ParentedTree.fromstring("(TOP (S (NP (NNP Bell,)) (NP (NP (DT a) (NN company)) (SBAR (WHNP (WDT which)) (S (VP (VBZ is) (VP (VBN based) (PP (IN in) (NP (NNP LA,)))))))) (VP (VBZ makes) (CC and) (VBZ distributes) (NP (NN computer))) (. products.)))") >>> copy_ptree = ptree.copy(deep=False) >>> copy_ptree == ptree # Ensure identical labels and nodes True >>> id(copy_ptree[0]) != id(ptree[0]) # Shallow copying isn't supported - it defaults to deep copy. True For MultiParentedTree objects, the issue of only allowing one parent that can be seen for ParentedTree objects is no more. Shallow copying a MultiParentedTree gives the children of the root node two parents: the original and the newly copied root. >>> mptree = MultiParentedTree.fromstring("(TOP (S (NP (NNP Bell,)) (NP (NP (DT a) (NN company)) (SBAR (WHNP (WDT which)) (S (VP (VBZ is) (VP (VBN based) (PP (IN in) (NP (NNP LA,)))))))) (VP (VBZ makes) (CC and) (VBZ distributes) (NP (NN computer))) (. products.)))") >>> len(mptree[0].parents()) 1 >>> copy_mptree = mptree.copy(deep=False) >>> copy_mptree == mptree # Ensure identical labels and nodes True >>> len(mptree[0].parents()) 2 >>> len(copy_mptree[0].parents()) 2 Shallow copying a MultiParentedTree is similar to creating a second root which is identically labeled as the root on which the copy method was called. ImmutableParentedTree Regression Tests -------------------------------------- >>> iptree = ImmutableParentedTree.convert(ptree) >>> type(iptree) >>> del iptree[0] Traceback (most recent call last): . . . ValueError: ImmutableParentedTree may not be modified >>> iptree.set_label('newnode') Traceback (most recent call last): . . . ValueError: ImmutableParentedTree may not be modified MultiParentedTree Regression Tests ---------------------------------- Keep track of all trees that we create (including subtrees) using this variable: >>> all_mptrees = [] Define a helper function to create new parented trees: >>> def make_mptree(s): ... mptree = MultiParentedTree.convert(Tree.fromstring(s)) ... all_mptrees.extend(t for t in mptree.subtrees() ... if isinstance(t, Tree)) ... return mptree Define a test function that examines every subtree in all_mptrees; and checks that all six of its methods are defined correctly. If any mptrees are passed as arguments, then they are printed. >>> def mpcheck(*print_mptrees): ... def has(seq, val): # uses identity comparison ... for item in seq: ... if item is val: return True ... return False ... for mptree in all_mptrees: ... # Check mptree's methods. ... if len(mptree.parents()) == 0: ... assert len(mptree.left_siblings()) == 0 ... 
assert len(mptree.right_siblings()) == 0 ... assert len(mptree.roots()) == 1 ... assert mptree.roots()[0] is mptree ... assert mptree.treepositions(mptree) == [()] ... left_siblings = right_siblings = () ... roots = {id(mptree): 1} ... else: ... roots = dict((id(r), 0) for r in mptree.roots()) ... left_siblings = mptree.left_siblings() ... right_siblings = mptree.right_siblings() ... for parent in mptree.parents(): ... for i in mptree.parent_indices(parent): ... assert parent[i] is mptree ... # check left siblings ... if i > 0: ... for j in range(len(left_siblings)): ... if left_siblings[j] is parent[i-1]: ... del left_siblings[j] ... break ... else: ... assert 0, 'sibling not found!' ... # check ight siblings ... if i < (len(parent)-1): ... for j in range(len(right_siblings)): ... if right_siblings[j] is parent[i+1]: ... del right_siblings[j] ... break ... else: ... assert 0, 'sibling not found!' ... # check roots ... for root in parent.roots(): ... assert id(root) in roots, 'missing root' ... roots[id(root)] += 1 ... # check that we don't have any unexplained values ... assert len(left_siblings)==0, 'unexpected sibling' ... assert len(right_siblings)==0, 'unexpected sibling' ... for v in roots.values(): assert v>0, roots #'unexpected root' ... # check treepositions ... for root in mptree.roots(): ... for treepos in mptree.treepositions(root): ... assert root[treepos] is mptree ... # Check mptree's children's methods: ... for i, child in enumerate(mptree): ... if isinstance(child, Tree): ... # mpcheck parent() & parent_index() methods ... assert has(child.parents(), mptree) ... assert i in child.parent_indices(mptree) ... # mpcheck sibling methods ... if i > 0: ... assert has(child.left_siblings(), mptree[i-1]) ... if i < len(mptree)-1: ... assert has(child.right_siblings(), mptree[i+1]) ... if print_mptrees: ... print('ok!', end=' ') ... for mptree in print_mptrees: print(mptree) ... else: ... print('ok!') Run our test function on a variety of newly-created trees: >>> mpcheck(make_mptree('(A)')) ok! (A ) >>> mpcheck(make_mptree('(A (B (C (D) (E f)) g) h)')) ok! (A (B (C (D ) (E f)) g) h) >>> mpcheck(make_mptree('(A (B) (C c) (D d d) (E e e e))')) ok! (A (B ) (C c) (D d d) (E e e e)) >>> mpcheck(make_mptree('(A (B) (C (c)) (D (d) (d)) (E (e) (e) (e)))')) ok! (A (B ) (C (c )) (D (d ) (d )) (E (e ) (e ) (e ))) >>> subtree = make_mptree('(A (B (C (D) (E f)) g) h)') Including some trees that contain multiple parents: >>> mpcheck(MultiParentedTree('Z', [subtree, subtree])) ok! (Z (A (B (C (D ) (E f)) g) h) (A (B (C (D ) (E f)) g) h)) Run our test function after performing various tree-modification operations (n.b., these are the same tests that we ran for `ParentedTree`, above; thus, none of these trees actually *uses* multiple parents.) **__delitem__()** >>> mptree = make_mptree('(A (B (C (D) (E f) (Q p)) g) h)') >>> e = mptree[0,0,1] >>> del mptree[0,0,1]; mpcheck(mptree); mpcheck(e) ok! (A (B (C (D ) (Q p)) g) h) ok! (E f) >>> del mptree[0,0,0]; mpcheck(mptree) ok! (A (B (C (Q p)) g) h) >>> del mptree[0,1]; mpcheck(mptree) ok! (A (B (C (Q p))) h) >>> del mptree[-1]; mpcheck(mptree) ok! (A (B (C (Q p)))) >>> del mptree[-100] Traceback (most recent call last): . . . IndexError: index out of range >>> del mptree[()] Traceback (most recent call last): . . . IndexError: The tree position () may not be deleted. >>> # With slices: >>> mptree = make_mptree('(A (B c) (D e) f g (H i) j (K l))') >>> b = mptree[0] >>> del mptree[0:0]; mpcheck(mptree) ok! 
(A (B c) (D e) f g (H i) j (K l)) >>> del mptree[:1]; mpcheck(mptree); mpcheck(b) ok! (A (D e) f g (H i) j (K l)) ok! (B c) >>> del mptree[-2:]; mpcheck(mptree) ok! (A (D e) f g (H i)) >>> del mptree[1:3]; mpcheck(mptree) ok! (A (D e) (H i)) >>> mptree = make_mptree('(A (B c) (D e) f g (H i) j (K l))') >>> del mptree[5:1000]; mpcheck(mptree) ok! (A (B c) (D e) f g (H i)) >>> del mptree[-2:1000]; mpcheck(mptree) ok! (A (B c) (D e) f) >>> del mptree[-100:1]; mpcheck(mptree) ok! (A (D e) f) >>> mptree = make_mptree('(A (B c) (D e) f g (H i) j (K l))') >>> del mptree[1:-2:2]; mpcheck(mptree) ok! (A (B c) f (H i) j (K l)) **__setitem__()** >>> mptree = make_mptree('(A (B (C (D) (E f) (Q p)) g) h)') >>> d, e, q = mptree[0,0] >>> mptree[0,0,0] = 'x'; mpcheck(mptree); mpcheck(d) ok! (A (B (C x (E f) (Q p)) g) h) ok! (D ) >>> mptree[0,0,1] = make_mptree('(X (Y z))'); mpcheck(mptree); mpcheck(e) ok! (A (B (C x (X (Y z)) (Q p)) g) h) ok! (E f) >>> mptree[1] = d; mpcheck(mptree) ok! (A (B (C x (X (Y z)) (Q p)) g) (D )) >>> mptree[-1] = 'x'; mpcheck(mptree) ok! (A (B (C x (X (Y z)) (Q p)) g) x) >>> mptree[-100] = 'y' Traceback (most recent call last): . . . IndexError: index out of range >>> mptree[()] = make_mptree('(X y)') Traceback (most recent call last): . . . IndexError: The tree position () may not be assigned to. >>> # With slices: >>> mptree = make_mptree('(A (B c) (D e) f g (H i) j (K l))') >>> b = mptree[0] >>> mptree[0:0] = ('x', make_mptree('(Y)')); mpcheck(mptree) ok! (A x (Y ) (B c) (D e) f g (H i) j (K l)) >>> mptree[2:6] = (); mpcheck(mptree); mpcheck(b) ok! (A x (Y ) (H i) j (K l)) ok! (B c) >>> mptree[-2:] = ('z', 'p'); mpcheck(mptree) ok! (A x (Y ) (H i) z p) >>> mptree[1:3] = [make_mptree('(X)') for x in range(10)]; mpcheck(mptree) ok! (A x (X ) (X ) (X ) (X ) (X ) (X ) (X ) (X ) (X ) (X ) z p) >>> mptree[5:1000] = []; mpcheck(mptree) ok! (A x (X ) (X ) (X ) (X )) >>> mptree[-2:1000] = ['n']; mpcheck(mptree) ok! (A x (X ) (X ) n) >>> mptree[-100:1] = [make_mptree('(U v)')]; mpcheck(mptree) ok! (A (U v) (X ) (X ) n) >>> mptree[-1:] = (make_mptree('(X)') for x in range(3)); mpcheck(mptree) ok! (A (U v) (X ) (X ) (X ) (X ) (X )) >>> mptree[1:-2:2] = ['x', 'y']; mpcheck(mptree) ok! (A (U v) x (X ) y (X ) (X )) **append()** >>> mptree = make_mptree('(A (B (C (D) (E f) (Q p)) g) h)') >>> mptree.append('x'); mpcheck(mptree) ok! (A (B (C (D ) (E f) (Q p)) g) h x) >>> mptree.append(make_mptree('(X (Y z))')); mpcheck(mptree) ok! (A (B (C (D ) (E f) (Q p)) g) h x (X (Y z))) **extend()** >>> mptree = make_mptree('(A (B (C (D) (E f) (Q p)) g) h)') >>> mptree.extend(['x', 'y', make_mptree('(X (Y z))')]); mpcheck(mptree) ok! (A (B (C (D ) (E f) (Q p)) g) h x y (X (Y z))) >>> mptree.extend([]); mpcheck(mptree) ok! (A (B (C (D ) (E f) (Q p)) g) h x y (X (Y z))) >>> mptree.extend(make_mptree('(X)') for x in range(3)); mpcheck(mptree) ok! (A (B (C (D ) (E f) (Q p)) g) h x y (X (Y z)) (X ) (X ) (X )) **insert()** >>> mptree = make_mptree('(A (B (C (D) (E f) (Q p)) g) h)') >>> mptree.insert(0, make_mptree('(X (Y z))')); mpcheck(mptree) ok! (A (X (Y z)) (B (C (D ) (E f) (Q p)) g) h) >>> mptree.insert(-1, make_mptree('(X (Y z))')); mpcheck(mptree) ok! (A (X (Y z)) (B (C (D ) (E f) (Q p)) g) (X (Y z)) h) >>> mptree.insert(-4, make_mptree('(X (Y z))')); mpcheck(mptree) ok! 
(A (X (Y z)) (X (Y z)) (B (C (D ) (E f) (Q p)) g) (X (Y z)) h) >>> # Note: as with ``list``, inserting at a negative index that >>> # gives a position before the start of the list does *not* >>> # raise an IndexError exception; it just inserts at 0. >>> mptree.insert(-400, make_mptree('(X (Y z))')); mpcheck(mptree) ok! (A (X (Y z)) (X (Y z)) (X (Y z)) (B (C (D ) (E f) (Q p)) g) (X (Y z)) h) **pop()** >>> mptree = make_mptree('(A (B (C (D) (E f) (Q p)) g) h)') >>> mptree[0,0].pop(1); mpcheck(mptree) MultiParentedTree('E', ['f']) ok! (A (B (C (D ) (Q p)) g) h) >>> mptree[0].pop(-1); mpcheck(mptree) 'g' ok! (A (B (C (D ) (Q p))) h) >>> mptree.pop(); mpcheck(mptree) 'h' ok! (A (B (C (D ) (Q p)))) >>> mptree.pop(-100) Traceback (most recent call last): . . . IndexError: index out of range **remove()** >>> mptree = make_mptree('(A (B (C (D) (E f) (Q p)) g) h)') >>> e = mptree[0,0,1] >>> mptree[0,0].remove(mptree[0,0,1]); mpcheck(mptree); mpcheck(e) ok! (A (B (C (D ) (Q p)) g) h) ok! (E f) >>> mptree[0,0].remove(make_mptree('(Q p)')); mpcheck(mptree) ok! (A (B (C (D )) g) h) >>> mptree[0,0].remove(make_mptree('(Q p)')) Traceback (most recent call last): . . . ValueError: MultiParentedTree('Q', ['p']) is not in list >>> mptree.remove('h'); mpcheck(mptree) ok! (A (B (C (D )) g)) >>> mptree.remove('h'); Traceback (most recent call last): . . . ValueError: 'h' is not in list >>> # remove() removes the first subtree that is equal (==) to the >>> # given tree, which may not be the identical tree we give it: >>> mptree = make_mptree('(A (X x) (Y y) (X x))') >>> x1, y, x2 = mptree >>> mptree.remove(mptree[-1]); mpcheck(mptree) ok! (A (Y y) (X x)) >>> print([str(p) for p in x1.parents()]) [] >>> print([str(p) for p in x2.parents()]) ['(A (Y y) (X x))'] ImmutableMultiParentedTree Regression Tests ------------------------------------------- >>> imptree = ImmutableMultiParentedTree.convert(mptree) >>> type(imptree) >>> del imptree[0] Traceback (most recent call last): . . . ValueError: ImmutableMultiParentedTree may not be modified >>> imptree.set_label('newnode') Traceback (most recent call last): . . . ValueError: ImmutableMultiParentedTree may not be modified ProbabilisticTree Regression Tests ---------------------------------- >>> prtree = ProbabilisticTree("S", [ProbabilisticTree("NP", ["N"], prob=0.3)], prob=0.6) >>> print(prtree) (S (NP N)) (p=0.6) >>> import copy >>> prtree == copy.deepcopy(prtree) == prtree.copy(deep=True) == prtree.copy() True >>> prtree[0] is prtree.copy()[0] True >>> prtree[0] is prtree.copy(deep=True)[0] False >>> imprtree = ImmutableProbabilisticTree.convert(prtree) >>> type(imprtree) >>> del imprtree[0] Traceback (most recent call last): . . . ValueError: ImmutableProbabilisticTree may not be modified >>> imprtree.set_label('newnode') Traceback (most recent call last): . . . ValueError: ImmutableProbabilisticTree may not be modified Squashed Bugs ============= This used to discard the ``(B b)`` subtree (fixed in svn 6270): >>> print(Tree.fromstring('((A a) (B b))')) ( (A a) (B b)) Pickling ParentedTree instances didn't work for Python 3.7 onwards (See #2478) >>> import pickle >>> tree = ParentedTree.fromstring('(S (NN x) (NP x) (NN x))') >>> print(tree) (S (NN x) (NP x) (NN x)) >>> pickled = pickle.dumps(tree) >>> tree_loaded = pickle.loads(pickled) >>> print(tree_loaded) (S (NN x) (NP x) (NN x)) ParentedTree used to be impossible to (deep)copy. 
(See #1324) >>> from nltk.tree import ParentedTree >>> import copy >>> tree = ParentedTree.fromstring("(TOP (S (NP (NNP Bell,)) (NP (NP (DT a) (NN company)) (SBAR (WHNP (WDT which)) (S (VP (VBZ is) (VP (VBN based) (PP (IN in) (NP (NNP LA,)))))))) (VP (VBZ makes) (CC and) (VBZ distributes) (NP (NN computer))) (. products.)))") >>> tree == copy.deepcopy(tree) == copy.copy(tree) == tree.copy(deep=True) == tree.copy() True nltk-3.7/nltk/test/treeprettyprinter.doctest000066400000000000000000000217571420073152400214760ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. For license information, see LICENSE.TXT ========================================================= Unit tests for nltk.tree.prettyprinter.TreePrettyPrinter ========================================================= >>> from nltk.tree import Tree, TreePrettyPrinter Tree nr 2170 from nltk.corpus.treebank: >>> tree = Tree.fromstring( ... '(S (NP-SBJ (PRP I)) (VP (VBP feel) (ADJP-PRD (RB pretty) ' ... '(JJ good)) (PP-CLR (IN about) (NP (PRP it)))) (. .))') >>> tpp = TreePrettyPrinter(tree) >>> print(tpp.text()) S __________________________|_____________________ | VP | | ____________________|___________ | | | | PP-CLR | | | | _____|_____ | NP-SBJ | ADJP-PRD | NP | | | _______|______ | | | PRP VBP RB JJ IN PRP . | | | | | | | I feel pretty good about it . >>> print(tpp.text(unicodelines=True)) S ┌──────────────────────────┼─────────────────────┐ │ VP │ │ ┌─────────────┬──────┴───────────┐ │ │ │ │ PP-CLR │ │ │ │ ┌─────┴─────┐ │ NP-SBJ │ ADJP-PRD │ NP │ │ │ ┌───────┴──────┐ │ │ │ PRP VBP RB JJ IN PRP . │ │ │ │ │ │ │ I feel pretty good about it . A tree with long labels: >>> tree = Tree.fromstring( ... '(sentence (plural-noun-phrase (plural-noun Superconductors)) ' ... '(verb-phrase (plural-verb conduct) ' ... '(noun-phrase (singular-noun electricity))))') >>> tpp = TreePrettyPrinter(tree) >>> print(tpp.text(abbreviate=8, nodedist=2)) sentence __________|__________ | verb-phr. | __________|__________ plural-n. | noun-phr. | | | plural-n. plural-v. singular. | | | Supercon. conduct electric. >>> print(tpp.text(maxwidth=8, nodedist=2)) sentence _________|________ | verb- | phrase | ________|_________ plural- | noun- noun- | phrase phrase | | | | | plural- plural- singular- noun verb noun | | | Supercon conduct electric ductors ity A discontinuous tree: >>> tree = Tree.fromstring( ... '(top (punct 8) (smain (noun 0) (verb 1) (inf (verb 5) (inf (verb 6) ' ... '(conj (inf (pp (prep 2) (np (det 3) (noun 4))) (verb 7)) (inf (verb 9)) ' ... '(vg 10) (inf (verb 11)))))) (punct 12))', read_leaf=int) >>> sentence = ('Ze had met haar moeder kunnen gaan winkelen ,' ... ' zwemmen of terrassen .'.split()) >>> tpp = TreePrettyPrinter(tree, sentence) >>> print(tpp.text()) top _____|______________________________________________ smain | | _______________________________|_____ | | | | inf | | | | _____|____ | | | | | inf | | | | | ____|_____ | | | | | | conj | | | | _____ | ___ | _________|______ | __________________ | | | inf | | | | | | | | | _________|_____ | ___ | _________ | | | | | | | pp | | | | | | | | | | ____|____ | | | | | | | | | | | np | | | | inf | inf | | | | ____|____ | | | | | | | | noun verb prep det noun verb verb verb punct verb vg verb punct | | | | | | | | | | | | | Ze had met haar moeder kunnen gaan winkelen , zwemmen of terrassen . 
>>> print(tpp.text(unicodelines=True)) top ┌─────┴──────────────────┬───────────────────────────┐ smain │ │ ┌────┬──────────────────────────┴─────┐ │ │ │ │ inf │ │ │ │ ┌─────┴────┐ │ │ │ │ │ inf │ │ │ │ │ ┌────┴─────┐ │ │ │ │ │ │ conj │ │ │ │ ┌───── │ ─── │ ─────────┴────── │ ─────┬─────┬──────┐ │ │ │ inf │ │ │ │ │ │ │ │ │ ┌─────────┴───── │ ─── │ ─────────┐ │ │ │ │ │ │ │ pp │ │ │ │ │ │ │ │ │ │ ┌────┴────┐ │ │ │ │ │ │ │ │ │ │ │ np │ │ │ │ inf │ inf │ │ │ │ ┌────┴────┐ │ │ │ │ │ │ │ │ noun verb prep det noun verb verb verb punct verb vg verb punct │ │ │ │ │ │ │ │ │ │ │ │ │ Ze had met haar moeder kunnen gaan winkelen , zwemmen of terrassen . Importing TreePrettyPrinter --------------------------- First of all, a simple tree will be constructed:: >>> from nltk.tree import Tree >>> tree = Tree.fromstring('(S (NP Mary) (VP walks))') We'll use this sample tree to show that the method of importing `TreePrettyPrinter` work correctly: - Recommended:: >>> from nltk.tree import TreePrettyPrinter >>> print(TreePrettyPrinter(tree).text()) S ____|____ NP VP | | Mary walks - Alternative but valid options:: >>> from nltk import TreePrettyPrinter >>> print(TreePrettyPrinter(tree).text()) S ____|____ NP VP | | Mary walks >>> from nltk.tree.prettyprinter import TreePrettyPrinter >>> print(TreePrettyPrinter(tree).text()) S ____|____ NP VP | | Mary walks - Deprecated, do not use:: >>> from nltk.treeprettyprinter import TreePrettyPrinter >>> print(TreePrettyPrinter(tree).text()) S ____|____ NP VP | | Mary walks This method will throw a DeprecationWarning:: Import `TreePrettyPrinter` using `from nltk.tree import TreePrettyPrinter` instead. nltk-3.7/nltk/test/treetransforms.doctest000066400000000000000000000113641420073152400207320ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. For license information, see LICENSE.TXT ------------------------------------------- Unit tests for the TreeTransformation class ------------------------------------------- >>> from copy import deepcopy >>> from nltk.tree import Tree, collapse_unary, chomsky_normal_form, un_chomsky_normal_form >>> tree_string = "(TOP (S (S (VP (VBN Turned) (ADVP (RB loose)) (PP (IN in) (NP (NP (NNP Shane) (NNP Longman) (POS 's)) (NN trading) (NN room))))) (, ,) (NP (DT the) (NN yuppie) (NNS dealers)) (VP (AUX do) (NP (NP (RB little)) (ADJP (RB right)))) (. .)))" >>> tree = Tree.fromstring(tree_string) >>> print(tree) (TOP (S (S (VP (VBN Turned) (ADVP (RB loose)) (PP (IN in) (NP (NP (NNP Shane) (NNP Longman) (POS 's)) (NN trading) (NN room))))) (, ,) (NP (DT the) (NN yuppie) (NNS dealers)) (VP (AUX do) (NP (NP (RB little)) (ADJP (RB right)))) (. .))) Make a copy of the original tree and collapse the subtrees with only one child >>> collapsedTree = deepcopy(tree) >>> collapse_unary(collapsedTree) >>> print(collapsedTree) (TOP (S (S+VP (VBN Turned) (ADVP (RB loose)) (PP (IN in) (NP (NP (NNP Shane) (NNP Longman) (POS 's)) (NN trading) (NN room)))) (, ,) (NP (DT the) (NN yuppie) (NNS dealers)) (VP (AUX do) (NP (NP (RB little)) (ADJP (RB right)))) (. .))) >>> collapsedTree2 = deepcopy(tree) >>> collapse_unary(collapsedTree2, collapsePOS=True, collapseRoot=True) >>> print(collapsedTree2) (TOP+S (S+VP (VBN Turned) (ADVP+RB loose) (PP (IN in) (NP (NP (NNP Shane) (NNP Longman) (POS 's)) (NN trading) (NN room)))) (, ,) (NP (DT the) (NN yuppie) (NNS dealers)) (VP (AUX do) (NP (NP+RB little) (ADJP+RB right))) (. .)) Convert the tree to Chomsky Normal Form i.e. each subtree has either two subtree children or a single leaf value. 
This conversion can be performed using either left- or right-factoring. >>> cnfTree = deepcopy(collapsedTree) >>> chomsky_normal_form(cnfTree, factor='left') >>> print(cnfTree) (TOP (S (S| (S| (S| (S+VP (S+VP| (VBN Turned) (ADVP (RB loose))) (PP (IN in) (NP (NP| (NP (NP| (NNP Shane) (NNP Longman)) (POS 's)) (NN trading)) (NN room)))) (, ,)) (NP (NP| (DT the) (NN yuppie)) (NNS dealers))) (VP (AUX do) (NP (NP (RB little)) (ADJP (RB right))))) (. .))) >>> cnfTree = deepcopy(collapsedTree) >>> chomsky_normal_form(cnfTree, factor='right') >>> print(cnfTree) (TOP (S (S+VP (VBN Turned) (S+VP| (ADVP (RB loose)) (PP (IN in) (NP (NP (NNP Shane) (NP| (NNP Longman) (POS 's))) (NP| (NN trading) (NN room)))))) (S|<,-NP-VP-.> (, ,) (S| (NP (DT the) (NP| (NN yuppie) (NNS dealers))) (S| (VP (AUX do) (NP (NP (RB little)) (ADJP (RB right)))) (. .)))))) Employ some Markov smoothing to make the artificial node labels a bit more readable. See the treetransforms.py documentation for more details. >>> markovTree = deepcopy(collapsedTree) >>> chomsky_normal_form(markovTree, horzMarkov=2, vertMarkov=1) >>> print(markovTree) (TOP (S^ (S+VP^ (VBN Turned) (S+VP|^ (ADVP^ (RB loose)) (PP^ (IN in) (NP^ (NP^ (NNP Shane) (NP|^ (NNP Longman) (POS 's))) (NP|^ (NN trading) (NN room)))))) (S|<,-NP>^ (, ,) (S|^ (NP^ (DT the) (NP|^ (NN yuppie) (NNS dealers))) (S|^ (VP^ (AUX do) (NP^ (NP^ (RB little)) (ADJP^ (RB right)))) (. .)))))) Convert the transformed tree back to its original form >>> un_chomsky_normal_form(markovTree) >>> tree == markovTree True nltk-3.7/nltk/test/twitter.ipynb000066400000000000000000001544701420073152400170400ustar00rootroot00000000000000{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Twitter HOWTO" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Overview\n", "\n", "This document is an overview of how to use NLTK to collect and process Twitter data. It was written as an IPython notebook, and if you have IPython installed, you can download [the source of the notebook](https://raw.githubusercontent.com/nltk/nltk/develop/nltk/test/twitter.ipynb) from the NLTK GitHub repository and run the notebook in interactive mode.\n", "\n", "Most of the tasks that you might want to carry out with 'live' Twitter data require you to authenticate your request by registering for API keys. This is usually a once-only step. When you have registered your API keys, you can store them in a file on your computer, and then use them whenever you want. We explain what's involved in the section [First Steps](#first_steps).\n", "\n", "If you have already obtained Twitter API keys as part of some earlier project, [storing your keys](#store_keys) explains how to save them to a file that NLTK will be able to find. Alternatively, if you just want to play around with the Twitter data that is distributed as part of NLTK, head over to the section on using the [`twitter-samples` corpus reader](#corpus_reader).\n", "\n", "Once you have got authentication sorted out, we'll show you [how to use NLTK's `Twitter` class](#simple). This is made as simple as possible, but deliberately limits what you can do. \n", "\n", "## First Steps\n", "\n", "As mentioned above, in order to collect data from Twitter, you first need to register a new *application* — this is Twitter's way of referring to any computer program that interacts with the Twitter API. As long as you save your registration information correctly, you should only need to do this once, since the information should work for any NLTK code that you write. 
You will need to have a Twitter account before you can register. Twitter also insists that [you add a mobile phone number to your Twitter profile](https://support.twitter.com/articles/110250-adding-your-mobile-number-to-your-account-via-web) before you will be allowed to register an application.\n", "\n", "These are the steps you need to carry out.\n", "\n", "### Getting your API keys from Twitter\n", "\n", "1. Sign in to your Twitter account at https://apps.twitter.com. You should then get sent to a screen that looks something like this:\n", "\n", "Clicking on the **Create New App** button should take you to the following screen:\n", "\n", "The information that you provide for **Name**, **Description** and **Website** can be anything you like.\n", "\n", "2. Make sure that you select **Read and Write** access for your application (as specified on the *Permissions* tab of Twitter's Application Management screen):\n", "\n", "\n", "3. Go to the tab labeled **Keys and Access Tokens**. It should look something like this, but with actual keys rather than a string of Xs:\n", "\n", "As you can see, this will give you four distinct keys: consumer key, consumer key secret, access token and access token secret.\n", "\n", "### Storing your keys\n", "\n", "1. Create a folder named `twitter-files` in your home directory. Within this folder, use a text editor to create a new file called `credentials.txt`. Make sure that this file is just a plain text file. In it, you should create which you should store in a text file with the following structure:\n", "```\n", "app_key=YOUR CONSUMER KEY \n", "app_secret=YOUR CONSUMER SECRET \n", "oauth_token=YOUR ACCESS TOKEN \n", "oauth_token_secret=YOUR ACCESS TOKEN SECRET\n", "```\n", "Type the part up to and includinge the '=' symbol exactly as shown. The values on the right-hand side of the '=' — that is, everything in caps — should be cut-and-pasted from the relevant API key information shown on the Twitter **Keys and Access Tokens**. Save the file and that's it.\n", "\n", "2. It's going to be important for NLTK programs to know where you have stored your\n", " credentials. We'll assume that this folder is called `twitter-files`, but you can call it anything you like. We will also assume that this folder is where you save any files containing tweets that you collect. Once you have decided on the name and location of this \n", " folder, you will need to set the `TWITTER` environment variable to this value. \n", "\n", " On a Unix-like system (including MacOS), you will set the variable something like this:\n", " ```bash\n", " export TWITTER=\"/path/to/your/twitter-files\"\n", " ```\n", " Rather than having to give this command each time you start a new session, it's advisable to add it to your shell's configuration file, e.g. to `.bashrc`.\n", "\n", " On a Windows machine, right click on “My Computer” then select `Properties > Advanced > Environment Variables > User Variables > New...` \n", "\n", " One important thing to remember is that you need to keep your `credentials.txt` file private. So do **not** share your `twitter-files` folder with anyone else, and do **not** upload it to a public repository such as GitHub.\n", "\n", "3. Finally, read through Twitter's [Developer Rules of the Road](https://dev.twitter.com/overview/terms/policy). As far as these rules are concerned, you count as both the application developer and the user." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Install Twython\n", "\n", "The NLTK Twitter package relies on a third party library called [Twython](https://twython.readthedocs.org/). Install Twython via [pip](https://pip.pypa.io):\n", "```bash\n", "$ pip install twython\n", "```\n", "\n", "or with [easy_install](https://pythonhosted.org/setuptools/easy_install.html):\n", "\n", "```bash\n", "$ easy_install twython\n", "```\n", "We're now ready to get started. The next section will describe how to use the `Twitter` class to talk to the Twitter API." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "*More detail*:\n", "Twitter offers are two main authentication options. OAuth 1 is for user-authenticated API calls, and allows sending status updates, direct messages, etc, whereas OAuth 2 is for application-authenticated calls, where read-only access is sufficient. Although OAuth 2 sounds more appropriate for the kind of tasks envisaged within NLTK, it turns out that access to Twitter's Streaming API requires OAuth 1, which is why it's necessary to obtain *Read and Write* access for your application." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Using the simple `Twitter` class\n", "\n", "### Dipping into the Public Stream\n", "\n", "The `Twitter` class is intended as a simple means of interacting with the Twitter data stream. Later on, we'll look at other methods which give more fine-grained control. \n", "\n", "The Twitter live public stream is a sample (approximately 1%) of all Tweets that are currently being published by users. They can be on any topic and in any language. In your request, you can give keywords which will narrow down the Tweets that get delivered to you. Our first example looks for Tweets which include either the word *love* or *hate*. We limit the call to finding 10 tweets. When you run this code, it will definitely produce different results from those shown below!" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Sana magkakaisa na ang mga Kapamilya at Kapuso. Spread love, not hate\n", " #ShowtimeKapamiIyaDay #ALDubEBforLOVE\n", "@Real_Liam_Payne Please follow me , you mean the world to me and words can't describe how much i love you x3186\n", "Love my ugly wife\n", "RT @ansaberano: We Found Love\n", "#PushAwardsLizQuen\n", "RT @yungunmei: people want to fall in love but don't understand the concept\n", "I don't care, I love It #EMABiggestFans1D\n", "RT @bryan_white: I'm not in the Philippines Yet but we are making a very BIG announcement in 2 days! Get ready! Love you! #GGMY #ALDubEBfor…\n", "I whole heartedly HATE @lakiamichelle like really HATE her 😩 who wants to be her friend because I DONT\n", "RT @lahrose23: I love yu to https://t.co/dfsRwSp1IC\n", "RT @alone_in_woods: ahoj, já jsem tvůj pes a tohle je náš love song /// Zrní - Já jsem tvůj pes https://t.co/7L0XPHeA2d via @YouTube\n", "Written 10 Tweets\n" ] } ], "source": [ "from nltk.twitter import Twitter\n", "tw = Twitter()\n", "tw.tweets(keywords='love, hate', limit=10) #sample from the public stream" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The next example filters the live public stream by looking for specific user accounts. In this case, we 'follow' two news organisations, namely `@CNN` and `@BBCNews`. 
[As advised by Twitter](https://dev.twitter.com/streaming/reference/post/statuses/filter), we use *numeric userIDs* for these accounts. If you run this code yourself, you'll see that Tweets are arriving much more slowly than in the previous example. This is because even big new organisations don't publish Tweets that often.\n", "\n", "A bit later we will show you how to use Python to convert usernames such as `@CNN` to userIDs such as `759251`, but for now you might find it simpler to use a web service like [TweeterID](https://tweeterid.com) if you want to experiment with following different accounts than the ones shown below." ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "RT @CNN: Sunday's #supermoon #eclipse has some excited, but for others, it's an ominous \"blood moon.\" http://t.co/2B1wdQru0q http://t.co/Aw…\n", "RT @CNN: Sunday's #supermoon #eclipse has some excited, but for others, it's an ominous \"blood moon.\" http://t.co/2B1wdQru0q http://t.co/Aw…\n", "RT @CNN: Sunday's #supermoon #eclipse has some excited, but for others, it's an ominous \"blood moon.\" http://t.co/2B1wdQru0q http://t.co/Aw…\n", "RT @CNN: Sunday's #supermoon #eclipse has some excited, but for others, it's an ominous \"blood moon.\" http://t.co/2B1wdQru0q http://t.co/Aw…\n", "RT @CNN: Sunday's #supermoon #eclipse has some excited, but for others, it's an ominous \"blood moon.\" http://t.co/2B1wdQru0q http://t.co/Aw…\n", "RT @CNN: Sunday's #supermoon #eclipse has some excited, but for others, it's an ominous \"blood moon.\" http://t.co/2B1wdQru0q http://t.co/Aw…\n", "RT @CNN: Judge grants petition allowing @Caitlyn_Jenner to officially change her name and gender. http://t.co/HpCbAQ64Mk http://t.co/BPaKy2…\n", "RT @CNN: Sunday's #supermoon #eclipse has some excited, but for others, it's an ominous \"blood moon.\" http://t.co/2B1wdQru0q http://t.co/Aw…\n", "RT @CNN: Sunday's #supermoon #eclipse has some excited, but for others, it's an ominous \"blood moon.\" http://t.co/2B1wdQru0q http://t.co/Aw…\n", "RT @CNN: Sunday's #supermoon #eclipse has some excited, but for others, it's an ominous \"blood moon.\" http://t.co/2B1wdQru0q http://t.co/Aw…\n", "Written 10 Tweets\n" ] } ], "source": [ "tw = Twitter()\n", "tw.tweets(follow=['759251', '612473'], limit=10) # see what CNN and BBC are talking about" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Saving Tweets to a File\n", "\n", "By default, the `Twitter` class will just print out Tweets to your computer terminal. Although it's fun to view the Twitter stream zipping by on your screen, you'll probably want to save some tweets in a file. We can tell the `tweets()` method to save to a file by setting the flag `to_screen` to `False`. \n", "\n", "The `Twitter` class will look at the value of your environmental variable `TWITTER` to determine which folder to use to save the tweets, and it will put them in a date-stamped file with the prefix `tweets`. " ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Writing to /Users/ewan/twitter-files/tweets.20150926-154251.json\n", "Written 25 Tweets\n" ] } ], "source": [ "tw = Twitter()\n", "tw.tweets(to_screen=False, limit=25)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "So far, we've been taking data from the live public stream. 
However, it's also possible to retrieve past tweets, for example by searching for specific keywords, and setting `stream=False`:" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "“Girls” Creator Lena Dunham Interviews Hilary Clinton About… Lenny Kravitz’s Junk [Video] http://t.co/eY4GgKS3ak\n", "“Girls” Creator Lena Dunham Interviews Hilary Clinton About… Lenny Kravitz’s Junk [Video] http://t.co/Pflf7A6Tr6\n", "“Girls” Creator Lena Dunham Interviews Hilary Clinton About… Lenny Kravitz’s Junk [Video] http://t.co/mibYfNISBT http://t.co/9ElX70F4St\n", "Photo: “Girls” Creator Lena Dunham Interviews Hilary Clinton About… Lenny Kravitz’s Junk [Video]: Hillary... http://t.co/qIiWGk1jbM\n", "lena dunham and hilary clinton talking about feminism... l o l theyre the two most hypocritical and clueless about what feminism actually is\n", "“Girls” Creator Lena Dunham Interviews Hilary Clinton About… Lenny Kravitz’s Junk [Video]: \n", "Hillary Clinton An... http://t.co/31shf6VeEu\n", "“Girls” Creator Lena Dunham Interviews Hilary Clinton About… Lenny Kravitz’s Junk [Video]: \n", "Hillary Clinton An... http://t.co/uvft4LDS0t\n", "“Girls” Creator Lena Dunham Interviews Hilary Clinton About… Lenny Kravitz’s Junk [Video]: \n", "Hillary Clinton An... http://t.co/uEbc25V3E3\n", "“Girls” Creator Lena Dunham Interviews Hilary Clinton About… Lenny Kravitz’s Junk [Video]: \n", "Hillary Cl... http://t.co/RNgziN9eWA #bossip\n", "“Girls” Creator Lena Dunham Interviews Hilary Clinton About… Lenny Kravitz’s Junk [Video]: \n", "Hillary Clinton An... http://t.co/gkB5aLEJJP\n", "Written 10 Tweets\n" ] } ], "source": [ "tw.tweets(keywords='hilary clinton', stream=False, limit=10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Onwards and Upwards\n", "\n", "In this section, we'll look at how to get more fine-grained control over processing Tweets. To start off, we will import a bunch of stuff from the `twitter` package." ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from nltk.twitter import Query, Streamer, Twitter, TweetViewer, TweetWriter, credsfromfile" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In the following example, you'll see the line\n", "``` python\n", "oauth = credsfromfile()\n", "```\n", "This gets hold of your stored API key information. The function `credsfromfile()` by default looks for a file called `credentials.txt` in the directory set by the environment variable `TWITTER`, reads the contents and returns the result as a dictionary. We then pass this dictionary as an argument when initializing our client code. We'll be using two classes to wrap the clients: `Streamer` and `Query`; the first of these calls [the Streaming API](https://dev.twitter.com/streaming/overview) and the second calls Twitter's [Search API](https://dev.twitter.com/rest/public) (also called the REST API). 
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "*More detail*: For more detail, see this blog post on [The difference between the Twitter Firehose API, the Twitter Search API, and the Twitter Streaming API](http://www.brightplanet.com/2013/06/twitter-firehose-vs-twitter-api-whats-the-difference-and-why-should-you-care/)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "After initializing a client, we call the `register()` method to specify whether we want to view the data on a terminal or write it to a file. Finally, we call a method which determines the API endpoint to address; in this case, we use `sample()` to get a random sample from the the Streaming API." ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "RT @EPVLatino: ¿Y entonces? El gobierno sigue importando carros mientras las plantas Chery los tiene acumulados http://t.co/bBhrawqHe7\n", "RT @AbrahamMateoMus: . @menudanochetv aquí se suman nuestros Abrahamers MEXICAN@S!! 👏 \n", "#MenudaNocheConAM 😉 http://t.co/8DMw31wZ5i\n", "RT @Joeyclipstar: ** FRESH ** Bow Wow Signs to Bad Boy Records - The Breakfast Club http://t.co/3w58p6Sbx2 RT http://t.co/LbQU2brfpf\n", "#شاركونا\n", "اشي مستحيل تكمل يومكك بدونه ... ؟ 🌚\n", "#Manal\n", "RT @techjunkiejh: MEAN Stack Tutorial #mongodb #ExpressJS #angularjs #nodejs #javascript http://t.co/4gTFsj2dtP http://t.co/a86hmb4mRx\n", "Only @MariamDiamond would reply to a spider on twitter 😂😂\n", "RT @CJLeBlanc: @SeanCarrigan greets the new day..full spirit, verve and no small amount of vodka! GO TEAM #YR! … http://t.co/bQIglZVDxR\n", "んぐぅおぉ、はらみーライブ楽しかったようで何より。行きたかったンゴ〜\n", "RT @NicoleRaine8: @maine_richards @MaLuisaMiranda1 count me in ngkakape nyahaha #ALDubEBforLOVE\n", "RT @RadioDelPlata: [AHORA] \"Me amputaron los 4 miembros\" Perla Pascarelli sobre #Malapraxis a #MónicayCésar http://t.co/StUhpxDeM3\n", "Written 10 Tweets\n" ] } ], "source": [ "oauth = credsfromfile()\n", "client = Streamer(**oauth)\n", "client.register(TweetViewer(limit=10))\n", "client.sample()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The next example is similar, except that we call the `filter()` method with the `track` parameter followed by a string literal. The string is interpreted as a list of search terms where [comma indicates a logical OR](https://dev.twitter.com/streaming/overview/request-parameters#track). The terms are treated as case-insensitive." ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "European countries at heart of refugee crisis seek to ease tensions: Hungary announces removal of razor wire f... http://t.co/PavCKddtY2\n", "RT @onlinewweman: Germany told students to wear \"modest clothing\" bc they don't want the refugees to have \"misunderstandings.\" That's a wei…\n", "RT @El_consciente: El cuento ha cambiado. A pinocho le crecía la nariz si mentía. A los políticos europeos sus fortunas. Made in Germany ht…\n", "VIDEO=> Finns Attack “Refugee” Bus with Rocks and Fireworks – Refugees Turn Back to Sweden https://t.co/94KqhyCNjJ http://t.co/e3kmeGjRFn\n", "RT @El_consciente: Merkel al volante de Europa. Fabricación en cadena de productos fraudulentos. Made in Germany http://t.co/SJ5BYQ7lIu htt…\n", "European countries at heart of refugee crisis seek to ease tensions: Hungary announces rem... 
http://t.co/5BmOYNK3Kj (via @EricBarbosa11\n", "@SirCorgis @matty_is @RT_com but will Poland blame the ppl actually causing the refugee crisis? Cause and effect is a bitch innit?\n", "RT @El_consciente: Merkel al volante de Europa. Fabricación en cadena de productos fraudulentos. Made in Germany http://t.co/SJ5BYQ7lIu htt…\n", "♥ https://t.co/CyoWdON0li\n", "RT @mjesusgz: Castle Germany http://t.co/scs5dJE1Gk\n", "Written 10 Tweets\n" ] } ], "source": [ "client = Streamer(**oauth)\n", "client.register(TweetViewer(limit=10))\n", "client.filter(track='refugee, germany')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Whereas the Streaming API lets us access near real-time Twitter data, the Search API lets us query for past Tweets. In the following example, the value `tweets` returned by `search_tweets()` is a generator; the expression `next(tweets)` gives us the first Tweet from the generator. \n", "\n", "Although Twitter delivers Tweets as [JSON](http://www.json.org) objects, the Python client encodes them as dictionaries, and the example pretty-prints a portion of the dictionary corresponding the Tweet in question." ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'contributors': None,\n", " 'coordinates': None,\n", " 'created_at': 'Sat Sep 26 14:25:12 +0000 2015',\n", " 'entities': {...},\n", " 'favorite_count': 0,\n", " 'favorited': False,\n", " 'geo': None,\n", " 'id': 647778955005665280,\n", " 'id_str': '647778955005665280',\n", " 'in_reply_to_screen_name': None,\n", " 'in_reply_to_status_id': None,\n", " 'in_reply_to_status_id_str': None,\n", " 'in_reply_to_user_id': None,\n", " 'in_reply_to_user_id_str': None,\n", " 'is_quote_status': False,\n", " 'lang': 'en',\n", " 'metadata': {...},\n", " 'place': None,\n", " 'possibly_sensitive': False,\n", " 'retweet_count': 0,\n", " 'retweeted': False,\n", " 'source': 'TechWars',\n", " 'text': 'We compared #gate vs #nltk - see results: http://t.co/jvQ4Ph85L1',\n", " 'truncated': False,\n", " 'user': {...}}\n" ] } ], "source": [ "client = Query(**oauth)\n", "tweets = client.search_tweets(keywords='nltk', limit=10)\n", "tweet = next(tweets)\n", "from pprint import pprint\n", "pprint(tweet, depth=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Twitter's own documentation [provides a useful overview of all the fields in the JSON object](https://dev.twitter.com/overview/api/tweets) and it may be helpful to look at this [visual map of a Tweet object](http://www.scribd.com/doc/30146338/map-of-a-tweet).\n", "\n", "Since each Tweet is converted into a Python dictionary, it's straightforward to just show a selected field, such as the value of the `'text'` key." 
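Nested fields work the same way, since values such as `'user'` are themselves dictionaries. A small aside before the next cell, assuming `tweet` is the dictionary obtained above:

```python
# The 'user' value is a nested dictionary of profile fields
user = tweet['user']
print(user['screen_name'], user['followers_count'])
print(tweet['created_at'], tweet['text'])
```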
] }, { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Slammer an immigration lawyer seattle wa protection if purusha this morning polaric deportation?: Nltk\n", "Python Text Processing with NLTK 2.0 Cookbook / Jacob Perkins\n", "http://t.co/0gUjlTWA7G\n", "\n", "49\n", "RT @tjowens: DHbox http://t.co/skIzU3Nm6C \"Ready-to-go configurations of Omeka, NLTK, IPython, R Studio, and Mallet\" #odh2015 http://t.co/6…\n", "RT @tjowens: DHbox http://t.co/skIzU3Nm6C \"Ready-to-go configurations of Omeka, NLTK, IPython, R Studio, and Mallet\" #odh2015 http://t.co/6…\n", "RT @tjowens: DHbox http://t.co/skIzU3Nm6C \"Ready-to-go configurations of Omeka, NLTK, IPython, R Studio, and Mallet\" #odh2015 http://t.co/6…\n", "RT @tjowens: DHbox http://t.co/skIzU3Nm6C \"Ready-to-go configurations of Omeka, NLTK, IPython, R Studio, and Mallet\" #odh2015 http://t.co/6…\n", "RT @ideaofhappiness: Interesting! @DH_Box is a Docker container for digital humanities computational work, pre-equipped with IPython, RStud…\n", "RT @ideaofhappiness: Interesting! @DH_Box is a Docker container for digital humanities computational work, pre-equipped with IPython, RStud…\n", "RT @dimazest: Stanford dependency parser support is merged into @NLTK_org https://t.co/aN6b1lFGPf\n" ] } ], "source": [ "for tweet in tweets:\n", " print(tweet['text'])" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Writing to /Users/ewan/twitter-files/tweets.20150926-154337.json\n" ] } ], "source": [ "client = Query(**oauth)\n", "client.register(TweetWriter())\n", "client.user_tweets('timoreilly', 10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Given a list of user IDs, the following example shows how to retrieve the screen name and other information about the users." ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CNN, followers: 19806095, following: 1102\n", "BBCNews, followers: 4935491, following: 105\n", "ReutersLive, followers: 307337, following: 55\n", "BreakingNews, followers: 7949242, following: 541\n", "AJELive, followers: 1117, following: 19\n" ] } ], "source": [ "userids = ['759251', '612473', '15108702', '6017542', '2673523800']\n", "client = Query(**oauth)\n", "user_info = client.user_info_from_id(userids)\n", "for info in user_info:\n", " name = info['screen_name']\n", " followers = info['followers_count']\n", " following = info['friends_count']\n", " print(\"{}, followers: {}, following: {}\".format(name, followers, following))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "A list of user IDs can also be used as input to the Streaming API client." ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "RT @CNN: Sunday's #supermoon #eclipse has some excited, but for others, it's an ominous \"blood moon.\" http://t.co/2B1wdQru0q http://t.co/Aw…\n", "RT @bbcweather: Cameras at the ready for #supermoon #eclipse on 27/28th Sept, next one won't be until 2033! 
http://t.co/SPucnmBqaD http://t…\n", "RT @BreakingNews: Alleged Libya-Europe people smuggler killed in shootout, Libya officials say Italy behind asssassination - @guardian http…\n", "RT @CNN: Sunday's #supermoon #eclipse has some excited, but for others, it's an ominous \"blood moon.\" http://t.co/2B1wdQru0q http://t.co/Aw…\n", "RT @CNN: Sunday's #supermoon #eclipse has some excited, but for others, it's an ominous \"blood moon.\" http://t.co/2B1wdQru0q http://t.co/Aw…\n", "@CNN white water, Monica, emails, Benghazi. A family/foundation of lies and crime. Indict Hillary for breaking laws\n", "RT @CNN: Bill Clinton on email scrutiny: 'I've never seen so much expended on so little.'\n", "http://t.co/XkLP0IHeOG\n", "RT @CNN: Sunday's #supermoon #eclipse has some excited, but for others, it's an ominous \"blood moon.\" http://t.co/2B1wdQru0q http://t.co/Aw…\n", "RT @CNN: Sunday's #supermoon #eclipse has some excited, but for others, it's an ominous \"blood moon.\" http://t.co/2B1wdQru0q http://t.co/Aw…\n", "RT @BreakingNews: Alleged Libya-Europe people smuggler killed in shootout, Libya officials say Italy behind asssassination - @guardian http…\n", "Written 10 Tweets\n" ] } ], "source": [ "client = Streamer(**oauth)\n", "client.register(TweetViewer(limit=10))\n", "client.statuses.filter(follow=userids)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To store the data that Twitter sends via the Streaming API, we register a `TweetWriter` instance." ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Writing to /Users/ewan/twitter-files/tweets.20150926-154408.json\n", "Written 10 Tweets\n" ] } ], "source": [ "client = Streamer(**oauth)\n", "client.register(TweetWriter(limit=10))\n", "client.statuses.sample()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here's the full signature of the `TweetWriter`'s `__init__()` method:\n", "```python\n", "def __init__(self, limit=2000, upper_date_limit=None, lower_date_limit=None,\n", " fprefix='tweets', subdir='twitter-files', repeat=False,\n", " gzip_compress=False): \n", "```\n", "If the `repeat` parameter is set to `True`, the writer will write up to `limit` Tweets to a file `file1`, then open a new file `file2` and write to it until the limit is reached again, and so on indefinitely. The parameter `gzip_compress` can be used to compress the files once they have been written." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Using a Tweet Corpus\n", "\n", "NLTK's Twitter corpus currently contains a sample of 20k Tweets (named '`twitter_samples`')\n", "retrieved from the Twitter Streaming API, together with another 10k which are divided according to sentiment into negative and positive." ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "['negative_tweets.json', 'positive_tweets.json', 'tweets.20150430-223406.json']" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from nltk.corpus import twitter_samples\n", "twitter_samples.fileids()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We follow standard practice in storing full Tweets as line-separated\n", "JSON. These data structures can be accessed via the `docs()` method. However, in general it\n", "is more practical to focus just on the text field of the Tweets, which\n", "can be accessed via the `strings()` method." 
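For completeness, here is a minimal sketch of `docs()`, which should return each Tweet as a full Python dictionary rather than just its text (the `strings()` demonstration follows below):

```python
from nltk.corpus import twitter_samples

# Full Tweet objects from the corpus; the result behaves like a list of dicts
docs = twitter_samples.docs('tweets.20150430-223406.json')
first = docs[0]
print(first['created_at'])
print(first['text'])
```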
] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "RT @KirkKus: Indirect cost of the UK being in the EU is estimated to be costing Britain £170 billion per year! #BetterOffOut #UKIP\n", "VIDEO: Sturgeon on post-election deals http://t.co/BTJwrpbmOY\n", "RT @LabourEoin: The economy was growing 3 times faster on the day David Cameron became Prime Minister than it is today.. #BBCqt http://t.co…\n", "RT @GregLauder: the UKIP east lothian candidate looks about 16 and still has an msn addy http://t.co/7eIU0c5Fm1\n", "RT @thesundaypeople: UKIP's housing spokesman rakes in £800k in housing benefit from migrants. http://t.co/GVwb9Rcb4w http://t.co/c1AZxcLh…\n", "RT @Nigel_Farage: Make sure you tune in to #AskNigelFarage tonight on BBC 1 at 22:50! #UKIP http://t.co/ogHSc2Rsr2\n", "RT @joannetallis: Ed Milliband is an embarrassment. Would you want him representing the UK?! #bbcqt vote @Conservatives\n", "RT @abstex: The FT is backing the Tories. On an unrelated note, here's a photo of FT leader writer Jonathan Ford (next to Boris) http://t.c…\n", "RT @NivenJ1: “@George_Osborne: Ed Miliband proved tonight why he's not up to the job” Tbf you've spent 5 years doing that you salivating do…\n", "LOLZ to Trickle Down Wealth. It's never trickling past their own wallets. Greed always wins $$$ for the greedy. https://t.co/X7deoPbS97\n", "SNP leader faces audience questions http://t.co/TYClKltSpW\n", "RT @cononeilluk: Cameron \"Ed Milliband hanging out with Russell Brand. He is a joke. This is an election. This is about real people' http:/…\n", "RT @politicshome: Ed Miliband: Last Labour government did not overspend http://t.co/W9RJ2aSH6o http://t.co/4myFekg5ex\n", "If Miliband is refusing to do any deal with the SNP, how does he plan on forming a government?\n", "RT @scotnotbritt: Well thats it. LABOUR would rather have a TORY government rather than work with the SNP. http://t.co/SNMkRDCe9f\n" ] } ], "source": [ "strings = twitter_samples.strings('tweets.20150430-223406.json')\n", "for string in strings[:15]:\n", " print(string)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The default tokenizer for Tweets (`casual.py`) is specialised for 'casual' text, and\n", "the `tokenized()` method returns a list of lists of tokens." 
] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['RT', '@KirkKus', ':', 'Indirect', 'cost', 'of', 'the', 'UK', 'being', 'in', 'the', 'EU', 'is', 'estimated', 'to', 'be', 'costing', 'Britain', '£', '170', 'billion', 'per', 'year', '!', '#BetterOffOut', '#UKIP']\n", "['VIDEO', ':', 'Sturgeon', 'on', 'post-election', 'deals', 'http://t.co/BTJwrpbmOY']\n", "['RT', '@LabourEoin', ':', 'The', 'economy', 'was', 'growing', '3', 'times', 'faster', 'on', 'the', 'day', 'David', 'Cameron', 'became', 'Prime', 'Minister', 'than', 'it', 'is', 'today', '..', '#BBCqt', 'http://t.co…']\n", "['RT', '@GregLauder', ':', 'the', 'UKIP', 'east', 'lothian', 'candidate', 'looks', 'about', '16', 'and', 'still', 'has', 'an', 'msn', 'addy', 'http://t.co/7eIU0c5Fm1']\n", "['RT', '@thesundaypeople', ':', \"UKIP's\", 'housing', 'spokesman', 'rakes', 'in', '£', '800k', 'in', 'housing', 'benefit', 'from', 'migrants', '.', 'http://t.co/GVwb9Rcb4w', 'http://t.co/c1AZxcLh…']\n" ] } ], "source": [ "tokenized = twitter_samples.tokenized('tweets.20150430-223406.json')\n", "for toks in tokenized[:5]:\n", " print(toks)" ] }, { "cell_type": "markdown", "metadata": { "collapsed": false }, "source": [ "### Extracting Parts of a Tweet\n", "\n", "If we want to carry out other kinds of analysis on Tweets, we have to work directly with the file rather than via the corpus reader. For demonstration purposes, we will use the same file as the one in the preceding section, namely `tweets.20150430-223406.json`. The `abspath()` method of the corpus gives us the full pathname of the relevant file. If your NLTK data is installed in the default location on a Unix-like system, this pathname will be `'/usr/share/nltk_data/corpora/twitter_samples/tweets.20150430-223406.json'`." ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from nltk.corpus import twitter_samples\n", "input_file = twitter_samples.abspath(\"tweets.20150430-223406.json\")" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "The function `json2csv()` takes as input a file-like object consisting of Tweets as line-delimited JSON objects and returns a file in CSV format. The third parameter of the function lists the fields that we want to extract from the JSON. One of the simplest examples is to extract just the text of the Tweets (though of course it would have been even simpler to use the `strings()` method of the corpus reader)." ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from nltk.twitter.common import json2csv\n", "with open(input_file) as fp:\n", " json2csv(fp, 'tweets_text.csv', ['text'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We've passed the filename `'tweets_text.csv'` as the second argument of `json2csv()`. Unless you provide a complete pathname, the file will be created in the directory where you are currently executing Python.\n", "\n", "If you open the file `'tweets_text.csv'`, the first 5 lines should look as follows:\n", "\n", "```\n", "RT @KirkKus: Indirect cost of the UK being in the EU is estimated to be costing Britain £170 billion per year! #BetterOffOut #UKIP\n", "VIDEO: Sturgeon on post-election deals http://t.co/BTJwrpbmOY\n", "RT @LabourEoin: The economy was growing 3 times faster on the day David Cameron became Prime Minister than it is today.. 
#BBCqt http://t.co…\n", "RT @GregLauder: the UKIP east lothian candidate looks about 16 and still has an msn addy http://t.co/7eIU0c5Fm1\n", "RT @thesundaypeople: UKIP's housing spokesman rakes in £800k in housing benefit from migrants. http://t.co/GVwb9Rcb4w http://t.co/c1AZxcLh…\n", "```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "However, in some applications you may want to work with Tweet metadata, e.g., the creation date and the user. As mentioned earlier, all the fields of a Tweet object are described in [the official Twitter API](https://dev.twitter.com/overview/api/tweets). \n", "\n", "The third argument of `json2csv()` can be specified so that the function selects relevant parts of the metadata. For example, the following will generate a CSV file including most of the metadata together with the id of the user who published it." ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [], "source": [ "with open(input_file) as fp:\n", " json2csv(fp, 'tweets.20150430-223406.tweet.csv',\n", " ['created_at', 'favorite_count', 'id', 'in_reply_to_status_id', \n", " 'in_reply_to_user_id', 'retweet_count', 'retweeted', \n", " 'text', 'truncated', 'user.id'])" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "created_at,favorite_count,id,in_reply_to_status_id,in_reply_to_user_id,retweet_count,retweeted,text,truncated,user.id\n", "\n", "Thu Apr 30 21:34:06 +0000 2015,0,593891099434983425,,,0,False,RT @KirkKus: Indirect cost of the UK being in the EU is estimated to be costing Britain £170 billion per year! #BetterOffOut #UKIP,False,107794703\n", "\n", "Thu Apr 30 21:34:06 +0000 2015,0,593891099548094465,,,0,False,VIDEO: Sturgeon on post-election deals http://t.co/BTJwrpbmOY,False,557422508\n", "\n", "Thu Apr 30 21:34:06 +0000 2015,0,593891099388846080,,,0,False,RT @LabourEoin: The economy was growing 3 times faster on the day David Cameron became Prime Minister than it is today.. #BBCqt http://t.co…,False,3006692193\n", "\n", "Thu Apr 30 21:34:06 +0000 2015,0,593891100429045760,,,0,False,RT @GregLauder: the UKIP east lothian candidate looks about 16 and still has an msn addy http://t.co/7eIU0c5Fm1,False,455154030\n", "\n" ] } ], "source": [ "for line in open('tweets.20150430-223406.tweet.csv').readlines()[:5]:\n", " print(line)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The first nine elements of the list are attributes of the Tweet, while the last one, `user.id`, takes the user object associated with the Tweet and retrieves the attributes in the list (in this case only the `id`). The object for the Twitter user is described in the [Twitter API for users](https://dev.twitter.com/overview/api/users)." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The rest of the Tweet's metadata consists of the so-called [entities](https://dev.twitter.com/overview/api/entities) and [places](https://dev.twitter.com/overview/api/places). The following examples show how to extract each kind of entity. They all include the id of the Tweet, and some of them also include the text for clarity." 
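Once the entity CSVs below have been written, they can be read back with the standard `csv` module. Here is a rough sketch for the hashtags file, where (as in NLTK's test reference data) the columns are `id`, `text` and `hashtags.text`, and a Tweet with two hashtags occupies two rows:

```python
import csv

# Read back the hashtags CSV produced by the first json2csv_entities() call below
with open('tweets.20150430-223406.hashtags.csv', newline='', encoding='utf8') as fp:
    for row in list(csv.DictReader(fp))[:5]:
        print(row['id'], row['hashtags.text'])
```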
] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from nltk.twitter.common import json2csv_entities\n", "with open(input_file) as fp:\n", " json2csv_entities(fp, 'tweets.20150430-223406.hashtags.csv',\n", " ['id', 'text'], 'hashtags', ['text'])\n", " \n", "with open(input_file) as fp:\n", " json2csv_entities(fp, 'tweets.20150430-223406.user_mentions.csv',\n", " ['id', 'text'], 'user_mentions', ['id', 'screen_name'])\n", " \n", "with open(input_file) as fp:\n", " json2csv_entities(fp, 'tweets.20150430-223406.media.csv',\n", " ['id'], 'media', ['media_url', 'url'])\n", " \n", "with open(input_file) as fp:\n", " json2csv_entities(fp, 'tweets.20150430-223406.urls.csv',\n", " ['id'], 'urls', ['url', 'expanded_url'])\n", " \n", "with open(input_file) as fp:\n", " json2csv_entities(fp, 'tweets.20150430-223406.place.csv',\n", " ['id', 'text'], 'place', ['name', 'country'])\n", "\n", "with open(input_file) as fp:\n", " json2csv_entities(fp, 'tweets.20150430-223406.place_bounding_box.csv',\n", " ['id', 'name'], 'place.bounding_box', ['coordinates'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Additionally, when a Tweet is actually a retweet, the original tweet can be also fetched from the same file, as follows:" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": false }, "outputs": [], "source": [ "with open(input_file) as fp:\n", " json2csv_entities(fp, 'tweets.20150430-223406.original_tweets.csv',\n", " ['id'], 'retweeted_status', ['created_at', 'favorite_count', \n", " 'id', 'in_reply_to_status_id', 'in_reply_to_user_id', 'retweet_count',\n", " 'text', 'truncated', 'user.id'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here the first id corresponds to the retweeted Tweet, and the second id to the original Tweet.\n", "\n", "### Using Dataframes\n", "\n", "Sometimes it's convenient to manipulate CSV files as tabular data, and this is made easy with the [Pandas](http://pandas.pydata.org/) data analysis library. `pandas` is not currently one of the dependencies of NLTK, and you will probably have to install it specially.\n", "\n", "Here is an example of how to read a CSV file into a `pandas` dataframe. We use the `head()` method of a dataframe to just show the first 5 rows." ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
    \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
    created_atfavorite_countin_reply_to_status_idin_reply_to_user_idretweet_countretweetedtexttruncateduser.id
    id
    593891099434983425Thu Apr 30 21:34:06 +0000 20150NaNNaN0FalseRT @KirkKus: Indirect cost of the UK being in ...False107794703
    593891099548094465Thu Apr 30 21:34:06 +0000 20150NaNNaN0FalseVIDEO: Sturgeon on post-election deals http://...False557422508
    593891099388846080Thu Apr 30 21:34:06 +0000 20150NaNNaN0FalseRT @LabourEoin: The economy was growing 3 time...False3006692193
    593891100429045760Thu Apr 30 21:34:06 +0000 20150NaNNaN0FalseRT @GregLauder: the UKIP east lothian candidat...False455154030
    593891100768784384Thu Apr 30 21:34:07 +0000 20150NaNNaN0FalseRT @thesundaypeople: UKIP's housing spokesman ...False187547338
    \n", "
    " ], "text/plain": [ " created_at favorite_count \\\n", "id \n", "593891099434983425 Thu Apr 30 21:34:06 +0000 2015 0 \n", "593891099548094465 Thu Apr 30 21:34:06 +0000 2015 0 \n", "593891099388846080 Thu Apr 30 21:34:06 +0000 2015 0 \n", "593891100429045760 Thu Apr 30 21:34:06 +0000 2015 0 \n", "593891100768784384 Thu Apr 30 21:34:07 +0000 2015 0 \n", "\n", " in_reply_to_status_id in_reply_to_user_id retweet_count \\\n", "id \n", "593891099434983425 NaN NaN 0 \n", "593891099548094465 NaN NaN 0 \n", "593891099388846080 NaN NaN 0 \n", "593891100429045760 NaN NaN 0 \n", "593891100768784384 NaN NaN 0 \n", "\n", " retweeted \\\n", "id \n", "593891099434983425 False \n", "593891099548094465 False \n", "593891099388846080 False \n", "593891100429045760 False \n", "593891100768784384 False \n", "\n", " text \\\n", "id \n", "593891099434983425 RT @KirkKus: Indirect cost of the UK being in ... \n", "593891099548094465 VIDEO: Sturgeon on post-election deals http://... \n", "593891099388846080 RT @LabourEoin: The economy was growing 3 time... \n", "593891100429045760 RT @GregLauder: the UKIP east lothian candidat... \n", "593891100768784384 RT @thesundaypeople: UKIP's housing spokesman ... \n", "\n", " truncated user.id \n", "id \n", "593891099434983425 False 107794703 \n", "593891099548094465 False 557422508 \n", "593891099388846080 False 3006692193 \n", "593891100429045760 False 455154030 \n", "593891100768784384 False 187547338 " ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "tweets = pd.read_csv('tweets.20150430-223406.tweet.csv', index_col=2, header=0, encoding=\"utf8\")\n", "tweets.head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Using the dataframe it is easy, for example, to first select Tweets with a specific user ID and then retrieve their `'text'` value." ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "id\n", "593891099548094465 VIDEO: Sturgeon on post-election deals http://...\n", "593891101766918144 SNP leader faces audience questions http://t.c...\n", "Name: text, dtype: object" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tweets.loc[tweets['user.id'] == 557422508]['text']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Expanding a list of Tweet IDs\n", "\n", "Because the Twitter Terms of Service place severe restrictions on the distribution of Tweets by third parties, a workaround is to instead distribute just the Tweet IDs, which are not subject to the same restrictions. The method `expand_tweetids()` sends a request to the Twitter API to return the full Tweet (in Twitter's terminology, a *hydrated* Tweet) that corresponds to a given Tweet ID. \n", "\n", "Since Tweets can be deleted by users, it's possible that certain IDs will only retrieve a null value. For this reason, it's safest to use a `try`/`except` block when retrieving values from the fetched Tweet. 
" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Counted 10 Tweet IDs in <_io.StringIO object at 0x107234558>.\n", "id: 588665495508766721\n", "RT @30SecFlghts: Yep it was bad from the jump https://t.co/6vsFIulyRB\n", "\n", "id: 588665495487811584\n", "@8_s2_5 おかえりなさいまし\n", "\n", "id: 588665495492124672\n", "O link http://t.co/u8yh4xdIAF por @YouTube é o tweet mais popular hoje na minha feed.\n", "\n", "id: 588665495487844352\n", "RT @dam_anison: 【アニサマ2014 LIVEカラオケ⑤】\n", "μ'sのライブ映像がDAMに初登場!それは「それは僕たちの奇跡」!\n", "μ's結成から5年間の\"キセキ\"を噛み締めながら歌いたい!\n", "→http://t.co/ZCAB7jgE4L #anisama http:…\n", "\n", "id: 588665495513006080\n", "[Tweet not available]\n", "\n", "id: 588665495525588992\n", "坂道の時に限って裏の車がめっちゃ車間距離近づけて停めてくるから死ぬかと思った\n", "\n", "id: 588665495512948737\n", "Christina Grimmie #RisingStar\n", "17\n", "\n", "id: 588665495487909888\n", "Dolgun Dudaklı Kadınların Çok İyi Bildiği 14 Şey http://t.co/vvEzTlqWOv http://t.co/dsWke4uXQ3\n", "\n" ] } ], "source": [ "from io import StringIO\n", "ids_f =\\\n", " StringIO(\"\"\"\\\n", " 588665495492124672\n", " 588665495487909888\n", " 588665495508766721\n", " 588665495513006080\n", " 588665495517200384\n", " 588665495487811584\n", " 588665495525588992\n", " 588665495487844352\n", " 88665495492014081\n", " 588665495512948737\"\"\")\n", " \n", "oauth = credsfromfile()\n", "client = Query(**oauth)\n", "hydrated = client.expand_tweetids(ids_f)\n", "\n", " \n", "for tweet in hydrated: \n", " id_str = tweet['id_str']\n", " print('id: {}'.format(id_str))\n", " text = tweet['text']\n", " if text.startswith('@null'):\n", " text = \"[Tweet not available]\"\n", " print(text + '\\n')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Although we provided the list of IDs as a string in the above example, the standard use case is to pass a file-like object as the argument to `expand_tweetids()`. 
" ] } ], "metadata": { "celltoolbar": "Raw Cell Format", "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.4.3" } }, "nbformat": 4, "nbformat_minor": 0 } nltk-3.7/nltk/test/unit/000077500000000000000000000000001420073152400152375ustar00rootroot00000000000000nltk-3.7/nltk/test/unit/__init__.py000066400000000000000000000000001420073152400173360ustar00rootroot00000000000000nltk-3.7/nltk/test/unit/files/000077500000000000000000000000001420073152400163415ustar00rootroot00000000000000nltk-3.7/nltk/test/unit/files/bad_oauth1-1.txt000066400000000000000000000003241420073152400212460ustar00rootroot00000000000000# missing line for oauth_token_secret app_key=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa app_secret=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb oauth_token=cccccccccccccccccccccccccccccccccccccccccccccccc nltk-3.7/nltk/test/unit/files/bad_oauth1-2.txt000066400000000000000000000004501420073152400212470ustar00rootroot00000000000000# first line is malformed ('app-key' should 'be app_key') app-key=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa app_secret=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb oauth_token=cccccccccccccccccccccccccccccccccccccccccccccccc oauth_token_secret=ddddddddddddddddddddddddddddddddddddddddddd nltk-3.7/nltk/test/unit/files/bad_oauth1-3.txt000066400000000000000000000001311420073152400212440ustar00rootroot00000000000000# first two lines are collapsed app_key=aapp_secret=b oauth_token=c oauth_token_secret=d nltk-3.7/nltk/test/unit/files/credentials.txt000066400000000000000000000000721420073152400213760ustar00rootroot00000000000000app_key=a app_secret=b oauth_token=c oauth_token_secret=d nltk-3.7/nltk/test/unit/files/tweets.20150430-223406.hashtag.csv.ref000066400000000000000000000162661420073152400242300ustar00rootroot00000000000000id,text,hashtags.text 593891099434983425,RT @KirkKus: Indirect cost of the UK being in the EU is estimated to be costing Britain £170 billion per year! #BetterOffOut #UKIP,BetterOffOut 593891099434983425,RT @KirkKus: Indirect cost of the UK being in the EU is estimated to be costing Britain £170 billion per year! #BetterOffOut #UKIP,UKIP 593891099388846080,RT @LabourEoin: The economy was growing 3 times faster on the day David Cameron became Prime Minister than it is today.. #BBCqt http://t.co…,BBCqt 593891100982546432,RT @Nigel_Farage: Make sure you tune in to #AskNigelFarage tonight on BBC 1 at 22:50! #UKIP http://t.co/ogHSc2Rsr2,AskNigelFarage 593891100982546432,RT @Nigel_Farage: Make sure you tune in to #AskNigelFarage tonight on BBC 1 at 22:50! #UKIP http://t.co/ogHSc2Rsr2,UKIP 593891101154619392,RT @joannetallis: Ed Milliband is an embarrassment. Would you want him representing the UK?! #bbcqt vote @Conservatives,bbcqt 593891102895296512,"How dare @EdMiliband_MP force Socialists to chose between the English LP and the SNP! 
The #SNP are the last, true Socialist party in the UK",SNP 593891106787631104,RT @Markfergusonuk: The Sun’s Twitter worm appears to believe Ed Miliband is winning so far #bbcqt http://t.co/ZgZbSwnZxZ,bbcqt 593891107232165888,"RT @KatieKhaleesi: I'm #SNPbecause in my entire adult life I've only seen Labour & Tories cause misery, war, and further victimisation of t…",SNPbecause 593891107039215616,"“@suttonnick: Friday's Times front page: Miliband savaged for ‘lies’ over spending #tomorrowspaperstoday #bbcpapers http://t.co/ts9ZnULDwr”",tomorrowspaperstoday 593891107039215616,"“@suttonnick: Friday's Times front page: Miliband savaged for ‘lies’ over spending #tomorrowspaperstoday #bbcpapers http://t.co/ts9ZnULDwr”",bbcpapers 593891107857158145,RT @dhothersall: Scenes of celebration in Glasgow as #SNP applaud a #Conservative victory. Just a bit of fun. Or is it? :-) https://t.co/hh…,SNP 593891107857158145,RT @dhothersall: Scenes of celebration in Glasgow as #SNP applaud a #Conservative victory. Just a bit of fun. Or is it? :-) https://t.co/hh…,Conservative 593891108314308608,RT @LabourEoin: Another humongous lie from David Cameron. He has not clamped down on Tax Avoidance. Uncollected Tax has risen #BBCqt http:/…,BBCqt 593891110239543296,RT @jsteve372: .@mik61scot it's Nicola Sturgeon's new education policy... Improve literacy by getting deprive kids reading words on helicop…,SNPout 593891110763757568,RT @StephenHep21: @ScotlandTonight Ed is free to say he wont do a deal. It is #SNP who will never be forgiven if they vote down a Labour go…,SNP 593891111502016512,RT @Ed_Miliband: I want to be completely clear: there will no coalition and no deals with the SNP. #bbcqt,bbcqt 593891111824941056,RT @HuffPostUK: The Tory spin message about David Cameron's performance is a bit obvious #bbcqt http://t.co/yihDlG5NVo http://t.co/KsqFCqwS…,bbcqt 593891111942365186,RT @Ed_Miliband: I want to be completely clear: there will no coalition and no deals with the SNP. #bbcqt,bbcqt 593891113750106112,"RT @Riath84: Great, you reduced the deficit, by killing and shitting on the poor. Good job. Not really something to be bragging about Torie…",TheLastLeg 593891113498443776,Can you shut the NS down until May 8. #Prettyplease https://t.co/u5foOReQIh,Prettyplease 593891114215723008,Fucking biased #BBC allowing @Nigel_Farage to have a #leadersdebate program all of his own... #NigelFarage #UKIP #GE2015,BBC 593891114215723008,Fucking biased #BBC allowing @Nigel_Farage to have a #leadersdebate program all of his own... #NigelFarage #UKIP #GE2015,leadersdebate 593891114215723008,Fucking biased #BBC allowing @Nigel_Farage to have a #leadersdebate program all of his own... #NigelFarage #UKIP #GE2015,NigelFarage 593891114215723008,Fucking biased #BBC allowing @Nigel_Farage to have a #leadersdebate program all of his own... #NigelFarage #UKIP #GE2015,UKIP 593891114215723008,Fucking biased #BBC allowing @Nigel_Farage to have a #leadersdebate program all of his own... #NigelFarage #UKIP #GE2015,GE2015 593891114853216257,RT @LeeMartin4947: Miliband on Question Time: I won't have Labour government if it means SNP deal http://t.co/8DPMTRsqG1 #GE2015,GE2015 593891116363214848,"@ScotlandTonight If Miliband will step aside to let Tories in, we need 59 SNP MPs more than ever to stand up for Scotland @theSNP #VoteSNP",VoteSNP 593891116405100544,RT @ChristinaSNP: I just heard Milliband's pronouncements........ Oh dear.... Cutting off his nose to spite the Scots.... 
#VoteSNP,VoteSNP 593891117269155841,#Newsnight or Farage talking cr*p for nearly half an hour. A tough call; though Farage might well melt down yet again w/audience questions..,Newsnight 593891117525016576,RT @chunkymark: How would labour supporters/country feel if @Ed_Miliband let Tories have another 5 years by refusing to have coalition with…,bbcqt 593891121111179265,David Cameron hugs another husky in the storms http://t.co/WWkaKjKEdJ #david Cameron #hugahusky #davidcamerontweet #greatstorm 2013 #StJude,david 593891121111179265,David Cameron hugs another husky in the storms http://t.co/WWkaKjKEdJ #david Cameron #hugahusky #davidcamerontweet #greatstorm 2013 #StJude,hugahusky 593891121111179265,David Cameron hugs another husky in the storms http://t.co/WWkaKjKEdJ #david Cameron #hugahusky #davidcamerontweet #greatstorm 2013 #StJude,davidcamerontweet 593891121111179265,David Cameron hugs another husky in the storms http://t.co/WWkaKjKEdJ #david Cameron #hugahusky #davidcamerontweet #greatstorm 2013 #StJude,greatstorm 593891121111179265,David Cameron hugs another husky in the storms http://t.co/WWkaKjKEdJ #david Cameron #hugahusky #davidcamerontweet #greatstorm 2013 #StJude,StJude 593891120913977344,RT @gemini2359: #bbcqt Tories 1st introduced PFI In NHS not Labour. Here's Ken Clarke boasting of it. http://t.co/4lk4w72f1A,bbcqt 593891121312440320,RT @labourpress: We learnt the most from the question David Cameron wouldn't answer tonight: clear now the Tories will cut Child Benefit #b…,bbcqt 593891123149570049,RT @SunNation: #SUNNATION EXCLUSIVE: We reveal the man responsible for Ed slipping off the stage... http://t.co/WZ7bITsezb http://t.co/gd42…,SUNNATION 593891123304787968,"RT @HouseOfTraitors: #bbcbias Number of times Party/Leader mentioned tonight on @BBCNews LAB 14 CON 9 LIB 7 SNP 5 UKIP 0",bbcbias 593891124680482817,RT @BuntinRobert: Murphy in Scotland insults our intelligence. Milliband in London demeans all Scots. The Scots will not take it lying down…,SNPMay7th 593891124953161728,#UKIP TO WIN #FARAGEFOREVER,UKIP 593891124953161728,#UKIP TO WIN #FARAGEFOREVER,FARAGEFOREVER 593891125246689280,"RT @paulwaugh: ICM found just 6% #bbcqt viewers changed their mind. 
Small sub sample but: Clegg won 32% of switchers, Cam 25% Ed 20% http:/…",bbcqt 593891127520043008,"RT @WilsonWilson009: Leaked reports that #Tories feared ""inevitable ""election defeat so ran a campaign of false flags, muddying of the wate…",Tories 593891128262418433,"RT @benglaze: Great line from Lucy Powell on Miliband's minor trip: ""Ed slipped on David Cameron's sweat."" #GE2015 #bbcqt",GE2015 593891128262418433,"RT @benglaze: Great line from Lucy Powell on Miliband's minor trip: ""Ed slipped on David Cameron's sweat."" #GE2015 #bbcqt",bbcqt nltk-3.7/nltk/test/unit/files/tweets.20150430-223406.media.csv.ref000066400000000000000000000037031420073152400236600ustar00rootroot00000000000000id,media.media_url,media.url 593891099388846080,http://pbs.twimg.com/media/CD3PUiCWoAEWBF6.png,http://t.co/yKornG2YDo 593891100429045760,http://pbs.twimg.com/media/CD2eDllWoAACbam.jpg,http://t.co/7eIU0c5Fm1 593891100768784384,http://pbs.twimg.com/media/CD1c3gmWEAMiNl1.jpg,http://t.co/c1AZxcLhbH 593891100982546432,http://pbs.twimg.com/media/CD26T4pW8AExTBG.png,http://t.co/ogHSc2Rsr2 593891101452476416,http://pbs.twimg.com/media/CD2Z53AWoAAviDQ.jpg,http://t.co/doz9RMilol 593891101959917568,http://pbs.twimg.com/media/CDsOoEFWEAExtzL.jpg,http://t.co/cwKvHu2LWl 593891102257713152,http://pbs.twimg.com/media/CD3qKQ3VEAA6pgM.jpg,http://t.co/4myFekg5ex 593891103100776448,http://pbs.twimg.com/media/CD3mxBMW0AEVICD.jpg,http://t.co/SNMkRDCe9f 593891104438759425,http://pbs.twimg.com/media/CD2Z53AWoAAviDQ.jpg,http://t.co/doz9RMilol 593891106590429185,http://pbs.twimg.com/media/CC9II0QUkAIeVDr.jpg,http://t.co/o3sG5B4Llj 593891107039215616,http://pbs.twimg.com/media/CD3qP7lWYAAOBXW.jpg,http://t.co/ts9ZnULDwr 593891108314308608,http://pbs.twimg.com/media/CD3NTZdW8AE3uoT.png,http://t.co/F48orkoAbc 593891110616989696,http://pbs.twimg.com/media/CD3rJUVUMAAuEth.jpg,http://t.co/gHMZXK3Tbc 593891111824941056,http://pbs.twimg.com/media/CD3oAiPWAAAdU2k.png,http://t.co/KsqFCqwSFB 593891116140888065,http://pbs.twimg.com/media/CD3iopTXIAA914t.jpg,http://t.co/XCfyirgaxE 593891117474668544,http://pbs.twimg.com/media/CD3rJUVUMAAuEth.jpg,http://t.co/gHMZXK3Tbc 593891118867185664,http://pbs.twimg.com/media/CD3ss9gW8AI3c8k.jpg,http://t.co/bGtWm4VM5D 593891118779097090,http://pbs.twimg.com/media/CD1c3gmWEAMiNl1.jpg,http://t.co/c1AZxcLhbH 593891119609556992,http://pbs.twimg.com/media/CD3ss9gW8AI3c8k.jpg,http://t.co/bGtWm4VM5D 593891120913977344,http://pbs.twimg.com/media/CAIbuq_WQAAvxRg.png,http://t.co/4lk4w72f1A 593891123149570049,http://pbs.twimg.com/media/CD3fNlnWIAA62Iw.jpg,http://t.co/gd42tw4Tgc 593891128472170497,http://pbs.twimg.com/media/CDvhVVgWoAAjmfz.jpg,http://t.co/q0B18I9OFA nltk-3.7/nltk/test/unit/files/tweets.20150430-223406.place.csv.ref000066400000000000000000000013231420073152400236610ustar00rootroot00000000000000id,text,place.name,place.country 593891101313896449,LOLZ to Trickle Down Wealth. It's never trickling past their own wallets. Greed always wins $$$ for the greedy. https://t.co/X7deoPbS97,Mesa,United States 593891102895296512,"How dare @EdMiliband_MP force Socialists to chose between the English LP and the SNP! The #SNP are the last, true Socialist party in the UK",Edinburgh,United Kingdom 593891112407924736,Account for every penny? Like you did last time? Your nose should be 2 foot long! 
https://t.co/k4NMrenulf,Hove,United Kingdom 593891116363214848,"@ScotlandTonight If Miliband will step aside to let Tories in, we need 59 SNP MPs more than ever to stand up for Scotland @theSNP #VoteSNP",Scotland,United Kingdom nltk-3.7/nltk/test/unit/files/tweets.20150430-223406.placeboundingbox.csv.ref000066400000000000000000000011501420073152400261160ustar00rootroot00000000000000place.id,place.name,bounding_box.coordinates 44d207663001f00b,Mesa,"[[-111.894548, 33.306275], [-111.894548, 33.505234], [-111.5805834, 33.505234], [-111.5805834, 33.306275]]" 7ae9e2f2ff7a87cd,Edinburgh,"[[-3.3285119, 55.894729], [-3.3285119, 55.991662], [-3.077505, 55.991662], [-3.077505, 55.894729]]" 42e88d76579c3d91,Hove,"[[-0.236008, 50.822226], [-0.236008, 50.8569235], [-0.1474382, 50.8569235], [-0.1474382, 50.822226]]" 0af014accd6f6e99,Scotland,"[[-7.66085699743185, 54.6336309570711], [-7.66085699743185, 60.8452769694519], [-0.740025000483186, 60.8452769694519], [-0.740025000483186, 54.6336309570711]]" nltk-3.7/nltk/test/unit/files/tweets.20150430-223406.retweet.csv.ref000066400000000000000000000310701420073152400242560ustar00rootroot00000000000000id,retweeted_status.created_at,retweeted_status.favorite_count,retweeted_status.id,retweeted_status.in_reply_to_status_id,retweeted_status.in_reply_to_user_id,retweeted_status.retweet_count,retweeted_status.text,retweeted_status.truncated,retweeted_status.user.id 593891099434983425,Thu Apr 30 21:11:02 +0000 2015,3,593885295323521025,,,11,Indirect cost of the UK being in the EU is estimated to be costing Britain £170 billion per year! #BetterOffOut #UKIP,False,929903647 593891099388846080,Thu Apr 30 19:25:17 +0000 2015,68,593858679256002560,,,225,The economy was growing 3 times faster on the day David Cameron became Prime Minister than it is today.. #BBCqt http://t.co/yKornG2YDo,False,168090600 593891100429045760,Thu Apr 30 15:50:02 +0000 2015,6,593804511371776000,,,3,the UKIP east lothian candidate looks about 16 and still has an msn addy http://t.co/7eIU0c5Fm1,False,317875569 593891100768784384,Thu Apr 30 14:06:14 +0000 2015,114,593778387065384960,,,581,UKIP's housing spokesman rakes in £800k in housing benefit from migrants. http://t.co/GVwb9Rcb4w http://t.co/c1AZxcLhbH,False,262604305 593891100982546432,Thu Apr 30 17:54:20 +0000 2015,177,593835794000908288,,,208,Make sure you tune in to #AskNigelFarage tonight on BBC 1 at 22:50! #UKIP http://t.co/ogHSc2Rsr2,False,19017675 593891101154619392,Thu Apr 30 20:40:06 +0000 2015,7,593877508363329536,,,10,Ed Milliband is an embarrassment. Would you want him representing the UK?! #bbcqt vote @Conservatives,False,232263561 593891101452476416,Thu Apr 30 15:31:55 +0000 2015,343,593799952637243393,,,1257,"The FT is backing the Tories. On an unrelated note, here's a photo of FT leader writer Jonathan Ford (next to Boris) http://t.co/doz9RMilol",False,1525565280 593891101838340096,Thu Apr 30 20:06:13 +0000 2015,100,593868982739951616,593868133410185216,1225696320,103,“@George_Osborne: Ed Miliband proved tonight why he's not up to the job” Tbf you've spent 5 years doing that you salivating dog rapist.,False,299530170 593891101959917568,Tue Apr 28 16:10:30 +0000 2015,1224,593084884467449856,,,2778,"Cameron ""Ed Milliband hanging out with Russell Brand. He is a joke. This is an election. 
This is about real people' http://t.co/cwKvHu2LWl",False,855263756 593891102257713152,Thu Apr 30 21:22:32 +0000 2015,1,593888189028392960,,,12,Ed Miliband: Last Labour government did not overspend http://t.co/W9RJ2aSH6o http://t.co/4myFekg5ex,False,16558943 593891103100776448,Thu Apr 30 21:07:42 +0000 2015,4,593884456655683584,,,16,Well thats it. LABOUR would rather have a TORY government rather than work with the SNP. http://t.co/SNMkRDCe9f,False,28584042 593891102836563969,Thu Apr 30 21:17:11 +0000 2015,0,593886843101433856,593886375281303552,320369769,1,"@dunleavy138 @CrillyBobc @theSNP @UKLabour I would be happy to do a deal with the SNP, but @Ed_Miliband was clear. If you want Lab, vote Lab",False,23607137 593891104438759425,Thu Apr 30 15:31:55 +0000 2015,343,593799952637243393,,,1258,"The FT is backing the Tories. On an unrelated note, here's a photo of FT leader writer Jonathan Ford (next to Boris) http://t.co/doz9RMilol",False,1525565280 593891106066198529,Thu Apr 30 20:18:10 +0000 2015,34,593871988231397376,,,50,Nick Clegg is just as responsible for this Govt's failing plan as David Cameron - he's backed the Tories all the way,False,162472533 593891106590429185,Sun Apr 19 12:35:55 +0000 2015,21,589769394143760384,,,21,Will the person who dropped a Vote SNP badge please call at Kilmarnock police station to collect it. http://t.co/o3sG5B4Llj,False,2787759816 593891106787631104,Thu Apr 30 20:08:21 +0000 2015,29,593869518973304832,,,62,The Sun’s Twitter worm appears to believe Ed Miliband is winning so far #bbcqt http://t.co/ZgZbSwnZxZ,False,28079313 593891107232165888,Thu Apr 30 10:41:28 +0000 2015,222,593726859847798785,,,195,"I'm #SNPbecause in my entire adult life I've only seen Labour & Tories cause misery, war, and further victimisation of the poor/vulnerable.",False,2171610117 593891107387375616,Thu Apr 30 18:51:47 +0000 2015,5,593850249891946496,,,11,"""UKIP is the bastard child of the big three parties. With apologies to all bastards."" - @PeterTatchell at the University of Manchester",False,40175634 593891107857158145,Thu Apr 30 13:53:30 +0000 2015,13,593775185884520448,,,20,Scenes of celebration in Glasgow as #SNP applaud a #Conservative victory. Just a bit of fun. Or is it? :-) https://t.co/hhAYgwbFMh,False,88059720 593891108314308608,Thu Apr 30 19:16:28 +0000 2015,56,593856459856809986,,,246,Another humongous lie from David Cameron. He has not clamped down on Tax Avoidance. Uncollected Tax has risen #BBCqt http://t.co/F48orkoAbc,False,168090600 593891110239543296,Thu Apr 30 17:28:45 +0000 2015,9,593829353575849984,593828150343249920,137690167,11,.@mik61scot it's Nicola Sturgeon's new education policy... Improve literacy by getting deprive kids reading words on helicopters... #SNPout,False,880165526 593891110616989696,Thu Apr 30 21:26:51 +0000 2015,17,593889272521957376,,,20,"Guardian front page, Friday 1 May 2015: Miliband hardens his line: I will not do deal with SNP http://t.co/gHMZXK3Tbc",False,87818409 593891110763757568,Thu Apr 30 20:30:13 +0000 2015,2,593875019765293056,,361756405,8,@ScotlandTonight Ed is free to say he wont do a deal. It is #SNP who will never be forgiven if they vote down a Labour govt + let in Tories.,False,2437160281 593891111502016512,Thu Apr 30 20:24:29 +0000 2015,782,593873576920424448,,,945,I want to be completely clear: there will no coalition and no deals with the SNP. 
#bbcqt,False,61781260 593891111824941056,Thu Apr 30 21:33:55 +0000 2015,0,593891051942891520,,,1,The Tory spin message about David Cameron's performance is a bit obvious #bbcqt http://t.co/yihDlG5NVo http://t.co/KsqFCqwSFB,False,271413771 593891111942365186,Thu Apr 30 20:24:29 +0000 2015,783,593873576920424448,,,946,I want to be completely clear: there will no coalition and no deals with the SNP. #bbcqt,False,61781260 593891111946551296,Thu Apr 30 21:01:43 +0000 2015,7,593882949646479362,,,2,"imagine fighting for us to stay in the union, promising that Scotland's voice will be heard, and then refusing to do a deal with the SNP.",False,330665067 593891113179750401,Thu Apr 30 21:07:52 +0000 2015,40,593884495289458689,593883648459448320,32522100,20,@JimForScotland Crazy statements like that just confirms to me I made the right decision in leaving Labour for the SNP.Time to get real Jim!,False,833702100 593891113750106112,Thu Apr 30 21:31:12 +0000 2015,0,593890368187424768,,,1,"Great, you reduced the deficit, by killing and shitting on the poor. Good job. Not really something to be bragging about Tories. #TheLastLeg",False,133211649 593891114496729088,Thu Apr 30 19:54:35 +0000 2015,6,593866053199011842,,,5,"If Miliband had replied then, ""ah, you work in recruitment? No wonder you're such a dick."" I'd have voted for him. But he didn't, so I won't",False,228706018 593891114744164354,Thu Apr 30 21:33:42 +0000 2015,0,593890999031746562,,,1,"Oh, Tories. Wiz zis message discipline you are really spoiling uz. http://t.co/wg5WhNyp6O",False,201003224 593891114853216257,Thu Apr 30 21:14:34 +0000 2015,0,593886183702339584,,,1,Miliband on Question Time: I won't have Labour government if it means SNP deal http://t.co/8DPMTRsqG1 #GE2015,False,574284134 593891116140888065,Thu Apr 30 20:49:42 +0000 2015,23,593879923095445504,,,23,"Scottish audience all address Sturgeon as ""Nicola"". Earlier on QT it was ""Mr Cameron"", ""Mr Miliband"" & ""Mr Clegg"". http://t.co/XCfyirgaxE",False,14072988 593891116405100544,Thu Apr 30 21:33:46 +0000 2015,1,593891016358375426,,,3,I just heard Milliband's pronouncements........ Oh dear.... Cutting off his nose to spite the Scots.... #VoteSNP,False,257023773 593891117525016576,Thu Apr 30 20:09:25 +0000 2015,72,593869789321363458,,,100,How would labour supporters/country feel if @Ed_Miliband let Tories have another 5 years by refusing to have coalition with SNP #bbcqt,False,212973087 593891117474668544,Thu Apr 30 21:26:51 +0000 2015,17,593889272521957376,,,21,"Guardian front page, Friday 1 May 2015: Miliband hardens his line: I will not do deal with SNP http://t.co/gHMZXK3Tbc",False,87818409 593891118384869376,Thu Apr 30 20:21:17 +0000 2015,69,593872773925208066,,,107,"Scots will never forgive Lab if Tories get in because Miliband refuses to work with SNP, says Stewart Hosie",False,1371900259 593891118867185664,Thu Apr 30 21:33:39 +0000 2015,2,593890984473296896,,,2,Ed Miliband bases his campaign around not trusting us on the EU. What would it say about us if we elected him? http://t.co/bGtWm4VM5D,False,85794542 593891118779097090,Thu Apr 30 14:06:14 +0000 2015,114,593778387065384960,,,582,UKIP's housing spokesman rakes in £800k in housing benefit from migrants. 
http://t.co/GVwb9Rcb4w http://t.co/c1AZxcLhbH,False,262604305 593891119165001728,Thu Apr 30 21:32:46 +0000 2015,0,593890763194421248,,,2,"You Gov Poll Tonight LAB - 35% (+1) CON - 34% (-1) UKIP - 12% (-) LDEM - 8% (-1) GRN - 5% (+1)",False,265642404 593891119609556992,Thu Apr 30 21:33:39 +0000 2015,2,593890984473296896,,,3,Ed Miliband bases his campaign around not trusting us on the EU. What would it say about us if we elected him? http://t.co/bGtWm4VM5D,False,85794542 593891119701880833,Thu Apr 30 21:04:30 +0000 2015,62,593883648459448320,,,123,Labour has called SNP bluff. The SNP must now be clear: are they willing to prevent or bring down a Labour government & let the Tories in?,False,32522100 593891120913977344,Thu Apr 30 19:41:07 +0000 2015,14,593862665296044032,,,39,#bbcqt Tories 1st introduced PFI In NHS not Labour. Here's Ken Clarke boasting of it. http://t.co/4lk4w72f1A,False,295283950 593891120989544448,Thu Apr 30 19:56:16 +0000 2015,16,593866476584599552,,,43,"It's far easier for Labour to work as a minority govt than the Tories. Miliband can call SNP's bluff, while Cameron has no such choice",False,22021978 593891121312440320,Thu Apr 30 20:54:29 +0000 2015,49,593881126898372609,,,95,We learnt the most from the question David Cameron wouldn't answer tonight: clear now the Tories will cut Child Benefit #bbcqt,False,79173926 593891123149570049,Thu Apr 30 20:34:42 +0000 2015,7,593876151300444161,,,30,#SUNNATION EXCLUSIVE: We reveal the man responsible for Ed slipping off the stage... http://t.co/WZ7bITsezb http://t.co/gd42tw4Tgc,False,2543147876 593891123304787968,Thu Apr 30 21:14:51 +0000 2015,2,593886255940775936,,,9,"#bbcbias Number of times Party/Leader mentioned tonight on @BBCNews LAB 14 CON 9 LIB 7 SNP 5 UKIP 0",False,1895366173 593891123292078081,Thu Apr 30 21:26:04 +0000 2015,0,593889078548004864,593871060010958848,187579804,1,@RogerV52 @UKIP 100% Nigel,False,270327100 593891123673878528,Thu Apr 30 21:30:43 +0000 2015,15,593890245172662272,,,52,"Latest YouGov poll (29 - 30 Apr): LAB - 35% (+1) CON - 34% (-1) UKIP - 12% (-) LDEM - 8% (-1) GRN - 5% (+1)",False,1541364193 593891123610963968,Thu Apr 30 21:18:57 +0000 2015,1,593887287412424705,,,1,Ed Miliband's in denial about his past squandering of our money. Don't let him do it again https://t.co/swqjv3xnHe,False,67323615 593891124680482817,Thu Apr 30 20:50:52 +0000 2015,7,593880217158057985,,,12,Murphy in Scotland insults our intelligence. Milliband in London demeans all Scots. The Scots will not take it lying down.#SNPMay7th,False,2506656725 593891125066346497,Thu Apr 30 17:53:05 +0000 2015,3,593835479088365568,,,6,Off to Citizens MK assembly to show my commitment to living wage. Another no show from my Tory opponent.,False,70803659 593891125246689280,Thu Apr 30 21:23:20 +0000 2015,5,593888390585700352,,,36,"ICM found just 6% #bbcqt viewers changed their mind. Small sub sample but: Clegg won 32% of switchers, Cam 25% Ed 20% http://t.co/auLQq4E2iV",False,26985345 593891126916046849,Wed Apr 29 21:25:25 +0000 2015,18,593526524323565568,,,33,"Russell Brand wants to hug people who thinks its ok to chop peoples heads off on our streets. 
Miliband is only reaching out to the insane.",False,1119758522 593891127520043008,Thu Apr 30 21:32:24 +0000 2015,0,593890669363634177,,,2,"Leaked reports that #Tories feared ""inevitable ""election defeat so ran a campaign of false flags, muddying of the waters & hiding the cuts!",False,1094730422 593891127918493698,Thu Apr 30 19:53:21 +0000 2015,67,593865743541886977,,,194,Labour will do a deal with SNP make no mistake! Scotland will rule the UK!,False,2656744153 593891128262418433,Thu Apr 30 21:15:16 +0000 2015,21,593886357690396672,,,45,"Great line from Lucy Powell on Miliband's minor trip: ""Ed slipped on David Cameron's sweat."" #GE2015 #bbcqt",False,114412574 593891128472170497,Wed Apr 29 07:27:02 +0000 2015,1133,593315538102788097,,,2253,"While Cameron slags off Miliband for meeting Russell Brand, shall we just bear this in mind? http://t.co/q0B18I9OFA",False,32876983 nltk-3.7/nltk/test/unit/files/tweets.20150430-223406.text.csv.ref000066400000000000000000000276251420073152400235760ustar00rootroot00000000000000text RT @KirkKus: Indirect cost of the UK being in the EU is estimated to be costing Britain £170 billion per year! #BetterOffOut #UKIP VIDEO: Sturgeon on post-election deals http://t.co/BTJwrpbmOY RT @LabourEoin: The economy was growing 3 times faster on the day David Cameron became Prime Minister than it is today.. #BBCqt http://t.co… RT @GregLauder: the UKIP east lothian candidate looks about 16 and still has an msn addy http://t.co/7eIU0c5Fm1 RT @thesundaypeople: UKIP's housing spokesman rakes in £800k in housing benefit from migrants. http://t.co/GVwb9Rcb4w http://t.co/c1AZxcLh… RT @Nigel_Farage: Make sure you tune in to #AskNigelFarage tonight on BBC 1 at 22:50! #UKIP http://t.co/ogHSc2Rsr2 RT @joannetallis: Ed Milliband is an embarrassment. Would you want him representing the UK?! #bbcqt vote @Conservatives "RT @abstex: The FT is backing the Tories. On an unrelated note, here's a photo of FT leader writer Jonathan Ford (next to Boris) http://t.c…" RT @NivenJ1: “@George_Osborne: Ed Miliband proved tonight why he's not up to the job” Tbf you've spent 5 years doing that you salivating do… LOLZ to Trickle Down Wealth. It's never trickling past their own wallets. Greed always wins $$$ for the greedy. https://t.co/X7deoPbS97 SNP leader faces audience questions http://t.co/TYClKltSpW "RT @cononeilluk: Cameron ""Ed Milliband hanging out with Russell Brand. He is a joke. This is an election. This is about real people' http:/…" RT @politicshome: Ed Miliband: Last Labour government did not overspend http://t.co/W9RJ2aSH6o http://t.co/4myFekg5ex "If Miliband is refusing to do any deal with the SNP, how does he plan on forming a government?" RT @scotnotbritt: Well thats it. LABOUR would rather have a TORY government rather than work with the SNP. http://t.co/SNMkRDCe9f Cameron wins last TV contest of election campaign - poll: LONDON (Reuters) - Prime Minister David Cameron won the… http://t.co/aUMOoYWOSk "RT @stephen501: @dunleavy138 @CrillyBobc @theSNP @UKLabour I would be happy to do a deal with the SNP, but @Ed_Miliband was clear. If you w…" "How dare @EdMiliband_MP force Socialists to chose between the English LP and the SNP! The #SNP are the last, true Socialist party in the UK" Watch: Ed Miliband trips off the stage following Question Time leaders special http://t.co/X61IGbe07R "RT @abstex: The FT is backing the Tories. 
On an unrelated note, here's a photo of FT leader writer Jonathan Ford (next to Boris) http://t.c…" @B0MBSKARE the anti-Scottish feeling is largely a product of Tory press scaremongering. In practice most people won't give a toss! "Miliband stumbles, Cameron dodges http://t.co/wnv2zOhQvq" Miliband - I'd pass on PM job rather than do deal with Scots nationalists: LONDON (Reuters) - British Labour Party… http://t.co/2cFGVfWkqF RT @GloriaDePiero: Nick Clegg is just as responsible for this Govt's failing plan as David Cameron - he's backed the Tories all the way RT @mykilmarnock: Will the person who dropped a Vote SNP badge please call at Kilmarnock police station to collect it. http://t.co/o3sG5B4L… RT @Markfergusonuk: The Sun’s Twitter worm appears to believe Ed Miliband is winning so far #bbcqt http://t.co/ZgZbSwnZxZ "@ScottishPleb @AHairyBiker SNP toxic in England. Why would Labour want to do a deal with them?" "RT @KatieKhaleesi: I'm #SNPbecause in my entire adult life I've only seen Labour & Tories cause misery, war, and further victimisation of t…" Jeremy Vine doesn't think the SNP Scottish takeover merits a mention "RT @daniloxxv: ""UKIP is the bastard child of the big three parties. With apologies to all bastards."" - @PeterTatchell at the University of …" "“@suttonnick: Friday's Times front page: Miliband savaged for ‘lies’ over spending #tomorrowspaperstoday #bbcpapers http://t.co/ts9ZnULDwr”" RT @dhothersall: Scenes of celebration in Glasgow as #SNP applaud a #Conservative victory. Just a bit of fun. Or is it? :-) https://t.co/hh… "@KTHopkins SNP thickos look forward to having the balance of power, bringing fair, decent policies that fascists bigots like u detest!" RT @LabourEoin: Another humongous lie from David Cameron. He has not clamped down on Tax Avoidance. Uncollected Tax has risen #BBCqt http:/… "my tory dad: ""if UKIP had a chance here I'd definitely vote for them."" *5 mins later* ""you are so blinkered!!! so closed minded!!"" u sure?" "Tonight's polling average: Con 33.7%, Lab 33%, UKIP 13%, Lib Dem 8% and Green 5.7%." RT @jsteve372: .@mik61scot it's Nicola Sturgeon's new education policy... Improve literacy by getting deprive kids reading words on helicop… "RT @guardian: Guardian front page, Friday 1 May 2015: Miliband hardens his line: I will not do deal with SNP http://t.co/gHMZXK3Tbc" RT @StephenHep21: @ScotlandTonight Ed is free to say he wont do a deal. It is #SNP who will never be forgiven if they vote down a Labour go… RT @Ed_Miliband: I want to be completely clear: there will no coalition and no deals with the SNP. #bbcqt RT @HuffPostUK: The Tory spin message about David Cameron's performance is a bit obvious #bbcqt http://t.co/yihDlG5NVo http://t.co/KsqFCqwS… @Nigel_Farage Does Ed's rejection of Lab/SNP deal undermine Cameron's plea to Ukippers to 'come home' on the basis of that proposal? RT @Ed_Miliband: I want to be completely clear: there will no coalition and no deals with the SNP. #bbcqt "RT @chaos_lane: imagine fighting for us to stay in the union, promising that Scotland's voice will be heard, and then refusing to do a deal…" Account for every penny? Like you did last time? Your nose should be 2 foot long! https://t.co/k4NMrenulf RT @ronwindward: @JimForScotland Crazy statements like that just confirms to me I made the right decision in leaving Labour for the SNP.Tim… "RT @Riath84: Great, you reduced the deficit, by killing and shitting on the poor. Good job. 
Not really something to be bragging about Torie…" Clegg: Education spending is a coalition 'red line' - ITV News http://t.co/uLHiJeeJi6 Can you shut the NS down until May 8. #Prettyplease https://t.co/u5foOReQIh Fucking biased #BBC allowing @Nigel_Farage to have a #leadersdebate program all of his own... #NigelFarage #UKIP #GE2015 "'I'm not going to cave in to the SNP over Trident and the deficit'. A left-wing party in favour of nuclear weapons and maintaing austerity" "RT @nickgoff79: If Miliband had replied then, ""ah, you work in recruitment? No wonder you're such a dick."" I'd have voted for him. But he d…" "RT @EllieCumbo: Oh, Tories. Wiz zis message discipline you are really spoiling uz. http://t.co/wg5WhNyp6O" RT @LeeMartin4947: Miliband on Question Time: I won't have Labour government if it means SNP deal http://t.co/8DPMTRsqG1 #GE2015 VIDEO: Sturgeon on post-election deals http://t.co/sLvUpn2uD8 Watch: Ed Miliband trips off the stage following Question Time leaders special http://t.co/PjP3yb5u6t "RT @joncraig: Scottish audience all address Sturgeon as ""Nicola"". Earlier on QT it was ""Mr Cameron"", ""Mr Miliband"" & ""Mr Clegg"". http://t.c…" "@ScotlandTonight If Miliband will step aside to let Tories in, we need 59 SNP MPs more than ever to stand up for Scotland @theSNP #VoteSNP" RT @ChristinaSNP: I just heard Milliband's pronouncements........ Oh dear.... Cutting off his nose to spite the Scots.... #VoteSNP #Newsnight or Farage talking cr*p for nearly half an hour. A tough call; though Farage might well melt down yet again w/audience questions.. Kind of like/support nick clegg hahahaha RT @chunkymark: How would labour supporters/country feel if @Ed_Miliband let Tories have another 5 years by refusing to have coalition with… "RT @guardian: Guardian front page, Friday 1 May 2015: Miliband hardens his line: I will not do deal with SNP http://t.co/gHMZXK3Tbc" "Couldn't watch the leaders Q&As tonight as I was at a scholarship dinner, but at said dinner I argued Labour doesn't need nor want the SNP" "RT @GardhamHT: Scots will never forgive Lab if Tories get in because Miliband refuses to work with SNP, says Stewart Hosie" RT @DanHannanMEP: Ed Miliband bases his campaign around not trusting us on the EU. What would it say about us if we elected him? http://t.c… RT @thesundaypeople: UKIP's housing spokesman rakes in £800k in housing benefit from migrants. http://t.co/GVwb9Rcb4w http://t.co/c1AZxcLh… SNP leader faces audience questions http://t.co/gk6yJ9zXNx "RT @CountryStandard: You Gov Poll Tonight LAB - 35% (+1) CON - 34% (-1) UKIP - 12% (-) LDEM - 8% (-1) GRN - 5% (+1)" RT @DanHannanMEP: Ed Miliband bases his campaign around not trusting us on the EU. What would it say about us if we elected him? http://t.c… RT @JimForScotland: Labour has called SNP bluff. The SNP must now be clear: are they willing to prevent or bring down a Labour government &… """Making love is not on the cards here, he's a Tory""" "Audience member: why are million's more people using foodbanks? David Cameron: here's a letter saying there's no money left" David Cameron hugs another husky in the storms http://t.co/WWkaKjKEdJ #david Cameron #hugahusky #davidcamerontweet #greatstorm 2013 #StJude RT @gemini2359: #bbcqt Tories 1st introduced PFI In NHS not Labour. Here's Ken Clarke boasting of it. http://t.co/4lk4w72f1A "RT @sunny_hundal: It's far easier for Labour to work as a minority govt than the Tories. Miliband can call SNP's bluff, while Cameron has n…" Fair point...... 
https://t.co/etdlMiew4T RT @labourpress: We learnt the most from the question David Cameron wouldn't answer tonight: clear now the Tories will cut Child Benefit #b… Watch: Ed Miliband trips off the stage following Question Time leaders special http://t.co/ndbqvgEkNK RT @SunNation: #SUNNATION EXCLUSIVE: We reveal the man responsible for Ed slipping off the stage... http://t.co/WZ7bITsezb http://t.co/gd42… ED MILIBAND FELL OF STAGE... HOW IS HE GOING TO RUN THE COUNTRY?!?!?!???? "RT @HouseOfTraitors: #bbcbias Number of times Party/Leader mentioned tonight on @BBCNews LAB 14 CON 9 LIB 7 SNP 5 UKIP 0" RT @Lou_i5e: @RogerV52 @UKIP 100% Nigel @Daniel__Brookes @sarahduk121 @Nigel_Farage @UKIP that's a tad semantic? perhaps we should use recent general election results as best guide "RT @britainelects: Latest YouGov poll (29 - 30 Apr): LAB - 35% (+1) CON - 34% (-1) UKIP - 12% (-) LDEM - 8% (-1) GRN - 5% (+1)" The historic link between Labour and the trade unions is about to break - thanks to the SNP | via @Telegraph http://t.co/W5JB0FAFet RT @jogideon: Ed Miliband's in denial about his past squandering of our money. Don't let him do it again https://t.co/swqjv3xnHe RT @BuntinRobert: Murphy in Scotland insults our intelligence. Milliband in London demeans all Scots. The Scots will not take it lying down… #UKIP TO WIN #FARAGEFOREVER RT @andrew4mk: Off to Citizens MK assembly to show my commitment to living wage. Another no show from my Tory opponent. "RT @paulwaugh: ICM found just 6% #bbcqt viewers changed their mind. Small sub sample but: Clegg won 32% of switchers, Cam 25% Ed 20% http:/…" Watch: Ed Miliband trips off the stage following Question Time leaders special http://t.co/Pm5UrTC2I5 "RT @LiarMPs: Russell Brand wants to hug people who thinks its ok to chop peoples heads off on our streets. Miliband is only reaching out to…" "So, on BBC Question Time's election special interview with the various party leaders, Miliband relented to the... http://t.co/JrCAHcLYrn" Bbc scotlands entire lead story is 'nicola sturgeon won't rule out what she's already ruled out' "RT @WilsonWilson009: Leaked reports that #Tories feared ""inevitable ""election defeat so ran a campaign of false flags, muddying of the wate…" RT @SolManOfficial: Labour will do a deal with SNP make no mistake! Scotland will rule the UK! "And like LibDem leader Mr C said, he had to make hard and brave decision 5yrs ago, to save Britain becm like Greece; huge impact stories" "RT @benglaze: Great line from Lucy Powell on Miliband's minor trip: ""Ed slipped on David Cameron's sweat."" #GE2015 #bbcqt" "RT @KatyFBrand: While Cameron slags off Miliband for meeting Russell Brand, shall we just bear this in mind? http://t.co/q0B18I9OFA" nltk-3.7/nltk/test/unit/files/tweets.20150430-223406.tweet.csv.ref000066400000000000000000000474671420073152400237500ustar00rootroot00000000000000created_at,favorite_count,id,in_reply_to_status_id,in_reply_to_user_id,retweet_count,retweeted,text,truncated,user.id Thu Apr 30 21:34:06 +0000 2015,0,593891099434983425,,,0,False,RT @KirkKus: Indirect cost of the UK being in the EU is estimated to be costing Britain £170 billion per year! #BetterOffOut #UKIP,False,107794703 Thu Apr 30 21:34:06 +0000 2015,0,593891099548094465,,,0,False,VIDEO: Sturgeon on post-election deals http://t.co/BTJwrpbmOY,False,557422508 Thu Apr 30 21:34:06 +0000 2015,0,593891099388846080,,,0,False,RT @LabourEoin: The economy was growing 3 times faster on the day David Cameron became Prime Minister than it is today.. 
#BBCqt http://t.co…,False,3006692193 Thu Apr 30 21:34:06 +0000 2015,0,593891100429045760,,,0,False,RT @GregLauder: the UKIP east lothian candidate looks about 16 and still has an msn addy http://t.co/7eIU0c5Fm1,False,455154030 Thu Apr 30 21:34:07 +0000 2015,0,593891100768784384,,,0,False,RT @thesundaypeople: UKIP's housing spokesman rakes in £800k in housing benefit from migrants. http://t.co/GVwb9Rcb4w http://t.co/c1AZxcLh…,False,187547338 Thu Apr 30 21:34:07 +0000 2015,0,593891100982546432,,,0,False,RT @Nigel_Farage: Make sure you tune in to #AskNigelFarage tonight on BBC 1 at 22:50! #UKIP http://t.co/ogHSc2Rsr2,False,2164339015 Thu Apr 30 21:34:07 +0000 2015,0,593891101154619392,,,0,False,RT @joannetallis: Ed Milliband is an embarrassment. Would you want him representing the UK?! #bbcqt vote @Conservatives,False,370363681 Thu Apr 30 21:34:07 +0000 2015,0,593891101452476416,,,0,False,"RT @abstex: The FT is backing the Tories. On an unrelated note, here's a photo of FT leader writer Jonathan Ford (next to Boris) http://t.c…",False,2264656669 Thu Apr 30 21:34:07 +0000 2015,0,593891101838340096,,,0,False,RT @NivenJ1: “@George_Osborne: Ed Miliband proved tonight why he's not up to the job” Tbf you've spent 5 years doing that you salivating do…,False,304086842 Thu Apr 30 21:34:07 +0000 2015,0,593891101313896449,,,0,False,LOLZ to Trickle Down Wealth. It's never trickling past their own wallets. Greed always wins $$$ for the greedy. https://t.co/X7deoPbS97,False,2471552766 Thu Apr 30 21:34:07 +0000 2015,0,593891101766918144,,,0,False,SNP leader faces audience questions http://t.co/TYClKltSpW,False,557422508 Thu Apr 30 21:34:07 +0000 2015,0,593891101959917568,,,0,False,"RT @cononeilluk: Cameron ""Ed Milliband hanging out with Russell Brand. He is a joke. This is an election. This is about real people' http:/…",False,157763350 Thu Apr 30 21:34:07 +0000 2015,0,593891102257713152,,,0,False,RT @politicshome: Ed Miliband: Last Labour government did not overspend http://t.co/W9RJ2aSH6o http://t.co/4myFekg5ex,False,2888909722 Thu Apr 30 21:34:07 +0000 2015,0,593891102480007169,,,0,False,"If Miliband is refusing to do any deal with the SNP, how does he plan on forming a government?",False,390331913 Thu Apr 30 21:34:07 +0000 2015,0,593891103100776448,,,0,False,RT @scotnotbritt: Well thats it. LABOUR would rather have a TORY government rather than work with the SNP. http://t.co/SNMkRDCe9f,False,221176097 Thu Apr 30 21:34:07 +0000 2015,0,593891103184551936,,,0,False,Cameron wins last TV contest of election campaign - poll: LONDON (Reuters) - Prime Minister David Cameron won the… http://t.co/aUMOoYWOSk,False,2724103423 Thu Apr 30 21:34:07 +0000 2015,0,593891102836563969,,,0,False,"RT @stephen501: @dunleavy138 @CrillyBobc @theSNP @UKLabour I would be happy to do a deal with the SNP, but @Ed_Miliband was clear. If you w…",False,523427919 Thu Apr 30 21:34:07 +0000 2015,0,593891102895296512,,,0,False,"How dare @EdMiliband_MP force Socialists to chose between the English LP and the SNP! The #SNP are the last, true Socialist party in the UK",False,343918719 Thu Apr 30 21:34:07 +0000 2015,0,593891103272665088,,,0,False,Watch: Ed Miliband trips off the stage following Question Time leaders special http://t.co/X61IGbe07R,False,160930875 Thu Apr 30 21:34:07 +0000 2015,0,593891104438759425,,,0,False,"RT @abstex: The FT is backing the Tories. 
On an unrelated note, here's a photo of FT leader writer Jonathan Ford (next to Boris) http://t.c…",False,60061001 Thu Apr 30 21:34:08 +0000 2015,0,593891104870813699,593890177019416576,278076899,0,False,@B0MBSKARE the anti-Scottish feeling is largely a product of Tory press scaremongering. In practice most people won't give a toss!,False,1584156685 Thu Apr 30 21:34:08 +0000 2015,0,593891105457897472,,,0,False,"Miliband stumbles, Cameron dodges http://t.co/wnv2zOhQvq",False,323685593 Thu Apr 30 21:34:08 +0000 2015,0,593891105935994880,,,0,False,Miliband - I'd pass on PM job rather than do deal with Scots nationalists: LONDON (Reuters) - British Labour Party… http://t.co/2cFGVfWkqF,False,2724103423 Thu Apr 30 21:34:08 +0000 2015,0,593891106066198529,,,0,False,RT @GloriaDePiero: Nick Clegg is just as responsible for this Govt's failing plan as David Cameron - he's backed the Tories all the way,False,115142697 Thu Apr 30 21:34:08 +0000 2015,0,593891106590429185,,,0,False,RT @mykilmarnock: Will the person who dropped a Vote SNP badge please call at Kilmarnock police station to collect it. http://t.co/o3sG5B4L…,False,352549747 Thu Apr 30 21:34:08 +0000 2015,0,593891106787631104,,,0,False,RT @Markfergusonuk: The Sun’s Twitter worm appears to believe Ed Miliband is winning so far #bbcqt http://t.co/ZgZbSwnZxZ,False,3129616979 Thu Apr 30 21:34:08 +0000 2015,0,593891106921799680,593890427595595776,336162378,0,False,"@ScottishPleb @AHairyBiker SNP toxic in England. Why would Labour want to do a deal with them?",False,336162378 Thu Apr 30 21:34:08 +0000 2015,0,593891107232165888,,,0,False,"RT @KatieKhaleesi: I'm #SNPbecause in my entire adult life I've only seen Labour & Tories cause misery, war, and further victimisation of t…",False,45970197 Thu Apr 30 21:34:08 +0000 2015,0,593891107437699072,,,0,False,Jeremy Vine doesn't think the SNP Scottish takeover merits a mention,False,3202460764 Thu Apr 30 21:34:08 +0000 2015,0,593891107387375616,,,0,False,"RT @daniloxxv: ""UKIP is the bastard child of the big three parties. With apologies to all bastards."" - @PeterTatchell at the University of …",False,2234850703 Thu Apr 30 21:34:08 +0000 2015,0,593891107039215616,593888286394945536,21910500,0,False,"“@suttonnick: Friday's Times front page: Miliband savaged for ‘lies’ over spending #tomorrowspaperstoday #bbcpapers http://t.co/ts9ZnULDwr”",False,159072956 Thu Apr 30 21:34:08 +0000 2015,0,593891107857158145,,,0,False,RT @dhothersall: Scenes of celebration in Glasgow as #SNP applaud a #Conservative victory. Just a bit of fun. Or is it? :-) https://t.co/hh…,False,514760597 Thu Apr 30 21:34:08 +0000 2015,0,593891108117155840,593855300832251904,21439144,0,False,"@KTHopkins SNP thickos look forward to having the balance of power, bringing fair, decent policies that fascists bigots like u detest!",False,2791535302 Thu Apr 30 21:34:08 +0000 2015,0,593891108314308608,,,0,False,RT @LabourEoin: Another humongous lie from David Cameron. He has not clamped down on Tax Avoidance. Uncollected Tax has risen #BBCqt http:/…,False,719947915 Thu Apr 30 21:34:08 +0000 2015,0,593891108716961792,,,0,False,"my tory dad: ""if UKIP had a chance here I'd definitely vote for them."" *5 mins later* ""you are so blinkered!!! 
so closed minded!!"" u sure?",False,30728366 Thu Apr 30 21:34:09 +0000 2015,0,593891109580967936,,,0,False,"Tonight's polling average: Con 33.7%, Lab 33%, UKIP 13%, Lib Dem 8% and Green 5.7%.",False,138076595 Thu Apr 30 21:34:09 +0000 2015,0,593891110239543296,,,0,False,RT @jsteve372: .@mik61scot it's Nicola Sturgeon's new education policy... Improve literacy by getting deprive kids reading words on helicop…,False,87032917 Thu Apr 30 21:34:09 +0000 2015,0,593891110616989696,,,0,False,"RT @guardian: Guardian front page, Friday 1 May 2015: Miliband hardens his line: I will not do deal with SNP http://t.co/gHMZXK3Tbc",False,1160879552 Thu Apr 30 21:34:09 +0000 2015,0,593891110763757568,,,0,False,RT @StephenHep21: @ScotlandTonight Ed is free to say he wont do a deal. It is #SNP who will never be forgiven if they vote down a Labour go…,False,80291715 Thu Apr 30 21:34:09 +0000 2015,0,593891111502016512,,,0,False,RT @Ed_Miliband: I want to be completely clear: there will no coalition and no deals with the SNP. #bbcqt,False,2397876343 Thu Apr 30 21:34:09 +0000 2015,0,593891111824941056,,,0,False,RT @HuffPostUK: The Tory spin message about David Cameron's performance is a bit obvious #bbcqt http://t.co/yihDlG5NVo http://t.co/KsqFCqwS…,False,131257940 Thu Apr 30 21:34:09 +0000 2015,0,593891111799783424,593890934716248064,19017675,0,False,@Nigel_Farage Does Ed's rejection of Lab/SNP deal undermine Cameron's plea to Ukippers to 'come home' on the basis of that proposal?,False,572909495 Thu Apr 30 21:34:09 +0000 2015,0,593891111942365186,,,0,False,RT @Ed_Miliband: I want to be completely clear: there will no coalition and no deals with the SNP. #bbcqt,False,211639555 Thu Apr 30 21:34:09 +0000 2015,0,593891111946551296,,,0,False,"RT @chaos_lane: imagine fighting for us to stay in the union, promising that Scotland's voice will be heard, and then refusing to do a deal…",False,431991168 Thu Apr 30 21:34:09 +0000 2015,0,593891112407924736,,,0,False,Account for every penny? Like you did last time? Your nose should be 2 foot long! https://t.co/k4NMrenulf,False,1298754739 Thu Apr 30 21:34:10 +0000 2015,0,593891113179750401,,,0,False,RT @ronwindward: @JimForScotland Crazy statements like that just confirms to me I made the right decision in leaving Labour for the SNP.Tim…,False,112839986 Thu Apr 30 21:34:10 +0000 2015,0,593891113750106112,,,0,False,"RT @Riath84: Great, you reduced the deficit, by killing and shitting on the poor. Good job. Not really something to be bragging about Torie…",False,25584011 Thu Apr 30 21:34:10 +0000 2015,0,593891113481572353,,,0,False,Clegg: Education spending is a coalition 'red line' - ITV News http://t.co/uLHiJeeJi6,False,773391421 Thu Apr 30 21:34:10 +0000 2015,0,593891113498443776,,,0,False,Can you shut the NS down until May 8. #Prettyplease https://t.co/u5foOReQIh,False,416610759 Thu Apr 30 21:34:10 +0000 2015,0,593891114215723008,,,0,False,Fucking biased #BBC allowing @Nigel_Farage to have a #leadersdebate program all of his own... #NigelFarage #UKIP #GE2015,False,2239487486 Thu Apr 30 21:34:10 +0000 2015,0,593891114475716608,,,0,False,"'I'm not going to cave in to the SNP over Trident and the deficit'. A left-wing party in favour of nuclear weapons and maintaing austerity",False,1303763558 Thu Apr 30 21:34:10 +0000 2015,0,593891114496729088,,,0,False,"RT @nickgoff79: If Miliband had replied then, ""ah, you work in recruitment? No wonder you're such a dick."" I'd have voted for him. 
But he d…",False,864943860 Thu Apr 30 21:34:10 +0000 2015,0,593891114744164354,,,0,False,"RT @EllieCumbo: Oh, Tories. Wiz zis message discipline you are really spoiling uz. http://t.co/wg5WhNyp6O",False,2695397184 Thu Apr 30 21:34:10 +0000 2015,0,593891114853216257,,,0,False,RT @LeeMartin4947: Miliband on Question Time: I won't have Labour government if it means SNP deal http://t.co/8DPMTRsqG1 #GE2015,False,223162337 Thu Apr 30 21:34:10 +0000 2015,0,593891115457064960,,,0,False,VIDEO: Sturgeon on post-election deals http://t.co/sLvUpn2uD8,False,323685593 Thu Apr 30 21:34:10 +0000 2015,0,593891115658383360,,,0,False,Watch: Ed Miliband trips off the stage following Question Time leaders special http://t.co/PjP3yb5u6t,False,453176496 Thu Apr 30 21:34:10 +0000 2015,0,593891116140888065,,,0,False,"RT @joncraig: Scottish audience all address Sturgeon as ""Nicola"". Earlier on QT it was ""Mr Cameron"", ""Mr Miliband"" & ""Mr Clegg"". http://t.c…",False,962778541 Thu Apr 30 21:34:10 +0000 2015,0,593891116363214848,,361756405,0,False,"@ScotlandTonight If Miliband will step aside to let Tories in, we need 59 SNP MPs more than ever to stand up for Scotland @theSNP #VoteSNP",False,20048514 Thu Apr 30 21:34:10 +0000 2015,0,593891116405100544,,,0,False,RT @ChristinaSNP: I just heard Milliband's pronouncements........ Oh dear.... Cutting off his nose to spite the Scots.... #VoteSNP,False,327604786 Thu Apr 30 21:34:10 +0000 2015,0,593891117269155841,,,0,False,#Newsnight or Farage talking cr*p for nearly half an hour. A tough call; though Farage might well melt down yet again w/audience questions..,False,405194953 Thu Apr 30 21:34:11 +0000 2015,0,593891117638254592,,,0,False,Kind of like/support nick clegg hahahaha,False,2547024664 Thu Apr 30 21:34:11 +0000 2015,0,593891117525016576,,,0,False,RT @chunkymark: How would labour supporters/country feel if @Ed_Miliband let Tories have another 5 years by refusing to have coalition with…,False,20316261 Thu Apr 30 21:34:11 +0000 2015,0,593891117474668544,,,0,False,"RT @guardian: Guardian front page, Friday 1 May 2015: Miliband hardens his line: I will not do deal with SNP http://t.co/gHMZXK3Tbc",False,158533234 Thu Apr 30 21:34:11 +0000 2015,0,593891118045110272,,,0,False,"Couldn't watch the leaders Q&As tonight as I was at a scholarship dinner, but at said dinner I argued Labour doesn't need nor want the SNP",False,102098702 Thu Apr 30 21:34:11 +0000 2015,0,593891118384869376,,,0,False,"RT @GardhamHT: Scots will never forgive Lab if Tories get in because Miliband refuses to work with SNP, says Stewart Hosie",False,2938705671 Thu Apr 30 21:34:11 +0000 2015,0,593891118867185664,,,0,False,RT @DanHannanMEP: Ed Miliband bases his campaign around not trusting us on the EU. What would it say about us if we elected him? http://t.c…,False,557752563 Thu Apr 30 21:34:11 +0000 2015,0,593891118779097090,,,0,False,RT @thesundaypeople: UKIP's housing spokesman rakes in £800k in housing benefit from migrants. http://t.co/GVwb9Rcb4w http://t.co/c1AZxcLh…,False,323227501 Thu Apr 30 21:34:11 +0000 2015,0,593891118732849152,,,0,False,SNP leader faces audience questions http://t.co/gk6yJ9zXNx,False,323685593 Thu Apr 30 21:34:11 +0000 2015,0,593891119165001728,,,0,False,"RT @CountryStandard: You Gov Poll Tonight LAB - 35% (+1) CON - 34% (-1) UKIP - 12% (-) LDEM - 8% (-1) GRN - 5% (+1)",False,628502330 Thu Apr 30 21:34:11 +0000 2015,0,593891119609556992,,,0,False,RT @DanHannanMEP: Ed Miliband bases his campaign around not trusting us on the EU. 
What would it say about us if we elected him? http://t.c…,False,281089096 Thu Apr 30 21:34:11 +0000 2015,0,593891119701880833,,,0,False,RT @JimForScotland: Labour has called SNP bluff. The SNP must now be clear: are they willing to prevent or bring down a Labour government &…,False,942479006 Thu Apr 30 21:34:11 +0000 2015,0,593891120767225856,,,0,False,"""Making love is not on the cards here, he's a Tory""",False,336530746 Thu Apr 30 21:34:11 +0000 2015,0,593891120825954304,,,0,False,"Audience member: why are million's more people using foodbanks? David Cameron: here's a letter saying there's no money left",False,1950008834 Thu Apr 30 21:34:11 +0000 2015,0,593891121111179265,,,0,False,David Cameron hugs another husky in the storms http://t.co/WWkaKjKEdJ #david Cameron #hugahusky #davidcamerontweet #greatstorm 2013 #StJude,False,1257694519 Thu Apr 30 21:34:11 +0000 2015,0,593891120913977344,,,0,False,RT @gemini2359: #bbcqt Tories 1st introduced PFI In NHS not Labour. Here's Ken Clarke boasting of it. http://t.co/4lk4w72f1A,False,1962796308 Thu Apr 30 21:34:11 +0000 2015,0,593891120989544448,,,0,False,"RT @sunny_hundal: It's far easier for Labour to work as a minority govt than the Tories. Miliband can call SNP's bluff, while Cameron has n…",False,36063034 Thu Apr 30 21:34:11 +0000 2015,0,593891121194995712,,,0,False,Fair point...... https://t.co/etdlMiew4T,False,19286879 Thu Apr 30 21:34:11 +0000 2015,0,593891121312440320,,,0,False,RT @labourpress: We learnt the most from the question David Cameron wouldn't answer tonight: clear now the Tories will cut Child Benefit #b…,False,2584199773 Thu Apr 30 21:34:12 +0000 2015,0,593891122541240320,,,0,False,Watch: Ed Miliband trips off the stage following Question Time leaders special http://t.co/ndbqvgEkNK,False,2768062344 Thu Apr 30 21:34:12 +0000 2015,0,593891123149570049,,,0,False,RT @SunNation: #SUNNATION EXCLUSIVE: We reveal the man responsible for Ed slipping off the stage... http://t.co/WZ7bITsezb http://t.co/gd42…,False,2917805117 Thu Apr 30 21:34:12 +0000 2015,0,593891123183099906,,,0,False,ED MILIBAND FELL OF STAGE... HOW IS HE GOING TO RUN THE COUNTRY?!?!?!????,False,363838141 Thu Apr 30 21:34:12 +0000 2015,0,593891123304787968,,,0,False,"RT @HouseOfTraitors: #bbcbias Number of times Party/Leader mentioned tonight on @BBCNews LAB 14 CON 9 LIB 7 SNP 5 UKIP 0",False,1242229584 Thu Apr 30 21:34:12 +0000 2015,0,593891123292078081,,,0,False,RT @Lou_i5e: @RogerV52 @UKIP 100% Nigel,False,2951248157 Thu Apr 30 21:34:12 +0000 2015,0,593891123325771776,593890176709046272,948314394,0,False,@Daniel__Brookes @sarahduk121 @Nigel_Farage @UKIP that's a tad semantic? perhaps we should use recent general election results as best guide,False,22848976 Thu Apr 30 21:34:12 +0000 2015,0,593891123673878528,,,0,False,"RT @britainelects: Latest YouGov poll (29 - 30 Apr): LAB - 35% (+1) CON - 34% (-1) UKIP - 12% (-) LDEM - 8% (-1) GRN - 5% (+1)",False,2957078271 Thu Apr 30 21:34:12 +0000 2015,0,593891123766099970,,,0,False,The historic link between Labour and the trade unions is about to break - thanks to the SNP | via @Telegraph http://t.co/W5JB0FAFet,False,852919230 Thu Apr 30 21:34:12 +0000 2015,0,593891123610963968,,,0,False,RT @jogideon: Ed Miliband's in denial about his past squandering of our money. Don't let him do it again https://t.co/swqjv3xnHe,False,83628515 Thu Apr 30 21:34:12 +0000 2015,0,593891124680482817,,,0,False,RT @BuntinRobert: Murphy in Scotland insults our intelligence. Milliband in London demeans all Scots. 
The Scots will not take it lying down…,False,564954573 Thu Apr 30 21:34:12 +0000 2015,0,593891124953161728,,,0,False,#UKIP TO WIN #FARAGEFOREVER,False,382885872 Thu Apr 30 21:34:12 +0000 2015,0,593891125066346497,,,0,False,RT @andrew4mk: Off to Citizens MK assembly to show my commitment to living wage. Another no show from my Tory opponent.,False,15709635 Thu Apr 30 21:34:12 +0000 2015,0,593891125246689280,,,0,False,"RT @paulwaugh: ICM found just 6% #bbcqt viewers changed their mind. Small sub sample but: Clegg won 32% of switchers, Cam 25% Ed 20% http:/…",False,2578825735 Thu Apr 30 21:34:13 +0000 2015,0,593891126261592064,,,0,False,Watch: Ed Miliband trips off the stage following Question Time leaders special http://t.co/Pm5UrTC2I5,False,2701597927 Thu Apr 30 21:34:13 +0000 2015,0,593891126916046849,,,0,False,"RT @LiarMPs: Russell Brand wants to hug people who thinks its ok to chop peoples heads off on our streets. Miliband is only reaching out to…",False,1454249192 Thu Apr 30 21:34:13 +0000 2015,0,593891127197073408,,,0,False,"So, on BBC Question Time's election special interview with the various party leaders, Miliband relented to the... http://t.co/JrCAHcLYrn",False,21105283 Thu Apr 30 21:34:13 +0000 2015,0,593891127545180162,,,0,False,Bbc scotlands entire lead story is 'nicola sturgeon won't rule out what she's already ruled out',False,23298323 Thu Apr 30 21:34:13 +0000 2015,0,593891127520043008,,,0,False,"RT @WilsonWilson009: Leaked reports that #Tories feared ""inevitable ""election defeat so ran a campaign of false flags, muddying of the wate…",False,586004207 Thu Apr 30 21:34:13 +0000 2015,0,593891127918493698,,,0,False,RT @SolManOfficial: Labour will do a deal with SNP make no mistake! Scotland will rule the UK!,False,76792634 Thu Apr 30 21:34:13 +0000 2015,0,593891127918485504,,,0,False,"And like LibDem leader Mr C said, he had to make hard and brave decision 5yrs ago, to save Britain becm like Greece; huge impact stories",False,633315980 Thu Apr 30 21:34:13 +0000 2015,0,593891128262418433,,,0,False,"RT @benglaze: Great line from Lucy Powell on Miliband's minor trip: ""Ed slipped on David Cameron's sweat."" #GE2015 #bbcqt",False,436708587 Thu Apr 30 21:34:13 +0000 2015,0,593891128472170497,,,0,False,"RT @KatyFBrand: While Cameron slags off Miliband for meeting Russell Brand, shall we just bear this in mind? 
http://t.co/q0B18I9OFA",False,1029438206 nltk-3.7/nltk/test/unit/files/tweets.20150430-223406.url.csv.ref000066400000000000000000000052071420073152400234040ustar00rootroot00000000000000id,urls.url,urls.expanded_url 593891099548094465,http://t.co/BTJwrpbmOY,http://dlvr.it/9cgTZ3 593891100768784384,http://t.co/GVwb9Rcb4w,http://bit.ly/1JbmjYs 593891101313896449,https://t.co/X7deoPbS97,https://twitter.com/DanHannanMEP/status/593889079697211392 593891101766918144,http://t.co/TYClKltSpW,http://dlvr.it/9cgSR3 593891102257713152,http://t.co/W9RJ2aSH6o,http://polho.me/1I1FE1k 593891103184551936,http://t.co/aUMOoYWOSk,http://dlvr.it/9cgVwL 593891103272665088,http://t.co/X61IGbe07R,http://dlvr.it/9cgTLy 593891105457897472,http://t.co/wnv2zOhQvq,http://dlvr.it/9cgSWD 593891105935994880,http://t.co/2cFGVfWkqF,http://dlvr.it/9cgYD0 593891106787631104,http://t.co/ZgZbSwnZxZ,http://www.sunnation.co.uk/what-will-twitter-make-of-tonights-question-time-debate/ 593891107857158145,https://t.co/hhAYgwbFMh,https://www.facebook.com/dhothersall/videos/10155467952230487/?pnref=story 593891111824941056,http://t.co/yihDlG5NVo,http://huff.to/1GJYXJg 593891112407924736,https://t.co/k4NMrenulf,https://twitter.com/costofcameron/status/593885134643924992 593891113481572353,http://t.co/uLHiJeeJi6,http://dlvr.it/9cgWgm 593891113498443776,https://t.co/u5foOReQIh,https://twitter.com/georgeeaton/status/593864285014663168 593891114744164354,http://t.co/wg5WhNyp6O,http://www.buzzfeed.com/emilyashton/tory-mps-are-all-mysteriously-tweeting-the-same-description 593891114853216257,http://t.co/8DPMTRsqG1,http://d.gu.com/9cfz6T 593891115457064960,http://t.co/sLvUpn2uD8,http://dlvr.it/9cgZR4 593891115658383360,http://t.co/PjP3yb5u6t,http://dlvr.it/9cgYW8 593891118779097090,http://t.co/GVwb9Rcb4w,http://bit.ly/1JbmjYs 593891118732849152,http://t.co/gk6yJ9zXNx,http://dlvr.it/9cgS7s 593891121111179265,http://t.co/WWkaKjKEdJ,http://dld.bz/dgNZK 593891121194995712,https://t.co/etdlMiew4T,https://twitter.com/DanHannanMEP/status/593890984473296896 593891122541240320,http://t.co/ndbqvgEkNK,http://dlvr.it/9cgYRv 593891123149570049,http://t.co/WZ7bITsezb,http://www.sunnation.co.uk/ed-miliband-stacking-it-off-stage-is-the-funniest-thing-weve-seen-this-election/ 593891123766099970,http://t.co/W5JB0FAFet,http://www.telegraph.co.uk/news/general-election-2015/politics-blog/11570792/The-historic-link-between-Labour-and-the-trade-unions-is-about-to-break-thanks-to-the-SNP.html 593891123610963968,https://t.co/swqjv3xnHe,https://amp.twimg.com/v/0305a269-52cf-4a0c-84e5-b134a36d303e 593891125246689280,http://t.co/auLQq4E2iV,http://www.theguardian.com/politics/blog/live/2015/apr/30/question-time-cameron-miliband-and-clegg-interviews-live#block-55429ac4e4b07879681b8c5d 593891126261592064,http://t.co/Pm5UrTC2I5,http://dlvr.it/9cgSVh 593891127197073408,http://t.co/JrCAHcLYrn,http://fb.me/7bFwB3W9o nltk-3.7/nltk/test/unit/files/tweets.20150430-223406.user.csv.ref000066400000000000000000000371251420073152400235640ustar00rootroot00000000000000id,text,user.id,user.followers_count,user.friends_count 593891099434983425,RT @KirkKus: Indirect cost of the UK being in the EU is estimated to be costing Britain £170 billion per year! #BetterOffOut #UKIP,107794703,804,515 593891099548094465,VIDEO: Sturgeon on post-election deals http://t.co/BTJwrpbmOY,557422508,184,16 593891099388846080,RT @LabourEoin: The economy was growing 3 times faster on the day David Cameron became Prime Minister than it is today.. 
#BBCqt http://t.co…,3006692193,182,118 593891100429045760,RT @GregLauder: the UKIP east lothian candidate looks about 16 and still has an msn addy http://t.co/7eIU0c5Fm1,455154030,1073,708 593891100768784384,RT @thesundaypeople: UKIP's housing spokesman rakes in £800k in housing benefit from migrants. http://t.co/GVwb9Rcb4w http://t.co/c1AZxcLh…,187547338,87,267 593891100982546432,RT @Nigel_Farage: Make sure you tune in to #AskNigelFarage tonight on BBC 1 at 22:50! #UKIP http://t.co/ogHSc2Rsr2,2164339015,112,83 593891101154619392,RT @joannetallis: Ed Milliband is an embarrassment. Would you want him representing the UK?! #bbcqt vote @Conservatives,370363681,1183,896 593891101452476416,"RT @abstex: The FT is backing the Tories. On an unrelated note, here's a photo of FT leader writer Jonathan Ford (next to Boris) http://t.c…",2264656669,319,448 593891101838340096,RT @NivenJ1: “@George_Osborne: Ed Miliband proved tonight why he's not up to the job” Tbf you've spent 5 years doing that you salivating do…,304086842,343,562 593891101313896449,LOLZ to Trickle Down Wealth. It's never trickling past their own wallets. Greed always wins $$$ for the greedy. https://t.co/X7deoPbS97,2471552766,966,2001 593891101766918144,SNP leader faces audience questions http://t.co/TYClKltSpW,557422508,184,16 593891101959917568,"RT @cononeilluk: Cameron ""Ed Milliband hanging out with Russell Brand. He is a joke. This is an election. This is about real people' http:/…",157763350,754,421 593891102257713152,RT @politicshome: Ed Miliband: Last Labour government did not overspend http://t.co/W9RJ2aSH6o http://t.co/4myFekg5ex,2888909722,375,1228 593891102480007169,"If Miliband is refusing to do any deal with the SNP, how does he plan on forming a government?",390331913,174,379 593891103100776448,RT @scotnotbritt: Well thats it. LABOUR would rather have a TORY government rather than work with the SNP. http://t.co/SNMkRDCe9f,221176097,9,32 593891103184551936,Cameron wins last TV contest of election campaign - poll: LONDON (Reuters) - Prime Minister David Cameron won the… http://t.co/aUMOoYWOSk,2724103423,102,3 593891102836563969,"RT @stephen501: @dunleavy138 @CrillyBobc @theSNP @UKLabour I would be happy to do a deal with the SNP, but @Ed_Miliband was clear. If you w…",523427919,1032,1294 593891102895296512,"How dare @EdMiliband_MP force Socialists to chose between the English LP and the SNP! The #SNP are the last, true Socialist party in the UK",343918719,937,953 593891103272665088,Watch: Ed Miliband trips off the stage following Question Time leaders special http://t.co/X61IGbe07R,160930875,10171,10419 593891104438759425,"RT @abstex: The FT is backing the Tories. On an unrelated note, here's a photo of FT leader writer Jonathan Ford (next to Boris) http://t.c…",60061001,134,105 593891104870813699,@B0MBSKARE the anti-Scottish feeling is largely a product of Tory press scaremongering. In practice most people won't give a toss!,1584156685,377,1027 593891105457897472,"Miliband stumbles, Cameron dodges http://t.co/wnv2zOhQvq",323685593,294,0 593891105935994880,Miliband - I'd pass on PM job rather than do deal with Scots nationalists: LONDON (Reuters) - British Labour Party… http://t.co/2cFGVfWkqF,2724103423,102,3 593891106066198529,RT @GloriaDePiero: Nick Clegg is just as responsible for this Govt's failing plan as David Cameron - he's backed the Tories all the way,115142697,1234,1935 593891106590429185,RT @mykilmarnock: Will the person who dropped a Vote SNP badge please call at Kilmarnock police station to collect it. 
http://t.co/o3sG5B4L…,352549747,3458,3097 593891106787631104,RT @Markfergusonuk: The Sun’s Twitter worm appears to believe Ed Miliband is winning so far #bbcqt http://t.co/ZgZbSwnZxZ,3129616979,189,280 593891106921799680,"@ScottishPleb @AHairyBiker SNP toxic in England. Why would Labour want to do a deal with them?",336162378,2356,2118 593891107232165888,"RT @KatieKhaleesi: I'm #SNPbecause in my entire adult life I've only seen Labour & Tories cause misery, war, and further victimisation of t…",45970197,152,295 593891107437699072,Jeremy Vine doesn't think the SNP Scottish takeover merits a mention,3202460764,5,35 593891107387375616,"RT @daniloxxv: ""UKIP is the bastard child of the big three parties. With apologies to all bastards."" - @PeterTatchell at the University of …",2234850703,86,222 593891107039215616,"“@suttonnick: Friday's Times front page: Miliband savaged for ‘lies’ over spending #tomorrowspaperstoday #bbcpapers http://t.co/ts9ZnULDwr”",159072956,40337,28054 593891107857158145,RT @dhothersall: Scenes of celebration in Glasgow as #SNP applaud a #Conservative victory. Just a bit of fun. Or is it? :-) https://t.co/hh…,514760597,672,1015 593891108117155840,"@KTHopkins SNP thickos look forward to having the balance of power, bringing fair, decent policies that fascists bigots like u detest!",2791535302,4,57 593891108314308608,RT @LabourEoin: Another humongous lie from David Cameron. He has not clamped down on Tax Avoidance. Uncollected Tax has risen #BBCqt http:/…,719947915,33,229 593891108716961792,"my tory dad: ""if UKIP had a chance here I'd definitely vote for them."" *5 mins later* ""you are so blinkered!!! so closed minded!!"" u sure?",30728366,320,194 593891109580967936,"Tonight's polling average: Con 33.7%, Lab 33%, UKIP 13%, Lib Dem 8% and Green 5.7%.",138076595,289,1459 593891110239543296,RT @jsteve372: .@mik61scot it's Nicola Sturgeon's new education policy... Improve literacy by getting deprive kids reading words on helicop…,87032917,1260,1983 593891110616989696,"RT @guardian: Guardian front page, Friday 1 May 2015: Miliband hardens his line: I will not do deal with SNP http://t.co/gHMZXK3Tbc",1160879552,239,468 593891110763757568,RT @StephenHep21: @ScotlandTonight Ed is free to say he wont do a deal. It is #SNP who will never be forgiven if they vote down a Labour go…,80291715,107,129 593891111502016512,RT @Ed_Miliband: I want to be completely clear: there will no coalition and no deals with the SNP. #bbcqt,2397876343,1067,871 593891111824941056,RT @HuffPostUK: The Tory spin message about David Cameron's performance is a bit obvious #bbcqt http://t.co/yihDlG5NVo http://t.co/KsqFCqwS…,131257940,1204,672 593891111799783424,@Nigel_Farage Does Ed's rejection of Lab/SNP deal undermine Cameron's plea to Ukippers to 'come home' on the basis of that proposal?,572909495,178,140 593891111942365186,RT @Ed_Miliband: I want to be completely clear: there will no coalition and no deals with the SNP. #bbcqt,211639555,2415,2301 593891111946551296,"RT @chaos_lane: imagine fighting for us to stay in the union, promising that Scotland's voice will be heard, and then refusing to do a deal…",431991168,487,1372 593891112407924736,Account for every penny? Like you did last time? Your nose should be 2 foot long! 
https://t.co/k4NMrenulf,1298754739,119,192 593891113179750401,RT @ronwindward: @JimForScotland Crazy statements like that just confirms to me I made the right decision in leaving Labour for the SNP.Tim…,112839986,340,486 593891113750106112,"RT @Riath84: Great, you reduced the deficit, by killing and shitting on the poor. Good job. Not really something to be bragging about Torie…",25584011,155,376 593891113481572353,Clegg: Education spending is a coalition 'red line' - ITV News http://t.co/uLHiJeeJi6,773391421,164,0 593891113498443776,Can you shut the NS down until May 8. #Prettyplease https://t.co/u5foOReQIh,416610759,243,918 593891114215723008,Fucking biased #BBC allowing @Nigel_Farage to have a #leadersdebate program all of his own... #NigelFarage #UKIP #GE2015,2239487486,1473,927 593891114475716608,"'I'm not going to cave in to the SNP over Trident and the deficit'. A left-wing party in favour of nuclear weapons and maintaing austerity",1303763558,165,610 593891114496729088,"RT @nickgoff79: If Miliband had replied then, ""ah, you work in recruitment? No wonder you're such a dick."" I'd have voted for him. But he d…",864943860,67,376 593891114744164354,"RT @EllieCumbo: Oh, Tories. Wiz zis message discipline you are really spoiling uz. http://t.co/wg5WhNyp6O",2695397184,779,871 593891114853216257,RT @LeeMartin4947: Miliband on Question Time: I won't have Labour government if it means SNP deal http://t.co/8DPMTRsqG1 #GE2015,223162337,2871,3056 593891115457064960,VIDEO: Sturgeon on post-election deals http://t.co/sLvUpn2uD8,323685593,294,0 593891115658383360,Watch: Ed Miliband trips off the stage following Question Time leaders special http://t.co/PjP3yb5u6t,453176496,1060,497 593891116140888065,"RT @joncraig: Scottish audience all address Sturgeon as ""Nicola"". Earlier on QT it was ""Mr Cameron"", ""Mr Miliband"" & ""Mr Clegg"". http://t.c…",962778541,613,774 593891116363214848,"@ScotlandTonight If Miliband will step aside to let Tories in, we need 59 SNP MPs more than ever to stand up for Scotland @theSNP #VoteSNP",20048514,94,159 593891116405100544,RT @ChristinaSNP: I just heard Milliband's pronouncements........ Oh dear.... Cutting off his nose to spite the Scots.... #VoteSNP,327604786,297,1233 593891117269155841,#Newsnight or Farage talking cr*p for nearly half an hour. A tough call; though Farage might well melt down yet again w/audience questions..,405194953,1861,179 593891117638254592,Kind of like/support nick clegg hahahaha,2547024664,118,187 593891117525016576,RT @chunkymark: How would labour supporters/country feel if @Ed_Miliband let Tories have another 5 years by refusing to have coalition with…,20316261,207,195 593891117474668544,"RT @guardian: Guardian front page, Friday 1 May 2015: Miliband hardens his line: I will not do deal with SNP http://t.co/gHMZXK3Tbc",158533234,1820,568 593891118045110272,"Couldn't watch the leaders Q&As tonight as I was at a scholarship dinner, but at said dinner I argued Labour doesn't need nor want the SNP",102098702,273,615 593891118384869376,"RT @GardhamHT: Scots will never forgive Lab if Tories get in because Miliband refuses to work with SNP, says Stewart Hosie",2938705671,119,241 593891118867185664,RT @DanHannanMEP: Ed Miliband bases his campaign around not trusting us on the EU. What would it say about us if we elected him? http://t.c…,557752563,1016,1265 593891118779097090,RT @thesundaypeople: UKIP's housing spokesman rakes in £800k in housing benefit from migrants. 
http://t.co/GVwb9Rcb4w http://t.co/c1AZxcLh…,323227501,758,697 593891118732849152,SNP leader faces audience questions http://t.co/gk6yJ9zXNx,323685593,294,0 593891119165001728,"RT @CountryStandard: You Gov Poll Tonight LAB - 35% (+1) CON - 34% (-1) UKIP - 12% (-) LDEM - 8% (-1) GRN - 5% (+1)",628502330,22,39 593891119609556992,RT @DanHannanMEP: Ed Miliband bases his campaign around not trusting us on the EU. What would it say about us if we elected him? http://t.c…,281089096,878,353 593891119701880833,RT @JimForScotland: Labour has called SNP bluff. The SNP must now be clear: are they willing to prevent or bring down a Labour government &…,942479006,271,723 593891120767225856,"""Making love is not on the cards here, he's a Tory""",336530746,829,712 593891120825954304,"Audience member: why are million's more people using foodbanks? David Cameron: here's a letter saying there's no money left",1950008834,713,206 593891121111179265,David Cameron hugs another husky in the storms http://t.co/WWkaKjKEdJ #david Cameron #hugahusky #davidcamerontweet #greatstorm 2013 #StJude,1257694519,24,0 593891120913977344,RT @gemini2359: #bbcqt Tories 1st introduced PFI In NHS not Labour. Here's Ken Clarke boasting of it. http://t.co/4lk4w72f1A,1962796308,791,1085 593891120989544448,"RT @sunny_hundal: It's far easier for Labour to work as a minority govt than the Tories. Miliband can call SNP's bluff, while Cameron has n…",36063034,395,766 593891121194995712,Fair point...... https://t.co/etdlMiew4T,19286879,662,1999 593891121312440320,RT @labourpress: We learnt the most from the question David Cameron wouldn't answer tonight: clear now the Tories will cut Child Benefit #b…,2584199773,254,219 593891122541240320,Watch: Ed Miliband trips off the stage following Question Time leaders special http://t.co/ndbqvgEkNK,2768062344,492,708 593891123149570049,RT @SunNation: #SUNNATION EXCLUSIVE: We reveal the man responsible for Ed slipping off the stage... http://t.co/WZ7bITsezb http://t.co/gd42…,2917805117,887,1691 593891123183099906,ED MILIBAND FELL OF STAGE... HOW IS HE GOING TO RUN THE COUNTRY?!?!?!????,363838141,199,207 593891123304787968,"RT @HouseOfTraitors: #bbcbias Number of times Party/Leader mentioned tonight on @BBCNews LAB 14 CON 9 LIB 7 SNP 5 UKIP 0",1242229584,1449,1144 593891123292078081,RT @Lou_i5e: @RogerV52 @UKIP 100% Nigel,2951248157,358,972 593891123325771776,@Daniel__Brookes @sarahduk121 @Nigel_Farage @UKIP that's a tad semantic? perhaps we should use recent general election results as best guide,22848976,553,576 593891123673878528,"RT @britainelects: Latest YouGov poll (29 - 30 Apr): LAB - 35% (+1) CON - 34% (-1) UKIP - 12% (-) LDEM - 8% (-1) GRN - 5% (+1)",2957078271,144,225 593891123766099970,The historic link between Labour and the trade unions is about to break - thanks to the SNP | via @Telegraph http://t.co/W5JB0FAFet,852919230,926,701 593891123610963968,RT @jogideon: Ed Miliband's in denial about his past squandering of our money. Don't let him do it again https://t.co/swqjv3xnHe,83628515,97,214 593891124680482817,RT @BuntinRobert: Murphy in Scotland insults our intelligence. Milliband in London demeans all Scots. The Scots will not take it lying down…,564954573,204,351 593891124953161728,#UKIP TO WIN #FARAGEFOREVER,382885872,236,397 593891125066346497,RT @andrew4mk: Off to Citizens MK assembly to show my commitment to living wage. Another no show from my Tory opponent.,15709635,2046,2171 593891125246689280,"RT @paulwaugh: ICM found just 6% #bbcqt viewers changed their mind. 
Small sub sample but: Clegg won 32% of switchers, Cam 25% Ed 20% http:/…",2578825735,63,136 593891126261592064,Watch: Ed Miliband trips off the stage following Question Time leaders special http://t.co/Pm5UrTC2I5,2701597927,153,11 593891126916046849,"RT @LiarMPs: Russell Brand wants to hug people who thinks its ok to chop peoples heads off on our streets. Miliband is only reaching out to…",1454249192,883,1403 593891127197073408,"So, on BBC Question Time's election special interview with the various party leaders, Miliband relented to the... http://t.co/JrCAHcLYrn",21105283,59,96 593891127545180162,Bbc scotlands entire lead story is 'nicola sturgeon won't rule out what she's already ruled out',23298323,222,886 593891127520043008,"RT @WilsonWilson009: Leaked reports that #Tories feared ""inevitable ""election defeat so ran a campaign of false flags, muddying of the wate…",586004207,656,800 593891127918493698,RT @SolManOfficial: Labour will do a deal with SNP make no mistake! Scotland will rule the UK!,76792634,3877,648 593891127918485504,"And like LibDem leader Mr C said, he had to make hard and brave decision 5yrs ago, to save Britain becm like Greece; huge impact stories",633315980,49,48 593891128262418433,"RT @benglaze: Great line from Lucy Powell on Miliband's minor trip: ""Ed slipped on David Cameron's sweat."" #GE2015 #bbcqt",436708587,979,1733 593891128472170497,"RT @KatyFBrand: While Cameron slags off Miliband for meeting Russell Brand, shall we just bear this in mind? http://t.co/q0B18I9OFA",1029438206,164,140 nltk-3.7/nltk/test/unit/files/tweets.20150430-223406.usermention.csv.ref000066400000000000000000000356341420073152400251610ustar00rootroot00000000000000id,text,user_mentions.id,user_mentions.screen_name 593891099434983425,RT @KirkKus: Indirect cost of the UK being in the EU is estimated to be costing Britain £170 billion per year! #BetterOffOut #UKIP,929903647,KirkKus 593891099388846080,RT @LabourEoin: The economy was growing 3 times faster on the day David Cameron became Prime Minister than it is today.. #BBCqt http://t.co…,168090600,LabourEoin 593891100429045760,RT @GregLauder: the UKIP east lothian candidate looks about 16 and still has an msn addy http://t.co/7eIU0c5Fm1,317875569,GregLauder 593891100768784384,RT @thesundaypeople: UKIP's housing spokesman rakes in £800k in housing benefit from migrants. http://t.co/GVwb9Rcb4w http://t.co/c1AZxcLh…,262604305,thesundaypeople 593891100982546432,RT @Nigel_Farage: Make sure you tune in to #AskNigelFarage tonight on BBC 1 at 22:50! #UKIP http://t.co/ogHSc2Rsr2,19017675,Nigel_Farage 593891101154619392,RT @joannetallis: Ed Milliband is an embarrassment. Would you want him representing the UK?! #bbcqt vote @Conservatives,232263561,joannetallis 593891101154619392,RT @joannetallis: Ed Milliband is an embarrassment. Would you want him representing the UK?! #bbcqt vote @Conservatives,14281853,Conservatives 593891101452476416,"RT @abstex: The FT is backing the Tories. On an unrelated note, here's a photo of FT leader writer Jonathan Ford (next to Boris) http://t.c…",1525565280,abstex 593891101838340096,RT @NivenJ1: “@George_Osborne: Ed Miliband proved tonight why he's not up to the job” Tbf you've spent 5 years doing that you salivating do…,299530170,NivenJ1 593891101838340096,RT @NivenJ1: “@George_Osborne: Ed Miliband proved tonight why he's not up to the job” Tbf you've spent 5 years doing that you salivating do…,1225696320,George_Osborne 593891101959917568,"RT @cononeilluk: Cameron ""Ed Milliband hanging out with Russell Brand. 
He is a joke. This is an election. This is about real people' http:/…",855263756,cononeilluk 593891102257713152,RT @politicshome: Ed Miliband: Last Labour government did not overspend http://t.co/W9RJ2aSH6o http://t.co/4myFekg5ex,16558943,politicshome 593891103100776448,RT @scotnotbritt: Well thats it. LABOUR would rather have a TORY government rather than work with the SNP. http://t.co/SNMkRDCe9f,28584042,scotnotbritt 593891102836563969,"RT @stephen501: @dunleavy138 @CrillyBobc @theSNP @UKLabour I would be happy to do a deal with the SNP, but @Ed_Miliband was clear. If you w…",23607137,stephen501 593891102836563969,"RT @stephen501: @dunleavy138 @CrillyBobc @theSNP @UKLabour I would be happy to do a deal with the SNP, but @Ed_Miliband was clear. If you w…",320369769,dunleavy138 593891102836563969,"RT @stephen501: @dunleavy138 @CrillyBobc @theSNP @UKLabour I would be happy to do a deal with the SNP, but @Ed_Miliband was clear. If you w…",523427919,CrillyBobc 593891102836563969,"RT @stephen501: @dunleavy138 @CrillyBobc @theSNP @UKLabour I would be happy to do a deal with the SNP, but @Ed_Miliband was clear. If you w…",77821953,theSNP 593891102836563969,"RT @stephen501: @dunleavy138 @CrillyBobc @theSNP @UKLabour I would be happy to do a deal with the SNP, but @Ed_Miliband was clear. If you w…",14291684,UKLabour 593891102836563969,"RT @stephen501: @dunleavy138 @CrillyBobc @theSNP @UKLabour I would be happy to do a deal with the SNP, but @Ed_Miliband was clear. If you w…",61781260,Ed_Miliband 593891102895296512,"How dare @EdMiliband_MP force Socialists to chose between the English LP and the SNP! The #SNP are the last, true Socialist party in the UK",2337137191,EdMiliband_MP 593891104438759425,"RT @abstex: The FT is backing the Tories. On an unrelated note, here's a photo of FT leader writer Jonathan Ford (next to Boris) http://t.c…",1525565280,abstex 593891104870813699,@B0MBSKARE the anti-Scottish feeling is largely a product of Tory press scaremongering. In practice most people won't give a toss!,278076899,B0MBSKARE 593891106066198529,RT @GloriaDePiero: Nick Clegg is just as responsible for this Govt's failing plan as David Cameron - he's backed the Tories all the way,162472533,GloriaDePiero 593891106590429185,RT @mykilmarnock: Will the person who dropped a Vote SNP badge please call at Kilmarnock police station to collect it. http://t.co/o3sG5B4L…,2787759816,mykilmarnock 593891106787631104,RT @Markfergusonuk: The Sun’s Twitter worm appears to believe Ed Miliband is winning so far #bbcqt http://t.co/ZgZbSwnZxZ,28079313,Markfergusonuk 593891106921799680,"@ScottishPleb @AHairyBiker SNP toxic in England. Why would Labour want to do a deal with them?",1228300466,ScottishPleb 593891106921799680,"@ScottishPleb @AHairyBiker SNP toxic in England. Why would Labour want to do a deal with them?",396534158,AHairyBiker 593891107232165888,"RT @KatieKhaleesi: I'm #SNPbecause in my entire adult life I've only seen Labour & Tories cause misery, war, and further victimisation of t…",2171610117,KatieKhaleesi 593891107387375616,"RT @daniloxxv: ""UKIP is the bastard child of the big three parties. With apologies to all bastards."" - @PeterTatchell at the University of …",40175634,daniloxxv 593891107387375616,"RT @daniloxxv: ""UKIP is the bastard child of the big three parties. 
With apologies to all bastards."" - @PeterTatchell at the University of …",31135856,PeterTatchell 593891107039215616,"“@suttonnick: Friday's Times front page: Miliband savaged for ‘lies’ over spending #tomorrowspaperstoday #bbcpapers http://t.co/ts9ZnULDwr”",21910500,suttonnick 593891107857158145,RT @dhothersall: Scenes of celebration in Glasgow as #SNP applaud a #Conservative victory. Just a bit of fun. Or is it? :-) https://t.co/hh…,88059720,dhothersall 593891108117155840,"@KTHopkins SNP thickos look forward to having the balance of power, bringing fair, decent policies that fascists bigots like u detest!",21439144,KTHopkins 593891108314308608,RT @LabourEoin: Another humongous lie from David Cameron. He has not clamped down on Tax Avoidance. Uncollected Tax has risen #BBCqt http:/…,168090600,LabourEoin 593891110239543296,RT @jsteve372: .@mik61scot it's Nicola Sturgeon's new education policy... Improve literacy by getting deprive kids reading words on helicop…,880165526,jsteve372 593891110239543296,RT @jsteve372: .@mik61scot it's Nicola Sturgeon's new education policy... Improve literacy by getting deprive kids reading words on helicop…,137690167,mik61scot 593891110616989696,"RT @guardian: Guardian front page, Friday 1 May 2015: Miliband hardens his line: I will not do deal with SNP http://t.co/gHMZXK3Tbc",87818409,guardian 593891110763757568,RT @StephenHep21: @ScotlandTonight Ed is free to say he wont do a deal. It is #SNP who will never be forgiven if they vote down a Labour go…,2437160281,StephenHep21 593891110763757568,RT @StephenHep21: @ScotlandTonight Ed is free to say he wont do a deal. It is #SNP who will never be forgiven if they vote down a Labour go…,361756405,ScotlandTonight 593891111502016512,RT @Ed_Miliband: I want to be completely clear: there will no coalition and no deals with the SNP. #bbcqt,61781260,Ed_Miliband 593891111824941056,RT @HuffPostUK: The Tory spin message about David Cameron's performance is a bit obvious #bbcqt http://t.co/yihDlG5NVo http://t.co/KsqFCqwS…,271413771,HuffPostUK 593891111799783424,@Nigel_Farage Does Ed's rejection of Lab/SNP deal undermine Cameron's plea to Ukippers to 'come home' on the basis of that proposal?,19017675,Nigel_Farage 593891111942365186,RT @Ed_Miliband: I want to be completely clear: there will no coalition and no deals with the SNP. #bbcqt,61781260,Ed_Miliband 593891111946551296,"RT @chaos_lane: imagine fighting for us to stay in the union, promising that Scotland's voice will be heard, and then refusing to do a deal…",330665067,chaos_lane 593891113179750401,RT @ronwindward: @JimForScotland Crazy statements like that just confirms to me I made the right decision in leaving Labour for the SNP.Tim…,833702100,ronwindward 593891113179750401,RT @ronwindward: @JimForScotland Crazy statements like that just confirms to me I made the right decision in leaving Labour for the SNP.Tim…,32522100,JimForScotland 593891113750106112,"RT @Riath84: Great, you reduced the deficit, by killing and shitting on the poor. Good job. Not really something to be bragging about Torie…",133211649,Riath84 593891114215723008,Fucking biased #BBC allowing @Nigel_Farage to have a #leadersdebate program all of his own... #NigelFarage #UKIP #GE2015,19017675,Nigel_Farage 593891114496729088,"RT @nickgoff79: If Miliband had replied then, ""ah, you work in recruitment? No wonder you're such a dick."" I'd have voted for him. But he d…",228706018,nickgoff79 593891114744164354,"RT @EllieCumbo: Oh, Tories. Wiz zis message discipline you are really spoiling uz. 
http://t.co/wg5WhNyp6O",201003224,EllieCumbo 593891114853216257,RT @LeeMartin4947: Miliband on Question Time: I won't have Labour government if it means SNP deal http://t.co/8DPMTRsqG1 #GE2015,574284134,LeeMartin4947 593891116140888065,"RT @joncraig: Scottish audience all address Sturgeon as ""Nicola"". Earlier on QT it was ""Mr Cameron"", ""Mr Miliband"" & ""Mr Clegg"". http://t.c…",14072988,joncraig 593891116363214848,"@ScotlandTonight If Miliband will step aside to let Tories in, we need 59 SNP MPs more than ever to stand up for Scotland @theSNP #VoteSNP",361756405,ScotlandTonight 593891116363214848,"@ScotlandTonight If Miliband will step aside to let Tories in, we need 59 SNP MPs more than ever to stand up for Scotland @theSNP #VoteSNP",77821953,theSNP 593891116405100544,RT @ChristinaSNP: I just heard Milliband's pronouncements........ Oh dear.... Cutting off his nose to spite the Scots.... #VoteSNP,257023773,ChristinaSNP 593891117525016576,RT @chunkymark: How would labour supporters/country feel if @Ed_Miliband let Tories have another 5 years by refusing to have coalition with…,212973087,chunkymark 593891117525016576,RT @chunkymark: How would labour supporters/country feel if @Ed_Miliband let Tories have another 5 years by refusing to have coalition with…,61781260,Ed_Miliband 593891117474668544,"RT @guardian: Guardian front page, Friday 1 May 2015: Miliband hardens his line: I will not do deal with SNP http://t.co/gHMZXK3Tbc",87818409,guardian 593891118384869376,"RT @GardhamHT: Scots will never forgive Lab if Tories get in because Miliband refuses to work with SNP, says Stewart Hosie",1371900259,GardhamHT 593891118867185664,RT @DanHannanMEP: Ed Miliband bases his campaign around not trusting us on the EU. What would it say about us if we elected him? http://t.c…,85794542,DanHannanMEP 593891118779097090,RT @thesundaypeople: UKIP's housing spokesman rakes in £800k in housing benefit from migrants. http://t.co/GVwb9Rcb4w http://t.co/c1AZxcLh…,262604305,thesundaypeople 593891119165001728,"RT @CountryStandard: You Gov Poll Tonight LAB - 35% (+1) CON - 34% (-1) UKIP - 12% (-) LDEM - 8% (-1) GRN - 5% (+1)",265642404,CountryStandard 593891119609556992,RT @DanHannanMEP: Ed Miliband bases his campaign around not trusting us on the EU. What would it say about us if we elected him? http://t.c…,85794542,DanHannanMEP 593891119701880833,RT @JimForScotland: Labour has called SNP bluff. The SNP must now be clear: are they willing to prevent or bring down a Labour government &…,32522100,JimForScotland 593891120913977344,RT @gemini2359: #bbcqt Tories 1st introduced PFI In NHS not Labour. Here's Ken Clarke boasting of it. http://t.co/4lk4w72f1A,295283950,gemini2359 593891120989544448,"RT @sunny_hundal: It's far easier for Labour to work as a minority govt than the Tories. Miliband can call SNP's bluff, while Cameron has n…",22021978,sunny_hundal 593891121312440320,RT @labourpress: We learnt the most from the question David Cameron wouldn't answer tonight: clear now the Tories will cut Child Benefit #b…,79173926,labourpress 593891123149570049,RT @SunNation: #SUNNATION EXCLUSIVE: We reveal the man responsible for Ed slipping off the stage... 
http://t.co/WZ7bITsezb http://t.co/gd42…,2543147876,SunNation 593891123304787968,"RT @HouseOfTraitors: #bbcbias Number of times Party/Leader mentioned tonight on @BBCNews LAB 14 CON 9 LIB 7 SNP 5 UKIP 0",1895366173,HouseOfTraitors 593891123304787968,"RT @HouseOfTraitors: #bbcbias Number of times Party/Leader mentioned tonight on @BBCNews LAB 14 CON 9 LIB 7 SNP 5 UKIP 0",612473,BBCNews 593891123292078081,RT @Lou_i5e: @RogerV52 @UKIP 100% Nigel,270327100,Lou_i5e 593891123292078081,RT @Lou_i5e: @RogerV52 @UKIP 100% Nigel,187579804,RogerV52 593891123292078081,RT @Lou_i5e: @RogerV52 @UKIP 100% Nigel,358204197,UKIP 593891123325771776,@Daniel__Brookes @sarahduk121 @Nigel_Farage @UKIP that's a tad semantic? perhaps we should use recent general election results as best guide,948314394,Daniel__Brookes 593891123325771776,@Daniel__Brookes @sarahduk121 @Nigel_Farage @UKIP that's a tad semantic? perhaps we should use recent general election results as best guide,3187232212,sarahduk121 593891123325771776,@Daniel__Brookes @sarahduk121 @Nigel_Farage @UKIP that's a tad semantic? perhaps we should use recent general election results as best guide,19017675,Nigel_Farage 593891123325771776,@Daniel__Brookes @sarahduk121 @Nigel_Farage @UKIP that's a tad semantic? perhaps we should use recent general election results as best guide,358204197,UKIP 593891123673878528,"RT @britainelects: Latest YouGov poll (29 - 30 Apr): LAB - 35% (+1) CON - 34% (-1) UKIP - 12% (-) LDEM - 8% (-1) GRN - 5% (+1)",1541364193,britainelects 593891123766099970,The historic link between Labour and the trade unions is about to break - thanks to the SNP | via @Telegraph http://t.co/W5JB0FAFet,16343974,Telegraph 593891123610963968,RT @jogideon: Ed Miliband's in denial about his past squandering of our money. Don't let him do it again https://t.co/swqjv3xnHe,67323615,jogideon 593891124680482817,RT @BuntinRobert: Murphy in Scotland insults our intelligence. Milliband in London demeans all Scots. The Scots will not take it lying down…,2506656725,BuntinRobert 593891125066346497,RT @andrew4mk: Off to Citizens MK assembly to show my commitment to living wage. Another no show from my Tory opponent.,70803659,andrew4mk 593891125246689280,"RT @paulwaugh: ICM found just 6% #bbcqt viewers changed their mind. Small sub sample but: Clegg won 32% of switchers, Cam 25% Ed 20% http:/…",26985345,paulwaugh 593891126916046849,"RT @LiarMPs: Russell Brand wants to hug people who thinks its ok to chop peoples heads off on our streets. Miliband is only reaching out to…",1119758522,LiarMPs 593891127520043008,"RT @WilsonWilson009: Leaked reports that #Tories feared ""inevitable ""election defeat so ran a campaign of false flags, muddying of the wate…",1094730422,WilsonWilson009 593891127918493698,RT @SolManOfficial: Labour will do a deal with SNP make no mistake! Scotland will rule the UK!,2656744153,SolManOfficial 593891128262418433,"RT @benglaze: Great line from Lucy Powell on Miliband's minor trip: ""Ed slipped on David Cameron's sweat."" #GE2015 #bbcqt",114412574,benglaze 593891128472170497,"RT @KatyFBrand: While Cameron slags off Miliband for meeting Russell Brand, shall we just bear this in mind? 
http://t.co/q0B18I9OFA",32876983,KatyFBrand nltk-3.7/nltk/test/unit/files/tweets.20150430-223406.userurl.csv.ref000066400000000000000000000000641420073152400242770ustar00rootroot00000000000000user.id,user.screen_name,urls.url,urls.expanded_url nltk-3.7/nltk/test/unit/lm/000077500000000000000000000000001420073152400156475ustar00rootroot00000000000000nltk-3.7/nltk/test/unit/lm/__init__.py000066400000000000000000000000001420073152400177460ustar00rootroot00000000000000nltk-3.7/nltk/test/unit/lm/test_counter.py000066400000000000000000000072771420073152400207540ustar00rootroot00000000000000# Natural Language Toolkit: Language Model Unit Tests # # Copyright (C) 2001-2022 NLTK Project # Author: Ilia Kurenkov # URL: # For license information, see LICENSE.TXT import unittest import pytest from nltk import FreqDist from nltk.lm import NgramCounter from nltk.util import everygrams class TestNgramCounter: """Tests for NgramCounter that only involve lookup, no modification.""" @classmethod def setup_class(self): text = [list("abcd"), list("egdbe")] self.trigram_counter = NgramCounter( everygrams(sent, max_len=3) for sent in text ) self.bigram_counter = NgramCounter(everygrams(sent, max_len=2) for sent in text) self.case = unittest.TestCase() def test_N(self): assert self.bigram_counter.N() == 16 assert self.trigram_counter.N() == 21 def test_counter_len_changes_with_lookup(self): assert len(self.bigram_counter) == 2 self.bigram_counter[50] assert len(self.bigram_counter) == 3 def test_ngram_order_access_unigrams(self): assert self.bigram_counter[1] == self.bigram_counter.unigrams def test_ngram_conditional_freqdist(self): case = unittest.TestCase() expected_trigram_contexts = [ ("a", "b"), ("b", "c"), ("e", "g"), ("g", "d"), ("d", "b"), ] expected_bigram_contexts = [("a",), ("b",), ("d",), ("e",), ("c",), ("g",)] bigrams = self.trigram_counter[2] trigrams = self.trigram_counter[3] self.case.assertCountEqual(expected_bigram_contexts, bigrams.conditions()) self.case.assertCountEqual(expected_trigram_contexts, trigrams.conditions()) def test_bigram_counts_seen_ngrams(self): assert self.bigram_counter[["a"]]["b"] == 1 assert self.bigram_counter[["b"]]["c"] == 1 def test_bigram_counts_unseen_ngrams(self): assert self.bigram_counter[["b"]]["z"] == 0 def test_unigram_counts_seen_words(self): assert self.bigram_counter["b"] == 2 def test_unigram_counts_completely_unseen_words(self): assert self.bigram_counter["z"] == 0 class TestNgramCounterTraining: @classmethod def setup_class(self): self.counter = NgramCounter() self.case = unittest.TestCase() @pytest.mark.parametrize("case", ["", [], None]) def test_empty_inputs(self, case): test = NgramCounter(case) assert 2 not in test assert test[1] == FreqDist() def test_train_on_unigrams(self): words = list("abcd") counter = NgramCounter([[(w,) for w in words]]) assert not counter[3] assert not counter[2] self.case.assertCountEqual(words, counter[1].keys()) def test_train_on_illegal_sentences(self): str_sent = ["Check", "this", "out", "!"] list_sent = [["Check", "this"], ["this", "out"], ["out", "!"]] with pytest.raises(TypeError): NgramCounter([str_sent]) with pytest.raises(TypeError): NgramCounter([list_sent]) def test_train_on_bigrams(self): bigram_sent = [("a", "b"), ("c", "d")] counter = NgramCounter([bigram_sent]) assert not bool(counter[3]) def test_train_on_mix(self): mixed_sent = [("a", "b"), ("c", "d"), ("e", "f", "g"), ("h",)] counter = NgramCounter([mixed_sent]) unigrams = ["h"] bigram_contexts = [("a",), ("c",)] trigram_contexts = [("e", "f")] 
self.case.assertCountEqual(unigrams, counter[1].keys()) self.case.assertCountEqual(bigram_contexts, counter[2].keys()) self.case.assertCountEqual(trigram_contexts, counter[3].keys()) nltk-3.7/nltk/test/unit/lm/test_models.py000066400000000000000000000461361420073152400205550ustar00rootroot00000000000000# Natural Language Toolkit: Language Model Unit Tests # # Copyright (C) 2001-2022 NLTK Project # Author: Ilia Kurenkov # URL: # For license information, see LICENSE.TXT import math from operator import itemgetter import pytest from nltk.lm import ( MLE, AbsoluteDiscountingInterpolated, KneserNeyInterpolated, Laplace, Lidstone, StupidBackoff, Vocabulary, WittenBellInterpolated, ) from nltk.lm.preprocessing import padded_everygrams @pytest.fixture(scope="session") def vocabulary(): return Vocabulary(["a", "b", "c", "d", "z", "", ""], unk_cutoff=1) @pytest.fixture(scope="session") def training_data(): return [["a", "b", "c", "d"], ["e", "g", "a", "d", "b", "e"]] @pytest.fixture(scope="session") def bigram_training_data(training_data): return [list(padded_everygrams(2, sent)) for sent in training_data] @pytest.fixture(scope="session") def trigram_training_data(training_data): return [list(padded_everygrams(3, sent)) for sent in training_data] @pytest.fixture def mle_bigram_model(vocabulary, bigram_training_data): model = MLE(2, vocabulary=vocabulary) model.fit(bigram_training_data) return model @pytest.mark.parametrize( "word, context, expected_score", [ ("d", ["c"], 1), # Unseen ngrams should yield 0 ("d", ["e"], 0), # Unigrams should also be 0 ("z", None, 0), # N unigrams = 14 # count('a') = 2 ("a", None, 2.0 / 14), # count('y') = 3 ("y", None, 3.0 / 14), ], ) def test_mle_bigram_scores(mle_bigram_model, word, context, expected_score): assert pytest.approx(mle_bigram_model.score(word, context), 1e-4) == expected_score def test_mle_bigram_logscore_for_zero_score(mle_bigram_model): assert math.isinf(mle_bigram_model.logscore("d", ["e"])) def test_mle_bigram_entropy_perplexity_seen(mle_bigram_model): # ngrams seen during training trained = [ ("", "a"), ("a", "b"), ("b", ""), ("", "a"), ("a", "d"), ("d", ""), ] # Ngram = Log score # , a = -1 # a, b = -1 # b, UNK = -1 # UNK, a = -1.585 # a, d = -1 # d, = -1 # TOTAL logscores = -6.585 # - AVG logscores = 1.0975 H = 1.0975 perplexity = 2.1398 assert pytest.approx(mle_bigram_model.entropy(trained), 1e-4) == H assert pytest.approx(mle_bigram_model.perplexity(trained), 1e-4) == perplexity def test_mle_bigram_entropy_perplexity_unseen(mle_bigram_model): # In MLE, even one unseen ngram should make entropy and perplexity infinite untrained = [("", "a"), ("a", "c"), ("c", "d"), ("d", "")] assert math.isinf(mle_bigram_model.entropy(untrained)) assert math.isinf(mle_bigram_model.perplexity(untrained)) def test_mle_bigram_entropy_perplexity_unigrams(mle_bigram_model): # word = score, log score # = 0.1429, -2.8074 # a = 0.1429, -2.8074 # c = 0.0714, -3.8073 # UNK = 0.2143, -2.2224 # d = 0.1429, -2.8074 # c = 0.0714, -3.8073 # = 0.1429, -2.8074 # TOTAL logscores = -21.6243 # - AVG logscores = 3.0095 H = 3.0095 perplexity = 8.0529 text = [("",), ("a",), ("c",), ("-",), ("d",), ("c",), ("",)] assert pytest.approx(mle_bigram_model.entropy(text), 1e-4) == H assert pytest.approx(mle_bigram_model.perplexity(text), 1e-4) == perplexity @pytest.fixture def mle_trigram_model(trigram_training_data, vocabulary): model = MLE(order=3, vocabulary=vocabulary) model.fit(trigram_training_data) return model @pytest.mark.parametrize( "word, context, expected_score", [ # count(d | 
b, c) = 1 # count(b, c) = 1 ("d", ("b", "c"), 1), # count(d | c) = 1 # count(c) = 1 ("d", ["c"], 1), # total number of tokens is 18, of which "a" occurred 2 times ("a", None, 2.0 / 18), # in vocabulary but unseen ("z", None, 0), # out of vocabulary should use "UNK" score ("y", None, 3.0 / 18), ], ) def test_mle_trigram_scores(mle_trigram_model, word, context, expected_score): assert pytest.approx(mle_trigram_model.score(word, context), 1e-4) == expected_score @pytest.fixture def lidstone_bigram_model(bigram_training_data, vocabulary): model = Lidstone(0.1, order=2, vocabulary=vocabulary) model.fit(bigram_training_data) return model @pytest.mark.parametrize( "word, context, expected_score", [ # count(d | c) = 1 # *count(d | c) = 1.1 # Count(w | c for w in vocab) = 1 # *Count(w | c for w in vocab) = 1.8 ("d", ["c"], 1.1 / 1.8), # Total unigrams: 14 # Vocab size: 8 # Denominator: 14 + 0.8 = 14.8 # count("a") = 2 # *count("a") = 2.1 ("a", None, 2.1 / 14.8), # in vocabulary but unseen # count("z") = 0 # *count("z") = 0.1 ("z", None, 0.1 / 14.8), # out of vocabulary should use "UNK" score # count("") = 3 # *count("") = 3.1 ("y", None, 3.1 / 14.8), ], ) def test_lidstone_bigram_score(lidstone_bigram_model, word, context, expected_score): assert ( pytest.approx(lidstone_bigram_model.score(word, context), 1e-4) == expected_score ) def test_lidstone_entropy_perplexity(lidstone_bigram_model): text = [ ("", "a"), ("a", "c"), ("c", ""), ("", "d"), ("d", "c"), ("c", ""), ] # Unlike MLE this should be able to handle completely novel ngrams # Ngram = score, log score # , a = 0.3929, -1.3479 # a, c = 0.0357, -4.8074 # c, UNK = 0.0(5), -4.1699 # UNK, d = 0.0263, -5.2479 # d, c = 0.0357, -4.8074 # c, = 0.0(5), -4.1699 # TOTAL logscore: −24.5504 # - AVG logscore: 4.0917 H = 4.0917 perplexity = 17.0504 assert pytest.approx(lidstone_bigram_model.entropy(text), 1e-4) == H assert pytest.approx(lidstone_bigram_model.perplexity(text), 1e-4) == perplexity @pytest.fixture def lidstone_trigram_model(trigram_training_data, vocabulary): model = Lidstone(0.1, order=3, vocabulary=vocabulary) model.fit(trigram_training_data) return model @pytest.mark.parametrize( "word, context, expected_score", [ # Logic behind this is the same as for bigram model ("d", ["c"], 1.1 / 1.8), # if we choose a word that hasn't appeared after (b, c) ("e", ["c"], 0.1 / 1.8), # Trigram score now ("d", ["b", "c"], 1.1 / 1.8), ("e", ["b", "c"], 0.1 / 1.8), ], ) def test_lidstone_trigram_score(lidstone_trigram_model, word, context, expected_score): assert ( pytest.approx(lidstone_trigram_model.score(word, context), 1e-4) == expected_score ) @pytest.fixture def laplace_bigram_model(bigram_training_data, vocabulary): model = Laplace(2, vocabulary=vocabulary) model.fit(bigram_training_data) return model @pytest.mark.parametrize( "word, context, expected_score", [ # basic sanity-check: # count(d | c) = 1 # *count(d | c) = 2 # Count(w | c for w in vocab) = 1 # *Count(w | c for w in vocab) = 9 ("d", ["c"], 2.0 / 9), # Total unigrams: 14 # Vocab size: 8 # Denominator: 14 + 8 = 22 # count("a") = 2 # *count("a") = 3 ("a", None, 3.0 / 22), # in vocabulary but unseen # count("z") = 0 # *count("z") = 1 ("z", None, 1.0 / 22), # out of vocabulary should use "UNK" score # count("") = 3 # *count("") = 4 ("y", None, 4.0 / 22), ], ) def test_laplace_bigram_score(laplace_bigram_model, word, context, expected_score): assert ( pytest.approx(laplace_bigram_model.score(word, context), 1e-4) == expected_score ) def 
test_laplace_bigram_entropy_perplexity(laplace_bigram_model): text = [ ("", "a"), ("a", "c"), ("c", ""), ("", "d"), ("d", "c"), ("c", ""), ] # Unlike MLE this should be able to handle completely novel ngrams # Ngram = score, log score # , a = 0.2, -2.3219 # a, c = 0.1, -3.3219 # c, UNK = 0.(1), -3.1699 # UNK, d = 0.(09), 3.4594 # d, c = 0.1 -3.3219 # c, = 0.(1), -3.1699 # Total logscores: −18.7651 # - AVG logscores: 3.1275 H = 3.1275 perplexity = 8.7393 assert pytest.approx(laplace_bigram_model.entropy(text), 1e-4) == H assert pytest.approx(laplace_bigram_model.perplexity(text), 1e-4) == perplexity def test_laplace_gamma(laplace_bigram_model): assert laplace_bigram_model.gamma == 1 @pytest.fixture def wittenbell_trigram_model(trigram_training_data, vocabulary): model = WittenBellInterpolated(3, vocabulary=vocabulary) model.fit(trigram_training_data) return model @pytest.mark.parametrize( "word, context, expected_score", [ # For unigram scores by default revert to regular MLE # Total unigrams: 18 # Vocab Size = 7 # count('c'): 1 ("c", None, 1.0 / 18), # in vocabulary but unseen # count("z") = 0 ("z", None, 0 / 18), # out of vocabulary should use "UNK" score # count("") = 3 ("y", None, 3.0 / 18), # 2 words follow b and b occurred a total of 2 times # gamma(['b']) = 2 / (2 + 2) = 0.5 # mle.score('c', ['b']) = 0.5 # mle('c') = 1 / 18 = 0.055 # (1 - gamma) * mle + gamma * mle('c') ~= 0.27 + 0.055 ("c", ["b"], (1 - 0.5) * 0.5 + 0.5 * 1 / 18), # building on that, let's try 'a b c' as the trigram # 1 word follows 'a b' and 'a b' occurred 1 time # gamma(['a', 'b']) = 1 / (1 + 1) = 0.5 # mle("c", ["a", "b"]) = 1 ("c", ["a", "b"], (1 - 0.5) + 0.5 * ((1 - 0.5) * 0.5 + 0.5 * 1 / 18)), # P(c|zb) # The ngram 'zbc' was not seen, so we use P(c|b). See issue #2332. ("c", ["z", "b"], ((1 - 0.5) * 0.5 + 0.5 * 1 / 18)), ], ) def test_wittenbell_trigram_score( wittenbell_trigram_model, word, context, expected_score ): assert ( pytest.approx(wittenbell_trigram_model.score(word, context), 1e-4) == expected_score ) ############################################################################### # Notation Explained # ############################################################################### # For all subsequent calculations we use the following notation: # 1. '*': Placeholder for any word/character. E.g. '*b' stands for # all bigrams that end in 'b'. '*b*' stands for all trigrams that # contain 'b' in the middle. # 1. count(ngram): Count all instances (tokens) of an ngram. # 1. unique(ngram): Count unique instances (types) of an ngram. @pytest.fixture def kneserney_trigram_model(trigram_training_data, vocabulary): model = KneserNeyInterpolated(order=3, discount=0.75, vocabulary=vocabulary) model.fit(trigram_training_data) return model @pytest.mark.parametrize( "word, context, expected_score", [ # P(c) = count('*c') / unique('**') # = 1 / 14 ("c", None, 1.0 / 14), # P(z) = count('*z') / unique('**') # = 0 / 14 # 'z' is in the vocabulary, but it was not seen during training. ("z", None, 0.0 / 14), # P(y) # Out of vocabulary should use "UNK" score. # P(y) = P(UNK) = count('*UNK') / unique('**') ("y", None, 3 / 14), # We start with P(c|b) # P(c|b) = alpha('bc') + gamma('b') * P(c) # alpha('bc') = max(unique('*bc') - discount, 0) / unique('*b*') # = max(1 - 0.75, 0) / 2 # = 0.125 # gamma('b') = discount * unique('b*') / unique('*b*') # = (0.75 * 2) / 2 # = 0.75 ("c", ["b"], (0.125 + 0.75 * (1 / 14))), # Building on that, let's try P(c|ab). 
# P(c|ab) = alpha('abc') + gamma('ab') * P(c|b) # alpha('abc') = max(count('abc') - discount, 0) / count('ab*') # = max(1 - 0.75, 0) / 1 # = 0.25 # gamma('ab') = (discount * unique('ab*')) / count('ab*') # = 0.75 * 1 / 1 ("c", ["a", "b"], 0.25 + 0.75 * (0.125 + 0.75 * (1 / 14))), # P(c|zb) # The ngram 'zbc' was not seen, so we use P(c|b). See issue #2332. ("c", ["z", "b"], (0.125 + 0.75 * (1 / 14))), ], ) def test_kneserney_trigram_score( kneserney_trigram_model, word, context, expected_score ): assert ( pytest.approx(kneserney_trigram_model.score(word, context), 1e-4) == expected_score ) @pytest.fixture def absolute_discounting_trigram_model(trigram_training_data, vocabulary): model = AbsoluteDiscountingInterpolated(order=3, vocabulary=vocabulary) model.fit(trigram_training_data) return model @pytest.mark.parametrize( "word, context, expected_score", [ # For unigram scores revert to uniform # P(c) = count('c') / count('**') ("c", None, 1.0 / 18), # in vocabulary but unseen # count('z') = 0 ("z", None, 0.0 / 18), # out of vocabulary should use "UNK" score # count('') = 3 ("y", None, 3 / 18), # P(c|b) = alpha('bc') + gamma('b') * P(c) # alpha('bc') = max(count('bc') - discount, 0) / count('b*') # = max(1 - 0.75, 0) / 2 # = 0.125 # gamma('b') = discount * unique('b*') / count('b*') # = (0.75 * 2) / 2 # = 0.75 ("c", ["b"], (0.125 + 0.75 * (2 / 2) * (1 / 18))), # Building on that, let's try P(c|ab). # P(c|ab) = alpha('abc') + gamma('ab') * P(c|b) # alpha('abc') = max(count('abc') - discount, 0) / count('ab*') # = max(1 - 0.75, 0) / 1 # = 0.25 # gamma('ab') = (discount * unique('ab*')) / count('ab*') # = 0.75 * 1 / 1 ("c", ["a", "b"], 0.25 + 0.75 * (0.125 + 0.75 * (2 / 2) * (1 / 18))), # P(c|zb) # The ngram 'zbc' was not seen, so we use P(c|b). See issue #2332. 
("c", ["z", "b"], (0.125 + 0.75 * (2 / 2) * (1 / 18))), ], ) def test_absolute_discounting_trigram_score( absolute_discounting_trigram_model, word, context, expected_score ): assert ( pytest.approx(absolute_discounting_trigram_model.score(word, context), 1e-4) == expected_score ) @pytest.fixture def stupid_backoff_trigram_model(trigram_training_data, vocabulary): model = StupidBackoff(order=3, vocabulary=vocabulary) model.fit(trigram_training_data) return model @pytest.mark.parametrize( "word, context, expected_score", [ # For unigram scores revert to uniform # total bigrams = 18 ("c", None, 1.0 / 18), # in vocabulary but unseen # bigrams ending with z = 0 ("z", None, 0.0 / 18), # out of vocabulary should use "UNK" score # count(''): 3 ("y", None, 3 / 18), # c follows 1 time out of 2 after b ("c", ["b"], 1 / 2), # c always follows ab ("c", ["a", "b"], 1 / 1), # The ngram 'z b c' was not seen, so we backoff to # the score of the ngram 'b c' * smoothing factor ("c", ["z", "b"], (0.4 * (1 / 2))), ], ) def test_stupid_backoff_trigram_score( stupid_backoff_trigram_model, word, context, expected_score ): assert ( pytest.approx(stupid_backoff_trigram_model.score(word, context), 1e-4) == expected_score ) ############################################################################### # Probability Distributions Should Sum up to Unity # ############################################################################### @pytest.fixture(scope="session") def kneserney_bigram_model(bigram_training_data, vocabulary): model = KneserNeyInterpolated(order=2, vocabulary=vocabulary) model.fit(bigram_training_data) return model @pytest.mark.parametrize( "model_fixture", [ "mle_bigram_model", "mle_trigram_model", "lidstone_bigram_model", "laplace_bigram_model", "wittenbell_trigram_model", "absolute_discounting_trigram_model", "kneserney_bigram_model", pytest.param( "stupid_backoff_trigram_model", marks=pytest.mark.xfail( reason="Stupid Backoff is not a valid distribution" ), ), ], ) @pytest.mark.parametrize( "context", [("a",), ("c",), ("",), ("b",), ("",), ("d",), ("e",), ("r",), ("w",)], ids=itemgetter(0), ) def test_sums_to_1(model_fixture, context, request): model = request.getfixturevalue(model_fixture) scores_for_context = sum(model.score(w, context) for w in model.vocab) assert pytest.approx(scores_for_context, 1e-7) == 1.0 ############################################################################### # Generating Text # ############################################################################### def test_generate_one_no_context(mle_trigram_model): assert mle_trigram_model.generate(random_seed=3) == "" def test_generate_one_from_limiting_context(mle_trigram_model): # We don't need random_seed for contexts with only one continuation assert mle_trigram_model.generate(text_seed=["c"]) == "d" assert mle_trigram_model.generate(text_seed=["b", "c"]) == "d" assert mle_trigram_model.generate(text_seed=["a", "c"]) == "d" def test_generate_one_from_varied_context(mle_trigram_model): # When context doesn't limit our options enough, seed the random choice assert mle_trigram_model.generate(text_seed=("a", ""), random_seed=2) == "a" def test_generate_cycle(mle_trigram_model): # Add a cycle to the model: bd -> b, db -> d more_training_text = [padded_everygrams(mle_trigram_model.order, list("bdbdbd"))] mle_trigram_model.fit(more_training_text) # Test that we can escape the cycle assert mle_trigram_model.generate(7, text_seed=("b", "d"), random_seed=5) == [ "b", "d", "b", "d", "b", "d", "", ] def 
test_generate_with_text_seed(mle_trigram_model): assert mle_trigram_model.generate(5, text_seed=("", "e"), random_seed=3) == [ "", "a", "d", "b", "", ] def test_generate_oov_text_seed(mle_trigram_model): assert mle_trigram_model.generate( text_seed=("aliens",), random_seed=3 ) == mle_trigram_model.generate(text_seed=("",), random_seed=3) def test_generate_None_text_seed(mle_trigram_model): # should crash with type error when we try to look it up in vocabulary with pytest.raises(TypeError): mle_trigram_model.generate(text_seed=(None,)) # This will work assert mle_trigram_model.generate( text_seed=None, random_seed=3 ) == mle_trigram_model.generate(random_seed=3) nltk-3.7/nltk/test/unit/lm/test_preprocessing.py000066400000000000000000000017111420073152400221430ustar00rootroot00000000000000# Natural Language Toolkit: Language Model Unit Tests # # Copyright (C) 2001-2022 NLTK Project # Author: Ilia Kurenkov # URL: # For license information, see LICENSE.TXT import unittest from nltk.lm.preprocessing import padded_everygram_pipeline class TestPreprocessing(unittest.TestCase): def test_padded_everygram_pipeline(self): expected_train = [ [ ("",), ("", "a"), ("a",), ("a", "b"), ("b",), ("b", "c"), ("c",), ("c", ""), ("",), ] ] expected_vocab = ["", "a", "b", "c", ""] train_data, vocab_data = padded_everygram_pipeline(2, [["a", "b", "c"]]) self.assertEqual([list(sent) for sent in train_data], expected_train) self.assertEqual(list(vocab_data), expected_vocab) nltk-3.7/nltk/test/unit/lm/test_vocabulary.py000066400000000000000000000132011420073152400214240ustar00rootroot00000000000000# Natural Language Toolkit: Language Model Unit Tests # # Copyright (C) 2001-2022 NLTK Project # Author: Ilia Kurenkov # URL: # For license information, see LICENSE.TXT import unittest from collections import Counter from timeit import timeit from nltk.lm import Vocabulary class NgramModelVocabularyTests(unittest.TestCase): """tests Vocabulary Class""" @classmethod def setUpClass(cls): cls.vocab = Vocabulary( ["z", "a", "b", "c", "f", "d", "e", "g", "a", "d", "b", "e", "w"], unk_cutoff=2, ) def test_truthiness(self): self.assertTrue(self.vocab) def test_cutoff_value_set_correctly(self): self.assertEqual(self.vocab.cutoff, 2) def test_unable_to_change_cutoff(self): with self.assertRaises(AttributeError): self.vocab.cutoff = 3 def test_cutoff_setter_checks_value(self): with self.assertRaises(ValueError) as exc_info: Vocabulary("abc", unk_cutoff=0) expected_error_msg = "Cutoff value cannot be less than 1. Got: 0" self.assertEqual(expected_error_msg, str(exc_info.exception)) def test_counts_set_correctly(self): self.assertEqual(self.vocab.counts["a"], 2) self.assertEqual(self.vocab.counts["b"], 2) self.assertEqual(self.vocab.counts["c"], 1) def test_membership_check_respects_cutoff(self): # a was seen 2 times, so it should be considered part of the vocabulary self.assertTrue("a" in self.vocab) # "c" was seen once, it shouldn't be considered part of the vocab self.assertFalse("c" in self.vocab) # "z" was never seen at all, also shouldn't be considered in the vocab self.assertFalse("z" in self.vocab) def test_vocab_len_respects_cutoff(self): # Vocab size is the number of unique tokens that occur at least as often # as the cutoff value, plus 1 to account for unknown words. 
self.assertEqual(5, len(self.vocab)) def test_vocab_iter_respects_cutoff(self): vocab_counts = ["a", "b", "c", "d", "e", "f", "g", "w", "z"] vocab_items = ["a", "b", "d", "e", ""] self.assertCountEqual(vocab_counts, list(self.vocab.counts.keys())) self.assertCountEqual(vocab_items, list(self.vocab)) def test_update_empty_vocab(self): empty = Vocabulary(unk_cutoff=2) self.assertEqual(len(empty), 0) self.assertFalse(empty) self.assertIn(empty.unk_label, empty) empty.update(list("abcde")) self.assertIn(empty.unk_label, empty) def test_lookup(self): self.assertEqual(self.vocab.lookup("a"), "a") self.assertEqual(self.vocab.lookup("c"), "") def test_lookup_iterables(self): self.assertEqual(self.vocab.lookup(["a", "b"]), ("a", "b")) self.assertEqual(self.vocab.lookup(("a", "b")), ("a", "b")) self.assertEqual(self.vocab.lookup(("a", "c")), ("a", "")) self.assertEqual( self.vocab.lookup(map(str, range(3))), ("", "", "") ) def test_lookup_empty_iterables(self): self.assertEqual(self.vocab.lookup(()), ()) self.assertEqual(self.vocab.lookup([]), ()) self.assertEqual(self.vocab.lookup(iter([])), ()) self.assertEqual(self.vocab.lookup(n for n in range(0, 0)), ()) def test_lookup_recursive(self): self.assertEqual( self.vocab.lookup([["a", "b"], ["a", "c"]]), (("a", "b"), ("a", "")) ) self.assertEqual(self.vocab.lookup([["a", "b"], "c"]), (("a", "b"), "")) self.assertEqual(self.vocab.lookup([[[[["a", "b"]]]]]), ((((("a", "b"),),),),)) def test_lookup_None(self): with self.assertRaises(TypeError): self.vocab.lookup(None) with self.assertRaises(TypeError): list(self.vocab.lookup([None, None])) def test_lookup_int(self): with self.assertRaises(TypeError): self.vocab.lookup(1) with self.assertRaises(TypeError): list(self.vocab.lookup([1, 2])) def test_lookup_empty_str(self): self.assertEqual(self.vocab.lookup(""), "") def test_eqality(self): v1 = Vocabulary(["a", "b", "c"], unk_cutoff=1) v2 = Vocabulary(["a", "b", "c"], unk_cutoff=1) v3 = Vocabulary(["a", "b", "c"], unk_cutoff=1, unk_label="blah") v4 = Vocabulary(["a", "b"], unk_cutoff=1) self.assertEqual(v1, v2) self.assertNotEqual(v1, v3) self.assertNotEqual(v1, v4) def test_str(self): self.assertEqual( str(self.vocab), "" ) def test_creation_with_counter(self): self.assertEqual( self.vocab, Vocabulary( Counter( ["z", "a", "b", "c", "f", "d", "e", "g", "a", "d", "b", "e", "w"] ), unk_cutoff=2, ), ) @unittest.skip( reason="Test is known to be flaky as it compares (runtime) performance." ) def test_len_is_constant(self): # Given an obviously small and an obviously large vocabulary. small_vocab = Vocabulary("abcde") from nltk.corpus.europarl_raw import english large_vocab = Vocabulary(english.words()) # If we time calling `len` on them. small_vocab_len_time = timeit("len(small_vocab)", globals=locals()) large_vocab_len_time = timeit("len(large_vocab)", globals=locals()) # The timing should be the same order of magnitude. 
self.assertAlmostEqual(small_vocab_len_time, large_vocab_len_time, places=1) nltk-3.7/nltk/test/unit/test_aline.py000066400000000000000000000020721420073152400177410ustar00rootroot00000000000000""" Test Aline algorithm for aligning phonetic sequences """ from nltk.metrics import aline def test_aline(): result = aline.align("θin", "tenwis") expected = [[("θ", "t"), ("i", "e"), ("n", "n")]] assert result == expected result = aline.align("jo", "ʒə") expected = [[("j", "ʒ"), ("o", "ə")]] assert result == expected result = aline.align("pematesiweni", "pematesewen") expected = [ [ ("p", "p"), ("e", "e"), ("m", "m"), ("a", "a"), ("t", "t"), ("e", "e"), ("s", "s"), ("i", "e"), ("w", "w"), ("e", "e"), ("n", "n"), ] ] assert result == expected result = aline.align("tuwθ", "dentis") expected = [[("t", "t"), ("u", "i"), ("w", "-"), ("θ", "s")]] assert result == expected def test_aline_delta(): """ Test aline for computing the difference between two segments """ assert aline.delta("p", "q") == 20.0 assert aline.delta("a", "A") == 0.0 nltk-3.7/nltk/test/unit/test_bllip.py000066400000000000000000000020611420073152400177510ustar00rootroot00000000000000import pytest from nltk.data import find from nltk.parse.bllip import BllipParser from nltk.tree import Tree @pytest.fixture(scope="module") def parser(): model_dir = find("models/bllip_wsj_no_aux").path return BllipParser.from_unified_model_dir(model_dir) def setup_module(): pytest.importorskip("bllipparser") class TestBllipParser: def test_parser_loads_a_valid_tree(self, parser): parsed = parser.parse("I saw the man with the telescope") tree = next(parsed) assert isinstance(tree, Tree) assert ( tree.pformat() == """ (S1 (S (NP (PRP I)) (VP (VBD saw) (NP (DT the) (NN man)) (PP (IN with) (NP (DT the) (NN telescope)))))) """.strip() ) def test_tagged_parse_finds_matching_element(self, parser): parsed = parser.parse("I saw the man with the telescope") tagged_tree = next(parser.tagged_parse([("telescope", "NN")])) assert isinstance(tagged_tree, Tree) assert tagged_tree.pformat() == "(S1 (NP (NN telescope)))" nltk-3.7/nltk/test/unit/test_brill.py000066400000000000000000000017361420073152400177630ustar00rootroot00000000000000""" Tests for Brill tagger. 
""" import unittest from nltk.corpus import treebank from nltk.tag import UnigramTagger, brill, brill_trainer from nltk.tbl import demo class TestBrill(unittest.TestCase): def test_pos_template(self): train_sents = treebank.tagged_sents()[:1000] tagger = UnigramTagger(train_sents) trainer = brill_trainer.BrillTaggerTrainer( tagger, [brill.Template(brill.Pos([-1]))] ) brill_tagger = trainer.train(train_sents) # Example from https://github.com/nltk/nltk/issues/769 result = brill_tagger.tag("This is a foo bar sentence".split()) expected = [ ("This", "DT"), ("is", "VBZ"), ("a", "DT"), ("foo", None), ("bar", "NN"), ("sentence", None), ] self.assertEqual(result, expected) @unittest.skip("Should be tested in __main__ of nltk.tbl.demo") def test_brill_demo(self): demo() nltk-3.7/nltk/test/unit/test_cfd_mutation.py000066400000000000000000000024661420073152400213340ustar00rootroot00000000000000import unittest import pytest from nltk import ConditionalFreqDist, tokenize class TestEmptyCondFreq(unittest.TestCase): def test_tabulate(self): empty = ConditionalFreqDist() self.assertEqual(empty.conditions(), []) with pytest.raises(ValueError): empty.tabulate(conditions="BUG") # nonexistent keys shouldn't be added self.assertEqual(empty.conditions(), []) def test_plot(self): empty = ConditionalFreqDist() self.assertEqual(empty.conditions(), []) empty.plot(conditions=["BUG"]) # nonexistent keys shouldn't be added self.assertEqual(empty.conditions(), []) def test_increment(self): # make sure that we can still mutate cfd normally text = "cow cat mouse cat tiger" cfd = ConditionalFreqDist() # create cfd with word length as condition for word in tokenize.word_tokenize(text): condition = len(word) cfd[condition][word] += 1 self.assertEqual(cfd.conditions(), [3, 5]) # incrementing previously unseen key is still possible cfd[2]["hi"] += 1 self.assertCountEqual(cfd.conditions(), [3, 5, 2]) # new condition added self.assertEqual( cfd[2]["hi"], 1 ) # key's frequency incremented from 0 (unseen) to 1 nltk-3.7/nltk/test/unit/test_cfg2chomsky.py000066400000000000000000000032151420073152400210700ustar00rootroot00000000000000import unittest import nltk from nltk.grammar import CFG class ChomskyNormalFormForCFGTest(unittest.TestCase): def test_simple(self): grammar = CFG.fromstring( """ S -> NP VP PP -> P NP NP -> Det N | NP PP P VP -> V NP | VP PP VP -> Det Det -> 'a' | 'the' N -> 'dog' | 'cat' V -> 'chased' | 'sat' P -> 'on' | 'in' """ ) self.assertFalse(grammar.is_flexible_chomsky_normal_form()) self.assertFalse(grammar.is_chomsky_normal_form()) grammar = grammar.chomsky_normal_form(flexible=True) self.assertTrue(grammar.is_flexible_chomsky_normal_form()) self.assertFalse(grammar.is_chomsky_normal_form()) grammar2 = CFG.fromstring( """ S -> NP VP NP -> VP N P VP -> P N -> 'dog' | 'cat' P -> 'on' | 'in' """ ) self.assertFalse(grammar2.is_flexible_chomsky_normal_form()) self.assertFalse(grammar2.is_chomsky_normal_form()) grammar2 = grammar2.chomsky_normal_form() self.assertTrue(grammar2.is_flexible_chomsky_normal_form()) self.assertTrue(grammar2.is_chomsky_normal_form()) def test_complex(self): grammar = nltk.data.load("grammars/large_grammars/atis.cfg") self.assertFalse(grammar.is_flexible_chomsky_normal_form()) self.assertFalse(grammar.is_chomsky_normal_form()) grammar = grammar.chomsky_normal_form(flexible=True) self.assertTrue(grammar.is_flexible_chomsky_normal_form()) self.assertFalse(grammar.is_chomsky_normal_form()) 
nltk-3.7/nltk/test/unit/test_chunk.py000066400000000000000000000041261420073152400177630ustar00rootroot00000000000000import unittest from nltk import RegexpParser class TestChunkRule(unittest.TestCase): def test_tag_pattern2re_pattern_quantifier(self): """Test for bug https://github.com/nltk/nltk/issues/1597 Ensures that curly bracket quantifiers can be used inside a chunk rule. This type of quantifier has been used for the supplementary example in https://www.nltk.org/book/ch07.html#exploring-text-corpora. """ sent = [ ("The", "AT"), ("September-October", "NP"), ("term", "NN"), ("jury", "NN"), ("had", "HVD"), ("been", "BEN"), ("charged", "VBN"), ("by", "IN"), ("Fulton", "NP-TL"), ("Superior", "JJ-TL"), ("Court", "NN-TL"), ("Judge", "NN-TL"), ("Durwood", "NP"), ("Pye", "NP"), ("to", "TO"), ("investigate", "VB"), ("reports", "NNS"), ("of", "IN"), ("possible", "JJ"), ("``", "``"), ("irregularities", "NNS"), ("''", "''"), ("in", "IN"), ("the", "AT"), ("hard-fought", "JJ"), ("primary", "NN"), ("which", "WDT"), ("was", "BEDZ"), ("won", "VBN"), ("by", "IN"), ("Mayor-nominate", "NN-TL"), ("Ivan", "NP"), ("Allen", "NP"), ("Jr.", "NP"), (".", "."), ] # source: brown corpus cp = RegexpParser("CHUNK: {{4,}}") tree = cp.parse(sent) assert ( tree.pformat() == """(S The/AT September-October/NP term/NN jury/NN had/HVD been/BEN charged/VBN by/IN Fulton/NP-TL Superior/JJ-TL (CHUNK Court/NN-TL Judge/NN-TL Durwood/NP Pye/NP) to/TO investigate/VB reports/NNS of/IN possible/JJ ``/`` irregularities/NNS ''/'' in/IN the/AT hard-fought/JJ primary/NN which/WDT was/BEDZ won/VBN by/IN (CHUNK Mayor-nominate/NN-TL Ivan/NP Allen/NP Jr./NP) ./.)""" ) nltk-3.7/nltk/test/unit/test_classify.py000066400000000000000000000024101420073152400204620ustar00rootroot00000000000000""" Unit tests for nltk.classify. See also: nltk/test/classify.doctest """ import pytest from nltk import classify TRAIN = [ (dict(a=1, b=1, c=1), "y"), (dict(a=1, b=1, c=1), "x"), (dict(a=1, b=1, c=0), "y"), (dict(a=0, b=1, c=1), "x"), (dict(a=0, b=1, c=1), "y"), (dict(a=0, b=0, c=1), "y"), (dict(a=0, b=1, c=0), "x"), (dict(a=0, b=0, c=0), "x"), (dict(a=0, b=1, c=1), "y"), ] TEST = [ (dict(a=1, b=0, c=1)), # unseen (dict(a=1, b=0, c=0)), # unseen (dict(a=0, b=1, c=1)), # seen 3 times, labels=y,y,x (dict(a=0, b=1, c=0)), # seen 1 time, label=x ] RESULTS = [(0.16, 0.84), (0.46, 0.54), (0.41, 0.59), (0.76, 0.24)] def assert_classifier_correct(algorithm): try: classifier = classify.MaxentClassifier.train( TRAIN, algorithm, trace=0, max_iter=1000 ) except (LookupError, AttributeError) as e: pytest.skip(str(e)) for (px, py), featureset in zip(RESULTS, TEST): pdist = classifier.prob_classify(featureset) assert abs(pdist.prob("x") - px) < 1e-2, (pdist.prob("x"), px) assert abs(pdist.prob("y") - py) < 1e-2, (pdist.prob("y"), py) def test_megam(): assert_classifier_correct("MEGAM") def test_tadm(): assert_classifier_correct("TADM") nltk-3.7/nltk/test/unit/test_collocations.py000066400000000000000000000067621420073152400213540ustar00rootroot00000000000000from nltk.collocations import BigramCollocationFinder from nltk.metrics import BigramAssocMeasures ## Test bigram counters with discontinuous bigrams and repeated words _EPSILON = 1e-8 SENT = "this this is is a a test test".split() def close_enough(x, y): """Verify that two sequences of n-gram association values are within _EPSILON of each other. 
""" return all(abs(x1[1] - y1[1]) <= _EPSILON for x1, y1 in zip(x, y)) def test_bigram2(): b = BigramCollocationFinder.from_words(SENT) assert sorted(b.ngram_fd.items()) == [ (("a", "a"), 1), (("a", "test"), 1), (("is", "a"), 1), (("is", "is"), 1), (("test", "test"), 1), (("this", "is"), 1), (("this", "this"), 1), ] assert sorted(b.word_fd.items()) == [("a", 2), ("is", 2), ("test", 2), ("this", 2)] assert len(SENT) == sum(b.word_fd.values()) == sum(b.ngram_fd.values()) + 1 assert close_enough( sorted(b.score_ngrams(BigramAssocMeasures.pmi)), [ (("a", "a"), 1.0), (("a", "test"), 1.0), (("is", "a"), 1.0), (("is", "is"), 1.0), (("test", "test"), 1.0), (("this", "is"), 1.0), (("this", "this"), 1.0), ], ) def test_bigram3(): b = BigramCollocationFinder.from_words(SENT, window_size=3) assert sorted(b.ngram_fd.items()) == sorted( [ (("a", "test"), 3), (("is", "a"), 3), (("this", "is"), 3), (("a", "a"), 1), (("is", "is"), 1), (("test", "test"), 1), (("this", "this"), 1), ] ) assert sorted(b.word_fd.items()) == sorted( [("a", 2), ("is", 2), ("test", 2), ("this", 2)] ) assert ( len(SENT) == sum(b.word_fd.values()) == (sum(b.ngram_fd.values()) + 2 + 1) / 2.0 ) assert close_enough( sorted(b.score_ngrams(BigramAssocMeasures.pmi)), sorted( [ (("a", "test"), 1.584962500721156), (("is", "a"), 1.584962500721156), (("this", "is"), 1.584962500721156), (("a", "a"), 0.0), (("is", "is"), 0.0), (("test", "test"), 0.0), (("this", "this"), 0.0), ] ), ) def test_bigram5(): b = BigramCollocationFinder.from_words(SENT, window_size=5) assert sorted(b.ngram_fd.items()) == sorted( [ (("a", "test"), 4), (("is", "a"), 4), (("this", "is"), 4), (("is", "test"), 3), (("this", "a"), 3), (("a", "a"), 1), (("is", "is"), 1), (("test", "test"), 1), (("this", "this"), 1), ] ) assert sorted(b.word_fd.items()) == sorted( [("a", 2), ("is", 2), ("test", 2), ("this", 2)] ) n_word_fd = sum(b.word_fd.values()) n_ngram_fd = (sum(b.ngram_fd.values()) + 4 + 3 + 2 + 1) / 4.0 assert len(SENT) == n_word_fd == n_ngram_fd assert close_enough( sorted(b.score_ngrams(BigramAssocMeasures.pmi)), sorted( [ (("a", "test"), 1.0), (("is", "a"), 1.0), (("this", "is"), 1.0), (("is", "test"), 0.5849625007211562), (("this", "a"), 0.5849625007211562), (("a", "a"), -1.0), (("is", "is"), -1.0), (("test", "test"), -1.0), (("this", "this"), -1.0), ] ), ) nltk-3.7/nltk/test/unit/test_concordance.py000066400000000000000000000076521420073152400211400ustar00rootroot00000000000000import contextlib import sys import unittest from io import StringIO from nltk.corpus import gutenberg from nltk.text import Text @contextlib.contextmanager def stdout_redirect(where): sys.stdout = where try: yield where finally: sys.stdout = sys.__stdout__ class TestConcordance(unittest.TestCase): """Text constructed using: https://www.nltk.org/book/ch01.html""" @classmethod def setUpClass(cls): cls.corpus = gutenberg.words("melville-moby_dick.txt") @classmethod def tearDownClass(cls): pass def setUp(self): self.text = Text(TestConcordance.corpus) self.query = "monstrous" self.maxDiff = None self.list_out = [ "ong the former , one was of a most monstrous size . ... This came towards us , ", 'ON OF THE PSALMS . " Touching that monstrous bulk of the whale or ork we have r', "ll over with a heathenish array of monstrous clubs and spears . Some were thick", "d as you gazed , and wondered what monstrous cannibal and savage could ever hav", "that has survived the flood ; most monstrous and most mountainous ! 
That Himmal", "they might scout at Moby Dick as a monstrous fable , or still worse and more de", "th of Radney .'\" CHAPTER 55 Of the Monstrous Pictures of Whales . I shall ere l", "ing Scenes . In connexion with the monstrous pictures of whales , I am strongly", "ere to enter upon those still more monstrous stories of them which are to be fo", "ght have been rummaged out of this monstrous cabinet there is no telling . But ", "of Whale - Bones ; for Whales of a monstrous size are oftentimes cast up dead u", ] def tearDown(self): pass def test_concordance_list(self): concordance_out = self.text.concordance_list(self.query) self.assertEqual(self.list_out, [c.line for c in concordance_out]) def test_concordance_width(self): list_out = [ "monstrous", "monstrous", "monstrous", "monstrous", "monstrous", "monstrous", "Monstrous", "monstrous", "monstrous", "monstrous", "monstrous", ] concordance_out = self.text.concordance_list(self.query, width=0) self.assertEqual(list_out, [c.query for c in concordance_out]) def test_concordance_lines(self): concordance_out = self.text.concordance_list(self.query, lines=3) self.assertEqual(self.list_out[:3], [c.line for c in concordance_out]) def test_concordance_print(self): print_out = """Displaying 11 of 11 matches: ong the former , one was of a most monstrous size . ... This came towards us , ON OF THE PSALMS . " Touching that monstrous bulk of the whale or ork we have r ll over with a heathenish array of monstrous clubs and spears . Some were thick d as you gazed , and wondered what monstrous cannibal and savage could ever hav that has survived the flood ; most monstrous and most mountainous ! That Himmal they might scout at Moby Dick as a monstrous fable , or still worse and more de th of Radney .'" CHAPTER 55 Of the Monstrous Pictures of Whales . I shall ere l ing Scenes . In connexion with the monstrous pictures of whales , I am strongly ere to enter upon those still more monstrous stories of them which are to be fo ght have been rummaged out of this monstrous cabinet there is no telling . But of Whale - Bones ; for Whales of a monstrous size are oftentimes cast up dead u """ with stdout_redirect(StringIO()) as stdout: self.text.concordance(self.query) def strip_space(raw_str): return raw_str.replace(" ", "") self.assertEqual(strip_space(print_out), strip_space(stdout.getvalue())) nltk-3.7/nltk/test/unit/test_corenlp.py000066400000000000000000001575541420073152400203330ustar00rootroot00000000000000""" Mock test for Stanford CoreNLP wrappers. """ from unittest import TestCase from unittest.mock import MagicMock import pytest from nltk.parse import corenlp from nltk.tree import Tree def setup_module(module): global server try: server = corenlp.CoreNLPServer(port=9000) except LookupError: pytest.skip("Could not instantiate CoreNLPServer.") try: server.start() except corenlp.CoreNLPServerError as e: pytest.skip( "Skipping CoreNLP tests because the server could not be started. " "Make sure that the 9000 port is free. 
" "{}".format(e.strerror) ) def teardown_module(module): server.stop() class TestTokenizerAPI(TestCase): def test_tokenize(self): corenlp_tokenizer = corenlp.CoreNLPParser() api_return_value = { "sentences": [ { "index": 0, "tokens": [ { "after": " ", "before": "", "characterOffsetBegin": 0, "characterOffsetEnd": 4, "index": 1, "originalText": "Good", "word": "Good", }, { "after": " ", "before": " ", "characterOffsetBegin": 5, "characterOffsetEnd": 12, "index": 2, "originalText": "muffins", "word": "muffins", }, { "after": " ", "before": " ", "characterOffsetBegin": 13, "characterOffsetEnd": 17, "index": 3, "originalText": "cost", "word": "cost", }, { "after": "", "before": " ", "characterOffsetBegin": 18, "characterOffsetEnd": 19, "index": 4, "originalText": "$", "word": "$", }, { "after": "\n", "before": "", "characterOffsetBegin": 19, "characterOffsetEnd": 23, "index": 5, "originalText": "3.88", "word": "3.88", }, { "after": " ", "before": "\n", "characterOffsetBegin": 24, "characterOffsetEnd": 26, "index": 6, "originalText": "in", "word": "in", }, { "after": " ", "before": " ", "characterOffsetBegin": 27, "characterOffsetEnd": 30, "index": 7, "originalText": "New", "word": "New", }, { "after": "", "before": " ", "characterOffsetBegin": 31, "characterOffsetEnd": 35, "index": 8, "originalText": "York", "word": "York", }, { "after": " ", "before": "", "characterOffsetBegin": 35, "characterOffsetEnd": 36, "index": 9, "originalText": ".", "word": ".", }, ], }, { "index": 1, "tokens": [ { "after": " ", "before": " ", "characterOffsetBegin": 38, "characterOffsetEnd": 44, "index": 1, "originalText": "Please", "word": "Please", }, { "after": " ", "before": " ", "characterOffsetBegin": 45, "characterOffsetEnd": 48, "index": 2, "originalText": "buy", "word": "buy", }, { "after": "\n", "before": " ", "characterOffsetBegin": 49, "characterOffsetEnd": 51, "index": 3, "originalText": "me", "word": "me", }, { "after": " ", "before": "\n", "characterOffsetBegin": 52, "characterOffsetEnd": 55, "index": 4, "originalText": "two", "word": "two", }, { "after": " ", "before": " ", "characterOffsetBegin": 56, "characterOffsetEnd": 58, "index": 5, "originalText": "of", "word": "of", }, { "after": "", "before": " ", "characterOffsetBegin": 59, "characterOffsetEnd": 63, "index": 6, "originalText": "them", "word": "them", }, { "after": "\n", "before": "", "characterOffsetBegin": 63, "characterOffsetEnd": 64, "index": 7, "originalText": ".", "word": ".", }, ], }, { "index": 2, "tokens": [ { "after": "", "before": "\n", "characterOffsetBegin": 65, "characterOffsetEnd": 71, "index": 1, "originalText": "Thanks", "word": "Thanks", }, { "after": "", "before": "", "characterOffsetBegin": 71, "characterOffsetEnd": 72, "index": 2, "originalText": ".", "word": ".", }, ], }, ] } corenlp_tokenizer.api_call = MagicMock(return_value=api_return_value) input_string = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks." expected_output = [ "Good", "muffins", "cost", "$", "3.88", "in", "New", "York", ".", "Please", "buy", "me", "two", "of", "them", ".", "Thanks", ".", ] tokenized_output = list(corenlp_tokenizer.tokenize(input_string)) corenlp_tokenizer.api_call.assert_called_once_with( "Good muffins cost $3.88\nin New York. 
Please buy me\ntwo of them.\nThanks.", properties={"annotators": "tokenize,ssplit"}, ) self.assertEqual(expected_output, tokenized_output) class TestTaggerAPI(TestCase): def test_pos_tagger(self): corenlp_tagger = corenlp.CoreNLPParser(tagtype="pos") api_return_value = { "sentences": [ { "basicDependencies": [ { "dep": "ROOT", "dependent": 1, "dependentGloss": "What", "governor": 0, "governorGloss": "ROOT", }, { "dep": "cop", "dependent": 2, "dependentGloss": "is", "governor": 1, "governorGloss": "What", }, { "dep": "det", "dependent": 3, "dependentGloss": "the", "governor": 4, "governorGloss": "airspeed", }, { "dep": "nsubj", "dependent": 4, "dependentGloss": "airspeed", "governor": 1, "governorGloss": "What", }, { "dep": "case", "dependent": 5, "dependentGloss": "of", "governor": 8, "governorGloss": "swallow", }, { "dep": "det", "dependent": 6, "dependentGloss": "an", "governor": 8, "governorGloss": "swallow", }, { "dep": "compound", "dependent": 7, "dependentGloss": "unladen", "governor": 8, "governorGloss": "swallow", }, { "dep": "nmod", "dependent": 8, "dependentGloss": "swallow", "governor": 4, "governorGloss": "airspeed", }, { "dep": "punct", "dependent": 9, "dependentGloss": "?", "governor": 1, "governorGloss": "What", }, ], "enhancedDependencies": [ { "dep": "ROOT", "dependent": 1, "dependentGloss": "What", "governor": 0, "governorGloss": "ROOT", }, { "dep": "cop", "dependent": 2, "dependentGloss": "is", "governor": 1, "governorGloss": "What", }, { "dep": "det", "dependent": 3, "dependentGloss": "the", "governor": 4, "governorGloss": "airspeed", }, { "dep": "nsubj", "dependent": 4, "dependentGloss": "airspeed", "governor": 1, "governorGloss": "What", }, { "dep": "case", "dependent": 5, "dependentGloss": "of", "governor": 8, "governorGloss": "swallow", }, { "dep": "det", "dependent": 6, "dependentGloss": "an", "governor": 8, "governorGloss": "swallow", }, { "dep": "compound", "dependent": 7, "dependentGloss": "unladen", "governor": 8, "governorGloss": "swallow", }, { "dep": "nmod:of", "dependent": 8, "dependentGloss": "swallow", "governor": 4, "governorGloss": "airspeed", }, { "dep": "punct", "dependent": 9, "dependentGloss": "?", "governor": 1, "governorGloss": "What", }, ], "enhancedPlusPlusDependencies": [ { "dep": "ROOT", "dependent": 1, "dependentGloss": "What", "governor": 0, "governorGloss": "ROOT", }, { "dep": "cop", "dependent": 2, "dependentGloss": "is", "governor": 1, "governorGloss": "What", }, { "dep": "det", "dependent": 3, "dependentGloss": "the", "governor": 4, "governorGloss": "airspeed", }, { "dep": "nsubj", "dependent": 4, "dependentGloss": "airspeed", "governor": 1, "governorGloss": "What", }, { "dep": "case", "dependent": 5, "dependentGloss": "of", "governor": 8, "governorGloss": "swallow", }, { "dep": "det", "dependent": 6, "dependentGloss": "an", "governor": 8, "governorGloss": "swallow", }, { "dep": "compound", "dependent": 7, "dependentGloss": "unladen", "governor": 8, "governorGloss": "swallow", }, { "dep": "nmod:of", "dependent": 8, "dependentGloss": "swallow", "governor": 4, "governorGloss": "airspeed", }, { "dep": "punct", "dependent": 9, "dependentGloss": "?", "governor": 1, "governorGloss": "What", }, ], "index": 0, "parse": "(ROOT\n (SBARQ\n (WHNP (WP What))\n (SQ (VBZ is)\n (NP\n (NP (DT the) (NN airspeed))\n (PP (IN of)\n (NP (DT an) (NN unladen) (NN swallow)))))\n (. 
?)))", "tokens": [ { "after": " ", "before": "", "characterOffsetBegin": 0, "characterOffsetEnd": 4, "index": 1, "lemma": "what", "originalText": "What", "pos": "WP", "word": "What", }, { "after": " ", "before": " ", "characterOffsetBegin": 5, "characterOffsetEnd": 7, "index": 2, "lemma": "be", "originalText": "is", "pos": "VBZ", "word": "is", }, { "after": " ", "before": " ", "characterOffsetBegin": 8, "characterOffsetEnd": 11, "index": 3, "lemma": "the", "originalText": "the", "pos": "DT", "word": "the", }, { "after": " ", "before": " ", "characterOffsetBegin": 12, "characterOffsetEnd": 20, "index": 4, "lemma": "airspeed", "originalText": "airspeed", "pos": "NN", "word": "airspeed", }, { "after": " ", "before": " ", "characterOffsetBegin": 21, "characterOffsetEnd": 23, "index": 5, "lemma": "of", "originalText": "of", "pos": "IN", "word": "of", }, { "after": " ", "before": " ", "characterOffsetBegin": 24, "characterOffsetEnd": 26, "index": 6, "lemma": "a", "originalText": "an", "pos": "DT", "word": "an", }, { "after": " ", "before": " ", "characterOffsetBegin": 27, "characterOffsetEnd": 34, "index": 7, "lemma": "unladen", "originalText": "unladen", "pos": "JJ", "word": "unladen", }, { "after": " ", "before": " ", "characterOffsetBegin": 35, "characterOffsetEnd": 42, "index": 8, "lemma": "swallow", "originalText": "swallow", "pos": "VB", "word": "swallow", }, { "after": "", "before": " ", "characterOffsetBegin": 43, "characterOffsetEnd": 44, "index": 9, "lemma": "?", "originalText": "?", "pos": ".", "word": "?", }, ], } ] } corenlp_tagger.api_call = MagicMock(return_value=api_return_value) input_tokens = "What is the airspeed of an unladen swallow ?".split() expected_output = [ ("What", "WP"), ("is", "VBZ"), ("the", "DT"), ("airspeed", "NN"), ("of", "IN"), ("an", "DT"), ("unladen", "JJ"), ("swallow", "VB"), ("?", "."), ] tagged_output = corenlp_tagger.tag(input_tokens) corenlp_tagger.api_call.assert_called_once_with( "What is the airspeed of an unladen swallow ?", properties={ "ssplit.isOneSentence": "true", "annotators": "tokenize,ssplit,pos", }, ) self.assertEqual(expected_output, tagged_output) def test_ner_tagger(self): corenlp_tagger = corenlp.CoreNLPParser(tagtype="ner") api_return_value = { "sentences": [ { "index": 0, "tokens": [ { "after": " ", "before": "", "characterOffsetBegin": 0, "characterOffsetEnd": 4, "index": 1, "lemma": "Rami", "ner": "PERSON", "originalText": "Rami", "pos": "NNP", "word": "Rami", }, { "after": " ", "before": " ", "characterOffsetBegin": 5, "characterOffsetEnd": 8, "index": 2, "lemma": "Eid", "ner": "PERSON", "originalText": "Eid", "pos": "NNP", "word": "Eid", }, { "after": " ", "before": " ", "characterOffsetBegin": 9, "characterOffsetEnd": 11, "index": 3, "lemma": "be", "ner": "O", "originalText": "is", "pos": "VBZ", "word": "is", }, { "after": " ", "before": " ", "characterOffsetBegin": 12, "characterOffsetEnd": 20, "index": 4, "lemma": "study", "ner": "O", "originalText": "studying", "pos": "VBG", "word": "studying", }, { "after": " ", "before": " ", "characterOffsetBegin": 21, "characterOffsetEnd": 23, "index": 5, "lemma": "at", "ner": "O", "originalText": "at", "pos": "IN", "word": "at", }, { "after": " ", "before": " ", "characterOffsetBegin": 24, "characterOffsetEnd": 29, "index": 6, "lemma": "Stony", "ner": "ORGANIZATION", "originalText": "Stony", "pos": "NNP", "word": "Stony", }, { "after": " ", "before": " ", "characterOffsetBegin": 30, "characterOffsetEnd": 35, "index": 7, "lemma": "Brook", "ner": "ORGANIZATION", "originalText": "Brook", 
"pos": "NNP", "word": "Brook", }, { "after": " ", "before": " ", "characterOffsetBegin": 36, "characterOffsetEnd": 46, "index": 8, "lemma": "University", "ner": "ORGANIZATION", "originalText": "University", "pos": "NNP", "word": "University", }, { "after": " ", "before": " ", "characterOffsetBegin": 47, "characterOffsetEnd": 49, "index": 9, "lemma": "in", "ner": "O", "originalText": "in", "pos": "IN", "word": "in", }, { "after": "", "before": " ", "characterOffsetBegin": 50, "characterOffsetEnd": 52, "index": 10, "lemma": "NY", "ner": "O", "originalText": "NY", "pos": "NNP", "word": "NY", }, ], } ] } corenlp_tagger.api_call = MagicMock(return_value=api_return_value) input_tokens = "Rami Eid is studying at Stony Brook University in NY".split() expected_output = [ ("Rami", "PERSON"), ("Eid", "PERSON"), ("is", "O"), ("studying", "O"), ("at", "O"), ("Stony", "ORGANIZATION"), ("Brook", "ORGANIZATION"), ("University", "ORGANIZATION"), ("in", "O"), ("NY", "O"), ] tagged_output = corenlp_tagger.tag(input_tokens) corenlp_tagger.api_call.assert_called_once_with( "Rami Eid is studying at Stony Brook University in NY", properties={ "ssplit.isOneSentence": "true", "annotators": "tokenize,ssplit,ner", }, ) self.assertEqual(expected_output, tagged_output) def test_unexpected_tagtype(self): with self.assertRaises(ValueError): corenlp_tagger = corenlp.CoreNLPParser(tagtype="test") class TestParserAPI(TestCase): def test_parse(self): corenlp_parser = corenlp.CoreNLPParser() api_return_value = { "sentences": [ { "basicDependencies": [ { "dep": "ROOT", "dependent": 4, "dependentGloss": "fox", "governor": 0, "governorGloss": "ROOT", }, { "dep": "det", "dependent": 1, "dependentGloss": "The", "governor": 4, "governorGloss": "fox", }, { "dep": "amod", "dependent": 2, "dependentGloss": "quick", "governor": 4, "governorGloss": "fox", }, { "dep": "amod", "dependent": 3, "dependentGloss": "brown", "governor": 4, "governorGloss": "fox", }, { "dep": "dep", "dependent": 5, "dependentGloss": "jumps", "governor": 4, "governorGloss": "fox", }, { "dep": "case", "dependent": 6, "dependentGloss": "over", "governor": 9, "governorGloss": "dog", }, { "dep": "det", "dependent": 7, "dependentGloss": "the", "governor": 9, "governorGloss": "dog", }, { "dep": "amod", "dependent": 8, "dependentGloss": "lazy", "governor": 9, "governorGloss": "dog", }, { "dep": "nmod", "dependent": 9, "dependentGloss": "dog", "governor": 5, "governorGloss": "jumps", }, ], "enhancedDependencies": [ { "dep": "ROOT", "dependent": 4, "dependentGloss": "fox", "governor": 0, "governorGloss": "ROOT", }, { "dep": "det", "dependent": 1, "dependentGloss": "The", "governor": 4, "governorGloss": "fox", }, { "dep": "amod", "dependent": 2, "dependentGloss": "quick", "governor": 4, "governorGloss": "fox", }, { "dep": "amod", "dependent": 3, "dependentGloss": "brown", "governor": 4, "governorGloss": "fox", }, { "dep": "dep", "dependent": 5, "dependentGloss": "jumps", "governor": 4, "governorGloss": "fox", }, { "dep": "case", "dependent": 6, "dependentGloss": "over", "governor": 9, "governorGloss": "dog", }, { "dep": "det", "dependent": 7, "dependentGloss": "the", "governor": 9, "governorGloss": "dog", }, { "dep": "amod", "dependent": 8, "dependentGloss": "lazy", "governor": 9, "governorGloss": "dog", }, { "dep": "nmod:over", "dependent": 9, "dependentGloss": "dog", "governor": 5, "governorGloss": "jumps", }, ], "enhancedPlusPlusDependencies": [ { "dep": "ROOT", "dependent": 4, "dependentGloss": "fox", "governor": 0, "governorGloss": "ROOT", }, { "dep": "det", 
"dependent": 1, "dependentGloss": "The", "governor": 4, "governorGloss": "fox", }, { "dep": "amod", "dependent": 2, "dependentGloss": "quick", "governor": 4, "governorGloss": "fox", }, { "dep": "amod", "dependent": 3, "dependentGloss": "brown", "governor": 4, "governorGloss": "fox", }, { "dep": "dep", "dependent": 5, "dependentGloss": "jumps", "governor": 4, "governorGloss": "fox", }, { "dep": "case", "dependent": 6, "dependentGloss": "over", "governor": 9, "governorGloss": "dog", }, { "dep": "det", "dependent": 7, "dependentGloss": "the", "governor": 9, "governorGloss": "dog", }, { "dep": "amod", "dependent": 8, "dependentGloss": "lazy", "governor": 9, "governorGloss": "dog", }, { "dep": "nmod:over", "dependent": 9, "dependentGloss": "dog", "governor": 5, "governorGloss": "jumps", }, ], "index": 0, "parse": "(ROOT\n (NP\n (NP (DT The) (JJ quick) (JJ brown) (NN fox))\n (NP\n (NP (NNS jumps))\n (PP (IN over)\n (NP (DT the) (JJ lazy) (NN dog))))))", "tokens": [ { "after": " ", "before": "", "characterOffsetBegin": 0, "characterOffsetEnd": 3, "index": 1, "lemma": "the", "originalText": "The", "pos": "DT", "word": "The", }, { "after": " ", "before": " ", "characterOffsetBegin": 4, "characterOffsetEnd": 9, "index": 2, "lemma": "quick", "originalText": "quick", "pos": "JJ", "word": "quick", }, { "after": " ", "before": " ", "characterOffsetBegin": 10, "characterOffsetEnd": 15, "index": 3, "lemma": "brown", "originalText": "brown", "pos": "JJ", "word": "brown", }, { "after": " ", "before": " ", "characterOffsetBegin": 16, "characterOffsetEnd": 19, "index": 4, "lemma": "fox", "originalText": "fox", "pos": "NN", "word": "fox", }, { "after": " ", "before": " ", "characterOffsetBegin": 20, "characterOffsetEnd": 25, "index": 5, "lemma": "jump", "originalText": "jumps", "pos": "VBZ", "word": "jumps", }, { "after": " ", "before": " ", "characterOffsetBegin": 26, "characterOffsetEnd": 30, "index": 6, "lemma": "over", "originalText": "over", "pos": "IN", "word": "over", }, { "after": " ", "before": " ", "characterOffsetBegin": 31, "characterOffsetEnd": 34, "index": 7, "lemma": "the", "originalText": "the", "pos": "DT", "word": "the", }, { "after": " ", "before": " ", "characterOffsetBegin": 35, "characterOffsetEnd": 39, "index": 8, "lemma": "lazy", "originalText": "lazy", "pos": "JJ", "word": "lazy", }, { "after": "", "before": " ", "characterOffsetBegin": 40, "characterOffsetEnd": 43, "index": 9, "lemma": "dog", "originalText": "dog", "pos": "NN", "word": "dog", }, ], } ] } corenlp_parser.api_call = MagicMock(return_value=api_return_value) input_string = "The quick brown fox jumps over the lazy dog".split() expected_output = Tree( "ROOT", [ Tree( "NP", [ Tree( "NP", [ Tree("DT", ["The"]), Tree("JJ", ["quick"]), Tree("JJ", ["brown"]), Tree("NN", ["fox"]), ], ), Tree( "NP", [ Tree("NP", [Tree("NNS", ["jumps"])]), Tree( "PP", [ Tree("IN", ["over"]), Tree( "NP", [ Tree("DT", ["the"]), Tree("JJ", ["lazy"]), Tree("NN", ["dog"]), ], ), ], ), ], ), ], ) ], ) parsed_data = next(corenlp_parser.parse(input_string)) corenlp_parser.api_call.assert_called_once_with( "The quick brown fox jumps over the lazy dog", properties={"ssplit.eolonly": "true"}, ) self.assertEqual(expected_output, parsed_data) def test_dependency_parser(self): corenlp_parser = corenlp.CoreNLPDependencyParser() api_return_value = { "sentences": [ { "basicDependencies": [ { "dep": "ROOT", "dependent": 5, "dependentGloss": "jumps", "governor": 0, "governorGloss": "ROOT", }, { "dep": "det", "dependent": 1, "dependentGloss": "The", "governor": 4, 
"governorGloss": "fox", }, { "dep": "amod", "dependent": 2, "dependentGloss": "quick", "governor": 4, "governorGloss": "fox", }, { "dep": "amod", "dependent": 3, "dependentGloss": "brown", "governor": 4, "governorGloss": "fox", }, { "dep": "nsubj", "dependent": 4, "dependentGloss": "fox", "governor": 5, "governorGloss": "jumps", }, { "dep": "case", "dependent": 6, "dependentGloss": "over", "governor": 9, "governorGloss": "dog", }, { "dep": "det", "dependent": 7, "dependentGloss": "the", "governor": 9, "governorGloss": "dog", }, { "dep": "amod", "dependent": 8, "dependentGloss": "lazy", "governor": 9, "governorGloss": "dog", }, { "dep": "nmod", "dependent": 9, "dependentGloss": "dog", "governor": 5, "governorGloss": "jumps", }, ], "enhancedDependencies": [ { "dep": "ROOT", "dependent": 5, "dependentGloss": "jumps", "governor": 0, "governorGloss": "ROOT", }, { "dep": "det", "dependent": 1, "dependentGloss": "The", "governor": 4, "governorGloss": "fox", }, { "dep": "amod", "dependent": 2, "dependentGloss": "quick", "governor": 4, "governorGloss": "fox", }, { "dep": "amod", "dependent": 3, "dependentGloss": "brown", "governor": 4, "governorGloss": "fox", }, { "dep": "nsubj", "dependent": 4, "dependentGloss": "fox", "governor": 5, "governorGloss": "jumps", }, { "dep": "case", "dependent": 6, "dependentGloss": "over", "governor": 9, "governorGloss": "dog", }, { "dep": "det", "dependent": 7, "dependentGloss": "the", "governor": 9, "governorGloss": "dog", }, { "dep": "amod", "dependent": 8, "dependentGloss": "lazy", "governor": 9, "governorGloss": "dog", }, { "dep": "nmod:over", "dependent": 9, "dependentGloss": "dog", "governor": 5, "governorGloss": "jumps", }, ], "enhancedPlusPlusDependencies": [ { "dep": "ROOT", "dependent": 5, "dependentGloss": "jumps", "governor": 0, "governorGloss": "ROOT", }, { "dep": "det", "dependent": 1, "dependentGloss": "The", "governor": 4, "governorGloss": "fox", }, { "dep": "amod", "dependent": 2, "dependentGloss": "quick", "governor": 4, "governorGloss": "fox", }, { "dep": "amod", "dependent": 3, "dependentGloss": "brown", "governor": 4, "governorGloss": "fox", }, { "dep": "nsubj", "dependent": 4, "dependentGloss": "fox", "governor": 5, "governorGloss": "jumps", }, { "dep": "case", "dependent": 6, "dependentGloss": "over", "governor": 9, "governorGloss": "dog", }, { "dep": "det", "dependent": 7, "dependentGloss": "the", "governor": 9, "governorGloss": "dog", }, { "dep": "amod", "dependent": 8, "dependentGloss": "lazy", "governor": 9, "governorGloss": "dog", }, { "dep": "nmod:over", "dependent": 9, "dependentGloss": "dog", "governor": 5, "governorGloss": "jumps", }, ], "index": 0, "tokens": [ { "after": " ", "before": "", "characterOffsetBegin": 0, "characterOffsetEnd": 3, "index": 1, "lemma": "the", "originalText": "The", "pos": "DT", "word": "The", }, { "after": " ", "before": " ", "characterOffsetBegin": 4, "characterOffsetEnd": 9, "index": 2, "lemma": "quick", "originalText": "quick", "pos": "JJ", "word": "quick", }, { "after": " ", "before": " ", "characterOffsetBegin": 10, "characterOffsetEnd": 15, "index": 3, "lemma": "brown", "originalText": "brown", "pos": "JJ", "word": "brown", }, { "after": " ", "before": " ", "characterOffsetBegin": 16, "characterOffsetEnd": 19, "index": 4, "lemma": "fox", "originalText": "fox", "pos": "NN", "word": "fox", }, { "after": " ", "before": " ", "characterOffsetBegin": 20, "characterOffsetEnd": 25, "index": 5, "lemma": "jump", "originalText": "jumps", "pos": "VBZ", "word": "jumps", }, { "after": " ", "before": " ", 
"characterOffsetBegin": 26, "characterOffsetEnd": 30, "index": 6, "lemma": "over", "originalText": "over", "pos": "IN", "word": "over", }, { "after": " ", "before": " ", "characterOffsetBegin": 31, "characterOffsetEnd": 34, "index": 7, "lemma": "the", "originalText": "the", "pos": "DT", "word": "the", }, { "after": " ", "before": " ", "characterOffsetBegin": 35, "characterOffsetEnd": 39, "index": 8, "lemma": "lazy", "originalText": "lazy", "pos": "JJ", "word": "lazy", }, { "after": "", "before": " ", "characterOffsetBegin": 40, "characterOffsetEnd": 43, "index": 9, "lemma": "dog", "originalText": "dog", "pos": "NN", "word": "dog", }, ], } ] } corenlp_parser.api_call = MagicMock(return_value=api_return_value) input_string = "The quick brown fox jumps over the lazy dog".split() expected_output = Tree( "jumps", [ Tree("fox", ["The", "quick", "brown"]), Tree("dog", ["over", "the", "lazy"]), ], ) parsed_data = next(corenlp_parser.parse(input_string)) corenlp_parser.api_call.assert_called_once_with( "The quick brown fox jumps over the lazy dog", properties={"ssplit.eolonly": "true"}, ) self.assertEqual(expected_output, parsed_data.tree()) nltk-3.7/nltk/test/unit/test_corpora.py000066400000000000000000000217511420073152400203230ustar00rootroot00000000000000import unittest import pytest from nltk.corpus import ( # mwa_ppdb cess_cat, cess_esp, conll2007, floresta, indian, ptb, sinica_treebank, udhr, ) from nltk.tree import Tree class TestUdhr(unittest.TestCase): def test_words(self): for name in udhr.fileids(): words = list(udhr.words(name)) self.assertTrue(words) def test_raw_unicode(self): for name in udhr.fileids(): txt = udhr.raw(name) assert not isinstance(txt, bytes), name class TestIndian(unittest.TestCase): def test_words(self): words = indian.words()[:3] self.assertEqual(words, ["মহিষের", "সন্তান", ":"]) def test_tagged_words(self): tagged_words = indian.tagged_words()[:3] self.assertEqual( tagged_words, [("মহিষের", "NN"), ("সন্তান", "NN"), (":", "SYM")] ) class TestCess(unittest.TestCase): def test_catalan(self): words = cess_cat.words()[:15] txt = "El Tribunal_Suprem -Fpa- TS -Fpt- ha confirmat la condemna a quatre anys d' inhabilitació especial" self.assertEqual(words, txt.split()) self.assertEqual(cess_cat.tagged_sents()[0][34][0], "càrrecs") def test_esp(self): words = cess_esp.words()[:15] txt = "El grupo estatal Electricité_de_France -Fpa- EDF -Fpt- anunció hoy , jueves , la compra del" self.assertEqual(words, txt.split()) self.assertEqual(cess_esp.words()[115], "años") class TestFloresta(unittest.TestCase): def test_words(self): words = floresta.words()[:10] txt = "Um revivalismo refrescante O 7_e_Meio é um ex-libris de a" self.assertEqual(words, txt.split()) class TestSinicaTreebank(unittest.TestCase): def test_sents(self): first_3_sents = sinica_treebank.sents()[:3] self.assertEqual( first_3_sents, [["一"], ["友情"], ["嘉珍", "和", "我", "住在", "同一條", "巷子"]] ) def test_parsed_sents(self): parsed_sents = sinica_treebank.parsed_sents()[25] self.assertEqual( parsed_sents, Tree( "S", [ Tree("NP", [Tree("Nba", ["嘉珍"])]), Tree("V‧地", [Tree("VA11", ["不停"]), Tree("DE", ["的"])]), Tree("VA4", ["哭泣"]), ], ), ) class TestCoNLL2007(unittest.TestCase): # Reading the CoNLL 2007 Dependency Treebanks def test_sents(self): sents = conll2007.sents("esp.train")[0] self.assertEqual( sents[:6], ["El", "aumento", "del", "índice", "de", "desempleo"] ) def test_parsed_sents(self): parsed_sents = conll2007.parsed_sents("esp.train")[0] self.assertEqual( parsed_sents.tree(), Tree( "fortaleció", [ Tree( "aumento", [ 
"El", Tree( "del", [ Tree( "índice", [ Tree( "de", [Tree("desempleo", ["estadounidense"])], ) ], ) ], ), ], ), "hoy", "considerablemente", Tree( "al", [ Tree( "euro", [ Tree( "cotizaba", [ ",", "que", Tree("a", [Tree("15.35", ["las", "GMT"])]), "se", Tree( "en", [ Tree( "mercado", [ "el", Tree("de", ["divisas"]), Tree("de", ["Fráncfort"]), ], ) ], ), Tree("a", ["0,9452_dólares"]), Tree( "frente_a", [ ",", Tree( "0,9349_dólares", [ "los", Tree( "de", [ Tree( "mañana", ["esta"], ) ], ), ], ), ], ), ], ) ], ) ], ), ".", ], ), ) @pytest.mark.skipif( not ptb.fileids(), reason="A full installation of the Penn Treebank is not available", ) class TestPTB(unittest.TestCase): def test_fileids(self): self.assertEqual( ptb.fileids()[:4], [ "BROWN/CF/CF01.MRG", "BROWN/CF/CF02.MRG", "BROWN/CF/CF03.MRG", "BROWN/CF/CF04.MRG", ], ) def test_words(self): self.assertEqual( ptb.words("WSJ/00/WSJ_0003.MRG")[:7], ["A", "form", "of", "asbestos", "once", "used", "*"], ) def test_tagged_words(self): self.assertEqual( ptb.tagged_words("WSJ/00/WSJ_0003.MRG")[:3], [("A", "DT"), ("form", "NN"), ("of", "IN")], ) def test_categories(self): self.assertEqual( ptb.categories(), [ "adventure", "belles_lettres", "fiction", "humor", "lore", "mystery", "news", "romance", "science_fiction", ], ) def test_news_fileids(self): self.assertEqual( ptb.fileids("news")[:3], ["WSJ/00/WSJ_0001.MRG", "WSJ/00/WSJ_0002.MRG", "WSJ/00/WSJ_0003.MRG"], ) def test_category_words(self): self.assertEqual( ptb.words(categories=["humor", "fiction"])[:6], ["Thirty-three", "Scotty", "did", "not", "go", "back"], ) @pytest.mark.skip("Skipping test for mwa_ppdb.") class TestMWAPPDB(unittest.TestCase): def test_fileids(self): self.assertEqual( mwa_ppdb.fileids(), ["ppdb-1.0-xxxl-lexical.extended.synonyms.uniquepairs"] ) def test_entries(self): self.assertEqual( mwa_ppdb.entries()[:10], [ ("10/17/01", "17/10/2001"), ("102,70", "102.70"), ("13,53", "13.53"), ("3.2.5.3.2.1", "3.2.5.3.2.1."), ("53,76", "53.76"), ("6.9.5", "6.9.5."), ("7.7.6.3", "7.7.6.3."), ("76,20", "76.20"), ("79,85", "79.85"), ("93,65", "93.65"), ], ) nltk-3.7/nltk/test/unit/test_corpus_views.py000066400000000000000000000030201420073152400213730ustar00rootroot00000000000000""" Corpus View Regression Tests """ import unittest import nltk.data from nltk.corpus.reader.util import ( StreamBackedCorpusView, read_line_block, read_whitespace_block, ) class TestCorpusViews(unittest.TestCase): linetok = nltk.LineTokenizer(blanklines="keep") names = [ "corpora/inaugural/README", # A very short file (160 chars) "corpora/inaugural/1793-Washington.txt", # A relatively short file (791 chars) "corpora/inaugural/1909-Taft.txt", # A longer file (32k chars) ] def data(self): for name in self.names: f = nltk.data.find(name) with f.open() as fp: file_data = fp.read().decode("utf8") yield f, file_data def test_correct_values(self): # Check that corpus views produce the correct sequence of values. 
for f, file_data in self.data(): v = StreamBackedCorpusView(f, read_whitespace_block) self.assertEqual(list(v), file_data.split()) v = StreamBackedCorpusView(f, read_line_block) self.assertEqual(list(v), self.linetok.tokenize(file_data)) def test_correct_length(self): # Check that the corpus views report the correct lengths: for f, file_data in self.data(): v = StreamBackedCorpusView(f, read_whitespace_block) self.assertEqual(len(v), len(file_data.split())) v = StreamBackedCorpusView(f, read_line_block) self.assertEqual(len(v), len(self.linetok.tokenize(file_data))) nltk-3.7/nltk/test/unit/test_data.py000066400000000000000000000005671420073152400175710ustar00rootroot00000000000000import pytest import nltk.data def test_find_raises_exception(): with pytest.raises(LookupError): nltk.data.find("no_such_resource/foo") def test_find_raises_exception_with_full_resource_name(): no_such_thing = "no_such_thing/bar" with pytest.raises(LookupError) as exc: nltk.data.find(no_such_thing) assert no_such_thing in str(exc) nltk-3.7/nltk/test/unit/test_disagreement.py000066400000000000000000000103351420073152400213210ustar00rootroot00000000000000import unittest from nltk.metrics.agreement import AnnotationTask class TestDisagreement(unittest.TestCase): """ Class containing unit tests for nltk.metrics.agreement.Disagreement. """ def test_easy(self): """ Simple test, based on https://github.com/foolswood/krippendorffs_alpha/raw/master/krippendorff.pdf. """ data = [ ("coder1", "dress1", "YES"), ("coder2", "dress1", "NO"), ("coder3", "dress1", "NO"), ("coder1", "dress2", "YES"), ("coder2", "dress2", "NO"), ("coder3", "dress3", "NO"), ] annotation_task = AnnotationTask(data) self.assertAlmostEqual(annotation_task.alpha(), -0.3333333) def test_easy2(self): """ Same simple test with 1 rating removed. Removal of that rating should not matter: K-Apha ignores items with only 1 rating. """ data = [ ("coder1", "dress1", "YES"), ("coder2", "dress1", "NO"), ("coder3", "dress1", "NO"), ("coder1", "dress2", "YES"), ("coder2", "dress2", "NO"), ] annotation_task = AnnotationTask(data) self.assertAlmostEqual(annotation_task.alpha(), -0.3333333) def test_advanced(self): """ More advanced test, based on http://www.agreestat.com/research_papers/onkrippendorffalpha.pdf """ data = [ ("A", "1", "1"), ("B", "1", "1"), ("D", "1", "1"), ("A", "2", "2"), ("B", "2", "2"), ("C", "2", "3"), ("D", "2", "2"), ("A", "3", "3"), ("B", "3", "3"), ("C", "3", "3"), ("D", "3", "3"), ("A", "4", "3"), ("B", "4", "3"), ("C", "4", "3"), ("D", "4", "3"), ("A", "5", "2"), ("B", "5", "2"), ("C", "5", "2"), ("D", "5", "2"), ("A", "6", "1"), ("B", "6", "2"), ("C", "6", "3"), ("D", "6", "4"), ("A", "7", "4"), ("B", "7", "4"), ("C", "7", "4"), ("D", "7", "4"), ("A", "8", "1"), ("B", "8", "1"), ("C", "8", "2"), ("D", "8", "1"), ("A", "9", "2"), ("B", "9", "2"), ("C", "9", "2"), ("D", "9", "2"), ("B", "10", "5"), ("C", "10", "5"), ("D", "10", "5"), ("C", "11", "1"), ("D", "11", "1"), ("C", "12", "3"), ] annotation_task = AnnotationTask(data) self.assertAlmostEqual(annotation_task.alpha(), 0.743421052632) def test_advanced2(self): """ Same more advanced example, but with 1 rating removed. Again, removal of that 1 rating should not matter. 
""" data = [ ("A", "1", "1"), ("B", "1", "1"), ("D", "1", "1"), ("A", "2", "2"), ("B", "2", "2"), ("C", "2", "3"), ("D", "2", "2"), ("A", "3", "3"), ("B", "3", "3"), ("C", "3", "3"), ("D", "3", "3"), ("A", "4", "3"), ("B", "4", "3"), ("C", "4", "3"), ("D", "4", "3"), ("A", "5", "2"), ("B", "5", "2"), ("C", "5", "2"), ("D", "5", "2"), ("A", "6", "1"), ("B", "6", "2"), ("C", "6", "3"), ("D", "6", "4"), ("A", "7", "4"), ("B", "7", "4"), ("C", "7", "4"), ("D", "7", "4"), ("A", "8", "1"), ("B", "8", "1"), ("C", "8", "2"), ("D", "8", "1"), ("A", "9", "2"), ("B", "9", "2"), ("C", "9", "2"), ("D", "9", "2"), ("B", "10", "5"), ("C", "10", "5"), ("D", "10", "5"), ("C", "11", "1"), ("D", "11", "1"), ("C", "12", "3"), ] annotation_task = AnnotationTask(data) self.assertAlmostEqual(annotation_task.alpha(), 0.743421052632) nltk-3.7/nltk/test/unit/test_distance.py000066400000000000000000000131161420073152400204440ustar00rootroot00000000000000from typing import Tuple import pytest from nltk.metrics.distance import edit_distance class TestEditDistance: @pytest.mark.parametrize( "left,right,substitution_cost,expecteds", [ # Allowing transpositions reduces the number of edits required. # with transpositions: # e.g. "abc" -T-> "cba" -D-> "ca": 2 steps # # without transpositions: # e.g. "abc" -D-> "ab" -D-> "a" -I-> "ca": 3 steps ("abc", "ca", 1, (2, 3)), ("abc", "ca", 5, (2, 3)), # Doesn't *require* substitutions # Note, a substition_cost of higher than 2 doesn't make much # sense, as a deletion + insertion is identical, and always # costs 2. # # # Transpositions don't always reduce the number of edits required: # with or without transpositions: # e.g. "wants" -D-> "wats" -D-> "was" -I-> "wasp": 3 steps ("wants", "wasp", 1, (3, 3)), ("wants", "wasp", 5, (3, 3)), # Doesn't *require* substitutions # # # Ought to have the same results with and without transpositions # with or without transpositions: # e.g. "rain" -S-> "sain" -S-> "shin" -I-> "shine": 3 steps # (but cost 5 if substitution_cost=2) ("rain", "shine", 1, (3, 3)), ("rain", "shine", 2, (5, 5)), # Does *require* substitutions # # # Several potentially interesting typos # with transpositions: # e.g. "acbdef" -T-> "abcdef": 1 step # # without transpositions: # e.g. "acbdef" -D-> "abdef" -I-> "abcdef": 2 steps ("acbdef", "abcdef", 1, (1, 2)), ("acbdef", "abcdef", 2, (1, 2)), # Doesn't *require* substitutions # # # with transpositions: # e.g. "lnaguaeg" -T-> "languaeg" -T-> "language": 2 steps # # without transpositions: # e.g. "lnaguaeg" -D-> "laguaeg" -I-> "languaeg" -D-> "languag" -I-> "language": 4 steps ("lnaguaeg", "language", 1, (2, 4)), ("lnaguaeg", "language", 2, (2, 4)), # Doesn't *require* substitutions # # # with transpositions: # e.g. "lnaugage" -T-> "lanugage" -T-> "language": 2 steps # # without transpositions: # e.g. "lnaugage" -S-> "lnangage" -D-> "langage" -I-> "language": 3 steps # (but one substitution, so a cost of 4 if substition_cost = 2) ("lnaugage", "language", 1, (2, 3)), ("lnaugage", "language", 2, (2, 4)), # Does *require* substitutions if no transpositions # # # with transpositions: # e.g. "lngauage" -T-> "lnaguage" -T-> "language": 2 steps # without transpositions: # e.g. "lngauage" -I-> "lanaguage" -D-> "language": 2 steps ("lngauage", "language", 1, (2, 2)), ("lngauage", "language", 2, (2, 2)), # Doesn't *require* substitutions # # # with or without transpositions: # e.g. "wants" -S-> "sants" -S-> "swnts" -S-> "swits" -S-> "swims" -D-> "swim": 5 steps # # with substitution_cost=2 and transpositions: # e.g. 
"wants" -T-> "santw" -D-> "sntw" -D-> "stw" -D-> "sw" # -I-> "swi" -I-> "swim": 6 steps # # with substitution_cost=2 and no transpositions: # e.g. "wants" -I-> "swants" -D-> "swant" -D-> "swan" -D-> "swa" -D-> "sw" # -I-> "swi" -I-> "swim": 7 steps ("wants", "swim", 1, (5, 5)), ("wants", "swim", 2, (6, 7)), # # # with or without transpositions: # e.g. "kitten" -S-> "sitten" -s-> "sittin" -I-> "sitting": 3 steps # (but cost 5 if substitution_cost=2) ("kitten", "sitting", 1, (3, 3)), ("kitten", "sitting", 2, (5, 5)), # # duplicated letter # e.g. "duplicated" -D-> "duplicated" ("duplicated", "duuplicated", 1, (1, 1)), ("duplicated", "duuplicated", 2, (1, 1)), ("very duplicated", "very duuplicateed", 2, (2, 2)), ], ) def test_with_transpositions( self, left: str, right: str, substitution_cost: int, expecteds: Tuple[int, int] ): """ Test `edit_distance` between two strings, given some `substitution_cost`, and whether transpositions are allowed. :param str left: First input string to `edit_distance`. :param str right: Second input string to `edit_distance`. :param int substitution_cost: The cost of a substitution action in `edit_distance`. :param Tuple[int, int] expecteds: A tuple of expected outputs, such that `expecteds[0]` is the expected output with `transpositions=True`, and `expecteds[1]` is the expected output with `transpositions=False`. """ # Test the input strings in both orderings for s1, s2 in ((left, right), (right, left)): # zip with [True, False] to get the transpositions value for expected, transpositions in zip(expecteds, [True, False]): predicted = edit_distance( s1, s2, substitution_cost=substitution_cost, transpositions=transpositions, ) assert predicted == expected nltk-3.7/nltk/test/unit/test_downloader.py000066400000000000000000000013221420073152400210040ustar00rootroot00000000000000from nltk import download def test_downloader_using_existing_parent_download_dir(tmp_path): """Test that download works properly when the parent folder of the download_dir exists""" download_dir = str(tmp_path.joinpath("another_dir")) download_status = download("mwa_ppdb", download_dir) assert download_status is True def test_downloader_using_non_existing_parent_download_dir(tmp_path): """Test that download works properly when the parent folder of the download_dir does not exist""" download_dir = str( tmp_path.joinpath("non-existing-parent-folder", "another-non-existing-folder") ) download_status = download("mwa_ppdb", download_dir) assert download_status is True nltk-3.7/nltk/test/unit/test_freqdist.py000066400000000000000000000003131420073152400204660ustar00rootroot00000000000000import nltk def test_iterating_returns_an_iterator_ordered_by_frequency(): samples = ["one", "two", "two"] distribution = nltk.FreqDist(samples) assert list(distribution) == ["two", "one"] nltk-3.7/nltk/test/unit/test_hmm.py000066400000000000000000000042331420073152400174330ustar00rootroot00000000000000import pytest from nltk.tag import hmm def _wikipedia_example_hmm(): # Example from wikipedia # (https://en.wikipedia.org/wiki/Forward%E2%80%93backward_algorithm) states = ["rain", "no rain"] symbols = ["umbrella", "no umbrella"] A = [[0.7, 0.3], [0.3, 0.7]] # transition probabilities B = [[0.9, 0.1], [0.2, 0.8]] # emission probabilities pi = [0.5, 0.5] # initial probabilities seq = ["umbrella", "umbrella", "no umbrella", "umbrella", "umbrella"] seq = list(zip(seq, [None] * len(seq))) model = hmm._create_hmm_tagger(states, symbols, A, B, pi) return model, states, symbols, seq def test_forward_probability(): from 
numpy.testing import assert_array_almost_equal # example from p. 385, Huang et al model, states, symbols = hmm._market_hmm_example() seq = [("up", None), ("up", None)] expected = [[0.35, 0.02, 0.09], [0.1792, 0.0085, 0.0357]] fp = 2 ** model._forward_probability(seq) assert_array_almost_equal(fp, expected) def test_forward_probability2(): from numpy.testing import assert_array_almost_equal model, states, symbols, seq = _wikipedia_example_hmm() fp = 2 ** model._forward_probability(seq) # examples in wikipedia are normalized fp = (fp.T / fp.sum(axis=1)).T wikipedia_results = [ [0.8182, 0.1818], [0.8834, 0.1166], [0.1907, 0.8093], [0.7308, 0.2692], [0.8673, 0.1327], ] assert_array_almost_equal(wikipedia_results, fp, 4) def test_backward_probability(): from numpy.testing import assert_array_almost_equal model, states, symbols, seq = _wikipedia_example_hmm() bp = 2 ** model._backward_probability(seq) # examples in wikipedia are normalized bp = (bp.T / bp.sum(axis=1)).T wikipedia_results = [ # Forward-backward algorithm doesn't need b0_5, # so .backward_probability doesn't compute it. # [0.6469, 0.3531], [0.5923, 0.4077], [0.3763, 0.6237], [0.6533, 0.3467], [0.6273, 0.3727], [0.5, 0.5], ] assert_array_almost_equal(wikipedia_results, bp, 4) def setup_module(module): pytest.importorskip("numpy") nltk-3.7/nltk/test/unit/test_json2csv_corpus.py000066400000000000000000000130561420073152400220170ustar00rootroot00000000000000# Natural Language Toolkit: Twitter client # # Copyright (C) 2001-2022 NLTK Project # Author: Lorenzo Rubio # URL: # For license information, see LICENSE.TXT """ Regression tests for `json2csv()` and `json2csv_entities()` in Twitter package. """ from pathlib import Path import pytest from nltk.corpus import twitter_samples from nltk.twitter.common import json2csv, json2csv_entities def files_are_identical(pathA, pathB): """ Compare two files, ignoring carriage returns, leading whitespace, and trailing whitespace """ f1 = [l.strip() for l in pathA.read_bytes().splitlines()] f2 = [l.strip() for l in pathB.read_bytes().splitlines()] return f1 == f2 subdir = Path(__file__).parent / "files" @pytest.fixture def infile(): with open(twitter_samples.abspath("tweets.20150430-223406.json")) as infile: return [next(infile) for x in range(100)] def test_textoutput(tmp_path, infile): ref_fn = subdir / "tweets.20150430-223406.text.csv.ref" outfn = tmp_path / "tweets.20150430-223406.text.csv" json2csv(infile, outfn, ["text"], gzip_compress=False) assert files_are_identical(outfn, ref_fn) def test_tweet_metadata(tmp_path, infile): ref_fn = subdir / "tweets.20150430-223406.tweet.csv.ref" fields = [ "created_at", "favorite_count", "id", "in_reply_to_status_id", "in_reply_to_user_id", "retweet_count", "retweeted", "text", "truncated", "user.id", ] outfn = tmp_path / "tweets.20150430-223406.tweet.csv" json2csv(infile, outfn, fields, gzip_compress=False) assert files_are_identical(outfn, ref_fn) def test_user_metadata(tmp_path, infile): ref_fn = subdir / "tweets.20150430-223406.user.csv.ref" fields = ["id", "text", "user.id", "user.followers_count", "user.friends_count"] outfn = tmp_path / "tweets.20150430-223406.user.csv" json2csv(infile, outfn, fields, gzip_compress=False) assert files_are_identical(outfn, ref_fn) def test_tweet_hashtag(tmp_path, infile): ref_fn = subdir / "tweets.20150430-223406.hashtag.csv.ref" outfn = tmp_path / "tweets.20150430-223406.hashtag.csv" json2csv_entities( infile, outfn, ["id", "text"], "hashtags", ["text"], gzip_compress=False, ) assert files_are_identical(outfn, 
ref_fn) def test_tweet_usermention(tmp_path, infile): ref_fn = subdir / "tweets.20150430-223406.usermention.csv.ref" outfn = tmp_path / "tweets.20150430-223406.usermention.csv" json2csv_entities( infile, outfn, ["id", "text"], "user_mentions", ["id", "screen_name"], gzip_compress=False, ) assert files_are_identical(outfn, ref_fn) def test_tweet_media(tmp_path, infile): ref_fn = subdir / "tweets.20150430-223406.media.csv.ref" outfn = tmp_path / "tweets.20150430-223406.media.csv" json2csv_entities( infile, outfn, ["id"], "media", ["media_url", "url"], gzip_compress=False, ) assert files_are_identical(outfn, ref_fn) def test_tweet_url(tmp_path, infile): ref_fn = subdir / "tweets.20150430-223406.url.csv.ref" outfn = tmp_path / "tweets.20150430-223406.url.csv" json2csv_entities( infile, outfn, ["id"], "urls", ["url", "expanded_url"], gzip_compress=False, ) assert files_are_identical(outfn, ref_fn) def test_userurl(tmp_path, infile): ref_fn = subdir / "tweets.20150430-223406.userurl.csv.ref" outfn = tmp_path / "tweets.20150430-223406.userurl.csv" json2csv_entities( infile, outfn, ["id", "screen_name"], "user.urls", ["url", "expanded_url"], gzip_compress=False, ) assert files_are_identical(outfn, ref_fn) def test_tweet_place(tmp_path, infile): ref_fn = subdir / "tweets.20150430-223406.place.csv.ref" outfn = tmp_path / "tweets.20150430-223406.place.csv" json2csv_entities( infile, outfn, ["id", "text"], "place", ["name", "country"], gzip_compress=False, ) assert files_are_identical(outfn, ref_fn) def test_tweet_place_boundingbox(tmp_path, infile): ref_fn = subdir / "tweets.20150430-223406.placeboundingbox.csv.ref" outfn = tmp_path / "tweets.20150430-223406.placeboundingbox.csv" json2csv_entities( infile, outfn, ["id", "name"], "place.bounding_box", ["coordinates"], gzip_compress=False, ) assert files_are_identical(outfn, ref_fn) def test_retweet_original_tweet(tmp_path, infile): ref_fn = subdir / "tweets.20150430-223406.retweet.csv.ref" outfn = tmp_path / "tweets.20150430-223406.retweet.csv" json2csv_entities( infile, outfn, ["id"], "retweeted_status", [ "created_at", "favorite_count", "id", "in_reply_to_status_id", "in_reply_to_user_id", "retweet_count", "text", "truncated", "user.id", ], gzip_compress=False, ) assert files_are_identical(outfn, ref_fn) def test_file_is_wrong(tmp_path, infile): """ Sanity check that file comparison is not giving false positives. 
""" ref_fn = subdir / "tweets.20150430-223406.retweet.csv.ref" outfn = tmp_path / "tweets.20150430-223406.text.csv" json2csv(infile, outfn, ["text"], gzip_compress=False) assert not files_are_identical(outfn, ref_fn) nltk-3.7/nltk/test/unit/test_json_serialization.py000066400000000000000000000067231420073152400225660ustar00rootroot00000000000000import unittest from nltk.corpus import brown from nltk.jsontags import JSONTaggedDecoder, JSONTaggedEncoder from nltk.tag import ( AffixTagger, BigramTagger, BrillTagger, BrillTaggerTrainer, DefaultTagger, NgramTagger, PerceptronTagger, RegexpTagger, TrigramTagger, UnigramTagger, ) from nltk.tag.brill import nltkdemo18 class TestJSONSerialization(unittest.TestCase): def setUp(self): self.corpus = brown.tagged_sents()[:35] self.decoder = JSONTaggedDecoder() self.encoder = JSONTaggedEncoder() self.default_tagger = DefaultTagger("NN") def test_default_tagger(self): encoded = self.encoder.encode(self.default_tagger) decoded = self.decoder.decode(encoded) self.assertEqual(repr(self.default_tagger), repr(decoded)) self.assertEqual(self.default_tagger._tag, decoded._tag) def test_regexp_tagger(self): tagger = RegexpTagger([(r".*", "NN")], backoff=self.default_tagger) encoded = self.encoder.encode(tagger) decoded = self.decoder.decode(encoded) self.assertEqual(repr(tagger), repr(decoded)) self.assertEqual(repr(tagger.backoff), repr(decoded.backoff)) self.assertEqual(tagger._regexps, decoded._regexps) def test_affix_tagger(self): tagger = AffixTagger(self.corpus, backoff=self.default_tagger) encoded = self.encoder.encode(tagger) decoded = self.decoder.decode(encoded) self.assertEqual(repr(tagger), repr(decoded)) self.assertEqual(repr(tagger.backoff), repr(decoded.backoff)) self.assertEqual(tagger._affix_length, decoded._affix_length) self.assertEqual(tagger._min_word_length, decoded._min_word_length) self.assertEqual(tagger._context_to_tag, decoded._context_to_tag) def test_ngram_taggers(self): unitagger = UnigramTagger(self.corpus, backoff=self.default_tagger) bitagger = BigramTagger(self.corpus, backoff=unitagger) tritagger = TrigramTagger(self.corpus, backoff=bitagger) ntagger = NgramTagger(4, self.corpus, backoff=tritagger) encoded = self.encoder.encode(ntagger) decoded = self.decoder.decode(encoded) self.assertEqual(repr(ntagger), repr(decoded)) self.assertEqual(repr(tritagger), repr(decoded.backoff)) self.assertEqual(repr(bitagger), repr(decoded.backoff.backoff)) self.assertEqual(repr(unitagger), repr(decoded.backoff.backoff.backoff)) self.assertEqual( repr(self.default_tagger), repr(decoded.backoff.backoff.backoff.backoff) ) def test_perceptron_tagger(self): tagger = PerceptronTagger(load=False) tagger.train(self.corpus) encoded = self.encoder.encode(tagger) decoded = self.decoder.decode(encoded) self.assertEqual(tagger.model.weights, decoded.model.weights) self.assertEqual(tagger.tagdict, decoded.tagdict) self.assertEqual(tagger.classes, decoded.classes) def test_brill_tagger(self): trainer = BrillTaggerTrainer( self.default_tagger, nltkdemo18(), deterministic=True ) tagger = trainer.train(self.corpus, max_rules=30) encoded = self.encoder.encode(tagger) decoded = self.decoder.decode(encoded) self.assertEqual(repr(tagger._initial_tagger), repr(decoded._initial_tagger)) self.assertEqual(tagger._rules, decoded._rules) self.assertEqual(tagger._training_stats, decoded._training_stats) nltk-3.7/nltk/test/unit/test_metrics.py000066400000000000000000000035331420073152400203220ustar00rootroot00000000000000import unittest from nltk.metrics import ( 
BigramAssocMeasures, QuadgramAssocMeasures, TrigramAssocMeasures, ) ## Test the likelihood ratio metric _DELTA = 1e-8 class TestLikelihoodRatio(unittest.TestCase): def test_lr_bigram(self): self.assertAlmostEqual( BigramAssocMeasures.likelihood_ratio(2, (4, 4), 20), 2.4142743368419755, delta=_DELTA, ) self.assertAlmostEqual( BigramAssocMeasures.likelihood_ratio(1, (1, 1), 1), 0.0, delta=_DELTA ) self.assertRaises( ValueError, BigramAssocMeasures.likelihood_ratio, *(0, (2, 2), 2), ) def test_lr_trigram(self): self.assertAlmostEqual( TrigramAssocMeasures.likelihood_ratio(1, (1, 1, 1), (1, 1, 1), 2), 5.545177444479562, delta=_DELTA, ) self.assertAlmostEqual( TrigramAssocMeasures.likelihood_ratio(1, (1, 1, 1), (1, 1, 1), 1), 0.0, delta=_DELTA, ) self.assertRaises( ValueError, TrigramAssocMeasures.likelihood_ratio, *(1, (1, 1, 2), (1, 1, 2), 2), ) def test_lr_quadgram(self): self.assertAlmostEqual( QuadgramAssocMeasures.likelihood_ratio( 1, (1, 1, 1, 1), (1, 1, 1, 1, 1, 1), (1, 1, 1, 1), 2 ), 8.317766166719343, delta=_DELTA, ) self.assertAlmostEqual( QuadgramAssocMeasures.likelihood_ratio( 1, (1, 1, 1, 1), (1, 1, 1, 1, 1, 1), (1, 1, 1, 1), 1 ), 0.0, delta=_DELTA, ) self.assertRaises( ValueError, QuadgramAssocMeasures.likelihood_ratio, *(1, (1, 1, 1, 1), (1, 1, 1, 1, 1, 2), (1, 1, 1, 1), 1), ) nltk-3.7/nltk/test/unit/test_naivebayes.py000066400000000000000000000013471420073152400210030ustar00rootroot00000000000000import unittest from nltk.classify.naivebayes import NaiveBayesClassifier class NaiveBayesClassifierTest(unittest.TestCase): def test_simple(self): training_features = [ ({"nice": True, "good": True}, "positive"), ({"bad": True, "mean": True}, "negative"), ] classifier = NaiveBayesClassifier.train(training_features) result = classifier.prob_classify({"nice": True}) self.assertTrue(result.prob("positive") > result.prob("negative")) self.assertEqual(result.max(), "positive") result = classifier.prob_classify({"bad": True}) self.assertTrue(result.prob("positive") < result.prob("negative")) self.assertEqual(result.max(), "negative") nltk-3.7/nltk/test/unit/test_nombank.py000066400000000000000000000013351420073152400202770ustar00rootroot00000000000000""" Unit tests for nltk.corpus.nombank """ import unittest from nltk.corpus import nombank # Load the nombank once. nombank.nouns() class NombankDemo(unittest.TestCase): def test_numbers(self): # No. of instances. self.assertEqual(len(nombank.instances()), 114574) # No. of rolesets self.assertEqual(len(nombank.rolesets()), 5577) # No. of nouns. 
self.assertEqual(len(nombank.nouns()), 4704) def test_instance(self): self.assertEqual(nombank.instances()[0].roleset, "perc-sign.01") def test_framefiles_fileids(self): self.assertEqual(len(nombank.fileids()), 4705) self.assertTrue(all(fileid.endswith(".xml") for fileid in nombank.fileids())) nltk-3.7/nltk/test/unit/test_pl196x.py000066400000000000000000000006151420073152400177150ustar00rootroot00000000000000import unittest import nltk from nltk.corpus.reader import pl196x class TestCorpusViews(unittest.TestCase): def test_corpus_reader(self): pl196x_dir = nltk.data.find("corpora/pl196x") pl = pl196x.Pl196xCorpusReader( pl196x_dir, r".*\.xml", textids="textids.txt", cat_file="cats.txt" ) pl.tagged_words(fileids=pl.fileids(), categories="cats.txt") nltk-3.7/nltk/test/unit/test_pos_tag.py000066400000000000000000000052131420073152400203050ustar00rootroot00000000000000""" Tests for nltk.pos_tag """ import unittest from nltk import pos_tag, word_tokenize class TestPosTag(unittest.TestCase): def test_pos_tag_eng(self): text = "John's big idea isn't all that bad." expected_tagged = [ ("John", "NNP"), ("'s", "POS"), ("big", "JJ"), ("idea", "NN"), ("is", "VBZ"), ("n't", "RB"), ("all", "PDT"), ("that", "DT"), ("bad", "JJ"), (".", "."), ] assert pos_tag(word_tokenize(text)) == expected_tagged def test_pos_tag_eng_universal(self): text = "John's big idea isn't all that bad." expected_tagged = [ ("John", "NOUN"), ("'s", "PRT"), ("big", "ADJ"), ("idea", "NOUN"), ("is", "VERB"), ("n't", "ADV"), ("all", "DET"), ("that", "DET"), ("bad", "ADJ"), (".", "."), ] assert pos_tag(word_tokenize(text), tagset="universal") == expected_tagged def test_pos_tag_rus(self): text = "Илья оторопел и дважды перечитал бумажку." expected_tagged = [ ("Илья", "S"), ("оторопел", "V"), ("и", "CONJ"), ("дважды", "ADV"), ("перечитал", "V"), ("бумажку", "S"), (".", "NONLEX"), ] assert pos_tag(word_tokenize(text), lang="rus") == expected_tagged def test_pos_tag_rus_universal(self): text = "Илья оторопел и дважды перечитал бумажку." expected_tagged = [ ("Илья", "NOUN"), ("оторопел", "VERB"), ("и", "CONJ"), ("дважды", "ADV"), ("перечитал", "VERB"), ("бумажку", "NOUN"), (".", "."), ] assert ( pos_tag(word_tokenize(text), tagset="universal", lang="rus") == expected_tagged ) def test_pos_tag_unknown_lang(self): text = "모르겠 습니 다" self.assertRaises(NotImplementedError, pos_tag, word_tokenize(text), lang="kor") # Test for default kwarg, `lang=None` self.assertRaises(NotImplementedError, pos_tag, word_tokenize(text), lang=None) def test_unspecified_lang(self): # Tries to force the lang='eng' option. text = "모르겠 습니 다" expected_but_wrong = [("모르겠", "JJ"), ("습니", "NNP"), ("다", "NN")] assert pos_tag(word_tokenize(text)) == expected_but_wrong nltk-3.7/nltk/test/unit/test_ribes.py000066400000000000000000000115361420073152400177620ustar00rootroot00000000000000from nltk.translate.ribes_score import corpus_ribes, word_rank_alignment def test_ribes_empty_worder(): # worder as in word order # Verifies that these two sentences have no alignment, # and hence have the lowest possible RIBES score. hyp = "This is a nice sentence which I quite like".split() ref = "Okay well that's neat and all but the reference's different".split() assert word_rank_alignment(ref, hyp) == [] list_of_refs = [[ref]] hypotheses = [hyp] assert corpus_ribes(list_of_refs, hypotheses) == 0.0 def test_ribes_one_worder(): # Verifies that these two sentences have just one match, # and the RIBES score for this sentence with very little # correspondence is 0. 
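    # RIBES is driven by the word-order alignment that word_rank_alignment
    # returns: a rank-correlation statistic over that alignment is scaled by a
    # unigram-precision penalty.  With only a single aligned word there is no
    # word order to correlate, so (roughly) the sentence score bottoms out at
    # 0.0, which is what the assertions below check:
    #     word_rank_alignment(ref, hyp)   # -> [3]
    #     corpus_ribes([[ref]], [hyp])    # -> 0.0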
hyp = "This is a nice sentence which I quite like".split() ref = "Okay well that's nice and all but the reference's different".split() assert word_rank_alignment(ref, hyp) == [3] list_of_refs = [[ref]] hypotheses = [hyp] assert corpus_ribes(list_of_refs, hypotheses) == 0.0 def test_ribes_two_worder(): # Verifies that these two sentences have two matches, # but still get the lowest possible RIBES score due # to the lack of similarity. hyp = "This is a nice sentence which I quite like".split() ref = "Okay well that's nice and all but the reference is different".split() assert word_rank_alignment(ref, hyp) == [9, 3] list_of_refs = [[ref]] hypotheses = [hyp] assert corpus_ribes(list_of_refs, hypotheses) == 0.0 def test_ribes(): # Based on the doctest of the corpus_ribes function hyp1 = [ "It", "is", "a", "guide", "to", "action", "which", "ensures", "that", "the", "military", "always", "obeys", "the", "commands", "of", "the", "party", ] ref1a = [ "It", "is", "a", "guide", "to", "action", "that", "ensures", "that", "the", "military", "will", "forever", "heed", "Party", "commands", ] ref1b = [ "It", "is", "the", "guiding", "principle", "which", "guarantees", "the", "military", "forces", "always", "being", "under", "the", "command", "of", "the", "Party", ] ref1c = [ "It", "is", "the", "practical", "guide", "for", "the", "army", "always", "to", "heed", "the", "directions", "of", "the", "party", ] hyp2 = [ "he", "read", "the", "book", "because", "he", "was", "interested", "in", "world", "history", ] ref2a = [ "he", "was", "interested", "in", "world", "history", "because", "he", "read", "the", "book", ] list_of_refs = [[ref1a, ref1b, ref1c], [ref2a]] hypotheses = [hyp1, hyp2] score = corpus_ribes(list_of_refs, hypotheses) assert round(score, 4) == 0.3597 def test_no_zero_div(): # Regression test for Issue 2529, assure that no ZeroDivisionError is thrown. 
hyp1 = [ "It", "is", "a", "guide", "to", "action", "which", "ensures", "that", "the", "military", "always", "obeys", "the", "commands", "of", "the", "party", ] ref1a = [ "It", "is", "a", "guide", "to", "action", "that", "ensures", "that", "the", "military", "will", "forever", "heed", "Party", "commands", ] ref1b = [ "It", "is", "the", "guiding", "principle", "which", "guarantees", "the", "military", "forces", "always", "being", "under", "the", "command", "of", "the", "Party", ] ref1c = [ "It", "is", "the", "practical", "guide", "for", "the", "army", "always", "to", "heed", "the", "directions", "of", "the", "party", ] hyp2 = ["he", "read", "the"] ref2a = ["he", "was", "interested", "in", "world", "history", "because", "he"] list_of_refs = [[ref1a, ref1b, ref1c], [ref2a]] hypotheses = [hyp1, hyp2] score = corpus_ribes(list_of_refs, hypotheses) assert round(score, 4) == 0.1688 nltk-3.7/nltk/test/unit/test_rte_classify.py000066400000000000000000000051571420073152400213470ustar00rootroot00000000000000import pytest from nltk import config_megam from nltk.classify.rte_classify import RTEFeatureExtractor, rte_classifier, rte_features from nltk.corpus import rte as rte_corpus expected_from_rte_feature_extration = """ alwayson => True ne_hyp_extra => 0 ne_overlap => 1 neg_hyp => 0 neg_txt => 0 word_hyp_extra => 3 word_overlap => 3 alwayson => True ne_hyp_extra => 0 ne_overlap => 1 neg_hyp => 0 neg_txt => 0 word_hyp_extra => 2 word_overlap => 1 alwayson => True ne_hyp_extra => 1 ne_overlap => 1 neg_hyp => 0 neg_txt => 0 word_hyp_extra => 1 word_overlap => 2 alwayson => True ne_hyp_extra => 1 ne_overlap => 0 neg_hyp => 0 neg_txt => 0 word_hyp_extra => 6 word_overlap => 2 alwayson => True ne_hyp_extra => 1 ne_overlap => 0 neg_hyp => 0 neg_txt => 0 word_hyp_extra => 4 word_overlap => 0 alwayson => True ne_hyp_extra => 1 ne_overlap => 0 neg_hyp => 0 neg_txt => 0 word_hyp_extra => 3 word_overlap => 1 """ class TestRTEClassifier: # Test the feature extraction method. def test_rte_feature_extraction(self): pairs = rte_corpus.pairs(["rte1_dev.xml"])[:6] test_output = [ f"{key:<15} => {rte_features(pair)[key]}" for pair in pairs for key in sorted(rte_features(pair)) ] expected_output = expected_from_rte_feature_extration.strip().split("\n") # Remove null strings. expected_output = list(filter(None, expected_output)) assert test_output == expected_output # Test the RTEFeatureExtractor object. def test_feature_extractor_object(self): rtepair = rte_corpus.pairs(["rte3_dev.xml"])[33] extractor = RTEFeatureExtractor(rtepair) assert extractor.hyp_words == {"member", "China", "SCO."} assert extractor.overlap("word") == set() assert extractor.overlap("ne") == {"China"} assert extractor.hyp_extra("word") == {"member"} # Test the RTE classifier training. 
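    # rte_classifier extracts overlap features from RTE text/hypothesis pairs
    # (word and named-entity overlap, hypothesis-only extras, negation flags --
    # see rte_features above) and trains the named NLTK classifier on them; the
    # algorithm string selects the trainer ("IIS", "GIS", or "megam"), and
    # sample_N just caps the number of training pairs so these tests stay fast.
    # Illustrative call, mirroring the tests below:
    #     clf = rte_classifier("IIS", sample_N=100)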
def test_rte_classification_without_megam(self): # Use a sample size for unit testing, since we # don't need to fully train these classifiers clf = rte_classifier("IIS", sample_N=100) clf = rte_classifier("GIS", sample_N=100) def test_rte_classification_with_megam(self): try: config_megam() except (LookupError, AttributeError) as e: pytest.skip("Skipping tests with dependencies on MEGAM") clf = rte_classifier("megam", sample_N=100) nltk-3.7/nltk/test/unit/test_seekable_unicode_stream_reader.py000066400000000000000000000042031420073152400250250ustar00rootroot00000000000000import os from io import BytesIO import pytest from nltk.corpus.reader import SeekableUnicodeStreamReader def check_reader(unicode_string, encoding): bytestr = unicode_string.encode(encoding) stream = BytesIO(bytestr) reader = SeekableUnicodeStreamReader(stream, encoding) # Should open at the start of the file assert reader.tell() == 0 # Compare original string to contents from `.readlines()` assert unicode_string == "".join(reader.readlines()) # Should be at the end of the file now stream.seek(0, os.SEEK_END) assert reader.tell() == stream.tell() reader.seek(0) # go back to start # Compare original string to contents from `.read()` contents = "" char = None while char != "": char = reader.read(1) contents += char assert unicode_string == contents # Call `check_reader` with a variety of input strings and encodings. ENCODINGS = ["ascii", "latin1", "greek", "hebrew", "utf-16", "utf-8"] STRINGS = [ """ This is a test file. It is fairly short. """, "This file can be encoded with latin1. \x83", """\ This is a test file. Here's a blank line: And here's some unicode: \xee \u0123 \uffe3 """, """\ This is a test file. Unicode characters: \xf3 \u2222 \u3333\u4444 \u5555 """, """\ This is a larger file. It has some lines that are longer \ than 72 characters. It's got lots of repetition. Here's \ some unicode chars: \xee \u0123 \uffe3 \ueeee \u2345 How fun! Let's repeat it twenty times. 
""" * 20, ] @pytest.mark.parametrize("string", STRINGS) def test_reader(string): for encoding in ENCODINGS: # skip strings that can't be encoded with the current encoding try: string.encode(encoding) except UnicodeEncodeError: continue check_reader(string, encoding) def test_reader_stream_closes_when_deleted(): reader = SeekableUnicodeStreamReader(BytesIO(b""), "ascii") assert not reader.stream.closed reader.__del__() assert reader.stream.closed def teardown_module(module=None): import gc gc.collect() nltk-3.7/nltk/test/unit/test_senna.py000066400000000000000000000070201420073152400177530ustar00rootroot00000000000000""" Unit tests for Senna """ import unittest from os import environ, path, sep from nltk.classify import Senna from nltk.tag import SennaChunkTagger, SennaNERTagger, SennaTagger # Set Senna executable path for tests if it is not specified as an environment variable if "SENNA" in environ: SENNA_EXECUTABLE_PATH = path.normpath(environ["SENNA"]) + sep else: SENNA_EXECUTABLE_PATH = "/usr/share/senna-v3.0" senna_is_installed = path.exists(SENNA_EXECUTABLE_PATH) @unittest.skipUnless(senna_is_installed, "Requires Senna executable") class TestSennaPipeline(unittest.TestCase): """Unittest for nltk.classify.senna""" def test_senna_pipeline(self): """Senna pipeline interface""" pipeline = Senna(SENNA_EXECUTABLE_PATH, ["pos", "chk", "ner"]) sent = "Dusseldorf is an international business center".split() result = [ (token["word"], token["chk"], token["ner"], token["pos"]) for token in pipeline.tag(sent) ] expected = [ ("Dusseldorf", "B-NP", "B-LOC", "NNP"), ("is", "B-VP", "O", "VBZ"), ("an", "B-NP", "O", "DT"), ("international", "I-NP", "O", "JJ"), ("business", "I-NP", "O", "NN"), ("center", "I-NP", "O", "NN"), ] self.assertEqual(result, expected) @unittest.skipUnless(senna_is_installed, "Requires Senna executable") class TestSennaTagger(unittest.TestCase): """Unittest for nltk.tag.senna""" def test_senna_tagger(self): tagger = SennaTagger(SENNA_EXECUTABLE_PATH) result = tagger.tag("What is the airspeed of an unladen swallow ?".split()) expected = [ ("What", "WP"), ("is", "VBZ"), ("the", "DT"), ("airspeed", "NN"), ("of", "IN"), ("an", "DT"), ("unladen", "NN"), ("swallow", "NN"), ("?", "."), ] self.assertEqual(result, expected) def test_senna_chunk_tagger(self): chktagger = SennaChunkTagger(SENNA_EXECUTABLE_PATH) result_1 = chktagger.tag("What is the airspeed of an unladen swallow ?".split()) expected_1 = [ ("What", "B-NP"), ("is", "B-VP"), ("the", "B-NP"), ("airspeed", "I-NP"), ("of", "B-PP"), ("an", "B-NP"), ("unladen", "I-NP"), ("swallow", "I-NP"), ("?", "O"), ] result_2 = list(chktagger.bio_to_chunks(result_1, chunk_type="NP")) expected_2 = [ ("What", "0"), ("the airspeed", "2-3"), ("an unladen swallow", "5-6-7"), ] self.assertEqual(result_1, expected_1) self.assertEqual(result_2, expected_2) def test_senna_ner_tagger(self): nertagger = SennaNERTagger(SENNA_EXECUTABLE_PATH) result_1 = nertagger.tag("Shakespeare theatre was in London .".split()) expected_1 = [ ("Shakespeare", "B-PER"), ("theatre", "O"), ("was", "O"), ("in", "O"), ("London", "B-LOC"), (".", "O"), ] result_2 = nertagger.tag("UN headquarters are in NY , USA .".split()) expected_2 = [ ("UN", "B-ORG"), ("headquarters", "O"), ("are", "O"), ("in", "O"), ("NY", "B-LOC"), (",", "O"), ("USA", "B-LOC"), (".", "O"), ] self.assertEqual(result_1, expected_1) self.assertEqual(result_2, expected_2) nltk-3.7/nltk/test/unit/test_stem.py000066400000000000000000000140561420073152400176260ustar00rootroot00000000000000import unittest from 
contextlib import closing from nltk import data from nltk.stem.porter import PorterStemmer from nltk.stem.snowball import SnowballStemmer class SnowballTest(unittest.TestCase): def test_arabic(self): """ this unit testing for test the snowball arabic light stemmer this stemmer deals with prefixes and suffixes """ # Test where the ignore_stopwords=True. ar_stemmer = SnowballStemmer("arabic", True) assert ar_stemmer.stem("الْعَرَبِــــــيَّة") == "عرب" assert ar_stemmer.stem("العربية") == "عرب" assert ar_stemmer.stem("فقالوا") == "قال" assert ar_stemmer.stem("الطالبات") == "طالب" assert ar_stemmer.stem("فالطالبات") == "طالب" assert ar_stemmer.stem("والطالبات") == "طالب" assert ar_stemmer.stem("الطالبون") == "طالب" assert ar_stemmer.stem("اللذان") == "اللذان" assert ar_stemmer.stem("من") == "من" # Test where the ignore_stopwords=False. ar_stemmer = SnowballStemmer("arabic", False) assert ar_stemmer.stem("اللذان") == "اللذ" # this is a stop word assert ar_stemmer.stem("الطالبات") == "طالب" assert ar_stemmer.stem("الكلمات") == "كلم" # test where create the arabic stemmer without given init value to ignore_stopwords ar_stemmer = SnowballStemmer("arabic") assert ar_stemmer.stem("الْعَرَبِــــــيَّة") == "عرب" assert ar_stemmer.stem("العربية") == "عرب" assert ar_stemmer.stem("فقالوا") == "قال" assert ar_stemmer.stem("الطالبات") == "طالب" assert ar_stemmer.stem("الكلمات") == "كلم" def test_russian(self): stemmer_russian = SnowballStemmer("russian") assert stemmer_russian.stem("авантненькая") == "авантненьк" def test_german(self): stemmer_german = SnowballStemmer("german") stemmer_german2 = SnowballStemmer("german", ignore_stopwords=True) assert stemmer_german.stem("Schr\xe4nke") == "schrank" assert stemmer_german2.stem("Schr\xe4nke") == "schrank" assert stemmer_german.stem("keinen") == "kein" assert stemmer_german2.stem("keinen") == "keinen" def test_spanish(self): stemmer = SnowballStemmer("spanish") assert stemmer.stem("Visionado") == "vision" # The word 'algue' was raising an IndexError assert stemmer.stem("algue") == "algu" def test_short_strings_bug(self): stemmer = SnowballStemmer("english") assert stemmer.stem("y's") == "y" class PorterTest(unittest.TestCase): def _vocabulary(self): with closing( data.find("stemmers/porter_test/porter_vocabulary.txt").open( encoding="utf-8" ) ) as fp: return fp.read().splitlines() def _test_against_expected_output(self, stemmer_mode, expected_stems): stemmer = PorterStemmer(mode=stemmer_mode) for word, true_stem in zip(self._vocabulary(), expected_stems): our_stem = stemmer.stem(word) assert ( our_stem == true_stem ), "{} should stem to {} in {} mode but got {}".format( word, true_stem, stemmer_mode, our_stem, ) def test_vocabulary_martin_mode(self): """Tests all words from the test vocabulary provided by M Porter The sample vocabulary and output were sourced from https://tartarus.org/martin/PorterStemmer/voc.txt and https://tartarus.org/martin/PorterStemmer/output.txt and are linked to from the Porter Stemmer algorithm's homepage at https://tartarus.org/martin/PorterStemmer/ """ with closing( data.find("stemmers/porter_test/porter_martin_output.txt").open( encoding="utf-8" ) ) as fp: self._test_against_expected_output( PorterStemmer.MARTIN_EXTENSIONS, fp.read().splitlines() ) def test_vocabulary_nltk_mode(self): with closing( data.find("stemmers/porter_test/porter_nltk_output.txt").open( encoding="utf-8" ) ) as fp: self._test_against_expected_output( PorterStemmer.NLTK_EXTENSIONS, fp.read().splitlines() ) def test_vocabulary_original_mode(self): # The list 
of stems for this test was generated by taking the # Martin-blessed stemmer from # https://tartarus.org/martin/PorterStemmer/c.txt # and removing all the --DEPARTURE-- sections from it and # running it against Martin's test vocabulary. with closing( data.find("stemmers/porter_test/porter_original_output.txt").open( encoding="utf-8" ) ) as fp: self._test_against_expected_output( PorterStemmer.ORIGINAL_ALGORITHM, fp.read().splitlines() ) self._test_against_expected_output( PorterStemmer.ORIGINAL_ALGORITHM, data.find("stemmers/porter_test/porter_original_output.txt") .open(encoding="utf-8") .read() .splitlines(), ) def test_oed_bug(self): """Test for bug https://github.com/nltk/nltk/issues/1581 Ensures that 'oed' can be stemmed without throwing an error. """ assert PorterStemmer().stem("oed") == "o" def test_lowercase_option(self): """Test for improvement on https://github.com/nltk/nltk/issues/2507 Ensures that stems are lowercased when `to_lowercase=True` """ porter = PorterStemmer() assert porter.stem("On") == "on" assert porter.stem("I") == "i" assert porter.stem("I", to_lowercase=False) == "I" assert porter.stem("Github") == "github" assert porter.stem("Github", to_lowercase=False) == "Github" nltk-3.7/nltk/test/unit/test_tag.py000066400000000000000000000010001420073152400174120ustar00rootroot00000000000000def test_basic(): from nltk.tag import pos_tag from nltk.tokenize import word_tokenize result = pos_tag(word_tokenize("John's big idea isn't all that bad.")) assert result == [ ("John", "NNP"), ("'s", "POS"), ("big", "JJ"), ("idea", "NN"), ("is", "VBZ"), ("n't", "RB"), ("all", "PDT"), ("that", "DT"), ("bad", "JJ"), (".", "."), ] def setup_module(module): import pytest pytest.importorskip("numpy") nltk-3.7/nltk/test/unit/test_tgrep.py000066400000000000000000000743201420073152400177770ustar00rootroot00000000000000#!/usr/bin/env python # # Natural Language Toolkit: TGrep search # # Copyright (C) 2001-2022 NLTK Project # Author: Will Roberts # URL: # For license information, see LICENSE.TXT """ Unit tests for nltk.tgrep. """ import unittest from nltk import tgrep from nltk.tree import ParentedTree class TestSequenceFunctions(unittest.TestCase): """ Class containing unit tests for nltk.tgrep. """ def test_tokenize_simple(self): """ Simple test of tokenization. """ tokens = tgrep.tgrep_tokenize("A .. (B !< C . D) | ![<< (E , F) $ G]") self.assertEqual( tokens, [ "A", "..", "(", "B", "!", "<", "C", ".", "D", ")", "|", "!", "[", "<<", "(", "E", ",", "F", ")", "$", "G", "]", ], ) def test_tokenize_encoding(self): """ Test that tokenization handles bytes and strs the same way. """ self.assertEqual( tgrep.tgrep_tokenize(b"A .. (B !< C . D) | ![<< (E , F) $ G]"), tgrep.tgrep_tokenize("A .. (B !< C . D) | ![<< (E , F) $ G]"), ) def test_tokenize_link_types(self): """ Test tokenization of basic link types. 
""" self.assertEqual(tgrep.tgrep_tokenize("AB"), ["A", ">", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A<3B"), ["A", "<3", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A>3B"), ["A", ">3", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A<,B"), ["A", "<,", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A>,B"), ["A", ">,", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A<-3B"), ["A", "<-3", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A>-3B"), ["A", ">-3", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A<-B"), ["A", "<-", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A>-B"), ["A", ">-", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A<'B"), ["A", "<'", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A>'B"), ["A", ">'", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A<:B"), ["A", "<:", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A>:B"), ["A", ">:", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A<>B"), ["A", ">>", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A<<,B"), ["A", "<<,", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A>>,B"), ["A", ">>,", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A<<'B"), ["A", "<<'", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A>>'B"), ["A", ">>'", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A<<:B"), ["A", "<<:", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A>>:B"), ["A", ">>:", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A.B"), ["A", ".", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A,B"), ["A", ",", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A..B"), ["A", "..", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A,,B"), ["A", ",,", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A$B"), ["A", "$", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A$.B"), ["A", "$.", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A$,B"), ["A", "$,", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A$..B"), ["A", "$..", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A$,,B"), ["A", "$,,", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A!B"), ["A", "!", ">", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A!<3B"), ["A", "!", "<3", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A!>3B"), ["A", "!", ">3", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A!<,B"), ["A", "!", "<,", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A!>,B"), ["A", "!", ">,", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A!<-3B"), ["A", "!", "<-3", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A!>-3B"), ["A", "!", ">-3", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A!<-B"), ["A", "!", "<-", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A!>-B"), ["A", "!", ">-", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A!<'B"), ["A", "!", "<'", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A!>'B"), ["A", "!", ">'", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A!<:B"), ["A", "!", "<:", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A!>:B"), ["A", "!", ">:", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A!<>B"), ["A", "!", ">>", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A!<<,B"), ["A", "!", "<<,", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A!>>,B"), ["A", "!", ">>,", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A!<<'B"), ["A", "!", "<<'", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A!>>'B"), ["A", "!", ">>'", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A!<<:B"), ["A", "!", "<<:", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A!>>:B"), ["A", "!", ">>:", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A!.B"), ["A", "!", ".", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A!,B"), ["A", "!", ",", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A!..B"), ["A", "!", "..", "B"]) 
self.assertEqual(tgrep.tgrep_tokenize("A!,,B"), ["A", "!", ",,", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A!$B"), ["A", "!", "$", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A!$.B"), ["A", "!", "$.", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A!$,B"), ["A", "!", "$,", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A!$..B"), ["A", "!", "$..", "B"]) self.assertEqual(tgrep.tgrep_tokenize("A!$,,B"), ["A", "!", "$,,", "B"]) def test_tokenize_examples(self): """ Test tokenization of the TGrep2 manual example patterns. """ self.assertEqual(tgrep.tgrep_tokenize("NP < PP"), ["NP", "<", "PP"]) self.assertEqual(tgrep.tgrep_tokenize("/^NP/"), ["/^NP/"]) self.assertEqual( tgrep.tgrep_tokenize("NP << PP . VP"), ["NP", "<<", "PP", ".", "VP"] ) self.assertEqual( tgrep.tgrep_tokenize("NP << PP | . VP"), ["NP", "<<", "PP", "|", ".", "VP"] ) self.assertEqual( tgrep.tgrep_tokenize("NP !<< PP [> NP | >> VP]"), ["NP", "!", "<<", "PP", "[", ">", "NP", "|", ">>", "VP", "]"], ) self.assertEqual( tgrep.tgrep_tokenize("NP << (PP . VP)"), ["NP", "<<", "(", "PP", ".", "VP", ")"], ) self.assertEqual( tgrep.tgrep_tokenize("NP <' (PP <, (IN < on))"), ["NP", "<'", "(", "PP", "<,", "(", "IN", "<", "on", ")", ")"], ) self.assertEqual( tgrep.tgrep_tokenize("S < (A < B) < C"), ["S", "<", "(", "A", "<", "B", ")", "<", "C"], ) self.assertEqual( tgrep.tgrep_tokenize("S < ((A < B) < C)"), ["S", "<", "(", "(", "A", "<", "B", ")", "<", "C", ")"], ) self.assertEqual( tgrep.tgrep_tokenize("S < (A < B < C)"), ["S", "<", "(", "A", "<", "B", "<", "C", ")"], ) self.assertEqual(tgrep.tgrep_tokenize("A3B"3B"', "<", "C"], ) def test_tokenize_nodenames(self): """ Test tokenization of node names. """ self.assertEqual(tgrep.tgrep_tokenize("Robert"), ["Robert"]) self.assertEqual(tgrep.tgrep_tokenize("/^[Bb]ob/"), ["/^[Bb]ob/"]) self.assertEqual(tgrep.tgrep_tokenize("*"), ["*"]) self.assertEqual(tgrep.tgrep_tokenize("__"), ["__"]) # test tokenization of NLTK tree position syntax self.assertEqual(tgrep.tgrep_tokenize("N()"), ["N(", ")"]) self.assertEqual(tgrep.tgrep_tokenize("N(0,)"), ["N(", "0", ",", ")"]) self.assertEqual(tgrep.tgrep_tokenize("N(0,0)"), ["N(", "0", ",", "0", ")"]) self.assertEqual( tgrep.tgrep_tokenize("N(0,0,)"), ["N(", "0", ",", "0", ",", ")"] ) def test_tokenize_macros(self): """ Test tokenization of macro definitions. """ self.assertEqual( tgrep.tgrep_tokenize( "@ NP /^NP/;\n@ NN /^NN/;\n@NP [!< NP | < @NN] !$.. @NN" ), [ "@", "NP", "/^NP/", ";", "@", "NN", "/^NN/", ";", "@NP", "[", "!", "<", "NP", "|", "<", "@NN", "]", "!", "$..", "@NN", ], ) def test_node_simple(self): """ Test a simple use of tgrep for finding nodes matching a given pattern. """ tree = ParentedTree.fromstring( "(S (NP (DT the) (JJ big) (NN dog)) " "(VP bit) (NP (DT a) (NN cat)))" ) self.assertEqual(list(tgrep.tgrep_positions("NN", [tree])), [[(0, 2), (2, 1)]]) self.assertEqual( list(tgrep.tgrep_nodes("NN", [tree])), [[tree[0, 2], tree[2, 1]]] ) self.assertEqual( list(tgrep.tgrep_positions("NN|JJ", [tree])), [[(0, 1), (0, 2), (2, 1)]] ) def test_node_printing(self): """Test that the tgrep print operator ' is properly ignored.""" tree = ParentedTree.fromstring("(S (n x) (N x))") self.assertEqual( list(tgrep.tgrep_positions("N", [tree])), list(tgrep.tgrep_positions("'N", [tree])), ) self.assertEqual( list(tgrep.tgrep_positions("/[Nn]/", [tree])), list(tgrep.tgrep_positions("'/[Nn]/", [tree])), ) def test_node_encoding(self): """ Test that tgrep search strings handles bytes and strs the same way. 
""" tree = ParentedTree.fromstring( "(S (NP (DT the) (JJ big) (NN dog)) " "(VP bit) (NP (DT a) (NN cat)))" ) self.assertEqual( list(tgrep.tgrep_positions(b"NN", [tree])), list(tgrep.tgrep_positions(b"NN", [tree])), ) self.assertEqual( list(tgrep.tgrep_nodes(b"NN", [tree])), list(tgrep.tgrep_nodes("NN", [tree])), ) self.assertEqual( list(tgrep.tgrep_positions(b"NN|JJ", [tree])), list(tgrep.tgrep_positions("NN|JJ", [tree])), ) def test_node_nocase(self): """ Test selecting nodes using case insensitive node names. """ tree = ParentedTree.fromstring("(S (n x) (N x))") self.assertEqual(list(tgrep.tgrep_positions('"N"', [tree])), [[(1,)]]) self.assertEqual(list(tgrep.tgrep_positions('i@"N"', [tree])), [[(0,), (1,)]]) def test_node_quoted(self): """ Test selecting nodes using quoted node names. """ tree = ParentedTree.fromstring('(N ("N" x) (N" x) ("\\" x))') self.assertEqual(list(tgrep.tgrep_positions('"N"', [tree])), [[()]]) self.assertEqual(list(tgrep.tgrep_positions('"\\"N\\""', [tree])), [[(0,)]]) self.assertEqual(list(tgrep.tgrep_positions('"N\\""', [tree])), [[(1,)]]) self.assertEqual(list(tgrep.tgrep_positions('"\\"\\\\\\""', [tree])), [[(2,)]]) def test_node_regex(self): """ Test regex matching on nodes. """ tree = ParentedTree.fromstring("(S (NP-SBJ x) (NP x) (NNP x) (VP x))") # This is a regular expression that matches any node whose # name starts with NP, including NP-SBJ: self.assertEqual(list(tgrep.tgrep_positions("/^NP/", [tree])), [[(0,), (1,)]]) def test_node_regex_2(self): """ Test regex matching on nodes. """ tree = ParentedTree.fromstring("(S (SBJ x) (SBJ1 x) (NP-SBJ x))") self.assertEqual(list(tgrep.tgrep_positions("/^SBJ/", [tree])), [[(0,), (1,)]]) # This is a regular expression that matches any node whose # name includes SBJ, including NP-SBJ: self.assertEqual( list(tgrep.tgrep_positions("/SBJ/", [tree])), [[(0,), (1,), (2,)]] ) def test_node_tree_position(self): """ Test matching on nodes based on NLTK tree position. """ tree = ParentedTree.fromstring("(S (NP-SBJ x) (NP x) (NNP x) (VP x))") # test all tree positions that are not leaves leaf_positions = {tree.leaf_treeposition(x) for x in range(len(tree.leaves()))} tree_positions = [x for x in tree.treepositions() if x not in leaf_positions] for position in tree_positions: node_id = f"N{position}" tgrep_positions = list(tgrep.tgrep_positions(node_id, [tree])) self.assertEqual(len(tgrep_positions[0]), 1) self.assertEqual(tgrep_positions[0][0], position) def test_node_noleaves(self): """ Test node name matching with the search_leaves flag set to False. """ tree = ParentedTree.fromstring("(S (A (T x)) (B (N x)))") self.assertEqual( list(tgrep.tgrep_positions("x", [tree])), [[(0, 0, 0), (1, 0, 0)]] ) self.assertEqual(list(tgrep.tgrep_positions("x", [tree], False)), [[]]) def tests_rel_dominance(self): """ Test matching nodes based on dominance relations. 
""" tree = ParentedTree.fromstring("(S (A (T x)) (B (N x)))") self.assertEqual(list(tgrep.tgrep_positions("* < T", [tree])), [[(0,)]]) self.assertEqual(list(tgrep.tgrep_positions("* < T > S", [tree])), [[(0,)]]) self.assertEqual( list(tgrep.tgrep_positions("* !< T", [tree])), [[(), (0, 0), (0, 0, 0), (1,), (1, 0), (1, 0, 0)]], ) self.assertEqual(list(tgrep.tgrep_positions("* !< T > S", [tree])), [[(1,)]]) self.assertEqual(list(tgrep.tgrep_positions("* > A", [tree])), [[(0, 0)]]) self.assertEqual(list(tgrep.tgrep_positions("* > B", [tree])), [[(1, 0)]]) self.assertEqual( list(tgrep.tgrep_positions("* !> B", [tree])), [[(), (0,), (0, 0), (0, 0, 0), (1,), (1, 0, 0)]], ) self.assertEqual( list(tgrep.tgrep_positions("* !> B >> S", [tree])), [[(0,), (0, 0), (1,)]] ) self.assertEqual( list(tgrep.tgrep_positions("* >> S", [tree])), [[(0,), (0, 0), (1,), (1, 0)]], ) self.assertEqual( list(tgrep.tgrep_positions("* >>, S", [tree])), [[(0,), (0, 0)]] ) self.assertEqual( list(tgrep.tgrep_positions("* >>' S", [tree])), [[(1,), (1, 0)]] ) # Known issue: # self.assertEqual(list(tgrep.tgrep_positions('* !>> S', [tree])), # [[()]]) self.assertEqual(list(tgrep.tgrep_positions("* << T", [tree])), [[(), (0,)]]) self.assertEqual(list(tgrep.tgrep_positions("* <<' T", [tree])), [[(0,)]]) self.assertEqual(list(tgrep.tgrep_positions("* <<1 N", [tree])), [[(1,)]]) self.assertEqual( list(tgrep.tgrep_positions("* !<< T", [tree])), [[(0, 0), (0, 0, 0), (1,), (1, 0), (1, 0, 0)]], ) tree = ParentedTree.fromstring("(S (A (T x)) (B (T x) (N x )))") self.assertEqual(list(tgrep.tgrep_positions("* <: T", [tree])), [[(0,)]]) self.assertEqual(list(tgrep.tgrep_positions("* < T", [tree])), [[(0,), (1,)]]) self.assertEqual( list(tgrep.tgrep_positions("* !<: T", [tree])), [[(), (0, 0), (0, 0, 0), (1,), (1, 0), (1, 0, 0), (1, 1), (1, 1, 0)]], ) self.assertEqual(list(tgrep.tgrep_positions("* !<: T > S", [tree])), [[(1,)]]) tree = ParentedTree.fromstring("(S (T (A x) (B x)) (T (C x)))") self.assertEqual(list(tgrep.tgrep_positions("* >: T", [tree])), [[(1, 0)]]) self.assertEqual( list(tgrep.tgrep_positions("* !>: T", [tree])), [[(), (0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0), (1,), (1, 0, 0)]], ) tree = ParentedTree.fromstring( "(S (A (B (C (D (E (T x))))))" " (A (B (C (D (E (T x))) (N x)))))" ) self.assertEqual( list(tgrep.tgrep_positions("* <<: T", [tree])), [ [ (0,), (0, 0), (0, 0, 0), (0, 0, 0, 0), (0, 0, 0, 0, 0), (1, 0, 0, 0), (1, 0, 0, 0, 0), ] ], ) self.assertEqual( list(tgrep.tgrep_positions("* >>: A", [tree])), [ [ (0, 0), (0, 0, 0), (0, 0, 0, 0), (0, 0, 0, 0, 0), (0, 0, 0, 0, 0, 0), (1, 0), (1, 0, 0), ] ], ) def test_bad_operator(self): """ Test error handling of undefined tgrep operators. """ tree = ParentedTree.fromstring("(S (A (T x)) (B (N x)))") self.assertRaises( tgrep.TgrepException, list, tgrep.tgrep_positions("* >>> S", [tree]) ) def test_comments(self): """ Test that comments are correctly filtered out of tgrep search strings. """ tree = ParentedTree.fromstring("(S (NN x) (NP x) (NN x))") search1 = """ @ NP /^NP/; @ NN /^NN/; @NN """ self.assertEqual(list(tgrep.tgrep_positions(search1, [tree])), [[(0,), (2,)]]) search2 = """ # macros @ NP /^NP/; @ NN /^NN/; # search string @NN """ self.assertEqual(list(tgrep.tgrep_positions(search2, [tree])), [[(0,), (2,)]]) def test_rel_sister_nodes(self): """ Test matching sister nodes in a tree. """ tree = ParentedTree.fromstring("(S (A x) (B x) (C x))") self.assertEqual(list(tgrep.tgrep_positions("* $. B", [tree])), [[(0,)]]) self.assertEqual(list(tgrep.tgrep_positions("* $.. 
B", [tree])), [[(0,)]]) self.assertEqual(list(tgrep.tgrep_positions("* $, B", [tree])), [[(2,)]]) self.assertEqual(list(tgrep.tgrep_positions("* $,, B", [tree])), [[(2,)]]) self.assertEqual(list(tgrep.tgrep_positions("* $ B", [tree])), [[(0,), (2,)]]) def tests_rel_indexed_children(self): """ Test matching nodes based on their index in their parent node. """ tree = ParentedTree.fromstring("(S (A x) (B x) (C x))") self.assertEqual(list(tgrep.tgrep_positions("* >, S", [tree])), [[(0,)]]) self.assertEqual(list(tgrep.tgrep_positions("* >1 S", [tree])), [[(0,)]]) self.assertEqual(list(tgrep.tgrep_positions("* >2 S", [tree])), [[(1,)]]) self.assertEqual(list(tgrep.tgrep_positions("* >3 S", [tree])), [[(2,)]]) self.assertEqual(list(tgrep.tgrep_positions("* >' S", [tree])), [[(2,)]]) self.assertEqual(list(tgrep.tgrep_positions("* >-1 S", [tree])), [[(2,)]]) self.assertEqual(list(tgrep.tgrep_positions("* >-2 S", [tree])), [[(1,)]]) self.assertEqual(list(tgrep.tgrep_positions("* >-3 S", [tree])), [[(0,)]]) tree = ParentedTree.fromstring( "(S (D (A x) (B x) (C x)) (E (B x) (C x) (A x)) " "(F (C x) (A x) (B x)))" ) self.assertEqual(list(tgrep.tgrep_positions("* <, A", [tree])), [[(0,)]]) self.assertEqual(list(tgrep.tgrep_positions("* <1 A", [tree])), [[(0,)]]) self.assertEqual(list(tgrep.tgrep_positions("* <2 A", [tree])), [[(2,)]]) self.assertEqual(list(tgrep.tgrep_positions("* <3 A", [tree])), [[(1,)]]) self.assertEqual(list(tgrep.tgrep_positions("* <' A", [tree])), [[(1,)]]) self.assertEqual(list(tgrep.tgrep_positions("* <-1 A", [tree])), [[(1,)]]) self.assertEqual(list(tgrep.tgrep_positions("* <-2 A", [tree])), [[(2,)]]) self.assertEqual(list(tgrep.tgrep_positions("* <-3 A", [tree])), [[(0,)]]) def test_rel_precedence(self): """ Test matching nodes based on precedence relations. """ tree = ParentedTree.fromstring( "(S (NP (NP (PP x)) (NP (AP x)))" " (VP (AP (X (PP x)) (Y (AP x))))" " (NP (RC (NP (AP x)))))" ) self.assertEqual( list(tgrep.tgrep_positions("* . X", [tree])), [[(0,), (0, 1), (0, 1, 0)]] ) self.assertEqual( list(tgrep.tgrep_positions("* . Y", [tree])), [[(1, 0, 0), (1, 0, 0, 0)]] ) self.assertEqual( list(tgrep.tgrep_positions("* .. X", [tree])), [[(0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0)]], ) self.assertEqual( list(tgrep.tgrep_positions("* .. Y", [tree])), [[(0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0), (1, 0, 0), (1, 0, 0, 0)]], ) self.assertEqual( list(tgrep.tgrep_positions("* , X", [tree])), [[(1, 0, 1), (1, 0, 1, 0)]] ) self.assertEqual( list(tgrep.tgrep_positions("* , Y", [tree])), [[(2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)]], ) self.assertEqual( list(tgrep.tgrep_positions("* ,, X", [tree])), [[(1, 0, 1), (1, 0, 1, 0), (2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)]], ) self.assertEqual( list(tgrep.tgrep_positions("* ,, Y", [tree])), [[(2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)]], ) def test_examples(self): """ Test the Basic Examples from the TGrep2 manual. """ tree = ParentedTree.fromstring("(S (NP (AP x)) (NP (PP x)))") # This matches any NP node that immediately dominates a PP: self.assertEqual(list(tgrep.tgrep_positions("NP < PP", [tree])), [[(1,)]]) tree = ParentedTree.fromstring("(S (NP x) (VP x) (NP (PP x)) (VP x))") # This matches an NP that dominates a PP and is immediately # followed by a VP: self.assertEqual(list(tgrep.tgrep_positions("NP << PP . 
VP", [tree])), [[(2,)]]) tree = ParentedTree.fromstring( "(S (NP (AP x)) (NP (PP x)) " "(NP (DET x) (NN x)) (VP x))" ) # This matches an NP that dominates a PP or is immediately # followed by a VP: self.assertEqual( list(tgrep.tgrep_positions("NP << PP | . VP", [tree])), [[(1,), (2,)]] ) tree = ParentedTree.fromstring( "(S (NP (NP (PP x)) (NP (AP x)))" " (VP (AP (NP (PP x)) (NP (AP x))))" " (NP (RC (NP (AP x)))))" ) # This matches an NP that does not dominate a PP. Also, the NP # must either have a parent that is an NP or be dominated by a # VP: self.assertEqual( list(tgrep.tgrep_positions("NP !<< PP [> NP | >> VP]", [tree])), [[(0, 1), (1, 0, 1)]], ) tree = ParentedTree.fromstring( "(S (NP (AP (PP x) (VP x))) " "(NP (AP (PP x) (NP x))) (NP x))" ) # This matches an NP that dominates a PP which itself is # immediately followed by a VP. Note the use of parentheses to # group ". VP" with the PP rather than with the NP: self.assertEqual( list(tgrep.tgrep_positions("NP << (PP . VP)", [tree])), [[(0,)]] ) tree = ParentedTree.fromstring( "(S (NP (DET a) (NN cat) (PP (IN on) (NP x)))" " (NP (DET a) (NN cat) (PP (IN on) (NP x)) (PP x))" " (NP x))" ) # This matches an NP whose last child is a PP that begins with # the preposition "on": self.assertEqual( list(tgrep.tgrep_positions("NP <' (PP <, (IN < on))", [tree])), [[(0,)]] ) tree = ParentedTree.fromstring( "(S (S (C x) (A (B x))) (S (C x) (A x)) " "(S (D x) (A (B x))))" ) # The following pattern matches an S which has a child A and # another child that is a C and that the A has a child B: self.assertEqual( list(tgrep.tgrep_positions("S < (A < B) < C", [tree])), [[(0,)]] ) tree = ParentedTree.fromstring( "(S (S (A (B x) (C x))) (S (S (C x) (A (B x)))))" ) # However, this pattern means that S has child A and that A # has children B and C: self.assertEqual( list(tgrep.tgrep_positions("S < ((A < B) < C)", [tree])), [[(0,)]] ) # It is equivalent to this: self.assertEqual( list(tgrep.tgrep_positions("S < (A < B < C)", [tree])), [[(0,)]] ) def test_use_macros(self): """ Test defining and using tgrep2 macros. """ tree = ParentedTree.fromstring( "(VP (VB sold) (NP (DET the) " "(NN heiress)) (NP (NN deed) (PREP to) " "(NP (DET the) (NN school) (NN house))))" ) self.assertEqual( list( tgrep.tgrep_positions( "@ NP /^NP/;\n@ NN /^NN/;\n@NP !< @NP !$.. @NN", [tree] ) ), [[(1,), (2, 2)]], ) # use undefined macro @CNP self.assertRaises( tgrep.TgrepException, list, tgrep.tgrep_positions( "@ NP /^NP/;\n@ NN /^NN/;\n@CNP !< @NP !$.. @NN", [tree] ), ) def test_tokenize_node_labels(self): """Test tokenization of labeled nodes.""" self.assertEqual( tgrep.tgrep_tokenize("S < @SBJ < (@VP < (@VB $.. @OBJ))"), [ "S", "<", "@SBJ", "<", "(", "@VP", "<", "(", "@VB", "$..", "@OBJ", ")", ")", ], ) self.assertEqual( tgrep.tgrep_tokenize("S < @SBJ=s < (@VP=v < (@VB $.. @OBJ))"), [ "S", "<", "@SBJ", "=", "s", "<", "(", "@VP", "=", "v", "<", "(", "@VB", "$..", "@OBJ", ")", ")", ], ) def test_tokenize_segmented_patterns(self): """Test tokenization of segmented patterns.""" self.assertEqual( tgrep.tgrep_tokenize("S < @SBJ=s < (@VP=v < (@VB $.. @OBJ)) : =s .. =v"), [ "S", "<", "@SBJ", "=", "s", "<", "(", "@VP", "=", "v", "<", "(", "@VB", "$..", "@OBJ", ")", ")", ":", "=s", "..", "=v", ], ) def test_labeled_nodes(self): """ Test labeled nodes. Test case from Emily M. Bender. """ search = """ # macros @ SBJ /SBJ/; @ VP /VP/; @ VB /VB/; @ VPoB /V[PB]/; @ OBJ /OBJ/; # 1 svo S < @SBJ=s < (@VP=v < (@VB $.. @OBJ)) : =s .. 
=v""" sent1 = ParentedTree.fromstring( "(S (NP-SBJ I) (VP (VB eat) (NP-OBJ (NNS apples))))" ) sent2 = ParentedTree.fromstring( "(S (VP (VB eat) (NP-OBJ (NNS apples))) (NP-SBJ I))" ) search_firsthalf = search.split("\n\n")[0] + "S < @SBJ < (@VP < (@VB $.. @OBJ))" search_rewrite = "S < (/.*SBJ/ $.. (/VP/ < (/VB/ $.. /.*OBJ/)))" self.assertTrue(list(tgrep.tgrep_positions(search_firsthalf, [sent1]))[0]) self.assertTrue(list(tgrep.tgrep_positions(search, [sent1]))[0]) self.assertTrue(list(tgrep.tgrep_positions(search_rewrite, [sent1]))[0]) self.assertEqual( list(tgrep.tgrep_positions(search, [sent1])), list(tgrep.tgrep_positions(search_rewrite, [sent1])), ) self.assertTrue(list(tgrep.tgrep_positions(search_firsthalf, [sent2]))[0]) self.assertFalse(list(tgrep.tgrep_positions(search, [sent2]))[0]) self.assertFalse(list(tgrep.tgrep_positions(search_rewrite, [sent2]))[0]) self.assertEqual( list(tgrep.tgrep_positions(search, [sent2])), list(tgrep.tgrep_positions(search_rewrite, [sent2])), ) def test_multiple_conjs(self): """ Test that multiple (3 or more) conjunctions of node relations are handled properly. """ sent = ParentedTree.fromstring("((A (B b) (C c)) (A (B b) (C c) (D d)))") # search = '(A < B < C < D)' # search_tworels = '(A < B < C)' self.assertEqual( list(tgrep.tgrep_positions("(A < B < C < D)", [sent])), [[(1,)]] ) self.assertEqual( list(tgrep.tgrep_positions("(A < B < C)", [sent])), [[(0,), (1,)]] ) def test_trailing_semicolon(self): """ Test that semicolons at the end of a tgrep2 search string won't cause a parse failure. """ tree = ParentedTree.fromstring( "(S (NP (DT the) (JJ big) (NN dog)) " "(VP bit) (NP (DT a) (NN cat)))" ) self.assertEqual(list(tgrep.tgrep_positions("NN", [tree])), [[(0, 2), (2, 1)]]) self.assertEqual(list(tgrep.tgrep_positions("NN;", [tree])), [[(0, 2), (2, 1)]]) self.assertEqual( list(tgrep.tgrep_positions("NN;;", [tree])), [[(0, 2), (2, 1)]] ) nltk-3.7/nltk/test/unit/test_tokenize.py000066400000000000000000000665021420073152400205110ustar00rootroot00000000000000""" Unit tests for nltk.tokenize. See also nltk/test/tokenize.doctest """ from typing import List, Tuple import pytest from nltk.tokenize import ( LegalitySyllableTokenizer, StanfordSegmenter, SyllableTokenizer, TreebankWordTokenizer, TweetTokenizer, punkt, word_tokenize, ) def load_stanford_segmenter(): try: seg = StanfordSegmenter() seg.default_config("ar") seg.default_config("zh") return True except LookupError: return False check_stanford_segmenter = pytest.mark.skipif( not load_stanford_segmenter(), reason="NLTK was unable to find stanford-segmenter.jar.", ) class TestTokenize: def test_tweet_tokenizer(self): """ Test TweetTokenizer using words with special and accented characters. """ tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True) s9 = "@myke: Let's test these words: resumé España München français" tokens = tokenizer.tokenize(s9) expected = [ ":", "Let's", "test", "these", "words", ":", "resumé", "España", "München", "français", ] assert tokens == expected @pytest.mark.parametrize( "test_input, expecteds", [ ( "My text 0106404243030 is great text", ( ["My", "text", "01064042430", "30", "is", "great", "text"], ["My", "text", "0106404243030", "is", "great", "text"], ), ), ( "My ticket id is 1234543124123", ( ["My", "ticket", "id", "is", "12345431241", "23"], ["My", "ticket", "id", "is", "1234543124123"], ), ), ( "@remy: This is waaaaayyyy too much for you!!!!!! 
01064042430", ( [ ":", "This", "is", "waaayyy", "too", "much", "for", "you", "!", "!", "!", "01064042430", ], [ ":", "This", "is", "waaayyy", "too", "much", "for", "you", "!", "!", "!", "01064042430", ], ), ), # Further tests from https://github.com/nltk/nltk/pull/2798#issuecomment-922533085, # showing the TweetTokenizer performance for `match_phone_numbers=True` and # `match_phone_numbers=False`. ( # Some phone numbers are always tokenized, even with `match_phone_numbers=`False` "My number is 06-46124080, except it's not.", ( [ "My", "number", "is", "06-46124080", ",", "except", "it's", "not", ".", ], [ "My", "number", "is", "06-46124080", ",", "except", "it's", "not", ".", ], ), ), ( # Phone number here is only tokenized correctly if `match_phone_numbers=True` "My number is 601-984-4813, except it's not.", ( [ "My", "number", "is", "601-984-4813", ",", "except", "it's", "not", ".", ], [ "My", "number", "is", "601-984-", "4813", ",", "except", "it's", "not", ".", ], ), ), ( # Phone number here is only tokenized correctly if `match_phone_numbers=True` "My number is (393) 928 -3010, except it's not.", ( [ "My", "number", "is", "(393) 928 -3010", ",", "except", "it's", "not", ".", ], [ "My", "number", "is", "(", "393", ")", "928", "-", "3010", ",", "except", "it's", "not", ".", ], ), ), ( # A long number is tokenized correctly only if `match_phone_numbers=False` "The product identification number is 48103284512.", ( [ "The", "product", "identification", "number", "is", "4810328451", "2", ".", ], [ "The", "product", "identification", "number", "is", "48103284512", ".", ], ), ), ( # `match_phone_numbers=True` can have some unforeseen "My favourite substraction is 240 - 1353.", ( ["My", "favourite", "substraction", "is", "240 - 1353", "."], ["My", "favourite", "substraction", "is", "240", "-", "1353", "."], ), ), ], ) def test_tweet_tokenizer_expanded( self, test_input: str, expecteds: Tuple[List[str], List[str]] ): """ Test `match_phone_numbers` in TweetTokenizer. Note that TweetTokenizer is also passed the following for these tests: * strip_handles=True * reduce_len=True :param test_input: The input string to tokenize using TweetTokenizer. :type test_input: str :param expecteds: A 2-tuple of tokenized sentences. The first of the two tokenized is the expected output of tokenization with `match_phone_numbers=True`. The second of the two tokenized lists is the expected output of tokenization with `match_phone_numbers=False`. :type expecteds: Tuple[List[str], List[str]] """ for match_phone_numbers, expected in zip([True, False], expecteds): tokenizer = TweetTokenizer( strip_handles=True, reduce_len=True, match_phone_numbers=match_phone_numbers, ) predicted = tokenizer.tokenize(test_input) assert predicted == expected def test_sonority_sequencing_syllable_tokenizer(self): """ Test SyllableTokenizer tokenizer. """ tokenizer = SyllableTokenizer() tokens = tokenizer.tokenize("justification") assert tokens == ["jus", "ti", "fi", "ca", "tion"] def test_legality_principle_syllable_tokenizer(self): """ Test LegalitySyllableTokenizer tokenizer. 
""" from nltk.corpus import words test_word = "wonderful" tokenizer = LegalitySyllableTokenizer(words.words()) tokens = tokenizer.tokenize(test_word) assert tokens == ["won", "der", "ful"] @check_stanford_segmenter def test_stanford_segmenter_arabic(self): """ Test the Stanford Word Segmenter for Arabic (default config) """ seg = StanfordSegmenter() seg.default_config("ar") sent = "يبحث علم الحاسوب استخدام الحوسبة بجميع اشكالها لحل المشكلات" segmented_sent = seg.segment(sent.split()) assert segmented_sent.split() == [ "يبحث", "علم", "الحاسوب", "استخدام", "الحوسبة", "ب", "جميع", "اشكال", "ها", "ل", "حل", "المشكلات", ] @check_stanford_segmenter def test_stanford_segmenter_chinese(self): """ Test the Stanford Word Segmenter for Chinese (default config) """ seg = StanfordSegmenter() seg.default_config("zh") sent = "这是斯坦福中文分词器测试" segmented_sent = seg.segment(sent.split()) assert segmented_sent.split() == ["这", "是", "斯坦福", "中文", "分词器", "测试"] def test_phone_tokenizer(self): """ Test a string that resembles a phone number but contains a newline """ # Should be recognized as a phone number, albeit one with multiple spaces tokenizer = TweetTokenizer() test1 = "(393) 928 -3010" expected = ["(393) 928 -3010"] result = tokenizer.tokenize(test1) assert result == expected # Due to newline, first three elements aren't part of a phone number; # fourth is test2 = "(393)\n928 -3010" expected = ["(", "393", ")", "928 -3010"] result = tokenizer.tokenize(test2) assert result == expected def test_emoji_tokenizer(self): """ Test a string that contains Emoji ZWJ Sequences and skin tone modifier """ tokenizer = TweetTokenizer() # A Emoji ZWJ Sequences, they together build as a single emoji, should not be split. test1 = "👨‍👩‍👧‍👧" expected = ["👨‍👩‍👧‍👧"] result = tokenizer.tokenize(test1) assert result == expected # A Emoji with skin tone modifier, the two characters build a single emoji, should not be split. test2 = "👨🏿" expected = ["👨🏿"] result = tokenizer.tokenize(test2) assert result == expected # A string containing both skin tone modifier and ZWJ Sequences test3 = "🤔 🙈 me así, se😌 ds 💕👭👙 hello 👩🏾‍🎓 emoji hello 👨‍👩‍👦‍👦 how are 😊 you today🙅🏽🙅🏽" expected = [ "🤔", "🙈", "me", "así", ",", "se", "😌", "ds", "💕", "👭", "👙", "hello", "👩🏾\u200d🎓", "emoji", "hello", "👨\u200d👩\u200d👦\u200d👦", "how", "are", "😊", "you", "today", "🙅🏽", "🙅🏽", ] result = tokenizer.tokenize(test3) assert result == expected def test_pad_asterisk(self): """ Test padding of asterisk for word tokenization. """ text = "This is a, *weird sentence with *asterisks in it." expected = [ "This", "is", "a", ",", "*", "weird", "sentence", "with", "*", "asterisks", "in", "it", ".", ] assert word_tokenize(text) == expected def test_pad_dotdot(self): """ Test padding of dotdot* for word tokenization. """ text = "Why did dotdot.. not get tokenized but dotdotdot... did? How about manydots....." expected = [ "Why", "did", "dotdot", "..", "not", "get", "tokenized", "but", "dotdotdot", "...", "did", "?", "How", "about", "manydots", ".....", ] assert word_tokenize(text) == expected def test_remove_handle(self): """ Test remove_handle() from casual.py with specially crafted edge cases """ tokenizer = TweetTokenizer(strip_handles=True) # Simple example. Handles with just numbers should be allowed test1 = "@twitter hello @twi_tter_. 
hi @12345 @123news" expected = ["hello", ".", "hi"] result = tokenizer.tokenize(test1) assert result == expected # Handles are allowed to follow any of the following characters test2 = "@n`@n~@n(@n)@n-@n=@n+@n\\@n|@n[@n]@n{@n}@n;@n:@n'@n\"@n/@n?@n.@n,@n<@n>@n @n\n@n ñ@n.ü@n.ç@n." expected = [ "`", "~", "(", ")", "-", "=", "+", "\\", "|", "[", "]", "{", "}", ";", ":", "'", '"', "/", "?", ".", ",", "<", ">", "ñ", ".", "ü", ".", "ç", ".", ] result = tokenizer.tokenize(test2) assert result == expected # Handles are NOT allowed to follow any of the following characters test3 = "a@n j@n z@n A@n L@n Z@n 1@n 4@n 7@n 9@n 0@n _@n !@n @@n #@n $@n %@n &@n *@n" expected = [ "a", "@n", "j", "@n", "z", "@n", "A", "@n", "L", "@n", "Z", "@n", "1", "@n", "4", "@n", "7", "@n", "9", "@n", "0", "@n", "_", "@n", "!", "@n", "@", "@n", "#", "@n", "$", "@n", "%", "@n", "&", "@n", "*", "@n", ] result = tokenizer.tokenize(test3) assert result == expected # Handles are allowed to precede the following characters test4 = "@n!a @n#a @n$a @n%a @n&a @n*a" expected = ["!", "a", "#", "a", "$", "a", "%", "a", "&", "a", "*", "a"] result = tokenizer.tokenize(test4) assert result == expected # Tests interactions with special symbols and multiple @ test5 = "@n!@n @n#@n @n$@n @n%@n @n&@n @n*@n @n@n @@n @n@@n @n_@n @n7@n @nj@n" expected = [ "!", "@n", "#", "@n", "$", "@n", "%", "@n", "&", "@n", "*", "@n", "@n", "@n", "@", "@n", "@n", "@", "@n", "@n_", "@n", "@n7", "@n", "@nj", "@n", ] result = tokenizer.tokenize(test5) assert result == expected # Tests that handles can have a max length of 15 test6 = "@abcdefghijklmnopqrstuvwxyz @abcdefghijklmno1234 @abcdefghijklmno_ @abcdefghijklmnoendofhandle" expected = ["pqrstuvwxyz", "1234", "_", "endofhandle"] result = tokenizer.tokenize(test6) assert result == expected # Edge case where an @ comes directly after a long handle test7 = "@abcdefghijklmnop@abcde @abcdefghijklmno@abcde @abcdefghijklmno_@abcde @abcdefghijklmno5@abcde" expected = [ "p", "@abcde", "@abcdefghijklmno", "@abcde", "_", "@abcde", "5", "@abcde", ] result = tokenizer.tokenize(test7) assert result == expected def test_treebank_span_tokenizer(self): """ Test TreebankWordTokenizer.span_tokenize function """ tokenizer = TreebankWordTokenizer() # Test case in the docstring test1 = "Good muffins cost $3.88\nin New (York). Please (buy) me\ntwo of them.\n(Thanks)." 
expected = [ (0, 4), (5, 12), (13, 17), (18, 19), (19, 23), (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38), (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59), (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78), ] result = list(tokenizer.span_tokenize(test1)) assert result == expected # Test case with double quotation test2 = 'The DUP is similar to the "religious right" in the United States and takes a hardline stance on social issues' expected = [ (0, 3), (4, 7), (8, 10), (11, 18), (19, 21), (22, 25), (26, 27), (27, 36), (37, 42), (42, 43), (44, 46), (47, 50), (51, 57), (58, 64), (65, 68), (69, 74), (75, 76), (77, 85), (86, 92), (93, 95), (96, 102), (103, 109), ] result = list(tokenizer.span_tokenize(test2)) assert result == expected # Test case with double qoutation as well as converted quotations test3 = "The DUP is similar to the \"religious right\" in the United States and takes a ``hardline'' stance on social issues" expected = [ (0, 3), (4, 7), (8, 10), (11, 18), (19, 21), (22, 25), (26, 27), (27, 36), (37, 42), (42, 43), (44, 46), (47, 50), (51, 57), (58, 64), (65, 68), (69, 74), (75, 76), (77, 79), (79, 87), (87, 89), (90, 96), (97, 99), (100, 106), (107, 113), ] result = list(tokenizer.span_tokenize(test3)) assert result == expected def test_word_tokenize(self): """ Test word_tokenize function """ sentence = "The 'v', I've been fooled but I'll seek revenge." expected = [ "The", "'", "v", "'", ",", "I", "'ve", "been", "fooled", "but", "I", "'ll", "seek", "revenge", ".", ] assert word_tokenize(sentence) == expected sentence = "'v' 're'" expected = ["'", "v", "'", "'re", "'"] assert word_tokenize(sentence) == expected def test_punkt_pair_iter(self): test_cases = [ ("12", [("1", "2"), ("2", None)]), ("123", [("1", "2"), ("2", "3"), ("3", None)]), ("1234", [("1", "2"), ("2", "3"), ("3", "4"), ("4", None)]), ] for (test_input, expected_output) in test_cases: actual_output = [x for x in punkt._pair_iter(test_input)] assert actual_output == expected_output def test_punkt_pair_iter_handles_stop_iteration_exception(self): # test input to trigger StopIteration from next() it = iter([]) # call method under test and produce a generator gen = punkt._pair_iter(it) # unpack generator, ensure that no error is raised list(gen) def test_punkt_tokenize_words_handles_stop_iteration_exception(self): obj = punkt.PunktBaseClass() class TestPunktTokenizeWordsMock: def word_tokenize(self, s): return iter([]) obj._lang_vars = TestPunktTokenizeWordsMock() # unpack generator, ensure that no error is raised list(obj._tokenize_words("test")) def test_punkt_tokenize_custom_lang_vars(self): # Create LangVars including a full stop end character as used in Bengali class BengaliLanguageVars(punkt.PunktLanguageVars): sent_end_chars = (".", "?", "!", "\u0964") obj = punkt.PunktSentenceTokenizer(lang_vars=BengaliLanguageVars()) # We now expect these sentences to be split up into the individual sentences sentences = "উপরাষ্ট্রপতি শ্রী এম ভেঙ্কাইয়া নাইডু সোমবার আই আই টি দিল্লির হীরক জয়ন্তী উদযাপনের উদ্বোধন করেছেন। অনলাইনের মাধ্যমে এই অনুষ্ঠানে কেন্দ্রীয় মানব সম্পদ উন্নয়নমন্ত্রী শ্রী রমেশ পোখরিয়াল ‘নিশাঙ্ক’ উপস্থিত ছিলেন। এই উপলক্ষ্যে উপরাষ্ট্রপতি হীরকজয়ন্তীর লোগো এবং ২০৩০-এর জন্য প্রতিষ্ঠানের লক্ষ্য ও পরিকল্পনার নথি প্রকাশ করেছেন।" expected = [ "উপরাষ্ট্রপতি শ্রী এম ভেঙ্কাইয়া নাইডু সোমবার আই আই টি দিল্লির হীরক জয়ন্তী উদযাপনের উদ্বোধন করেছেন।", "অনলাইনের মাধ্যমে এই অনুষ্ঠানে কেন্দ্রীয় মানব সম্পদ উন্নয়নমন্ত্রী শ্রী রমেশ পোখরিয়াল ‘নিশাঙ্ক’ উপস্থিত ছিলেন।", "এই উপলক্ষ্যে উপরাষ্ট্রপতি 
হীরকজয়ন্তীর লোগো এবং ২০৩০-এর জন্য প্রতিষ্ঠানের লক্ষ্য ও পরিকল্পনার নথি প্রকাশ করেছেন।", ] assert obj.tokenize(sentences) == expected def test_punkt_tokenize_no_custom_lang_vars(self): obj = punkt.PunktSentenceTokenizer() # We expect these sentences to not be split properly, as the Bengali full stop '।' is not included in the default language vars sentences = "উপরাষ্ট্রপতি শ্রী এম ভেঙ্কাইয়া নাইডু সোমবার আই আই টি দিল্লির হীরক জয়ন্তী উদযাপনের উদ্বোধন করেছেন। অনলাইনের মাধ্যমে এই অনুষ্ঠানে কেন্দ্রীয় মানব সম্পদ উন্নয়নমন্ত্রী শ্রী রমেশ পোখরিয়াল ‘নিশাঙ্ক’ উপস্থিত ছিলেন। এই উপলক্ষ্যে উপরাষ্ট্রপতি হীরকজয়ন্তীর লোগো এবং ২০৩০-এর জন্য প্রতিষ্ঠানের লক্ষ্য ও পরিকল্পনার নথি প্রকাশ করেছেন।" expected = [ "উপরাষ্ট্রপতি শ্রী এম ভেঙ্কাইয়া নাইডু সোমবার আই আই টি দিল্লির হীরক জয়ন্তী উদযাপনের উদ্বোধন করেছেন। অনলাইনের মাধ্যমে এই অনুষ্ঠানে কেন্দ্রীয় মানব সম্পদ উন্নয়নমন্ত্রী শ্রী রমেশ পোখরিয়াল ‘নিশাঙ্ক’ উপস্থিত ছিলেন। এই উপলক্ষ্যে উপরাষ্ট্রপতি হীরকজয়ন্তীর লোগো এবং ২০৩০-এর জন্য প্রতিষ্ঠানের লক্ষ্য ও পরিকল্পনার নথি প্রকাশ করেছেন।" ] assert obj.tokenize(sentences) == expected @pytest.mark.parametrize( "input_text,n_sents,n_splits,lang_vars", [ # Test debug_decisions on a text with two sentences, split by a dot. ("Subject: Some subject. Attachments: Some attachments", 2, 1), # The sentence should be split into two sections, # with one split and hence one decision. # Test debug_decisions on a text with two sentences, split by an exclamation mark. ("Subject: Some subject! Attachments: Some attachments", 2, 1), # The sentence should be split into two sections, # with one split and hence one decision. # Test debug_decisions on a text with one sentences, # which is not split. ("This is just a normal sentence, just like any other.", 1, 0) # Hence just 1 ], ) def punkt_debug_decisions(self, input_text, n_sents, n_splits, lang_vars=None): tokenizer = punkt.PunktSentenceTokenizer() if lang_vars != None: tokenizer._lang_vars = lang_vars assert len(tokenizer.tokenize(input_text)) == n_sents assert len(list(tokenizer.debug_decisions(input_text))) == n_splits def test_punkt_debug_decisions_custom_end(self): # Test debug_decisions on a text with two sentences, # split by a custom end character, based on Issue #2519 class ExtLangVars(punkt.PunktLanguageVars): sent_end_chars = (".", "?", "!", "^") self.punkt_debug_decisions( "Subject: Some subject^ Attachments: Some attachments", n_sents=2, n_splits=1, lang_vars=ExtLangVars(), ) # The sentence should be split into two sections, # with one split and hence one decision. nltk-3.7/nltk/test/unit/test_twitter_auth.py000066400000000000000000000046001420073152400213730ustar00rootroot00000000000000""" Tests for static parts of Twitter package """ import os import pytest pytest.importorskip("twython") from nltk.twitter import Authenticate @pytest.fixture def auth(): return Authenticate() class TestCredentials: """ Tests that Twitter credentials from a file are handled correctly. """ @classmethod def setup_class(self): self.subdir = os.path.join(os.path.dirname(__file__), "files") os.environ["TWITTER"] = "twitter-files" def test_environment(self, auth): """ Test that environment variable has been read correctly. 
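        The basename of ``auth.creds_subdir`` should match the value of the ``TWITTER``
        environment variable set in ``setup_class``.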
""" fn = os.path.basename(auth.creds_subdir) assert fn == os.environ["TWITTER"] @pytest.mark.parametrize( "kwargs", [ # Each of the following scenarios should raise an error: # An empty subdir path {"subdir": ""}, # A subdir path of None {"subdir": None}, # A nonexistent directory {"subdir": "/nosuchdir"}, # 'credentials.txt' is not in default subdir, as read from `os.environ['TWITTER']` {}, # Nonexistent credentials file ('foobar') {"creds_file": "foobar"}, # 'bad_oauth1-1.txt' is incomplete {"creds_file": "bad_oauth1-1.txt"}, # The first key in credentials file 'bad_oauth1-2.txt' is ill-formed {"creds_file": "bad_oauth1-2.txt"}, # The first two lines in 'bad_oauth1-3.txt' are collapsed {"creds_file": "bad_oauth1-3.txt"}, ], ) def test_scenarios_that_should_raise_errors(self, kwargs, auth): """Various scenarios that should raise errors""" try: auth.load_creds(**kwargs) # raises ValueError (zero length field name in format) for python 2.6 # OSError for the rest except (OSError, ValueError): pass except Exception as e: pytest.fail("Unexpected exception thrown: %s" % e) else: pytest.fail("OSError exception not thrown.") def test_correct_file(self, auth): """Test that a proper file succeeds and is read correctly""" oauth = auth.load_creds(subdir=self.subdir) assert auth.creds_fullpath == os.path.join(self.subdir, auth.creds_file) assert auth.creds_file == "credentials.txt" assert oauth["app_key"] == "a" nltk-3.7/nltk/test/unit/test_util.py000066400000000000000000000034161420073152400176310ustar00rootroot00000000000000import pytest from nltk.util import everygrams @pytest.fixture def everygram_input(): """Form test data for tests.""" return iter(["a", "b", "c"]) def test_everygrams_without_padding(everygram_input): expected_output = [ ("a",), ("a", "b"), ("a", "b", "c"), ("b",), ("b", "c"), ("c",), ] output = list(everygrams(everygram_input)) assert output == expected_output def test_everygrams_max_len(everygram_input): expected_output = [ ("a",), ("a", "b"), ("b",), ("b", "c"), ("c",), ] output = list(everygrams(everygram_input, max_len=2)) assert output == expected_output def test_everygrams_min_len(everygram_input): expected_output = [ ("a", "b"), ("a", "b", "c"), ("b", "c"), ] output = list(everygrams(everygram_input, min_len=2)) assert output == expected_output def test_everygrams_pad_right(everygram_input): expected_output = [ ("a",), ("a", "b"), ("a", "b", "c"), ("b",), ("b", "c"), ("b", "c", None), ("c",), ("c", None), ("c", None, None), (None,), (None, None), (None,), ] output = list(everygrams(everygram_input, max_len=3, pad_right=True)) assert output == expected_output def test_everygrams_pad_left(everygram_input): expected_output = [ (None,), (None, None), (None, None, "a"), (None,), (None, "a"), (None, "a", "b"), ("a",), ("a", "b"), ("a", "b", "c"), ("b",), ("b", "c"), ("c",), ] output = list(everygrams(everygram_input, max_len=3, pad_left=True)) assert output == expected_output nltk-3.7/nltk/test/unit/test_wordnet.py000066400000000000000000000214741420073152400203420ustar00rootroot00000000000000""" Unit tests for nltk.corpus.wordnet See also nltk/test/wordnet.doctest """ import unittest from nltk.corpus import wordnet as wn from nltk.corpus import wordnet_ic as wnic wn.ensure_loaded() S = wn.synset L = wn.lemma class WordnNetDemo(unittest.TestCase): def test_retrieve_synset(self): move_synset = S("go.v.21") self.assertEqual(move_synset.name(), "move.v.15") self.assertEqual(move_synset.lemma_names(), ["move", "go"]) self.assertEqual( move_synset.definition(), "have a turn; make 
one's move in a game" ) self.assertEqual(move_synset.examples(), ["Can I go now?"]) def test_retrieve_synsets(self): self.assertEqual(sorted(wn.synsets("zap", pos="n")), [S("zap.n.01")]) self.assertEqual( sorted(wn.synsets("zap", pos="v")), [S("microwave.v.01"), S("nuke.v.01"), S("zap.v.01"), S("zap.v.02")], ) def test_hyperhyponyms(self): # Not every synset as hypernyms() self.assertEqual(S("travel.v.01").hypernyms(), []) self.assertEqual(S("travel.v.02").hypernyms(), [S("travel.v.03")]) self.assertEqual(S("travel.v.03").hypernyms(), []) # Test hyper-/hyponyms. self.assertEqual(S("breakfast.n.1").hypernyms(), [S("meal.n.01")]) first_five_meal_hypo = [ S("banquet.n.02"), S("bite.n.04"), S("breakfast.n.01"), S("brunch.n.01"), S("buffet.n.02"), ] self.assertEqual(sorted(S("meal.n.1").hyponyms()[:5]), first_five_meal_hypo) self.assertEqual(S("Austen.n.1").instance_hypernyms(), [S("writer.n.01")]) first_five_composer_hypo = [ S("ambrose.n.01"), S("bach.n.01"), S("barber.n.01"), S("bartok.n.01"), S("beethoven.n.01"), ] self.assertEqual( S("composer.n.1").instance_hyponyms()[:5], first_five_composer_hypo ) # Test root hyper-/hyponyms self.assertEqual(S("person.n.01").root_hypernyms(), [S("entity.n.01")]) self.assertEqual(S("sail.v.01").root_hypernyms(), [S("travel.v.01")]) self.assertEqual( S("fall.v.12").root_hypernyms(), [S("act.v.01"), S("fall.v.17")] ) def test_derivationally_related_forms(self): # Test `derivationally_related_forms()` self.assertEqual( L("zap.v.03.nuke").derivationally_related_forms(), [L("atomic_warhead.n.01.nuke")], ) self.assertEqual( L("zap.v.03.atomize").derivationally_related_forms(), [L("atomization.n.02.atomization")], ) self.assertEqual( L("zap.v.03.atomise").derivationally_related_forms(), [L("atomization.n.02.atomisation")], ) self.assertEqual(L("zap.v.03.zap").derivationally_related_forms(), []) def test_meronyms_holonyms(self): # Test meronyms, holonyms. self.assertEqual( S("dog.n.01").member_holonyms(), [S("canis.n.01"), S("pack.n.06")] ) self.assertEqual(S("dog.n.01").part_meronyms(), [S("flag.n.07")]) self.assertEqual(S("faculty.n.2").member_meronyms(), [S("professor.n.01")]) self.assertEqual(S("copilot.n.1").member_holonyms(), [S("crew.n.01")]) self.assertEqual( S("table.n.2").part_meronyms(), [S("leg.n.03"), S("tabletop.n.01"), S("tableware.n.01")], ) self.assertEqual(S("course.n.7").part_holonyms(), [S("meal.n.01")]) self.assertEqual( S("water.n.1").substance_meronyms(), [S("hydrogen.n.01"), S("oxygen.n.01")] ) self.assertEqual( S("gin.n.1").substance_holonyms(), [ S("gin_and_it.n.01"), S("gin_and_tonic.n.01"), S("martini.n.01"), S("pink_lady.n.01"), ], ) def test_antonyms(self): # Test antonyms. self.assertEqual( L("leader.n.1.leader").antonyms(), [L("follower.n.01.follower")] ) self.assertEqual( L("increase.v.1.increase").antonyms(), [L("decrease.v.01.decrease")] ) def test_misc_relations(self): # Test misc relations. self.assertEqual(S("snore.v.1").entailments(), [S("sleep.v.01")]) self.assertEqual( S("heavy.a.1").similar_tos(), [ S("dense.s.03"), S("doughy.s.01"), S("heavier-than-air.s.01"), S("hefty.s.02"), S("massive.s.04"), S("non-buoyant.s.01"), S("ponderous.s.02"), ], ) self.assertEqual(S("light.a.1").attributes(), [S("weight.n.01")]) self.assertEqual(S("heavy.a.1").attributes(), [S("weight.n.01")]) # Test pertainyms. self.assertEqual( L("English.a.1.English").pertainyms(), [L("england.n.01.England")] ) def test_lch(self): # Test LCH. 
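        # lowest_common_hypernyms() should return the most specific ancestor
        # synset(s) shared by both synsets.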
self.assertEqual( S("person.n.01").lowest_common_hypernyms(S("dog.n.01")), [S("organism.n.01")], ) self.assertEqual( S("woman.n.01").lowest_common_hypernyms(S("girlfriend.n.02")), [S("woman.n.01")], ) def test_domains(self): # Test domains. self.assertEqual(S("code.n.03").topic_domains(), [S("computer_science.n.01")]) self.assertEqual(S("pukka.a.01").region_domains(), [S("india.n.01")]) self.assertEqual(S("freaky.a.01").usage_domains(), [S("slang.n.02")]) def test_in_topic_domains(self): # Test in domains. self.assertEqual( S("computer_science.n.01").in_topic_domains()[0], S("access.n.05") ) self.assertEqual(S("germany.n.01").in_region_domains()[23], S("trillion.n.02")) self.assertEqual(S("slang.n.02").in_usage_domains()[1], S("airhead.n.01")) def test_wordnet_similarities(self): # Path based similarities. self.assertAlmostEqual(S("cat.n.01").path_similarity(S("cat.n.01")), 1.0) self.assertAlmostEqual(S("dog.n.01").path_similarity(S("cat.n.01")), 0.2) self.assertAlmostEqual( S("car.n.01").path_similarity(S("automobile.v.01")), S("automobile.v.01").path_similarity(S("car.n.01")), ) self.assertAlmostEqual( S("big.a.01").path_similarity(S("dog.n.01")), S("dog.n.01").path_similarity(S("big.a.01")), ) self.assertAlmostEqual( S("big.a.01").path_similarity(S("long.a.01")), S("long.a.01").path_similarity(S("big.a.01")), ) self.assertAlmostEqual( S("dog.n.01").lch_similarity(S("cat.n.01")), 2.028, places=3 ) self.assertAlmostEqual( S("dog.n.01").wup_similarity(S("cat.n.01")), 0.8571, places=3 ) self.assertAlmostEqual( S("car.n.01").wup_similarity(S("automobile.v.01")), S("automobile.v.01").wup_similarity(S("car.n.01")), ) self.assertAlmostEqual( S("big.a.01").wup_similarity(S("dog.n.01")), S("dog.n.01").wup_similarity(S("big.a.01")), ) self.assertAlmostEqual( S("big.a.01").wup_similarity(S("long.a.01")), S("long.a.01").wup_similarity(S("big.a.01")), ) self.assertAlmostEqual( S("big.a.01").lch_similarity(S("long.a.01")), S("long.a.01").lch_similarity(S("big.a.01")), ) # Information Content similarities. brown_ic = wnic.ic("ic-brown.dat") self.assertAlmostEqual( S("dog.n.01").jcn_similarity(S("cat.n.01"), brown_ic), 0.4497, places=3 ) semcor_ic = wnic.ic("ic-semcor.dat") self.assertAlmostEqual( S("dog.n.01").lin_similarity(S("cat.n.01"), semcor_ic), 0.8863, places=3 ) def test_omw_lemma_no_trailing_underscore(self): expected = sorted( [ "popolna_sprememba_v_mišljenju", "popoln_obrat", "preobrat", "preobrat_v_mišljenju", ] ) self.assertEqual(sorted(S("about-face.n.02").lemma_names(lang="slv")), expected) def test_iterable_type_for_all_lemma_names(self): # Duck-test for iterables. 
# See https://stackoverflow.com/a/36230057/610569 cat_lemmas = wn.all_lemma_names(lang="cat") eng_lemmas = wn.all_lemma_names(lang="eng") self.assertTrue(hasattr(eng_lemmas, "__iter__")) self.assertTrue(hasattr(eng_lemmas, "__next__") or hasattr(eng_lemmas, "next")) self.assertTrue(eng_lemmas.__iter__() is eng_lemmas) self.assertTrue(hasattr(cat_lemmas, "__iter__")) self.assertTrue(hasattr(cat_lemmas, "__next__") or hasattr(eng_lemmas, "next")) self.assertTrue(cat_lemmas.__iter__() is cat_lemmas) nltk-3.7/nltk/test/unit/translate/000077500000000000000000000000001420073152400172345ustar00rootroot00000000000000nltk-3.7/nltk/test/unit/translate/__init__.py000066400000000000000000000000001420073152400213330ustar00rootroot00000000000000nltk-3.7/nltk/test/unit/translate/test_bleu.py000066400000000000000000000361551420073152400216060ustar00rootroot00000000000000""" Tests for BLEU translation evaluation metric """ import io import unittest from nltk.data import find from nltk.translate.bleu_score import ( SmoothingFunction, brevity_penalty, closest_ref_length, corpus_bleu, modified_precision, sentence_bleu, ) class TestBLEU(unittest.TestCase): def test_modified_precision(self): """ Examples from the original BLEU paper https://www.aclweb.org/anthology/P02-1040.pdf """ # Example 1: the "the*" example. # Reference sentences. ref1 = "the cat is on the mat".split() ref2 = "there is a cat on the mat".split() # Hypothesis sentence(s). hyp1 = "the the the the the the the".split() references = [ref1, ref2] # Testing modified unigram precision. hyp1_unigram_precision = float(modified_precision(references, hyp1, n=1)) assert round(hyp1_unigram_precision, 4) == 0.2857 # With assertAlmostEqual at 4 place precision. self.assertAlmostEqual(hyp1_unigram_precision, 0.28571428, places=4) # Testing modified bigram precision. assert float(modified_precision(references, hyp1, n=2)) == 0.0 # Example 2: the "of the" example. # Reference sentences ref1 = str( "It is a guide to action that ensures that the military " "will forever heed Party commands" ).split() ref2 = str( "It is the guiding principle which guarantees the military " "forces always being under the command of the Party" ).split() ref3 = str( "It is the practical guide for the army always to heed " "the directions of the party" ).split() # Hypothesis sentence(s). hyp1 = "of the".split() references = [ref1, ref2, ref3] # Testing modified unigram precision. assert float(modified_precision(references, hyp1, n=1)) == 1.0 # Testing modified bigram precision. assert float(modified_precision(references, hyp1, n=2)) == 1.0 # Example 3: Proper MT outputs. hyp1 = str( "It is a guide to action which ensures that the military " "always obeys the commands of the party" ).split() hyp2 = str( "It is to insure the troops forever hearing the activity " "guidebook that party direct" ).split() references = [ref1, ref2, ref3] # Unigram precision. hyp1_unigram_precision = float(modified_precision(references, hyp1, n=1)) hyp2_unigram_precision = float(modified_precision(references, hyp2, n=1)) # Test unigram precision with assertAlmostEqual at 4 place precision. self.assertAlmostEqual(hyp1_unigram_precision, 0.94444444, places=4) self.assertAlmostEqual(hyp2_unigram_precision, 0.57142857, places=4) # Test unigram precision with rounding. 
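        # These round to 0.9444 (= 17/18 matching unigrams for hyp1) and
        # 0.5714 (= 8/14 for hyp2), the values given in the BLEU paper cited above.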
assert round(hyp1_unigram_precision, 4) == 0.9444 assert round(hyp2_unigram_precision, 4) == 0.5714 # Bigram precision hyp1_bigram_precision = float(modified_precision(references, hyp1, n=2)) hyp2_bigram_precision = float(modified_precision(references, hyp2, n=2)) # Test bigram precision with assertAlmostEqual at 4 place precision. self.assertAlmostEqual(hyp1_bigram_precision, 0.58823529, places=4) self.assertAlmostEqual(hyp2_bigram_precision, 0.07692307, places=4) # Test bigram precision with rounding. assert round(hyp1_bigram_precision, 4) == 0.5882 assert round(hyp2_bigram_precision, 4) == 0.0769 def test_brevity_penalty(self): # Test case from brevity_penalty_closest function in mteval-v13a.pl. # Same test cases as in the doctest in nltk.translate.bleu_score.py references = [["a"] * 11, ["a"] * 8] hypothesis = ["a"] * 7 hyp_len = len(hypothesis) closest_ref_len = closest_ref_length(references, hyp_len) self.assertAlmostEqual( brevity_penalty(closest_ref_len, hyp_len), 0.8669, places=4 ) references = [["a"] * 11, ["a"] * 8, ["a"] * 6, ["a"] * 7] hypothesis = ["a"] * 7 hyp_len = len(hypothesis) closest_ref_len = closest_ref_length(references, hyp_len) assert brevity_penalty(closest_ref_len, hyp_len) == 1.0 def test_zero_matches(self): # Test case where there's 0 matches references = ["The candidate has no alignment to any of the references".split()] hypothesis = "John loves Mary".split() # Test BLEU to nth order of n-grams, where n is len(hypothesis). for n in range(1, len(hypothesis)): weights = (1.0 / n,) * n # Uniform weights. assert sentence_bleu(references, hypothesis, weights) == 0 def test_full_matches(self): # Test case where there's 100% matches references = ["John loves Mary".split()] hypothesis = "John loves Mary".split() # Test BLEU to nth order of n-grams, where n is len(hypothesis). for n in range(1, len(hypothesis)): weights = (1.0 / n,) * n # Uniform weights. assert sentence_bleu(references, hypothesis, weights) == 1.0 def test_partial_matches_hypothesis_longer_than_reference(self): references = ["John loves Mary".split()] hypothesis = "John loves Mary who loves Mike".split() # Since no 4-grams matches were found the result should be zero # exp(w_1 * 1 * w_2 * 1 * w_3 * 1 * w_4 * -inf) = 0 self.assertAlmostEqual(sentence_bleu(references, hypothesis), 0.0, places=4) # Checks that the warning has been raised because len(reference) < 4. try: self.assertWarns(UserWarning, sentence_bleu, references, hypothesis) except AttributeError: pass # unittest.TestCase.assertWarns is only supported in Python >= 3.2. # @unittest.skip("Skipping fringe cases for BLEU.") class TestBLEUFringeCases(unittest.TestCase): def test_case_where_n_is_bigger_than_hypothesis_length(self): # Test BLEU to nth order of n-grams, where n > len(hypothesis). references = ["John loves Mary ?".split()] hypothesis = "John loves Mary".split() n = len(hypothesis) + 1 # weights = (1.0 / n,) * n # Uniform weights. # Since no n-grams matches were found the result should be zero # exp(w_1 * 1 * w_2 * 1 * w_3 * 1 * w_4 * -inf) = 0 self.assertAlmostEqual( sentence_bleu(references, hypothesis, weights), 0.0, places=4 ) # Checks that the warning has been raised because len(hypothesis) < 4. try: self.assertWarns(UserWarning, sentence_bleu, references, hypothesis) except AttributeError: pass # unittest.TestCase.assertWarns is only supported in Python >= 3.2. # Test case where n > len(hypothesis) but so is n > len(reference), and # it's a special case where reference == hypothesis. 
references = ["John loves Mary".split()] hypothesis = "John loves Mary".split() # Since no 4-grams matches were found the result should be zero # exp(w_1 * 1 * w_2 * 1 * w_3 * 1 * w_4 * -inf) = 0 self.assertAlmostEqual( sentence_bleu(references, hypothesis, weights), 0.0, places=4 ) def test_empty_hypothesis(self): # Test case where there's hypothesis is empty. references = ["The candidate has no alignment to any of the references".split()] hypothesis = [] assert sentence_bleu(references, hypothesis) == 0 def test_length_one_hypothesis(self): # Test case where there's hypothesis is of length 1 in Smoothing method 4. references = ["The candidate has no alignment to any of the references".split()] hypothesis = ["Foo"] method4 = SmoothingFunction().method4 try: sentence_bleu(references, hypothesis, smoothing_function=method4) except ValueError: pass # unittest.TestCase.assertWarns is only supported in Python >= 3.2. def test_empty_references(self): # Test case where there's reference is empty. references = [[]] hypothesis = "John loves Mary".split() assert sentence_bleu(references, hypothesis) == 0 def test_empty_references_and_hypothesis(self): # Test case where both references and hypothesis is empty. references = [[]] hypothesis = [] assert sentence_bleu(references, hypothesis) == 0 def test_reference_or_hypothesis_shorter_than_fourgrams(self): # Test case where the length of reference or hypothesis # is shorter than 4. references = ["let it go".split()] hypothesis = "let go it".split() # Checks that the value the hypothesis and reference returns is 0.0 # exp(w_1 * 1 * w_2 * 1 * w_3 * 1 * w_4 * -inf) = 0 self.assertAlmostEqual(sentence_bleu(references, hypothesis), 0.0, places=4) # Checks that the warning has been raised. try: self.assertWarns(UserWarning, sentence_bleu, references, hypothesis) except AttributeError: pass # unittest.TestCase.assertWarns is only supported in Python >= 3.2. class TestBLEUvsMteval13a(unittest.TestCase): def test_corpus_bleu(self): ref_file = find("models/wmt15_eval/ref.ru") hyp_file = find("models/wmt15_eval/google.ru") mteval_output_file = find("models/wmt15_eval/mteval-13a.output") # Reads the BLEU scores from the `mteval-13a.output` file. # The order of the list corresponds to the order of the ngrams. with open(mteval_output_file) as mteval_fin: # The numbers are located in the last 2nd line of the file. # The first and 2nd item in the list are the score and system names. mteval_bleu_scores = map(float, mteval_fin.readlines()[-2].split()[1:-1]) with open(ref_file, encoding="utf8") as ref_fin: with open(hyp_file, encoding="utf8") as hyp_fin: # Whitespace tokenize the file. # Note: split() automatically strip(). hypothesis = list(map(lambda x: x.split(), hyp_fin)) # Note that the corpus_bleu input is list of list of references. references = list(map(lambda x: [x.split()], ref_fin)) # Without smoothing. for i, mteval_bleu in zip(range(1, 10), mteval_bleu_scores): nltk_bleu = corpus_bleu( references, hypothesis, weights=(1.0 / i,) * i ) # Check that the BLEU scores difference is less than 0.005 . # Note: This is an approximate comparison; as much as # +/- 0.01 BLEU might be "statistically significant", # the actual translation quality might not be. 
assert abs(mteval_bleu - nltk_bleu) < 0.005 # With the same smoothing method used in mteval-v13a.pl chencherry = SmoothingFunction() for i, mteval_bleu in zip(range(1, 10), mteval_bleu_scores): nltk_bleu = corpus_bleu( references, hypothesis, weights=(1.0 / i,) * i, smoothing_function=chencherry.method3, ) assert abs(mteval_bleu - nltk_bleu) < 0.005 class TestBLEUWithBadSentence(unittest.TestCase): def test_corpus_bleu_with_bad_sentence(self): hyp = "Teo S yb , oe uNb , R , T t , , t Tue Ar saln S , , 5istsi l , 5oe R ulO sae oR R" ref = str( "Their tasks include changing a pump on the faulty stokehold ." "Likewise , two species that are very similar in morphology " "were distinguished using genetics ." ) references = [[ref.split()]] hypotheses = [hyp.split()] try: # Check that the warning is raised since no. of 2-grams < 0. with self.assertWarns(UserWarning): # Verify that the BLEU output is undesired since no. of 2-grams < 0. self.assertAlmostEqual( corpus_bleu(references, hypotheses), 0.0, places=4 ) except AttributeError: # unittest.TestCase.assertWarns is only supported in Python >= 3.2. self.assertAlmostEqual(corpus_bleu(references, hypotheses), 0.0, places=4) class TestBLEUWithMultipleWeights(unittest.TestCase): def test_corpus_bleu_with_multiple_weights(self): hyp1 = [ "It", "is", "a", "guide", "to", "action", "which", "ensures", "that", "the", "military", "always", "obeys", "the", "commands", "of", "the", "party", ] ref1a = [ "It", "is", "a", "guide", "to", "action", "that", "ensures", "that", "the", "military", "will", "forever", "heed", "Party", "commands", ] ref1b = [ "It", "is", "the", "guiding", "principle", "which", "guarantees", "the", "military", "forces", "always", "being", "under", "the", "command", "of", "the", "Party", ] ref1c = [ "It", "is", "the", "practical", "guide", "for", "the", "army", "always", "to", "heed", "the", "directions", "of", "the", "party", ] hyp2 = [ "he", "read", "the", "book", "because", "he", "was", "interested", "in", "world", "history", ] ref2a = [ "he", "was", "interested", "in", "world", "history", "because", "he", "read", "the", "book", ] weight_1 = (1, 0, 0, 0) weight_2 = (0.25, 0.25, 0.25, 0.25) weight_3 = (0, 0, 0, 0, 1) bleu_scores = corpus_bleu( list_of_references=[[ref1a, ref1b, ref1c], [ref2a]], hypotheses=[hyp1, hyp2], weights=[weight_1, weight_2, weight_3], ) assert bleu_scores[0] == corpus_bleu( [[ref1a, ref1b, ref1c], [ref2a]], [hyp1, hyp2], weight_1 ) assert bleu_scores[1] == corpus_bleu( [[ref1a, ref1b, ref1c], [ref2a]], [hyp1, hyp2], weight_2 ) assert bleu_scores[2] == corpus_bleu( [[ref1a, ref1b, ref1c], [ref2a]], [hyp1, hyp2], weight_3 ) nltk-3.7/nltk/test/unit/translate/test_gdfa.py000066400000000000000000000110101420073152400215370ustar00rootroot00000000000000""" Tests GDFA alignments """ import unittest from nltk.translate.gdfa import grow_diag_final_and class TestGDFA(unittest.TestCase): def test_from_eflomal_outputs(self): """ Testing GDFA with first 10 eflomal outputs from issue #1829 https://github.com/nltk/nltk/issues/1829 """ # Input. 
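        # Each alignment string is a whitespace-separated list of eflomal-style
        # "src-tgt" index pairs.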
forwards = [ "0-0 1-2", "0-0 1-1", "0-0 2-1 3-2 4-3 5-4 6-5 7-6 8-7 7-8 9-9 10-10 9-11 11-12 12-13 13-14", "0-0 1-1 1-2 2-3 3-4 4-5 4-6 5-7 6-8 8-9 9-10", "0-0 14-1 15-2 16-3 20-5 21-6 22-7 5-8 6-9 7-10 8-11 9-12 10-13 11-14 12-15 13-16 14-17 17-18 18-19 19-20 20-21 23-22 24-23 25-24 26-25 27-27 28-28 29-29 30-30 31-31", "0-0 1-1 0-2 2-3", "0-0 2-2 4-4", "0-0 1-1 2-3 3-4 5-5 7-6 8-7 9-8 10-9 11-10 12-11 13-12 14-13 15-14 16-16 17-17 18-18 19-19 20-20", "3-0 4-1 6-2 5-3 6-4 7-5 8-6 9-7 10-8 11-9 16-10 9-12 10-13 12-14", "1-0", ] backwards = [ "0-0 1-2", "0-0 1-1", "0-0 2-1 3-2 4-3 5-4 6-5 7-6 8-7 9-8 10-10 11-12 12-11 13-13", "0-0 1-2 2-3 3-4 4-6 6-8 7-5 8-7 9-8", "0-0 1-8 2-9 3-10 4-11 5-12 6-11 8-13 9-14 10-15 11-16 12-17 13-18 14-19 15-20 16-21 17-22 18-23 19-24 20-29 21-30 22-31 23-2 24-3 25-4 26-5 27-5 28-6 29-7 30-28 31-31", "0-0 1-1 2-3", "0-0 1-1 2-3 4-4", "0-0 1-1 2-3 3-4 5-5 7-6 8-7 9-8 10-9 11-10 12-11 13-12 14-13 15-14 16-16 17-17 18-18 19-19 20-16 21-18", "0-0 1-1 3-2 4-1 5-3 6-4 7-5 8-6 9-7 10-8 11-9 12-8 13-9 14-8 15-9 16-10", "1-0", ] source_lens = [2, 3, 3, 15, 11, 33, 4, 6, 23, 18] target_lens = [2, 4, 3, 16, 12, 33, 5, 6, 22, 16] # Expected Output. expected = [ [(0, 0), (1, 2)], [(0, 0), (1, 1)], [ (0, 0), (2, 1), (3, 2), (4, 3), (5, 4), (6, 5), (7, 6), (8, 7), (10, 10), (11, 12), ], [ (0, 0), (1, 1), (1, 2), (2, 3), (3, 4), (4, 5), (4, 6), (5, 7), (6, 8), (7, 5), (8, 7), (8, 9), (9, 8), (9, 10), ], [ (0, 0), (1, 8), (2, 9), (3, 10), (4, 11), (5, 8), (6, 9), (6, 11), (7, 10), (8, 11), (31, 31), ], [(0, 0), (0, 2), (1, 1), (2, 3)], [(0, 0), (1, 1), (2, 2), (2, 3), (4, 4)], [ (0, 0), (1, 1), (2, 3), (3, 4), (5, 5), (7, 6), (8, 7), (9, 8), (10, 9), (11, 10), (12, 11), (13, 12), (14, 13), (15, 14), (16, 16), (17, 17), (18, 18), (19, 19), ], [ (0, 0), (1, 1), (3, 0), (3, 2), (4, 1), (5, 3), (6, 2), (6, 4), (7, 5), (8, 6), (9, 7), (9, 12), (10, 8), (10, 13), (11, 9), (12, 8), (12, 14), (13, 9), (14, 8), (15, 9), (16, 10), ], [(1, 0)], [ (0, 0), (1, 1), (3, 2), (4, 3), (5, 4), (6, 5), (7, 6), (9, 10), (10, 12), (11, 13), (12, 14), (13, 15), ], ] # Iterate through all 10 examples and check for expected outputs. 
for fw, bw, src_len, trg_len, expect in zip( forwards, backwards, source_lens, target_lens, expected ): self.assertListEqual(expect, grow_diag_final_and(src_len, trg_len, fw, bw)) nltk-3.7/nltk/test/unit/translate/test_ibm1.py000066400000000000000000000050441420073152400215000ustar00rootroot00000000000000""" Tests for IBM Model 1 training methods """ import unittest from collections import defaultdict from nltk.translate import AlignedSent, IBMModel, IBMModel1 from nltk.translate.ibm_model import AlignmentInfo class TestIBMModel1(unittest.TestCase): def test_set_uniform_translation_probabilities(self): # arrange corpus = [ AlignedSent(["ham", "eggs"], ["schinken", "schinken", "eier"]), AlignedSent(["spam", "spam", "spam", "spam"], ["spam", "spam"]), ] model1 = IBMModel1(corpus, 0) # act model1.set_uniform_probabilities(corpus) # assert # expected_prob = 1.0 / (target vocab size + 1) self.assertEqual(model1.translation_table["ham"]["eier"], 1.0 / 3) self.assertEqual(model1.translation_table["eggs"][None], 1.0 / 3) def test_set_uniform_translation_probabilities_of_non_domain_values(self): # arrange corpus = [ AlignedSent(["ham", "eggs"], ["schinken", "schinken", "eier"]), AlignedSent(["spam", "spam", "spam", "spam"], ["spam", "spam"]), ] model1 = IBMModel1(corpus, 0) # act model1.set_uniform_probabilities(corpus) # assert # examine target words that are not in the training data domain self.assertEqual(model1.translation_table["parrot"]["eier"], IBMModel.MIN_PROB) def test_prob_t_a_given_s(self): # arrange src_sentence = ["ich", "esse", "ja", "gern", "räucherschinken"] trg_sentence = ["i", "love", "to", "eat", "smoked", "ham"] corpus = [AlignedSent(trg_sentence, src_sentence)] alignment_info = AlignmentInfo( (0, 1, 4, 0, 2, 5, 5), [None] + src_sentence, ["UNUSED"] + trg_sentence, None, ) translation_table = defaultdict(lambda: defaultdict(float)) translation_table["i"]["ich"] = 0.98 translation_table["love"]["gern"] = 0.98 translation_table["to"][None] = 0.98 translation_table["eat"]["esse"] = 0.98 translation_table["smoked"]["räucherschinken"] = 0.98 translation_table["ham"]["räucherschinken"] = 0.98 model1 = IBMModel1(corpus, 0) model1.translation_table = translation_table # act probability = model1.prob_t_a_given_s(alignment_info) # assert lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98 expected_probability = lexical_translation self.assertEqual(round(probability, 4), round(expected_probability, 4)) nltk-3.7/nltk/test/unit/translate/test_ibm2.py000066400000000000000000000063331420073152400215030ustar00rootroot00000000000000""" Tests for IBM Model 2 training methods """ import unittest from collections import defaultdict from nltk.translate import AlignedSent, IBMModel, IBMModel2 from nltk.translate.ibm_model import AlignmentInfo class TestIBMModel2(unittest.TestCase): def test_set_uniform_alignment_probabilities(self): # arrange corpus = [ AlignedSent(["ham", "eggs"], ["schinken", "schinken", "eier"]), AlignedSent(["spam", "spam", "spam", "spam"], ["spam", "spam"]), ] model2 = IBMModel2(corpus, 0) # act model2.set_uniform_probabilities(corpus) # assert # expected_prob = 1.0 / (length of source sentence + 1) self.assertEqual(model2.alignment_table[0][1][3][2], 1.0 / 4) self.assertEqual(model2.alignment_table[2][4][2][4], 1.0 / 3) def test_set_uniform_alignment_probabilities_of_non_domain_values(self): # arrange corpus = [ AlignedSent(["ham", "eggs"], ["schinken", "schinken", "eier"]), AlignedSent(["spam", "spam", "spam", "spam"], ["spam", "spam"]), ] model2 = IBMModel2(corpus, 
0) # act model2.set_uniform_probabilities(corpus) # assert # examine i and j values that are not in the training data domain self.assertEqual(model2.alignment_table[99][1][3][2], IBMModel.MIN_PROB) self.assertEqual(model2.alignment_table[2][99][2][4], IBMModel.MIN_PROB) def test_prob_t_a_given_s(self): # arrange src_sentence = ["ich", "esse", "ja", "gern", "räucherschinken"] trg_sentence = ["i", "love", "to", "eat", "smoked", "ham"] corpus = [AlignedSent(trg_sentence, src_sentence)] alignment_info = AlignmentInfo( (0, 1, 4, 0, 2, 5, 5), [None] + src_sentence, ["UNUSED"] + trg_sentence, None, ) translation_table = defaultdict(lambda: defaultdict(float)) translation_table["i"]["ich"] = 0.98 translation_table["love"]["gern"] = 0.98 translation_table["to"][None] = 0.98 translation_table["eat"]["esse"] = 0.98 translation_table["smoked"]["räucherschinken"] = 0.98 translation_table["ham"]["räucherschinken"] = 0.98 alignment_table = defaultdict( lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(float))) ) alignment_table[0][3][5][6] = 0.97 # None -> to alignment_table[1][1][5][6] = 0.97 # ich -> i alignment_table[2][4][5][6] = 0.97 # esse -> eat alignment_table[4][2][5][6] = 0.97 # gern -> love alignment_table[5][5][5][6] = 0.96 # räucherschinken -> smoked alignment_table[5][6][5][6] = 0.96 # räucherschinken -> ham model2 = IBMModel2(corpus, 0) model2.translation_table = translation_table model2.alignment_table = alignment_table # act probability = model2.prob_t_a_given_s(alignment_info) # assert lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98 alignment = 0.97 * 0.97 * 0.97 * 0.97 * 0.96 * 0.96 expected_probability = lexical_translation * alignment self.assertEqual(round(probability, 4), round(expected_probability, 4)) nltk-3.7/nltk/test/unit/translate/test_ibm3.py000066400000000000000000000077641420073152400215150ustar00rootroot00000000000000""" Tests for IBM Model 3 training methods """ import unittest from collections import defaultdict from nltk.translate import AlignedSent, IBMModel, IBMModel3 from nltk.translate.ibm_model import AlignmentInfo class TestIBMModel3(unittest.TestCase): def test_set_uniform_distortion_probabilities(self): # arrange corpus = [ AlignedSent(["ham", "eggs"], ["schinken", "schinken", "eier"]), AlignedSent(["spam", "spam", "spam", "spam"], ["spam", "spam"]), ] model3 = IBMModel3(corpus, 0) # act model3.set_uniform_probabilities(corpus) # assert # expected_prob = 1.0 / length of target sentence self.assertEqual(model3.distortion_table[1][0][3][2], 1.0 / 2) self.assertEqual(model3.distortion_table[4][2][2][4], 1.0 / 4) def test_set_uniform_distortion_probabilities_of_non_domain_values(self): # arrange corpus = [ AlignedSent(["ham", "eggs"], ["schinken", "schinken", "eier"]), AlignedSent(["spam", "spam", "spam", "spam"], ["spam", "spam"]), ] model3 = IBMModel3(corpus, 0) # act model3.set_uniform_probabilities(corpus) # assert # examine i and j values that are not in the training data domain self.assertEqual(model3.distortion_table[0][0][3][2], IBMModel.MIN_PROB) self.assertEqual(model3.distortion_table[9][2][2][4], IBMModel.MIN_PROB) self.assertEqual(model3.distortion_table[2][9][2][4], IBMModel.MIN_PROB) def test_prob_t_a_given_s(self): # arrange src_sentence = ["ich", "esse", "ja", "gern", "räucherschinken"] trg_sentence = ["i", "love", "to", "eat", "smoked", "ham"] corpus = [AlignedSent(trg_sentence, src_sentence)] alignment_info = AlignmentInfo( (0, 1, 4, 0, 2, 5, 5), [None] + src_sentence, ["UNUSED"] + trg_sentence, [[3], [1], [4], [], [2], 
[5, 6]], ) distortion_table = defaultdict( lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(float))) ) distortion_table[1][1][5][6] = 0.97 # i -> ich distortion_table[2][4][5][6] = 0.97 # love -> gern distortion_table[3][0][5][6] = 0.97 # to -> NULL distortion_table[4][2][5][6] = 0.97 # eat -> esse distortion_table[5][5][5][6] = 0.97 # smoked -> räucherschinken distortion_table[6][5][5][6] = 0.97 # ham -> räucherschinken translation_table = defaultdict(lambda: defaultdict(float)) translation_table["i"]["ich"] = 0.98 translation_table["love"]["gern"] = 0.98 translation_table["to"][None] = 0.98 translation_table["eat"]["esse"] = 0.98 translation_table["smoked"]["räucherschinken"] = 0.98 translation_table["ham"]["räucherschinken"] = 0.98 fertility_table = defaultdict(lambda: defaultdict(float)) fertility_table[1]["ich"] = 0.99 fertility_table[1]["esse"] = 0.99 fertility_table[0]["ja"] = 0.99 fertility_table[1]["gern"] = 0.99 fertility_table[2]["räucherschinken"] = 0.999 fertility_table[1][None] = 0.99 probabilities = { "p1": 0.167, "translation_table": translation_table, "distortion_table": distortion_table, "fertility_table": fertility_table, "alignment_table": None, } model3 = IBMModel3(corpus, 0, probabilities) # act probability = model3.prob_t_a_given_s(alignment_info) # assert null_generation = 5 * pow(0.167, 1) * pow(0.833, 4) fertility = 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 2 * 0.999 lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98 distortion = 0.97 * 0.97 * 0.97 * 0.97 * 0.97 * 0.97 expected_probability = ( null_generation * fertility * lexical_translation * distortion ) self.assertEqual(round(probability, 4), round(expected_probability, 4)) nltk-3.7/nltk/test/unit/translate/test_ibm4.py000066400000000000000000000117411420073152400215040ustar00rootroot00000000000000""" Tests for IBM Model 4 training methods """ import unittest from collections import defaultdict from nltk.translate import AlignedSent, IBMModel, IBMModel4 from nltk.translate.ibm_model import AlignmentInfo class TestIBMModel4(unittest.TestCase): def test_set_uniform_distortion_probabilities_of_max_displacements(self): # arrange src_classes = {"schinken": 0, "eier": 0, "spam": 1} trg_classes = {"ham": 0, "eggs": 1, "spam": 2} corpus = [ AlignedSent(["ham", "eggs"], ["schinken", "schinken", "eier"]), AlignedSent(["spam", "spam", "spam", "spam"], ["spam", "spam"]), ] model4 = IBMModel4(corpus, 0, src_classes, trg_classes) # act model4.set_uniform_probabilities(corpus) # assert # number of displacement values = # 2 *(number of words in longest target sentence - 1) expected_prob = 1.0 / (2 * (4 - 1)) # examine the boundary values for (displacement, src_class, trg_class) self.assertEqual(model4.head_distortion_table[3][0][0], expected_prob) self.assertEqual(model4.head_distortion_table[-3][1][2], expected_prob) self.assertEqual(model4.non_head_distortion_table[3][0], expected_prob) self.assertEqual(model4.non_head_distortion_table[-3][2], expected_prob) def test_set_uniform_distortion_probabilities_of_non_domain_values(self): # arrange src_classes = {"schinken": 0, "eier": 0, "spam": 1} trg_classes = {"ham": 0, "eggs": 1, "spam": 2} corpus = [ AlignedSent(["ham", "eggs"], ["schinken", "schinken", "eier"]), AlignedSent(["spam", "spam", "spam", "spam"], ["spam", "spam"]), ] model4 = IBMModel4(corpus, 0, src_classes, trg_classes) # act model4.set_uniform_probabilities(corpus) # assert # examine displacement values that are not in the training data domain 
self.assertEqual(model4.head_distortion_table[4][0][0], IBMModel.MIN_PROB) self.assertEqual(model4.head_distortion_table[100][1][2], IBMModel.MIN_PROB) self.assertEqual(model4.non_head_distortion_table[4][0], IBMModel.MIN_PROB) self.assertEqual(model4.non_head_distortion_table[100][2], IBMModel.MIN_PROB) def test_prob_t_a_given_s(self): # arrange src_sentence = ["ich", "esse", "ja", "gern", "räucherschinken"] trg_sentence = ["i", "love", "to", "eat", "smoked", "ham"] src_classes = {"räucherschinken": 0, "ja": 1, "ich": 2, "esse": 3, "gern": 4} trg_classes = {"ham": 0, "smoked": 1, "i": 3, "love": 4, "to": 2, "eat": 4} corpus = [AlignedSent(trg_sentence, src_sentence)] alignment_info = AlignmentInfo( (0, 1, 4, 0, 2, 5, 5), [None] + src_sentence, ["UNUSED"] + trg_sentence, [[3], [1], [4], [], [2], [5, 6]], ) head_distortion_table = defaultdict( lambda: defaultdict(lambda: defaultdict(float)) ) head_distortion_table[1][None][3] = 0.97 # None, i head_distortion_table[3][2][4] = 0.97 # ich, eat head_distortion_table[-2][3][4] = 0.97 # esse, love head_distortion_table[3][4][1] = 0.97 # gern, smoked non_head_distortion_table = defaultdict(lambda: defaultdict(float)) non_head_distortion_table[1][0] = 0.96 # ham translation_table = defaultdict(lambda: defaultdict(float)) translation_table["i"]["ich"] = 0.98 translation_table["love"]["gern"] = 0.98 translation_table["to"][None] = 0.98 translation_table["eat"]["esse"] = 0.98 translation_table["smoked"]["räucherschinken"] = 0.98 translation_table["ham"]["räucherschinken"] = 0.98 fertility_table = defaultdict(lambda: defaultdict(float)) fertility_table[1]["ich"] = 0.99 fertility_table[1]["esse"] = 0.99 fertility_table[0]["ja"] = 0.99 fertility_table[1]["gern"] = 0.99 fertility_table[2]["räucherschinken"] = 0.999 fertility_table[1][None] = 0.99 probabilities = { "p1": 0.167, "translation_table": translation_table, "head_distortion_table": head_distortion_table, "non_head_distortion_table": non_head_distortion_table, "fertility_table": fertility_table, "alignment_table": None, } model4 = IBMModel4(corpus, 0, src_classes, trg_classes, probabilities) # act probability = model4.prob_t_a_given_s(alignment_info) # assert null_generation = 5 * pow(0.167, 1) * pow(0.833, 4) fertility = 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 2 * 0.999 lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98 distortion = 0.97 * 0.97 * 1 * 0.97 * 0.97 * 0.96 expected_probability = ( null_generation * fertility * lexical_translation * distortion ) self.assertEqual(round(probability, 4), round(expected_probability, 4)) nltk-3.7/nltk/test/unit/translate/test_ibm5.py000066400000000000000000000147111420073152400215050ustar00rootroot00000000000000""" Tests for IBM Model 5 training methods """ import unittest from collections import defaultdict from nltk.translate import AlignedSent, IBMModel, IBMModel4, IBMModel5 from nltk.translate.ibm_model import AlignmentInfo class TestIBMModel5(unittest.TestCase): def test_set_uniform_vacancy_probabilities_of_max_displacements(self): # arrange src_classes = {"schinken": 0, "eier": 0, "spam": 1} trg_classes = {"ham": 0, "eggs": 1, "spam": 2} corpus = [ AlignedSent(["ham", "eggs"], ["schinken", "schinken", "eier"]), AlignedSent(["spam", "spam", "spam", "spam"], ["spam", "spam"]), ] model5 = IBMModel5(corpus, 0, src_classes, trg_classes) # act model5.set_uniform_probabilities(corpus) # assert # number of vacancy difference values = # 2 * number of words in longest target sentence expected_prob = 1.0 / (2 * 4) # examine the boundary values 
for (dv, max_v, trg_class) self.assertEqual(model5.head_vacancy_table[4][4][0], expected_prob) self.assertEqual(model5.head_vacancy_table[-3][1][2], expected_prob) self.assertEqual(model5.non_head_vacancy_table[4][4][0], expected_prob) self.assertEqual(model5.non_head_vacancy_table[-3][1][2], expected_prob) def test_set_uniform_vacancy_probabilities_of_non_domain_values(self): # arrange src_classes = {"schinken": 0, "eier": 0, "spam": 1} trg_classes = {"ham": 0, "eggs": 1, "spam": 2} corpus = [ AlignedSent(["ham", "eggs"], ["schinken", "schinken", "eier"]), AlignedSent(["spam", "spam", "spam", "spam"], ["spam", "spam"]), ] model5 = IBMModel5(corpus, 0, src_classes, trg_classes) # act model5.set_uniform_probabilities(corpus) # assert # examine dv and max_v values that are not in the training data domain self.assertEqual(model5.head_vacancy_table[5][4][0], IBMModel.MIN_PROB) self.assertEqual(model5.head_vacancy_table[-4][1][2], IBMModel.MIN_PROB) self.assertEqual(model5.head_vacancy_table[4][0][0], IBMModel.MIN_PROB) self.assertEqual(model5.non_head_vacancy_table[5][4][0], IBMModel.MIN_PROB) self.assertEqual(model5.non_head_vacancy_table[-4][1][2], IBMModel.MIN_PROB) def test_prob_t_a_given_s(self): # arrange src_sentence = ["ich", "esse", "ja", "gern", "räucherschinken"] trg_sentence = ["i", "love", "to", "eat", "smoked", "ham"] src_classes = {"räucherschinken": 0, "ja": 1, "ich": 2, "esse": 3, "gern": 4} trg_classes = {"ham": 0, "smoked": 1, "i": 3, "love": 4, "to": 2, "eat": 4} corpus = [AlignedSent(trg_sentence, src_sentence)] alignment_info = AlignmentInfo( (0, 1, 4, 0, 2, 5, 5), [None] + src_sentence, ["UNUSED"] + trg_sentence, [[3], [1], [4], [], [2], [5, 6]], ) head_vacancy_table = defaultdict( lambda: defaultdict(lambda: defaultdict(float)) ) head_vacancy_table[1 - 0][6][3] = 0.97 # ich -> i head_vacancy_table[3 - 0][5][4] = 0.97 # esse -> eat head_vacancy_table[1 - 2][4][4] = 0.97 # gern -> love head_vacancy_table[2 - 0][2][1] = 0.97 # räucherschinken -> smoked non_head_vacancy_table = defaultdict( lambda: defaultdict(lambda: defaultdict(float)) ) non_head_vacancy_table[1 - 0][1][0] = 0.96 # räucherschinken -> ham translation_table = defaultdict(lambda: defaultdict(float)) translation_table["i"]["ich"] = 0.98 translation_table["love"]["gern"] = 0.98 translation_table["to"][None] = 0.98 translation_table["eat"]["esse"] = 0.98 translation_table["smoked"]["räucherschinken"] = 0.98 translation_table["ham"]["räucherschinken"] = 0.98 fertility_table = defaultdict(lambda: defaultdict(float)) fertility_table[1]["ich"] = 0.99 fertility_table[1]["esse"] = 0.99 fertility_table[0]["ja"] = 0.99 fertility_table[1]["gern"] = 0.99 fertility_table[2]["räucherschinken"] = 0.999 fertility_table[1][None] = 0.99 probabilities = { "p1": 0.167, "translation_table": translation_table, "fertility_table": fertility_table, "head_vacancy_table": head_vacancy_table, "non_head_vacancy_table": non_head_vacancy_table, "head_distortion_table": None, "non_head_distortion_table": None, "alignment_table": None, } model5 = IBMModel5(corpus, 0, src_classes, trg_classes, probabilities) # act probability = model5.prob_t_a_given_s(alignment_info) # assert null_generation = 5 * pow(0.167, 1) * pow(0.833, 4) fertility = 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 2 * 0.999 lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98 vacancy = 0.97 * 0.97 * 1 * 0.97 * 0.97 * 0.96 expected_probability = ( null_generation * fertility * lexical_translation * vacancy ) self.assertEqual(round(probability, 4), 
round(expected_probability, 4)) def test_prune(self): # arrange alignment_infos = [ AlignmentInfo((1, 1), None, None, None), AlignmentInfo((1, 2), None, None, None), AlignmentInfo((2, 1), None, None, None), AlignmentInfo((2, 2), None, None, None), AlignmentInfo((0, 0), None, None, None), ] min_factor = IBMModel5.MIN_SCORE_FACTOR best_score = 0.9 scores = { (1, 1): min(min_factor * 1.5, 1) * best_score, # above threshold (1, 2): best_score, (2, 1): min_factor * best_score, # at threshold (2, 2): min_factor * best_score * 0.5, # low score (0, 0): min(min_factor * 1.1, 1) * 1.2, # above threshold } corpus = [AlignedSent(["a"], ["b"])] original_prob_function = IBMModel4.model4_prob_t_a_given_s # mock static method IBMModel4.model4_prob_t_a_given_s = staticmethod( lambda a, model: scores[a.alignment] ) model5 = IBMModel5(corpus, 0, None, None) # act pruned_alignments = model5.prune(alignment_infos) # assert self.assertEqual(len(pruned_alignments), 3) # restore static method IBMModel4.model4_prob_t_a_given_s = original_prob_function nltk-3.7/nltk/test/unit/translate/test_ibm_model.py000066400000000000000000000222771420073152400226060ustar00rootroot00000000000000""" Tests for common methods of IBM translation models """ import unittest from collections import defaultdict from nltk.translate import AlignedSent, IBMModel from nltk.translate.ibm_model import AlignmentInfo class TestIBMModel(unittest.TestCase): __TEST_SRC_SENTENCE = ["j'", "aime", "bien", "jambon"] __TEST_TRG_SENTENCE = ["i", "love", "ham"] def test_vocabularies_are_initialized(self): parallel_corpora = [ AlignedSent(["one", "two", "three", "four"], ["un", "deux", "trois"]), AlignedSent(["five", "one", "six"], ["quatre", "cinq", "six"]), AlignedSent([], ["sept"]), ] ibm_model = IBMModel(parallel_corpora) self.assertEqual(len(ibm_model.src_vocab), 8) self.assertEqual(len(ibm_model.trg_vocab), 6) def test_vocabularies_are_initialized_even_with_empty_corpora(self): parallel_corpora = [] ibm_model = IBMModel(parallel_corpora) self.assertEqual(len(ibm_model.src_vocab), 1) # addition of NULL token self.assertEqual(len(ibm_model.trg_vocab), 0) def test_best_model2_alignment(self): # arrange sentence_pair = AlignedSent( TestIBMModel.__TEST_TRG_SENTENCE, TestIBMModel.__TEST_SRC_SENTENCE ) # None and 'bien' have zero fertility translation_table = { "i": {"j'": 0.9, "aime": 0.05, "bien": 0.02, "jambon": 0.03, None: 0}, "love": {"j'": 0.05, "aime": 0.9, "bien": 0.01, "jambon": 0.01, None: 0.03}, "ham": {"j'": 0, "aime": 0.01, "bien": 0, "jambon": 0.99, None: 0}, } alignment_table = defaultdict( lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.2))) ) ibm_model = IBMModel([]) ibm_model.translation_table = translation_table ibm_model.alignment_table = alignment_table # act a_info = ibm_model.best_model2_alignment(sentence_pair) # assert self.assertEqual(a_info.alignment[1:], (1, 2, 4)) # 0th element unused self.assertEqual(a_info.cepts, [[], [1], [2], [], [3]]) def test_best_model2_alignment_does_not_change_pegged_alignment(self): # arrange sentence_pair = AlignedSent( TestIBMModel.__TEST_TRG_SENTENCE, TestIBMModel.__TEST_SRC_SENTENCE ) translation_table = { "i": {"j'": 0.9, "aime": 0.05, "bien": 0.02, "jambon": 0.03, None: 0}, "love": {"j'": 0.05, "aime": 0.9, "bien": 0.01, "jambon": 0.01, None: 0.03}, "ham": {"j'": 0, "aime": 0.01, "bien": 0, "jambon": 0.99, None: 0}, } alignment_table = defaultdict( lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.2))) ) ibm_model = IBMModel([]) 
ibm_model.translation_table = translation_table ibm_model.alignment_table = alignment_table # act: force 'love' to be pegged to 'jambon' a_info = ibm_model.best_model2_alignment(sentence_pair, 2, 4) # assert self.assertEqual(a_info.alignment[1:], (1, 4, 4)) self.assertEqual(a_info.cepts, [[], [1], [], [], [2, 3]]) def test_best_model2_alignment_handles_fertile_words(self): # arrange sentence_pair = AlignedSent( ["i", "really", ",", "really", "love", "ham"], TestIBMModel.__TEST_SRC_SENTENCE, ) # 'bien' produces 2 target words: 'really' and another 'really' translation_table = { "i": {"j'": 0.9, "aime": 0.05, "bien": 0.02, "jambon": 0.03, None: 0}, "really": {"j'": 0, "aime": 0, "bien": 0.9, "jambon": 0.01, None: 0.09}, ",": {"j'": 0, "aime": 0, "bien": 0.3, "jambon": 0, None: 0.7}, "love": {"j'": 0.05, "aime": 0.9, "bien": 0.01, "jambon": 0.01, None: 0.03}, "ham": {"j'": 0, "aime": 0.01, "bien": 0, "jambon": 0.99, None: 0}, } alignment_table = defaultdict( lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.2))) ) ibm_model = IBMModel([]) ibm_model.translation_table = translation_table ibm_model.alignment_table = alignment_table # act a_info = ibm_model.best_model2_alignment(sentence_pair) # assert self.assertEqual(a_info.alignment[1:], (1, 3, 0, 3, 2, 4)) self.assertEqual(a_info.cepts, [[3], [1], [5], [2, 4], [6]]) def test_best_model2_alignment_handles_empty_src_sentence(self): # arrange sentence_pair = AlignedSent(TestIBMModel.__TEST_TRG_SENTENCE, []) ibm_model = IBMModel([]) # act a_info = ibm_model.best_model2_alignment(sentence_pair) # assert self.assertEqual(a_info.alignment[1:], (0, 0, 0)) self.assertEqual(a_info.cepts, [[1, 2, 3]]) def test_best_model2_alignment_handles_empty_trg_sentence(self): # arrange sentence_pair = AlignedSent([], TestIBMModel.__TEST_SRC_SENTENCE) ibm_model = IBMModel([]) # act a_info = ibm_model.best_model2_alignment(sentence_pair) # assert self.assertEqual(a_info.alignment[1:], ()) self.assertEqual(a_info.cepts, [[], [], [], [], []]) def test_neighboring_finds_neighbor_alignments(self): # arrange a_info = AlignmentInfo( (0, 3, 2), (None, "des", "œufs", "verts"), ("UNUSED", "green", "eggs"), [[], [], [2], [1]], ) ibm_model = IBMModel([]) # act neighbors = ibm_model.neighboring(a_info) # assert neighbor_alignments = set() for neighbor in neighbors: neighbor_alignments.add(neighbor.alignment) expected_alignments = { # moves (0, 0, 2), (0, 1, 2), (0, 2, 2), (0, 3, 0), (0, 3, 1), (0, 3, 3), # swaps (0, 2, 3), # original alignment (0, 3, 2), } self.assertEqual(neighbor_alignments, expected_alignments) def test_neighboring_sets_neighbor_alignment_info(self): # arrange a_info = AlignmentInfo( (0, 3, 2), (None, "des", "œufs", "verts"), ("UNUSED", "green", "eggs"), [[], [], [2], [1]], ) ibm_model = IBMModel([]) # act neighbors = ibm_model.neighboring(a_info) # assert: select a few particular alignments for neighbor in neighbors: if neighbor.alignment == (0, 2, 2): moved_alignment = neighbor elif neighbor.alignment == (0, 3, 2): swapped_alignment = neighbor self.assertEqual(moved_alignment.cepts, [[], [], [1, 2], []]) self.assertEqual(swapped_alignment.cepts, [[], [], [2], [1]]) def test_neighboring_returns_neighbors_with_pegged_alignment(self): # arrange a_info = AlignmentInfo( (0, 3, 2), (None, "des", "œufs", "verts"), ("UNUSED", "green", "eggs"), [[], [], [2], [1]], ) ibm_model = IBMModel([]) # act: peg 'eggs' to align with 'œufs' neighbors = ibm_model.neighboring(a_info, 2) # assert neighbor_alignments = set() for neighbor in neighbors: 
neighbor_alignments.add(neighbor.alignment) expected_alignments = { # moves (0, 0, 2), (0, 1, 2), (0, 2, 2), # no swaps # original alignment (0, 3, 2), } self.assertEqual(neighbor_alignments, expected_alignments) def test_hillclimb(self): # arrange initial_alignment = AlignmentInfo((0, 3, 2), None, None, None) def neighboring_mock(a, j): if a.alignment == (0, 3, 2): return { AlignmentInfo((0, 2, 2), None, None, None), AlignmentInfo((0, 1, 1), None, None, None), } elif a.alignment == (0, 2, 2): return { AlignmentInfo((0, 3, 3), None, None, None), AlignmentInfo((0, 4, 4), None, None, None), } return set() def prob_t_a_given_s_mock(a): prob_values = { (0, 3, 2): 0.5, (0, 2, 2): 0.6, (0, 1, 1): 0.4, (0, 3, 3): 0.6, (0, 4, 4): 0.7, } return prob_values.get(a.alignment, 0.01) ibm_model = IBMModel([]) ibm_model.neighboring = neighboring_mock ibm_model.prob_t_a_given_s = prob_t_a_given_s_mock # act best_alignment = ibm_model.hillclimb(initial_alignment) # assert: hill climbing goes from (0, 3, 2) -> (0, 2, 2) -> (0, 4, 4) self.assertEqual(best_alignment.alignment, (0, 4, 4)) def test_sample(self): # arrange sentence_pair = AlignedSent( TestIBMModel.__TEST_TRG_SENTENCE, TestIBMModel.__TEST_SRC_SENTENCE ) ibm_model = IBMModel([]) ibm_model.prob_t_a_given_s = lambda x: 0.001 # act samples, best_alignment = ibm_model.sample(sentence_pair) # assert self.assertEqual(len(samples), 61) nltk-3.7/nltk/test/unit/translate/test_meteor.py000066400000000000000000000013321420073152400221370ustar00rootroot00000000000000import unittest from nltk.translate.meteor_score import meteor_score class TestMETEOR(unittest.TestCase): reference = [["this", "is", "a", "test"], ["this", "is" "test"]] candidate = ["THIS", "Is", "a", "tEST"] def test_meteor(self): score = meteor_score(self.reference, self.candidate, preprocess=str.lower) assert score == 0.9921875 def test_reference_type_check(self): str_reference = [" ".join(ref) for ref in self.reference] self.assertRaises(TypeError, meteor_score, str_reference, self.candidate) def test_candidate_type_check(self): str_candidate = " ".join(self.candidate) self.assertRaises(TypeError, meteor_score, self.reference, str_candidate) nltk-3.7/nltk/test/unit/translate/test_nist.py000066400000000000000000000031111420073152400216160ustar00rootroot00000000000000""" Tests for NIST translation evaluation metric """ import io import unittest from nltk.data import find from nltk.translate.nist_score import corpus_nist class TestNIST(unittest.TestCase): def test_sentence_nist(self): ref_file = find("models/wmt15_eval/ref.ru") hyp_file = find("models/wmt15_eval/google.ru") mteval_output_file = find("models/wmt15_eval/mteval-13a.output") # Reads the NIST scores from the `mteval-13a.output` file. # The order of the list corresponds to the order of the ngrams. with open(mteval_output_file) as mteval_fin: # The numbers are located in the last 4th line of the file. # The first and 2nd item in the list are the score and system names. mteval_nist_scores = map(float, mteval_fin.readlines()[-4].split()[1:-1]) with open(ref_file, encoding="utf8") as ref_fin: with open(hyp_file, encoding="utf8") as hyp_fin: # Whitespace tokenize the file. # Note: split() automatically strip(). hypotheses = list(map(lambda x: x.split(), hyp_fin)) # Note that the corpus_bleu input is list of list of references. references = list(map(lambda x: [x.split()], ref_fin)) # Without smoothing. 
for i, mteval_nist in zip(range(1, 10), mteval_nist_scores): nltk_nist = corpus_nist(references, hypotheses, i) # Check that the NIST scores difference is less than 0.5 assert abs(mteval_nist - nltk_nist) < 0.05 nltk-3.7/nltk/test/unit/translate/test_stack_decoder.py000066400000000000000000000227521420073152400234470ustar00rootroot00000000000000# Natural Language Toolkit: Stack decoder # # Copyright (C) 2001-2022 NLTK Project # Author: Tah Wei Hoon # URL: # For license information, see LICENSE.TXT """ Tests for stack decoder """ import unittest from collections import defaultdict from math import log from nltk.translate import PhraseTable, StackDecoder from nltk.translate.stack_decoder import _Hypothesis, _Stack class TestStackDecoder(unittest.TestCase): def test_find_all_src_phrases(self): # arrange phrase_table = TestStackDecoder.create_fake_phrase_table() stack_decoder = StackDecoder(phrase_table, None) sentence = ("my", "hovercraft", "is", "full", "of", "eels") # act src_phrase_spans = stack_decoder.find_all_src_phrases(sentence) # assert self.assertEqual(src_phrase_spans[0], [2]) # 'my hovercraft' self.assertEqual(src_phrase_spans[1], [2]) # 'hovercraft' self.assertEqual(src_phrase_spans[2], [3]) # 'is' self.assertEqual(src_phrase_spans[3], [5, 6]) # 'full of', 'full of eels' self.assertFalse(src_phrase_spans[4]) # no entry starting with 'of' self.assertEqual(src_phrase_spans[5], [6]) # 'eels' def test_distortion_score(self): # arrange stack_decoder = StackDecoder(None, None) stack_decoder.distortion_factor = 0.5 hypothesis = _Hypothesis() hypothesis.src_phrase_span = (3, 5) # act score = stack_decoder.distortion_score(hypothesis, (8, 10)) # assert expected_score = log(stack_decoder.distortion_factor) * (8 - 5) self.assertEqual(score, expected_score) def test_distortion_score_of_first_expansion(self): # arrange stack_decoder = StackDecoder(None, None) stack_decoder.distortion_factor = 0.5 hypothesis = _Hypothesis() # act score = stack_decoder.distortion_score(hypothesis, (8, 10)) # assert # expansion from empty hypothesis always has zero distortion cost self.assertEqual(score, 0.0) def test_compute_future_costs(self): # arrange phrase_table = TestStackDecoder.create_fake_phrase_table() language_model = TestStackDecoder.create_fake_language_model() stack_decoder = StackDecoder(phrase_table, language_model) sentence = ("my", "hovercraft", "is", "full", "of", "eels") # act future_scores = stack_decoder.compute_future_scores(sentence) # assert self.assertEqual( future_scores[1][2], ( phrase_table.translations_for(("hovercraft",))[0].log_prob + language_model.probability(("hovercraft",)) ), ) self.assertEqual( future_scores[0][2], ( phrase_table.translations_for(("my", "hovercraft"))[0].log_prob + language_model.probability(("my", "hovercraft")) ), ) def test_compute_future_costs_for_phrases_not_in_phrase_table(self): # arrange phrase_table = TestStackDecoder.create_fake_phrase_table() language_model = TestStackDecoder.create_fake_language_model() stack_decoder = StackDecoder(phrase_table, language_model) sentence = ("my", "hovercraft", "is", "full", "of", "eels") # act future_scores = stack_decoder.compute_future_scores(sentence) # assert self.assertEqual( future_scores[1][3], # 'hovercraft is' is not in phrase table future_scores[1][2] + future_scores[2][3], ) # backoff def test_future_score(self): # arrange: sentence with 8 words; words 2, 3, 4 already translated hypothesis = _Hypothesis() hypothesis.untranslated_spans = lambda _: [(0, 2), (5, 8)] # mock future_score_table = 
defaultdict(lambda: defaultdict(float)) future_score_table[0][2] = 0.4 future_score_table[5][8] = 0.5 stack_decoder = StackDecoder(None, None) # act future_score = stack_decoder.future_score(hypothesis, future_score_table, 8) # assert self.assertEqual(future_score, 0.4 + 0.5) def test_valid_phrases(self): # arrange hypothesis = _Hypothesis() # mock untranslated_spans method hypothesis.untranslated_spans = lambda _: [(0, 2), (3, 6)] all_phrases_from = [[1, 4], [2], [], [5], [5, 6, 7], [], [7]] # act phrase_spans = StackDecoder.valid_phrases(all_phrases_from, hypothesis) # assert self.assertEqual(phrase_spans, [(0, 1), (1, 2), (3, 5), (4, 5), (4, 6)]) @staticmethod def create_fake_phrase_table(): phrase_table = PhraseTable() phrase_table.add(("hovercraft",), ("",), 0.8) phrase_table.add(("my", "hovercraft"), ("", ""), 0.7) phrase_table.add(("my", "cheese"), ("", ""), 0.7) phrase_table.add(("is",), ("",), 0.8) phrase_table.add(("is",), ("",), 0.5) phrase_table.add(("full", "of"), ("", ""), 0.01) phrase_table.add(("full", "of", "eels"), ("", "", ""), 0.5) phrase_table.add(("full", "of", "spam"), ("", ""), 0.5) phrase_table.add(("eels",), ("",), 0.5) phrase_table.add(("spam",), ("",), 0.5) return phrase_table @staticmethod def create_fake_language_model(): # nltk.model should be used here once it is implemented language_prob = defaultdict(lambda: -999.0) language_prob[("my",)] = log(0.1) language_prob[("hovercraft",)] = log(0.1) language_prob[("is",)] = log(0.1) language_prob[("full",)] = log(0.1) language_prob[("of",)] = log(0.1) language_prob[("eels",)] = log(0.1) language_prob[("my", "hovercraft")] = log(0.3) language_model = type( "", (object,), {"probability": lambda _, phrase: language_prob[phrase]} )() return language_model class TestHypothesis(unittest.TestCase): def setUp(self): root = _Hypothesis() child = _Hypothesis( raw_score=0.5, src_phrase_span=(3, 7), trg_phrase=("hello", "world"), previous=root, ) grandchild = _Hypothesis( raw_score=0.4, src_phrase_span=(1, 2), trg_phrase=("and", "goodbye"), previous=child, ) self.hypothesis_chain = grandchild def test_translation_so_far(self): # act translation = self.hypothesis_chain.translation_so_far() # assert self.assertEqual(translation, ["hello", "world", "and", "goodbye"]) def test_translation_so_far_for_empty_hypothesis(self): # arrange hypothesis = _Hypothesis() # act translation = hypothesis.translation_so_far() # assert self.assertEqual(translation, []) def test_total_translated_words(self): # act total_translated_words = self.hypothesis_chain.total_translated_words() # assert self.assertEqual(total_translated_words, 5) def test_translated_positions(self): # act translated_positions = self.hypothesis_chain.translated_positions() # assert translated_positions.sort() self.assertEqual(translated_positions, [1, 3, 4, 5, 6]) def test_untranslated_spans(self): # act untranslated_spans = self.hypothesis_chain.untranslated_spans(10) # assert self.assertEqual(untranslated_spans, [(0, 1), (2, 3), (7, 10)]) def test_untranslated_spans_for_empty_hypothesis(self): # arrange hypothesis = _Hypothesis() # act untranslated_spans = hypothesis.untranslated_spans(10) # assert self.assertEqual(untranslated_spans, [(0, 10)]) class TestStack(unittest.TestCase): def test_push_bumps_off_worst_hypothesis_when_stack_is_full(self): # arrange stack = _Stack(3) poor_hypothesis = _Hypothesis(0.01) # act stack.push(_Hypothesis(0.2)) stack.push(poor_hypothesis) stack.push(_Hypothesis(0.1)) stack.push(_Hypothesis(0.3)) # assert self.assertFalse(poor_hypothesis in 
stack) def test_push_removes_hypotheses_that_fall_below_beam_threshold(self): # arrange stack = _Stack(3, 0.5) poor_hypothesis = _Hypothesis(0.01) worse_hypothesis = _Hypothesis(0.009) # act stack.push(poor_hypothesis) stack.push(worse_hypothesis) stack.push(_Hypothesis(0.9)) # greatly superior hypothesis # assert self.assertFalse(poor_hypothesis in stack) self.assertFalse(worse_hypothesis in stack) def test_push_does_not_add_hypothesis_that_falls_below_beam_threshold(self): # arrange stack = _Stack(3, 0.5) poor_hypothesis = _Hypothesis(0.01) # act stack.push(_Hypothesis(0.9)) # greatly superior hypothesis stack.push(poor_hypothesis) # assert self.assertFalse(poor_hypothesis in stack) def test_best_returns_the_best_hypothesis(self): # arrange stack = _Stack(3) best_hypothesis = _Hypothesis(0.99) # act stack.push(_Hypothesis(0.0)) stack.push(best_hypothesis) stack.push(_Hypothesis(0.5)) # assert self.assertEqual(stack.best(), best_hypothesis) def test_best_returns_none_when_stack_is_empty(self): # arrange stack = _Stack(3) # assert self.assertEqual(stack.best(), None) nltk-3.7/nltk/test/util.doctest000066400000000000000000000017631420073152400166330ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. For license information, see LICENSE.TXT ================= Utility functions ================= >>> from nltk.util import * >>> from nltk.tree import Tree >>> print_string("This is a long string, therefore it should break", 25) This is a long string, therefore it should break >>> re_show("[a-z]+", "sdf123") {sdf}123 >>> tree = Tree(5, ... [Tree(4, [Tree(2, [1, 3])]), ... Tree(8, [Tree(6, [7]), 9])]) >>> for x in breadth_first(tree): ... if isinstance(x, int): print(x) ... else: print(x.label()) 5 4 8 2 6 9 1 3 7 >>> for x in breadth_first(tree, maxdepth=2): ... if isinstance(x, int): print(x) ... else: print(x.label()) 5 4 8 2 6 9 >>> invert_dict({1: 2}) defaultdict(<... 'list'>, {2: 1}) >>> invert_dict({1: [3, 4, 5]}) defaultdict(<... 'list'>, {3: [1], 4: [1], 5: [1]}) nltk-3.7/nltk/test/wordnet.doctest000066400000000000000000000671651420073152400173500ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. For license information, see LICENSE.TXT ================= WordNet Interface ================= WordNet is just another NLTK corpus reader, and can be imported like this: >>> from nltk.corpus import wordnet For more compact code, we recommend: >>> from nltk.corpus import wordnet as wn ----- Words ----- Look up a word using ``synsets()``; this function has an optional ``pos`` argument which lets you constrain the part of speech of the word: >>> wn.synsets('dog') [Synset('dog.n.01'), Synset('frump.n.01'), Synset('dog.n.03'), Synset('cad.n.01'), Synset('frank.n.02'), Synset('pawl.n.01'), Synset('andiron.n.01'), Synset('chase.v.01')] >>> wn.synsets('dog', pos=wn.VERB) [Synset('chase.v.01')] The other parts of speech are ``NOUN``, ``ADJ`` and ``ADV``. 
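As a quick illustration of using these constants (not part of the original doctest; the expected result is inferred from the synset list shown above), one can check which parts of speech a word is covered by::

    from nltk.corpus import wordnet as wn

    # Collect the parts of speech for which 'dog' has at least one synset.
    sorted({synset.pos() for synset in wn.synsets('dog')})   # ['n', 'v']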
A synset is identified with a 3-part name of the form: word.pos.nn: >>> wn.synset('dog.n.01') Synset('dog.n.01') >>> print(wn.synset('dog.n.01').definition()) a member of the genus Canis (probably descended from the common wolf) that has been domesticated by man since prehistoric times; occurs in many breeds >>> len(wn.synset('dog.n.01').examples()) 1 >>> print(wn.synset('dog.n.01').examples()[0]) the dog barked all night >>> wn.synset('dog.n.01').lemmas() [Lemma('dog.n.01.dog'), Lemma('dog.n.01.domestic_dog'), Lemma('dog.n.01.Canis_familiaris')] >>> [str(lemma.name()) for lemma in wn.synset('dog.n.01').lemmas()] ['dog', 'domestic_dog', 'Canis_familiaris'] >>> wn.lemma('dog.n.01.dog').synset() Synset('dog.n.01') The WordNet corpus reader gives access to the Open Multilingual WordNet, using ISO-639 language codes. >>> sorted(wn.langs()) ['als', 'arb', 'bul', 'cat', 'cmn', 'dan', 'ell', 'eng', 'eus', 'fin', 'fra', 'glg', 'heb', 'hrv', 'ind', 'isl', 'ita', 'ita_iwn', 'jpn', 'lit', 'nld', 'nno', 'nob', 'pol', 'por', 'ron', 'slk', 'slv', 'spa', 'swe', 'tha', 'zsm'] >>> wn.synsets(b'\xe7\x8a\xac'.decode('utf-8'), lang='jpn') [Synset('dog.n.01'), Synset('spy.n.01')] >>> wn.synset('spy.n.01').lemma_names('jpn') ['いぬ', 'まわし者', 'スパイ', '回し者', '回者', '密偵', '工作員', '廻し者', '廻者', '探', '探り', '犬', '秘密捜査員', '諜報員', '諜者', '間者', '間諜', '隠密'] >>> wn.synset('dog.n.01').lemma_names('ita') ['Canis_familiaris', 'cane'] >>> wn.lemmas('cane', lang='ita') [Lemma('dog.n.01.cane'), Lemma('cramp.n.02.cane'), Lemma('hammer.n.01.cane'), Lemma('bad_person.n.01.cane'), Lemma('incompetent.n.01.cane')] >>> sorted(wn.synset('dog.n.01').lemmas('dan')) [Lemma('dog.n.01.hund'), Lemma('dog.n.01.k\xf8ter'), Lemma('dog.n.01.vovhund'), Lemma('dog.n.01.vovse')] >>> sorted(wn.synset('dog.n.01').lemmas('por')) [Lemma('dog.n.01.cachorra'), Lemma('dog.n.01.cachorro'), Lemma('dog.n.01.cadela'), Lemma('dog.n.01.c\xe3o')] >>> dog_lemma = wn.lemma(b'dog.n.01.c\xc3\xa3o'.decode('utf-8'), lang='por') >>> dog_lemma Lemma('dog.n.01.c\xe3o') >>> dog_lemma.lang() 'por' >>> len(list(wordnet.all_lemma_names(pos='n', lang='jpn'))) 66031 ------- Synsets ------- `Synset`: a set of synonyms that share a common meaning. >>> dog = wn.synset('dog.n.01') >>> dog.hypernyms() [Synset('canine.n.02'), Synset('domestic_animal.n.01')] >>> dog.hyponyms() [Synset('basenji.n.01'), Synset('corgi.n.01'), Synset('cur.n.01'), Synset('dalmatian.n.02'), ...] >>> dog.member_holonyms() [Synset('canis.n.01'), Synset('pack.n.06')] >>> dog.root_hypernyms() [Synset('entity.n.01')] >>> wn.synset('dog.n.01').lowest_common_hypernyms(wn.synset('cat.n.01')) [Synset('carnivore.n.01')] Each synset contains one or more lemmas, which represent a specific sense of a specific word. Note that some relations are defined by WordNet only over Lemmas: >>> good = wn.synset('good.a.01') >>> good.antonyms() Traceback (most recent call last): File "", line 1, in AttributeError: 'Synset' object has no attribute 'antonyms' >>> good.lemmas()[0].antonyms() [Lemma('bad.a.01.bad')] The relations that are currently defined in this way are `antonyms`, `derivationally_related_forms` and `pertainyms`. 
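Because these three relations live on lemmas rather than synsets, collecting, say, the antonyms of a word means iterating over its lemmas. Below is a minimal illustrative helper that uses only the lemma-level API shown above; the function name ``word_antonyms`` is invented for this sketch::

    from nltk.corpus import wordnet as wn

    def word_antonyms(word, pos=None):
        # Antonymy is defined over lemmas, not synsets, so gather the
        # antonym lemmas of every lemma of the word and return their names.
        return sorted(
            {antonym.name()
             for lemma in wn.lemmas(word, pos=pos)
             for antonym in lemma.antonyms()}
        )

For example, ``word_antonyms('good', wn.ADJ)`` includes ``'bad'``, mirroring the ``good.a.01`` example above.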
If you know the byte offset used to identify a synset in the original Princeton WordNet data file, you can use that to instantiate the synset in NLTK: >>> wn.synset_from_pos_and_offset('n', 4543158) Synset('wagon.n.01') ------ Lemmas ------ >>> eat = wn.lemma('eat.v.03.eat') >>> eat Lemma('feed.v.06.eat') >>> print(eat.key()) eat%2:34:02:: >>> eat.count() 4 >>> wn.lemma_from_key(eat.key()) Lemma('feed.v.06.eat') >>> wn.lemma_from_key(eat.key()).synset() Synset('feed.v.06') >>> wn.lemma_from_key('feebleminded%5:00:00:retarded:00') Lemma('backward.s.03.feebleminded') >>> for lemma in wn.synset('eat.v.03').lemmas(): ... print(lemma, lemma.count()) ... Lemma('feed.v.06.feed') 3 Lemma('feed.v.06.eat') 4 >>> for lemma in wn.lemmas('eat', 'v'): ... print(lemma, lemma.count()) ... Lemma('eat.v.01.eat') 61 Lemma('eat.v.02.eat') 13 Lemma('feed.v.06.eat') 4 Lemma('eat.v.04.eat') 0 Lemma('consume.v.05.eat') 0 Lemma('corrode.v.01.eat') 0 >>> wn.lemma('jump.v.11.jump') Lemma('jump.v.11.jump') Lemmas can also have relations between them: >>> vocal = wn.lemma('vocal.a.01.vocal') >>> vocal.derivationally_related_forms() [Lemma('vocalize.v.02.vocalize')] >>> vocal.pertainyms() [Lemma('voice.n.02.voice')] >>> vocal.antonyms() [Lemma('instrumental.a.01.instrumental')] The three relations above exist only on lemmas, not on synsets. ----------- Verb Frames ----------- >>> wn.synset('think.v.01').frame_ids() [5, 9] >>> for lemma in wn.synset('think.v.01').lemmas(): ... print(lemma, lemma.frame_ids()) ... print(" | ".join(lemma.frame_strings())) ... Lemma('think.v.01.think') [5, 9] Something think something Adjective/Noun | Somebody think somebody Lemma('think.v.01.believe') [5, 9] Something believe something Adjective/Noun | Somebody believe somebody Lemma('think.v.01.consider') [5, 9] Something consider something Adjective/Noun | Somebody consider somebody Lemma('think.v.01.conceive') [5, 9] Something conceive something Adjective/Noun | Somebody conceive somebody >>> wn.synset('stretch.v.02').frame_ids() [8] >>> for lemma in wn.synset('stretch.v.02').lemmas(): ... print(lemma, lemma.frame_ids()) ... print(" | ".join(lemma.frame_strings())) ... Lemma('stretch.v.02.stretch') [8, 2] Somebody stretch something | Somebody stretch Lemma('stretch.v.02.extend') [8] Somebody extend something ---------- Similarity ---------- >>> dog = wn.synset('dog.n.01') >>> cat = wn.synset('cat.n.01') >>> hit = wn.synset('hit.v.01') >>> slap = wn.synset('slap.v.01') ``synset1.path_similarity(synset2):`` Return a score denoting how similar two word senses are, based on the shortest path that connects the senses in the is-a (hypernym/hypnoym) taxonomy. The score is in the range 0 to 1. By default, there is now a fake root node added to verbs so for cases where previously a path could not be found---and None was returned---it should return a value. The old behavior can be achieved by setting simulate_root to be False. A score of 1 represents identity i.e. comparing a sense with itself will return 1. >>> dog.path_similarity(cat) 0.2... >>> hit.path_similarity(slap) 0.142... >>> wn.path_similarity(hit, slap) 0.142... >>> print(hit.path_similarity(slap, simulate_root=False)) None >>> print(wn.path_similarity(hit, slap, simulate_root=False)) None ``synset1.lch_similarity(synset2):`` Leacock-Chodorow Similarity: Return a score denoting how similar two word senses are, based on the shortest path that connects the senses (as above) and the maximum depth of the taxonomy in which the senses occur. 
The relationship is given as -log(p/2d) where p is the shortest path length and d the taxonomy depth. >>> dog.lch_similarity(cat) 2.028... >>> hit.lch_similarity(slap) 1.312... >>> wn.lch_similarity(hit, slap) 1.312... >>> print(hit.lch_similarity(slap, simulate_root=False)) None >>> print(wn.lch_similarity(hit, slap, simulate_root=False)) None ``synset1.wup_similarity(synset2):`` Wu-Palmer Similarity: Return a score denoting how similar two word senses are, based on the depth of the two senses in the taxonomy and that of their Least Common Subsumer (most specific ancestor node). Note that at this time the scores given do _not_ always agree with those given by Pedersen's Perl implementation of Wordnet Similarity. The LCS does not necessarily feature in the shortest path connecting the two senses, as it is by definition the common ancestor deepest in the taxonomy, not closest to the two senses. Typically, however, it will so feature. Where multiple candidates for the LCS exist, that whose shortest path to the root node is the longest will be selected. Where the LCS has multiple paths to the root, the longer path is used for the purposes of the calculation. >>> dog.wup_similarity(cat) 0.857... >>> hit.wup_similarity(slap) 0.25 >>> wn.wup_similarity(hit, slap) 0.25 >>> print(hit.wup_similarity(slap, simulate_root=False)) None >>> print(wn.wup_similarity(hit, slap, simulate_root=False)) None ``wordnet_ic`` Information Content: Load an information content file from the wordnet_ic corpus. >>> from nltk.corpus import wordnet_ic >>> brown_ic = wordnet_ic.ic('ic-brown.dat') >>> semcor_ic = wordnet_ic.ic('ic-semcor.dat') Or you can create an information content dictionary from a corpus (or anything that has a words() method). >>> from nltk.corpus import genesis >>> genesis_ic = wn.ic(genesis, False, 0.0) ``synset1.res_similarity(synset2, ic):`` Resnik Similarity: Return a score denoting how similar two word senses are, based on the Information Content (IC) of the Least Common Subsumer (most specific ancestor node). Note that for any similarity measure that uses information content, the result is dependent on the corpus used to generate the information content and the specifics of how the information content was created. >>> dog.res_similarity(cat, brown_ic) 7.911... >>> dog.res_similarity(cat, genesis_ic) 7.204... ``synset1.jcn_similarity(synset2, ic):`` Jiang-Conrath Similarity Return a score denoting how similar two word senses are, based on the Information Content (IC) of the Least Common Subsumer (most specific ancestor node) and that of the two input Synsets. The relationship is given by the equation 1 / (IC(s1) + IC(s2) - 2 * IC(lcs)). >>> dog.jcn_similarity(cat, brown_ic) 0.449... >>> dog.jcn_similarity(cat, genesis_ic) 0.285... ``synset1.lin_similarity(synset2, ic):`` Lin Similarity: Return a score denoting how similar two word senses are, based on the Information Content (IC) of the Least Common Subsumer (most specific ancestor node) and that of the two input Synsets. The relationship is given by the equation 2 * IC(lcs) / (IC(s1) + IC(s2)). >>> dog.lin_similarity(cat, semcor_ic) 0.886... --------------------- Access to all Synsets --------------------- Iterate over all the noun synsets: >>> for synset in list(wn.all_synsets('n'))[:10]: ... print(synset) ... 
Synset('entity.n.01') Synset('physical_entity.n.01') Synset('abstraction.n.06') Synset('thing.n.12') Synset('object.n.01') Synset('whole.n.02') Synset('congener.n.03') Synset('living_thing.n.01') Synset('organism.n.01') Synset('benthos.n.02') Get all synsets for this word, possibly restricted by POS: >>> wn.synsets('dog') [Synset('dog.n.01'), Synset('frump.n.01'), Synset('dog.n.03'), Synset('cad.n.01'), ...] >>> wn.synsets('dog', pos='v') [Synset('chase.v.01')] Walk through the noun synsets looking at their hypernyms: >>> from itertools import islice >>> for synset in islice(wn.all_synsets('n'), 5): ... print(synset, synset.hypernyms()) ... Synset('entity.n.01') [] Synset('physical_entity.n.01') [Synset('entity.n.01')] Synset('abstraction.n.06') [Synset('entity.n.01')] Synset('thing.n.12') [Synset('physical_entity.n.01')] Synset('object.n.01') [Synset('physical_entity.n.01')] ------ Morphy ------ Look up forms not in WordNet, with the help of Morphy: >>> wn.morphy('denied', wn.NOUN) >>> print(wn.morphy('denied', wn.VERB)) deny >>> wn.synsets('denied', wn.NOUN) [] >>> wn.synsets('denied', wn.VERB) [Synset('deny.v.01'), Synset('deny.v.02'), Synset('deny.v.03'), Synset('deny.v.04'), Synset('deny.v.05'), Synset('traverse.v.03'), Synset('deny.v.07')] Morphy uses a combination of inflectional ending rules and exception lists to handle a variety of different possibilities: >>> print(wn.morphy('dogs')) dog >>> print(wn.morphy('churches')) church >>> print(wn.morphy('aardwolves')) aardwolf >>> print(wn.morphy('abaci')) abacus >>> print(wn.morphy('book', wn.NOUN)) book >>> wn.morphy('hardrock', wn.ADV) >>> wn.morphy('book', wn.ADJ) >>> wn.morphy('his', wn.NOUN) >>> --------------- Synset Closures --------------- Compute transitive closures of synsets >>> dog = wn.synset('dog.n.01') >>> hypo = lambda s: s.hyponyms() >>> hyper = lambda s: s.hypernyms() >>> list(dog.closure(hypo, depth=1)) == dog.hyponyms() True >>> list(dog.closure(hyper, depth=1)) == dog.hypernyms() True >>> list(dog.closure(hypo)) [Synset('basenji.n.01'), Synset('corgi.n.01'), Synset('cur.n.01'), Synset('dalmatian.n.02'), Synset('great_pyrenees.n.01'), Synset('griffon.n.02'), Synset('hunting_dog.n.01'), Synset('lapdog.n.01'), Synset('leonberg.n.01'), Synset('mexican_hairless.n.01'), Synset('newfoundland.n.01'), Synset('pooch.n.01'), Synset('poodle.n.01'), ...] >>> list(dog.closure(hyper)) [Synset('canine.n.02'), Synset('domestic_animal.n.01'), Synset('carnivore.n.01'), Synset('animal.n.01'), Synset('placental.n.01'), Synset('organism.n.01'), Synset('mammal.n.01'), Synset('living_thing.n.01'), Synset('vertebrate.n.01'), Synset('whole.n.02'), Synset('chordate.n.01'), Synset('object.n.01'), Synset('physical_entity.n.01'), Synset('entity.n.01')] ---------------- Regression Tests ---------------- Bug 85: morphy returns the base form of a word, if it's input is given as a base form for a POS for which that word is not defined: >>> wn.synsets('book', wn.NOUN) [Synset('book.n.01'), Synset('book.n.02'), Synset('record.n.05'), Synset('script.n.01'), Synset('ledger.n.01'), Synset('book.n.06'), Synset('book.n.07'), Synset('koran.n.01'), Synset('bible.n.01'), Synset('book.n.10'), Synset('book.n.11')] >>> wn.synsets('book', wn.ADJ) [] >>> wn.morphy('book', wn.NOUN) 'book' >>> wn.morphy('book', wn.ADJ) >>> Bug 160: wup_similarity breaks when the two synsets have no common hypernym >>> t = wn.synsets('picasso')[0] >>> m = wn.synsets('male')[1] >>> t.wup_similarity(m) 0.631... 
Issue #2278: wup_similarity not commutative when comparing a noun and a verb. Patch #2650 resolved this error. As a result, the output of the following use of wup_similarity no longer returns None. >>> t = wn.synsets('titan')[1] >>> s = wn.synsets('say', wn.VERB)[0] >>> t.wup_similarity(s) 0.142... Bug 21: "instance of" not included in LCS (very similar to bug 160) >>> a = wn.synsets("writings")[0] >>> b = wn.synsets("scripture")[0] >>> brown_ic = wordnet_ic.ic('ic-brown.dat') >>> a.jcn_similarity(b, brown_ic) 0.175... Bug 221: Verb root IC is zero >>> from nltk.corpus.reader.wordnet import information_content >>> s = wn.synsets('say', wn.VERB)[0] >>> information_content(s, brown_ic) 4.623... Bug 161: Comparison between WN keys/lemmas should not be case sensitive >>> k = wn.synsets("jefferson")[0].lemmas()[0].key() >>> wn.lemma_from_key(k) Lemma('jefferson.n.01.Jefferson') >>> wn.lemma_from_key(k.upper()) Lemma('jefferson.n.01.Jefferson') Bug 99: WordNet root_hypernyms gives incorrect results >>> from nltk.corpus import wordnet as wn >>> for s in wn.all_synsets(wn.NOUN): ... if s.root_hypernyms()[0] != wn.synset('entity.n.01'): ... print(s, s.root_hypernyms()) ... >>> Bug 382: JCN Division by zero error >>> tow = wn.synset('tow.v.01') >>> shlep = wn.synset('shlep.v.02') >>> from nltk.corpus import wordnet_ic >>> brown_ic = wordnet_ic.ic('ic-brown.dat') >>> tow.jcn_similarity(shlep, brown_ic) 1...e+300 Bug 428: Depth is zero for instance nouns >>> s = wn.synset("lincoln.n.01") >>> s.max_depth() > 0 True Bug 429: Information content smoothing used old reference to all_synsets >>> genesis_ic = wn.ic(genesis, True, 1.0) Bug 430: all_synsets used wrong pos lookup when synsets were cached >>> for ii in wn.all_synsets(): pass >>> for ii in wn.all_synsets(): pass Bug 470: shortest_path_distance ignored instance hypernyms >>> google = wordnet.synsets("google")[0] >>> earth = wordnet.synsets("earth")[0] >>> google.wup_similarity(earth) 0.1... Bug 484: similarity metrics returned -1 instead of None for no LCS >>> t = wn.synsets('fly', wn.VERB)[0] >>> s = wn.synsets('say', wn.VERB)[0] >>> print(s.shortest_path_distance(t)) None >>> print(s.path_similarity(t, simulate_root=False)) None >>> print(s.lch_similarity(t, simulate_root=False)) None >>> print(s.wup_similarity(t, simulate_root=False)) None Bug 427: "pants" does not return all the senses it should >>> from nltk.corpus import wordnet >>> wordnet.synsets("pants",'n') [Synset('bloomers.n.01'), Synset('pant.n.01'), Synset('trouser.n.01'), Synset('gasp.n.01')] Bug 482: Some nouns not being lemmatised by WordNetLemmatizer().lemmatize >>> from nltk.stem.wordnet import WordNetLemmatizer >>> WordNetLemmatizer().lemmatize("eggs", pos="n") 'egg' >>> WordNetLemmatizer().lemmatize("legs", pos="n") 'leg' Bug 284: instance hypernyms not used in similarity calculations >>> wn.synset('john.n.02').lch_similarity(wn.synset('dog.n.01')) 1.335... >>> wn.synset('john.n.02').wup_similarity(wn.synset('dog.n.01')) 0.571... >>> wn.synset('john.n.02').res_similarity(wn.synset('dog.n.01'), brown_ic) 2.224... >>> wn.synset('john.n.02').jcn_similarity(wn.synset('dog.n.01'), brown_ic) 0.075... >>> wn.synset('john.n.02').lin_similarity(wn.synset('dog.n.01'), brown_ic) 0.252... 
>>> wn.synset('john.n.02').hypernym_paths() [[Synset('entity.n.01'), ..., Synset('john.n.02')]] Issue 541: add domains to wordnet >>> wn.synset('code.n.03').topic_domains() [Synset('computer_science.n.01')] >>> wn.synset('pukka.a.01').region_domains() [Synset('india.n.01')] >>> wn.synset('freaky.a.01').usage_domains() [Synset('slang.n.02')] Issue 629: wordnet failures when python run with -O optimizations >>> # Run the test suite with python -O to check this >>> wn.synsets("brunch") [Synset('brunch.n.01'), Synset('brunch.v.01')] Issue 395: wordnet returns incorrect result for lowest_common_hypernyms of chef and policeman >>> wn.synset('policeman.n.01').lowest_common_hypernyms(wn.synset('chef.n.01')) [Synset('person.n.01')] Bug https://github.com/nltk/nltk/issues/1641: Non-English lemmas containing capital letters cannot be looked up using wordnet.lemmas() or wordnet.synsets() >>> wn.lemmas('Londres', lang='fra') [Lemma('united_kingdom.n.01.Londres'), Lemma('london.n.01.Londres'), Lemma('london.n.02.Londres')] >>> wn.lemmas('londres', lang='fra') [Lemma('united_kingdom.n.01.Londres'), Lemma('london.n.01.Londres'), Lemma('london.n.02.Londres')] Patch-1 https://github.com/nltk/nltk/pull/2065 Adding 3 functions (relations) to WordNet class >>> wn.synsets("computer_science")[0].in_topic_domains()[2] Synset('access_time.n.01') >>> wn.synsets("France")[0].in_region_domains()[18] Synset('french.n.01') >>> wn.synsets("slang")[1].in_usage_domains()[18] Synset('can-do.s.01') Issue 2721: WordNetCorpusReader.ic() does not add smoothing to N >>> class FakeCorpus: ... def words(self): return ['word'] ... >>> fake_ic = wn.ic(FakeCorpus(), False, 1.0) >>> word = wn.synset('word.n.01') >>> information_content(word, fake_ic) > 0 True ------------------------------------------------ Endlessness vs. intractability in relation trees ------------------------------------------------ 1. Endlessness -------------- Until NLTK v. 3.5, the tree() function looped forever on symmetric relations (verb_groups, attributes, and most also_sees). But in the current version, tree() now detects and discards these cycles: >>> from pprint import pprint >>> pprint(wn.synset('bound.a.01').tree(lambda s:s.also_sees())) [Synset('bound.a.01'), [Synset('unfree.a.02'), [Synset('confined.a.02'), [Synset('restricted.a.01'), [Synset('classified.a.02')]]], [Synset('dependent.a.01')], [Synset('restricted.a.01'), [Synset('classified.a.02')], [Synset('confined.a.02')]]]] Specifying the "cut_mark" parameter increases verbosity, so that the cycles are mentioned in the output, together with the level where they occur: >>> pprint(wn.synset('bound.a.01').tree(lambda s:s.also_sees(),cut_mark='...')) [Synset('bound.a.01'), [Synset('unfree.a.02'), "Cycle(Synset('bound.a.01'),-3,...)", [Synset('confined.a.02'), [Synset('restricted.a.01'), [Synset('classified.a.02')], "Cycle(Synset('confined.a.02'),-5,...)", "Cycle(Synset('unfree.a.02'),-5,...)"], "Cycle(Synset('unfree.a.02'),-4,...)"], [Synset('dependent.a.01'), "Cycle(Synset('unfree.a.02'),-4,...)"], [Synset('restricted.a.01'), [Synset('classified.a.02')], [Synset('confined.a.02'), "Cycle(Synset('restricted.a.01'),-5,...)", "Cycle(Synset('unfree.a.02'),-5,...)"], "Cycle(Synset('unfree.a.02'),-4,...)"]]] 2. Intractability ----------------- However, even after discarding the infinite cycles, some trees can remain intractable, due to combinatorial explosion in a relation. 
This happens in WordNet, because the also.sees() relation has a big Strongly Connected Component (_SCC_) consisting in 758 synsets, where any member node is transitively connected by the same relation, to all other members of the same SCC. This produces intractable relation trees for each of these 758 synsets, i. e. trees that are too big to compute or display on any computer. For example, the synset 'concrete.a.01' is a member of the largest SCC, so its also_sees() tree is intractable, and can normally only be handled by limiting the "depth" parameter to display a small number of levels: >>> from pprint import pprint >>> pprint(wn.synset('concrete.a.01').tree(lambda s:s.also_sees(),cut_mark='...',depth=2)) [Synset('concrete.a.01'), [Synset('practical.a.01'), "Cycle(Synset('concrete.a.01'),0,...)", [Synset('possible.a.01'), '...'], [Synset('realistic.a.01'), '...'], [Synset('serviceable.a.01'), '...']], [Synset('real.a.01'), "Cycle(Synset('concrete.a.01'),0,...)", [Synset('genuine.a.01'), '...'], [Synset('realistic.a.01'), '...'], [Synset('sincere.a.01'), '...']], [Synset('tangible.a.01'), "Cycle(Synset('concrete.a.01'),0,...)"]] 2.1 First solution: acyclic_tree() .................................. On the other hand, the new acyclic_tree() function is able to also handle the intractable cases. The also_sees() acyclic tree of 'concrete.a.01' is several hundred lines long, so here is a simpler example, concerning a much smaller SCC: counting only five members, the SCC that includes 'bound.a.01' is tractable with the normal tree() function, as seen above. But while tree() only prunes redundancy within local branches, acyclic_tree prunes the tree globally, thus discarding any additional redundancy, and produces a tree that includes all reachable nodes (i. e. a _spanning tree_). This tree is _minimal_ because it includes the reachable nodes only once, but it is not necessarily a _Minimum Spanning Tree_ (MST), because the Depth-first search strategy does not guarantee that nodes are reached through the lowest number of links (as Breadth-first search would). >>> pprint(wn.synset('bound.a.01').acyclic_tree(lambda s:s.also_sees())) [Synset('bound.a.01'), [Synset('unfree.a.02'), [Synset('confined.a.02'), [Synset('restricted.a.01'), [Synset('classified.a.02')]]], [Synset('dependent.a.01')]]] Again, specifying the "cut_mark" parameter increases verbosity, so that the cycles are mentioned in the output, together with the level where they occur: >>> pprint(wn.synset('bound.a.01').acyclic_tree(lambda s:s.also_sees(),cut_mark='...')) [Synset('bound.a.01'), [Synset('unfree.a.02'), "Cycle(Synset('bound.a.01'),-3,...)", [Synset('confined.a.02'), [Synset('restricted.a.01'), [Synset('classified.a.02')], "Cycle(Synset('confined.a.02'),-5,...)", "Cycle(Synset('unfree.a.02'),-5,...)"], "Cycle(Synset('unfree.a.02'),-4,...)"], [Synset('dependent.a.01'), "Cycle(Synset('unfree.a.02'),-4,...)"], "Cycle(Synset('restricted.a.01'),-3,...)"]] 2.2 Better solution: mst() .......................... A Minimum Spanning Tree (MST) spans all the nodes of a relation subgraph once, while guaranteeing that each node is reached through the shortest path possible. In unweighted relation graphs like WordNet, a MST can be computed very efficiently in linear time, using Breadth-First Search (BFS). Like acyclic_tree(), the new "unweighted_minimum_spanning_tree()" function (imported in the Wordnet module as "mst") handles intractable trees, such as the example discussed above: "wn.synset('concrete.a.01').mst(lambda s:s.also_sees())". 
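The shortest-path property comes from the breadth-first traversal itself. As a
rough, self-contained illustration (this is our own sketch, not the NLTK
implementation), an unweighted BFS spanning tree over any relation function can
be built as follows::

    from collections import deque

    def bfs_spanning_tree(root, rel):
        # Record, for every reachable node, the parent through which it was
        # first reached; BFS expands level by level, so each node is reached
        # via a minimal number of links.
        parent = {root: None}
        queue = deque([root])
        while queue:
            node = queue.popleft()
            for neighbour in rel(node):
                if neighbour not in parent:
                    parent[neighbour] = node
                    queue.append(neighbour)
        return parent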
But, while the also_sees() acyclic_tree of 'bound.a.01' reaches 'classified.a.02' through four links, using depth-first search as seen above (bound.a.01 > unfree.a.02 > confined.a.02 > restricted.a.01 > classified.a.02), in the following MST, the path to 'classified.a.02' is the shortest possible, consisting only in three links (bound.a.01 > unfree.a.02 > restricted.a.01 > classified.a.02): >>> pprint(wn.synset('bound.a.01').mst(lambda s:s.also_sees())) [Synset('bound.a.01'), [Synset('unfree.a.02'), [Synset('confined.a.02')], [Synset('dependent.a.01')], [Synset('restricted.a.01'), [Synset('classified.a.02')]]]] ---------------------------------------------------------------- Loading alternative Wordnet versions ---------------------------------------------------------------- >>> print("Wordnet {}".format(wn.get_version())) Wordnet 3.0 >>> from nltk.corpus import wordnet31 as wn31 >>> print("Wordnet {}".format(wn31.get_version())) Wordnet 3.1 >>> print(wn.synset('restrain.v.01').hyponyms()) [Synset('confine.v.03'), Synset('control.v.02'), Synset('hold.v.36'), Synset('inhibit.v.04')] >>> print(wn31.synset('restrain.v.01').hyponyms()) [Synset('enchain.v.01'), Synset('fetter.v.01'), Synset('ground.v.02'), Synset('impound.v.02'), Synset('pen_up.v.01'), Synset('pinion.v.01'), Synset('pound.v.06'), Synset('tie_down.v.01')] >>> print(wn31.synset('restrain.v.04').hyponyms()) [Synset('baffle.v.03'), Synset('confine.v.02'), Synset('control.v.02'), Synset('hold.v.36'), Synset('rule.v.07'), Synset('swallow.v.06'), Synset('wink.v.04')] ------------- Teardown test ------------- >>> from nltk.corpus import wordnet >>> wordnet._unload() nltk-3.7/nltk/test/wordnet_lch.doctest000066400000000000000000000044041420073152400201610ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. For license information, see LICENSE.TXT =============================== WordNet Lowest Common Hypernyms =============================== Wordnet's lowest_common_hypernyms() method is based used to locate the lowest single hypernym that is shared by two given words: >>> from nltk.corpus import wordnet as wn >>> wn.synset('kin.n.01').lowest_common_hypernyms(wn.synset('mother.n.01')) [Synset('relative.n.01')] >>> wn.synset('policeman.n.01').lowest_common_hypernyms(wn.synset('chef.n.01')) [Synset('person.n.01')] This method generally returns a single result, but in some cases, more than one valid LCH is possible: >>> wn.synset('body.n.09').lowest_common_hypernyms(wn.synset('sidereal_day.n.01')) [Synset('attribute.n.02'), Synset('measure.n.02')] In some cases, lowest_common_hypernyms() can return one of the synsets which was passed to it as an argument: >>> wn.synset('woman.n.01').lowest_common_hypernyms(wn.synset('girlfriend.n.02')) [Synset('woman.n.01')] In NLTK 3.0a2 the behavior of lowest_common_hypernyms() was changed to give more accurate results in a small set of cases, generally when dealing with nouns describing social roles or jobs. 
To emulate the pre v3.0a2 behavior, you can set the use_min_depth=True flag: >>> wn.synset('policeman.n.01').lowest_common_hypernyms(wn.synset('chef.n.01')) [Synset('person.n.01')] >>> wn.synset('policeman.n.01').lowest_common_hypernyms(wn.synset('chef.n.01'), use_min_depth=True) [Synset('organism.n.01')] In some cases use_min_depth=True may return more or fewer results than the default behavior: >>> wn.synset('woman.n.01').lowest_common_hypernyms(wn.synset('girlfriend.n.02')) [Synset('woman.n.01')] >>> wn.synset('woman.n.01').lowest_common_hypernyms(wn.synset('girlfriend.n.02'), use_min_depth=True) [Synset('organism.n.01'), Synset('woman.n.01')] In the general case, however, they tend to return the same results: >>> wn.synset('body.n.09').lowest_common_hypernyms(wn.synset('sidereal_day.n.01')) [Synset('attribute.n.02'), Synset('measure.n.02')] >>> wn.synset('body.n.09').lowest_common_hypernyms(wn.synset('sidereal_day.n.01'), use_min_depth=True) [Synset('attribute.n.02'), Synset('measure.n.02')] nltk-3.7/nltk/test/wsd.doctest000066400000000000000000000056021420073152400164470ustar00rootroot00000000000000.. Copyright (C) 2001-2022 NLTK Project .. For license information, see LICENSE.TXT .. -*- coding: utf-8 -*- ========================= Word Sense Disambiguation ========================= Lesk Algorithm -------------- Performs the classic Lesk algorithm for Word Sense Disambiguation (WSD) using a the definitions of the ambiguous word. Given an ambiguous word and the context in which the word occurs, Lesk returns a Synset with the highest number of overlapping words between the context sentence and different definitions from each Synset. >>> from nltk.wsd import lesk >>> sent = ['I', 'went', 'to', 'the', 'bank', 'to', 'deposit', 'money', '.'] >>> print(lesk(sent, 'bank', 'n')) Synset('savings_bank.n.02') >>> print(lesk(sent, 'bank')) Synset('savings_bank.n.02') The definitions for "bank" are: >>> from nltk.corpus import wordnet as wn >>> for ss in wn.synsets('bank'): ... print(ss, ss.definition()) ... Synset('bank.n.01') sloping land (especially the slope beside a body of water) Synset('depository_financial_institution.n.01') a financial institution that accepts deposits and channels the money into lending activities Synset('bank.n.03') a long ridge or pile Synset('bank.n.04') an arrangement of similar objects in a row or in tiers Synset('bank.n.05') a supply or stock held in reserve for future use (especially in emergencies) Synset('bank.n.06') the funds held by a gambling house or the dealer in some gambling games Synset('bank.n.07') a slope in the turn of a road or track; the outside is higher than the inside in order to reduce the effects of centrifugal force Synset('savings_bank.n.02') a container (usually with a slot in the top) for keeping money at home Synset('bank.n.09') a building in which the business of banking transacted Synset('bank.n.10') a flight maneuver; aircraft tips laterally about its longitudinal axis (especially in turning) Synset('bank.v.01') tip laterally Synset('bank.v.02') enclose with a bank Synset('bank.v.03') do business with a bank or keep an account at a bank Synset('bank.v.04') act as the banker in a game or in gambling Synset('bank.v.05') be in the banking business Synset('deposit.v.02') put into a bank account Synset('bank.v.07') cover with ashes so to control the rate of burning Synset('trust.v.01') have confidence or faith in Test disambiguation of POS tagged `able`. 
>>> [(s, s.pos()) for s in wn.synsets('able')] [(Synset('able.a.01'), 'a'), (Synset('able.s.02'), 's'), (Synset('able.s.03'), 's'), (Synset('able.s.04'), 's')] >>> sent = 'people should be able to marry a person of their choice'.split() >>> lesk(sent, 'able') Synset('able.s.04') >>> lesk(sent, 'able', pos='a') Synset('able.a.01') Test behavior if there is are no matching senses. >>> lesk('John loves Mary'.split(), 'loves', synsets=[]) nltk-3.7/nltk/text.py000066400000000000000000000663761420073152400146610ustar00rootroot00000000000000# Natural Language Toolkit: Texts # # Copyright (C) 2001-2022 NLTK Project # Author: Steven Bird # Edward Loper # URL: # For license information, see LICENSE.TXT """ This module brings together a variety of NLTK functionality for text analysis, and provides simple, interactive interfaces. Functionality includes: concordancing, collocation discovery, regular expression search over tokenized strings, and distributional similarity. """ import re import sys from collections import Counter, defaultdict, namedtuple from functools import reduce from math import log from nltk.collocations import BigramCollocationFinder from nltk.lm import MLE from nltk.lm.preprocessing import padded_everygram_pipeline from nltk.metrics import BigramAssocMeasures, f_measure from nltk.probability import ConditionalFreqDist as CFD from nltk.probability import FreqDist from nltk.tokenize import sent_tokenize from nltk.util import LazyConcatenation, tokenwrap ConcordanceLine = namedtuple( "ConcordanceLine", ["left", "query", "right", "offset", "left_print", "right_print", "line"], ) class ContextIndex: """ A bidirectional index between words and their 'contexts' in a text. The context of a word is usually defined to be the words that occur in a fixed window around the word; but other definitions may also be used by providing a custom context function. """ @staticmethod def _default_context(tokens, i): """One left token and one right token, normalized to lowercase""" left = tokens[i - 1].lower() if i != 0 else "*START*" right = tokens[i + 1].lower() if i != len(tokens) - 1 else "*END*" return (left, right) def __init__(self, tokens, context_func=None, filter=None, key=lambda x: x): self._key = key self._tokens = tokens if context_func: self._context_func = context_func else: self._context_func = self._default_context if filter: tokens = [t for t in tokens if filter(t)] self._word_to_contexts = CFD( (self._key(w), self._context_func(tokens, i)) for i, w in enumerate(tokens) ) self._context_to_words = CFD( (self._context_func(tokens, i), self._key(w)) for i, w in enumerate(tokens) ) def tokens(self): """ :rtype: list(str) :return: The document that this context index was created from. """ return self._tokens def word_similarity_dict(self, word): """ Return a dictionary mapping from words to 'similarity scores,' indicating how often these two words occur in the same context. 
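        A hedged sketch of typical use (``tokens`` stands for any list of
        word strings)::

            idx = ContextIndex(tokens, filter=lambda w: w.isalpha(),
                               key=lambda s: s.lower())
            scores = idx.word_similarity_dict('dog')
            # `scores` maps every indexed word to its context-overlap score.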
""" word = self._key(word) word_contexts = set(self._word_to_contexts[word]) scores = {} for w, w_contexts in self._word_to_contexts.items(): scores[w] = f_measure(word_contexts, set(w_contexts)) return scores def similar_words(self, word, n=20): scores = defaultdict(int) for c in self._word_to_contexts[self._key(word)]: for w in self._context_to_words[c]: if w != word: scores[w] += ( self._context_to_words[c][word] * self._context_to_words[c][w] ) return sorted(scores, key=scores.get, reverse=True)[:n] def common_contexts(self, words, fail_on_unknown=False): """ Find contexts where the specified words can all appear; and return a frequency distribution mapping each context to the number of times that context was used. :param words: The words used to seed the similarity search :type words: str :param fail_on_unknown: If true, then raise a value error if any of the given words do not occur at all in the index. """ words = [self._key(w) for w in words] contexts = [set(self._word_to_contexts[w]) for w in words] empty = [words[i] for i in range(len(words)) if not contexts[i]] common = reduce(set.intersection, contexts) if empty and fail_on_unknown: raise ValueError("The following word(s) were not found:", " ".join(words)) elif not common: # nothing in common -- just return an empty freqdist. return FreqDist() else: fd = FreqDist( c for w in words for c in self._word_to_contexts[w] if c in common ) return fd class ConcordanceIndex: """ An index that can be used to look up the offset locations at which a given word occurs in a document. """ def __init__(self, tokens, key=lambda x: x): """ Construct a new concordance index. :param tokens: The document (list of tokens) that this concordance index was created from. This list can be used to access the context of a given word occurrence. :param key: A function that maps each token to a normalized version that will be used as a key in the index. E.g., if you use ``key=lambda s:s.lower()``, then the index will be case-insensitive. """ self._tokens = tokens """The document (list of tokens) that this concordance index was created from.""" self._key = key """Function mapping each token to an index key (or None).""" self._offsets = defaultdict(list) """Dictionary mapping words (or keys) to lists of offset indices.""" # Initialize the index (self._offsets) for index, word in enumerate(tokens): word = self._key(word) self._offsets[word].append(index) def tokens(self): """ :rtype: list(str) :return: The document that this concordance index was created from. """ return self._tokens def offsets(self, word): """ :rtype: list(int) :return: A list of the offset positions at which the given word occurs. If a key function was specified for the index, then given word's key will be looked up. """ word = self._key(word) return self._offsets[word] def __repr__(self): return "" % ( len(self._tokens), len(self._offsets), ) def find_concordance(self, word, width=80): """ Find all concordance lines given the query word. Provided with a list of words, these will be found as a phrase. 
""" if isinstance(word, list): phrase = word else: phrase = [word] half_width = (width - len(" ".join(phrase)) - 2) // 2 context = width // 4 # approx number of words of context # Find the instances of the word to create the ConcordanceLine concordance_list = [] offsets = self.offsets(phrase[0]) for i, word in enumerate(phrase[1:]): word_offsets = {offset - i - 1 for offset in self.offsets(word)} offsets = sorted(word_offsets.intersection(offsets)) if offsets: for i in offsets: query_word = " ".join(self._tokens[i : i + len(phrase)]) # Find the context of query word. left_context = self._tokens[max(0, i - context) : i] right_context = self._tokens[i + len(phrase) : i + context] # Create the pretty lines with the query_word in the middle. left_print = " ".join(left_context)[-half_width:] right_print = " ".join(right_context)[:half_width] # The WYSIWYG line of the concordance. line_print = " ".join([left_print, query_word, right_print]) # Create the ConcordanceLine concordance_line = ConcordanceLine( left_context, query_word, right_context, i, left_print, right_print, line_print, ) concordance_list.append(concordance_line) return concordance_list def print_concordance(self, word, width=80, lines=25): """ Print concordance lines given the query word. :param word: The target word or phrase (a list of strings) :type word: str or list :param lines: The number of lines to display (default=25) :type lines: int :param width: The width of each line, in characters (default=80) :type width: int :param save: The option to save the concordance. :type save: bool """ concordance_list = self.find_concordance(word, width=width) if not concordance_list: print("no matches") else: lines = min(lines, len(concordance_list)) print(f"Displaying {lines} of {len(concordance_list)} matches:") for i, concordance_line in enumerate(concordance_list[:lines]): print(concordance_line.line) class TokenSearcher: """ A class that makes it easier to use regular expressions to search over tokenized strings. The tokenized string is converted to a string where tokens are marked with angle brackets -- e.g., ``''``. The regular expression passed to the ``findall()`` method is modified to treat angle brackets as non-capturing parentheses, in addition to matching the token boundaries; and to have ``'.'`` not match the angle brackets. """ def __init__(self, tokens): self._raw = "".join("<" + w + ">" for w in tokens) def findall(self, regexp): """ Find instances of the regular expression in the text. The text is a list of tokens, and a regexp pattern to match a single token must be surrounded by angle brackets. E.g. >>> from nltk.text import TokenSearcher >>> print('hack'); from nltk.book import text1, text5, text9 hack... 
>>> text5.findall("<.*><.*>") you rule bro; telling you bro; u twizted bro >>> text1.findall("(<.*>)") monied; nervous; dangerous; white; white; white; pious; queer; good; mature; white; Cape; great; wise; wise; butterless; white; fiendish; pale; furious; better; certain; complete; dismasted; younger; brave; brave; brave; brave >>> text9.findall("{3,}") thread through those; the thought that; that the thing; the thing that; that that thing; through these than through; them that the; through the thick; them that they; thought that the :param regexp: A regular expression :type regexp: str """ # preprocess the regular expression regexp = re.sub(r"\s", "", regexp) regexp = re.sub(r"<", "(?:<(?:", regexp) regexp = re.sub(r">", ")>)", regexp) regexp = re.sub(r"(?]", regexp) # perform the search hits = re.findall(regexp, self._raw) # Sanity check for h in hits: if not h.startswith("<") and h.endswith(">"): raise ValueError("Bad regexp for TokenSearcher.findall") # postprocess the output hits = [h[1:-1].split("><") for h in hits] return hits class Text: """ A wrapper around a sequence of simple (string) tokens, which is intended to support initial exploration of texts (via the interactive console). Its methods perform a variety of analyses on the text's contexts (e.g., counting, concordancing, collocation discovery), and display the results. If you wish to write a program which makes use of these analyses, then you should bypass the ``Text`` class, and use the appropriate analysis function or class directly instead. A ``Text`` is typically initialized from a given document or corpus. E.g.: >>> import nltk.corpus >>> from nltk.text import Text >>> moby = Text(nltk.corpus.gutenberg.words('melville-moby_dick.txt')) """ # This defeats lazy loading, but makes things faster. This # *shouldn't* be necessary because the corpus view *should* be # doing intelligent caching, but without this it's running slow. # Look into whether the caching is working correctly. _COPY_TOKENS = True def __init__(self, tokens, name=None): """ Create a Text object. :param tokens: The source text. :type tokens: sequence of str """ if self._COPY_TOKENS: tokens = list(tokens) self.tokens = tokens if name: self.name = name elif "]" in tokens[:20]: end = tokens[:20].index("]") self.name = " ".join(str(tok) for tok in tokens[1:end]) else: self.name = " ".join(str(tok) for tok in tokens[:8]) + "..." # //////////////////////////////////////////////////////////// # Support item & slice access # //////////////////////////////////////////////////////////// def __getitem__(self, i): return self.tokens[i] def __len__(self): return len(self.tokens) # //////////////////////////////////////////////////////////// # Interactive console methods # //////////////////////////////////////////////////////////// def concordance(self, word, width=79, lines=25): """ Prints a concordance for ``word`` with the specified context window. Word matching is not case-sensitive. 
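        For example (output depends on the corpus; ``moby`` as constructed in
        the class docstring above)::

            moby.concordance('monstrous')
            moby.concordance(['of', 'the'], width=60, lines=5)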
:param word: The target word or phrase (a list of strings) :type word: str or list :param width: The width of each line, in characters (default=80) :type width: int :param lines: The number of lines to display (default=25) :type lines: int :seealso: ``ConcordanceIndex`` """ if "_concordance_index" not in self.__dict__: self._concordance_index = ConcordanceIndex( self.tokens, key=lambda s: s.lower() ) return self._concordance_index.print_concordance(word, width, lines) def concordance_list(self, word, width=79, lines=25): """ Generate a concordance for ``word`` with the specified context window. Word matching is not case-sensitive. :param word: The target word or phrase (a list of strings) :type word: str or list :param width: The width of each line, in characters (default=80) :type width: int :param lines: The number of lines to display (default=25) :type lines: int :seealso: ``ConcordanceIndex`` """ if "_concordance_index" not in self.__dict__: self._concordance_index = ConcordanceIndex( self.tokens, key=lambda s: s.lower() ) return self._concordance_index.find_concordance(word, width)[:lines] def collocation_list(self, num=20, window_size=2): """ Return collocations derived from the text, ignoring stopwords. >>> from nltk.book import text4 >>> text4.collocation_list()[:2] [('United', 'States'), ('fellow', 'citizens')] :param num: The maximum number of collocations to return. :type num: int :param window_size: The number of tokens spanned by a collocation (default=2) :type window_size: int :rtype: list(tuple(str, str)) """ if not ( "_collocations" in self.__dict__ and self._num == num and self._window_size == window_size ): self._num = num self._window_size = window_size # print("Building collocations list") from nltk.corpus import stopwords ignored_words = stopwords.words("english") finder = BigramCollocationFinder.from_words(self.tokens, window_size) finder.apply_freq_filter(2) finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words) bigram_measures = BigramAssocMeasures() self._collocations = list( finder.nbest(bigram_measures.likelihood_ratio, num) ) return self._collocations def collocations(self, num=20, window_size=2): """ Print collocations derived from the text, ignoring stopwords. >>> from nltk.book import text4 >>> text4.collocations() # doctest: +ELLIPSIS United States; fellow citizens; four years; ... :param num: The maximum number of collocations to print. :type num: int :param window_size: The number of tokens spanned by a collocation (default=2) :type window_size: int """ collocation_strings = [ w1 + " " + w2 for w1, w2 in self.collocation_list(num, window_size) ] print(tokenwrap(collocation_strings, separator="; ")) def count(self, word): """ Count the number of times this word appears in the text. """ return self.tokens.count(word) def index(self, word): """ Find the index of the first occurrence of the word in the text. """ return self.tokens.index(word) def readability(self, method): # code from nltk_contrib.readability raise NotImplementedError def similar(self, word, num=20): """ Distributional similarity: find other words which appear in the same contexts as the specified word; list most similar words first. 
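        A one-line sketch (``moby`` as above; the actual output depends on the
        corpus)::

            moby.similar('whale')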
:param word: The word used to seed the similarity search :type word: str :param num: The number of words to generate (default=20) :type num: int :seealso: ContextIndex.similar_words() """ if "_word_context_index" not in self.__dict__: # print('Building word-context index...') self._word_context_index = ContextIndex( self.tokens, filter=lambda x: x.isalpha(), key=lambda s: s.lower() ) # words = self._word_context_index.similar_words(word, num) word = word.lower() wci = self._word_context_index._word_to_contexts if word in wci.conditions(): contexts = set(wci[word]) fd = Counter( w for w in wci.conditions() for c in wci[w] if c in contexts and not w == word ) words = [w for w, _ in fd.most_common(num)] print(tokenwrap(words)) else: print("No matches") def common_contexts(self, words, num=20): """ Find contexts where the specified words appear; list most frequent common contexts first. :param words: The words used to seed the similarity search :type words: str :param num: The number of words to generate (default=20) :type num: int :seealso: ContextIndex.common_contexts() """ if "_word_context_index" not in self.__dict__: # print('Building word-context index...') self._word_context_index = ContextIndex( self.tokens, key=lambda s: s.lower() ) try: fd = self._word_context_index.common_contexts(words, True) if not fd: print("No common contexts were found") else: ranked_contexts = [w for w, _ in fd.most_common(num)] print(tokenwrap(w1 + "_" + w2 for w1, w2 in ranked_contexts)) except ValueError as e: print(e) def dispersion_plot(self, words): """ Produce a plot showing the distribution of the words through the text. Requires pylab to be installed. :param words: The words to be plotted :type words: list(str) :seealso: nltk.draw.dispersion_plot() """ from nltk.draw import dispersion_plot dispersion_plot(self, words) def _train_default_ngram_lm(self, tokenized_sents, n=3): train_data, padded_sents = padded_everygram_pipeline(n, tokenized_sents) model = MLE(order=n) model.fit(train_data, padded_sents) return model def generate(self, length=100, text_seed=None, random_seed=42): """ Print random text, generated using a trigram language model. See also `help(nltk.lm)`. :param length: The length of text to generate (default=100) :type length: int :param text_seed: Generation can be conditioned on preceding context. :type text_seed: list(str) :param random_seed: A random seed or an instance of `random.Random`. If provided, makes the random sampling part of generation reproducible. (default=42) :type random_seed: int """ # Create the model when using it the first time. self._tokenized_sents = [ sent.split(" ") for sent in sent_tokenize(" ".join(self.tokens)) ] if not hasattr(self, "_trigram_model"): print("Building ngram index...", file=sys.stderr) self._trigram_model = self._train_default_ngram_lm( self._tokenized_sents, n=3 ) generated_tokens = [] assert length > 0, "The `length` must be more than 0." 
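        # Sample up to `length` tokens per pass from the trigram model; the
        # random seed is advanced after each pass so that, if more tokens are
        # still needed, the next pass yields a different continuation.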
while len(generated_tokens) < length: for idx, token in enumerate( self._trigram_model.generate( length, text_seed=text_seed, random_seed=random_seed ) ): if token == "": continue if token == "": break generated_tokens.append(token) random_seed += 1 prefix = " ".join(text_seed) + " " if text_seed else "" output_str = prefix + tokenwrap(generated_tokens[:length]) print(output_str) return output_str def plot(self, *args): """ See documentation for FreqDist.plot() :seealso: nltk.prob.FreqDist.plot() """ return self.vocab().plot(*args) def vocab(self): """ :seealso: nltk.prob.FreqDist """ if "_vocab" not in self.__dict__: # print("Building vocabulary index...") self._vocab = FreqDist(self) return self._vocab def findall(self, regexp): """ Find instances of the regular expression in the text. The text is a list of tokens, and a regexp pattern to match a single token must be surrounded by angle brackets. E.g. >>> print('hack'); from nltk.book import text1, text5, text9 hack... >>> text5.findall("<.*><.*>") you rule bro; telling you bro; u twizted bro >>> text1.findall("(<.*>)") monied; nervous; dangerous; white; white; white; pious; queer; good; mature; white; Cape; great; wise; wise; butterless; white; fiendish; pale; furious; better; certain; complete; dismasted; younger; brave; brave; brave; brave >>> text9.findall("{3,}") thread through those; the thought that; that the thing; the thing that; that that thing; through these than through; them that the; through the thick; them that they; thought that the :param regexp: A regular expression :type regexp: str """ if "_token_searcher" not in self.__dict__: self._token_searcher = TokenSearcher(self) hits = self._token_searcher.findall(regexp) hits = [" ".join(h) for h in hits] print(tokenwrap(hits, "; ")) # //////////////////////////////////////////////////////////// # Helper Methods # //////////////////////////////////////////////////////////// _CONTEXT_RE = re.compile(r"\w+|[\.\!\?]") def _context(self, tokens, i): """ One left & one right token, both case-normalized. Skip over non-sentence-final punctuation. Used by the ``ContextIndex`` that is created for ``similar()`` and ``common_contexts()``. """ # Left context j = i - 1 while j >= 0 and not self._CONTEXT_RE.match(tokens[j]): j -= 1 left = tokens[j] if j != 0 else "*START*" # Right context j = i + 1 while j < len(tokens) and not self._CONTEXT_RE.match(tokens[j]): j += 1 right = tokens[j] if j != len(tokens) else "*END*" return (left, right) # //////////////////////////////////////////////////////////// # String Display # //////////////////////////////////////////////////////////// def __str__(self): return "" % self.name def __repr__(self): return "" % self.name # Prototype only; this approach will be slow to load class TextCollection(Text): """A collection of texts, which can be loaded with list of texts, or with a corpus consisting of one or more texts, and which supports counting, concordancing, collocation discovery, etc. Initialize a TextCollection as follows: >>> import nltk.corpus >>> from nltk.text import TextCollection >>> print('hack'); from nltk.book import text1, text2, text3 hack... >>> gutenberg = TextCollection(nltk.corpus.gutenberg) >>> mytexts = TextCollection([text1, text2, text3]) Iterating over a TextCollection produces all the tokens of all the texts in order. 
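    A hedged sketch of term scoring with the objects constructed above::

        mytexts.tf('monstrous', text1)       # term frequency within one text
        mytexts.idf('monstrous')             # inverse document frequency
        mytexts.tf_idf('monstrous', text1)   # the product of the two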
""" def __init__(self, source): if hasattr(source, "words"): # bridge to the text corpus reader source = [source.words(f) for f in source.fileids()] self._texts = source Text.__init__(self, LazyConcatenation(source)) self._idf_cache = {} def tf(self, term, text): """The frequency of the term in text.""" return text.count(term) / len(text) def idf(self, term): """The number of texts in the corpus divided by the number of texts that the term appears in. If a term does not appear in the corpus, 0.0 is returned.""" # idf values are cached for performance. idf = self._idf_cache.get(term) if idf is None: matches = len([True for text in self._texts if term in text]) if len(self._texts) == 0: raise ValueError("IDF undefined for empty document collection") idf = log(len(self._texts) / matches) if matches else 0.0 self._idf_cache[term] = idf return idf def tf_idf(self, term, text): return self.tf(term, text) * self.idf(term) def demo(): from nltk.corpus import brown text = Text(brown.words(categories="news")) print(text) print() print("Concordance:") text.concordance("news") print() print("Distributionally similar words:") text.similar("news") print() print("Collocations:") text.collocations() print() # print("Automatically generated text:") # text.generate() # print() print("Dispersion plot:") text.dispersion_plot(["news", "report", "said", "announced"]) print() print("Vocabulary plot:") text.plot(50) print() print("Indexing:") print("text[3]:", text[3]) print("text[3:5]:", text[3:5]) print("text.vocab()['news']:", text.vocab()["news"]) if __name__ == "__main__": demo() __all__ = [ "ContextIndex", "ConcordanceIndex", "TokenSearcher", "Text", "TextCollection", ] nltk-3.7/nltk/tgrep.py000066400000000000000000001100101420073152400147650ustar00rootroot00000000000000#!/usr/bin/env python # # Natural Language Toolkit: TGrep search # # Copyright (C) 2001-2022 NLTK Project # Author: Will Roberts # URL: # For license information, see LICENSE.TXT """ ============================================ TGrep search implementation for NLTK trees ============================================ This module supports TGrep2 syntax for matching parts of NLTK Trees. Note that many tgrep operators require the tree passed to be a ``ParentedTree``. External links: - `Tgrep tutorial `_ - `Tgrep2 manual `_ - `Tgrep2 source `_ Usage ===== >>> from nltk.tree import ParentedTree >>> from nltk.tgrep import tgrep_nodes, tgrep_positions >>> tree = ParentedTree.fromstring('(S (NP (DT the) (JJ big) (NN dog)) (VP bit) (NP (DT a) (NN cat)))') >>> list(tgrep_nodes('NN', [tree])) [[ParentedTree('NN', ['dog']), ParentedTree('NN', ['cat'])]] >>> list(tgrep_positions('NN', [tree])) [[(0, 2), (2, 1)]] >>> list(tgrep_nodes('DT', [tree])) [[ParentedTree('DT', ['the']), ParentedTree('DT', ['a'])]] >>> list(tgrep_nodes('DT $ JJ', [tree])) [[ParentedTree('DT', ['the'])]] This implementation adds syntax to select nodes based on their NLTK tree position. This syntax is ``N`` plus a Python tuple representing the tree position. For instance, ``N()``, ``N(0,)``, ``N(0,0)`` are valid node selectors. Example: >>> tree = ParentedTree.fromstring('(S (NP (DT the) (JJ big) (NN dog)) (VP bit) (NP (DT a) (NN cat)))') >>> tree[0,0] ParentedTree('DT', ['the']) >>> tree[0,0].treeposition() (0, 0) >>> list(tgrep_nodes('N(0,0)', [tree])) [[ParentedTree('DT', ['the'])]] Caveats: ======== - Link modifiers: "?" and "=" are not implemented. - Tgrep compatibility: Using "@" for "!", "{" for "<", "}" for ">" are not implemented. - The "=" and "~" links are not implemented. 
Known Issues: ============= - There are some issues with link relations involving leaf nodes (which are represented as bare strings in NLTK trees). For instance, consider the tree:: (S (A x)) The search string ``* !>> S`` should select all nodes which are not dominated in some way by an ``S`` node (i.e., all nodes which are not descendants of an ``S``). Clearly, in this tree, the only node which fulfills this criterion is the top node (since it is not dominated by anything). However, the code here will find both the top node and the leaf node ``x``. This is because we cannot recover the parent of the leaf, since it is stored as a bare string. A possible workaround, when performing this kind of search, would be to filter out all leaf nodes. Implementation notes ==================== This implementation is (somewhat awkwardly) based on lambda functions which are predicates on a node. A predicate is a function which is either True or False; using a predicate function, we can identify sets of nodes with particular properties. A predicate function, could, for instance, return True only if a particular node has a label matching a particular regular expression, and has a daughter node which has no sisters. Because tgrep2 search strings can do things statefully (such as substituting in macros, and binding nodes with node labels), the actual predicate function is declared with three arguments:: pred = lambda n, m, l: return True # some logic here ``n`` is a node in a tree; this argument must always be given ``m`` contains a dictionary, mapping macro names onto predicate functions ``l`` is a dictionary to map node labels onto nodes in the tree ``m`` and ``l`` are declared to default to ``None``, and so need not be specified in a call to a predicate. Predicates which call other predicates must always pass the value of these arguments on. The top-level predicate (constructed by ``_tgrep_exprs_action``) binds the macro definitions to ``m`` and initialises ``l`` to an empty dictionary. """ import functools import re try: import pyparsing except ImportError: print("Warning: nltk.tgrep will not work without the `pyparsing` package") print("installed.") import nltk.tree class TgrepException(Exception): """Tgrep exception type.""" pass def ancestors(node): """ Returns the list of all nodes dominating the given tree node. This method will not work with leaf nodes, since there is no way to recover the parent. """ results = [] try: current = node.parent() except AttributeError: # if node is a leaf, we cannot retrieve its parent return results while current: results.append(current) current = current.parent() return results def unique_ancestors(node): """ Returns the list of all nodes dominating the given node, where there is only a single path of descent. """ results = [] try: current = node.parent() except AttributeError: # if node is a leaf, we cannot retrieve its parent return results while current and len(current) == 1: results.append(current) current = current.parent() return results def _descendants(node): """ Returns the list of all nodes which are descended from the given tree node in some way. """ try: treepos = node.treepositions() except AttributeError: return [] return [node[x] for x in treepos[1:]] def _leftmost_descendants(node): """ Returns the set of all nodes descended in some way through left branches from this node. 
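    For example (illustrative): in
    ``ParentedTree.fromstring('(S (NP (DT the) (NN dog)) (VP barks))')`` the
    left-most descendants of the root are the nodes at tree positions
    ``(0,)``, ``(0, 0)`` and ``(0, 0, 0)``, i.e. the ``NP`` subtree, the
    ``DT`` subtree and the leaf ``'the'``.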
""" try: treepos = node.treepositions() except AttributeError: return [] return [node[x] for x in treepos[1:] if all(y == 0 for y in x)] def _rightmost_descendants(node): """ Returns the set of all nodes descended in some way through right branches from this node. """ try: rightmost_leaf = max(node.treepositions()) except AttributeError: return [] return [node[rightmost_leaf[:i]] for i in range(1, len(rightmost_leaf) + 1)] def _istree(obj): """Predicate to check whether `obj` is a nltk.tree.Tree.""" return isinstance(obj, nltk.tree.Tree) def _unique_descendants(node): """ Returns the list of all nodes descended from the given node, where there is only a single path of descent. """ results = [] current = node while current and _istree(current) and len(current) == 1: current = current[0] results.append(current) return results def _before(node): """ Returns the set of all nodes that are before the given node. """ try: pos = node.treeposition() tree = node.root() except AttributeError: return [] return [tree[x] for x in tree.treepositions() if x[: len(pos)] < pos[: len(x)]] def _immediately_before(node): """ Returns the set of all nodes that are immediately before the given node. Tree node A immediately precedes node B if the last terminal symbol (word) produced by A immediately precedes the first terminal symbol produced by B. """ try: pos = node.treeposition() tree = node.root() except AttributeError: return [] # go "upwards" from pos until there is a place we can go to the left idx = len(pos) - 1 while 0 <= idx and pos[idx] == 0: idx -= 1 if idx < 0: return [] pos = list(pos[: idx + 1]) pos[-1] -= 1 before = tree[pos] return [before] + _rightmost_descendants(before) def _after(node): """ Returns the set of all nodes that are after the given node. """ try: pos = node.treeposition() tree = node.root() except AttributeError: return [] return [tree[x] for x in tree.treepositions() if x[: len(pos)] > pos[: len(x)]] def _immediately_after(node): """ Returns the set of all nodes that are immediately after the given node. Tree node A immediately follows node B if the first terminal symbol (word) produced by A immediately follows the last terminal symbol produced by B. """ try: pos = node.treeposition() tree = node.root() current = node.parent() except AttributeError: return [] # go "upwards" from pos until there is a place we can go to the # right idx = len(pos) - 1 while 0 <= idx and pos[idx] == len(current) - 1: idx -= 1 current = current.parent() if idx < 0: return [] pos = list(pos[: idx + 1]) pos[-1] += 1 after = tree[pos] return [after] + _leftmost_descendants(after) def _tgrep_node_literal_value(node): """ Gets the string value of a given parse tree node, for comparison using the tgrep node literal predicates. """ return node.label() if _istree(node) else str(node) def _tgrep_macro_use_action(_s, _l, tokens): """ Builds a lambda function which looks up the macro name used. """ assert len(tokens) == 1 assert tokens[0][0] == "@" macro_name = tokens[0][1:] def macro_use(n, m=None, l=None): if m is None or macro_name not in m: raise TgrepException(f"macro {macro_name} not defined") return m[macro_name](n, m, l) return macro_use def _tgrep_node_action(_s, _l, tokens): """ Builds a lambda function representing a predicate on a tree node depending on the name of its node. 
""" if tokens[0] == "'": # strip initial apostrophe (tgrep2 print command) tokens = tokens[1:] if len(tokens) > 1: # disjunctive definition of a node name assert list(set(tokens[1::2])) == ["|"] # recursively call self to interpret each node name definition tokens = [_tgrep_node_action(None, None, [node]) for node in tokens[::2]] # capture tokens and return the disjunction return (lambda t: lambda n, m=None, l=None: any(f(n, m, l) for f in t))(tokens) else: if hasattr(tokens[0], "__call__"): # this is a previously interpreted parenthetical node # definition (lambda function) return tokens[0] elif tokens[0] == "*" or tokens[0] == "__": return lambda n, m=None, l=None: True elif tokens[0].startswith('"'): assert tokens[0].endswith('"') node_lit = tokens[0][1:-1].replace('\\"', '"').replace("\\\\", "\\") return ( lambda s: lambda n, m=None, l=None: _tgrep_node_literal_value(n) == s )(node_lit) elif tokens[0].startswith("/"): assert tokens[0].endswith("/") node_lit = tokens[0][1:-1] return ( lambda r: lambda n, m=None, l=None: r.search( _tgrep_node_literal_value(n) ) )(re.compile(node_lit)) elif tokens[0].startswith("i@"): node_func = _tgrep_node_action(_s, _l, [tokens[0][2:].lower()]) return ( lambda f: lambda n, m=None, l=None: f( _tgrep_node_literal_value(n).lower() ) )(node_func) else: return ( lambda s: lambda n, m=None, l=None: _tgrep_node_literal_value(n) == s )(tokens[0]) def _tgrep_parens_action(_s, _l, tokens): """ Builds a lambda function representing a predicate on a tree node from a parenthetical notation. """ assert len(tokens) == 3 assert tokens[0] == "(" assert tokens[2] == ")" return tokens[1] def _tgrep_nltk_tree_pos_action(_s, _l, tokens): """ Builds a lambda function representing a predicate on a tree node which returns true if the node is located at a specific tree position. """ # recover the tuple from the parsed string node_tree_position = tuple(int(x) for x in tokens if x.isdigit()) # capture the node's tree position return ( lambda i: lambda n, m=None, l=None: ( hasattr(n, "treeposition") and n.treeposition() == i ) )(node_tree_position) def _tgrep_relation_action(_s, _l, tokens): """ Builds a lambda function representing a predicate on a tree node depending on its relation to other nodes in the tree. """ # process negation first if needed negated = False if tokens[0] == "!": negated = True tokens = tokens[1:] if tokens[0] == "[": # process square-bracketed relation expressions assert len(tokens) == 3 assert tokens[2] == "]" retval = tokens[1] else: # process operator-node relation expressions assert len(tokens) == 2 operator, predicate = tokens # A < B A is the parent of (immediately dominates) B. if operator == "<": retval = lambda n, m=None, l=None: ( _istree(n) and any(predicate(x, m, l) for x in n) ) # A > B A is the child of B. elif operator == ">": retval = lambda n, m=None, l=None: ( hasattr(n, "parent") and bool(n.parent()) and predicate(n.parent(), m, l) ) # A <, B Synonymous with A <1 B. elif operator == "<," or operator == "<1": retval = lambda n, m=None, l=None: ( _istree(n) and bool(list(n)) and predicate(n[0], m, l) ) # A >, B Synonymous with A >1 B. elif operator == ">," or operator == ">1": retval = lambda n, m=None, l=None: ( hasattr(n, "parent") and bool(n.parent()) and (n is n.parent()[0]) and predicate(n.parent(), m, l) ) # A N B A is the Nth child of B (the first child is >1). 
elif operator[0] == ">" and operator[1:].isdigit(): idx = int(operator[1:]) # capture the index parameter retval = ( lambda i: lambda n, m=None, l=None: ( hasattr(n, "parent") and bool(n.parent()) and 0 <= i < len(n.parent()) and (n is n.parent()[i]) and predicate(n.parent(), m, l) ) )(idx - 1) # A <' B B is the last child of A (also synonymous with A <-1 B). # A <- B B is the last child of A (synonymous with A <-1 B). elif operator == "<'" or operator == "<-" or operator == "<-1": retval = lambda n, m=None, l=None: ( _istree(n) and bool(list(n)) and predicate(n[-1], m, l) ) # A >' B A is the last child of B (also synonymous with A >-1 B). # A >- B A is the last child of B (synonymous with A >-1 B). elif operator == ">'" or operator == ">-" or operator == ">-1": retval = lambda n, m=None, l=None: ( hasattr(n, "parent") and bool(n.parent()) and (n is n.parent()[-1]) and predicate(n.parent(), m, l) ) # A <-N B B is the N th-to-last child of A (the last child is <-1). elif operator[:2] == "<-" and operator[2:].isdigit(): idx = -int(operator[2:]) # capture the index parameter retval = ( lambda i: lambda n, m=None, l=None: ( _istree(n) and bool(list(n)) and 0 <= (i + len(n)) < len(n) and predicate(n[i + len(n)], m, l) ) )(idx) # A >-N B A is the N th-to-last child of B (the last child is >-1). elif operator[:2] == ">-" and operator[2:].isdigit(): idx = -int(operator[2:]) # capture the index parameter retval = ( lambda i: lambda n, m=None, l=None: ( hasattr(n, "parent") and bool(n.parent()) and 0 <= (i + len(n.parent())) < len(n.parent()) and (n is n.parent()[i + len(n.parent())]) and predicate(n.parent(), m, l) ) )(idx) # A <: B B is the only child of A elif operator == "<:": retval = lambda n, m=None, l=None: ( _istree(n) and len(n) == 1 and predicate(n[0], m, l) ) # A >: B A is the only child of B. elif operator == ">:": retval = lambda n, m=None, l=None: ( hasattr(n, "parent") and bool(n.parent()) and len(n.parent()) == 1 and predicate(n.parent(), m, l) ) # A << B A dominates B (A is an ancestor of B). elif operator == "<<": retval = lambda n, m=None, l=None: ( _istree(n) and any(predicate(x, m, l) for x in _descendants(n)) ) # A >> B A is dominated by B (A is a descendant of B). elif operator == ">>": retval = lambda n, m=None, l=None: any( predicate(x, m, l) for x in ancestors(n) ) # A <<, B B is a left-most descendant of A. elif operator == "<<," or operator == "<<1": retval = lambda n, m=None, l=None: ( _istree(n) and any(predicate(x, m, l) for x in _leftmost_descendants(n)) ) # A >>, B A is a left-most descendant of B. elif operator == ">>,": retval = lambda n, m=None, l=None: any( (predicate(x, m, l) and n in _leftmost_descendants(x)) for x in ancestors(n) ) # A <<' B B is a right-most descendant of A. elif operator == "<<'": retval = lambda n, m=None, l=None: ( _istree(n) and any(predicate(x, m, l) for x in _rightmost_descendants(n)) ) # A >>' B A is a right-most descendant of B. elif operator == ">>'": retval = lambda n, m=None, l=None: any( (predicate(x, m, l) and n in _rightmost_descendants(x)) for x in ancestors(n) ) # A <<: B There is a single path of descent from A and B is on it. elif operator == "<<:": retval = lambda n, m=None, l=None: ( _istree(n) and any(predicate(x, m, l) for x in _unique_descendants(n)) ) # A >>: B There is a single path of descent from B and A is on it. elif operator == ">>:": retval = lambda n, m=None, l=None: any( predicate(x, m, l) for x in unique_ancestors(n) ) # A . B A immediately precedes B. 
elif operator == ".": retval = lambda n, m=None, l=None: any( predicate(x, m, l) for x in _immediately_after(n) ) # A , B A immediately follows B. elif operator == ",": retval = lambda n, m=None, l=None: any( predicate(x, m, l) for x in _immediately_before(n) ) # A .. B A precedes B. elif operator == "..": retval = lambda n, m=None, l=None: any( predicate(x, m, l) for x in _after(n) ) # A ,, B A follows B. elif operator == ",,": retval = lambda n, m=None, l=None: any( predicate(x, m, l) for x in _before(n) ) # A $ B A is a sister of B (and A != B). elif operator == "$" or operator == "%": retval = lambda n, m=None, l=None: ( hasattr(n, "parent") and bool(n.parent()) and any(predicate(x, m, l) for x in n.parent() if x is not n) ) # A $. B A is a sister of and immediately precedes B. elif operator == "$." or operator == "%.": retval = lambda n, m=None, l=None: ( hasattr(n, "right_sibling") and bool(n.right_sibling()) and predicate(n.right_sibling(), m, l) ) # A $, B A is a sister of and immediately follows B. elif operator == "$," or operator == "%,": retval = lambda n, m=None, l=None: ( hasattr(n, "left_sibling") and bool(n.left_sibling()) and predicate(n.left_sibling(), m, l) ) # A $.. B A is a sister of and precedes B. elif operator == "$.." or operator == "%..": retval = lambda n, m=None, l=None: ( hasattr(n, "parent") and hasattr(n, "parent_index") and bool(n.parent()) and any(predicate(x, m, l) for x in n.parent()[n.parent_index() + 1 :]) ) # A $,, B A is a sister of and follows B. elif operator == "$,," or operator == "%,,": retval = lambda n, m=None, l=None: ( hasattr(n, "parent") and hasattr(n, "parent_index") and bool(n.parent()) and any(predicate(x, m, l) for x in n.parent()[: n.parent_index()]) ) else: raise TgrepException(f'cannot interpret tgrep operator "{operator}"') # now return the built function if negated: return (lambda r: (lambda n, m=None, l=None: not r(n, m, l)))(retval) else: return retval def _tgrep_conjunction_action(_s, _l, tokens, join_char="&"): """ Builds a lambda function representing a predicate on a tree node from the conjunction of several other such lambda functions. This is prototypically called for expressions like (`tgrep_rel_conjunction`):: < NP & < AP < VP where tokens is a list of predicates representing the relations (`< NP`, `< AP`, and `< VP`), possibly with the character `&` included (as in the example here). This is also called for expressions like (`tgrep_node_expr2`):: NP < NN S=s < /NP/=n : s < /VP/=v : n .. v tokens[0] is a tgrep_expr predicate; tokens[1:] are an (optional) list of segmented patterns (`tgrep_expr_labeled`, processed by `_tgrep_segmented_pattern_action`). """ # filter out the ampersand tokens = [x for x in tokens if x != join_char] if len(tokens) == 1: return tokens[0] else: return ( lambda ts: lambda n, m=None, l=None: all( predicate(n, m, l) for predicate in ts ) )(tokens) def _tgrep_segmented_pattern_action(_s, _l, tokens): """ Builds a lambda function representing a segmented pattern. Called for expressions like (`tgrep_expr_labeled`):: =s .. =v < =n This is a segmented pattern, a tgrep2 expression which begins with a node label. The problem is that for segemented_pattern_action (': =v < =s'), the first element (in this case, =v) is specifically selected by virtue of matching a particular node in the tree; to retrieve the node, we need the label, not a lambda function. For node labels inside a tgrep_node_expr, we need a lambda function which returns true if the node visited is the same as =v. 
We solve this by creating two copies of a node_label_use in the grammar; the label use inside a tgrep_expr_labeled has a separate parse action to the pred use inside a node_expr. See `_tgrep_node_label_use_action` and `_tgrep_node_label_pred_use_action`. """ # tokens[0] is a string containing the node label node_label = tokens[0] # tokens[1:] is an (optional) list of predicates which must all # hold of the bound node reln_preds = tokens[1:] def pattern_segment_pred(n, m=None, l=None): """This predicate function ignores its node argument.""" # look up the bound node using its label if l is None or node_label not in l: raise TgrepException(f"node_label ={node_label} not bound in pattern") node = l[node_label] # match the relation predicates against the node return all(pred(node, m, l) for pred in reln_preds) return pattern_segment_pred def _tgrep_node_label_use_action(_s, _l, tokens): """ Returns the node label used to begin a tgrep_expr_labeled. See `_tgrep_segmented_pattern_action`. Called for expressions like (`tgrep_node_label_use`):: =s when they appear as the first element of a `tgrep_expr_labeled` expression (see `_tgrep_segmented_pattern_action`). It returns the node label. """ assert len(tokens) == 1 assert tokens[0].startswith("=") return tokens[0][1:] def _tgrep_node_label_pred_use_action(_s, _l, tokens): """ Builds a lambda function representing a predicate on a tree node which describes the use of a previously bound node label. Called for expressions like (`tgrep_node_label_use_pred`):: =s when they appear inside a tgrep_node_expr (for example, inside a relation). The predicate returns true if and only if its node argument is identical the the node looked up in the node label dictionary using the node's label. """ assert len(tokens) == 1 assert tokens[0].startswith("=") node_label = tokens[0][1:] def node_label_use_pred(n, m=None, l=None): # look up the bound node using its label if l is None or node_label not in l: raise TgrepException(f"node_label ={node_label} not bound in pattern") node = l[node_label] # truth means the given node is this node return n is node return node_label_use_pred def _tgrep_bind_node_label_action(_s, _l, tokens): """ Builds a lambda function representing a predicate on a tree node which can optionally bind a matching node into the tgrep2 string's label_dict. Called for expressions like (`tgrep_node_expr2`):: /NP/ @NP=n """ # tokens[0] is a tgrep_node_expr if len(tokens) == 1: return tokens[0] else: # if present, tokens[1] is the character '=', and tokens[2] is # a tgrep_node_label, a string value containing the node label assert len(tokens) == 3 assert tokens[1] == "=" node_pred = tokens[0] node_label = tokens[2] def node_label_bind_pred(n, m=None, l=None): if node_pred(n, m, l): # bind `n` into the dictionary `l` if l is None: raise TgrepException( "cannot bind node_label {}: label_dict is None".format( node_label ) ) l[node_label] = n return True else: return False return node_label_bind_pred def _tgrep_rel_disjunction_action(_s, _l, tokens): """ Builds a lambda function representing a predicate on a tree node from the disjunction of several other such lambda functions. """ # filter out the pipe tokens = [x for x in tokens if x != "|"] if len(tokens) == 1: return tokens[0] elif len(tokens) == 2: return (lambda a, b: lambda n, m=None, l=None: a(n, m, l) or b(n, m, l))( tokens[0], tokens[1] ) def _macro_defn_action(_s, _l, tokens): """ Builds a dictionary structure which defines the given macro. 
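    For instance (a hedged illustration of the syntax accepted by the grammar
    below), a search string such as ``'@ NP /^NP/; @NP !< @NP'`` first binds
    the macro name ``NP`` to the ``/^NP/`` node pattern and then uses it twice
    in the final expression.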
""" assert len(tokens) == 3 assert tokens[0] == "@" return {tokens[1]: tokens[2]} def _tgrep_exprs_action(_s, _l, tokens): """ This is the top-lebel node in a tgrep2 search string; the predicate function it returns binds together all the state of a tgrep2 search string. Builds a lambda function representing a predicate on a tree node from the disjunction of several tgrep expressions. Also handles macro definitions and macro name binding, and node label definitions and node label binding. """ if len(tokens) == 1: return lambda n, m=None, l=None: tokens[0](n, None, {}) # filter out all the semicolons tokens = [x for x in tokens if x != ";"] # collect all macro definitions macro_dict = {} macro_defs = [tok for tok in tokens if isinstance(tok, dict)] for macro_def in macro_defs: macro_dict.update(macro_def) # collect all tgrep expressions tgrep_exprs = [tok for tok in tokens if not isinstance(tok, dict)] # create a new scope for the node label dictionary def top_level_pred(n, m=macro_dict, l=None): label_dict = {} # bind macro definitions and OR together all tgrep_exprs return any(predicate(n, m, label_dict) for predicate in tgrep_exprs) return top_level_pred def _build_tgrep_parser(set_parse_actions=True): """ Builds a pyparsing-based parser object for tokenizing and interpreting tgrep search strings. """ tgrep_op = pyparsing.Optional("!") + pyparsing.Regex("[$%,.<>][%,.<>0-9-':]*") tgrep_qstring = pyparsing.QuotedString( quoteChar='"', escChar="\\", unquoteResults=False ) tgrep_node_regex = pyparsing.QuotedString( quoteChar="/", escChar="\\", unquoteResults=False ) tgrep_qstring_icase = pyparsing.Regex('i@\\"(?:[^"\\n\\r\\\\]|(?:\\\\.))*\\"') tgrep_node_regex_icase = pyparsing.Regex("i@\\/(?:[^/\\n\\r\\\\]|(?:\\\\.))*\\/") tgrep_node_literal = pyparsing.Regex("[^][ \r\t\n;:.,&|<>()$!@%'^=]+") tgrep_expr = pyparsing.Forward() tgrep_relations = pyparsing.Forward() tgrep_parens = pyparsing.Literal("(") + tgrep_expr + ")" tgrep_nltk_tree_pos = ( pyparsing.Literal("N(") + pyparsing.Optional( pyparsing.Word(pyparsing.nums) + "," + pyparsing.Optional( pyparsing.delimitedList(pyparsing.Word(pyparsing.nums), delim=",") + pyparsing.Optional(",") ) ) + ")" ) tgrep_node_label = pyparsing.Regex("[A-Za-z0-9]+") tgrep_node_label_use = pyparsing.Combine("=" + tgrep_node_label) # see _tgrep_segmented_pattern_action tgrep_node_label_use_pred = tgrep_node_label_use.copy() macro_name = pyparsing.Regex("[^];:.,&|<>()[$!@%'^=\r\t\n ]+") macro_name.setWhitespaceChars("") macro_use = pyparsing.Combine("@" + macro_name) tgrep_node_expr = ( tgrep_node_label_use_pred | macro_use | tgrep_nltk_tree_pos | tgrep_qstring_icase | tgrep_node_regex_icase | tgrep_qstring | tgrep_node_regex | "*" | tgrep_node_literal ) tgrep_node_expr2 = ( tgrep_node_expr + pyparsing.Literal("=").setWhitespaceChars("") + tgrep_node_label.copy().setWhitespaceChars("") ) | tgrep_node_expr tgrep_node = tgrep_parens | ( pyparsing.Optional("'") + tgrep_node_expr2 + pyparsing.ZeroOrMore("|" + tgrep_node_expr) ) tgrep_brackets = pyparsing.Optional("!") + "[" + tgrep_relations + "]" tgrep_relation = tgrep_brackets | (tgrep_op + tgrep_node) tgrep_rel_conjunction = pyparsing.Forward() tgrep_rel_conjunction << ( tgrep_relation + pyparsing.ZeroOrMore(pyparsing.Optional("&") + tgrep_rel_conjunction) ) tgrep_relations << tgrep_rel_conjunction + pyparsing.ZeroOrMore( "|" + tgrep_relations ) tgrep_expr << tgrep_node + pyparsing.Optional(tgrep_relations) tgrep_expr_labeled = tgrep_node_label_use + pyparsing.Optional(tgrep_relations) tgrep_expr2 = tgrep_expr + 
pyparsing.ZeroOrMore(":" + tgrep_expr_labeled) macro_defn = ( pyparsing.Literal("@") + pyparsing.White().suppress() + macro_name + tgrep_expr2 ) tgrep_exprs = ( pyparsing.Optional(macro_defn + pyparsing.ZeroOrMore(";" + macro_defn) + ";") + tgrep_expr2 + pyparsing.ZeroOrMore(";" + (macro_defn | tgrep_expr2)) + pyparsing.ZeroOrMore(";").suppress() ) if set_parse_actions: tgrep_node_label_use.setParseAction(_tgrep_node_label_use_action) tgrep_node_label_use_pred.setParseAction(_tgrep_node_label_pred_use_action) macro_use.setParseAction(_tgrep_macro_use_action) tgrep_node.setParseAction(_tgrep_node_action) tgrep_node_expr2.setParseAction(_tgrep_bind_node_label_action) tgrep_parens.setParseAction(_tgrep_parens_action) tgrep_nltk_tree_pos.setParseAction(_tgrep_nltk_tree_pos_action) tgrep_relation.setParseAction(_tgrep_relation_action) tgrep_rel_conjunction.setParseAction(_tgrep_conjunction_action) tgrep_relations.setParseAction(_tgrep_rel_disjunction_action) macro_defn.setParseAction(_macro_defn_action) # the whole expression is also the conjunction of two # predicates: the first node predicate, and the remaining # relation predicates tgrep_expr.setParseAction(_tgrep_conjunction_action) tgrep_expr_labeled.setParseAction(_tgrep_segmented_pattern_action) tgrep_expr2.setParseAction( functools.partial(_tgrep_conjunction_action, join_char=":") ) tgrep_exprs.setParseAction(_tgrep_exprs_action) return tgrep_exprs.ignore("#" + pyparsing.restOfLine) def tgrep_tokenize(tgrep_string): """ Tokenizes a TGrep search string into separate tokens. """ parser = _build_tgrep_parser(False) if isinstance(tgrep_string, bytes): tgrep_string = tgrep_string.decode() return list(parser.parseString(tgrep_string)) def tgrep_compile(tgrep_string): """ Parses (and tokenizes, if necessary) a TGrep search string into a lambda function. """ parser = _build_tgrep_parser(True) if isinstance(tgrep_string, bytes): tgrep_string = tgrep_string.decode() return list(parser.parseString(tgrep_string, parseAll=True))[0] def treepositions_no_leaves(tree): """ Returns all the tree positions in the given tree which are not leaf nodes. """ treepositions = tree.treepositions() # leaves are treeposition tuples that are not prefixes of any # other treeposition prefixes = set() for pos in treepositions: for length in range(len(pos)): prefixes.add(pos[:length]) return [pos for pos in treepositions if pos in prefixes] def tgrep_positions(pattern, trees, search_leaves=True): """ Return the tree positions in the trees which match the given pattern. :param pattern: a tgrep search pattern :type pattern: str or output of tgrep_compile() :param trees: a sequence of NLTK trees (usually ParentedTrees) :type trees: iter(ParentedTree) or iter(Tree) :param search_leaves: whether to return matching leaf nodes :type search_leaves: bool :rtype: iter(tree positions) """ if isinstance(pattern, (bytes, str)): pattern = tgrep_compile(pattern) for tree in trees: try: if search_leaves: positions = tree.treepositions() else: positions = treepositions_no_leaves(tree) yield [position for position in positions if pattern(tree[position])] except AttributeError: yield [] def tgrep_nodes(pattern, trees, search_leaves=True): """ Return the tree nodes in the trees which match the given pattern. 
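    A short usage sketch (skipped doctest; the example tree and the output
    shown are assumptions about the expected shape, not a verified run):

        >>> from nltk.tree import ParentedTree
        >>> t = ParentedTree.fromstring('(S (NP (DT the) (NN dog)) (VP (VBD barked)))')
        >>> list(tgrep_nodes('NN', [t]))  # doctest: +SKIP
        [[ParentedTree('NN', ['dog'])]]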
:param pattern: a tgrep search pattern :type pattern: str or output of tgrep_compile() :param trees: a sequence of NLTK trees (usually ParentedTrees) :type trees: iter(ParentedTree) or iter(Tree) :param search_leaves: whether to return matching leaf nodes :type search_leaves: bool :rtype: iter(tree nodes) """ if isinstance(pattern, (bytes, str)): pattern = tgrep_compile(pattern) for tree in trees: try: if search_leaves: positions = tree.treepositions() else: positions = treepositions_no_leaves(tree) yield [tree[position] for position in positions if pattern(tree[position])] except AttributeError: yield [] nltk-3.7/nltk/tokenize/000077500000000000000000000000001420073152400151315ustar00rootroot00000000000000nltk-3.7/nltk/tokenize/__init__.py000066400000000000000000000115631420073152400172500ustar00rootroot00000000000000# Natural Language Toolkit: Tokenizers # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # Steven Bird (minor additions) # Contributors: matthewmc, clouds56 # URL: # For license information, see LICENSE.TXT r""" NLTK Tokenizer Package Tokenizers divide strings into lists of substrings. For example, tokenizers can be used to find the words and punctuation in a string: >>> from nltk.tokenize import word_tokenize >>> s = '''Good muffins cost $3.88\nin New York. Please buy me ... two of them.\n\nThanks.''' >>> word_tokenize(s) ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.'] This particular tokenizer requires the Punkt sentence tokenization models to be installed. NLTK also provides a simpler, regular-expression based tokenizer, which splits text on whitespace and punctuation: >>> from nltk.tokenize import wordpunct_tokenize >>> wordpunct_tokenize(s) ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.'] We can also operate at the level of sentences, using the sentence tokenizer directly as follows: >>> from nltk.tokenize import sent_tokenize, word_tokenize >>> sent_tokenize(s) ['Good muffins cost $3.88\nin New York.', 'Please buy me\ntwo of them.', 'Thanks.'] >>> [word_tokenize(t) for t in sent_tokenize(s)] [['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.'], ['Please', 'buy', 'me', 'two', 'of', 'them', '.'], ['Thanks', '.']] Caution: when tokenizing a Unicode string, make sure you are not using an encoded version of the string (it may be necessary to decode it first, e.g. with ``s.decode("utf8")``. NLTK tokenizers can produce token-spans, represented as tuples of integers having the same semantics as string slices, to support efficient comparison of tokenizers. (These methods are implemented as generators.) >>> from nltk.tokenize import WhitespaceTokenizer >>> list(WhitespaceTokenizer().span_tokenize(s)) [(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36), (38, 44), (45, 48), (49, 51), (52, 55), (56, 58), (59, 64), (66, 73)] There are numerous ways to tokenize text. If you need more control over tokenization, see the other methods provided in this package. For further information, please see Chapter 3 of the NLTK book. 
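A further illustrative sketch (skipped doctest; the output shown is the
expected shape, not a verified run): passing ``preserve_line=True`` to
``word_tokenize`` skips the sentence-splitting step, so tokens such as
"York." keep their trailing period:

    >>> word_tokenize(s, preserve_line=True)  # doctest: +SKIP
    ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks', '.']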
""" import re from nltk.data import load from nltk.tokenize.casual import TweetTokenizer, casual_tokenize from nltk.tokenize.destructive import NLTKWordTokenizer from nltk.tokenize.legality_principle import LegalitySyllableTokenizer from nltk.tokenize.mwe import MWETokenizer from nltk.tokenize.punkt import PunktSentenceTokenizer from nltk.tokenize.regexp import ( BlanklineTokenizer, RegexpTokenizer, WhitespaceTokenizer, WordPunctTokenizer, blankline_tokenize, regexp_tokenize, wordpunct_tokenize, ) from nltk.tokenize.repp import ReppTokenizer from nltk.tokenize.sexpr import SExprTokenizer, sexpr_tokenize from nltk.tokenize.simple import ( LineTokenizer, SpaceTokenizer, TabTokenizer, line_tokenize, ) from nltk.tokenize.sonority_sequencing import SyllableTokenizer from nltk.tokenize.stanford_segmenter import StanfordSegmenter from nltk.tokenize.texttiling import TextTilingTokenizer from nltk.tokenize.toktok import ToktokTokenizer from nltk.tokenize.treebank import TreebankWordDetokenizer, TreebankWordTokenizer from nltk.tokenize.util import regexp_span_tokenize, string_span_tokenize # Standard sentence tokenizer. def sent_tokenize(text, language="english"): """ Return a sentence-tokenized copy of *text*, using NLTK's recommended sentence tokenizer (currently :class:`.PunktSentenceTokenizer` for the specified language). :param text: text to split into sentences :param language: the model name in the Punkt corpus """ tokenizer = load(f"tokenizers/punkt/{language}.pickle") return tokenizer.tokenize(text) # Standard word tokenizer. _treebank_word_tokenizer = NLTKWordTokenizer() def word_tokenize(text, language="english", preserve_line=False): """ Return a tokenized copy of *text*, using NLTK's recommended word tokenizer (currently an improved :class:`.TreebankWordTokenizer` along with :class:`.PunktSentenceTokenizer` for the specified language). :param text: text to split into words :type text: str :param language: the model name in the Punkt corpus :type language: str :param preserve_line: A flag to decide whether to sentence tokenize the text or not. :type preserve_line: bool """ sentences = [text] if preserve_line else sent_tokenize(text, language) return [ token for sent in sentences for token in _treebank_word_tokenizer.tokenize(sent) ] nltk-3.7/nltk/tokenize/api.py000066400000000000000000000043421420073152400162570ustar00rootroot00000000000000# Natural Language Toolkit: Tokenizer Interface # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # Steven Bird # URL: # For license information, see LICENSE.TXT """ Tokenizer Interface """ from abc import ABC, abstractmethod from typing import Iterator, List, Tuple from nltk.internals import overridden from nltk.tokenize.util import string_span_tokenize class TokenizerI(ABC): """ A processing interface for tokenizing a string. Subclasses must define ``tokenize()`` or ``tokenize_sents()`` (or both). """ @abstractmethod def tokenize(self, s: str) -> List[str]: """ Return a tokenized copy of *s*. :rtype: List[str] """ if overridden(self.tokenize_sents): return self.tokenize_sents([s])[0] def span_tokenize(self, s: str) -> Iterator[Tuple[int, int]]: """ Identify the tokens using integer offsets ``(start_i, end_i)``, where ``s[start_i:end_i]`` is the corresponding token. :rtype: Iterator[Tuple[int, int]] """ raise NotImplementedError() def tokenize_sents(self, strings: List[str]) -> List[List[str]]: """ Apply ``self.tokenize()`` to each element of ``strings``. 
I.e.: return [self.tokenize(s) for s in strings] :rtype: List[List[str]] """ return [self.tokenize(s) for s in strings] def span_tokenize_sents( self, strings: List[str] ) -> Iterator[List[Tuple[int, int]]]: """ Apply ``self.span_tokenize()`` to each element of ``strings``. I.e.: return [self.span_tokenize(s) for s in strings] :yield: List[Tuple[int, int]] """ for s in strings: yield list(self.span_tokenize(s)) class StringTokenizer(TokenizerI): """A tokenizer that divides a string into substrings by splitting on the specified string (defined in subclasses). """ @property @abstractmethod def _string(self): raise NotImplementedError def tokenize(self, s): return s.split(self._string) def span_tokenize(self, s): yield from string_span_tokenize(s, self._string) nltk-3.7/nltk/tokenize/casual.py000066400000000000000000000345061420073152400167630ustar00rootroot00000000000000# # Natural Language Toolkit: Twitter Tokenizer # # Copyright (C) 2001-2022 NLTK Project # Author: Christopher Potts # Ewan Klein (modifications) # Pierpaolo Pantone <> (modifications) # Tom Aarsen <> (modifications) # URL: # For license information, see LICENSE.TXT # """ Twitter-aware tokenizer, designed to be flexible and easy to adapt to new domains and tasks. The basic logic is this: 1. The tuple REGEXPS defines a list of regular expression strings. 2. The REGEXPS strings are put, in order, into a compiled regular expression object called WORD_RE, under the TweetTokenizer class. 3. The tokenization is done by WORD_RE.findall(s), where s is the user-supplied string, inside the tokenize() method of the class TweetTokenizer. 4. When instantiating Tokenizer objects, there are several options: * preserve_case. By default, it is set to True. If it is set to False, then the tokenizer will downcase everything except for emoticons. * reduce_len. By default, it is set to False. It specifies whether to replace repeated character sequences of length 3 or greater with sequences of length 3. * strip_handles. By default, it is set to False. It specifies whether to remove Twitter handles of text used in the `tokenize` method. * match_phone_numbers. By default, it is set to True. It indicates whether the `tokenize` method should look for phone numbers. """ ###################################################################### import html from typing import List import regex # https://github.com/nltk/nltk/issues/2409 from nltk.tokenize.api import TokenizerI ###################################################################### # The following strings are components in the regular expression # that is used for tokenizing. It's important that phone_number # appears first in the final regex (since it can contain whitespace). # It also could matter that tags comes after emoticons, due to the # possibility of having text like # # <:| and some text >:) # # Most importantly, the final element should always be last, since it # does a last ditch whitespace-based tokenization of whatever is left. # ToDo: Update with https://en.wikipedia.org/wiki/List_of_emoticons ? # This particular element is used in a couple ways, so we define it # with a name: EMOTICONS = r""" (?: [<>]? [:;=8] # eyes [\-o\*\']? # optional nose [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth | [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth [\-o\*\']? # optional nose [:;=8] # eyes [<>]? | {}\[\]]+ # Run of non-space, non-()<>{}[] | # or \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...) | \([^\s]+?\) # balanced parens, non-recursive: (...) 
)+ (?: # End with: \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...) | \([^\s]+?\) # balanced parens, non-recursive: (...) | # or [^\s`!()\[\]{};:'".,<>?«»“”‘’] # not a space or one of these punct chars ) | # OR, the following to match naked domains: (?: (?\s]+>""", # ASCII Arrows r"""[\-]+>|<[\-]+""", # Twitter username: r"""(?:@[\w_]+)""", # Twitter hashtags: r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""", # email addresses r"""[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]""", # Zero-Width-Joiner and Skin tone modifier emojis """.(?: [\U0001F3FB-\U0001F3FF]?(?:\u200d.[\U0001F3FB-\U0001F3FF]?)+ | [\U0001F3FB-\U0001F3FF] )""", # Remaining word types: r""" (?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_]) # Words with apostrophes or dashes. | (?:[+\-]?\d+[,/.:-]\d+[+\-]?) # Numbers, including fractions, decimals. | (?:[\w_]+) # Words without apostrophes or dashes. | (?:\.(?:\s*\.){1,}) # Ellipsis dots. | (?:\S) # Everything else that isn't whitespace. """, ) # Take the main components and add a phone regex as the second parameter REGEXPS_PHONE = (REGEXPS[0], PHONE_REGEX, *REGEXPS[1:]) ###################################################################### # TweetTokenizer.WORD_RE and TweetTokenizer.PHONE_WORD_RE represent # the core tokenizing regexes. They are compiled lazily. # WORD_RE performs poorly on these patterns: HANG_RE = regex.compile(r"([^a-zA-Z0-9])\1{3,}") # The emoticon string gets its own regex so that we can preserve case for # them as needed: EMOTICON_RE = regex.compile(EMOTICONS, regex.VERBOSE | regex.I | regex.UNICODE) # These are for regularizing HTML entities to Unicode: ENT_RE = regex.compile(r"&(#?(x?))([^&;\s]+);") # For stripping away handles from a tweet: HANDLES_RE = regex.compile( r"(?>> from nltk.tokenize.casual import _replace_html_entities >>> _replace_html_entities(b'Price: £100') 'Price: \\xa3100' >>> print(_replace_html_entities(b'Price: £100')) Price: £100 >>> """ def _convert_entity(match): entity_body = match.group(3) if match.group(1): try: if match.group(2): number = int(entity_body, 16) else: number = int(entity_body, 10) # Numeric character references in the 80-9F range are typically # interpreted by browsers as representing the characters mapped # to bytes 80-9F in the Windows-1252 encoding. For more info # see: https://en.wikipedia.org/wiki/ISO/IEC_8859-1#Similar_character_sets if 0x80 <= number <= 0x9F: return bytes((number,)).decode("cp1252") except ValueError: number = None else: if entity_body in keep: return match.group(0) number = html.entities.name2codepoint.get(entity_body) if number is not None: try: return chr(number) except (ValueError, OverflowError): pass return "" if remove_illegal else match.group(0) return ENT_RE.sub(_convert_entity, _str_to_unicode(text, encoding)) ###################################################################### class TweetTokenizer(TokenizerI): r""" Tokenizer for tweets. >>> from nltk.tokenize import TweetTokenizer >>> tknzr = TweetTokenizer() >>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--" >>> tknzr.tokenize(s0) ['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3' , 'and', 'some', 'arrows', '<', '>', '->', '<--'] Examples using `strip_handles` and `reduce_len parameters`: >>> tknzr = TweetTokenizer(strip_handles=True, reduce_len=True) >>> s1 = '@remy: This is waaaaayyyy too much for you!!!!!!' 
>>> tknzr.tokenize(s1) [':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!'] """ # Values used to lazily compile WORD_RE and PHONE_WORD_RE, # which are the core tokenizing regexes. _WORD_RE = None _PHONE_WORD_RE = None ###################################################################### def __init__( self, preserve_case=True, reduce_len=False, strip_handles=False, match_phone_numbers=True, ): """ Create a `TweetTokenizer` instance with settings for use in the `tokenize` method. :param preserve_case: Flag indicating whether to preserve the casing (capitalisation) of text used in the `tokenize` method. Defaults to True. :type preserve_case: bool :param reduce_len: Flag indicating whether to replace repeated character sequences of length 3 or greater with sequences of length 3. Defaults to False. :type reduce_len: bool :param strip_handles: Flag indicating whether to remove Twitter handles of text used in the `tokenize` method. Defaults to False. :type strip_handles: bool :param match_phone_numbers: Flag indicating whether the `tokenize` method should look for phone numbers. Defaults to True. :type match_phone_numbers: bool """ self.preserve_case = preserve_case self.reduce_len = reduce_len self.strip_handles = strip_handles self.match_phone_numbers = match_phone_numbers def tokenize(self, text: str) -> List[str]: """Tokenize the input text. :param text: str :rtype: list(str) :return: a tokenized list of strings; joining this list returns\ the original string if `preserve_case=False`. """ # Fix HTML character entities: text = _replace_html_entities(text) # Remove username handles if self.strip_handles: text = remove_handles(text) # Normalize word lengthening if self.reduce_len: text = reduce_lengthening(text) # Shorten problematic sequences of characters safe_text = HANG_RE.sub(r"\1\1\1", text) # Recognise phone numbers during tokenization if self.match_phone_numbers: words = self.PHONE_WORD_RE.findall(safe_text) else: words = self.WORD_RE.findall(safe_text) # Possibly alter the case, but avoid changing emoticons like :D into :d: if not self.preserve_case: words = list( map((lambda x: x if EMOTICON_RE.search(x) else x.lower()), words) ) return words @property def WORD_RE(self) -> "regex.Pattern": """Core TweetTokenizer regex""" # Compiles the regex for this and all future instantiations of TweetTokenizer. if not type(self)._WORD_RE: type(self)._WORD_RE = regex.compile( f"({'|'.join(REGEXPS)})", regex.VERBOSE | regex.I | regex.UNICODE, ) return type(self)._WORD_RE @property def PHONE_WORD_RE(self) -> "regex.Pattern": """Secondary core TweetTokenizer regex""" # Compiles the regex for this and all future instantiations of TweetTokenizer. if not type(self)._PHONE_WORD_RE: type(self)._PHONE_WORD_RE = regex.compile( f"({'|'.join(REGEXPS_PHONE)})", regex.VERBOSE | regex.I | regex.UNICODE, ) return type(self)._PHONE_WORD_RE ###################################################################### # Normalization Functions ###################################################################### def reduce_lengthening(text): """ Replace repeated character sequences of length 3 or greater with sequences of length 3. """ pattern = regex.compile(r"(.)\1{2,}") return pattern.sub(r"\1\1\1", text) def remove_handles(text): """ Remove Twitter username handles from text. 
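    A small illustrative sketch (skipped doctest; whitespace is normalised
    here because handles are replaced by spaces rather than removed outright):

        >>> ' '.join(remove_handles('@remy I agree with @nltk_org').split())  # doctest: +SKIP
        'I agree with'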
""" # Substitute handles with ' ' to ensure that text on either side of removed handles are tokenized correctly return HANDLES_RE.sub(" ", text) ###################################################################### # Tokenization Function ###################################################################### def casual_tokenize( text, preserve_case=True, reduce_len=False, strip_handles=False, match_phone_numbers=True, ): """ Convenience function for wrapping the tokenizer. """ return TweetTokenizer( preserve_case=preserve_case, reduce_len=reduce_len, strip_handles=strip_handles, match_phone_numbers=match_phone_numbers, ).tokenize(text) ############################################################################### nltk-3.7/nltk/tokenize/destructive.py000066400000000000000000000221561420073152400200520ustar00rootroot00000000000000# Natural Language Toolkit: NLTK's very own tokenizer. # # Copyright (C) 2001-2022 NLTK Project # Author: Liling Tan # Tom Aarsen <> (modifications) # URL: # For license information, see LICENSE.TXT import re import warnings from typing import Iterator, List, Tuple from nltk.tokenize.api import TokenizerI from nltk.tokenize.util import align_tokens class MacIntyreContractions: """ List of contractions adapted from Robert MacIntyre's tokenizer. """ CONTRACTIONS2 = [ r"(?i)\b(can)(?#X)(not)\b", r"(?i)\b(d)(?#X)('ye)\b", r"(?i)\b(gim)(?#X)(me)\b", r"(?i)\b(gon)(?#X)(na)\b", r"(?i)\b(got)(?#X)(ta)\b", r"(?i)\b(lem)(?#X)(me)\b", r"(?i)\b(more)(?#X)('n)\b", r"(?i)\b(wan)(?#X)(na)(?=\s)", ] CONTRACTIONS3 = [r"(?i) ('t)(?#X)(is)\b", r"(?i) ('t)(?#X)(was)\b"] CONTRACTIONS4 = [r"(?i)\b(whad)(dd)(ya)\b", r"(?i)\b(wha)(t)(cha)\b"] class NLTKWordTokenizer(TokenizerI): """ The NLTK tokenizer that has improved upon the TreebankWordTokenizer. This is the method that is invoked by ``word_tokenize()``. It assumes that the text has already been segmented into sentences, e.g. using ``sent_tokenize()``. The tokenizer is "destructive" such that the regexes applied will munge the input string to a state beyond re-construction. It is possible to apply `TreebankWordDetokenizer.detokenize` to the tokenized outputs of `NLTKDestructiveWordTokenizer.tokenize` but there's no guarantees to revert to the original string. """ # Starting quotes. STARTING_QUOTES = [ (re.compile("([«“‘„]|[`]+)", re.U), r" \1 "), (re.compile(r"^\""), r"``"), (re.compile(r"(``)"), r" \1 "), (re.compile(r"([ \(\[{<])(\"|\'{2})"), r"\1 `` "), (re.compile(r"(?i)(\')(?!re|ve|ll|m|t|s|d|n)(\w)\b", re.U), r"\1 \2"), ] # Ending quotes. ENDING_QUOTES = [ (re.compile("([»”’])", re.U), r" \1 "), (re.compile(r"''"), " '' "), (re.compile(r'"'), " '' "), (re.compile(r"([^' ])('[sS]|'[mM]|'[dD]|') "), r"\1 \2 "), (re.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1 \2 "), ] # For improvements for starting/closing quotes from TreebankWordTokenizer, # see discussion on https://github.com/nltk/nltk/pull/1437 # Adding to TreebankWordTokenizer, nltk.word_tokenize now splits on # - chervon quotes u'\xab' and u'\xbb' . # - unicode quotes u'\u2018', u'\u2019', u'\u201c' and u'\u201d' # See https://github.com/nltk/nltk/issues/1995#issuecomment-376741608 # Also, behavior of splitting on clitics now follows Stanford CoreNLP # - clitics covered (?!re|ve|ll|m|t|s|d)(\w)\b # Punctuation. 
PUNCTUATION = [ (re.compile(r'([^\.])(\.)([\]\)}>"\'' "»”’ " r"]*)\s*$", re.U), r"\1 \2 \3 "), (re.compile(r"([:,])([^\d])"), r" \1 \2"), (re.compile(r"([:,])$"), r" \1 "), ( re.compile(r"\.{2,}", re.U), r" \g<0> ", ), # See https://github.com/nltk/nltk/pull/2322 (re.compile(r"[;@#$%&]"), r" \g<0> "), ( re.compile(r'([^\.])(\.)([\]\)}>"\']*)\s*$'), r"\1 \2\3 ", ), # Handles the final period. (re.compile(r"[?!]"), r" \g<0> "), (re.compile(r"([^'])' "), r"\1 ' "), ( re.compile(r"[*]", re.U), r" \g<0> ", ), # See https://github.com/nltk/nltk/pull/2322 ] # Pads parentheses PARENS_BRACKETS = (re.compile(r"[\]\[\(\)\{\}\<\>]"), r" \g<0> ") # Optionally: Convert parentheses, brackets and converts them to PTB symbols. CONVERT_PARENTHESES = [ (re.compile(r"\("), "-LRB-"), (re.compile(r"\)"), "-RRB-"), (re.compile(r"\["), "-LSB-"), (re.compile(r"\]"), "-RSB-"), (re.compile(r"\{"), "-LCB-"), (re.compile(r"\}"), "-RCB-"), ] DOUBLE_DASHES = (re.compile(r"--"), r" -- ") # List of contractions adapted from Robert MacIntyre's tokenizer. _contractions = MacIntyreContractions() CONTRACTIONS2 = list(map(re.compile, _contractions.CONTRACTIONS2)) CONTRACTIONS3 = list(map(re.compile, _contractions.CONTRACTIONS3)) def tokenize( self, text: str, convert_parentheses: bool = False, return_str: bool = False ) -> List[str]: r"""Return a tokenized copy of `text`. >>> from nltk.tokenize import NLTKWordTokenizer >>> s = '''Good muffins cost $3.88 (roughly 3,36 euros)\nin New York. Please buy me\ntwo of them.\nThanks.''' >>> NLTKWordTokenizer().tokenize(s) ['Good', 'muffins', 'cost', '$', '3.88', '(', 'roughly', '3,36', 'euros', ')', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks', '.'] >>> NLTKWordTokenizer().tokenize(s, convert_parentheses=True) ['Good', 'muffins', 'cost', '$', '3.88', '-LRB-', 'roughly', '3,36', 'euros', '-RRB-', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks', '.'] >>> NLTKWordTokenizer().tokenize(s, return_str=True) ' Good muffins cost $ 3.88 ( roughly 3,36 euros ) \nin New York. Please buy me\ntwo of them.\nThanks . ' :param text: A string with a sentence or sentences. :type text: str :param convert_parentheses: if True, replace parentheses to PTB symbols, e.g. `(` to `-LRB-`. Defaults to False. :type convert_parentheses: bool, optional :param return_str: If True, return tokens as space-separated string, defaults to False. :type return_str: bool, optional :return: List of tokens from `text`. :rtype: List[str] """ if return_str: warnings.warn( "Parameter 'return_str' has been deprecated and should no " "longer be used.", category=DeprecationWarning, stacklevel=2, ) for regexp, substitution in self.STARTING_QUOTES: text = regexp.sub(substitution, text) for regexp, substitution in self.PUNCTUATION: text = regexp.sub(substitution, text) # Handles parentheses. regexp, substitution = self.PARENS_BRACKETS text = regexp.sub(substitution, text) # Optionally convert parentheses if convert_parentheses: for regexp, substitution in self.CONVERT_PARENTHESES: text = regexp.sub(substitution, text) # Handles double dash. 
regexp, substitution = self.DOUBLE_DASHES text = regexp.sub(substitution, text) # add extra space to make things easier text = " " + text + " " for regexp, substitution in self.ENDING_QUOTES: text = regexp.sub(substitution, text) for regexp in self.CONTRACTIONS2: text = regexp.sub(r" \1 \2 ", text) for regexp in self.CONTRACTIONS3: text = regexp.sub(r" \1 \2 ", text) # We are not using CONTRACTIONS4 since # they are also commented out in the SED scripts # for regexp in self._contractions.CONTRACTIONS4: # text = regexp.sub(r' \1 \2 \3 ', text) return text.split() def span_tokenize(self, text: str) -> Iterator[Tuple[int, int]]: r""" Returns the spans of the tokens in ``text``. Uses the post-hoc nltk.tokens.align_tokens to return the offset spans. >>> from nltk.tokenize import NLTKWordTokenizer >>> s = '''Good muffins cost $3.88\nin New (York). Please (buy) me\ntwo of them.\n(Thanks).''' >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23), ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38), ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59), ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)] >>> list(NLTKWordTokenizer().span_tokenize(s)) == expected True >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in', ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')', ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.'] >>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected True :param text: A string with a sentence or sentences. :type text: str :yield: Tuple[int, int] """ raw_tokens = self.tokenize(text) # Convert converted quotes back to original double quotes # Do this only if original text contains double quote(s) or double # single-quotes (because '' might be transformed to `` if it is # treated as starting quotes). if ('"' in text) or ("''" in text): # Find double quotes and converted quotes matched = [m.group() for m in re.finditer(r"``|'{2}|\"", text)] # Replace converted quotes back to double quotes tokens = [ matched.pop(0) if tok in ['"', "``", "''"] else tok for tok in raw_tokens ] else: tokens = raw_tokens yield from align_tokens(tokens, text) nltk-3.7/nltk/tokenize/legality_principle.py000066400000000000000000000137111420073152400213650ustar00rootroot00000000000000# Natural Language Toolkit: Tokenizers # # Copyright (C) 2001-2022 NLTK Project # Author: Christopher Hench # Alex Estes # URL: # For license information, see LICENSE.TXT """ The Legality Principle is a language agnostic principle maintaining that syllable onsets and codas (the beginning and ends of syllables not including the vowel) are only legal if they are found as word onsets or codas in the language. The English word ''admit'' must then be syllabified as ''ad-mit'' since ''dm'' is not found word-initially in the English language (Bartlett et al.). This principle was first proposed in Daniel Kahn's 1976 dissertation, ''Syllable-based generalizations in English phonology''. Kahn further argues that there is a ''strong tendency to syllabify in such a way that initial clusters are of maximal length, consistent with the general constraints on word-initial consonant clusters.'' Consequently, in addition to being legal onsets, the longest legal onset is preferable---''Onset Maximization''. The default implementation assumes an English vowel set, but the `vowels` attribute can be set to IPA or any other alphabet's vowel set for the use-case. 
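A minimal sketch of passing a custom vowel set to the constructor (skipped
doctest; ``ipa_words`` is a hypothetical list of IPA-transcribed word forms,
not a bundled corpus):

    >>> from nltk.tokenize import LegalitySyllableTokenizer
    >>> ipa_tokenizer = LegalitySyllableTokenizer(ipa_words, vowels="aeiouɑɛɪɔʊə")  # doctest: +SKIP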
Both a valid set of vowels as well as a text corpus of words in the language are necessary to determine legal onsets and subsequently syllabify words. The legality principle with onset maximization is a universal syllabification algorithm, but that does not mean it performs equally across languages. Bartlett et al. (2009) is a good benchmark for English accuracy if utilizing IPA (pg. 311). References: - Otto Jespersen. 1904. Lehrbuch der Phonetik. Leipzig, Teubner. Chapter 13, Silbe, pp. 185-203. - Theo Vennemann, ''On the Theory of Syllabic Phonology,'' 1972, p. 11. - Daniel Kahn, ''Syllable-based generalizations in English phonology'', (PhD diss., MIT, 1976). - Elisabeth Selkirk. 1984. On the major class features and syllable theory. In Aronoff & Oehrle (eds.) Language Sound Structure: Studies in Phonology. Cambridge, MIT Press. pp. 107-136. - Jeremy Goslin and Ulrich Frauenfelder. 2001. A comparison of theoretical and human syllabification. Language and Speech, 44:409–436. - Susan Bartlett, et al. 2009. On the Syllabification of Phonemes. In HLT-NAACL. pp. 308-316. - Christopher Hench. 2017. Resonances in Middle High German: New Methodologies in Prosody. UC Berkeley. """ from collections import Counter from nltk.tokenize.api import TokenizerI class LegalitySyllableTokenizer(TokenizerI): """ Syllabifies words based on the Legality Principle and Onset Maximization. >>> from nltk.tokenize import LegalitySyllableTokenizer >>> from nltk import word_tokenize >>> from nltk.corpus import words >>> text = "This is a wonderful sentence." >>> text_words = word_tokenize(text) >>> LP = LegalitySyllableTokenizer(words.words()) >>> [LP.tokenize(word) for word in text_words] [['This'], ['is'], ['a'], ['won', 'der', 'ful'], ['sen', 'ten', 'ce'], ['.']] """ def __init__( self, tokenized_source_text, vowels="aeiouy", legal_frequency_threshold=0.001 ): """ :param tokenized_source_text: List of valid tokens in the language :type tokenized_source_text: list(str) :param vowels: Valid vowels in language or IPA representation :type vowels: str :param legal_frequency_threshold: Lowest frequency of all onsets to be considered a legal onset :type legal_frequency_threshold: float """ self.legal_frequency_threshold = legal_frequency_threshold self.vowels = vowels self.legal_onsets = self.find_legal_onsets(tokenized_source_text) def find_legal_onsets(self, words): """ Gathers all onsets and then return only those above the frequency threshold :param words: List of words in a language :type words: list(str) :return: Set of legal onsets :rtype: set(str) """ onsets = [self.onset(word) for word in words] legal_onsets = [ k for k, v in Counter(onsets).items() if (v / len(onsets)) > self.legal_frequency_threshold ] return set(legal_onsets) def onset(self, word): """ Returns consonant cluster of word, i.e. all characters until the first vowel. :param word: Single word or token :type word: str :return: String of characters of onset :rtype: str """ onset = "" for c in word.lower(): if c in self.vowels: return onset else: onset += c return onset def tokenize(self, token): """ Apply the Legality Principle in combination with Onset Maximization to return a list of syllables. :param token: Single word or token :type token: str :return syllable_list: Single word or token broken up into syllables. 
:rtype: list(str) """ syllables = [] syllable, current_onset = "", "" vowel, onset = False, False for char in token[::-1]: char_lower = char.lower() if not vowel: syllable += char vowel = bool(char_lower in self.vowels) else: if char_lower + current_onset[::-1] in self.legal_onsets: syllable += char current_onset += char_lower onset = True elif char_lower in self.vowels and not onset: syllable += char current_onset += char_lower else: syllables.append(syllable) syllable = char current_onset = "" vowel = bool(char_lower in self.vowels) syllables.append(syllable) syllables_ordered = [syllable[::-1] for syllable in syllables][::-1] return syllables_ordered nltk-3.7/nltk/tokenize/mwe.py000066400000000000000000000077311420073152400163030ustar00rootroot00000000000000# Multi-Word Expression tokenizer # # Copyright (C) 2001-2022 NLTK Project # Author: Rob Malouf # URL: # For license information, see LICENSE.TXT """ Multi-Word Expression Tokenizer A ``MWETokenizer`` takes a string which has already been divided into tokens and retokenizes it, merging multi-word expressions into single tokens, using a lexicon of MWEs: >>> from nltk.tokenize import MWETokenizer >>> tokenizer = MWETokenizer([('a', 'little'), ('a', 'little', 'bit'), ('a', 'lot')]) >>> tokenizer.add_mwe(('in', 'spite', 'of')) >>> tokenizer.tokenize('Testing testing testing one two three'.split()) ['Testing', 'testing', 'testing', 'one', 'two', 'three'] >>> tokenizer.tokenize('This is a test in spite'.split()) ['This', 'is', 'a', 'test', 'in', 'spite'] >>> tokenizer.tokenize('In a little or a little bit or a lot in spite of'.split()) ['In', 'a_little', 'or', 'a_little_bit', 'or', 'a_lot', 'in_spite_of'] """ from nltk.tokenize.api import TokenizerI from nltk.util import Trie class MWETokenizer(TokenizerI): """A tokenizer that processes tokenized text and merges multi-word expressions into single tokens. """ def __init__(self, mwes=None, separator="_"): """Initialize the multi-word tokenizer with a list of expressions and a separator :type mwes: list(list(str)) :param mwes: A sequence of multi-word expressions to be merged, where each MWE is a sequence of strings. :type separator: str :param separator: String that should be inserted between words in a multi-word expression token. (Default is '_') """ if not mwes: mwes = [] self._mwes = Trie(mwes) self._separator = separator def add_mwe(self, mwe): """Add a multi-word expression to the lexicon (stored as a word trie) We use ``util.Trie`` to represent the trie. Its form is a dict of dicts. The key True marks the end of a valid MWE. 
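        A minimal sketch of that form (skipped doctest; mirrors the
        :Example: further below):

            >>> from nltk.util import Trie
            >>> Trie([('a', 'b')]) == {'a': {'b': {True: None}}}  # doctest: +SKIP
            True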
:param mwe: The multi-word expression we're adding into the word trie :type mwe: tuple(str) or list(str) :Example: >>> tokenizer = MWETokenizer() >>> tokenizer.add_mwe(('a', 'b')) >>> tokenizer.add_mwe(('a', 'b', 'c')) >>> tokenizer.add_mwe(('a', 'x')) >>> expected = {'a': {'x': {True: None}, 'b': {True: None, 'c': {True: None}}}} >>> tokenizer._mwes == expected True """ self._mwes.insert(mwe) def tokenize(self, text): """ :param text: A list containing tokenized text :type text: list(str) :return: A list of the tokenized text with multi-words merged together :rtype: list(str) :Example: >>> tokenizer = MWETokenizer([('hors', "d'oeuvre")], separator='+') >>> tokenizer.tokenize("An hors d'oeuvre tonight, sir?".split()) ['An', "hors+d'oeuvre", 'tonight,', 'sir?'] """ i = 0 n = len(text) result = [] while i < n: if text[i] in self._mwes: # possible MWE match j = i trie = self._mwes last_match = -1 while j < n and text[j] in trie: # and len(trie[text[j]]) > 0 : trie = trie[text[j]] j = j + 1 if Trie.LEAF in trie: last_match = j else: if last_match > -1: j = last_match if Trie.LEAF in trie or last_match > -1: # success! result.append(self._separator.join(text[i:j])) i = j else: # no match, so backtrack result.append(text[i]) i += 1 else: result.append(text[i]) i += 1 return result nltk-3.7/nltk/tokenize/nist.py000066400000000000000000000165651420073152400164750ustar00rootroot00000000000000# Natural Language Toolkit: Python port of the mteval-v14.pl tokenizer. # # Copyright (C) 2001-2015 NLTK Project # Author: Liling Tan (ported from ftp://jaguar.ncsl.nist.gov/mt/resources/mteval-v14.pl) # Contributors: Ozan Caglayan, Wiktor Stribizew # # URL: # For license information, see LICENSE.TXT """ This is a NLTK port of the tokenizer used in the NIST BLEU evaluation script, https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/mteval-v14.pl#L926 which was also ported into Python in https://github.com/lium-lst/nmtpy/blob/master/nmtpy/metrics/mtevalbleu.py#L162 """ import io import re from nltk.corpus import perluniprops from nltk.tokenize.api import TokenizerI from nltk.tokenize.util import xml_unescape class NISTTokenizer(TokenizerI): """ This NIST tokenizer is sentence-based instead of the original paragraph-based tokenization from mteval-14.pl; The sentence-based tokenization is consistent with the other tokenizers available in NLTK. >>> from nltk.tokenize.nist import NISTTokenizer >>> nist = NISTTokenizer() >>> s = "Good muffins cost $3.88 in New York." >>> expected_lower = [u'good', u'muffins', u'cost', u'$', u'3.88', u'in', u'new', u'york', u'.'] >>> expected_cased = [u'Good', u'muffins', u'cost', u'$', u'3.88', u'in', u'New', u'York', u'.'] >>> nist.tokenize(s, lowercase=False) == expected_cased True >>> nist.tokenize(s, lowercase=True) == expected_lower # Lowercased. True The international_tokenize() is the preferred function when tokenizing non-european text, e.g. >>> from nltk.tokenize.nist import NISTTokenizer >>> nist = NISTTokenizer() # Input strings. >>> albb = u'Alibaba Group Holding Limited (Chinese: 阿里巴巴集团控股 有限公司) us a Chinese e-commerce company...' >>> amz = u'Amazon.com, Inc. (/ˈæməzɒn/) is an American electronic commerce...' >>> rkt = u'Rakuten, Inc. (楽天株式会社 Rakuten Kabushiki-gaisha) is a Japanese electronic commerce and Internet company based in Tokyo.' # Expected tokens. 
>>> expected_albb = [u'Alibaba', u'Group', u'Holding', u'Limited', u'(', u'Chinese', u':', u'\u963f\u91cc\u5df4\u5df4\u96c6\u56e2\u63a7\u80a1', u'\u6709\u9650\u516c\u53f8', u')'] >>> expected_amz = [u'Amazon', u'.', u'com', u',', u'Inc', u'.', u'(', u'/', u'\u02c8\xe6', u'm'] >>> expected_rkt = [u'Rakuten', u',', u'Inc', u'.', u'(', u'\u697d\u5929\u682a\u5f0f\u4f1a\u793e', u'Rakuten', u'Kabushiki', u'-', u'gaisha'] >>> nist.international_tokenize(albb)[:10] == expected_albb True >>> nist.international_tokenize(amz)[:10] == expected_amz True >>> nist.international_tokenize(rkt)[:10] == expected_rkt True # Doctest for patching issue #1926 >>> sent = u'this is a foo\u2604sentence.' >>> expected_sent = [u'this', u'is', u'a', u'foo', u'\u2604', u'sentence', u'.'] >>> nist.international_tokenize(sent) == expected_sent True """ # Strip "skipped" tags STRIP_SKIP = re.compile(""), "" # Strip end-of-line hyphenation and join lines STRIP_EOL_HYPHEN = re.compile("\u2028"), " " # Tokenize punctuation. PUNCT = re.compile(r"([\{-\~\[-\` -\&\(-\+\:-\@\/])"), " \\1 " # Tokenize period and comma unless preceded by a digit. PERIOD_COMMA_PRECEED = re.compile(r"([^0-9])([\.,])"), "\\1 \\2 " # Tokenize period and comma unless followed by a digit. PERIOD_COMMA_FOLLOW = re.compile(r"([\.,])([^0-9])"), " \\1 \\2" # Tokenize dash when preceded by a digit DASH_PRECEED_DIGIT = re.compile("([0-9])(-)"), "\\1 \\2 " LANG_DEPENDENT_REGEXES = [ PUNCT, PERIOD_COMMA_PRECEED, PERIOD_COMMA_FOLLOW, DASH_PRECEED_DIGIT, ] # Perluniprops characters used in NIST tokenizer. pup_number = str("".join(set(perluniprops.chars("Number")))) # i.e. \p{N} pup_punct = str("".join(set(perluniprops.chars("Punctuation")))) # i.e. \p{P} pup_symbol = str("".join(set(perluniprops.chars("Symbol")))) # i.e. \p{S} # Python regexes needs to escape some special symbols, see # see https://stackoverflow.com/q/45670950/610569 number_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_number) punct_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_punct) symbol_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_symbol) # Note: In the original perl implementation, \p{Z} and \p{Zl} were used to # (i) strip trailing and heading spaces and # (ii) de-deuplicate spaces. # In Python, this would do: ' '.join(str.strip().split()) # Thus, the next two lines were commented out. # Line_Separator = str(''.join(perluniprops.chars('Line_Separator'))) # i.e. \p{Zl} # Separator = str(''.join(perluniprops.chars('Separator'))) # i.e. \p{Z} # Pads non-ascii strings with space. NONASCII = re.compile("([\x00-\x7f]+)"), r" \1 " # Tokenize any punctuation unless followed AND preceded by a digit. PUNCT_1 = ( re.compile(f"([{number_regex}])([{punct_regex}])"), "\\1 \\2 ", ) PUNCT_2 = ( re.compile(f"([{punct_regex}])([{number_regex}])"), " \\1 \\2", ) # Tokenize symbols SYMBOLS = re.compile(f"([{symbol_regex}])"), " \\1 " INTERNATIONAL_REGEXES = [NONASCII, PUNCT_1, PUNCT_2, SYMBOLS] def lang_independent_sub(self, text): """Performs the language independent string substituitions.""" # It's a strange order of regexes. # It'll be better to unescape after STRIP_EOL_HYPHEN # but let's keep it close to the original NIST implementation. regexp, substitution = self.STRIP_SKIP text = regexp.sub(substitution, text) text = xml_unescape(text) regexp, substitution = self.STRIP_EOL_HYPHEN text = regexp.sub(substitution, text) return text def tokenize(self, text, lowercase=False, western_lang=True, return_str=False): text = str(text) # Language independent regex. 
text = self.lang_independent_sub(text) # Language dependent regex. if western_lang: # Pad string with whitespace. text = " " + text + " " if lowercase: text = text.lower() for regexp, substitution in self.LANG_DEPENDENT_REGEXES: text = regexp.sub(substitution, text) # Remove contiguous whitespaces. text = " ".join(text.split()) # Finally, strips heading and trailing spaces # and converts output string into unicode. text = str(text.strip()) return text if return_str else text.split() def international_tokenize( self, text, lowercase=False, split_non_ascii=True, return_str=False ): text = str(text) # Different from the 'normal' tokenize(), STRIP_EOL_HYPHEN is applied # first before unescaping. regexp, substitution = self.STRIP_SKIP text = regexp.sub(substitution, text) regexp, substitution = self.STRIP_EOL_HYPHEN text = regexp.sub(substitution, text) text = xml_unescape(text) if lowercase: text = text.lower() for regexp, substitution in self.INTERNATIONAL_REGEXES: text = regexp.sub(substitution, text) # Make sure that there's only one space only between words. # Strip leading and trailing spaces. text = " ".join(text.strip().split()) return text if return_str else text.split() nltk-3.7/nltk/tokenize/punkt.py000066400000000000000000001767541420073152400166700ustar00rootroot00000000000000# Natural Language Toolkit: Punkt sentence tokenizer # # Copyright (C) 2001-2022 NLTK Project # Algorithm: Kiss & Strunk (2006) # Author: Willy (original Python port) # Steven Bird (additions) # Edward Loper (rewrite) # Joel Nothman (almost rewrite) # Arthur Darcet (fixes) # URL: # For license information, see LICENSE.TXT r""" Punkt Sentence Tokenizer This tokenizer divides a text into a list of sentences by using an unsupervised algorithm to build a model for abbreviation words, collocations, and words that start sentences. It must be trained on a large collection of plaintext in the target language before it can be used. The NLTK data package includes a pre-trained Punkt tokenizer for English. >>> import nltk.data >>> text = ''' ... Punkt knows that the periods in Mr. Smith and Johann S. Bach ... do not mark sentence boundaries. And sometimes sentences ... can start with non-capitalized words. i is a good variable ... name. ... ''' >>> sent_detector = nltk.data.load('tokenizers/punkt/english.pickle') >>> print('\n-----\n'.join(sent_detector.tokenize(text.strip()))) Punkt knows that the periods in Mr. Smith and Johann S. Bach do not mark sentence boundaries. ----- And sometimes sentences can start with non-capitalized words. ----- i is a good variable name. (Note that whitespace from the original text, including newlines, is retained in the output.) Punctuation following sentences is also included by default (from NLTK 3.0 onwards). It can be excluded with the realign_boundaries flag. >>> text = ''' ... (How does it deal with this parenthesis?) "It should be part of the ... previous sentence." "(And the same with this one.)" ('And this one!') ... "('(And (this)) '?)" [(and this. )] ... ''' >>> print('\n-----\n'.join( ... sent_detector.tokenize(text.strip()))) (How does it deal with this parenthesis?) ----- "It should be part of the previous sentence." ----- "(And the same with this one.)" ----- ('And this one!') ----- "('(And (this)) '?)" ----- [(and this. )] >>> print('\n-----\n'.join( ... sent_detector.tokenize(text.strip(), realign_boundaries=False))) (How does it deal with this parenthesis? ----- ) "It should be part of the previous sentence. ----- " "(And the same with this one. ----- )" ('And this one! 
----- ') "('(And (this)) '? ----- )" [(and this. ----- )] However, Punkt is designed to learn parameters (a list of abbreviations, etc.) unsupervised from a corpus similar to the target domain. The pre-packaged models may therefore be unsuitable: use ``PunktSentenceTokenizer(text)`` to learn parameters from the given text. :class:`.PunktTrainer` learns parameters such as a list of abbreviations (without supervision) from portions of text. Using a ``PunktTrainer`` directly allows for incremental training and modification of the hyper-parameters used to decide what is considered an abbreviation, etc. The algorithm for this tokenizer is described in:: Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection. Computational Linguistics 32: 485-525. """ # TODO: Make orthographic heuristic less susceptible to overtraining # TODO: Frequent sentence starters optionally exclude always-capitalised words # FIXME: Problem with ending string with e.g. '!!!' -> '!! !' import math import re from collections import defaultdict from nltk.probability import FreqDist from nltk.tokenize.api import TokenizerI ###################################################################### # { Orthographic Context Constants ###################################################################### # The following constants are used to describe the orthographic # contexts in which a word can occur. BEG=beginning, MID=middle, # UNK=unknown, UC=uppercase, LC=lowercase, NC=no case. _ORTHO_BEG_UC = 1 << 1 """Orthographic context: beginning of a sentence with upper case.""" _ORTHO_MID_UC = 1 << 2 """Orthographic context: middle of a sentence with upper case.""" _ORTHO_UNK_UC = 1 << 3 """Orthographic context: unknown position in a sentence with upper case.""" _ORTHO_BEG_LC = 1 << 4 """Orthographic context: beginning of a sentence with lower case.""" _ORTHO_MID_LC = 1 << 5 """Orthographic context: middle of a sentence with lower case.""" _ORTHO_UNK_LC = 1 << 6 """Orthographic context: unknown position in a sentence with lower case.""" _ORTHO_UC = _ORTHO_BEG_UC + _ORTHO_MID_UC + _ORTHO_UNK_UC """Orthographic context: occurs with upper case.""" _ORTHO_LC = _ORTHO_BEG_LC + _ORTHO_MID_LC + _ORTHO_UNK_LC """Orthographic context: occurs with lower case.""" _ORTHO_MAP = { ("initial", "upper"): _ORTHO_BEG_UC, ("internal", "upper"): _ORTHO_MID_UC, ("unknown", "upper"): _ORTHO_UNK_UC, ("initial", "lower"): _ORTHO_BEG_LC, ("internal", "lower"): _ORTHO_MID_LC, ("unknown", "lower"): _ORTHO_UNK_LC, } """A map from context position and first-letter case to the appropriate orthographic context flag.""" # } (end orthographic context constants) ###################################################################### ###################################################################### # { Decision reasons for debugging ###################################################################### REASON_DEFAULT_DECISION = "default decision" REASON_KNOWN_COLLOCATION = "known collocation (both words)" REASON_ABBR_WITH_ORTHOGRAPHIC_HEURISTIC = "abbreviation + orthographic heuristic" REASON_ABBR_WITH_SENTENCE_STARTER = "abbreviation + frequent sentence starter" REASON_INITIAL_WITH_ORTHOGRAPHIC_HEURISTIC = "initial + orthographic heuristic" REASON_NUMBER_WITH_ORTHOGRAPHIC_HEURISTIC = "initial + orthographic heuristic" REASON_INITIAL_WITH_SPECIAL_ORTHOGRAPHIC_HEURISTIC = ( "initial + special orthographic heuristic" ) # } (end decision reasons for debugging) ###################################################################### 
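# Illustrative note (not part of the original algorithm code): the orthographic
# context constants defined above are bit flags, so the context recorded for a
# word type is simply a bitwise OR of the position/case combinations it has
# been observed in.  For example, a type seen sentence-initially in upper case
# and sentence-internally in lower case would accumulate
#
#     context = _ORTHO_MAP[("initial", "upper")] | _ORTHO_MAP[("internal", "lower")]
#
# and both ``context & _ORTHO_UC`` and ``context & _ORTHO_LC`` would then be
# non-zero, which is how the orthographic heuristics recognise that a type
# occurs with both cases.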
###################################################################### # { Language-dependent variables ###################################################################### class PunktLanguageVars: """ Stores variables, mostly regular expressions, which may be language-dependent for correct application of the algorithm. An extension of this class may modify its properties to suit a language other than English; an instance can then be passed as an argument to PunktSentenceTokenizer and PunktTrainer constructors. """ __slots__ = ("_re_period_context", "_re_word_tokenizer") def __getstate__(self): # All modifications to the class are performed by inheritance. # Non-default parameters to be pickled must be defined in the inherited # class. return 1 def __setstate__(self, state): return 1 sent_end_chars = (".", "?", "!") """Characters which are candidates for sentence boundaries""" @property def _re_sent_end_chars(self): return "[%s]" % re.escape("".join(self.sent_end_chars)) internal_punctuation = ",:;" # might want to extend this.. """sentence internal punctuation, which indicates an abbreviation if preceded by a period-final token.""" re_boundary_realignment = re.compile(r'["\')\]}]+?(?:\s+|(?=--)|$)', re.MULTILINE) """Used to realign punctuation that should be included in a sentence although it follows the period (or ?, !).""" _re_word_start = r"[^\(\"\`{\[:;&\#\*@\)}\]\-,]" """Excludes some characters from starting word tokens""" @property def _re_non_word_chars(self): return r"(?:[)\";}\]\*:@\'\({\[%s])" % re.escape( "".join(set(self.sent_end_chars) - {"."}) ) """Characters that cannot appear within words""" _re_multi_char_punct = r"(?:\-{2,}|\.{2,}|(?:\.\s){2,}\.)" """Hyphen and ellipsis are multi-character punctuation""" _word_tokenize_fmt = r"""( %(MultiChar)s | (?=%(WordStart)s)\S+? # Accept word characters until end is found (?= # Sequences marking a word's end \s| # White-space $| # End-of-string %(NonWord)s|%(MultiChar)s| # Punctuation ,(?=$|\s|%(NonWord)s|%(MultiChar)s) # Comma if at end of word ) | \S )""" """Format of a regular expression to split punctuation from words, excluding period.""" def _word_tokenizer_re(self): """Compiles and returns a regular expression for word tokenization""" try: return self._re_word_tokenizer except AttributeError: self._re_word_tokenizer = re.compile( self._word_tokenize_fmt % { "NonWord": self._re_non_word_chars, "MultiChar": self._re_multi_char_punct, "WordStart": self._re_word_start, }, re.UNICODE | re.VERBOSE, ) return self._re_word_tokenizer def word_tokenize(self, s): """Tokenize a string to split off punctuation other than periods""" return self._word_tokenizer_re().findall(s) _period_context_fmt = r""" %(SentEndChars)s # a potential sentence ending (?=(?P %(NonWord)s # either other punctuation | \s+(?P\S+) # or whitespace and some other token ))""" """Format of a regular expression to find contexts including possible sentence boundaries. 
Matches token which the possible sentence boundary ends, and matches the following token within a lookahead expression.""" def period_context_re(self): """Compiles and returns a regular expression to find contexts including possible sentence boundaries.""" try: return self._re_period_context except: self._re_period_context = re.compile( self._period_context_fmt % { "NonWord": self._re_non_word_chars, "SentEndChars": self._re_sent_end_chars, }, re.UNICODE | re.VERBOSE, ) return self._re_period_context _re_non_punct = re.compile(r"[^\W\d]", re.UNICODE) """Matches token types that are not merely punctuation. (Types for numeric tokens are changed to ##number## and hence contain alpha.)""" # } ###################################################################### # //////////////////////////////////////////////////////////// # { Helper Functions # //////////////////////////////////////////////////////////// def _pair_iter(iterator): """ Yields pairs of tokens from the given iterator such that each input token will appear as the first element in a yielded tuple. The last pair will have None as its second element. """ iterator = iter(iterator) try: prev = next(iterator) except StopIteration: return for el in iterator: yield (prev, el) prev = el yield (prev, None) ###################################################################### # { Punkt Parameters ###################################################################### class PunktParameters: """Stores data used to perform sentence boundary detection with Punkt.""" def __init__(self): self.abbrev_types = set() """A set of word types for known abbreviations.""" self.collocations = set() """A set of word type tuples for known common collocations where the first word ends in a period. E.g., ('S.', 'Bach') is a common collocation in a text that discusses 'Johann S. Bach'. These count as negative evidence for sentence boundaries.""" self.sent_starters = set() """A set of word types for words that often appear at the beginning of sentences.""" self.ortho_context = defaultdict(int) """A dictionary mapping word types to the set of orthographic contexts that word type appears in. 
Contexts are represented by adding orthographic context flags: ...""" def clear_abbrevs(self): self.abbrev_types = set() def clear_collocations(self): self.collocations = set() def clear_sent_starters(self): self.sent_starters = set() def clear_ortho_context(self): self.ortho_context = defaultdict(int) def add_ortho_context(self, typ, flag): self.ortho_context[typ] |= flag def _debug_ortho_context(self, typ): context = self.ortho_context[typ] if context & _ORTHO_BEG_UC: yield "BEG-UC" if context & _ORTHO_MID_UC: yield "MID-UC" if context & _ORTHO_UNK_UC: yield "UNK-UC" if context & _ORTHO_BEG_LC: yield "BEG-LC" if context & _ORTHO_MID_LC: yield "MID-LC" if context & _ORTHO_UNK_LC: yield "UNK-LC" ###################################################################### # { PunktToken ###################################################################### class PunktToken: """Stores a token of text with annotations produced during sentence boundary detection.""" _properties = ["parastart", "linestart", "sentbreak", "abbr", "ellipsis"] __slots__ = ["tok", "type", "period_final"] + _properties def __init__(self, tok, **params): self.tok = tok self.type = self._get_type(tok) self.period_final = tok.endswith(".") for prop in self._properties: setattr(self, prop, None) for k in params: setattr(self, k, params[k]) # //////////////////////////////////////////////////////////// # { Regular expressions for properties # //////////////////////////////////////////////////////////// # Note: [A-Za-z] is approximated by [^\W\d] in the general case. _RE_ELLIPSIS = re.compile(r"\.\.+$") _RE_NUMERIC = re.compile(r"^-?[\.,]?\d[\d,\.-]*\.?$") _RE_INITIAL = re.compile(r"[^\W\d]\.$", re.UNICODE) _RE_ALPHA = re.compile(r"[^\W\d]+$", re.UNICODE) # //////////////////////////////////////////////////////////// # { Derived properties # //////////////////////////////////////////////////////////// def _get_type(self, tok): """Returns a case-normalized representation of the token.""" return self._RE_NUMERIC.sub("##number##", tok.lower()) @property def type_no_period(self): """ The type with its final period removed if it has one. """ if len(self.type) > 1 and self.type[-1] == ".": return self.type[:-1] return self.type @property def type_no_sentperiod(self): """ The type with its final period removed if it is marked as a sentence break. 
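A skipped doctest sketch (illustrative only):

    >>> tok = PunktToken("etc.", sentbreak=True)  # doctest: +SKIP
    >>> tok.type_no_sentperiod                    # doctest: +SKIP
    'etc'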
""" if self.sentbreak: return self.type_no_period return self.type @property def first_upper(self): """True if the token's first character is uppercase.""" return self.tok[0].isupper() @property def first_lower(self): """True if the token's first character is lowercase.""" return self.tok[0].islower() @property def first_case(self): if self.first_lower: return "lower" if self.first_upper: return "upper" return "none" @property def is_ellipsis(self): """True if the token text is that of an ellipsis.""" return self._RE_ELLIPSIS.match(self.tok) @property def is_number(self): """True if the token text is that of a number.""" return self.type.startswith("##number##") @property def is_initial(self): """True if the token text is that of an initial.""" return self._RE_INITIAL.match(self.tok) @property def is_alpha(self): """True if the token text is all alphabetic.""" return self._RE_ALPHA.match(self.tok) @property def is_non_punct(self): """True if the token is either a number or is alphabetic.""" return _re_non_punct.search(self.type) # //////////////////////////////////////////////////////////// # { String representation # //////////////////////////////////////////////////////////// def __repr__(self): """ A string representation of the token that can reproduce it with eval(), which lists all the token's non-default annotations. """ typestr = " type=%s," % repr(self.type) if self.type != self.tok else "" propvals = ", ".join( f"{p}={repr(getattr(self, p))}" for p in self._properties if getattr(self, p) ) return "{}({},{} {})".format( self.__class__.__name__, repr(self.tok), typestr, propvals, ) def __str__(self): """ A string representation akin to that used by Kiss and Strunk. """ res = self.tok if self.abbr: res += "" if self.ellipsis: res += "" if self.sentbreak: res += "" return res ###################################################################### # { Punkt base class ###################################################################### class PunktBaseClass: """ Includes common components of PunktTrainer and PunktSentenceTokenizer. """ def __init__(self, lang_vars=None, token_cls=PunktToken, params=None): if lang_vars is None: lang_vars = PunktLanguageVars() if params is None: params = PunktParameters() self._params = params self._lang_vars = lang_vars self._Token = token_cls """The collection of parameters that determines the behavior of the punkt tokenizer.""" # //////////////////////////////////////////////////////////// # { Word tokenization # //////////////////////////////////////////////////////////// def _tokenize_words(self, plaintext): """ Divide the given text into tokens, using the punkt word segmentation regular expression, and generate the resulting list of tokens augmented as three-tuples with two boolean values for whether the given token occurs at the start of a paragraph or a new line, respectively. """ parastart = False for line in plaintext.split("\n"): if line.strip(): line_toks = iter(self._lang_vars.word_tokenize(line)) try: tok = next(line_toks) except StopIteration: continue yield self._Token(tok, parastart=parastart, linestart=True) parastart = False for tok in line_toks: yield self._Token(tok) else: parastart = True # //////////////////////////////////////////////////////////// # { Annotation Procedures # //////////////////////////////////////////////////////////// def _annotate_first_pass(self, tokens): """ Perform the first pass of annotation, which makes decisions based purely based on the word type of each word: - '?', '!', and '.' 
are marked as sentence breaks. - sequences of two or more periods are marked as ellipsis. - any word ending in '.' that's a known abbreviation is marked as an abbreviation. - any other word ending in '.' is marked as a sentence break. Return these annotations as a tuple of three sets: - sentbreak_toks: The indices of all sentence breaks. - abbrev_toks: The indices of all abbreviations. - ellipsis_toks: The indices of all ellipsis marks. """ for aug_tok in tokens: self._first_pass_annotation(aug_tok) yield aug_tok def _first_pass_annotation(self, aug_tok): """ Performs type-based annotation on a single token. """ tok = aug_tok.tok if tok in self._lang_vars.sent_end_chars: aug_tok.sentbreak = True elif aug_tok.is_ellipsis: aug_tok.ellipsis = True elif aug_tok.period_final and not tok.endswith(".."): if ( tok[:-1].lower() in self._params.abbrev_types or tok[:-1].lower().split("-")[-1] in self._params.abbrev_types ): aug_tok.abbr = True else: aug_tok.sentbreak = True return ###################################################################### # { Punkt Trainer ###################################################################### class PunktTrainer(PunktBaseClass): """Learns parameters used in Punkt sentence boundary detection.""" def __init__( self, train_text=None, verbose=False, lang_vars=None, token_cls=PunktToken ): PunktBaseClass.__init__(self, lang_vars=lang_vars, token_cls=token_cls) self._type_fdist = FreqDist() """A frequency distribution giving the frequency of each case-normalized token type in the training data.""" self._num_period_toks = 0 """The number of words ending in period in the training data.""" self._collocation_fdist = FreqDist() """A frequency distribution giving the frequency of all bigrams in the training data where the first word ends in a period. Bigrams are encoded as tuples of word types. Especially common collocations are extracted from this frequency distribution, and stored in ``_params``.``collocations ``.""" self._sent_starter_fdist = FreqDist() """A frequency distribution giving the frequency of all words that occur at the training data at the beginning of a sentence (after the first pass of annotation). Especially common sentence starters are extracted from this frequency distribution, and stored in ``_params.sent_starters``. 
""" self._sentbreak_count = 0 """The total number of sentence breaks identified in training, used for calculating the frequent sentence starter heuristic.""" self._finalized = True """A flag as to whether the training has been finalized by finding collocations and sentence starters, or whether finalize_training() still needs to be called.""" if train_text: self.train(train_text, verbose, finalize=True) def get_params(self): """ Calculates and returns parameters for sentence boundary detection as derived from training.""" if not self._finalized: self.finalize_training() return self._params # //////////////////////////////////////////////////////////// # { Customization Variables # //////////////////////////////////////////////////////////// ABBREV = 0.3 """cut-off value whether a 'token' is an abbreviation""" IGNORE_ABBREV_PENALTY = False """allows the disabling of the abbreviation penalty heuristic, which exponentially disadvantages words that are found at times without a final period.""" ABBREV_BACKOFF = 5 """upper cut-off for Mikheev's(2002) abbreviation detection algorithm""" COLLOCATION = 7.88 """minimal log-likelihood value that two tokens need to be considered as a collocation""" SENT_STARTER = 30 """minimal log-likelihood value that a token requires to be considered as a frequent sentence starter""" INCLUDE_ALL_COLLOCS = False """this includes as potential collocations all word pairs where the first word ends in a period. It may be useful in corpora where there is a lot of variation that makes abbreviations like Mr difficult to identify.""" INCLUDE_ABBREV_COLLOCS = False """this includes as potential collocations all word pairs where the first word is an abbreviation. Such collocations override the orthographic heuristic, but not the sentence starter heuristic. This is overridden by INCLUDE_ALL_COLLOCS, and if both are false, only collocations with initials and ordinals are considered.""" """""" MIN_COLLOC_FREQ = 1 """this sets a minimum bound on the number of times a bigram needs to appear before it can be considered a collocation, in addition to log likelihood statistics. This is useful when INCLUDE_ALL_COLLOCS is True.""" # //////////////////////////////////////////////////////////// # { Training.. # //////////////////////////////////////////////////////////// def train(self, text, verbose=False, finalize=True): """ Collects training data from a given text. If finalize is True, it will determine all the parameters for sentence boundary detection. If not, this will be delayed until get_params() or finalize_training() is called. If verbose is True, abbreviations found will be listed. """ # Break the text into tokens; record which token indices correspond to # line starts and paragraph starts; and determine their types. self._train_tokens(self._tokenize_words(text), verbose) if finalize: self.finalize_training(verbose) def train_tokens(self, tokens, verbose=False, finalize=True): """ Collects training data from a given list of tokens. """ self._train_tokens((self._Token(t) for t in tokens), verbose) if finalize: self.finalize_training(verbose) def _train_tokens(self, tokens, verbose): self._finalized = False # Ensure tokens are a list tokens = list(tokens) # Find the frequency of each case-normalized type. (Don't # strip off final periods.) Also keep track of the number of # tokens that end in periods. 
for aug_tok in tokens: self._type_fdist[aug_tok.type] += 1 if aug_tok.period_final: self._num_period_toks += 1 # Look for new abbreviations, and for types that no longer are unique_types = self._unique_types(tokens) for abbr, score, is_add in self._reclassify_abbrev_types(unique_types): if score >= self.ABBREV: if is_add: self._params.abbrev_types.add(abbr) if verbose: print(f" Abbreviation: [{score:6.4f}] {abbr}") else: if not is_add: self._params.abbrev_types.remove(abbr) if verbose: print(f" Removed abbreviation: [{score:6.4f}] {abbr}") # Make a preliminary pass through the document, marking likely # sentence breaks, abbreviations, and ellipsis tokens. tokens = list(self._annotate_first_pass(tokens)) # Check what contexts each word type can appear in, given the # case of its first letter. self._get_orthography_data(tokens) # We need total number of sentence breaks to find sentence starters self._sentbreak_count += self._get_sentbreak_count(tokens) # The remaining heuristics relate to pairs of tokens where the first # ends in a period. for aug_tok1, aug_tok2 in _pair_iter(tokens): if not aug_tok1.period_final or not aug_tok2: continue # Is the first token a rare abbreviation? if self._is_rare_abbrev_type(aug_tok1, aug_tok2): self._params.abbrev_types.add(aug_tok1.type_no_period) if verbose: print(" Rare Abbrev: %s" % aug_tok1.type) # Does second token have a high likelihood of starting a sentence? if self._is_potential_sent_starter(aug_tok2, aug_tok1): self._sent_starter_fdist[aug_tok2.type] += 1 # Is this bigram a potential collocation? if self._is_potential_collocation(aug_tok1, aug_tok2): self._collocation_fdist[ (aug_tok1.type_no_period, aug_tok2.type_no_sentperiod) ] += 1 def _unique_types(self, tokens): return {aug_tok.type for aug_tok in tokens} def finalize_training(self, verbose=False): """ Uses data that has been gathered in training to determine likely collocations and sentence starters. """ self._params.clear_sent_starters() for typ, log_likelihood in self._find_sent_starters(): self._params.sent_starters.add(typ) if verbose: print(f" Sent Starter: [{log_likelihood:6.4f}] {typ!r}") self._params.clear_collocations() for (typ1, typ2), log_likelihood in self._find_collocations(): self._params.collocations.add((typ1, typ2)) if verbose: print(f" Collocation: [{log_likelihood:6.4f}] {typ1!r}+{typ2!r}") self._finalized = True # //////////////////////////////////////////////////////////// # { Overhead reduction # //////////////////////////////////////////////////////////// def freq_threshold( self, ortho_thresh=2, type_thresh=2, colloc_thres=2, sentstart_thresh=2 ): """ Allows memory use to be reduced after much training by removing data about rare tokens that are unlikely to have a statistical effect with further training. Entries occurring above the given thresholds will be retained. """ if ortho_thresh > 1: old_oc = self._params.ortho_context self._params.clear_ortho_context() for tok in self._type_fdist: count = self._type_fdist[tok] if count >= ortho_thresh: self._params.ortho_context[tok] = old_oc[tok] self._type_fdist = self._freq_threshold(self._type_fdist, type_thresh) self._collocation_fdist = self._freq_threshold( self._collocation_fdist, colloc_thres ) self._sent_starter_fdist = self._freq_threshold( self._sent_starter_fdist, sentstart_thresh ) def _freq_threshold(self, fdist, threshold): """ Returns a FreqDist containing only data with counts below a given threshold, as well as a mapping (None -> count_removed). 
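For instance (hypothetical counts), with ``threshold=2`` an input distribution ``{'a': 5, 'b': 1, 'c': 1}`` comes back as ``{'a': 5, None: 2}``: entries whose counts clear the threshold keep their counts, and the number of removed entries is recorded under ``None``.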
""" # We assume that there is more data below the threshold than above it # and so create a new FreqDist rather than working in place. res = FreqDist() num_removed = 0 for tok in fdist: count = fdist[tok] if count < threshold: num_removed += 1 else: res[tok] += count res[None] += num_removed return res # //////////////////////////////////////////////////////////// # { Orthographic data # //////////////////////////////////////////////////////////// def _get_orthography_data(self, tokens): """ Collect information about whether each token type occurs with different case patterns (i) overall, (ii) at sentence-initial positions, and (iii) at sentence-internal positions. """ # 'initial' or 'internal' or 'unknown' context = "internal" tokens = list(tokens) for aug_tok in tokens: # If we encounter a paragraph break, then it's a good sign # that it's a sentence break. But err on the side of # caution (by not positing a sentence break) if we just # saw an abbreviation. if aug_tok.parastart and context != "unknown": context = "initial" # If we're at the beginning of a line, then we can't decide # between 'internal' and 'initial'. if aug_tok.linestart and context == "internal": context = "unknown" # Find the case-normalized type of the token. If it's a # sentence-final token, strip off the period. typ = aug_tok.type_no_sentperiod # Update the orthographic context table. flag = _ORTHO_MAP.get((context, aug_tok.first_case), 0) if flag: self._params.add_ortho_context(typ, flag) # Decide whether the next word is at a sentence boundary. if aug_tok.sentbreak: if not (aug_tok.is_number or aug_tok.is_initial): context = "initial" else: context = "unknown" elif aug_tok.ellipsis or aug_tok.abbr: context = "unknown" else: context = "internal" # //////////////////////////////////////////////////////////// # { Abbreviations # //////////////////////////////////////////////////////////// def _reclassify_abbrev_types(self, types): """ (Re)classifies each given token if - it is period-final and not a known abbreviation; or - it is not period-final and is otherwise a known abbreviation by checking whether its previous classification still holds according to the heuristics of section 3. Yields triples (abbr, score, is_add) where abbr is the type in question, score is its log-likelihood with penalties applied, and is_add specifies whether the present type is a candidate for inclusion or exclusion as an abbreviation, such that: - (is_add and score >= 0.3) suggests a new abbreviation; and - (not is_add and score < 0.3) suggests excluding an abbreviation. """ # (While one could recalculate abbreviations from all .-final tokens at # every iteration, in cases requiring efficiency, the number of tokens # in the present training document will be much less.) for typ in types: # Check some basic conditions, to rule out words that are # clearly not abbrev_types. if not _re_non_punct.search(typ) or typ == "##number##": continue if typ.endswith("."): if typ in self._params.abbrev_types: continue typ = typ[:-1] is_add = True else: if typ not in self._params.abbrev_types: continue is_add = False # Count how many periods & nonperiods are in the # candidate. num_periods = typ.count(".") + 1 num_nonperiods = len(typ) - num_periods + 1 # Let be the candidate without the period, and # be the period. Find a log likelihood ratio that # indicates whether occurs as a single unit (high # value of log_likelihood), or as two independent units and # (low value of log_likelihood). 
count_with_period = self._type_fdist[typ + "."] count_without_period = self._type_fdist[typ] log_likelihood = self._dunning_log_likelihood( count_with_period + count_without_period, self._num_period_toks, count_with_period, self._type_fdist.N(), ) # Apply three scaling factors to 'tweak' the basic log # likelihood ratio: # F_length: long word -> less likely to be an abbrev # F_periods: more periods -> more likely to be an abbrev # F_penalty: penalize occurrences w/o a period f_length = math.exp(-num_nonperiods) f_periods = num_periods f_penalty = int(self.IGNORE_ABBREV_PENALTY) or math.pow( num_nonperiods, -count_without_period ) score = log_likelihood * f_length * f_periods * f_penalty yield typ, score, is_add def find_abbrev_types(self): """ Recalculates abbreviations given type frequencies, despite no prior determination of abbreviations. This fails to include abbreviations otherwise found as "rare". """ self._params.clear_abbrevs() tokens = (typ for typ in self._type_fdist if typ and typ.endswith(".")) for abbr, score, _is_add in self._reclassify_abbrev_types(tokens): if score >= self.ABBREV: self._params.abbrev_types.add(abbr) # This function combines the work done by the original code's # functions `count_orthography_context`, `get_orthography_count`, # and `get_rare_abbreviations`. def _is_rare_abbrev_type(self, cur_tok, next_tok): """ A word type is counted as a rare abbreviation if... - it's not already marked as an abbreviation - it occurs fewer than ABBREV_BACKOFF times - either it is followed by a sentence-internal punctuation mark, *or* it is followed by a lower-case word that sometimes appears with upper case, but never occurs with lower case at the beginning of sentences. """ if cur_tok.abbr or not cur_tok.sentbreak: return False # Find the case-normalized type of the token. If it's # a sentence-final token, strip off the period. typ = cur_tok.type_no_sentperiod # Proceed only if the type hasn't been categorized as an # abbreviation already, and is sufficiently rare... count = self._type_fdist[typ] + self._type_fdist[typ[:-1]] if typ in self._params.abbrev_types or count >= self.ABBREV_BACKOFF: return False # Record this token as an abbreviation if the next # token is a sentence-internal punctuation mark. # [XX] :1 or check the whole thing?? if next_tok.tok[:1] in self._lang_vars.internal_punctuation: return True # Record this type as an abbreviation if the next # token... (i) starts with a lower case letter, # (ii) sometimes occurs with an uppercase letter, # and (iii) never occus with an uppercase letter # sentence-internally. # [xx] should the check for (ii) be modified?? if next_tok.first_lower: typ2 = next_tok.type_no_sentperiod typ2ortho_context = self._params.ortho_context[typ2] if (typ2ortho_context & _ORTHO_BEG_UC) and not ( typ2ortho_context & _ORTHO_MID_UC ): return True # //////////////////////////////////////////////////////////// # { Log Likelihoods # //////////////////////////////////////////////////////////// # helper for _reclassify_abbrev_types: @staticmethod def _dunning_log_likelihood(count_a, count_b, count_ab, N): """ A function that calculates the modified Dunning log-likelihood ratio scores for abbreviation candidates. The details of how this works is available in the paper. 
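        In outline (a sketch mirroring the code below), with ``p1 = count_b / N`` and a fixed alternative ``p2 = 0.99``::

            null_hypo = count_ab * log(p1) + (count_a - count_ab) * log(1 - p1)
            alt_hypo  = count_ab * log(p2) + (count_a - count_ab) * log(1 - p2)
            return -2 * (null_hypo - alt_hypo)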
""" p1 = count_b / N p2 = 0.99 null_hypo = count_ab * math.log(p1) + (count_a - count_ab) * math.log(1.0 - p1) alt_hypo = count_ab * math.log(p2) + (count_a - count_ab) * math.log(1.0 - p2) likelihood = null_hypo - alt_hypo return -2.0 * likelihood @staticmethod def _col_log_likelihood(count_a, count_b, count_ab, N): """ A function that will just compute log-likelihood estimate, in the original paper it's described in algorithm 6 and 7. This *should* be the original Dunning log-likelihood values, unlike the previous log_l function where it used modified Dunning log-likelihood values """ p = count_b / N p1 = count_ab / count_a try: p2 = (count_b - count_ab) / (N - count_a) except ZeroDivisionError: p2 = 1 try: summand1 = count_ab * math.log(p) + (count_a - count_ab) * math.log(1.0 - p) except ValueError: summand1 = 0 try: summand2 = (count_b - count_ab) * math.log(p) + ( N - count_a - count_b + count_ab ) * math.log(1.0 - p) except ValueError: summand2 = 0 if count_a == count_ab or p1 <= 0 or p1 >= 1: summand3 = 0 else: summand3 = count_ab * math.log(p1) + (count_a - count_ab) * math.log( 1.0 - p1 ) if count_b == count_ab or p2 <= 0 or p2 >= 1: summand4 = 0 else: summand4 = (count_b - count_ab) * math.log(p2) + ( N - count_a - count_b + count_ab ) * math.log(1.0 - p2) likelihood = summand1 + summand2 - summand3 - summand4 return -2.0 * likelihood # //////////////////////////////////////////////////////////// # { Collocation Finder # //////////////////////////////////////////////////////////// def _is_potential_collocation(self, aug_tok1, aug_tok2): """ Returns True if the pair of tokens may form a collocation given log-likelihood statistics. """ return ( ( self.INCLUDE_ALL_COLLOCS or (self.INCLUDE_ABBREV_COLLOCS and aug_tok1.abbr) or (aug_tok1.sentbreak and (aug_tok1.is_number or aug_tok1.is_initial)) ) and aug_tok1.is_non_punct and aug_tok2.is_non_punct ) def _find_collocations(self): """ Generates likely collocations and their log-likelihood. """ for types in self._collocation_fdist: try: typ1, typ2 = types except TypeError: # types may be None after calling freq_threshold() continue if typ2 in self._params.sent_starters: continue col_count = self._collocation_fdist[types] typ1_count = self._type_fdist[typ1] + self._type_fdist[typ1 + "."] typ2_count = self._type_fdist[typ2] + self._type_fdist[typ2 + "."] if ( typ1_count > 1 and typ2_count > 1 and self.MIN_COLLOC_FREQ < col_count <= min(typ1_count, typ2_count) ): log_likelihood = self._col_log_likelihood( typ1_count, typ2_count, col_count, self._type_fdist.N() ) # Filter out the not-so-collocative if log_likelihood >= self.COLLOCATION and ( self._type_fdist.N() / typ1_count > typ2_count / col_count ): yield (typ1, typ2), log_likelihood # //////////////////////////////////////////////////////////// # { Sentence-Starter Finder # //////////////////////////////////////////////////////////// def _is_potential_sent_starter(self, cur_tok, prev_tok): """ Returns True given a token and the token that precedes it if it seems clear that the token is beginning a sentence. """ # If a token (i) is preceded by a sentece break that is # not a potential ordinal number or initial, and (ii) is # alphabetic, then it is a a sentence-starter. return ( prev_tok.sentbreak and not (prev_tok.is_number or prev_tok.is_initial) and cur_tok.is_alpha ) def _find_sent_starters(self): """ Uses collocation heuristics for each candidate token to determine if it frequently starts sentences. 
""" for typ in self._sent_starter_fdist: if not typ: continue typ_at_break_count = self._sent_starter_fdist[typ] typ_count = self._type_fdist[typ] + self._type_fdist[typ + "."] if typ_count < typ_at_break_count: # needed after freq_threshold continue log_likelihood = self._col_log_likelihood( self._sentbreak_count, typ_count, typ_at_break_count, self._type_fdist.N(), ) if ( log_likelihood >= self.SENT_STARTER and self._type_fdist.N() / self._sentbreak_count > typ_count / typ_at_break_count ): yield typ, log_likelihood def _get_sentbreak_count(self, tokens): """ Returns the number of sentence breaks marked in a given set of augmented tokens. """ return sum(1 for aug_tok in tokens if aug_tok.sentbreak) ###################################################################### # { Punkt Sentence Tokenizer ###################################################################### class PunktSentenceTokenizer(PunktBaseClass, TokenizerI): """ A sentence tokenizer which uses an unsupervised algorithm to build a model for abbreviation words, collocations, and words that start sentences; and then uses that model to find sentence boundaries. This approach has been shown to work well for many European languages. """ def __init__( self, train_text=None, verbose=False, lang_vars=None, token_cls=PunktToken ): """ train_text can either be the sole training text for this sentence boundary detector, or can be a PunktParameters object. """ PunktBaseClass.__init__(self, lang_vars=lang_vars, token_cls=token_cls) if train_text: self._params = self.train(train_text, verbose) def train(self, train_text, verbose=False): """ Derives parameters from a given training text, or uses the parameters given. Repeated calls to this method destroy previous parameters. For incremental training, instantiate a separate PunktTrainer instance. """ if not isinstance(train_text, str): return train_text return PunktTrainer( train_text, lang_vars=self._lang_vars, token_cls=self._Token ).get_params() # //////////////////////////////////////////////////////////// # { Tokenization # //////////////////////////////////////////////////////////// def tokenize(self, text, realign_boundaries=True): """ Given a text, returns a list of the sentences in that text. """ return list(self.sentences_from_text(text, realign_boundaries)) def debug_decisions(self, text): """ Classifies candidate periods as sentence breaks, yielding a dict for each that may be used to understand why the decision was made. See format_debug_decision() to help make this output readable. 
""" for match, decision_text in self._match_potential_end_contexts(text): tokens = self._tokenize_words(decision_text) tokens = list(self._annotate_first_pass(tokens)) while tokens and not tokens[0].tok.endswith(self._lang_vars.sent_end_chars): tokens.pop(0) yield { "period_index": match.end() - 1, "text": decision_text, "type1": tokens[0].type, "type2": tokens[1].type, "type1_in_abbrs": bool(tokens[0].abbr), "type1_is_initial": bool(tokens[0].is_initial), "type2_is_sent_starter": tokens[1].type_no_sentperiod in self._params.sent_starters, "type2_ortho_heuristic": self._ortho_heuristic(tokens[1]), "type2_ortho_contexts": set( self._params._debug_ortho_context(tokens[1].type_no_sentperiod) ), "collocation": ( tokens[0].type_no_sentperiod, tokens[1].type_no_sentperiod, ) in self._params.collocations, "reason": self._second_pass_annotation(tokens[0], tokens[1]) or REASON_DEFAULT_DECISION, "break_decision": tokens[0].sentbreak, } def span_tokenize(self, text, realign_boundaries=True): """ Given a text, generates (start, end) spans of sentences in the text. """ slices = self._slices_from_text(text) if realign_boundaries: slices = self._realign_boundaries(text, slices) for sentence in slices: yield (sentence.start, sentence.stop) def sentences_from_text(self, text, realign_boundaries=True): """ Given a text, generates the sentences in that text by only testing candidate sentence breaks. If realign_boundaries is True, includes in the sentence closing punctuation that follows the period. """ return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)] def _match_potential_end_contexts(self, text): """ Given a text, find the matches of potential sentence breaks, alongside the contexts surrounding these sentence breaks. Since the fix for the ReDOS discovered in issue #2866, we no longer match the word before a potential end of sentence token. Instead, we use a separate regex for this. As a consequence, `finditer`'s desire to find non-overlapping matches no longer aids us in finding the single longest match. Where previously, we could use:: >>> pst = PunktSentenceTokenizer() >>> text = "Very bad acting!!! I promise." >>> list(pst._lang_vars.period_context_re().finditer(text)) # doctest: +SKIP [] Now we have to find the word before (i.e. 'acting') separately, and `finditer` returns:: >>> pst = PunktSentenceTokenizer() >>> text = "Very bad acting!!! I promise." >>> list(pst._lang_vars.period_context_re().finditer(text)) # doctest: +NORMALIZE_WHITESPACE [, , ] So, we need to find the word before the match from right to left, and then manually remove the overlaps. That is what this method does:: >>> pst = PunktSentenceTokenizer() >>> text = "Very bad acting!!! I promise." >>> pst._match_potential_end_contexts(text) [(, 'acting!!! I')] :param text: String of one or more sentences :type text: str :return: List of match-context tuples. 
:rtype: List[Tuple[re.Match, str]] """ before_words = {} matches = [] for match in reversed(list(self._lang_vars.period_context_re().finditer(text))): # Ignore matches that have already been captured by matches to the right of this match if matches and match.end() > before_start: continue # Find the word before the current match split = text[: match.start()].rsplit(maxsplit=1) before_start = len(split[0]) if len(split) == 2 else 0 before_words[match] = split[-1] if split else "" matches.append(match) return [ ( match, before_words[match] + match.group() + match.group("after_tok"), ) for match in matches[::-1] ] def _slices_from_text(self, text): last_break = 0 for match, context in self._match_potential_end_contexts(text): if self.text_contains_sentbreak(context): yield slice(last_break, match.end()) if match.group("next_tok"): # next sentence starts after whitespace last_break = match.start("next_tok") else: # next sentence starts at following punctuation last_break = match.end() # The last sentence should not contain trailing whitespace. yield slice(last_break, len(text.rstrip())) def _realign_boundaries(self, text, slices): """ Attempts to realign punctuation that falls after the period but should otherwise be included in the same sentence. For example: "(Sent1.) Sent2." will otherwise be split as:: ["(Sent1.", ") Sent1."]. This method will produce:: ["(Sent1.)", "Sent2."]. """ realign = 0 for sentence1, sentence2 in _pair_iter(slices): sentence1 = slice(sentence1.start + realign, sentence1.stop) if not sentence2: if text[sentence1]: yield sentence1 continue m = self._lang_vars.re_boundary_realignment.match(text[sentence2]) if m: yield slice(sentence1.start, sentence2.start + len(m.group(0).rstrip())) realign = m.end() else: realign = 0 if text[sentence1]: yield sentence1 def text_contains_sentbreak(self, text): """ Returns True if the given text includes a sentence break. """ found = False # used to ignore last token for tok in self._annotate_tokens(self._tokenize_words(text)): if found: return True if tok.sentbreak: found = True return False def sentences_from_text_legacy(self, text): """ Given a text, generates the sentences in that text. Annotates all tokens, rather than just those with possible sentence breaks. Should produce the same results as ``sentences_from_text``. """ tokens = self._annotate_tokens(self._tokenize_words(text)) return self._build_sentence_list(text, tokens) def sentences_from_tokens(self, tokens): """ Given a sequence of tokens, generates lists of tokens, each list corresponding to a sentence. """ tokens = iter(self._annotate_tokens(self._Token(t) for t in tokens)) sentence = [] for aug_tok in tokens: sentence.append(aug_tok.tok) if aug_tok.sentbreak: yield sentence sentence = [] if sentence: yield sentence def _annotate_tokens(self, tokens): """ Given a set of tokens augmented with markers for line-start and paragraph-start, returns an iterator through those tokens with full annotation including predicted sentence breaks. """ # Make a preliminary pass through the document, marking likely # sentence breaks, abbreviations, and ellipsis tokens. tokens = self._annotate_first_pass(tokens) # Make a second pass through the document, using token context # information to change our preliminary decisions about where # sentence breaks, abbreviations, and ellipsis occurs. 
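        # (For example, a period after "Mr." that the first pass tentatively
        # marked as a sentence break can be re-labelled as an abbreviation here,
        # once collocation and orthographic evidence about the following word is
        # taken into account.)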
tokens = self._annotate_second_pass(tokens) ## [XX] TESTING # tokens = list(tokens) # self.dump(tokens) return tokens def _build_sentence_list(self, text, tokens): """ Given the original text and the list of augmented word tokens, construct and return a tokenized list of sentence strings. """ # Most of the work here is making sure that we put the right # pieces of whitespace back in all the right places. # Our position in the source text, used to keep track of which # whitespace to add: pos = 0 # A regular expression that finds pieces of whitespace: white_space_regexp = re.compile(r"\s*") sentence = "" for aug_tok in tokens: tok = aug_tok.tok # Find the whitespace before this token, and update pos. white_space = white_space_regexp.match(text, pos).group() pos += len(white_space) # Some of the rules used by the punkt word tokenizer # strip whitespace out of the text, resulting in tokens # that contain whitespace in the source text. If our # token doesn't match, see if adding whitespace helps. # If so, then use the version with whitespace. if text[pos : pos + len(tok)] != tok: pat = r"\s*".join(re.escape(c) for c in tok) m = re.compile(pat).match(text, pos) if m: tok = m.group() # Move our position pointer to the end of the token. assert text[pos : pos + len(tok)] == tok pos += len(tok) # Add this token. If it's not at the beginning of the # sentence, then include any whitespace that separated it # from the previous token. if sentence: sentence += white_space sentence += tok # If we're at a sentence break, then start a new sentence. if aug_tok.sentbreak: yield sentence sentence = "" # If the last sentence is empty, discard it. if sentence: yield sentence # [XX] TESTING def dump(self, tokens): print("writing to /tmp/punkt.new...") with open("/tmp/punkt.new", "w") as outfile: for aug_tok in tokens: if aug_tok.parastart: outfile.write("\n\n") elif aug_tok.linestart: outfile.write("\n") else: outfile.write(" ") outfile.write(str(aug_tok)) # //////////////////////////////////////////////////////////// # { Customization Variables # //////////////////////////////////////////////////////////// PUNCTUATION = tuple(";:,.!?") # //////////////////////////////////////////////////////////// # { Annotation Procedures # //////////////////////////////////////////////////////////// def _annotate_second_pass(self, tokens): """ Performs a token-based classification (section 4) over the given tokens, making use of the orthographic heuristic (4.1.1), collocation heuristic (4.1.2) and frequent sentence starter heuristic (4.1.3). """ for token1, token2 in _pair_iter(tokens): self._second_pass_annotation(token1, token2) yield token1 def _second_pass_annotation(self, aug_tok1, aug_tok2): """ Performs token-based classification over a pair of contiguous tokens updating the first. """ # Is it the last token? We can't do anything then. if not aug_tok2: return if not aug_tok1.period_final: # We only care about words ending in periods. return typ = aug_tok1.type_no_period next_typ = aug_tok2.type_no_sentperiod tok_is_initial = aug_tok1.is_initial # [4.1.2. Collocation Heuristic] If there's a # collocation between the word before and after the # period, then label tok as an abbreviation and NOT # a sentence break. Note that collocations with # frequent sentence starters as their second word are # excluded in training. if (typ, next_typ) in self._params.collocations: aug_tok1.sentbreak = False aug_tok1.abbr = True return REASON_KNOWN_COLLOCATION # [4.2. 
Token-Based Reclassification of Abbreviations] If # the token is an abbreviation or an ellipsis, then decide # whether we should *also* classify it as a sentbreak. if (aug_tok1.abbr or aug_tok1.ellipsis) and (not tok_is_initial): # [4.1.1. Orthographic Heuristic] Check if there's # orthogrpahic evidence about whether the next word # starts a sentence or not. is_sent_starter = self._ortho_heuristic(aug_tok2) if is_sent_starter == True: aug_tok1.sentbreak = True return REASON_ABBR_WITH_ORTHOGRAPHIC_HEURISTIC # [4.1.3. Frequent Sentence Starter Heruistic] If the # next word is capitalized, and is a member of the # frequent-sentence-starters list, then label tok as a # sentence break. if aug_tok2.first_upper and next_typ in self._params.sent_starters: aug_tok1.sentbreak = True return REASON_ABBR_WITH_SENTENCE_STARTER # [4.3. Token-Based Detection of Initials and Ordinals] # Check if any initials or ordinals tokens that are marked # as sentbreaks should be reclassified as abbreviations. if tok_is_initial or typ == "##number##": # [4.1.1. Orthographic Heuristic] Check if there's # orthogrpahic evidence about whether the next word # starts a sentence or not. is_sent_starter = self._ortho_heuristic(aug_tok2) if is_sent_starter == False: aug_tok1.sentbreak = False aug_tok1.abbr = True if tok_is_initial: return REASON_INITIAL_WITH_ORTHOGRAPHIC_HEURISTIC return REASON_NUMBER_WITH_ORTHOGRAPHIC_HEURISTIC # Special heuristic for initials: if orthogrpahic # heuristic is unknown, and next word is always # capitalized, then mark as abbrev (eg: J. Bach). if ( is_sent_starter == "unknown" and tok_is_initial and aug_tok2.first_upper and not (self._params.ortho_context[next_typ] & _ORTHO_LC) ): aug_tok1.sentbreak = False aug_tok1.abbr = True return REASON_INITIAL_WITH_SPECIAL_ORTHOGRAPHIC_HEURISTIC return def _ortho_heuristic(self, aug_tok): """ Decide whether the given token is the first token in a sentence. """ # Sentences don't start with punctuation marks: if aug_tok.tok in self.PUNCTUATION: return False ortho_context = self._params.ortho_context[aug_tok.type_no_sentperiod] # If the word is capitalized, occurs at least once with a # lower case first letter, and never occurs with an upper case # first letter sentence-internally, then it's a sentence starter. if ( aug_tok.first_upper and (ortho_context & _ORTHO_LC) and not (ortho_context & _ORTHO_MID_UC) ): return True # If the word is lower case, and either (a) we've seen it used # with upper case, or (b) we've never seen it used # sentence-initially with lower case, then it's not a sentence # starter. if aug_tok.first_lower and ( (ortho_context & _ORTHO_UC) or not (ortho_context & _ORTHO_BEG_LC) ): return False # Otherwise, we're not sure. return "unknown" DEBUG_DECISION_FMT = """Text: {text!r} (at offset {period_index}) Sentence break? {break_decision} ({reason}) Collocation? {collocation} {type1!r}: known abbreviation: {type1_in_abbrs} is initial: {type1_is_initial} {type2!r}: known sentence starter: {type2_is_sent_starter} orthographic heuristic suggests is a sentence starter? 
{type2_ortho_heuristic} orthographic contexts in training: {type2_ortho_contexts} """ def format_debug_decision(d): return DEBUG_DECISION_FMT.format(**d) def demo(text, tok_cls=PunktSentenceTokenizer, train_cls=PunktTrainer): """Builds a punkt model and applies it to the same text""" cleanup = ( lambda s: re.compile(r"(?:\r|^\s+)", re.MULTILINE).sub("", s).replace("\n", " ") ) trainer = train_cls() trainer.INCLUDE_ALL_COLLOCS = True trainer.train(text) sbd = tok_cls(trainer.get_params()) for sentence in sbd.sentences_from_text(text): print(cleanup(sentence)) nltk-3.7/nltk/tokenize/regexp.py000066400000000000000000000173031420073152400170010ustar00rootroot00000000000000# Natural Language Toolkit: Tokenizers # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # Steven Bird # Trevor Cohn # URL: # For license information, see LICENSE.TXT r""" Regular-Expression Tokenizers A ``RegexpTokenizer`` splits a string into substrings using a regular expression. For example, the following tokenizer forms tokens out of alphabetic sequences, money expressions, and any other non-whitespace sequences: >>> from nltk.tokenize import RegexpTokenizer >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks." >>> tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+') >>> tokenizer.tokenize(s) ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.'] A ``RegexpTokenizer`` can use its regexp to match delimiters instead: >>> tokenizer = RegexpTokenizer('\s+', gaps=True) >>> tokenizer.tokenize(s) ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.'] Note that empty tokens are not returned when the delimiter appears at the start or end of the string. The material between the tokens is discarded. For example, the following tokenizer selects just the capitalized words: >>> capword_tokenizer = RegexpTokenizer('[A-Z]\w+') >>> capword_tokenizer.tokenize(s) ['Good', 'New', 'York', 'Please', 'Thanks'] This module contains several subclasses of ``RegexpTokenizer`` that use pre-defined regular expressions. >>> from nltk.tokenize import BlanklineTokenizer >>> # Uses '\s*\n\s*\n\s*': >>> BlanklineTokenizer().tokenize(s) ['Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.', 'Thanks.'] All of the regular expression tokenizers are also available as functions: >>> from nltk.tokenize import regexp_tokenize, wordpunct_tokenize, blankline_tokenize >>> regexp_tokenize(s, pattern='\w+|\$[\d\.]+|\S+') ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.'] >>> wordpunct_tokenize(s) ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.'] >>> blankline_tokenize(s) ['Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.', 'Thanks.'] Caution: The function ``regexp_tokenize()`` takes the text as its first argument, and the regular expression pattern as its second argument. This differs from the conventions used by Python's ``re`` functions, where the pattern is always the first argument. (This is for consistency with the other NLTK tokenizers.) 
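For example (note the text-first argument order):

>>> regexp_tokenize("A half-baked e.g. here", pattern=r'\S+')
['A', 'half-baked', 'e.g.', 'here']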
""" import re from nltk.tokenize.api import TokenizerI from nltk.tokenize.util import regexp_span_tokenize class RegexpTokenizer(TokenizerI): r""" A tokenizer that splits a string using a regular expression, which matches either the tokens or the separators between tokens. >>> tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+') :type pattern: str :param pattern: The pattern used to build this tokenizer. (This pattern must not contain capturing parentheses; Use non-capturing parentheses, e.g. (?:...), instead) :type gaps: bool :param gaps: True if this tokenizer's pattern should be used to find separators between tokens; False if this tokenizer's pattern should be used to find the tokens themselves. :type discard_empty: bool :param discard_empty: True if any empty tokens `''` generated by the tokenizer should be discarded. Empty tokens can only be generated if `_gaps == True`. :type flags: int :param flags: The regexp flags used to compile this tokenizer's pattern. By default, the following flags are used: `re.UNICODE | re.MULTILINE | re.DOTALL`. """ def __init__( self, pattern, gaps=False, discard_empty=True, flags=re.UNICODE | re.MULTILINE | re.DOTALL, ): # If they gave us a regexp object, extract the pattern. pattern = getattr(pattern, "pattern", pattern) self._pattern = pattern self._gaps = gaps self._discard_empty = discard_empty self._flags = flags self._regexp = None def _check_regexp(self): if self._regexp is None: self._regexp = re.compile(self._pattern, self._flags) def tokenize(self, text): self._check_regexp() # If our regexp matches gaps, use re.split: if self._gaps: if self._discard_empty: return [tok for tok in self._regexp.split(text) if tok] else: return self._regexp.split(text) # If our regexp matches tokens, use re.findall: else: return self._regexp.findall(text) def span_tokenize(self, text): self._check_regexp() if self._gaps: for left, right in regexp_span_tokenize(text, self._regexp): if not (self._discard_empty and left == right): yield left, right else: for m in re.finditer(self._regexp, text): yield m.span() def __repr__(self): return "{}(pattern={!r}, gaps={!r}, discard_empty={!r}, flags={!r})".format( self.__class__.__name__, self._pattern, self._gaps, self._discard_empty, self._flags, ) class WhitespaceTokenizer(RegexpTokenizer): r""" Tokenize a string on whitespace (space, tab, newline). In general, users should use the string ``split()`` method instead. >>> from nltk.tokenize import WhitespaceTokenizer >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks." >>> WhitespaceTokenizer().tokenize(s) ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.'] """ def __init__(self): RegexpTokenizer.__init__(self, r"\s+", gaps=True) class BlanklineTokenizer(RegexpTokenizer): """ Tokenize a string, treating any sequence of blank lines as a delimiter. Blank lines are defined as lines containing no characters, except for space or tab characters. """ def __init__(self): RegexpTokenizer.__init__(self, r"\s*\n\s*\n\s*", gaps=True) class WordPunctTokenizer(RegexpTokenizer): r""" Tokenize a text into a sequence of alphabetic and non-alphabetic characters, using the regexp ``\w+|[^\w\s]+``. >>> from nltk.tokenize import WordPunctTokenizer >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks." 
>>> WordPunctTokenizer().tokenize(s) ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.'] """ def __init__(self): RegexpTokenizer.__init__(self, r"\w+|[^\w\s]+") ###################################################################### # { Tokenization Functions ###################################################################### def regexp_tokenize( text, pattern, gaps=False, discard_empty=True, flags=re.UNICODE | re.MULTILINE | re.DOTALL, ): """ Return a tokenized copy of *text*. See :class:`.RegexpTokenizer` for descriptions of the arguments. """ tokenizer = RegexpTokenizer(pattern, gaps, discard_empty, flags) return tokenizer.tokenize(text) blankline_tokenize = BlanklineTokenizer().tokenize wordpunct_tokenize = WordPunctTokenizer().tokenize nltk-3.7/nltk/tokenize/repp.py000066400000000000000000000176401420073152400164610ustar00rootroot00000000000000# Natural Language Toolkit: Interface to the Repp Tokenizer # # Copyright (C) 2001-2015 NLTK Project # Authors: Rebecca Dridan and Stephan Oepen # Contributors: Liling Tan # # URL: # For license information, see LICENSE.TXT import os import re import subprocess import sys import tempfile from nltk.data import ZipFilePathPointer from nltk.internals import find_dir from nltk.tokenize.api import TokenizerI class ReppTokenizer(TokenizerI): """ A class for word tokenization using the REPP parser described in Rebecca Dridan and Stephan Oepen (2012) Tokenization: Returning to a Long Solved Problem - A Survey, Contrastive Experiment, Recommendations, and Toolkit. In ACL. http://anthology.aclweb.org/P/P12/P12-2.pdf#page=406 >>> sents = ['Tokenization is widely regarded as a solved problem due to the high accuracy that rulebased tokenizers achieve.' , ... 'But rule-based tokenizers are hard to maintain and their rules language specific.' , ... 'We evaluated our method on three languages and obtained error rates of 0.27% (English), 0.35% (Dutch) and 0.76% (Italian) for our best models.' ... ] >>> tokenizer = ReppTokenizer('/home/alvas/repp/') # doctest: +SKIP >>> for sent in sents: # doctest: +SKIP ... tokenizer.tokenize(sent) # doctest: +SKIP ... (u'Tokenization', u'is', u'widely', u'regarded', u'as', u'a', u'solved', u'problem', u'due', u'to', u'the', u'high', u'accuracy', u'that', u'rulebased', u'tokenizers', u'achieve', u'.') (u'But', u'rule-based', u'tokenizers', u'are', u'hard', u'to', u'maintain', u'and', u'their', u'rules', u'language', u'specific', u'.') (u'We', u'evaluated', u'our', u'method', u'on', u'three', u'languages', u'and', u'obtained', u'error', u'rates', u'of', u'0.27', u'%', u'(', u'English', u')', u',', u'0.35', u'%', u'(', u'Dutch', u')', u'and', u'0.76', u'%', u'(', u'Italian', u')', u'for', u'our', u'best', u'models', u'.') >>> for sent in tokenizer.tokenize_sents(sents): # doctest: +SKIP ... print(sent) # doctest: +SKIP ... 
(u'Tokenization', u'is', u'widely', u'regarded', u'as', u'a', u'solved', u'problem', u'due', u'to', u'the', u'high', u'accuracy', u'that', u'rulebased', u'tokenizers', u'achieve', u'.') (u'But', u'rule-based', u'tokenizers', u'are', u'hard', u'to', u'maintain', u'and', u'their', u'rules', u'language', u'specific', u'.') (u'We', u'evaluated', u'our', u'method', u'on', u'three', u'languages', u'and', u'obtained', u'error', u'rates', u'of', u'0.27', u'%', u'(', u'English', u')', u',', u'0.35', u'%', u'(', u'Dutch', u')', u'and', u'0.76', u'%', u'(', u'Italian', u')', u'for', u'our', u'best', u'models', u'.') >>> for sent in tokenizer.tokenize_sents(sents, keep_token_positions=True): # doctest: +SKIP ... print(sent) # doctest: +SKIP ... [(u'Tokenization', 0, 12), (u'is', 13, 15), (u'widely', 16, 22), (u'regarded', 23, 31), (u'as', 32, 34), (u'a', 35, 36), (u'solved', 37, 43), (u'problem', 44, 51), (u'due', 52, 55), (u'to', 56, 58), (u'the', 59, 62), (u'high', 63, 67), (u'accuracy', 68, 76), (u'that', 77, 81), (u'rulebased', 82, 91), (u'tokenizers', 92, 102), (u'achieve', 103, 110), (u'.', 110, 111)] [(u'But', 0, 3), (u'rule-based', 4, 14), (u'tokenizers', 15, 25), (u'are', 26, 29), (u'hard', 30, 34), (u'to', 35, 37), (u'maintain', 38, 46), (u'and', 47, 50), (u'their', 51, 56), (u'rules', 57, 62), (u'language', 63, 71), (u'specific', 72, 80), (u'.', 80, 81)] [(u'We', 0, 2), (u'evaluated', 3, 12), (u'our', 13, 16), (u'method', 17, 23), (u'on', 24, 26), (u'three', 27, 32), (u'languages', 33, 42), (u'and', 43, 46), (u'obtained', 47, 55), (u'error', 56, 61), (u'rates', 62, 67), (u'of', 68, 70), (u'0.27', 71, 75), (u'%', 75, 76), (u'(', 77, 78), (u'English', 78, 85), (u')', 85, 86), (u',', 86, 87), (u'0.35', 88, 92), (u'%', 92, 93), (u'(', 94, 95), (u'Dutch', 95, 100), (u')', 100, 101), (u'and', 102, 105), (u'0.76', 106, 110), (u'%', 110, 111), (u'(', 112, 113), (u'Italian', 113, 120), (u')', 120, 121), (u'for', 122, 125), (u'our', 126, 129), (u'best', 130, 134), (u'models', 135, 141), (u'.', 141, 142)] """ def __init__(self, repp_dir, encoding="utf8"): self.repp_dir = self.find_repptokenizer(repp_dir) # Set a directory to store the temporary files. self.working_dir = tempfile.gettempdir() # Set an encoding for the input strings. self.encoding = encoding def tokenize(self, sentence): """ Use Repp to tokenize a single sentence. :param sentence: A single sentence string. :type sentence: str :return: A tuple of tokens. :rtype: tuple(str) """ return next(self.tokenize_sents([sentence])) def tokenize_sents(self, sentences, keep_token_positions=False): """ Tokenize multiple sentences using Repp. :param sentences: A list of sentence strings. :type sentences: list(str) :return: A list of tuples of tokens :rtype: iter(tuple(str)) """ with tempfile.NamedTemporaryFile( prefix="repp_input.", dir=self.working_dir, mode="w", delete=False ) as input_file: # Write sentences to temporary input file. for sent in sentences: input_file.write(str(sent) + "\n") input_file.close() # Generate command to run REPP. cmd = self.generate_repp_command(input_file.name) # Decode the stdout and strips the ending newline. repp_output = self._execute(cmd).decode(self.encoding).strip() for tokenized_sent in self.parse_repp_outputs(repp_output): if not keep_token_positions: # Removes token position information. tokenized_sent, starts, ends = zip(*tokenized_sent) yield tokenized_sent def generate_repp_command(self, inputfilename): """ This module generates the REPP command to be used at the terminal. 
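        A hedged sketch of the resulting command list (paths follow the
        hypothetical installation used in the class docstring above; the input
        filename is a temporary file)::

            ['/home/alvas/repp/src/repp', '-c', '/home/alvas/repp/erg/repp.set',
             '--format', 'triple', '/tmp/repp_input.xxxxx']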
:param inputfilename: path to the input file :type inputfilename: str """ cmd = [self.repp_dir + "/src/repp"] cmd += ["-c", self.repp_dir + "/erg/repp.set"] cmd += ["--format", "triple"] cmd += [inputfilename] return cmd @staticmethod def _execute(cmd): p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdout, stderr = p.communicate() return stdout @staticmethod def parse_repp_outputs(repp_output): """ This module parses the tri-tuple format that REPP outputs using the "--format triple" option and returns an generator with tuple of string tokens. :param repp_output: :type repp_output: type :return: an iterable of the tokenized sentences as tuples of strings :rtype: iter(tuple) """ line_regex = re.compile(r"^\((\d+), (\d+), (.+)\)$", re.MULTILINE) for section in repp_output.split("\n\n"): words_with_positions = [ (token, int(start), int(end)) for start, end, token in line_regex.findall(section) ] words = tuple(t[2] for t in words_with_positions) yield words_with_positions def find_repptokenizer(self, repp_dirname): """ A module to find REPP tokenizer binary and its *repp.set* config file. """ if os.path.exists(repp_dirname): # If a full path is given. _repp_dir = repp_dirname else: # Try to find path to REPP directory in environment variables. _repp_dir = find_dir(repp_dirname, env_vars=("REPP_TOKENIZER",)) # Checks for the REPP binary and erg/repp.set config file. assert os.path.exists(_repp_dir + "/src/repp") assert os.path.exists(_repp_dir + "/erg/repp.set") return _repp_dir nltk-3.7/nltk/tokenize/sexpr.py000066400000000000000000000120521420073152400166440ustar00rootroot00000000000000# Natural Language Toolkit: Tokenizers # # Copyright (C) 2001-2022 NLTK Project # Author: Yoav Goldberg # Steven Bird (minor edits) # URL: # For license information, see LICENSE.TXT """ S-Expression Tokenizer ``SExprTokenizer`` is used to find parenthesized expressions in a string. In particular, it divides a string into a sequence of substrings that are either parenthesized expressions (including any nested parenthesized expressions), or other whitespace-separated tokens. >>> from nltk.tokenize import SExprTokenizer >>> SExprTokenizer().tokenize('(a b (c d)) e f (g)') ['(a b (c d))', 'e', 'f', '(g)'] By default, `SExprTokenizer` will raise a ``ValueError`` exception if used to tokenize an expression with non-matching parentheses: >>> SExprTokenizer().tokenize('c) d) e (f (g') Traceback (most recent call last): ... ValueError: Un-matched close paren at char 1 The ``strict`` argument can be set to False to allow for non-matching parentheses. Any unmatched close parentheses will be listed as their own s-expression; and the last partial sexpr with unmatched open parentheses will be listed as its own sexpr: >>> SExprTokenizer(strict=False).tokenize('c) d) e (f (g') ['c', ')', 'd', ')', 'e', '(f (g'] The characters used for open and close parentheses may be customized using the ``parens`` argument to the `SExprTokenizer` constructor: >>> SExprTokenizer(parens='{}').tokenize('{a b {c d}} e f {g}') ['{a b {c d}}', 'e', 'f', '{g}'] The s-expression tokenizer is also available as a function: >>> from nltk.tokenize import sexpr_tokenize >>> sexpr_tokenize('(a b (c d)) e f (g)') ['(a b (c d))', 'e', 'f', '(g)'] """ import re from nltk.tokenize.api import TokenizerI class SExprTokenizer(TokenizerI): """ A tokenizer that divides strings into s-expressions. 
An s-expresion can be either: - a parenthesized expression, including any nested parenthesized expressions, or - a sequence of non-whitespace non-parenthesis characters. For example, the string ``(a (b c)) d e (f)`` consists of four s-expressions: ``(a (b c))``, ``d``, ``e``, and ``(f)``. By default, the characters ``(`` and ``)`` are treated as open and close parentheses, but alternative strings may be specified. :param parens: A two-element sequence specifying the open and close parentheses that should be used to find sexprs. This will typically be either a two-character string, or a list of two strings. :type parens: str or list :param strict: If true, then raise an exception when tokenizing an ill-formed sexpr. """ def __init__(self, parens="()", strict=True): if len(parens) != 2: raise ValueError("parens must contain exactly two strings") self._strict = strict self._open_paren = parens[0] self._close_paren = parens[1] self._paren_regexp = re.compile( f"{re.escape(parens[0])}|{re.escape(parens[1])}" ) def tokenize(self, text): """ Return a list of s-expressions extracted from *text*. For example: >>> SExprTokenizer().tokenize('(a b (c d)) e f (g)') ['(a b (c d))', 'e', 'f', '(g)'] All parentheses are assumed to mark s-expressions. (No special processing is done to exclude parentheses that occur inside strings, or following backslash characters.) If the given expression contains non-matching parentheses, then the behavior of the tokenizer depends on the ``strict`` parameter to the constructor. If ``strict`` is ``True``, then raise a ``ValueError``. If ``strict`` is ``False``, then any unmatched close parentheses will be listed as their own s-expression; and the last partial s-expression with unmatched open parentheses will be listed as its own s-expression: >>> SExprTokenizer(strict=False).tokenize('c) d) e (f (g') ['c', ')', 'd', ')', 'e', '(f (g'] :param text: the string to be tokenized :type text: str or iter(str) :rtype: iter(str) """ result = [] pos = 0 depth = 0 for m in self._paren_regexp.finditer(text): paren = m.group() if depth == 0: result += text[pos : m.start()].split() pos = m.start() if paren == self._open_paren: depth += 1 if paren == self._close_paren: if self._strict and depth == 0: raise ValueError("Un-matched close paren at char %d" % m.start()) depth = max(0, depth - 1) if depth == 0: result.append(text[pos : m.end()]) pos = m.end() if self._strict and depth > 0: raise ValueError("Un-matched open paren at char %d" % pos) if pos < len(text): result.append(text[pos:]) return result sexpr_tokenize = SExprTokenizer().tokenize nltk-3.7/nltk/tokenize/simple.py000066400000000000000000000116641420073152400170040ustar00rootroot00000000000000# Natural Language Toolkit: Simple Tokenizers # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # Steven Bird # URL: # For license information, see LICENSE.TXT r""" Simple Tokenizers These tokenizers divide strings into substrings using the string ``split()`` method. When tokenizing using a particular delimiter string, use the string ``split()`` method directly, as this is more efficient. The simple tokenizers are *not* available as separate functions; instead, you should just use the string ``split()`` method directly: >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks." 
>>> s.split() ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.'] >>> s.split(' ') ['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '', 'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.'] >>> s.split('\n') ['Good muffins cost $3.88', 'in New York. Please buy me', 'two of them.', '', 'Thanks.'] The simple tokenizers are mainly useful because they follow the standard ``TokenizerI`` interface, and so can be used with any code that expects a tokenizer. For example, these tokenizers can be used to specify the tokenization conventions when building a `CorpusReader`. """ from nltk.tokenize.api import StringTokenizer, TokenizerI from nltk.tokenize.util import regexp_span_tokenize, string_span_tokenize class SpaceTokenizer(StringTokenizer): r"""Tokenize a string using the space character as a delimiter, which is the same as ``s.split(' ')``. >>> from nltk.tokenize import SpaceTokenizer >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks." >>> SpaceTokenizer().tokenize(s) ['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '', 'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.'] """ _string = " " class TabTokenizer(StringTokenizer): r"""Tokenize a string use the tab character as a delimiter, the same as ``s.split('\t')``. >>> from nltk.tokenize import TabTokenizer >>> TabTokenizer().tokenize('a\tb c\n\t d') ['a', 'b c\n', ' d'] """ _string = "\t" class CharTokenizer(StringTokenizer): """Tokenize a string into individual characters. If this functionality is ever required directly, use ``for char in string``. """ def tokenize(self, s): return list(s) def span_tokenize(self, s): yield from enumerate(range(1, len(s) + 1)) class LineTokenizer(TokenizerI): r"""Tokenize a string into its lines, optionally discarding blank lines. This is similar to ``s.split('\n')``. >>> from nltk.tokenize import LineTokenizer >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks." >>> LineTokenizer(blanklines='keep').tokenize(s) ['Good muffins cost $3.88', 'in New York. Please buy me', 'two of them.', '', 'Thanks.'] >>> # same as [l for l in s.split('\n') if l.strip()]: >>> LineTokenizer(blanklines='discard').tokenize(s) ['Good muffins cost $3.88', 'in New York. Please buy me', 'two of them.', 'Thanks.'] :param blanklines: Indicates how blank lines should be handled. Valid values are: - ``discard``: strip blank lines out of the token list before returning it. A line is considered blank if it contains only whitespace characters. - ``keep``: leave all blank lines in the token list. - ``discard-eof``: if the string ends with a newline, then do not generate a corresponding token ``''`` after that newline. """ def __init__(self, blanklines="discard"): valid_blanklines = ("discard", "keep", "discard-eof") if blanklines not in valid_blanklines: raise ValueError( "Blank lines must be one of: %s" % " ".join(valid_blanklines) ) self._blanklines = blanklines def tokenize(self, s): lines = s.splitlines() # If requested, strip off blank lines. 
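# Added sketch (input strings are hypothetical): with a trailing blank line,
# only the 'discard-eof' policy drops the final empty token:
#   LineTokenizer('keep').tokenize('a\nb\n\n')        -> ['a', 'b', '']
#   LineTokenizer('discard').tokenize('a\nb\n\n')     -> ['a', 'b']
#   LineTokenizer('discard-eof').tokenize('a\nb\n\n') -> ['a', 'b']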
if self._blanklines == "discard": lines = [l for l in lines if l.rstrip()] elif self._blanklines == "discard-eof": if lines and not lines[-1].strip(): lines.pop() return lines # discard-eof not implemented def span_tokenize(self, s): if self._blanklines == "keep": yield from string_span_tokenize(s, r"\n") else: yield from regexp_span_tokenize(s, r"\n(\s+\n)*") ###################################################################### # { Tokenization Functions ###################################################################### # XXX: it is stated in module docs that there is no function versions def line_tokenize(text, blanklines="discard"): return LineTokenizer(blanklines).tokenize(text) nltk-3.7/nltk/tokenize/sonority_sequencing.py000066400000000000000000000163651420073152400216250ustar00rootroot00000000000000# Natural Language Toolkit: Tokenizers # # Copyright (C) 2001-2022 NLTK Project # Author: Christopher Hench # Alex Estes # URL: # For license information, see LICENSE.TXT """ The Sonority Sequencing Principle (SSP) is a language agnostic algorithm proposed by Otto Jesperson in 1904. The sonorous quality of a phoneme is judged by the openness of the lips. Syllable breaks occur before troughs in sonority. For more on the SSP see Selkirk (1984). The default implementation uses the English alphabet, but the `sonority_hiearchy` can be modified to IPA or any other alphabet for the use-case. The SSP is a universal syllabification algorithm, but that does not mean it performs equally across languages. Bartlett et al. (2009) is a good benchmark for English accuracy if utilizing IPA (pg. 311). Importantly, if a custom hierarchy is supplied and vowels span across more than one level, they should be given separately to the `vowels` class attribute. References: - Otto Jespersen. 1904. Lehrbuch der Phonetik. Leipzig, Teubner. Chapter 13, Silbe, pp. 185-203. - Elisabeth Selkirk. 1984. On the major class features and syllable theory. In Aronoff & Oehrle (eds.) Language Sound Structure: Studies in Phonology. Cambridge, MIT Press. pp. 107-136. - Susan Bartlett, et al. 2009. On the Syllabification of Phonemes. In HLT-NAACL. pp. 308-316. """ import re import warnings from string import punctuation from nltk.tokenize.api import TokenizerI from nltk.util import ngrams class SyllableTokenizer(TokenizerI): """ Syllabifies words based on the Sonority Sequencing Principle (SSP). >>> from nltk.tokenize import SyllableTokenizer >>> from nltk import word_tokenize >>> SSP = SyllableTokenizer() >>> SSP.tokenize('justification') ['jus', 'ti', 'fi', 'ca', 'tion'] >>> text = "This is a foobar-like sentence." >>> [SSP.tokenize(token) for token in word_tokenize(text)] [['This'], ['is'], ['a'], ['foo', 'bar', '-', 'li', 'ke'], ['sen', 'ten', 'ce'], ['.']] """ def __init__(self, lang="en", sonority_hierarchy=False): """ :param lang: Language parameter, default is English, 'en' :type lang: str :param sonority_hierarchy: Sonority hierarchy according to the Sonority Sequencing Principle. :type sonority_hierarchy: list(str) """ # Sonority hierarchy should be provided in descending order. # If vowels are spread across multiple levels, they should be # passed assigned self.vowels var together, otherwise should be # placed in first index of hierarchy. if not sonority_hierarchy and lang == "en": sonority_hierarchy = [ "aeiouy", # vowels. "lmnrw", # nasals. "zvsf", # fricatives. "bcdgtkpqxhj", # stops. 
] self.vowels = sonority_hierarchy[0] self.phoneme_map = {} for i, level in enumerate(sonority_hierarchy): for c in level: sonority_level = len(sonority_hierarchy) - i self.phoneme_map[c] = sonority_level self.phoneme_map[c.upper()] = sonority_level def assign_values(self, token): """ Assigns each phoneme its value from the sonority hierarchy. Note: Sentence/text has to be tokenized first. :param token: Single word or token :type token: str :return: List of tuples, first element is character/phoneme and second is the soronity value. :rtype: list(tuple(str, int)) """ syllables_values = [] for c in token: try: syllables_values.append((c, self.phoneme_map[c])) except KeyError: if c not in punctuation: warnings.warn( "Character not defined in sonority_hierarchy," " assigning as vowel: '{}'".format(c) ) syllables_values.append((c, max(self.phoneme_map.values()))) self.vowels += c else: # If it's a punctuation, assign -1. syllables_values.append((c, -1)) return syllables_values def validate_syllables(self, syllable_list): """ Ensures each syllable has at least one vowel. If the following syllable doesn't have vowel, add it to the current one. :param syllable_list: Single word or token broken up into syllables. :type syllable_list: list(str) :return: Single word or token broken up into syllables (with added syllables if necessary) :rtype: list(str) """ valid_syllables = [] front = "" for i, syllable in enumerate(syllable_list): if syllable in punctuation: valid_syllables.append(syllable) continue if not re.search("|".join(self.vowels), syllable): if len(valid_syllables) == 0: front += syllable else: valid_syllables = valid_syllables[:-1] + [ valid_syllables[-1] + syllable ] else: if len(valid_syllables) == 0: valid_syllables.append(front + syllable) else: valid_syllables.append(syllable) return valid_syllables def tokenize(self, token): """ Apply the SSP to return a list of syllables. Note: Sentence/text has to be tokenized first. :param token: Single word or token :type token: str :return syllable_list: Single word or token broken up into syllables. :rtype: list(str) """ # assign values from hierarchy syllables_values = self.assign_values(token) # if only one vowel return word if sum(token.count(x) for x in self.vowels) <= 1: return [token] syllable_list = [] syllable = syllables_values[0][0] # start syllable with first phoneme for trigram in ngrams(syllables_values, n=3): phonemes, values = zip(*trigram) # Sonority of previous, focal and following phoneme prev_value, focal_value, next_value = values # Focal phoneme. focal_phoneme = phonemes[1] # These cases trigger syllable break. if focal_value == -1: # If it's a punctuation, just break. 
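# Added commentary (a sketch of the three branches handled in this loop):
# for each trigram of sonority values (prev, focal, next), punctuation
# (focal == -1) always ends the current syllable; a non-rising plateau
# (prev >= focal == next) closes the syllable after the focal phoneme;
# a trough (prev > focal < next) starts a new syllable at the focal phoneme.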
syllable_list.append(syllable) syllable_list.append(focal_phoneme) syllable = "" elif prev_value >= focal_value == next_value: syllable += focal_phoneme syllable_list.append(syllable) syllable = "" elif prev_value > focal_value < next_value: syllable_list.append(syllable) syllable = "" syllable += focal_phoneme # no syllable break else: syllable += focal_phoneme syllable += syllables_values[-1][0] # append last phoneme syllable_list.append(syllable) return self.validate_syllables(syllable_list) nltk-3.7/nltk/tokenize/stanford.py000066400000000000000000000072161420073152400173310ustar00rootroot00000000000000# Natural Language Toolkit: Interface to the Stanford Tokenizer # # Copyright (C) 2001-2022 NLTK Project # Author: Steven Xu # # URL: # For license information, see LICENSE.TXT import json import os import tempfile import warnings from subprocess import PIPE from nltk.internals import _java_options, config_java, find_jar, java from nltk.parse.corenlp import CoreNLPParser from nltk.tokenize.api import TokenizerI _stanford_url = "https://nlp.stanford.edu/software/tokenizer.shtml" class StanfordTokenizer(TokenizerI): r""" Interface to the Stanford Tokenizer >>> from nltk.tokenize.stanford import StanfordTokenizer >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks." >>> StanfordTokenizer().tokenize(s) ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.'] >>> s = "The colour of the wall is blue." >>> StanfordTokenizer(options={"americanize": True}).tokenize(s) ['The', 'color', 'of', 'the', 'wall', 'is', 'blue', '.'] """ _JAR = "stanford-postagger.jar" def __init__( self, path_to_jar=None, encoding="utf8", options=None, verbose=False, java_options="-mx1000m", ): # Raise deprecation warning. warnings.warn( str( "\nThe StanfordTokenizer will " "be deprecated in version 3.2.5.\n" "Please use \033[91mnltk.parse.corenlp.CoreNLPParser\033[0m instead.'" ), DeprecationWarning, stacklevel=2, ) self._stanford_jar = find_jar( self._JAR, path_to_jar, env_vars=("STANFORD_POSTAGGER",), searchpath=(), url=_stanford_url, verbose=verbose, ) self._encoding = encoding self.java_options = java_options options = {} if options is None else options self._options_cmd = ",".join(f"{key}={val}" for key, val in options.items()) @staticmethod def _parse_tokenized_output(s): return s.splitlines() def tokenize(self, s): """ Use stanford tokenizer's PTBTokenizer to tokenize multiple sentences. """ cmd = ["edu.stanford.nlp.process.PTBTokenizer"] return self._parse_tokenized_output(self._execute(cmd, s)) def _execute(self, cmd, input_, verbose=False): encoding = self._encoding cmd.extend(["-charset", encoding]) _options_cmd = self._options_cmd if _options_cmd: cmd.extend(["-options", self._options_cmd]) default_options = " ".join(_java_options) # Configure java. config_java(options=self.java_options, verbose=verbose) # Windows is incompatible with NamedTemporaryFile() without passing in delete=False. with tempfile.NamedTemporaryFile(mode="wb", delete=False) as input_file: # Write the actual sentences to the temporary input file if isinstance(input_, str) and encoding: input_ = input_.encode(encoding) input_file.write(input_) input_file.flush() cmd.append(input_file.name) # Run the tagger and get the output. stdout, stderr = java( cmd, classpath=self._stanford_jar, stdout=PIPE, stderr=PIPE ) stdout = stdout.decode(encoding) os.unlink(input_file.name) # Return java configurations to their default values. 
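# (Added note: config_java mutates module-level state in nltk.internals,
# so restoring the defaults here keeps later java() calls unaffected.)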
config_java(options=default_options, verbose=False) return stdout nltk-3.7/nltk/tokenize/stanford_segmenter.py000066400000000000000000000224101420073152400213730ustar00rootroot00000000000000#!/usr/bin/env python # Natural Language Toolkit: Interface to the Stanford Segmenter # for Chinese and Arabic # # Copyright (C) 2001-2022 NLTK Project # Author: 52nlp <52nlpcn@gmail.com> # Casper Lehmann-Strøm # Alex Constantin # # URL: # For license information, see LICENSE.TXT import json import os import tempfile import warnings from subprocess import PIPE from nltk.internals import ( _java_options, config_java, find_dir, find_file, find_jar, java, ) from nltk.tokenize.api import TokenizerI _stanford_url = "https://nlp.stanford.edu/software" class StanfordSegmenter(TokenizerI): """Interface to the Stanford Segmenter If stanford-segmenter version is older than 2016-10-31, then path_to_slf4j should be provieded, for example:: seg = StanfordSegmenter(path_to_slf4j='/YOUR_PATH/slf4j-api.jar') >>> from nltk.tokenize.stanford_segmenter import StanfordSegmenter >>> seg = StanfordSegmenter() >>> seg.default_config('zh') >>> sent = u'这是斯坦福中文分词器测试' >>> print(seg.segment(sent)) \u8fd9 \u662f \u65af\u5766\u798f \u4e2d\u6587 \u5206\u8bcd\u5668 \u6d4b\u8bd5 >>> seg.default_config('ar') >>> sent = u'هذا هو تصنيف ستانفورد العربي للكلمات' >>> print(seg.segment(sent.split())) \u0647\u0630\u0627 \u0647\u0648 \u062a\u0635\u0646\u064a\u0641 \u0633\u062a\u0627\u0646\u0641\u0648\u0631\u062f \u0627\u0644\u0639\u0631\u0628\u064a \u0644 \u0627\u0644\u0643\u0644\u0645\u0627\u062a """ _JAR = "stanford-segmenter.jar" def __init__( self, path_to_jar=None, path_to_slf4j=None, java_class=None, path_to_model=None, path_to_dict=None, path_to_sihan_corpora_dict=None, sihan_post_processing="false", keep_whitespaces="false", encoding="UTF-8", options=None, verbose=False, java_options="-mx2g", ): # Raise deprecation warning. warnings.simplefilter("always", DeprecationWarning) warnings.warn( str( "\nThe StanfordTokenizer will " "be deprecated in version 3.2.5.\n" "Please use \033[91mnltk.parse.corenlp.CoreNLPTokenizer\033[0m instead.'" ), DeprecationWarning, stacklevel=2, ) warnings.simplefilter("ignore", DeprecationWarning) stanford_segmenter = find_jar( self._JAR, path_to_jar, env_vars=("STANFORD_SEGMENTER",), searchpath=(), url=_stanford_url, verbose=verbose, ) if path_to_slf4j is not None: slf4j = find_jar( "slf4j-api.jar", path_to_slf4j, env_vars=("SLF4J", "STANFORD_SEGMENTER"), searchpath=(), url=_stanford_url, verbose=verbose, ) else: slf4j = None # This is passed to java as the -cp option, the old version of segmenter needs slf4j. 
# The new version of stanford-segmenter-2016-10-31 doesn't need slf4j self._stanford_jar = os.pathsep.join( _ for _ in [stanford_segmenter, slf4j] if _ is not None ) self._java_class = java_class self._model = path_to_model self._sihan_corpora_dict = path_to_sihan_corpora_dict self._sihan_post_processing = sihan_post_processing self._keep_whitespaces = keep_whitespaces self._dict = path_to_dict self._encoding = encoding self.java_options = java_options options = {} if options is None else options self._options_cmd = ",".join( f"{key}={json.dumps(val)}" for key, val in options.items() ) def default_config(self, lang): """ Attempt to initialize Stanford Word Segmenter for the specified language using the STANFORD_SEGMENTER and STANFORD_MODELS environment variables """ search_path = () if os.environ.get("STANFORD_SEGMENTER"): search_path = {os.path.join(os.environ.get("STANFORD_SEGMENTER"), "data")} # init for Chinese-specific files self._dict = None self._sihan_corpora_dict = None self._sihan_post_processing = "false" if lang == "ar": self._java_class = ( "edu.stanford.nlp.international.arabic.process.ArabicSegmenter" ) model = "arabic-segmenter-atb+bn+arztrain.ser.gz" elif lang == "zh": self._java_class = "edu.stanford.nlp.ie.crf.CRFClassifier" model = "pku.gz" self._sihan_post_processing = "true" path_to_dict = "dict-chris6.ser.gz" try: self._dict = find_file( path_to_dict, searchpath=search_path, url=_stanford_url, verbose=False, env_vars=("STANFORD_MODELS",), ) except LookupError as e: raise LookupError( "Could not find '%s' (tried using env. " "variables STANFORD_MODELS and /data/)" % path_to_dict ) from e sihan_dir = "./data/" try: path_to_sihan_dir = find_dir( sihan_dir, url=_stanford_url, verbose=False, env_vars=("STANFORD_SEGMENTER",), ) self._sihan_corpora_dict = os.path.join(path_to_sihan_dir, sihan_dir) except LookupError as e: raise LookupError( "Could not find '%s' (tried using the " "STANFORD_SEGMENTER environment variable)" % sihan_dir ) from e else: raise LookupError(f"Unsupported language {lang}") try: self._model = find_file( model, searchpath=search_path, url=_stanford_url, verbose=False, env_vars=("STANFORD_MODELS", "STANFORD_SEGMENTER"), ) except LookupError as e: raise LookupError( "Could not find '%s' (tried using env. 
" "variables STANFORD_MODELS and /data/)" % model ) from e def tokenize(self, s): super().tokenize(s) def segment_file(self, input_file_path): """ """ cmd = [ self._java_class, "-loadClassifier", self._model, "-keepAllWhitespaces", self._keep_whitespaces, "-textFile", input_file_path, ] if self._sihan_corpora_dict is not None: cmd.extend( [ "-serDictionary", self._dict, "-sighanCorporaDict", self._sihan_corpora_dict, "-sighanPostProcessing", self._sihan_post_processing, ] ) stdout = self._execute(cmd) return stdout def segment(self, tokens): return self.segment_sents([tokens]) def segment_sents(self, sentences): """ """ encoding = self._encoding # Create a temporary input file _input_fh, self._input_file_path = tempfile.mkstemp(text=True) # Write the actural sentences to the temporary input file _input_fh = os.fdopen(_input_fh, "wb") _input = "\n".join(" ".join(x) for x in sentences) if isinstance(_input, str) and encoding: _input = _input.encode(encoding) _input_fh.write(_input) _input_fh.close() cmd = [ self._java_class, "-loadClassifier", self._model, "-keepAllWhitespaces", self._keep_whitespaces, "-textFile", self._input_file_path, ] if self._sihan_corpora_dict is not None: cmd.extend( [ "-serDictionary", self._dict, "-sighanCorporaDict", self._sihan_corpora_dict, "-sighanPostProcessing", self._sihan_post_processing, ] ) stdout = self._execute(cmd) # Delete the temporary file os.unlink(self._input_file_path) return stdout def _execute(self, cmd, verbose=False): encoding = self._encoding cmd.extend(["-inputEncoding", encoding]) _options_cmd = self._options_cmd if _options_cmd: cmd.extend(["-options", self._options_cmd]) default_options = " ".join(_java_options) # Configure java. config_java(options=self.java_options, verbose=verbose) stdout, _stderr = java( cmd, classpath=self._stanford_jar, stdout=PIPE, stderr=PIPE ) stdout = stdout.decode(encoding) # Return java configurations to their default values. config_java(options=default_options, verbose=False) return stdout nltk-3.7/nltk/tokenize/texttiling.py000066400000000000000000000402211420073152400176750ustar00rootroot00000000000000# Natural Language Toolkit: TextTiling # # Copyright (C) 2001-2022 NLTK Project # Author: George Boutsioukis # # URL: # For license information, see LICENSE.TXT import math import re try: import numpy except ImportError: pass from nltk.tokenize.api import TokenizerI BLOCK_COMPARISON, VOCABULARY_INTRODUCTION = 0, 1 LC, HC = 0, 1 DEFAULT_SMOOTHING = [0] class TextTilingTokenizer(TokenizerI): """Tokenize a document into topical sections using the TextTiling algorithm. This algorithm detects subtopic shifts based on the analysis of lexical co-occurrence patterns. The process starts by tokenizing the text into pseudosentences of a fixed size w. Then, depending on the method used, similarity scores are assigned at sentence gaps. The algorithm proceeds by detecting the peak differences between these scores and marking them as boundaries. The boundaries are normalized to the closest paragraph break and the segmented text is returned. :param w: Pseudosentence size :type w: int :param k: Size (in sentences) of the block used in the block comparison method :type k: int :param similarity_method: The method used for determining similarity scores: `BLOCK_COMPARISON` (default) or `VOCABULARY_INTRODUCTION`. 
:type similarity_method: constant :param stopwords: A list of stopwords that are filtered out (defaults to NLTK's stopwords corpus) :type stopwords: list(str) :param smoothing_method: The method used for smoothing the score plot: `DEFAULT_SMOOTHING` (default) :type smoothing_method: constant :param smoothing_width: The width of the window used by the smoothing method :type smoothing_width: int :param smoothing_rounds: The number of smoothing passes :type smoothing_rounds: int :param cutoff_policy: The policy used to determine the number of boundaries: `HC` (default) or `LC` :type cutoff_policy: constant >>> from nltk.corpus import brown >>> tt = TextTilingTokenizer(demo_mode=True) >>> text = brown.raw()[:4000] >>> s, ss, d, b = tt.tokenize(text) >>> b [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0] """ def __init__( self, w=20, k=10, similarity_method=BLOCK_COMPARISON, stopwords=None, smoothing_method=DEFAULT_SMOOTHING, smoothing_width=2, smoothing_rounds=1, cutoff_policy=HC, demo_mode=False, ): if stopwords is None: from nltk.corpus import stopwords stopwords = stopwords.words("english") self.__dict__.update(locals()) del self.__dict__["self"] def tokenize(self, text): """Return a tokenized copy of *text*, where each "token" represents a separate topic.""" lowercase_text = text.lower() paragraph_breaks = self._mark_paragraph_breaks(text) text_length = len(lowercase_text) # Tokenization step starts here # Remove punctuation nopunct_text = "".join( c for c in lowercase_text if re.match(r"[a-z\-' \n\t]", c) ) nopunct_par_breaks = self._mark_paragraph_breaks(nopunct_text) tokseqs = self._divide_to_tokensequences(nopunct_text) # The morphological stemming step mentioned in the TextTile # paper is not implemented. A comment in the original C # implementation states that it offers no benefit to the # process. It might be interesting to test the existing # stemmers though. 
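# Added overview (sketch of the remainder of this method): filter stopwords,
# build the token table, score the gaps between blocks of k pseudosentences,
# smooth the scores, convert them to depth scores, and finally snap the
# detected boundaries to the nearest paragraph break.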
# words = _stem_words(words) # Filter stopwords for ts in tokseqs: ts.wrdindex_list = [ wi for wi in ts.wrdindex_list if wi[0] not in self.stopwords ] token_table = self._create_token_table(tokseqs, nopunct_par_breaks) # End of the Tokenization step # Lexical score determination if self.similarity_method == BLOCK_COMPARISON: gap_scores = self._block_comparison(tokseqs, token_table) elif self.similarity_method == VOCABULARY_INTRODUCTION: raise NotImplementedError("Vocabulary introduction not implemented") else: raise ValueError( f"Similarity method {self.similarity_method} not recognized" ) if self.smoothing_method == DEFAULT_SMOOTHING: smooth_scores = self._smooth_scores(gap_scores) else: raise ValueError(f"Smoothing method {self.smoothing_method} not recognized") # End of Lexical score Determination # Boundary identification depth_scores = self._depth_scores(smooth_scores) segment_boundaries = self._identify_boundaries(depth_scores) normalized_boundaries = self._normalize_boundaries( text, segment_boundaries, paragraph_breaks ) # End of Boundary Identification segmented_text = [] prevb = 0 for b in normalized_boundaries: if b == 0: continue segmented_text.append(text[prevb:b]) prevb = b if prevb < text_length: # append any text that may be remaining segmented_text.append(text[prevb:]) if not segmented_text: segmented_text = [text] if self.demo_mode: return gap_scores, smooth_scores, depth_scores, segment_boundaries return segmented_text def _block_comparison(self, tokseqs, token_table): """Implements the block comparison method""" def blk_frq(tok, block): ts_occs = filter(lambda o: o[0] in block, token_table[tok].ts_occurences) freq = sum(tsocc[1] for tsocc in ts_occs) return freq gap_scores = [] numgaps = len(tokseqs) - 1 for curr_gap in range(numgaps): score_dividend, score_divisor_b1, score_divisor_b2 = 0.0, 0.0, 0.0 score = 0.0 # adjust window size for boundary conditions if curr_gap < self.k - 1: window_size = curr_gap + 1 elif curr_gap > numgaps - self.k: window_size = numgaps - curr_gap else: window_size = self.k b1 = [ts.index for ts in tokseqs[curr_gap - window_size + 1 : curr_gap + 1]] b2 = [ts.index for ts in tokseqs[curr_gap + 1 : curr_gap + window_size + 1]] for t in token_table: score_dividend += blk_frq(t, b1) * blk_frq(t, b2) score_divisor_b1 += blk_frq(t, b1) ** 2 score_divisor_b2 += blk_frq(t, b2) ** 2 try: score = score_dividend / math.sqrt(score_divisor_b1 * score_divisor_b2) except ZeroDivisionError: pass # score += 0.0 gap_scores.append(score) return gap_scores def _smooth_scores(self, gap_scores): "Wraps the smooth function from the SciPy Cookbook" return list( smooth(numpy.array(gap_scores[:]), window_len=self.smoothing_width + 1) ) def _mark_paragraph_breaks(self, text): """Identifies indented text or line breaks as the beginning of paragraphs""" MIN_PARAGRAPH = 100 pattern = re.compile("[ \t\r\f\v]*\n[ \t\r\f\v]*\n[ \t\r\f\v]*") matches = pattern.finditer(text) last_break = 0 pbreaks = [0] for pb in matches: if pb.start() - last_break < MIN_PARAGRAPH: continue else: pbreaks.append(pb.start()) last_break = pb.start() return pbreaks def _divide_to_tokensequences(self, text): "Divides the text into pseudosentences of fixed size" w = self.w wrdindex_list = [] matches = re.finditer(r"\w+", text) for match in matches: wrdindex_list.append((match.group(), match.start())) return [ TokenSequence(i / w, wrdindex_list[i : i + w]) for i in range(0, len(wrdindex_list), w) ] def _create_token_table(self, token_sequences, par_breaks): "Creates a table of TokenTableFields" 
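# Added sketch of the structure built here: token_table maps each word to a
# TokenTableField recording its first position, total and paragraph counts,
# and per-pseudosentence occurrence counts, e.g. (illustrative values)
#   token_table['cat'].ts_occurences == [[0, 2], [3, 1]]
# meaning 'cat' occurred twice in pseudosentence 0 and once in pseudosentence 3.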
token_table = {} current_par = 0 current_tok_seq = 0 pb_iter = par_breaks.__iter__() current_par_break = next(pb_iter) if current_par_break == 0: try: current_par_break = next(pb_iter) # skip break at 0 except StopIteration as e: raise ValueError( "No paragraph breaks were found(text too short perhaps?)" ) from e for ts in token_sequences: for word, index in ts.wrdindex_list: try: while index > current_par_break: current_par_break = next(pb_iter) current_par += 1 except StopIteration: # hit bottom pass if word in token_table: token_table[word].total_count += 1 if token_table[word].last_par != current_par: token_table[word].last_par = current_par token_table[word].par_count += 1 if token_table[word].last_tok_seq != current_tok_seq: token_table[word].last_tok_seq = current_tok_seq token_table[word].ts_occurences.append([current_tok_seq, 1]) else: token_table[word].ts_occurences[-1][1] += 1 else: # new word token_table[word] = TokenTableField( first_pos=index, ts_occurences=[[current_tok_seq, 1]], total_count=1, par_count=1, last_par=current_par, last_tok_seq=current_tok_seq, ) current_tok_seq += 1 return token_table def _identify_boundaries(self, depth_scores): """Identifies boundaries at the peaks of similarity score differences""" boundaries = [0 for x in depth_scores] avg = sum(depth_scores) / len(depth_scores) stdev = numpy.std(depth_scores) # SB: what is the purpose of this conditional? if self.cutoff_policy == LC: cutoff = avg - stdev / 2.0 else: cutoff = avg - stdev / 2.0 depth_tuples = sorted(zip(depth_scores, range(len(depth_scores)))) depth_tuples.reverse() hp = list(filter(lambda x: x[0] > cutoff, depth_tuples)) for dt in hp: boundaries[dt[1]] = 1 for dt2 in hp: # undo if there is a boundary close already if ( dt[1] != dt2[1] and abs(dt2[1] - dt[1]) < 4 and boundaries[dt2[1]] == 1 ): boundaries[dt[1]] = 0 return boundaries def _depth_scores(self, scores): """Calculates the depth of each gap, i.e. the average difference between the left and right peaks and the gap's score""" depth_scores = [0 for x in scores] # clip boundaries: this holds on the rule of thumb(my thumb) # that a section shouldn't be smaller than at least 2 # pseudosentences for small texts and around 5 for larger ones. 
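# Worked example (added, values are illustrative): with 37 gap scores,
# clip = min(max(37 // 10, 2), 5) = 3, so the first and last 3 gaps keep a
# depth of 0; every other gap gets depth = lpeak + rpeak - 2 * gapscore,
# e.g. 0.8 + 0.7 - 2 * 0.2 = 1.1.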
clip = min(max(len(scores) // 10, 2), 5) index = clip for gapscore in scores[clip:-clip]: lpeak = gapscore for score in scores[index::-1]: if score >= lpeak: lpeak = score else: break rpeak = gapscore for score in scores[index:]: if score >= rpeak: rpeak = score else: break depth_scores[index] = lpeak + rpeak - 2 * gapscore index += 1 return depth_scores def _normalize_boundaries(self, text, boundaries, paragraph_breaks): """Normalize the boundaries identified to the original text's paragraph breaks""" norm_boundaries = [] char_count, word_count, gaps_seen = 0, 0, 0 seen_word = False for char in text: char_count += 1 if char in " \t\n" and seen_word: seen_word = False word_count += 1 if char not in " \t\n" and not seen_word: seen_word = True if gaps_seen < len(boundaries) and word_count > ( max(gaps_seen * self.w, self.w) ): if boundaries[gaps_seen] == 1: # find closest paragraph break best_fit = len(text) for br in paragraph_breaks: if best_fit > abs(br - char_count): best_fit = abs(br - char_count) bestbr = br else: break if bestbr not in norm_boundaries: # avoid duplicates norm_boundaries.append(bestbr) gaps_seen += 1 return norm_boundaries class TokenTableField: """A field in the token table holding parameters for each token, used later in the process""" def __init__( self, first_pos, ts_occurences, total_count=1, par_count=1, last_par=0, last_tok_seq=None, ): self.__dict__.update(locals()) del self.__dict__["self"] class TokenSequence: "A token list with its original length and its index" def __init__(self, index, wrdindex_list, original_length=None): original_length = original_length or len(wrdindex_list) self.__dict__.update(locals()) del self.__dict__["self"] # Pasted from the SciPy cookbook: https://www.scipy.org/Cookbook/SignalSmooth def smooth(x, window_len=11, window="flat"): """smooth the data using a window with requested size. This method is based on the convolution of a scaled window with the signal. The signal is prepared by introducing reflected copies of the signal (with the window size) in both ends so that transient parts are minimized in the beginning and end part of the output signal. :param x: the input signal :param window_len: the dimension of the smoothing window; should be an odd integer :param window: the type of window from 'flat', 'hanning', 'hamming', 'bartlett', 'blackman' flat window will produce a moving average smoothing. :return: the smoothed signal example:: t=linspace(-2,2,0.1) x=sin(t)+randn(len(t))*0.1 y=smooth(x) :see also: numpy.hanning, numpy.hamming, numpy.bartlett, numpy.blackman, numpy.convolve, scipy.signal.lfilter TODO: the window parameter could be the window itself if an array instead of a string """ if x.ndim != 1: raise ValueError("smooth only accepts 1 dimension arrays.") if x.size < window_len: raise ValueError("Input vector needs to be bigger than window size.") if window_len < 3: return x if window not in ["flat", "hanning", "hamming", "bartlett", "blackman"]: raise ValueError( "Window is on of 'flat', 'hanning', 'hamming', 'bartlett', 'blackman'" ) s = numpy.r_[2 * x[0] - x[window_len:1:-1], x, 2 * x[-1] - x[-1:-window_len:-1]] # print(len(s)) if window == "flat": # moving average w = numpy.ones(window_len, "d") else: w = eval("numpy." 
+ window + "(window_len)") y = numpy.convolve(w / w.sum(), s, mode="same") return y[window_len - 1 : -window_len + 1] def demo(text=None): from matplotlib import pylab from nltk.corpus import brown tt = TextTilingTokenizer(demo_mode=True) if text is None: text = brown.raw()[:10000] s, ss, d, b = tt.tokenize(text) pylab.xlabel("Sentence Gap index") pylab.ylabel("Gap Scores") pylab.plot(range(len(s)), s, label="Gap Scores") pylab.plot(range(len(ss)), ss, label="Smoothed Gap scores") pylab.plot(range(len(d)), d, label="Depth scores") pylab.stem(range(len(b)), b) pylab.legend() pylab.show() nltk-3.7/nltk/tokenize/toktok.py000066400000000000000000000165141420073152400170250ustar00rootroot00000000000000# Natural Language Toolkit: Python port of the tok-tok.pl tokenizer. # # Copyright (C) 2001-2015 NLTK Project # Author: Jon Dehdari # Contributors: Liling Tan, Selcuk Ayguney, ikegami, Martijn Pieters # # URL: # For license information, see LICENSE.TXT """ The tok-tok tokenizer is a simple, general tokenizer, where the input has one sentence per line; thus only final period is tokenized. Tok-tok has been tested on, and gives reasonably good results for English, Persian, Russian, Czech, French, German, Vietnamese, Tajik, and a few others. The input should be in UTF-8 encoding. Reference: Jon Dehdari. 2014. A Neurophysiologically-Inspired Statistical Language Model (Doctoral dissertation). Columbus, OH, USA: The Ohio State University. """ import re from nltk.tokenize.api import TokenizerI class ToktokTokenizer(TokenizerI): """ This is a Python port of the tok-tok.pl from https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl >>> toktok = ToktokTokenizer() >>> text = u'Is 9.5 or 525,600 my favorite number?' >>> print(toktok.tokenize(text, return_str=True)) Is 9.5 or 525,600 my favorite number ? >>> text = u'The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things' >>> print(toktok.tokenize(text, return_str=True)) The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things >>> text = u'\xa1This, is a sentence with weird\xbb symbols\u2026 appearing everywhere\xbf' >>> expected = u'\xa1 This , is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf' >>> assert toktok.tokenize(text, return_str=True) == expected >>> toktok.tokenize(text) == [u'\xa1', u'This', u',', u'is', u'a', u'sentence', u'with', u'weird', u'\xbb', u'symbols', u'\u2026', u'appearing', u'everywhere', u'\xbf'] True """ # Replace non-breaking spaces with normal spaces. NON_BREAKING = re.compile("\u00A0"), " " # Pad some funky punctuation. FUNKY_PUNCT_1 = re.compile(r'([،;؛¿!"\])}»›”؟¡%٪°±©®।॥…])'), r" \1 " # Pad more funky punctuation. FUNKY_PUNCT_2 = re.compile(r"([({\[“‘„‚«‹「『])"), r" \1 " # Pad En dash and em dash EN_EM_DASHES = re.compile("([–—])"), r" \1 " # Replace problematic character with numeric character reference. AMPERCENT = re.compile("& "), "& " TAB = re.compile("\t"), " " PIPE = re.compile(r"\|"), " | " # Pad numbers with commas to keep them from further tokenization. COMMA_IN_NUM = re.compile(r"(? "something ..." # "something." -> "something ." FINAL_PERIOD_1 = re.compile(r"(? "... stuff ." FINAL_PERIOD_2 = re.compile(r"""(? 
# Michael Heilman (re-port from http://www.cis.upenn.edu/~treebank/tokenizer.sed) # Tom Aarsen <> (modifications) # # URL: # For license information, see LICENSE.TXT r""" Penn Treebank Tokenizer The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank. This implementation is a port of the tokenizer sed script written by Robert McIntyre and available at http://www.cis.upenn.edu/~treebank/tokenizer.sed. """ import re import warnings from typing import Iterator, List, Tuple from nltk.tokenize.api import TokenizerI from nltk.tokenize.destructive import MacIntyreContractions from nltk.tokenize.util import align_tokens class TreebankWordTokenizer(TokenizerI): r""" The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank. This tokenizer performs the following steps: - split standard contractions, e.g. ``don't`` -> ``do n't`` and ``they'll`` -> ``they 'll`` - treat most punctuation characters as separate tokens - split off commas and single quotes, when followed by whitespace - separate periods that appear at the end of line >>> from nltk.tokenize import TreebankWordTokenizer >>> s = '''Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks.''' >>> TreebankWordTokenizer().tokenize(s) ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks', '.'] >>> s = "They'll save and invest more." >>> TreebankWordTokenizer().tokenize(s) ['They', "'ll", 'save', 'and', 'invest', 'more', '.'] >>> s = "hi, my name can't hello," >>> TreebankWordTokenizer().tokenize(s) ['hi', ',', 'my', 'name', 'ca', "n't", 'hello', ','] """ # starting quotes STARTING_QUOTES = [ (re.compile(r"^\""), r"``"), (re.compile(r"(``)"), r" \1 "), (re.compile(r"([ \(\[{<])(\"|\'{2})"), r"\1 `` "), ] # punctuation PUNCTUATION = [ (re.compile(r"([:,])([^\d])"), r" \1 \2"), (re.compile(r"([:,])$"), r" \1 "), (re.compile(r"\.\.\."), r" ... "), (re.compile(r"[;@#$%&]"), r" \g<0> "), ( re.compile(r'([^\.])(\.)([\]\)}>"\']*)\s*$'), r"\1 \2\3 ", ), # Handles the final period. (re.compile(r"[?!]"), r" \g<0> "), (re.compile(r"([^'])' "), r"\1 ' "), ] # Pads parentheses PARENS_BRACKETS = (re.compile(r"[\]\[\(\)\{\}\<\>]"), r" \g<0> ") # Optionally: Convert parentheses, brackets and converts them to PTB symbols. CONVERT_PARENTHESES = [ (re.compile(r"\("), "-LRB-"), (re.compile(r"\)"), "-RRB-"), (re.compile(r"\["), "-LSB-"), (re.compile(r"\]"), "-RSB-"), (re.compile(r"\{"), "-LCB-"), (re.compile(r"\}"), "-RCB-"), ] DOUBLE_DASHES = (re.compile(r"--"), r" -- ") # ending quotes ENDING_QUOTES = [ (re.compile(r"''"), " '' "), (re.compile(r'"'), " '' "), (re.compile(r"([^' ])('[sS]|'[mM]|'[dD]|') "), r"\1 \2 "), (re.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1 \2 "), ] # List of contractions adapted from Robert MacIntyre's tokenizer. _contractions = MacIntyreContractions() CONTRACTIONS2 = list(map(re.compile, _contractions.CONTRACTIONS2)) CONTRACTIONS3 = list(map(re.compile, _contractions.CONTRACTIONS3)) def tokenize( self, text: str, convert_parentheses: bool = False, return_str: bool = False ) -> List[str]: r"""Return a tokenized copy of `text`. >>> from nltk.tokenize import TreebankWordTokenizer >>> s = '''Good muffins cost $3.88 (roughly 3,36 euros)\nin New York. 
Please buy me\ntwo of them.\nThanks.''' >>> TreebankWordTokenizer().tokenize(s) ['Good', 'muffins', 'cost', '$', '3.88', '(', 'roughly', '3,36', 'euros', ')', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks', '.'] >>> TreebankWordTokenizer().tokenize(s, convert_parentheses=True) ['Good', 'muffins', 'cost', '$', '3.88', '-LRB-', 'roughly', '3,36', 'euros', '-RRB-', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks', '.'] >>> TreebankWordTokenizer().tokenize(s, return_str=True) ' Good muffins cost $ 3.88 ( roughly 3,36 euros ) \nin New York. Please buy me\ntwo of them.\nThanks . ' :param text: A string with a sentence or sentences. :type text: str :param convert_parentheses: if True, replace parentheses to PTB symbols, e.g. `(` to `-LRB-`. Defaults to False. :type convert_parentheses: bool, optional :param return_str: If True, return tokens as space-separated string, defaults to False. :type return_str: bool, optional :return: List of tokens from `text`. :rtype: List[str] """ if return_str is not False: warnings.warn( "Parameter 'return_str' has been deprecated and should no " "longer be used.", category=DeprecationWarning, stacklevel=2, ) for regexp, substitution in self.STARTING_QUOTES: text = regexp.sub(substitution, text) for regexp, substitution in self.PUNCTUATION: text = regexp.sub(substitution, text) # Handles parentheses. regexp, substitution = self.PARENS_BRACKETS text = regexp.sub(substitution, text) # Optionally convert parentheses if convert_parentheses: for regexp, substitution in self.CONVERT_PARENTHESES: text = regexp.sub(substitution, text) # Handles double dash. regexp, substitution = self.DOUBLE_DASHES text = regexp.sub(substitution, text) # add extra space to make things easier text = " " + text + " " for regexp, substitution in self.ENDING_QUOTES: text = regexp.sub(substitution, text) for regexp in self.CONTRACTIONS2: text = regexp.sub(r" \1 \2 ", text) for regexp in self.CONTRACTIONS3: text = regexp.sub(r" \1 \2 ", text) # We are not using CONTRACTIONS4 since # they are also commented out in the SED scripts # for regexp in self._contractions.CONTRACTIONS4: # text = regexp.sub(r' \1 \2 \3 ', text) return text.split() def span_tokenize(self, text: str) -> Iterator[Tuple[int, int]]: r""" Returns the spans of the tokens in ``text``. Uses the post-hoc nltk.tokens.align_tokens to return the offset spans. >>> from nltk.tokenize import TreebankWordTokenizer >>> s = '''Good muffins cost $3.88\nin New (York). Please (buy) me\ntwo of them.\n(Thanks).''' >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23), ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38), ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59), ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)] >>> list(TreebankWordTokenizer().span_tokenize(s)) == expected True >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in', ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')', ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.'] >>> [s[start:end] for start, end in TreebankWordTokenizer().span_tokenize(s)] == expected True :param text: A string with a sentence or sentences. :type text: str :yield: Tuple[int, int] """ raw_tokens = self.tokenize(text) # Convert converted quotes back to original double quotes # Do this only if original text contains double quote(s) or double # single-quotes (because '' might be transformed to `` if it is # treated as starting quotes). 
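# Added example (illustrative): for text = '"Hello"' the tokenizer returns
# ['``', 'Hello', "''"]; before aligning, the converted quote tokens are
# mapped back onto the original '"' characters, so the spans come out as
# (0, 1), (1, 6), (6, 7).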
if ('"' in text) or ("''" in text): # Find double quotes and converted quotes matched = [m.group() for m in re.finditer(r"``|'{2}|\"", text)] # Replace converted quotes back to double quotes tokens = [ matched.pop(0) if tok in ['"', "``", "''"] else tok for tok in raw_tokens ] else: tokens = raw_tokens yield from align_tokens(tokens, text) class TreebankWordDetokenizer(TokenizerI): r""" The Treebank detokenizer uses the reverse regex operations corresponding to the Treebank tokenizer's regexes. Note: - There're additional assumption mades when undoing the padding of ``[;@#$%&]`` punctuation symbols that isn't presupposed in the TreebankTokenizer. - There're additional regexes added in reversing the parentheses tokenization, such as the ``r'([\]\)\}\>])\s([:;,.])'``, which removes the additional right padding added to the closing parentheses precedding ``[:;,.]``. - It's not possible to return the original whitespaces as they were because there wasn't explicit records of where `'\n'`, `'\t'` or `'\s'` were removed at the text.split() operation. >>> from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer >>> s = '''Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks.''' >>> d = TreebankWordDetokenizer() >>> t = TreebankWordTokenizer() >>> toks = t.tokenize(s) >>> d.detokenize(toks) 'Good muffins cost $3.88 in New York. Please buy me two of them. Thanks.' The MXPOST parentheses substitution can be undone using the ``convert_parentheses`` parameter: >>> s = '''Good muffins cost $3.88\nin New (York). Please (buy) me\ntwo of them.\n(Thanks).''' >>> expected_tokens = ['Good', 'muffins', 'cost', '$', '3.88', 'in', ... 'New', '-LRB-', 'York', '-RRB-', '.', 'Please', '-LRB-', 'buy', ... '-RRB-', 'me', 'two', 'of', 'them.', '-LRB-', 'Thanks', '-RRB-', '.'] >>> expected_tokens == t.tokenize(s, convert_parentheses=True) True >>> expected_detoken = 'Good muffins cost $3.88 in New (York). Please (buy) me two of them. (Thanks).' >>> expected_detoken == d.detokenize(t.tokenize(s, convert_parentheses=True), convert_parentheses=True) True During tokenization it's safe to add more spaces but during detokenization, simply undoing the padding doesn't really help. - During tokenization, left and right pad is added to ``[!?]``, when detokenizing, only left shift the ``[!?]`` is needed. Thus ``(re.compile(r'\s([?!])'), r'\g<1>')``. - During tokenization ``[:,]`` are left and right padded but when detokenizing, only left shift is necessary and we keep right pad after comma/colon if the string after is a non-digit. Thus ``(re.compile(r'\s([:,])\s([^\d])'), r'\1 \2')``. >>> from nltk.tokenize.treebank import TreebankWordDetokenizer >>> toks = ['hello', ',', 'i', 'ca', "n't", 'feel', 'my', 'feet', '!', 'Help', '!', '!'] >>> twd = TreebankWordDetokenizer() >>> twd.detokenize(toks) "hello, i can't feel my feet! Help!!" >>> toks = ['hello', ',', 'i', "can't", 'feel', ';', 'my', 'feet', '!', ... 'Help', '!', '!', 'He', 'said', ':', 'Help', ',', 'help', '?', '!'] >>> twd.detokenize(toks) "hello, i can't feel; my feet! Help!! He said: Help, help?!" 
""" _contractions = MacIntyreContractions() CONTRACTIONS2 = [ re.compile(pattern.replace("(?#X)", r"\s")) for pattern in _contractions.CONTRACTIONS2 ] CONTRACTIONS3 = [ re.compile(pattern.replace("(?#X)", r"\s")) for pattern in _contractions.CONTRACTIONS3 ] # ending quotes ENDING_QUOTES = [ (re.compile(r"([^' ])\s('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1\2 "), (re.compile(r"([^' ])\s('[sS]|'[mM]|'[dD]|') "), r"\1\2 "), (re.compile(r"(\S)\s(\'\')"), r"\1\2"), ( re.compile(r"(\'\')\s([.,:)\]>};%])"), r"\1\2", ), # Quotes followed by no-left-padded punctuations. (re.compile(r"''"), '"'), ] # Handles double dashes DOUBLE_DASHES = (re.compile(r" -- "), r"--") # Optionally: Convert parentheses, brackets and converts them from PTB symbols. CONVERT_PARENTHESES = [ (re.compile("-LRB-"), "("), (re.compile("-RRB-"), ")"), (re.compile("-LSB-"), "["), (re.compile("-RSB-"), "]"), (re.compile("-LCB-"), "{"), (re.compile("-RCB-"), "}"), ] # Undo padding on parentheses. PARENS_BRACKETS = [ (re.compile(r"([\[\(\{\<])\s"), r"\g<1>"), (re.compile(r"\s([\]\)\}\>])"), r"\g<1>"), (re.compile(r"([\]\)\}\>])\s([:;,.])"), r"\1\2"), ] # punctuation PUNCTUATION = [ (re.compile(r"([^'])\s'\s"), r"\1' "), (re.compile(r"\s([?!])"), r"\g<1>"), # Strip left pad for [?!] # (re.compile(r'\s([?!])\s'), r'\g<1>'), (re.compile(r'([^\.])\s(\.)([\]\)}>"\']*)\s*$'), r"\1\2\3"), # When tokenizing, [;@#$%&] are padded with whitespace regardless of # whether there are spaces before or after them. # But during detokenization, we need to distinguish between left/right # pad, so we split this up. (re.compile(r"([#$])\s"), r"\g<1>"), # Left pad. (re.compile(r"\s([;%])"), r"\g<1>"), # Right pad. # (re.compile(r"\s([&*])\s"), r" \g<1> "), # Unknown pad. (re.compile(r"\s\.\.\.\s"), r"..."), # (re.compile(r"\s([:,])\s$"), r"\1"), # .strip() takes care of it. ( re.compile(r"\s([:,])"), r"\1", ), # Just remove left padding. Punctuation in numbers won't be padded. ] # starting quotes STARTING_QUOTES = [ (re.compile(r"([ (\[{<])\s``"), r"\1``"), (re.compile(r"(``)\s"), r"\1"), (re.compile(r"``"), r'"'), ] def tokenize(self, tokens: List[str], convert_parentheses: bool = False) -> str: """ Treebank detokenizer, created by undoing the regexes from the TreebankWordTokenizer.tokenize. :param tokens: A list of strings, i.e. tokenized text. :type tokens: List[str] :param convert_parentheses: if True, replace PTB symbols with parentheses, e.g. `-LRB-` to `(`. Defaults to False. :type convert_parentheses: bool, optional :return: str """ text = " ".join(tokens) # Reverse the contractions regexes. # Note: CONTRACTIONS4 are not used in tokenization. for regexp in self.CONTRACTIONS3: text = regexp.sub(r"\1\2", text) for regexp in self.CONTRACTIONS2: text = regexp.sub(r"\1\2", text) # Reverse the regexes applied for ending quotes. for regexp, substitution in self.ENDING_QUOTES: text = regexp.sub(substitution, text) # Undo the space padding. text = text.strip() # Reverse the padding on double dashes. regexp, substitution = self.DOUBLE_DASHES text = regexp.sub(substitution, text) if convert_parentheses: for regexp, substitution in self.CONVERT_PARENTHESES: text = regexp.sub(substitution, text) # Reverse the padding regexes applied for parenthesis/brackets. for regexp, substitution in self.PARENS_BRACKETS: text = regexp.sub(substitution, text) # Reverse the regexes applied for punctuations. for regexp, substitution in self.PUNCTUATION: text = regexp.sub(substitution, text) # Reverse the regexes applied for starting quotes. 
for regexp, substitution in self.STARTING_QUOTES: text = regexp.sub(substitution, text) return text.strip() def detokenize(self, tokens: List[str], convert_parentheses: bool = False) -> str: """Duck-typing the abstract *tokenize()*.""" return self.tokenize(tokens, convert_parentheses) nltk-3.7/nltk/tokenize/util.py000066400000000000000000000233311420073152400164620ustar00rootroot00000000000000# Natural Language Toolkit: Tokenizer Utilities # # Copyright (C) 2001-2022 NLTK Project # Author: Steven Bird # URL: # For license information, see LICENSE.TXT from re import finditer from xml.sax.saxutils import escape, unescape def string_span_tokenize(s, sep): r""" Return the offsets of the tokens in *s*, as a sequence of ``(start, end)`` tuples, by splitting the string at each occurrence of *sep*. >>> from nltk.tokenize.util import string_span_tokenize >>> s = '''Good muffins cost $3.88\nin New York. Please buy me ... two of them.\n\nThanks.''' >>> list(string_span_tokenize(s, " ")) [(0, 4), (5, 12), (13, 17), (18, 26), (27, 30), (31, 36), (37, 37), (38, 44), (45, 48), (49, 55), (56, 58), (59, 73)] :param s: the string to be tokenized :type s: str :param sep: the token separator :type sep: str :rtype: iter(tuple(int, int)) """ if len(sep) == 0: raise ValueError("Token delimiter must not be empty") left = 0 while True: try: right = s.index(sep, left) if right != 0: yield left, right except ValueError: if left != len(s): yield left, len(s) break left = right + len(sep) def regexp_span_tokenize(s, regexp): r""" Return the offsets of the tokens in *s*, as a sequence of ``(start, end)`` tuples, by splitting the string at each successive match of *regexp*. >>> from nltk.tokenize.util import regexp_span_tokenize >>> s = '''Good muffins cost $3.88\nin New York. Please buy me ... two of them.\n\nThanks.''' >>> list(regexp_span_tokenize(s, r'\s')) [(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36), (38, 44), (45, 48), (49, 51), (52, 55), (56, 58), (59, 64), (66, 73)] :param s: the string to be tokenized :type s: str :param regexp: regular expression that matches token separators (must not be empty) :type regexp: str :rtype: iter(tuple(int, int)) """ left = 0 for m in finditer(regexp, s): right, next = m.span() if right != left: yield left, right left = next yield left, len(s) def spans_to_relative(spans): r""" Return a sequence of relative spans, given a sequence of spans. >>> from nltk.tokenize import WhitespaceTokenizer >>> from nltk.tokenize.util import spans_to_relative >>> s = '''Good muffins cost $3.88\nin New York. Please buy me ... 
two of them.\n\nThanks.''' >>> list(spans_to_relative(WhitespaceTokenizer().span_tokenize(s))) [(0, 4), (1, 7), (1, 4), (1, 5), (1, 2), (1, 3), (1, 5), (2, 6), (1, 3), (1, 2), (1, 3), (1, 2), (1, 5), (2, 7)] :param spans: a sequence of (start, end) offsets of the tokens :type spans: iter(tuple(int, int)) :rtype: iter(tuple(int, int)) """ prev = 0 for left, right in spans: yield left - prev, right - left prev = right class CJKChars: """ An object that enumerates the code points of the CJK characters as listed on https://en.wikipedia.org/wiki/Basic_Multilingual_Plane#Basic_Multilingual_Plane This is a Python port of the CJK code point enumerations of Moses tokenizer: https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/detokenizer.perl#L309 """ # Hangul Jamo (1100–11FF) Hangul_Jamo = (4352, 4607) # (ord(u"\u1100"), ord(u"\u11ff")) # CJK Radicals Supplement (2E80–2EFF) # Kangxi Radicals (2F00–2FDF) # Ideographic Description Characters (2FF0–2FFF) # CJK Symbols and Punctuation (3000–303F) # Hiragana (3040–309F) # Katakana (30A0–30FF) # Bopomofo (3100–312F) # Hangul Compatibility Jamo (3130–318F) # Kanbun (3190–319F) # Bopomofo Extended (31A0–31BF) # CJK Strokes (31C0–31EF) # Katakana Phonetic Extensions (31F0–31FF) # Enclosed CJK Letters and Months (3200–32FF) # CJK Compatibility (3300–33FF) # CJK Unified Ideographs Extension A (3400–4DBF) # Yijing Hexagram Symbols (4DC0–4DFF) # CJK Unified Ideographs (4E00–9FFF) # Yi Syllables (A000–A48F) # Yi Radicals (A490–A4CF) CJK_Radicals = (11904, 42191) # (ord(u"\u2e80"), ord(u"\ua4cf")) # Phags-pa (A840–A87F) Phags_Pa = (43072, 43135) # (ord(u"\ua840"), ord(u"\ua87f")) # Hangul Syllables (AC00–D7AF) Hangul_Syllables = (44032, 55215) # (ord(u"\uAC00"), ord(u"\uD7AF")) # CJK Compatibility Ideographs (F900–FAFF) CJK_Compatibility_Ideographs = (63744, 64255) # (ord(u"\uF900"), ord(u"\uFAFF")) # CJK Compatibility Forms (FE30–FE4F) CJK_Compatibility_Forms = (65072, 65103) # (ord(u"\uFE30"), ord(u"\uFE4F")) # Range U+FF65–FFDC encodes halfwidth forms, of Katakana and Hangul characters Katakana_Hangul_Halfwidth = (65381, 65500) # (ord(u"\uFF65"), ord(u"\uFFDC")) # Supplementary Ideographic Plane 20000–2FFFF Supplementary_Ideographic_Plane = ( 131072, 196607, ) # (ord(u"\U00020000"), ord(u"\U0002FFFF")) ranges = [ Hangul_Jamo, CJK_Radicals, Phags_Pa, Hangul_Syllables, CJK_Compatibility_Ideographs, CJK_Compatibility_Forms, Katakana_Hangul_Halfwidth, Supplementary_Ideographic_Plane, ] def is_cjk(character): """ Python port of Moses' code to check for CJK character. >>> CJKChars().ranges [(4352, 4607), (11904, 42191), (43072, 43135), (44032, 55215), (63744, 64255), (65072, 65103), (65381, 65500), (131072, 196607)] >>> is_cjk(u'\u33fe') True >>> is_cjk(u'\uFE5F') False :param character: The character that needs to be checked. :type character: char :return: bool """ return any( [ start <= ord(character) <= end for start, end in [ (4352, 4607), (11904, 42191), (43072, 43135), (44032, 55215), (63744, 64255), (65072, 65103), (65381, 65500), (131072, 196607), ] ] ) def xml_escape(text): """ This function transforms the input text into an "escaped" version suitable for well-formed XML formatting. Note that the default xml.sax.saxutils.escape() function don't escape some characters that Moses does so we have to manually add them to the entities dictionary. 
>>> input_str = ''')| & < > ' " ] [''' >>> expected_output = ''')| & < > ' " ] [''' >>> escape(input_str) == expected_output True >>> xml_escape(input_str) ')| & < > ' " ] [' :param text: The text that needs to be escaped. :type text: str :rtype: str """ return escape( text, entities={ r"'": r"'", r'"': r""", r"|": r"|", r"[": r"[", r"]": r"]", }, ) def xml_unescape(text): """ This function transforms the "escaped" version suitable for well-formed XML formatting into humanly-readable string. Note that the default xml.sax.saxutils.unescape() function don't unescape some characters that Moses does so we have to manually add them to the entities dictionary. >>> from xml.sax.saxutils import unescape >>> s = ')| & < > ' " ] [' >>> expected = ''')| & < > \' " ] [''' >>> xml_unescape(s) == expected True :param text: The text that needs to be unescaped. :type text: str :rtype: str """ return unescape( text, entities={ r"'": r"'", r""": r'"', r"|": r"|", r"[": r"[", r"]": r"]", }, ) def align_tokens(tokens, sentence): """ This module attempt to find the offsets of the tokens in *s*, as a sequence of ``(start, end)`` tuples, given the tokens and also the source string. >>> from nltk.tokenize import TreebankWordTokenizer >>> from nltk.tokenize.util import align_tokens >>> s = str("The plane, bound for St Petersburg, crashed in Egypt's " ... "Sinai desert just 23 minutes after take-off from Sharm el-Sheikh " ... "on Saturday.") >>> tokens = TreebankWordTokenizer().tokenize(s) >>> expected = [(0, 3), (4, 9), (9, 10), (11, 16), (17, 20), (21, 23), ... (24, 34), (34, 35), (36, 43), (44, 46), (47, 52), (52, 54), ... (55, 60), (61, 67), (68, 72), (73, 75), (76, 83), (84, 89), ... (90, 98), (99, 103), (104, 109), (110, 119), (120, 122), ... (123, 131), (131, 132)] >>> output = list(align_tokens(tokens, s)) >>> len(tokens) == len(expected) == len(output) # Check that length of tokens and tuples are the same. True >>> expected == list(align_tokens(tokens, s)) # Check that the output is as expected. True >>> tokens == [s[start:end] for start, end in output] # Check that the slices of the string corresponds to the tokens. True :param tokens: The list of strings that are the result of tokenization :type tokens: list(str) :param sentence: The original string :type sentence: str :rtype: list(tuple(int,int)) """ point = 0 offsets = [] for token in tokens: try: start = sentence.index(token, point) except ValueError as e: raise ValueError(f'substring "{token}" not found in "{sentence}"') from e point = start + len(token) offsets.append((start, point)) return offsets nltk-3.7/nltk/toolbox.py000066400000000000000000000426251420073152400153520ustar00rootroot00000000000000# Natural Language Toolkit: Toolbox Reader # # Copyright (C) 2001-2022 NLTK Project # Author: Greg Aumann # URL: # For license information, see LICENSE.TXT """ Module for reading, writing and manipulating Toolbox databases and settings files. """ import codecs import re from io import StringIO from xml.etree.ElementTree import Element, ElementTree, SubElement, TreeBuilder from nltk.data import PathPointer, find class StandardFormat: """ Class for reading and processing standard format marker files and strings. """ def __init__(self, filename=None, encoding=None): self._encoding = encoding if filename is not None: self.open(filename) def open(self, sfm_file): """ Open a standard format marker file for sequential reading. 
:param sfm_file: name of the standard format marker input file :type sfm_file: str """ if isinstance(sfm_file, PathPointer): self._file = sfm_file.open(self._encoding) else: self._file = codecs.open(sfm_file, "r", self._encoding) def open_string(self, s): """ Open a standard format marker string for sequential reading. :param s: string to parse as a standard format marker input file :type s: str """ self._file = StringIO(s) def raw_fields(self): """ Return an iterator that returns the next field in a (marker, value) tuple. Linebreaks and trailing white space are preserved except for the final newline in each field. :rtype: iter(tuple(str, str)) """ join_string = "\n" line_regexp = r"^%s(?:\\(\S+)\s*)?(.*)$" # discard a BOM in the first line first_line_pat = re.compile(line_regexp % "(?:\xef\xbb\xbf)?") line_pat = re.compile(line_regexp % "") # need to get first line outside the loop for correct handling # of the first marker if it spans multiple lines file_iter = iter(self._file) # PEP 479, prevent RuntimeError when StopIteration is raised inside generator try: line = next(file_iter) except StopIteration: # no more data is available, terminate the generator return mobj = re.match(first_line_pat, line) mkr, line_value = mobj.groups() value_lines = [line_value] self.line_num = 0 for line in file_iter: self.line_num += 1 mobj = re.match(line_pat, line) line_mkr, line_value = mobj.groups() if line_mkr: yield (mkr, join_string.join(value_lines)) mkr = line_mkr value_lines = [line_value] else: value_lines.append(line_value) self.line_num += 1 yield (mkr, join_string.join(value_lines)) def fields( self, strip=True, unwrap=True, encoding=None, errors="strict", unicode_fields=None, ): """ Return an iterator that returns the next field in a ``(marker, value)`` tuple, where ``marker`` and ``value`` are unicode strings if an ``encoding`` was specified in the ``fields()`` method. Otherwise they are non-unicode strings. :param strip: strip trailing whitespace from the last line of each field :type strip: bool :param unwrap: Convert newlines in a field to spaces. :type unwrap: bool :param encoding: Name of an encoding to use. If it is specified then the ``fields()`` method returns unicode strings rather than non unicode strings. :type encoding: str or None :param errors: Error handling scheme for codec. Same as the ``decode()`` builtin string method. :type errors: str :param unicode_fields: Set of marker names whose values are UTF-8 encoded. Ignored if encoding is None. If the whole file is UTF-8 encoded set ``encoding='utf8'`` and leave ``unicode_fields`` with its default value of None. :type unicode_fields: sequence :rtype: iter(tuple(str, str)) """ if encoding is None and unicode_fields is not None: raise ValueError("unicode_fields is set but not encoding.") unwrap_pat = re.compile(r"\n+") for mkr, val in self.raw_fields(): if unwrap: val = unwrap_pat.sub(" ", val) if strip: val = val.rstrip() yield (mkr, val) def close(self): """Close a previously opened standard format marker file or string.""" self._file.close() try: del self.line_num except AttributeError: pass class ToolboxData(StandardFormat): def parse(self, grammar=None, **kwargs): if grammar: return self._chunk_parse(grammar=grammar, **kwargs) else: return self._record_parse(**kwargs) def _record_parse(self, key=None, **kwargs): r""" Returns an element tree structure corresponding to a toolbox data file with all markers at the same level. 
Thus the following Toolbox database:: \_sh v3.0 400 Rotokas Dictionary \_DateStampHasFourDigitYear \lx kaa \ps V.A \ge gag \gp nek i pas \lx kaa \ps V.B \ge strangle \gp pasim nek after parsing will end up with the same structure (ignoring the extra whitespace) as the following XML fragment after being parsed by ElementTree::
<toolbox_data>
    <header>
        <_sh>v3.0 400 Rotokas Dictionary</_sh>
        <_DateStampHasFourDigitYear/>
    </header>

    <record>
        <lx>kaa</lx>
        <ps>V.A</ps>
        <ge>gag</ge>
        <gp>nek i pas</gp>
    </record>

    <record>
        <lx>kaa</lx>
        <ps>V.B</ps>
        <ge>strangle</ge>
        <gp>pasim nek</gp>
    </record>
</toolbox_data>
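For instance, a minimal in-memory sketch (using the inherited ``open_string()`` instead of a Toolbox file on disk):

    >>> db = ToolboxData()
    >>> db.open_string("\\lx kaa\n\\ps V.A\n\\ge gag\n\\gp nek i pas\n")
    >>> elem = db.parse()
    >>> elem.tag
    'toolbox_data'
    >>> [(field.tag, field.text) for field in elem.find('record')]
    [('lx', 'kaa'), ('ps', 'V.A'), ('ge', 'gag'), ('gp', 'nek i pas')]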
    :param key: Name of key marker at the start of each record. If set to None (the default value) the first marker that doesn't begin with an underscore is assumed to be the key. :type key: str :param kwargs: Keyword arguments passed to ``StandardFormat.fields()`` :type kwargs: dict :rtype: ElementTree._ElementInterface :return: contents of toolbox data divided into header and records """ builder = TreeBuilder() builder.start("toolbox_data", {}) builder.start("header", {}) in_records = False for mkr, value in self.fields(**kwargs): if key is None and not in_records and mkr[0] != "_": key = mkr if mkr == key: if in_records: builder.end("record") else: builder.end("header") in_records = True builder.start("record", {}) builder.start(mkr, {}) builder.data(value) builder.end(mkr) if in_records: builder.end("record") else: builder.end("header") builder.end("toolbox_data") return builder.close() def _tree2etree(self, parent): from nltk.tree import Tree root = Element(parent.label()) for child in parent: if isinstance(child, Tree): root.append(self._tree2etree(child)) else: text, tag = child e = SubElement(root, tag) e.text = text return root def _chunk_parse(self, grammar=None, root_label="record", trace=0, **kwargs): """ Returns an element tree structure corresponding to a toolbox data file parsed according to the chunk grammar. :type grammar: str :param grammar: Contains the chunking rules used to parse the database. See ``chunk.RegExp`` for documentation. :type root_label: str :param root_label: The node value that should be used for the top node of the chunk structure. :type trace: int :param trace: The level of tracing that should be used when parsing a text. ``0`` will generate no tracing output; ``1`` will generate normal tracing output; and ``2`` or higher will generate verbose tracing output. :type kwargs: dict :param kwargs: Keyword arguments passed to ``toolbox.StandardFormat.fields()`` :rtype: ElementTree._ElementInterface """ from nltk import chunk from nltk.tree import Tree cp = chunk.RegexpParser(grammar, root_label=root_label, trace=trace) db = self.parse(**kwargs) tb_etree = Element("toolbox_data") header = db.find("header") tb_etree.append(header) for record in db.findall("record"): parsed = cp.parse([(elem.text, elem.tag) for elem in record]) tb_etree.append(self._tree2etree(parsed)) return tb_etree _is_value = re.compile(r"\S") def to_sfm_string(tree, encoding=None, errors="strict", unicode_fields=None): """ Return a string with a standard format representation of the toolbox data in tree (tree can be a toolbox database or a single record). :param tree: flat representation of toolbox data (whole database or single record) :type tree: ElementTree._ElementInterface :param encoding: Name of an encoding to use. :type encoding: str :param errors: Error handling scheme for codec. Same as the ``encode()`` builtin string method. 
:type errors: str :param unicode_fields: :type unicode_fields: dict(str) or set(str) :rtype: str """ if tree.tag == "record": root = Element("toolbox_data") root.append(tree) tree = root if tree.tag != "toolbox_data": raise ValueError("not a toolbox_data element structure") if encoding is None and unicode_fields is not None: raise ValueError( "if encoding is not specified then neither should unicode_fields" ) l = [] for rec in tree: l.append("\n") for field in rec: mkr = field.tag value = field.text if encoding is not None: if unicode_fields is not None and mkr in unicode_fields: cur_encoding = "utf8" else: cur_encoding = encoding if re.search(_is_value, value): l.append((f"\\{mkr} {value}\n").encode(cur_encoding, errors)) else: l.append((f"\\{mkr}{value}\n").encode(cur_encoding, errors)) else: if re.search(_is_value, value): l.append(f"\\{mkr} {value}\n") else: l.append(f"\\{mkr}{value}\n") return "".join(l[1:]) class ToolboxSettings(StandardFormat): """This class is the base class for settings files.""" def __init__(self): super().__init__() def parse(self, encoding=None, errors="strict", **kwargs): """ Return the contents of toolbox settings file with a nested structure. :param encoding: encoding used by settings file :type encoding: str :param errors: Error handling scheme for codec. Same as ``decode()`` builtin method. :type errors: str :param kwargs: Keyword arguments passed to ``StandardFormat.fields()`` :type kwargs: dict :rtype: ElementTree._ElementInterface """ builder = TreeBuilder() for mkr, value in self.fields(encoding=encoding, errors=errors, **kwargs): # Check whether the first char of the field marker # indicates a block start (+) or end (-) block = mkr[0] if block in ("+", "-"): mkr = mkr[1:] else: block = None # Build tree on the basis of block char if block == "+": builder.start(mkr, {}) builder.data(value) elif block == "-": builder.end(mkr) else: builder.start(mkr, {}) builder.data(value) builder.end(mkr) return builder.close() def to_settings_string(tree, encoding=None, errors="strict", unicode_fields=None): # write XML to file l = list() _to_settings_string( tree.getroot(), l, encoding=encoding, errors=errors, unicode_fields=unicode_fields, ) return "".join(l) def _to_settings_string(node, l, **kwargs): # write XML to file tag = node.tag text = node.text if len(node) == 0: if text: l.append(f"\\{tag} {text}\n") else: l.append("\\%s\n" % tag) else: if text: l.append(f"\\+{tag} {text}\n") else: l.append("\\+%s\n" % tag) for n in node: _to_settings_string(n, l, **kwargs) l.append("\\-%s\n" % tag) return def remove_blanks(elem): """ Remove all elements and subelements with no text and no child elements. :param elem: toolbox data in an elementtree structure :type elem: ElementTree._ElementInterface """ out = list() for child in elem: remove_blanks(child) if child.text or len(child) > 0: out.append(child) elem[:] = out def add_default_fields(elem, default_fields): """ Add blank elements and subelements specified in default_fields. :param elem: toolbox data in an elementtree structure :type elem: ElementTree._ElementInterface :param default_fields: fields to add to each type of element and subelement :type default_fields: dict(tuple) """ for field in default_fields.get(elem.tag, []): if elem.find(field) is None: SubElement(elem, field) for child in elem: add_default_fields(child, default_fields) def sort_fields(elem, field_orders): """ Sort the elements and subelements in order specified in field_orders. 
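For example (an illustrative sketch using a hand-built element rather than a real Toolbox record):

    >>> from xml.etree.ElementTree import XML
    >>> record = XML('<record><ge>gag</ge><lx>kaa</lx><ps>V.A</ps></record>')
    >>> sort_fields(record, {'record': ('lx', 'ps', 'ge')})
    >>> [child.tag for child in record]
    ['lx', 'ps', 'ge']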
:param elem: toolbox data in an elementtree structure :type elem: ElementTree._ElementInterface :param field_orders: order of fields for each type of element and subelement :type field_orders: dict(tuple) """ order_dicts = dict() for field, order in field_orders.items(): order_dicts[field] = order_key = dict() for i, subfield in enumerate(order): order_key[subfield] = i _sort_fields(elem, order_dicts) def _sort_fields(elem, orders_dicts): """sort the children of elem""" try: order = orders_dicts[elem.tag] except KeyError: pass else: tmp = sorted( ((order.get(child.tag, 1e9), i), child) for i, child in enumerate(elem) ) elem[:] = [child for key, child in tmp] for child in elem: if len(child): _sort_fields(child, orders_dicts) def add_blank_lines(tree, blanks_before, blanks_between): """ Add blank lines before all elements and subelements specified in blank_before. :param elem: toolbox data in an elementtree structure :type elem: ElementTree._ElementInterface :param blank_before: elements and subelements to add blank lines before :type blank_before: dict(tuple) """ try: before = blanks_before[tree.tag] between = blanks_between[tree.tag] except KeyError: for elem in tree: if len(elem): add_blank_lines(elem, blanks_before, blanks_between) else: last_elem = None for elem in tree: tag = elem.tag if last_elem is not None and last_elem.tag != tag: if tag in before and last_elem is not None: e = last_elem.getiterator()[-1] e.text = (e.text or "") + "\n" else: if tag in between: e = last_elem.getiterator()[-1] e.text = (e.text or "") + "\n" if len(elem): add_blank_lines(elem, blanks_before, blanks_between) last_elem = elem def demo(): from itertools import islice # zip_path = find('corpora/toolbox.zip') # lexicon = ToolboxData(ZipFilePathPointer(zip_path, 'toolbox/rotokas.dic')).parse() file_path = find("corpora/toolbox/rotokas.dic") lexicon = ToolboxData(file_path).parse() print("first field in fourth record:") print(lexicon[3][0].tag) print(lexicon[3][0].text) print("\nfields in sequential order:") for field in islice(lexicon.find("record"), 10): print(field.tag, field.text) print("\nlx fields:") for field in islice(lexicon.findall("record/lx"), 10): print(field.text) settings = ToolboxSettings() file_path = find("corpora/toolbox/MDF/MDF_AltH.typ") settings.open(file_path) # settings.open(ZipFilePathPointer(zip_path, entry='toolbox/MDF/MDF_AltH.typ')) tree = settings.parse(unwrap=False, encoding="cp1252") print(tree.find("expset/expMDF/rtfPageSetup/paperSize").text) settings_tree = ElementTree(tree) print(to_settings_string(settings_tree).encode("utf8")) if __name__ == "__main__": demo() nltk-3.7/nltk/translate/000077500000000000000000000000001420073152400152765ustar00rootroot00000000000000nltk-3.7/nltk/translate/__init__.py000066400000000000000000000024231420073152400174100ustar00rootroot00000000000000# Natural Language Toolkit: Machine Translation # # Copyright (C) 2001-2022 NLTK Project # Author: Steven Bird , Tah Wei Hoon # URL: # For license information, see LICENSE.TXT """ Experimental features for machine translation. These interfaces are prone to change. 
isort:skip_file """ from nltk.translate.api import AlignedSent, Alignment, PhraseTable from nltk.translate.ibm_model import IBMModel from nltk.translate.ibm1 import IBMModel1 from nltk.translate.ibm2 import IBMModel2 from nltk.translate.ibm3 import IBMModel3 from nltk.translate.ibm4 import IBMModel4 from nltk.translate.ibm5 import IBMModel5 from nltk.translate.bleu_score import sentence_bleu as bleu from nltk.translate.ribes_score import sentence_ribes as ribes from nltk.translate.meteor_score import meteor_score as meteor from nltk.translate.metrics import alignment_error_rate from nltk.translate.stack_decoder import StackDecoder from nltk.translate.nist_score import sentence_nist as nist from nltk.translate.chrf_score import sentence_chrf as chrf from nltk.translate.gale_church import trace from nltk.translate.gdfa import grow_diag_final_and from nltk.translate.gleu_score import sentence_gleu as gleu from nltk.translate.phrase_based import extract nltk-3.7/nltk/translate/api.py000066400000000000000000000250271420073152400164270ustar00rootroot00000000000000# Natural Language Toolkit: API for alignment and translation objects # # Copyright (C) 2001-2022 NLTK Project # Author: Will Zhang # Guan Gui # Steven Bird # Tah Wei Hoon # URL: # For license information, see LICENSE.TXT import subprocess from collections import namedtuple class AlignedSent: """ Return an aligned sentence object, which encapsulates two sentences along with an ``Alignment`` between them. Typically used in machine translation to represent a sentence and its translation. >>> from nltk.translate import AlignedSent, Alignment >>> algnsent = AlignedSent(['klein', 'ist', 'das', 'Haus'], ... ['the', 'house', 'is', 'small'], Alignment.fromstring('0-3 1-2 2-0 3-1')) >>> algnsent.words ['klein', 'ist', 'das', 'Haus'] >>> algnsent.mots ['the', 'house', 'is', 'small'] >>> algnsent.alignment Alignment([(0, 3), (1, 2), (2, 0), (3, 1)]) >>> from nltk.corpus import comtrans >>> print(comtrans.aligned_sents()[54]) 'So why should EU arm...'> >>> print(comtrans.aligned_sents()[54].alignment) 0-0 0-1 1-0 2-2 3-4 3-5 4-7 5-8 6-3 7-9 8-9 9-10 9-11 10-12 11-6 12-6 13-13 :param words: Words in the target language sentence :type words: list(str) :param mots: Words in the source language sentence :type mots: list(str) :param alignment: Word-level alignments between ``words`` and ``mots``. Each alignment is represented as a 2-tuple (words_index, mots_index). :type alignment: Alignment """ def __init__(self, words, mots, alignment=None): self._words = words self._mots = mots if alignment is None: self.alignment = Alignment([]) else: assert type(alignment) is Alignment self.alignment = alignment @property def words(self): return self._words @property def mots(self): return self._mots def _get_alignment(self): return self._alignment def _set_alignment(self, alignment): _check_alignment(len(self.words), len(self.mots), alignment) self._alignment = alignment alignment = property(_get_alignment, _set_alignment) def __repr__(self): """ Return a string representation for this ``AlignedSent``. 
:rtype: str """ words = "[%s]" % (", ".join("'%s'" % w for w in self._words)) mots = "[%s]" % (", ".join("'%s'" % w for w in self._mots)) return f"AlignedSent({words}, {mots}, {self._alignment!r})" def _to_dot(self): """ Dot representation of the aligned sentence """ s = "graph align {\n" s += "node[shape=plaintext]\n" # Declare node for w in self._words: s += f'"{w}_source" [label="{w}"] \n' for w in self._mots: s += f'"{w}_target" [label="{w}"] \n' # Alignment for u, v in self._alignment: s += f'"{self._words[u]}_source" -- "{self._mots[v]}_target" \n' # Connect the source words for i in range(len(self._words) - 1): s += '"{}_source" -- "{}_source" [style=invis]\n'.format( self._words[i], self._words[i + 1], ) # Connect the target words for i in range(len(self._mots) - 1): s += '"{}_target" -- "{}_target" [style=invis]\n'.format( self._mots[i], self._mots[i + 1], ) # Put it in the same rank s += "{rank = same; %s}\n" % (" ".join('"%s_source"' % w for w in self._words)) s += "{rank = same; %s}\n" % (" ".join('"%s_target"' % w for w in self._mots)) s += "}" return s def _repr_svg_(self): """ Ipython magic : show SVG representation of this ``AlignedSent``. """ dot_string = self._to_dot().encode("utf8") output_format = "svg" try: process = subprocess.Popen( ["dot", "-T%s" % output_format], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, ) except OSError as e: raise Exception("Cannot find the dot binary from Graphviz package") from e out, err = process.communicate(dot_string) return out.decode("utf8") def __str__(self): """ Return a human-readable string representation for this ``AlignedSent``. :rtype: str """ source = " ".join(self._words)[:20] + "..." target = " ".join(self._mots)[:20] + "..." return f" '{target}'>" def invert(self): """ Return the aligned sentence pair, reversing the directionality :rtype: AlignedSent """ return AlignedSent(self._mots, self._words, self._alignment.invert()) class Alignment(frozenset): """ A storage class for representing alignment between two sequences, s1, s2. In general, an alignment is a set of tuples of the form (i, j, ...) representing an alignment between the i-th element of s1 and the j-th element of s2. Tuples are extensible (they might contain additional data, such as a boolean to indicate sure vs possible alignments). >>> from nltk.translate import Alignment >>> a = Alignment([(0, 0), (0, 1), (1, 2), (2, 2)]) >>> a.invert() Alignment([(0, 0), (1, 0), (2, 1), (2, 2)]) >>> print(a.invert()) 0-0 1-0 2-1 2-2 >>> a[0] [(0, 1), (0, 0)] >>> a.invert()[2] [(2, 1), (2, 2)] >>> b = Alignment([(0, 0), (0, 1)]) >>> b.issubset(a) True >>> c = Alignment.fromstring('0-0 0-1') >>> b == c True """ def __new__(cls, pairs): self = frozenset.__new__(cls, pairs) self._len = max(p[0] for p in self) if self != frozenset([]) else 0 self._index = None return self @classmethod def fromstring(cls, s): """ Read a giza-formatted string and return an Alignment object. >>> Alignment.fromstring('0-0 2-1 9-2 21-3 10-4 7-5') Alignment([(0, 0), (2, 1), (7, 5), (9, 2), (10, 4), (21, 3)]) :type s: str :param s: the positional alignments in giza format :rtype: Alignment :return: An Alignment object corresponding to the string representation ``s``. """ return Alignment([_giza2pair(a) for a in s.split()]) def __getitem__(self, key): """ Look up the alignments that map from a given index or slice. """ if not self._index: self._build_index() return self._index.__getitem__(key) def invert(self): """ Return an Alignment object, being the inverted mapping. 
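Tuples with extra elements (e.g. a sure/possible flag) keep those elements when inverted; only the first two positions are swapped:

    >>> Alignment([(0, 1, 'sure')]).invert()
    Alignment([(1, 0, 'sure')])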
""" return Alignment(((p[1], p[0]) + p[2:]) for p in self) def range(self, positions=None): """ Work out the range of the mapping from the given positions. If no positions are specified, compute the range of the entire mapping. """ image = set() if not self._index: self._build_index() if not positions: positions = list(range(len(self._index))) for p in positions: image.update(f for _, f in self._index[p]) return sorted(image) def __repr__(self): """ Produce a Giza-formatted string representing the alignment. """ return "Alignment(%r)" % sorted(self) def __str__(self): """ Produce a Giza-formatted string representing the alignment. """ return " ".join("%d-%d" % p[:2] for p in sorted(self)) def _build_index(self): """ Build a list self._index such that self._index[i] is a list of the alignments originating from word i. """ self._index = [[] for _ in range(self._len + 1)] for p in self: self._index[p[0]].append(p) def _giza2pair(pair_string): i, j = pair_string.split("-") return int(i), int(j) def _naacl2pair(pair_string): i, j, p = pair_string.split("-") return int(i), int(j) def _check_alignment(num_words, num_mots, alignment): """ Check whether the alignments are legal. :param num_words: the number of source language words :type num_words: int :param num_mots: the number of target language words :type num_mots: int :param alignment: alignment to be checked :type alignment: Alignment :raise IndexError: if alignment falls outside the sentence """ assert type(alignment) is Alignment if not all(0 <= pair[0] < num_words for pair in alignment): raise IndexError("Alignment is outside boundary of words") if not all(pair[1] is None or 0 <= pair[1] < num_mots for pair in alignment): raise IndexError("Alignment is outside boundary of mots") PhraseTableEntry = namedtuple("PhraseTableEntry", ["trg_phrase", "log_prob"]) class PhraseTable: """ In-memory store of translations for a given phrase, and the log probability of the those translations """ def __init__(self): self.src_phrases = dict() def translations_for(self, src_phrase): """ Get the translations for a source language phrase :param src_phrase: Source language phrase of interest :type src_phrase: tuple(str) :return: A list of target language phrases that are translations of ``src_phrase``, ordered in decreasing order of likelihood. Each list element is a tuple of the target phrase and its log probability. 
:rtype: list(PhraseTableEntry) """ return self.src_phrases[src_phrase] def add(self, src_phrase, trg_phrase, log_prob): """ :type src_phrase: tuple(str) :type trg_phrase: tuple(str) :param log_prob: Log probability that given ``src_phrase``, ``trg_phrase`` is its translation :type log_prob: float """ entry = PhraseTableEntry(trg_phrase=trg_phrase, log_prob=log_prob) if src_phrase not in self.src_phrases: self.src_phrases[src_phrase] = [] self.src_phrases[src_phrase].append(entry) self.src_phrases[src_phrase].sort(key=lambda e: e.log_prob, reverse=True) def __contains__(self, src_phrase): return src_phrase in self.src_phrases nltk-3.7/nltk/translate/bleu_score.py000066400000000000000000000720461420073152400200030ustar00rootroot00000000000000# Natural Language Toolkit: BLEU Score # # Copyright (C) 2001-2022 NLTK Project # Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim # Contributors: Björn Mattsson, Dmitrijs Milajevs, Liling Tan # URL: # For license information, see LICENSE.TXT """BLEU score implementation.""" import math import sys import warnings from collections import Counter from fractions import Fraction from nltk.util import ngrams def sentence_bleu( references, hypothesis, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=None, auto_reweigh=False, ): """ Calculate BLEU score (Bilingual Evaluation Understudy) from Papineni, Kishore, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. "BLEU: a method for automatic evaluation of machine translation." In Proceedings of ACL. https://www.aclweb.org/anthology/P02-1040.pdf >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', ... 'ensures', 'that', 'the', 'military', 'always', ... 'obeys', 'the', 'commands', 'of', 'the', 'party'] >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops', ... 'forever', 'hearing', 'the', 'activity', 'guidebook', ... 'that', 'party', 'direct'] >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', ... 'ensures', 'that', 'the', 'military', 'will', 'forever', ... 'heed', 'Party', 'commands'] >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which', ... 'guarantees', 'the', 'military', 'forces', 'always', ... 'being', 'under', 'the', 'command', 'of', 'the', ... 'Party'] >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', ... 'army', 'always', 'to', 'heed', 'the', 'directions', ... 'of', 'the', 'party'] >>> sentence_bleu([reference1, reference2, reference3], hypothesis1) # doctest: +ELLIPSIS 0.5045... If there is no ngrams overlap for any order of n-grams, BLEU returns the value 0. This is because the precision for the order of n-grams without overlap is 0, and the geometric mean in the final BLEU score computation multiplies the 0 with the precision of other n-grams. This results in 0 (independently of the precision of the other n-gram orders). The following example has zero 3-gram and 4-gram overlaps: >>> round(sentence_bleu([reference1, reference2, reference3], hypothesis2),4) # doctest: +ELLIPSIS 0.0 To avoid this harsh behaviour when no ngram overlaps are found a smoothing function can be used. >>> chencherry = SmoothingFunction() >>> sentence_bleu([reference1, reference2, reference3], hypothesis2, ... smoothing_function=chencherry.method1) # doctest: +ELLIPSIS 0.0370... The default BLEU calculates a score for up to 4-grams using uniform weights (this is called BLEU-4). To evaluate your translations with higher/lower order ngrams, use customized weights. E.g. 
when accounting for up to 5-grams with uniform weights (this is called BLEU-5) use: >>> weights = (1./5., 1./5., 1./5., 1./5., 1./5.) >>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights) # doctest: +ELLIPSIS 0.3920... Multiple BLEU scores can be computed at once, by supplying a list of weights. E.g. for computing BLEU-2, BLEU-3 *and* BLEU-4 in one computation, use: >>> weights = [ ... (1./2., 1./2.), ... (1./3., 1./3., 1./3.), ... (1./4., 1./4., 1./4., 1./4.) ... ] >>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights) # doctest: +ELLIPSIS [0.7453..., 0.6240..., 0.5045...] :param references: reference sentences :type references: list(list(str)) :param hypothesis: a hypothesis sentence :type hypothesis: list(str) :param weights: weights for unigrams, bigrams, trigrams and so on (one or a list of weights) :type weights: tuple(float) / list(tuple(float)) :param smoothing_function: :type smoothing_function: SmoothingFunction :param auto_reweigh: Option to re-normalize the weights uniformly. :type auto_reweigh: bool :return: The sentence-level BLEU score. Returns a list if multiple weights were supplied. :rtype: float / list(float) """ return corpus_bleu( [references], [hypothesis], weights, smoothing_function, auto_reweigh ) def corpus_bleu( list_of_references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=None, auto_reweigh=False, ): """ Calculate a single corpus-level BLEU score (aka. system-level BLEU) for all the hypotheses and their respective references. Instead of averaging the sentence level BLEU scores (i.e. macro-average precision), the original BLEU metric (Papineni et al. 2002) accounts for the micro-average precision (i.e. summing the numerators and denominators for each hypothesis-reference(s) pairs before the division). >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', ... 'ensures', 'that', 'the', 'military', 'always', ... 'obeys', 'the', 'commands', 'of', 'the', 'party'] >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', ... 'ensures', 'that', 'the', 'military', 'will', 'forever', ... 'heed', 'Party', 'commands'] >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which', ... 'guarantees', 'the', 'military', 'forces', 'always', ... 'being', 'under', 'the', 'command', 'of', 'the', 'Party'] >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', ... 'army', 'always', 'to', 'heed', 'the', 'directions', ... 'of', 'the', 'party'] >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was', ... 'interested', 'in', 'world', 'history'] >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history', ... 'because', 'he', 'read', 'the', 'book'] >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]] >>> hypotheses = [hyp1, hyp2] >>> corpus_bleu(list_of_references, hypotheses) # doctest: +ELLIPSIS 0.5920... The example below show that corpus_bleu() is different from averaging sentence_bleu() for hypotheses >>> score1 = sentence_bleu([ref1a, ref1b, ref1c], hyp1) >>> score2 = sentence_bleu([ref2a], hyp2) >>> (score1 + score2) / 2 # doctest: +ELLIPSIS 0.6223... Custom weights may be supplied to fine-tune the BLEU score further. A tuple of float weights for unigrams, bigrams, trigrams and so on can be given. >>> weights = (0.1, 0.3, 0.5, 0.1) >>> corpus_bleu(list_of_references, hypotheses, weights=weights) # doctest: +ELLIPSIS 0.5818... This particular weight gave extra value to trigrams. Furthermore, multiple weights can be given, resulting in multiple BLEU scores. 
>>> weights = [ ... (0.5, 0.5), ... (0.333, 0.333, 0.334), ... (0.25, 0.25, 0.25, 0.25), ... (0.2, 0.2, 0.2, 0.2, 0.2) ... ] >>> corpus_bleu(list_of_references, hypotheses, weights=weights) # doctest: +ELLIPSIS [0.8242..., 0.7067..., 0.5920..., 0.4719...] :param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses :type list_of_references: list(list(list(str))) :param hypotheses: a list of hypothesis sentences :type hypotheses: list(list(str)) :param weights: weights for unigrams, bigrams, trigrams and so on (one or a list of weights) :type weights: tuple(float) / list(tuple(float)) :param smoothing_function: :type smoothing_function: SmoothingFunction :param auto_reweigh: Option to re-normalize the weights uniformly. :type auto_reweigh: bool :return: The corpus-level BLEU score. :rtype: float """ # Before proceeding to compute BLEU, perform sanity checks. p_numerators = Counter() # Key = ngram order, and value = no. of ngram matches. p_denominators = Counter() # Key = ngram order, and value = no. of ngram in ref. hyp_lengths, ref_lengths = 0, 0 assert len(list_of_references) == len(hypotheses), ( "The number of hypotheses and their reference(s) should be the " "same " ) try: weights[0][0] except TypeError: weights = [weights] max_weight_length = max(len(weight) for weight in weights) # Iterate through each hypothesis and their corresponding references. for references, hypothesis in zip(list_of_references, hypotheses): # For each order of ngram, calculate the numerator and # denominator for the corpus-level modified precision. for i in range(1, max_weight_length + 1): p_i = modified_precision(references, hypothesis, i) p_numerators[i] += p_i.numerator p_denominators[i] += p_i.denominator # Calculate the hypothesis length and the closest reference length. # Adds them to the corpus-level hypothesis and reference counts. hyp_len = len(hypothesis) hyp_lengths += hyp_len ref_lengths += closest_ref_length(references, hyp_len) # Calculate corpus-level brevity penalty. bp = brevity_penalty(ref_lengths, hyp_lengths) # Collects the various precision values for the different ngram orders. p_n = [ Fraction(p_numerators[i], p_denominators[i], _normalize=False) for i in range(1, max_weight_length + 1) ] # Returns 0 if there's no matching n-grams # We only need to check for p_numerators[1] == 0, since if there's # no unigrams, there won't be any higher order ngrams. if p_numerators[1] == 0: return 0 if len(weights) == 1 else [0] * len(weights) # If there's no smoothing, set use method0 from SmoothinFunction class. if not smoothing_function: smoothing_function = SmoothingFunction().method0 # Smoothen the modified precision. # Note: smoothing_function() may convert values into floats; # it tries to retain the Fraction object as much as the # smoothing method allows. p_n = smoothing_function( p_n, references=references, hypothesis=hypothesis, hyp_len=hyp_lengths ) bleu_scores = [] for weight in weights: # Uniformly re-weighting based on maximum hypothesis lengths if largest # order of n-grams < 4 and weights is set at default. if auto_reweigh: if hyp_lengths < 4 and weight == (0.25, 0.25, 0.25, 0.25): weight = (1 / hyp_lengths,) * hyp_lengths s = (w_i * math.log(p_i) for w_i, p_i in zip(weight, p_n) if p_i > 0) s = bp * math.exp(math.fsum(s)) bleu_scores.append(s) return bleu_scores[0] if len(weights) == 1 else bleu_scores def modified_precision(references, hypothesis, n): """ Calculate modified ngram precision. 
The normal precision method may lead to some wrong translations with high-precision, e.g., the translation, in which a word of reference repeats several times, has very high precision. This function only returns the Fraction object that contains the numerator and denominator necessary to calculate the corpus-level precision. To calculate the modified precision for a single pair of hypothesis and references, cast the Fraction object into a float. The famous "the the the ... " example shows that you can get BLEU precision by duplicating high frequency words. >>> reference1 = 'the cat is on the mat'.split() >>> reference2 = 'there is a cat on the mat'.split() >>> hypothesis1 = 'the the the the the the the'.split() >>> references = [reference1, reference2] >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS 0.2857... In the modified n-gram precision, a reference word will be considered exhausted after a matching hypothesis word is identified, e.g. >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', ... 'ensures', 'that', 'the', 'military', 'will', ... 'forever', 'heed', 'Party', 'commands'] >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which', ... 'guarantees', 'the', 'military', 'forces', 'always', ... 'being', 'under', 'the', 'command', 'of', 'the', ... 'Party'] >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', ... 'army', 'always', 'to', 'heed', 'the', 'directions', ... 'of', 'the', 'party'] >>> hypothesis = 'of the'.split() >>> references = [reference1, reference2, reference3] >>> float(modified_precision(references, hypothesis, n=1)) 1.0 >>> float(modified_precision(references, hypothesis, n=2)) 1.0 An example of a normal machine translation hypothesis: >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', ... 'ensures', 'that', 'the', 'military', 'always', ... 'obeys', 'the', 'commands', 'of', 'the', 'party'] >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops', ... 'forever', 'hearing', 'the', 'activity', 'guidebook', ... 'that', 'party', 'direct'] >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', ... 'ensures', 'that', 'the', 'military', 'will', ... 'forever', 'heed', 'Party', 'commands'] >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which', ... 'guarantees', 'the', 'military', 'forces', 'always', ... 'being', 'under', 'the', 'command', 'of', 'the', ... 'Party'] >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', ... 'army', 'always', 'to', 'heed', 'the', 'directions', ... 'of', 'the', 'party'] >>> references = [reference1, reference2, reference3] >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS 0.9444... >>> float(modified_precision(references, hypothesis2, n=1)) # doctest: +ELLIPSIS 0.5714... >>> float(modified_precision(references, hypothesis1, n=2)) # doctest: +ELLIPSIS 0.5882352941176471 >>> float(modified_precision(references, hypothesis2, n=2)) # doctest: +ELLIPSIS 0.07692... :param references: A list of reference translations. :type references: list(list(str)) :param hypothesis: A hypothesis translation. :type hypothesis: list(str) :param n: The ngram order. :type n: int :return: BLEU's modified precision for the nth order ngram. :rtype: Fraction """ # Extracts all ngrams in hypothesis # Set an empty Counter if hypothesis is empty. counts = Counter(ngrams(hypothesis, n)) if len(hypothesis) >= n else Counter() # Extract a union of references' counts. 
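# For every ngram that occurs in the hypothesis, keep the largest count seen
# in any single reference; the hypothesis counts are clipped to these maxima
# below, which is what makes this precision "modified" (repeating a reference
# word in the hypothesis no longer inflates the score).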
# max_counts = reduce(or_, [Counter(ngrams(ref, n)) for ref in references]) max_counts = {} for reference in references: reference_counts = ( Counter(ngrams(reference, n)) if len(reference) >= n else Counter() ) for ngram in counts: max_counts[ngram] = max(max_counts.get(ngram, 0), reference_counts[ngram]) # Assigns the intersection between hypothesis and references' counts. clipped_counts = { ngram: min(count, max_counts[ngram]) for ngram, count in counts.items() } numerator = sum(clipped_counts.values()) # Ensures that denominator is minimum 1 to avoid ZeroDivisionError. # Usually this happens when the ngram order is > len(reference). denominator = max(1, sum(counts.values())) return Fraction(numerator, denominator, _normalize=False) def closest_ref_length(references, hyp_len): """ This function finds the reference that is the closest length to the hypothesis. The closest reference length is referred to as *r* variable from the brevity penalty formula in Papineni et. al. (2002) :param references: A list of reference translations. :type references: list(list(str)) :param hyp_len: The length of the hypothesis. :type hyp_len: int :return: The length of the reference that's closest to the hypothesis. :rtype: int """ ref_lens = (len(reference) for reference in references) closest_ref_len = min( ref_lens, key=lambda ref_len: (abs(ref_len - hyp_len), ref_len) ) return closest_ref_len def brevity_penalty(closest_ref_len, hyp_len): """ Calculate brevity penalty. As the modified n-gram precision still has the problem from the short length sentence, brevity penalty is used to modify the overall BLEU score according to length. An example from the paper. There are three references with length 12, 15 and 17. And a concise hypothesis of the length 12. The brevity penalty is 1. >>> reference1 = list('aaaaaaaaaaaa') # i.e. ['a'] * 12 >>> reference2 = list('aaaaaaaaaaaaaaa') # i.e. ['a'] * 15 >>> reference3 = list('aaaaaaaaaaaaaaaaa') # i.e. ['a'] * 17 >>> hypothesis = list('aaaaaaaaaaaa') # i.e. ['a'] * 12 >>> references = [reference1, reference2, reference3] >>> hyp_len = len(hypothesis) >>> closest_ref_len = closest_ref_length(references, hyp_len) >>> brevity_penalty(closest_ref_len, hyp_len) 1.0 In case a hypothesis translation is shorter than the references, penalty is applied. >>> references = [['a'] * 28, ['a'] * 28] >>> hypothesis = ['a'] * 12 >>> hyp_len = len(hypothesis) >>> closest_ref_len = closest_ref_length(references, hyp_len) >>> brevity_penalty(closest_ref_len, hyp_len) 0.2635971381157267 The length of the closest reference is used to compute the penalty. If the length of a hypothesis is 12, and the reference lengths are 13 and 2, the penalty is applied because the hypothesis length (12) is less then the closest reference length (13). >>> references = [['a'] * 13, ['a'] * 2] >>> hypothesis = ['a'] * 12 >>> hyp_len = len(hypothesis) >>> closest_ref_len = closest_ref_length(references, hyp_len) >>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS 0.9200... The brevity penalty doesn't depend on reference order. More importantly, when two reference sentences are at the same distance, the shortest reference sentence length is used. 
>>> references = [['a'] * 13, ['a'] * 11] >>> hypothesis = ['a'] * 12 >>> hyp_len = len(hypothesis) >>> closest_ref_len = closest_ref_length(references, hyp_len) >>> bp1 = brevity_penalty(closest_ref_len, hyp_len) >>> hyp_len = len(hypothesis) >>> closest_ref_len = closest_ref_length(reversed(references), hyp_len) >>> bp2 = brevity_penalty(closest_ref_len, hyp_len) >>> bp1 == bp2 == 1 True A test example from mteval-v13a.pl (starting from the line 705): >>> references = [['a'] * 11, ['a'] * 8] >>> hypothesis = ['a'] * 7 >>> hyp_len = len(hypothesis) >>> closest_ref_len = closest_ref_length(references, hyp_len) >>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS 0.8668... >>> references = [['a'] * 11, ['a'] * 8, ['a'] * 6, ['a'] * 7] >>> hypothesis = ['a'] * 7 >>> hyp_len = len(hypothesis) >>> closest_ref_len = closest_ref_length(references, hyp_len) >>> brevity_penalty(closest_ref_len, hyp_len) 1.0 :param hyp_len: The length of the hypothesis for a single sentence OR the sum of all the hypotheses' lengths for a corpus :type hyp_len: int :param closest_ref_len: The length of the closest reference for a single hypothesis OR the sum of all the closest references for every hypotheses. :type closest_ref_len: int :return: BLEU's brevity penalty. :rtype: float """ if hyp_len > closest_ref_len: return 1 # If hypothesis is empty, brevity penalty = 0 should result in BLEU = 0.0 elif hyp_len == 0: return 0 else: return math.exp(1 - closest_ref_len / hyp_len) class SmoothingFunction: """ This is an implementation of the smoothing techniques for segment-level BLEU scores that was presented in Boxing Chen and Collin Cherry (2014) A Systematic Comparison of Smoothing Techniques for Sentence-Level BLEU. In WMT14. http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf """ def __init__(self, epsilon=0.1, alpha=5, k=5): """ This will initialize the parameters required for the various smoothing techniques, the default values are set to the numbers used in the experiments from Chen and Cherry (2014). >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures', ... 'that', 'the', 'military', 'always', 'obeys', 'the', ... 'commands', 'of', 'the', 'party'] >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures', ... 'that', 'the', 'military', 'will', 'forever', 'heed', ... 'Party', 'commands'] >>> chencherry = SmoothingFunction() >>> print(sentence_bleu([reference1], hypothesis1)) # doctest: +ELLIPSIS 0.4118... >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method0)) # doctest: +ELLIPSIS 0.4118... >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method1)) # doctest: +ELLIPSIS 0.4118... >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method2)) # doctest: +ELLIPSIS 0.4452... >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method3)) # doctest: +ELLIPSIS 0.4118... >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method4)) # doctest: +ELLIPSIS 0.4118... >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method5)) # doctest: +ELLIPSIS 0.4905... >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method6)) # doctest: +ELLIPSIS 0.4135... >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method7)) # doctest: +ELLIPSIS 0.4905... 
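The same smoothing functions can be passed to ``corpus_bleu`` via its ``smoothing_function`` keyword. For instance, wrapping the single sentence pair from above in lists gives the same score as the sentence-level call (a minimal illustration):

    >>> corpus_bleu([[reference1]], [hypothesis1], smoothing_function=chencherry.method2) # doctest: +ELLIPSIS
    0.4452...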
:param epsilon: the epsilon value use in method 1 :type epsilon: float :param alpha: the alpha value use in method 6 :type alpha: int :param k: the k value use in method 4 :type k: int """ self.epsilon = epsilon self.alpha = alpha self.k = k def method0(self, p_n, *args, **kwargs): """ No smoothing. """ p_n_new = [] for i, p_i in enumerate(p_n): if p_i.numerator != 0: p_n_new.append(p_i) else: _msg = str( "\nThe hypothesis contains 0 counts of {}-gram overlaps.\n" "Therefore the BLEU score evaluates to 0, independently of\n" "how many N-gram overlaps of lower order it contains.\n" "Consider using lower n-gram order or use " "SmoothingFunction()" ).format(i + 1) warnings.warn(_msg) # When numerator==0 where denonminator==0 or !=0, the result # for the precision score should be equal to 0 or undefined. # Due to BLEU geometric mean computation in logarithm space, # we we need to take the return sys.float_info.min such that # math.log(sys.float_info.min) returns a 0 precision score. p_n_new.append(sys.float_info.min) return p_n_new def method1(self, p_n, *args, **kwargs): """ Smoothing method 1: Add *epsilon* counts to precision with 0 counts. """ return [ (p_i.numerator + self.epsilon) / p_i.denominator if p_i.numerator == 0 else p_i for p_i in p_n ] def method2(self, p_n, *args, **kwargs): """ Smoothing method 2: Add 1 to both numerator and denominator from Chin-Yew Lin and Franz Josef Och (2004) ORANGE: a Method for Evaluating Automatic Evaluation Metrics for Machine Translation. In COLING 2004. """ return [ Fraction(p_n[i].numerator + 1, p_n[i].denominator + 1, _normalize=False) if i != 0 else p_n[0] for i in range(len(p_n)) ] def method3(self, p_n, *args, **kwargs): """ Smoothing method 3: NIST geometric sequence smoothing The smoothing is computed by taking 1 / ( 2^k ), instead of 0, for each precision score whose matching n-gram count is null. k is 1 for the first 'n' value for which the n-gram match count is null/ For example, if the text contains: - one 2-gram match - and (consequently) two 1-gram matches the n-gram count for each individual precision score would be: - n=1 => prec_count = 2 (two unigrams) - n=2 => prec_count = 1 (one bigram) - n=3 => prec_count = 1/2 (no trigram, taking 'smoothed' value of 1 / ( 2^k ), with k=1) - n=4 => prec_count = 1/4 (no fourgram, taking 'smoothed' value of 1 / ( 2^k ), with k=2) """ incvnt = 1 # From the mteval-v13a.pl, it's referred to as k. for i, p_i in enumerate(p_n): if p_i.numerator == 0: p_n[i] = 1 / (2 ** incvnt * p_i.denominator) incvnt += 1 return p_n def method4(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs): """ Smoothing method 4: Shorter translations may have inflated precision values due to having smaller denominators; therefore, we give them proportionally smaller smoothed counts. Instead of scaling to 1/(2^k), Chen and Cherry suggests dividing by 1/ln(len(T)), where T is the length of the translation. """ incvnt = 1 hyp_len = hyp_len if hyp_len else len(hypothesis) for i, p_i in enumerate(p_n): if p_i.numerator == 0 and hyp_len > 1: # incvnt = i + 1 * self.k / math.log( # hyp_len # ) # Note that this K is different from the K from NIST. # p_n[i] = incvnt / p_i.denominator\ numerator = 1 / (2 ** incvnt * self.k / math.log(hyp_len)) p_n[i] = numerator / p_i.denominator incvnt += 1 return p_n def method5(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs): """ Smoothing method 5: The matched counts for similar values of n should be similar. 
To a calculate the n-gram matched count, it averages the n−1, n and n+1 gram matched counts. """ hyp_len = hyp_len if hyp_len else len(hypothesis) m = {} # Requires an precision value for an addition ngram order. p_n_plus1 = p_n + [modified_precision(references, hypothesis, 5)] m[-1] = p_n[0] + 1 for i, p_i in enumerate(p_n): p_n[i] = (m[i - 1] + p_i + p_n_plus1[i + 1]) / 3 m[i] = p_n[i] return p_n def method6(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs): """ Smoothing method 6: Interpolates the maximum likelihood estimate of the precision *p_n* with a prior estimate *pi0*. The prior is estimated by assuming that the ratio between pn and pn−1 will be the same as that between pn−1 and pn−2; from Gao and He (2013) Training MRF-Based Phrase Translation Models using Gradient Ascent. In NAACL. """ hyp_len = hyp_len if hyp_len else len(hypothesis) # This smoothing only works when p_1 and p_2 is non-zero. # Raise an error with an appropriate message when the input is too short # to use this smoothing technique. assert p_n[2], "This smoothing method requires non-zero precision for bigrams." for i, p_i in enumerate(p_n): if i in [0, 1]: # Skips the first 2 orders of ngrams. continue else: pi0 = 0 if p_n[i - 2] == 0 else p_n[i - 1] ** 2 / p_n[i - 2] # No. of ngrams in translation that matches the reference. m = p_i.numerator # No. of ngrams in translation. l = sum(1 for _ in ngrams(hypothesis, i + 1)) # Calculates the interpolated precision. p_n[i] = (m + self.alpha * pi0) / (l + self.alpha) return p_n def method7(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs): """ Smoothing method 7: Interpolates methods 4 and 5. """ hyp_len = hyp_len if hyp_len else len(hypothesis) p_n = self.method4(p_n, references, hypothesis, hyp_len) p_n = self.method5(p_n, references, hypothesis, hyp_len) return p_n nltk-3.7/nltk/translate/chrf_score.py000066400000000000000000000210661420073152400177720ustar00rootroot00000000000000# Natural Language Toolkit: ChrF score # # Copyright (C) 2001-2022 NLTK Project # Authors: Maja Popovic # Contributors: Liling Tan, Aleš Tamchyna (Memsource) # URL: # For license information, see LICENSE.TXT """ ChrF score implementation """ import re from collections import Counter, defaultdict from nltk.util import ngrams def sentence_chrf( reference, hypothesis, min_len=1, max_len=6, beta=3.0, ignore_whitespace=True ): """ Calculates the sentence level CHRF (Character n-gram F-score) described in - Maja Popovic. 2015. CHRF: Character n-gram F-score for Automatic MT Evaluation. In Proceedings of the 10th Workshop on Machine Translation. https://www.statmt.org/wmt15/pdf/WMT49.pdf - Maja Popovic. 2016. CHRF Deconstructed: β Parameters and n-gram Weights. In Proceedings of the 1st Conference on Machine Translation. https://www.statmt.org/wmt16/pdf/W16-2341.pdf This implementation of CHRF only supports a single reference at the moment. For details not reported in the paper, consult Maja Popovic's original implementation: https://github.com/m-popovic/chrF The code should output results equivalent to running CHRF++ with the following options: -nw 0 -b 3 An example from the original BLEU paper https://www.aclweb.org/anthology/P02-1040.pdf >>> ref1 = str('It is a guide to action that ensures that the military ' ... 'will forever heed Party commands').split() >>> hyp1 = str('It is a guide to action which ensures that the military ' ... 'always obeys the commands of the party').split() >>> hyp2 = str('It is to insure the troops forever hearing the activity ' ... 
'guidebook that party direct').split() >>> sentence_chrf(ref1, hyp1) # doctest: +ELLIPSIS 0.6349... >>> sentence_chrf(ref1, hyp2) # doctest: +ELLIPSIS 0.3330... The infamous "the the the ... " example >>> ref = 'the cat is on the mat'.split() >>> hyp = 'the the the the the the the'.split() >>> sentence_chrf(ref, hyp) # doctest: +ELLIPSIS 0.1468... An example to show that this function allows users to use strings instead of tokens, i.e. list(str) as inputs. >>> ref1 = str('It is a guide to action that ensures that the military ' ... 'will forever heed Party commands') >>> hyp1 = str('It is a guide to action which ensures that the military ' ... 'always obeys the commands of the party') >>> sentence_chrf(ref1, hyp1) # doctest: +ELLIPSIS 0.6349... >>> type(ref1) == type(hyp1) == str True >>> sentence_chrf(ref1.split(), hyp1.split()) # doctest: +ELLIPSIS 0.6349... To skip the unigrams and only use 2- to 3-grams: >>> sentence_chrf(ref1, hyp1, min_len=2, max_len=3) # doctest: +ELLIPSIS 0.6617... :param references: reference sentence :type references: list(str) / str :param hypothesis: a hypothesis sentence :type hypothesis: list(str) / str :param min_len: The minimum order of n-gram this function should extract. :type min_len: int :param max_len: The maximum order of n-gram this function should extract. :type max_len: int :param beta: the parameter to assign more importance to recall over precision :type beta: float :param ignore_whitespace: ignore whitespace characters in scoring :type ignore_whitespace: bool :return: the sentence level CHRF score. :rtype: float """ return corpus_chrf( [reference], [hypothesis], min_len, max_len, beta=beta, ignore_whitespace=ignore_whitespace, ) def _preprocess(sent, ignore_whitespace): if type(sent) != str: # turn list of tokens into a string sent = " ".join(sent) if ignore_whitespace: sent = re.sub(r"\s+", "", sent) return sent def chrf_precision_recall_fscore_support( reference, hypothesis, n, beta=3.0, epsilon=1e-16 ): """ This function computes the precision, recall and fscore from the ngram overlaps. It returns the `support` which is the true positive score. By underspecifying the input type, the function will be agnostic as to how it computes the ngrams and simply take the whichever element in the list; it could be either token or character. :param reference: The reference sentence. :type reference: list :param hypothesis: The hypothesis sentence. :type hypothesis: list :param n: Extract up to the n-th order ngrams :type n: int :param beta: The parameter to assign more importance to recall over precision. :type beta: float :param epsilon: The fallback value if the hypothesis or reference is empty. :type epsilon: float :return: Returns the precision, recall and f-score and support (true positive). :rtype: tuple(float) """ ref_ngrams = Counter(ngrams(reference, n)) hyp_ngrams = Counter(ngrams(hypothesis, n)) # calculate the number of ngram matches overlap_ngrams = ref_ngrams & hyp_ngrams tp = sum(overlap_ngrams.values()) # True positives. tpfp = sum(hyp_ngrams.values()) # True positives + False positives. tpfn = sum(ref_ngrams.values()) # True positives + False negatives. 
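# From these counts, the block below computes the usual F-beta score:
#   P = tp / (tp + fp),  R = tp / (tp + fn),
#   F_beta = (1 + beta**2) * P * R / (beta**2 * P + R)
# With the default beta = 3.0 recall is weighted more heavily than precision,
# following the chrF papers cited in ``sentence_chrf``.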
try: prec = tp / tpfp # precision rec = tp / tpfn # recall factor = beta ** 2 fscore = (1 + factor) * (prec * rec) / (factor * prec + rec) except ZeroDivisionError: prec = rec = fscore = epsilon return prec, rec, fscore, tp def corpus_chrf( references, hypotheses, min_len=1, max_len=6, beta=3.0, ignore_whitespace=True ): """ Calculates the corpus level CHRF (Character n-gram F-score), it is the macro-averaged value of the sentence/segment level CHRF score. This implementation of CHRF only supports a single reference at the moment. >>> ref1 = str('It is a guide to action that ensures that the military ' ... 'will forever heed Party commands').split() >>> ref2 = str('It is the guiding principle which guarantees the military ' ... 'forces always being under the command of the Party').split() >>> >>> hyp1 = str('It is a guide to action which ensures that the military ' ... 'always obeys the commands of the party').split() >>> hyp2 = str('It is to insure the troops forever hearing the activity ' ... 'guidebook that party direct') >>> corpus_chrf([ref1, ref2, ref1, ref2], [hyp1, hyp2, hyp2, hyp1]) # doctest: +ELLIPSIS 0.3910... :param references: a corpus of list of reference sentences, w.r.t. hypotheses :type references: list(list(str)) :param hypotheses: a list of hypothesis sentences :type hypotheses: list(list(str)) :param min_len: The minimum order of n-gram this function should extract. :type min_len: int :param max_len: The maximum order of n-gram this function should extract. :type max_len: int :param beta: the parameter to assign more importance to recall over precision :type beta: float :param ignore_whitespace: ignore whitespace characters in scoring :type ignore_whitespace: bool :return: the sentence level CHRF score. :rtype: float """ assert len(references) == len( hypotheses ), "The number of hypotheses and their references should be the same" num_sents = len(hypotheses) # Keep f-scores for each n-gram order separate ngram_fscores = defaultdict(lambda: list()) # Iterate through each hypothesis and their corresponding references. for reference, hypothesis in zip(references, hypotheses): # preprocess both reference and hypothesis reference = _preprocess(reference, ignore_whitespace) hypothesis = _preprocess(hypothesis, ignore_whitespace) # Calculate f-scores for each sentence and for each n-gram order # separately. for n in range(min_len, max_len + 1): # Compute the precision, recall, fscore and support. prec, rec, fscore, tp = chrf_precision_recall_fscore_support( reference, hypothesis, n, beta=beta ) ngram_fscores[n].append(fscore) # how many n-gram sizes num_ngram_sizes = len(ngram_fscores) # sum of f-scores over all sentences for each n-gram order total_scores = [sum(fscores) for n, fscores in ngram_fscores.items()] # macro-average over n-gram orders and over all sentences return (sum(total_scores) / num_ngram_sizes) / num_sents nltk-3.7/nltk/translate/gale_church.py000066400000000000000000000204251420073152400201170ustar00rootroot00000000000000# Natural Language Toolkit: Gale-Church Aligner # # Copyright (C) 2001-2022 NLTK Project # Author: Torsten Marek # Contributor: Cassidy Laidlaw, Liling Tan # URL: # For license information, see LICENSE.TXT """ A port of the Gale-Church Aligner. Gale & Church (1993), A Program for Aligning Sentences in Bilingual Corpora. 
https://aclweb.org/anthology/J93-1004.pdf """ import math try: from norm import logsf as norm_logsf from scipy.stats import norm except ImportError: def erfcc(x): """Complementary error function.""" z = abs(x) t = 1 / (1 + 0.5 * z) r = t * math.exp( -z * z - 1.26551223 + t * ( 1.00002368 + t * ( 0.37409196 + t * ( 0.09678418 + t * ( -0.18628806 + t * ( 0.27886807 + t * ( -1.13520398 + t * (1.48851587 + t * (-0.82215223 + t * 0.17087277)) ) ) ) ) ) ) ) if x >= 0.0: return r else: return 2.0 - r def norm_cdf(x): """Return the area under the normal distribution from M{-∞..x}.""" return 1 - 0.5 * erfcc(x / math.sqrt(2)) def norm_logsf(x): try: return math.log(1 - norm_cdf(x)) except ValueError: return float("-inf") LOG2 = math.log(2) class LanguageIndependent: # These are the language-independent probabilities and parameters # given in Gale & Church # for the computation, l_1 is always the language with less characters PRIORS = { (1, 0): 0.0099, (0, 1): 0.0099, (1, 1): 0.89, (2, 1): 0.089, (1, 2): 0.089, (2, 2): 0.011, } AVERAGE_CHARACTERS = 1 VARIANCE_CHARACTERS = 6.8 def trace(backlinks, source_sents_lens, target_sents_lens): """ Traverse the alignment cost from the tracebacks and retrieves appropriate sentence pairs. :param backlinks: A dictionary where the key is the alignment points and value is the cost (referencing the LanguageIndependent.PRIORS) :type backlinks: dict :param source_sents_lens: A list of target sentences' lengths :type source_sents_lens: list(int) :param target_sents_lens: A list of target sentences' lengths :type target_sents_lens: list(int) """ links = [] position = (len(source_sents_lens), len(target_sents_lens)) while position != (0, 0) and all(p >= 0 for p in position): try: s, t = backlinks[position] except TypeError: position = (position[0] - 1, position[1] - 1) continue for i in range(s): for j in range(t): links.append((position[0] - i - 1, position[1] - j - 1)) position = (position[0] - s, position[1] - t) return links[::-1] def align_log_prob(i, j, source_sents, target_sents, alignment, params): """Returns the log probability of the two sentences C{source_sents[i]}, C{target_sents[j]} being aligned with a specific C{alignment}. @param i: The offset of the source sentence. @param j: The offset of the target sentence. @param source_sents: The list of source sentence lengths. @param target_sents: The list of target sentence lengths. @param alignment: The alignment type, a tuple of two integers. @param params: The sentence alignment parameters. @returns: The log probability of a specific alignment between the two sentences, given the parameters. """ l_s = sum(source_sents[i - offset - 1] for offset in range(alignment[0])) l_t = sum(target_sents[j - offset - 1] for offset in range(alignment[1])) try: # actually, the paper says l_s * params.VARIANCE_CHARACTERS, this is based on the C # reference implementation. With l_s in the denominator, insertions are impossible. m = (l_s + l_t / params.AVERAGE_CHARACTERS) / 2 delta = (l_s * params.AVERAGE_CHARACTERS - l_t) / math.sqrt( m * params.VARIANCE_CHARACTERS ) except ZeroDivisionError: return float("-inf") return -(LOG2 + norm_logsf(abs(delta)) + math.log(params.PRIORS[alignment])) def align_blocks(source_sents_lens, target_sents_lens, params=LanguageIndependent): """Return the sentence alignment of two text blocks (usually paragraphs). 
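    The lowest-cost path through the Gale-Church length-based costs is found
    by dynamic programming; for instance, two short source sentences aligned
    against a single target sentence are reported as a 2-1 merge:

    >>> align_blocks([5, 5], [5])
    [(0, 0), (1, 0)]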
>>> align_blocks([5,5,5], [7,7,7]) [(0, 0), (1, 1), (2, 2)] >>> align_blocks([10,5,5], [12,20]) [(0, 0), (1, 1), (2, 1)] >>> align_blocks([12,20], [10,5,5]) [(0, 0), (1, 1), (1, 2)] >>> align_blocks([10,2,10,10,2,10], [12,3,20,3,12]) [(0, 0), (1, 1), (2, 2), (3, 2), (4, 3), (5, 4)] @param source_sents_lens: The list of source sentence lengths. @param target_sents_lens: The list of target sentence lengths. @param params: the sentence alignment parameters. @return: The sentence alignments, a list of index pairs. """ alignment_types = list(params.PRIORS.keys()) # there are always three rows in the history (with the last of them being filled) D = [[]] backlinks = {} for i in range(len(source_sents_lens) + 1): for j in range(len(target_sents_lens) + 1): min_dist = float("inf") min_align = None for a in alignment_types: prev_i = -1 - a[0] prev_j = j - a[1] if prev_i < -len(D) or prev_j < 0: continue p = D[prev_i][prev_j] + align_log_prob( i, j, source_sents_lens, target_sents_lens, a, params ) if p < min_dist: min_dist = p min_align = a if min_dist == float("inf"): min_dist = 0 backlinks[(i, j)] = min_align D[-1].append(min_dist) if len(D) > 2: D.pop(0) D.append([]) return trace(backlinks, source_sents_lens, target_sents_lens) def align_texts(source_blocks, target_blocks, params=LanguageIndependent): """Creates the sentence alignment of two texts. Texts can consist of several blocks. Block boundaries cannot be crossed by sentence alignment links. Each block consists of a list that contains the lengths (in characters) of the sentences in this block. @param source_blocks: The list of blocks in the source text. @param target_blocks: The list of blocks in the target text. @param params: the sentence alignment parameters. @returns: A list of sentence alignment lists """ if len(source_blocks) != len(target_blocks): raise ValueError( "Source and target texts do not have the same number of blocks." ) return [ align_blocks(source_block, target_block, params) for source_block, target_block in zip(source_blocks, target_blocks) ] # File I/O functions; may belong in a corpus reader def split_at(it, split_value): """Splits an iterator C{it} at values of C{split_value}. Each instance of C{split_value} is swallowed. The iterator produces subiterators which need to be consumed fully before the next subiterator can be used. """ def _chunk_iterator(first): v = first while v != split_value: yield v v = it.next() while True: yield _chunk_iterator(it.next()) def parse_token_stream(stream, soft_delimiter, hard_delimiter): """Parses a stream of tokens and splits it into sentences (using C{soft_delimiter} tokens) and blocks (using C{hard_delimiter} tokens) for use with the L{align_texts} function. """ return [ [ sum(len(token) for token in sentence_it) for sentence_it in split_at(block_it, soft_delimiter) ] for block_it in split_at(stream, hard_delimiter) ] nltk-3.7/nltk/translate/gdfa.py000066400000000000000000000137341420073152400165610ustar00rootroot00000000000000# Natural Language Toolkit: GDFA word alignment symmetrization # # Copyright (C) 2001-2022 NLTK Project # Authors: Liling Tan # URL: # For license information, see LICENSE.TXT from collections import defaultdict def grow_diag_final_and(srclen, trglen, e2f, f2e): """ This module symmetrisatizes the source-to-target and target-to-source word alignment output and produces, aka. GDFA algorithm (Koehn, 2005). Step 1: Find the intersection of the bidirectional alignment. 
Step 2: Search for additional neighbor alignment points to be added, given these criteria: (i) neighbor alignments points are not in the intersection and (ii) neighbor alignments are in the union. Step 3: Add all other alignment points that are not in the intersection, not in the neighboring alignments that met the criteria but in the original forward/backward alignment outputs. >>> forw = ('0-0 2-1 9-2 21-3 10-4 7-5 11-6 9-7 12-8 1-9 3-10 ' ... '4-11 17-12 17-13 25-14 13-15 24-16 11-17 28-18') >>> back = ('0-0 1-9 2-9 3-10 4-11 5-12 6-6 7-5 8-6 9-7 10-4 ' ... '11-6 12-8 13-12 15-12 17-13 18-13 19-12 20-13 ' ... '21-3 22-12 23-14 24-17 25-15 26-17 27-18 28-18') >>> srctext = ("この よう な ハロー 白色 わい 星 の L 関数 " ... "は L と 共 に 不連続 に 増加 する こと が " ... "期待 さ れる こと を 示し た 。") >>> trgtext = ("Therefore , we expect that the luminosity function " ... "of such halo white dwarfs increases discontinuously " ... "with the luminosity .") >>> srclen = len(srctext.split()) >>> trglen = len(trgtext.split()) >>> >>> gdfa = grow_diag_final_and(srclen, trglen, forw, back) >>> gdfa == sorted(set([(28, 18), (6, 6), (24, 17), (2, 1), (15, 12), (13, 12), ... (2, 9), (3, 10), (26, 17), (25, 15), (8, 6), (9, 7), (20, ... 13), (18, 13), (0, 0), (10, 4), (13, 15), (23, 14), (7, 5), ... (25, 14), (1, 9), (17, 13), (4, 11), (11, 17), (9, 2), (22, ... 12), (27, 18), (24, 16), (21, 3), (19, 12), (17, 12), (5, ... 12), (11, 6), (12, 8)])) True References: Koehn, P., A. Axelrod, A. Birch, C. Callison, M. Osborne, and D. Talbot. 2005. Edinburgh System Description for the 2005 IWSLT Speech Translation Evaluation. In MT Eval Workshop. :type srclen: int :param srclen: the number of tokens in the source language :type trglen: int :param trglen: the number of tokens in the target language :type e2f: str :param e2f: the forward word alignment outputs from source-to-target language (in pharaoh output format) :type f2e: str :param f2e: the backward word alignment outputs from target-to-source language (in pharaoh output format) :rtype: set(tuple(int)) :return: the symmetrized alignment points from the GDFA algorithm """ # Converts pharaoh text format into list of tuples. e2f = [tuple(map(int, a.split("-"))) for a in e2f.split()] f2e = [tuple(map(int, a.split("-"))) for a in f2e.split()] neighbors = [(-1, 0), (0, -1), (1, 0), (0, 1), (-1, -1), (-1, 1), (1, -1), (1, 1)] alignment = set(e2f).intersection(set(f2e)) # Find the intersection. union = set(e2f).union(set(f2e)) # *aligned* is used to check if neighbors are aligned in grow_diag() aligned = defaultdict(set) for i, j in alignment: aligned["e"].add(i) aligned["f"].add(j) def grow_diag(): """ Search for the neighbor points and them to the intersected alignment points if criteria are met. """ prev_len = len(alignment) - 1 # iterate until no new points added while prev_len < len(alignment): no_new_points = True # for english word e = 0 ... en for e in range(srclen): # for foreign word f = 0 ... 
fn for f in range(trglen): # if ( e aligned with f) if (e, f) in alignment: # for each neighboring point (e-new, f-new) for neighbor in neighbors: neighbor = tuple(i + j for i, j in zip((e, f), neighbor)) e_new, f_new = neighbor # if ( ( e-new not aligned and f-new not aligned) # and (e-new, f-new in union(e2f, f2e) ) if ( e_new not in aligned and f_new not in aligned ) and neighbor in union: alignment.add(neighbor) aligned["e"].add(e_new) aligned["f"].add(f_new) prev_len += 1 no_new_points = False # iterate until no new points added if no_new_points: break def final_and(a): """ Adds remaining points that are not in the intersection, not in the neighboring alignments but in the original *e2f* and *f2e* alignments """ # for english word e = 0 ... en for e_new in range(srclen): # for foreign word f = 0 ... fn for f_new in range(trglen): # if ( ( e-new not aligned and f-new not aligned) # and (e-new, f-new in union(e2f, f2e) ) if ( e_new not in aligned and f_new not in aligned and (e_new, f_new) in union ): alignment.add((e_new, f_new)) aligned["e"].add(e_new) aligned["f"].add(f_new) grow_diag() final_and(e2f) final_and(f2e) return sorted(alignment) nltk-3.7/nltk/translate/gleu_score.py000066400000000000000000000207011420073152400177770ustar00rootroot00000000000000# Natural Language Toolkit: GLEU Score # # Copyright (C) 2001-2022 NLTK Project # Authors: # Contributors: Mike Schuster, Michael Wayne Goodman, Liling Tan # URL: # For license information, see LICENSE.TXT """ GLEU score implementation. """ from collections import Counter from nltk.util import everygrams, ngrams def sentence_gleu(references, hypothesis, min_len=1, max_len=4): """ Calculates the sentence level GLEU (Google-BLEU) score described in Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V. Le, Mohammad Norouzi, Wolfgang Macherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey, Jeff Klingner, Apurva Shah, Melvin Johnson, Xiaobing Liu, Lukasz Kaiser, Stephan Gouws, Yoshikiyo Kato, Taku Kudo, Hideto Kazawa, Keith Stevens, George Kurian, Nishant Patil, Wei Wang, Cliff Young, Jason Smith, Jason Riesa, Alex Rudnick, Oriol Vinyals, Greg Corrado, Macduff Hughes, Jeffrey Dean. (2016) Google’s Neural Machine Translation System: Bridging the Gap between Human and Machine Translation. eprint arXiv:1609.08144. https://arxiv.org/pdf/1609.08144v2.pdf Retrieved on 27 Oct 2016. From Wu et al. (2016): "The BLEU score has some undesirable properties when used for single sentences, as it was designed to be a corpus measure. We therefore use a slightly different score for our RL experiments which we call the 'GLEU score'. For the GLEU score, we record all sub-sequences of 1, 2, 3 or 4 tokens in output and target sequence (n-grams). We then compute a recall, which is the ratio of the number of matching n-grams to the number of total n-grams in the target (ground truth) sequence, and a precision, which is the ratio of the number of matching n-grams to the number of total n-grams in the generated output sequence. Then GLEU score is simply the minimum of recall and precision. This GLEU score's range is always between 0 (no matches) and 1 (all match) and it is symmetrical when switching output and target. According to our experiments, GLEU score correlates quite well with the BLEU metric on a corpus level but does not have its drawbacks for our per sentence reward objective." Note: The initial implementation only allowed a single reference, but now a list of references is required (which is consistent with bleu_score.sentence_bleu()). 
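    A small worked example: the hypothesis below differs from its single
    reference by one word, so 9 of the 18 n-grams of orders 1 to 4 on each
    side match and the score is 9/18:

    >>> ref = 'the cat is on the mat'.split()
    >>> hyp = 'the cat sat on the mat'.split()
    >>> sentence_gleu([ref], hyp)
    0.5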
The infamous "the the the ... " example >>> ref = 'the cat is on the mat'.split() >>> hyp = 'the the the the the the the'.split() >>> sentence_gleu([ref], hyp) # doctest: +ELLIPSIS 0.0909... An example to evaluate normal machine translation outputs >>> ref1 = str('It is a guide to action that ensures that the military ' ... 'will forever heed Party commands').split() >>> hyp1 = str('It is a guide to action which ensures that the military ' ... 'always obeys the commands of the party').split() >>> hyp2 = str('It is to insure the troops forever hearing the activity ' ... 'guidebook that party direct').split() >>> sentence_gleu([ref1], hyp1) # doctest: +ELLIPSIS 0.4393... >>> sentence_gleu([ref1], hyp2) # doctest: +ELLIPSIS 0.1206... :param references: a list of reference sentences :type references: list(list(str)) :param hypothesis: a hypothesis sentence :type hypothesis: list(str) :param min_len: The minimum order of n-gram this function should extract. :type min_len: int :param max_len: The maximum order of n-gram this function should extract. :type max_len: int :return: the sentence level GLEU score. :rtype: float """ return corpus_gleu([references], [hypothesis], min_len=min_len, max_len=max_len) def corpus_gleu(list_of_references, hypotheses, min_len=1, max_len=4): """ Calculate a single corpus-level GLEU score (aka. system-level GLEU) for all the hypotheses and their respective references. Instead of averaging the sentence level GLEU scores (i.e. macro-average precision), Wu et al. (2016) sum up the matching tokens and the max of hypothesis and reference tokens for each sentence, then compute using the aggregate values. From Mike Schuster (via email): "For the corpus, we just add up the two statistics n_match and n_all = max(n_all_output, n_all_target) for all sentences, then calculate gleu_score = n_match / n_all, so it is not just a mean of the sentence gleu scores (in our case, longer sentences count more, which I think makes sense as they are more difficult to translate)." >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', ... 'ensures', 'that', 'the', 'military', 'always', ... 'obeys', 'the', 'commands', 'of', 'the', 'party'] >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', ... 'ensures', 'that', 'the', 'military', 'will', 'forever', ... 'heed', 'Party', 'commands'] >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which', ... 'guarantees', 'the', 'military', 'forces', 'always', ... 'being', 'under', 'the', 'command', 'of', 'the', 'Party'] >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', ... 'army', 'always', 'to', 'heed', 'the', 'directions', ... 'of', 'the', 'party'] >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was', ... 'interested', 'in', 'world', 'history'] >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history', ... 'because', 'he', 'read', 'the', 'book'] >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]] >>> hypotheses = [hyp1, hyp2] >>> corpus_gleu(list_of_references, hypotheses) # doctest: +ELLIPSIS 0.5673... The example below show that corpus_gleu() is different from averaging sentence_gleu() for hypotheses >>> score1 = sentence_gleu([ref1a], hyp1) >>> score2 = sentence_gleu([ref2a], hyp2) >>> (score1 + score2) / 2 # doctest: +ELLIPSIS 0.6144... :param list_of_references: a list of reference sentences, w.r.t. 
hypotheses :type list_of_references: list(list(list(str))) :param hypotheses: a list of hypothesis sentences :type hypotheses: list(list(str)) :param min_len: The minimum order of n-gram this function should extract. :type min_len: int :param max_len: The maximum order of n-gram this function should extract. :type max_len: int :return: The corpus-level GLEU score. :rtype: float """ # sanity check assert len(list_of_references) == len( hypotheses ), "The number of hypotheses and their reference(s) should be the same" # sum matches and max-token-lengths over all sentences corpus_n_match = 0 corpus_n_all = 0 for references, hypothesis in zip(list_of_references, hypotheses): hyp_ngrams = Counter(everygrams(hypothesis, min_len, max_len)) tpfp = sum(hyp_ngrams.values()) # True positives + False positives. hyp_counts = [] for reference in references: ref_ngrams = Counter(everygrams(reference, min_len, max_len)) tpfn = sum(ref_ngrams.values()) # True positives + False negatives. overlap_ngrams = ref_ngrams & hyp_ngrams tp = sum(overlap_ngrams.values()) # True positives. # While GLEU is defined as the minimum of precision and # recall, we can reduce the number of division operations by one by # instead finding the maximum of the denominators for the precision # and recall formulae, since the numerators are the same: # precision = tp / tpfp # recall = tp / tpfn # gleu_score = min(precision, recall) == tp / max(tpfp, tpfn) n_all = max(tpfp, tpfn) if n_all > 0: hyp_counts.append((tp, n_all)) # use the reference yielding the highest score if hyp_counts: n_match, n_all = max(hyp_counts, key=lambda hc: hc[0] / hc[1]) corpus_n_match += n_match corpus_n_all += n_all # corner case: empty corpus or empty references---don't divide by zero! if corpus_n_all == 0: gleu_score = 0.0 else: gleu_score = corpus_n_match / corpus_n_all return gleu_score nltk-3.7/nltk/translate/ibm1.py000066400000000000000000000220331420073152400165000ustar00rootroot00000000000000# Natural Language Toolkit: IBM Model 1 # # Copyright (C) 2001-2013 NLTK Project # Author: Chin Yee Lee # Hengfeng Li # Ruxin Hou # Calvin Tanujaya Lim # Based on earlier version by: # Will Zhang # Guan Gui # URL: # For license information, see LICENSE.TXT """ Lexical translation model that ignores word order. In IBM Model 1, word order is ignored for simplicity. As long as the word alignments are equivalent, it doesn't matter where the word occurs in the source or target sentence. Thus, the following three alignments are equally likely:: Source: je mange du jambon Target: i eat some ham Alignment: (0,0) (1,1) (2,2) (3,3) Source: je mange du jambon Target: some ham eat i Alignment: (0,2) (1,3) (2,1) (3,1) Source: du jambon je mange Target: eat i some ham Alignment: (0,3) (1,2) (2,0) (3,1) Note that an alignment is represented here as (word_index_in_target, word_index_in_source). The EM algorithm used in Model 1 is: :E step: In the training data, count how many times a source language word is translated into a target language word, weighted by the prior probability of the translation. :M step: Estimate the new probability of translation based on the counts from the Expectation step. Notations --------- :i: Position in the source sentence Valid values are 0 (for NULL), 1, 2, ..., length of source sentence :j: Position in the target sentence Valid values are 1, 2, ..., length of target sentence :s: A word in the source language :t: A word in the target language References ---------- Philipp Koehn. 2010. Statistical Machine Translation. 
Cambridge University Press, New York. Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and Robert L. Mercer. 1993. The Mathematics of Statistical Machine Translation: Parameter Estimation. Computational Linguistics, 19 (2), 263-311. """ import warnings from collections import defaultdict from nltk.translate import AlignedSent, Alignment, IBMModel from nltk.translate.ibm_model import Counts class IBMModel1(IBMModel): """ Lexical translation model that ignores word order >>> bitext = [] >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small'])) >>> bitext.append(AlignedSent(['das', 'haus', 'ist', 'ja', 'groß'], ['the', 'house', 'is', 'big'])) >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small'])) >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house'])) >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book'])) >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book'])) >>> ibm1 = IBMModel1(bitext, 5) >>> print(ibm1.translation_table['buch']['book']) 0.889... >>> print(ibm1.translation_table['das']['book']) 0.061... >>> print(ibm1.translation_table['buch'][None]) 0.113... >>> print(ibm1.translation_table['ja'][None]) 0.072... >>> test_sentence = bitext[2] >>> test_sentence.words ['das', 'buch', 'ist', 'ja', 'klein'] >>> test_sentence.mots ['the', 'book', 'is', 'small'] >>> test_sentence.alignment Alignment([(0, 0), (1, 1), (2, 2), (3, 2), (4, 3)]) """ def __init__(self, sentence_aligned_corpus, iterations, probability_tables=None): """ Train on ``sentence_aligned_corpus`` and create a lexical translation model. Translation direction is from ``AlignedSent.mots`` to ``AlignedSent.words``. :param sentence_aligned_corpus: Sentence-aligned parallel corpus :type sentence_aligned_corpus: list(AlignedSent) :param iterations: Number of iterations to run training algorithm :type iterations: int :param probability_tables: Optional. Use this to pass in custom probability values. If not specified, probabilities will be set to a uniform distribution, or some other sensible value. If specified, the following entry must be present: ``translation_table``. See ``IBMModel`` for the type and purpose of this table. :type probability_tables: dict[str]: object """ super().__init__(sentence_aligned_corpus) if probability_tables is None: self.set_uniform_probabilities(sentence_aligned_corpus) else: # Set user-defined probabilities self.translation_table = probability_tables["translation_table"] for n in range(0, iterations): self.train(sentence_aligned_corpus) self.align_all(sentence_aligned_corpus) def set_uniform_probabilities(self, sentence_aligned_corpus): initial_prob = 1 / len(self.trg_vocab) if initial_prob < IBMModel.MIN_PROB: warnings.warn( "Target language vocabulary is too large (" + str(len(self.trg_vocab)) + " words). " "Results may be less accurate." 
) for t in self.trg_vocab: self.translation_table[t] = defaultdict(lambda: initial_prob) def train(self, parallel_corpus): counts = Counts() for aligned_sentence in parallel_corpus: trg_sentence = aligned_sentence.words src_sentence = [None] + aligned_sentence.mots # E step (a): Compute normalization factors to weigh counts total_count = self.prob_all_alignments(src_sentence, trg_sentence) # E step (b): Collect counts for t in trg_sentence: for s in src_sentence: count = self.prob_alignment_point(s, t) normalized_count = count / total_count[t] counts.t_given_s[t][s] += normalized_count counts.any_t_given_s[s] += normalized_count # M step: Update probabilities with maximum likelihood estimate self.maximize_lexical_translation_probabilities(counts) def prob_all_alignments(self, src_sentence, trg_sentence): """ Computes the probability of all possible word alignments, expressed as a marginal distribution over target words t Each entry in the return value represents the contribution to the total alignment probability by the target word t. To obtain probability(alignment | src_sentence, trg_sentence), simply sum the entries in the return value. :return: Probability of t for all s in ``src_sentence`` :rtype: dict(str): float """ alignment_prob_for_t = defaultdict(lambda: 0.0) for t in trg_sentence: for s in src_sentence: alignment_prob_for_t[t] += self.prob_alignment_point(s, t) return alignment_prob_for_t def prob_alignment_point(self, s, t): """ Probability that word ``t`` in the target sentence is aligned to word ``s`` in the source sentence """ return self.translation_table[t][s] def prob_t_a_given_s(self, alignment_info): """ Probability of target sentence and an alignment given the source sentence """ prob = 1.0 for j, i in enumerate(alignment_info.alignment): if j == 0: continue # skip the dummy zeroeth element trg_word = alignment_info.trg_sentence[j] src_word = alignment_info.src_sentence[i] prob *= self.translation_table[trg_word][src_word] return max(prob, IBMModel.MIN_PROB) def align_all(self, parallel_corpus): for sentence_pair in parallel_corpus: self.align(sentence_pair) def align(self, sentence_pair): """ Determines the best word alignment for one sentence pair from the corpus that the model was trained on. The best alignment will be set in ``sentence_pair`` when the method returns. In contrast with the internal implementation of IBM models, the word indices in the ``Alignment`` are zero- indexed, not one-indexed. :param sentence_pair: A sentence in the source language and its counterpart sentence in the target language :type sentence_pair: AlignedSent """ best_alignment = [] for j, trg_word in enumerate(sentence_pair.words): # Initialize trg_word to align with the NULL token best_prob = max(self.translation_table[trg_word][None], IBMModel.MIN_PROB) best_alignment_point = None for i, src_word in enumerate(sentence_pair.mots): align_prob = self.translation_table[trg_word][src_word] if align_prob >= best_prob: # prefer newer word in case of tie best_prob = align_prob best_alignment_point = i best_alignment.append((j, best_alignment_point)) sentence_pair.alignment = Alignment(best_alignment) nltk-3.7/nltk/translate/ibm2.py000066400000000000000000000277131420073152400165130ustar00rootroot00000000000000# Natural Language Toolkit: IBM Model 2 # # Copyright (C) 2001-2013 NLTK Project # Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim # URL: # For license information, see LICENSE.TXT """ Lexical translation model that considers word order. 
IBM Model 2 improves on Model 1 by accounting for word order. An alignment probability is introduced, a(i | j,l,m), which predicts a source word position, given its aligned target word's position. The EM algorithm used in Model 2 is: :E step: In the training data, collect counts, weighted by prior probabilities. - (a) count how many times a source language word is translated into a target language word - (b) count how many times a particular position in the source sentence is aligned to a particular position in the target sentence :M step: Estimate new probabilities based on the counts from the E step Notations --------- :i: Position in the source sentence Valid values are 0 (for NULL), 1, 2, ..., length of source sentence :j: Position in the target sentence Valid values are 1, 2, ..., length of target sentence :l: Number of words in the source sentence, excluding NULL :m: Number of words in the target sentence :s: A word in the source language :t: A word in the target language References ---------- Philipp Koehn. 2010. Statistical Machine Translation. Cambridge University Press, New York. Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and Robert L. Mercer. 1993. The Mathematics of Statistical Machine Translation: Parameter Estimation. Computational Linguistics, 19 (2), 263-311. """ import warnings from collections import defaultdict from nltk.translate import AlignedSent, Alignment, IBMModel, IBMModel1 from nltk.translate.ibm_model import Counts class IBMModel2(IBMModel): """ Lexical translation model that considers word order >>> bitext = [] >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small'])) >>> bitext.append(AlignedSent(['das', 'haus', 'ist', 'ja', 'groß'], ['the', 'house', 'is', 'big'])) >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small'])) >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house'])) >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book'])) >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book'])) >>> ibm2 = IBMModel2(bitext, 5) >>> print(round(ibm2.translation_table['buch']['book'], 3)) 1.0 >>> print(round(ibm2.translation_table['das']['book'], 3)) 0.0 >>> print(round(ibm2.translation_table['buch'][None], 3)) 0.0 >>> print(round(ibm2.translation_table['ja'][None], 3)) 0.0 >>> print(ibm2.alignment_table[1][1][2][2]) 0.938... >>> print(round(ibm2.alignment_table[1][2][2][2], 3)) 0.0 >>> print(round(ibm2.alignment_table[2][2][4][5], 3)) 1.0 >>> test_sentence = bitext[2] >>> test_sentence.words ['das', 'buch', 'ist', 'ja', 'klein'] >>> test_sentence.mots ['the', 'book', 'is', 'small'] >>> test_sentence.alignment Alignment([(0, 0), (1, 1), (2, 2), (3, 2), (4, 3)]) """ def __init__(self, sentence_aligned_corpus, iterations, probability_tables=None): """ Train on ``sentence_aligned_corpus`` and create a lexical translation model and an alignment model. Translation direction is from ``AlignedSent.mots`` to ``AlignedSent.words``. :param sentence_aligned_corpus: Sentence-aligned parallel corpus :type sentence_aligned_corpus: list(AlignedSent) :param iterations: Number of iterations to run training algorithm :type iterations: int :param probability_tables: Optional. Use this to pass in custom probability values. If not specified, probabilities will be set to a uniform distribution, or some other sensible value. If specified, all the following entries must be present: ``translation_table``, ``alignment_table``. 
See ``IBMModel`` for the type and purpose of these tables. :type probability_tables: dict[str]: object """ super().__init__(sentence_aligned_corpus) if probability_tables is None: # Get translation probabilities from IBM Model 1 # Run more iterations of training for Model 1, since it is # faster than Model 2 ibm1 = IBMModel1(sentence_aligned_corpus, 2 * iterations) self.translation_table = ibm1.translation_table self.set_uniform_probabilities(sentence_aligned_corpus) else: # Set user-defined probabilities self.translation_table = probability_tables["translation_table"] self.alignment_table = probability_tables["alignment_table"] for n in range(0, iterations): self.train(sentence_aligned_corpus) self.align_all(sentence_aligned_corpus) def set_uniform_probabilities(self, sentence_aligned_corpus): # a(i | j,l,m) = 1 / (l+1) for all i, j, l, m l_m_combinations = set() for aligned_sentence in sentence_aligned_corpus: l = len(aligned_sentence.mots) m = len(aligned_sentence.words) if (l, m) not in l_m_combinations: l_m_combinations.add((l, m)) initial_prob = 1 / (l + 1) if initial_prob < IBMModel.MIN_PROB: warnings.warn( "A source sentence is too long (" + str(l) + " words). Results may be less accurate." ) for i in range(0, l + 1): for j in range(1, m + 1): self.alignment_table[i][j][l][m] = initial_prob def train(self, parallel_corpus): counts = Model2Counts() for aligned_sentence in parallel_corpus: src_sentence = [None] + aligned_sentence.mots trg_sentence = ["UNUSED"] + aligned_sentence.words # 1-indexed l = len(aligned_sentence.mots) m = len(aligned_sentence.words) # E step (a): Compute normalization factors to weigh counts total_count = self.prob_all_alignments(src_sentence, trg_sentence) # E step (b): Collect counts for j in range(1, m + 1): t = trg_sentence[j] for i in range(0, l + 1): s = src_sentence[i] count = self.prob_alignment_point(i, j, src_sentence, trg_sentence) normalized_count = count / total_count[t] counts.update_lexical_translation(normalized_count, s, t) counts.update_alignment(normalized_count, i, j, l, m) # M step: Update probabilities with maximum likelihood estimates self.maximize_lexical_translation_probabilities(counts) self.maximize_alignment_probabilities(counts) def maximize_alignment_probabilities(self, counts): MIN_PROB = IBMModel.MIN_PROB for i, j_s in counts.alignment.items(): for j, src_sentence_lengths in j_s.items(): for l, trg_sentence_lengths in src_sentence_lengths.items(): for m in trg_sentence_lengths: estimate = ( counts.alignment[i][j][l][m] / counts.alignment_for_any_i[j][l][m] ) self.alignment_table[i][j][l][m] = max(estimate, MIN_PROB) def prob_all_alignments(self, src_sentence, trg_sentence): """ Computes the probability of all possible word alignments, expressed as a marginal distribution over target words t Each entry in the return value represents the contribution to the total alignment probability by the target word t. To obtain probability(alignment | src_sentence, trg_sentence), simply sum the entries in the return value. 
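        Concretely, each target word t at position j adds
        sum over i of translation_table[t][s_i] * alignment_table[i][j][l][m]
        to its entry, where i ranges over the source positions (0 is NULL).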
:return: Probability of t for all s in ``src_sentence`` :rtype: dict(str): float """ alignment_prob_for_t = defaultdict(lambda: 0.0) for j in range(1, len(trg_sentence)): t = trg_sentence[j] for i in range(0, len(src_sentence)): alignment_prob_for_t[t] += self.prob_alignment_point( i, j, src_sentence, trg_sentence ) return alignment_prob_for_t def prob_alignment_point(self, i, j, src_sentence, trg_sentence): """ Probability that position j in ``trg_sentence`` is aligned to position i in the ``src_sentence`` """ l = len(src_sentence) - 1 m = len(trg_sentence) - 1 s = src_sentence[i] t = trg_sentence[j] return self.translation_table[t][s] * self.alignment_table[i][j][l][m] def prob_t_a_given_s(self, alignment_info): """ Probability of target sentence and an alignment given the source sentence """ prob = 1.0 l = len(alignment_info.src_sentence) - 1 m = len(alignment_info.trg_sentence) - 1 for j, i in enumerate(alignment_info.alignment): if j == 0: continue # skip the dummy zeroeth element trg_word = alignment_info.trg_sentence[j] src_word = alignment_info.src_sentence[i] prob *= ( self.translation_table[trg_word][src_word] * self.alignment_table[i][j][l][m] ) return max(prob, IBMModel.MIN_PROB) def align_all(self, parallel_corpus): for sentence_pair in parallel_corpus: self.align(sentence_pair) def align(self, sentence_pair): """ Determines the best word alignment for one sentence pair from the corpus that the model was trained on. The best alignment will be set in ``sentence_pair`` when the method returns. In contrast with the internal implementation of IBM models, the word indices in the ``Alignment`` are zero- indexed, not one-indexed. :param sentence_pair: A sentence in the source language and its counterpart sentence in the target language :type sentence_pair: AlignedSent """ best_alignment = [] l = len(sentence_pair.mots) m = len(sentence_pair.words) for j, trg_word in enumerate(sentence_pair.words): # Initialize trg_word to align with the NULL token best_prob = ( self.translation_table[trg_word][None] * self.alignment_table[0][j + 1][l][m] ) best_prob = max(best_prob, IBMModel.MIN_PROB) best_alignment_point = None for i, src_word in enumerate(sentence_pair.mots): align_prob = ( self.translation_table[trg_word][src_word] * self.alignment_table[i + 1][j + 1][l][m] ) if align_prob >= best_prob: best_prob = align_prob best_alignment_point = i best_alignment.append((j, best_alignment_point)) sentence_pair.alignment = Alignment(best_alignment) class Model2Counts(Counts): """ Data object to store counts of various parameters during training. Includes counts for alignment. """ def __init__(self): super().__init__() self.alignment = defaultdict( lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.0))) ) self.alignment_for_any_i = defaultdict( lambda: defaultdict(lambda: defaultdict(lambda: 0.0)) ) def update_lexical_translation(self, count, s, t): self.t_given_s[t][s] += count self.any_t_given_s[s] += count def update_alignment(self, count, i, j, l, m): self.alignment[i][j][l][m] += count self.alignment_for_any_i[j][l][m] += count nltk-3.7/nltk/translate/ibm3.py000066400000000000000000000327511420073152400165120ustar00rootroot00000000000000# Natural Language Toolkit: IBM Model 3 # # Copyright (C) 2001-2013 NLTK Project # Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim # URL: # For license information, see LICENSE.TXT """ Translation model that considers how a word can be aligned to multiple words in another language. 
IBM Model 3 improves on Model 2 by directly modeling the phenomenon where a word in one language may be translated into zero or more words in another. This is expressed by the fertility probability, n(phi | source word). If a source word translates into more than one word, it is possible to generate sentences that have the same alignment in multiple ways. This is modeled by a distortion step. The distortion probability, d(j|i,l,m), predicts a target word position, given its aligned source word's position. The distortion probability replaces the alignment probability of Model 2. The fertility probability is not applicable for NULL. Target words that align to NULL are assumed to be distributed uniformly in the target sentence. The existence of these words is modeled by p1, the probability that a target word produced by a real source word requires another target word that is produced by NULL. The EM algorithm used in Model 3 is: :E step: In the training data, collect counts, weighted by prior probabilities. - (a) count how many times a source language word is translated into a target language word - (b) count how many times a particular position in the target sentence is aligned to a particular position in the source sentence - (c) count how many times a source word is aligned to phi number of target words - (d) count how many times NULL is aligned to a target word :M step: Estimate new probabilities based on the counts from the E step Because there are too many possible alignments, only the most probable ones are considered. First, the best alignment is determined using prior probabilities. Then, a hill climbing approach is used to find other good candidates. Notations --------- :i: Position in the source sentence Valid values are 0 (for NULL), 1, 2, ..., length of source sentence :j: Position in the target sentence Valid values are 1, 2, ..., length of target sentence :l: Number of words in the source sentence, excluding NULL :m: Number of words in the target sentence :s: A word in the source language :t: A word in the target language :phi: Fertility, the number of target words produced by a source word :p1: Probability that a target word produced by a source word is accompanied by another target word that is aligned to NULL :p0: 1 - p1 References ---------- Philipp Koehn. 2010. Statistical Machine Translation. Cambridge University Press, New York. Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and Robert L. Mercer. 1993. The Mathematics of Statistical Machine Translation: Parameter Estimation. Computational Linguistics, 19 (2), 263-311. 
""" import warnings from collections import defaultdict from math import factorial from nltk.translate import AlignedSent, Alignment, IBMModel, IBMModel2 from nltk.translate.ibm_model import Counts class IBMModel3(IBMModel): """ Translation model that considers how a word can be aligned to multiple words in another language >>> bitext = [] >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small'])) >>> bitext.append(AlignedSent(['das', 'haus', 'war', 'ja', 'groß'], ['the', 'house', 'was', 'big'])) >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small'])) >>> bitext.append(AlignedSent(['ein', 'haus', 'ist', 'klein'], ['a', 'house', 'is', 'small'])) >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house'])) >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book'])) >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book'])) >>> bitext.append(AlignedSent(['ich', 'fasse', 'das', 'buch', 'zusammen'], ['i', 'summarize', 'the', 'book'])) >>> bitext.append(AlignedSent(['fasse', 'zusammen'], ['summarize'])) >>> ibm3 = IBMModel3(bitext, 5) >>> print(round(ibm3.translation_table['buch']['book'], 3)) 1.0 >>> print(round(ibm3.translation_table['das']['book'], 3)) 0.0 >>> print(round(ibm3.translation_table['ja'][None], 3)) 1.0 >>> print(round(ibm3.distortion_table[1][1][2][2], 3)) 1.0 >>> print(round(ibm3.distortion_table[1][2][2][2], 3)) 0.0 >>> print(round(ibm3.distortion_table[2][2][4][5], 3)) 0.75 >>> print(round(ibm3.fertility_table[2]['summarize'], 3)) 1.0 >>> print(round(ibm3.fertility_table[1]['book'], 3)) 1.0 >>> print(ibm3.p1) 0.054... >>> test_sentence = bitext[2] >>> test_sentence.words ['das', 'buch', 'ist', 'ja', 'klein'] >>> test_sentence.mots ['the', 'book', 'is', 'small'] >>> test_sentence.alignment Alignment([(0, 0), (1, 1), (2, 2), (3, None), (4, 3)]) """ def __init__(self, sentence_aligned_corpus, iterations, probability_tables=None): """ Train on ``sentence_aligned_corpus`` and create a lexical translation model, a distortion model, a fertility model, and a model for generating NULL-aligned words. Translation direction is from ``AlignedSent.mots`` to ``AlignedSent.words``. :param sentence_aligned_corpus: Sentence-aligned parallel corpus :type sentence_aligned_corpus: list(AlignedSent) :param iterations: Number of iterations to run training algorithm :type iterations: int :param probability_tables: Optional. Use this to pass in custom probability values. If not specified, probabilities will be set to a uniform distribution, or some other sensible value. If specified, all the following entries must be present: ``translation_table``, ``alignment_table``, ``fertility_table``, ``p1``, ``distortion_table``. See ``IBMModel`` for the type and purpose of these tables. 
:type probability_tables: dict[str]: object """ super().__init__(sentence_aligned_corpus) self.reset_probabilities() if probability_tables is None: # Get translation and alignment probabilities from IBM Model 2 ibm2 = IBMModel2(sentence_aligned_corpus, iterations) self.translation_table = ibm2.translation_table self.alignment_table = ibm2.alignment_table self.set_uniform_probabilities(sentence_aligned_corpus) else: # Set user-defined probabilities self.translation_table = probability_tables["translation_table"] self.alignment_table = probability_tables["alignment_table"] self.fertility_table = probability_tables["fertility_table"] self.p1 = probability_tables["p1"] self.distortion_table = probability_tables["distortion_table"] for n in range(0, iterations): self.train(sentence_aligned_corpus) def reset_probabilities(self): super().reset_probabilities() self.distortion_table = defaultdict( lambda: defaultdict( lambda: defaultdict(lambda: defaultdict(lambda: self.MIN_PROB)) ) ) """ dict[int][int][int][int]: float. Probability(j | i,l,m). Values accessed as ``distortion_table[j][i][l][m]``. """ def set_uniform_probabilities(self, sentence_aligned_corpus): # d(j | i,l,m) = 1 / m for all i, j, l, m l_m_combinations = set() for aligned_sentence in sentence_aligned_corpus: l = len(aligned_sentence.mots) m = len(aligned_sentence.words) if (l, m) not in l_m_combinations: l_m_combinations.add((l, m)) initial_prob = 1 / m if initial_prob < IBMModel.MIN_PROB: warnings.warn( "A target sentence is too long (" + str(m) + " words). Results may be less accurate." ) for j in range(1, m + 1): for i in range(0, l + 1): self.distortion_table[j][i][l][m] = initial_prob # simple initialization, taken from GIZA++ self.fertility_table[0] = defaultdict(lambda: 0.2) self.fertility_table[1] = defaultdict(lambda: 0.65) self.fertility_table[2] = defaultdict(lambda: 0.1) self.fertility_table[3] = defaultdict(lambda: 0.04) MAX_FERTILITY = 10 initial_fert_prob = 0.01 / (MAX_FERTILITY - 4) for phi in range(4, MAX_FERTILITY): self.fertility_table[phi] = defaultdict(lambda: initial_fert_prob) self.p1 = 0.5 def train(self, parallel_corpus): counts = Model3Counts() for aligned_sentence in parallel_corpus: l = len(aligned_sentence.mots) m = len(aligned_sentence.words) # Sample the alignment space sampled_alignments, best_alignment = self.sample(aligned_sentence) # Record the most probable alignment aligned_sentence.alignment = Alignment( best_alignment.zero_indexed_alignment() ) # E step (a): Compute normalization factors to weigh counts total_count = self.prob_of_alignments(sampled_alignments) # E step (b): Collect counts for alignment_info in sampled_alignments: count = self.prob_t_a_given_s(alignment_info) normalized_count = count / total_count for j in range(1, m + 1): counts.update_lexical_translation( normalized_count, alignment_info, j ) counts.update_distortion(normalized_count, alignment_info, j, l, m) counts.update_null_generation(normalized_count, alignment_info) counts.update_fertility(normalized_count, alignment_info) # M step: Update probabilities with maximum likelihood estimates # If any probability is less than MIN_PROB, clamp it to MIN_PROB existing_alignment_table = self.alignment_table self.reset_probabilities() self.alignment_table = existing_alignment_table # don't retrain self.maximize_lexical_translation_probabilities(counts) self.maximize_distortion_probabilities(counts) self.maximize_fertility_probabilities(counts) self.maximize_null_generation_probabilities(counts) def 
maximize_distortion_probabilities(self, counts): MIN_PROB = IBMModel.MIN_PROB for j, i_s in counts.distortion.items(): for i, src_sentence_lengths in i_s.items(): for l, trg_sentence_lengths in src_sentence_lengths.items(): for m in trg_sentence_lengths: estimate = ( counts.distortion[j][i][l][m] / counts.distortion_for_any_j[i][l][m] ) self.distortion_table[j][i][l][m] = max(estimate, MIN_PROB) def prob_t_a_given_s(self, alignment_info): """ Probability of target sentence and an alignment given the source sentence """ src_sentence = alignment_info.src_sentence trg_sentence = alignment_info.trg_sentence l = len(src_sentence) - 1 # exclude NULL m = len(trg_sentence) - 1 p1 = self.p1 p0 = 1 - p1 probability = 1.0 MIN_PROB = IBMModel.MIN_PROB # Combine NULL insertion probability null_fertility = alignment_info.fertility_of_i(0) probability *= pow(p1, null_fertility) * pow(p0, m - 2 * null_fertility) if probability < MIN_PROB: return MIN_PROB # Compute combination (m - null_fertility) choose null_fertility for i in range(1, null_fertility + 1): probability *= (m - null_fertility - i + 1) / i if probability < MIN_PROB: return MIN_PROB # Combine fertility probabilities for i in range(1, l + 1): fertility = alignment_info.fertility_of_i(i) probability *= ( factorial(fertility) * self.fertility_table[fertility][src_sentence[i]] ) if probability < MIN_PROB: return MIN_PROB # Combine lexical and distortion probabilities for j in range(1, m + 1): t = trg_sentence[j] i = alignment_info.alignment[j] s = src_sentence[i] probability *= ( self.translation_table[t][s] * self.distortion_table[j][i][l][m] ) if probability < MIN_PROB: return MIN_PROB return probability class Model3Counts(Counts): """ Data object to store counts of various parameters during training. Includes counts for distortion. """ def __init__(self): super().__init__() self.distortion = defaultdict( lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.0))) ) self.distortion_for_any_j = defaultdict( lambda: defaultdict(lambda: defaultdict(lambda: 0.0)) ) def update_distortion(self, count, alignment_info, j, l, m): i = alignment_info.alignment[j] self.distortion[j][i][l][m] += count self.distortion_for_any_j[i][l][m] += count nltk-3.7/nltk/translate/ibm4.py000066400000000000000000000474541420073152400165210ustar00rootroot00000000000000# Natural Language Toolkit: IBM Model 4 # # Copyright (C) 2001-2022 NLTK Project # Author: Tah Wei Hoon # URL: # For license information, see LICENSE.TXT """ Translation model that reorders output words based on their type and distance from other related words in the output sentence. IBM Model 4 improves the distortion model of Model 3, motivated by the observation that certain words tend to be re-ordered in a predictable way relative to one another. For example, in English usually has its order flipped as in French. Model 4 requires words in the source and target vocabularies to be categorized into classes. This can be linguistically driven, like parts of speech (adjective, nouns, prepositions, etc). Word classes can also be obtained by statistical methods. The original IBM Model 4 uses an information theoretic approach to group words into 50 classes for each vocabulary. Terminology ----------- :Cept: A source word with non-zero fertility i.e. aligned to one or more target words. :Tablet: The set of target word(s) aligned to a cept. :Head of cept: The first word of the tablet of that cept. :Center of cept: The average position of the words in that cept's tablet. 
If the value is not an integer, the ceiling is taken. For example, for a tablet with words in positions 2, 5, 6 in the target sentence, the center of the corresponding cept is ceil((2 + 5 + 6) / 3) = 5 :Displacement: For a head word, defined as (position of head word - position of previous cept's center). Can be positive or negative. For a non-head word, defined as (position of non-head word - position of previous word in the same tablet). Always positive, because successive words in a tablet are assumed to appear to the right of the previous word. In contrast to Model 3 which reorders words in a tablet independently of other words, Model 4 distinguishes between three cases. 1. Words generated by NULL are distributed uniformly. 2. For a head word t, its position is modeled by the probability d_head(displacement | word_class_s(s),word_class_t(t)), where s is the previous cept, and word_class_s and word_class_t maps s and t to a source and target language word class respectively. 3. For a non-head word t, its position is modeled by the probability d_non_head(displacement | word_class_t(t)) The EM algorithm used in Model 4 is: :E step: In the training data, collect counts, weighted by prior probabilities. - (a) count how many times a source language word is translated into a target language word - (b) for a particular word class, count how many times a head word is located at a particular displacement from the previous cept's center - (c) for a particular word class, count how many times a non-head word is located at a particular displacement from the previous target word - (d) count how many times a source word is aligned to phi number of target words - (e) count how many times NULL is aligned to a target word :M step: Estimate new probabilities based on the counts from the E step Like Model 3, there are too many possible alignments to consider. Thus, a hill climbing approach is used to sample good candidates. Notations --------- :i: Position in the source sentence Valid values are 0 (for NULL), 1, 2, ..., length of source sentence :j: Position in the target sentence Valid values are 1, 2, ..., length of target sentence :l: Number of words in the source sentence, excluding NULL :m: Number of words in the target sentence :s: A word in the source language :t: A word in the target language :phi: Fertility, the number of target words produced by a source word :p1: Probability that a target word produced by a source word is accompanied by another target word that is aligned to NULL :p0: 1 - p1 :dj: Displacement, Δj References ---------- Philipp Koehn. 2010. Statistical Machine Translation. Cambridge University Press, New York. Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and Robert L. Mercer. 1993. The Mathematics of Statistical Machine Translation: Parameter Estimation. Computational Linguistics, 19 (2), 263-311. 
""" import warnings from collections import defaultdict from math import factorial from nltk.translate import AlignedSent, Alignment, IBMModel, IBMModel3 from nltk.translate.ibm_model import Counts, longest_target_sentence_length class IBMModel4(IBMModel): """ Translation model that reorders output words based on their type and their distance from other related words in the output sentence >>> bitext = [] >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small'])) >>> bitext.append(AlignedSent(['das', 'haus', 'war', 'ja', 'groß'], ['the', 'house', 'was', 'big'])) >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small'])) >>> bitext.append(AlignedSent(['ein', 'haus', 'ist', 'klein'], ['a', 'house', 'is', 'small'])) >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house'])) >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book'])) >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book'])) >>> bitext.append(AlignedSent(['ich', 'fasse', 'das', 'buch', 'zusammen'], ['i', 'summarize', 'the', 'book'])) >>> bitext.append(AlignedSent(['fasse', 'zusammen'], ['summarize'])) >>> src_classes = {'the': 0, 'a': 0, 'small': 1, 'big': 1, 'house': 2, 'book': 2, 'is': 3, 'was': 3, 'i': 4, 'summarize': 5 } >>> trg_classes = {'das': 0, 'ein': 0, 'haus': 1, 'buch': 1, 'klein': 2, 'groß': 2, 'ist': 3, 'war': 3, 'ja': 4, 'ich': 5, 'fasse': 6, 'zusammen': 6 } >>> ibm4 = IBMModel4(bitext, 5, src_classes, trg_classes) >>> print(round(ibm4.translation_table['buch']['book'], 3)) 1.0 >>> print(round(ibm4.translation_table['das']['book'], 3)) 0.0 >>> print(round(ibm4.translation_table['ja'][None], 3)) 1.0 >>> print(round(ibm4.head_distortion_table[1][0][1], 3)) 1.0 >>> print(round(ibm4.head_distortion_table[2][0][1], 3)) 0.0 >>> print(round(ibm4.non_head_distortion_table[3][6], 3)) 0.5 >>> print(round(ibm4.fertility_table[2]['summarize'], 3)) 1.0 >>> print(round(ibm4.fertility_table[1]['book'], 3)) 1.0 >>> print(ibm4.p1) 0.033... >>> test_sentence = bitext[2] >>> test_sentence.words ['das', 'buch', 'ist', 'ja', 'klein'] >>> test_sentence.mots ['the', 'book', 'is', 'small'] >>> test_sentence.alignment Alignment([(0, 0), (1, 1), (2, 2), (3, None), (4, 3)]) """ def __init__( self, sentence_aligned_corpus, iterations, source_word_classes, target_word_classes, probability_tables=None, ): """ Train on ``sentence_aligned_corpus`` and create a lexical translation model, distortion models, a fertility model, and a model for generating NULL-aligned words. Translation direction is from ``AlignedSent.mots`` to ``AlignedSent.words``. :param sentence_aligned_corpus: Sentence-aligned parallel corpus :type sentence_aligned_corpus: list(AlignedSent) :param iterations: Number of iterations to run training algorithm :type iterations: int :param source_word_classes: Lookup table that maps a source word to its word class, the latter represented by an integer id :type source_word_classes: dict[str]: int :param target_word_classes: Lookup table that maps a target word to its word class, the latter represented by an integer id :type target_word_classes: dict[str]: int :param probability_tables: Optional. Use this to pass in custom probability values. If not specified, probabilities will be set to a uniform distribution, or some other sensible value. If specified, all the following entries must be present: ``translation_table``, ``alignment_table``, ``fertility_table``, ``p1``, ``head_distortion_table``, ``non_head_distortion_table``. 
See ``IBMModel`` and ``IBMModel4`` for the type and purpose of these tables. :type probability_tables: dict[str]: object """ super().__init__(sentence_aligned_corpus) self.reset_probabilities() self.src_classes = source_word_classes self.trg_classes = target_word_classes if probability_tables is None: # Get probabilities from IBM model 3 ibm3 = IBMModel3(sentence_aligned_corpus, iterations) self.translation_table = ibm3.translation_table self.alignment_table = ibm3.alignment_table self.fertility_table = ibm3.fertility_table self.p1 = ibm3.p1 self.set_uniform_probabilities(sentence_aligned_corpus) else: # Set user-defined probabilities self.translation_table = probability_tables["translation_table"] self.alignment_table = probability_tables["alignment_table"] self.fertility_table = probability_tables["fertility_table"] self.p1 = probability_tables["p1"] self.head_distortion_table = probability_tables["head_distortion_table"] self.non_head_distortion_table = probability_tables[ "non_head_distortion_table" ] for n in range(0, iterations): self.train(sentence_aligned_corpus) def reset_probabilities(self): super().reset_probabilities() self.head_distortion_table = defaultdict( lambda: defaultdict(lambda: defaultdict(lambda: self.MIN_PROB)) ) """ dict[int][int][int]: float. Probability(displacement of head word | word class of previous cept,target word class). Values accessed as ``distortion_table[dj][src_class][trg_class]``. """ self.non_head_distortion_table = defaultdict( lambda: defaultdict(lambda: self.MIN_PROB) ) """ dict[int][int]: float. Probability(displacement of non-head word | target word class). Values accessed as ``distortion_table[dj][trg_class]``. """ def set_uniform_probabilities(self, sentence_aligned_corpus): """ Set distortion probabilities uniformly to 1 / cardinality of displacement values """ max_m = longest_target_sentence_length(sentence_aligned_corpus) # The maximum displacement is m-1, when a word is in the last # position m of the target sentence and the previously placed # word is in the first position. # Conversely, the minimum displacement is -(m-1). # Thus, the displacement range is (m-1) - (-(m-1)). Note that # displacement cannot be zero and is not included in the range. if max_m <= 1: initial_prob = IBMModel.MIN_PROB else: initial_prob = 1 / (2 * (max_m - 1)) if initial_prob < IBMModel.MIN_PROB: warnings.warn( "A target sentence is too long (" + str(max_m) + " words). Results may be less accurate." 
) for dj in range(1, max_m): self.head_distortion_table[dj] = defaultdict( lambda: defaultdict(lambda: initial_prob) ) self.head_distortion_table[-dj] = defaultdict( lambda: defaultdict(lambda: initial_prob) ) self.non_head_distortion_table[dj] = defaultdict(lambda: initial_prob) self.non_head_distortion_table[-dj] = defaultdict(lambda: initial_prob) def train(self, parallel_corpus): counts = Model4Counts() for aligned_sentence in parallel_corpus: m = len(aligned_sentence.words) # Sample the alignment space sampled_alignments, best_alignment = self.sample(aligned_sentence) # Record the most probable alignment aligned_sentence.alignment = Alignment( best_alignment.zero_indexed_alignment() ) # E step (a): Compute normalization factors to weigh counts total_count = self.prob_of_alignments(sampled_alignments) # E step (b): Collect counts for alignment_info in sampled_alignments: count = self.prob_t_a_given_s(alignment_info) normalized_count = count / total_count for j in range(1, m + 1): counts.update_lexical_translation( normalized_count, alignment_info, j ) counts.update_distortion( normalized_count, alignment_info, j, self.src_classes, self.trg_classes, ) counts.update_null_generation(normalized_count, alignment_info) counts.update_fertility(normalized_count, alignment_info) # M step: Update probabilities with maximum likelihood estimates # If any probability is less than MIN_PROB, clamp it to MIN_PROB existing_alignment_table = self.alignment_table self.reset_probabilities() self.alignment_table = existing_alignment_table # don't retrain self.maximize_lexical_translation_probabilities(counts) self.maximize_distortion_probabilities(counts) self.maximize_fertility_probabilities(counts) self.maximize_null_generation_probabilities(counts) def maximize_distortion_probabilities(self, counts): head_d_table = self.head_distortion_table for dj, src_classes in counts.head_distortion.items(): for s_cls, trg_classes in src_classes.items(): for t_cls in trg_classes: estimate = ( counts.head_distortion[dj][s_cls][t_cls] / counts.head_distortion_for_any_dj[s_cls][t_cls] ) head_d_table[dj][s_cls][t_cls] = max(estimate, IBMModel.MIN_PROB) non_head_d_table = self.non_head_distortion_table for dj, trg_classes in counts.non_head_distortion.items(): for t_cls in trg_classes: estimate = ( counts.non_head_distortion[dj][t_cls] / counts.non_head_distortion_for_any_dj[t_cls] ) non_head_d_table[dj][t_cls] = max(estimate, IBMModel.MIN_PROB) def prob_t_a_given_s(self, alignment_info): """ Probability of target sentence and an alignment given the source sentence """ return IBMModel4.model4_prob_t_a_given_s(alignment_info, self) @staticmethod # exposed for Model 5 to use def model4_prob_t_a_given_s(alignment_info, ibm_model): probability = 1.0 MIN_PROB = IBMModel.MIN_PROB def null_generation_term(): # Binomial distribution: B(m - null_fertility, p1) value = 1.0 p1 = ibm_model.p1 p0 = 1 - p1 null_fertility = alignment_info.fertility_of_i(0) m = len(alignment_info.trg_sentence) - 1 value *= pow(p1, null_fertility) * pow(p0, m - 2 * null_fertility) if value < MIN_PROB: return MIN_PROB # Combination: (m - null_fertility) choose null_fertility for i in range(1, null_fertility + 1): value *= (m - null_fertility - i + 1) / i return value def fertility_term(): value = 1.0 src_sentence = alignment_info.src_sentence for i in range(1, len(src_sentence)): fertility = alignment_info.fertility_of_i(i) value *= ( factorial(fertility) * ibm_model.fertility_table[fertility][src_sentence[i]] ) if value < MIN_PROB: return MIN_PROB return 
value def lexical_translation_term(j): t = alignment_info.trg_sentence[j] i = alignment_info.alignment[j] s = alignment_info.src_sentence[i] return ibm_model.translation_table[t][s] def distortion_term(j): t = alignment_info.trg_sentence[j] i = alignment_info.alignment[j] if i == 0: # case 1: t is aligned to NULL return 1.0 if alignment_info.is_head_word(j): # case 2: t is the first word of a tablet previous_cept = alignment_info.previous_cept(j) src_class = None if previous_cept is not None: previous_s = alignment_info.src_sentence[previous_cept] src_class = ibm_model.src_classes[previous_s] trg_class = ibm_model.trg_classes[t] dj = j - alignment_info.center_of_cept(previous_cept) return ibm_model.head_distortion_table[dj][src_class][trg_class] # case 3: t is a subsequent word of a tablet previous_position = alignment_info.previous_in_tablet(j) trg_class = ibm_model.trg_classes[t] dj = j - previous_position return ibm_model.non_head_distortion_table[dj][trg_class] # end nested functions # Abort computation whenever probability falls below MIN_PROB at # any point, since MIN_PROB can be considered as zero probability *= null_generation_term() if probability < MIN_PROB: return MIN_PROB probability *= fertility_term() if probability < MIN_PROB: return MIN_PROB for j in range(1, len(alignment_info.trg_sentence)): probability *= lexical_translation_term(j) if probability < MIN_PROB: return MIN_PROB probability *= distortion_term(j) if probability < MIN_PROB: return MIN_PROB return probability class Model4Counts(Counts): """ Data object to store counts of various parameters during training. Includes counts for distortion. """ def __init__(self): super().__init__() self.head_distortion = defaultdict( lambda: defaultdict(lambda: defaultdict(lambda: 0.0)) ) self.head_distortion_for_any_dj = defaultdict(lambda: defaultdict(lambda: 0.0)) self.non_head_distortion = defaultdict(lambda: defaultdict(lambda: 0.0)) self.non_head_distortion_for_any_dj = defaultdict(lambda: 0.0) def update_distortion(self, count, alignment_info, j, src_classes, trg_classes): i = alignment_info.alignment[j] t = alignment_info.trg_sentence[j] if i == 0: # case 1: t is aligned to NULL pass elif alignment_info.is_head_word(j): # case 2: t is the first word of a tablet previous_cept = alignment_info.previous_cept(j) if previous_cept is not None: previous_src_word = alignment_info.src_sentence[previous_cept] src_class = src_classes[previous_src_word] else: src_class = None trg_class = trg_classes[t] dj = j - alignment_info.center_of_cept(previous_cept) self.head_distortion[dj][src_class][trg_class] += count self.head_distortion_for_any_dj[src_class][trg_class] += count else: # case 3: t is a subsequent word of a tablet previous_j = alignment_info.previous_in_tablet(j) trg_class = trg_classes[t] dj = j - previous_j self.non_head_distortion[dj][trg_class] += count self.non_head_distortion_for_any_dj[trg_class] += count nltk-3.7/nltk/translate/ibm5.py000066400000000000000000000652271420073152400165200ustar00rootroot00000000000000# Natural Language Toolkit: IBM Model 5 # # Copyright (C) 2001-2022 NLTK Project # Author: Tah Wei Hoon # URL: # For license information, see LICENSE.TXT """ Translation model that keeps track of vacant positions in the target sentence to decide where to place translated words. Translation can be viewed as a process where each word in the source sentence is stepped through sequentially, generating translated words for each source word. 
The target sentence can be viewed as being made up of ``m`` empty slots initially, which gradually fill up as generated words are placed in them. Models 3 and 4 use distortion probabilities to decide how to place translated words. For simplicity, these models ignore the history of which slots have already been occupied with translated words. Consider the placement of the last translated word: there is only one empty slot left in the target sentence, so the distortion probability should be 1.0 for that position and 0.0 everywhere else. However, the distortion probabilities for Models 3 and 4 are set up such that all positions are under consideration. IBM Model 5 fixes this deficiency by accounting for occupied slots during translation. It introduces the vacancy function v(j), the number of vacancies up to, and including, position j in the target sentence. Terminology ----------- :Maximum vacancy: The number of valid slots that a word can be placed in. This is not necessarily the same as the number of vacant slots. For example, if a tablet contains more than one word, the head word cannot be placed at the last vacant slot because there will be no space for the other words in the tablet. The number of valid slots has to take into account the length of the tablet. Non-head words cannot be placed before the head word, so vacancies to the left of the head word are ignored. :Vacancy difference: For a head word: (v(j) - v(center of previous cept)) Can be positive or negative. For a non-head word: (v(j) - v(position of previously placed word)) Always positive, because successive words in a tablet are assumed to appear to the right of the previous word. Positioning of target words fall under three cases: 1. Words generated by NULL are distributed uniformly 2. For a head word t, its position is modeled by the probability v_head(dv | max_v,word_class_t(t)) 3. For a non-head word t, its position is modeled by the probability v_non_head(dv | max_v,word_class_t(t)) dv and max_v are defined differently for head and non-head words. The EM algorithm used in Model 5 is: :E step: In the training data, collect counts, weighted by prior probabilities. - (a) count how many times a source language word is translated into a target language word - (b) for a particular word class and maximum vacancy, count how many times a head word and the previous cept's center have a particular difference in number of vacancies - (b) for a particular word class and maximum vacancy, count how many times a non-head word and the previous target word have a particular difference in number of vacancies - (d) count how many times a source word is aligned to phi number of target words - (e) count how many times NULL is aligned to a target word :M step: Estimate new probabilities based on the counts from the E step Like Model 4, there are too many possible alignments to consider. Thus, a hill climbing approach is used to sample good candidates. In addition, pruning is used to weed out unlikely alignments based on Model 4 scores. 
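As an illustration of the vacancy function, here is a minimal sketch using
the ``Slots`` helper defined at the bottom of this module (the sentence
length and the occupied positions are invented for the example)::

    from nltk.translate.ibm5 import Slots

    slots = Slots(5)       # a 5-word target sentence, all positions vacant
    slots.occupy(2)        # a translated word is placed in position 2
    slots.occupy(4)        # another translated word is placed in position 4
    slots.vacancies_at(3)  # v(3) = 2, since positions 1 and 3 are vacant
    slots.vacancies_at(5)  # v(5) = 3, since positions 1, 3 and 5 are vacant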
Notations --------- :i: Position in the source sentence Valid values are 0 (for NULL), 1, 2, ..., length of source sentence :j: Position in the target sentence Valid values are 1, 2, ..., length of target sentence :l: Number of words in the source sentence, excluding NULL :m: Number of words in the target sentence :s: A word in the source language :t: A word in the target language :phi: Fertility, the number of target words produced by a source word :p1: Probability that a target word produced by a source word is accompanied by another target word that is aligned to NULL :p0: 1 - p1 :max_v: Maximum vacancy :dv: Vacancy difference, Δv The definition of v_head here differs from GIZA++, section 4.7 of [Brown et al., 1993], and [Koehn, 2010]. In the latter cases, v_head is v_head(v(j) | v(center of previous cept),max_v,word_class(t)). Here, we follow appendix B of [Brown et al., 1993] and combine v(j) with v(center of previous cept) to obtain dv: v_head(v(j) - v(center of previous cept) | max_v,word_class(t)). References ---------- Philipp Koehn. 2010. Statistical Machine Translation. Cambridge University Press, New York. Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and Robert L. Mercer. 1993. The Mathematics of Statistical Machine Translation: Parameter Estimation. Computational Linguistics, 19 (2), 263-311. """ import warnings from collections import defaultdict from math import factorial from nltk.translate import AlignedSent, Alignment, IBMModel, IBMModel4 from nltk.translate.ibm_model import Counts, longest_target_sentence_length class IBMModel5(IBMModel): """ Translation model that keeps track of vacant positions in the target sentence to decide where to place translated words >>> bitext = [] >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small'])) >>> bitext.append(AlignedSent(['das', 'haus', 'war', 'ja', 'groß'], ['the', 'house', 'was', 'big'])) >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small'])) >>> bitext.append(AlignedSent(['ein', 'haus', 'ist', 'klein'], ['a', 'house', 'is', 'small'])) >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house'])) >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book'])) >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book'])) >>> bitext.append(AlignedSent(['ich', 'fasse', 'das', 'buch', 'zusammen'], ['i', 'summarize', 'the', 'book'])) >>> bitext.append(AlignedSent(['fasse', 'zusammen'], ['summarize'])) >>> src_classes = {'the': 0, 'a': 0, 'small': 1, 'big': 1, 'house': 2, 'book': 2, 'is': 3, 'was': 3, 'i': 4, 'summarize': 5 } >>> trg_classes = {'das': 0, 'ein': 0, 'haus': 1, 'buch': 1, 'klein': 2, 'groß': 2, 'ist': 3, 'war': 3, 'ja': 4, 'ich': 5, 'fasse': 6, 'zusammen': 6 } >>> ibm5 = IBMModel5(bitext, 5, src_classes, trg_classes) >>> print(round(ibm5.head_vacancy_table[1][1][1], 3)) 1.0 >>> print(round(ibm5.head_vacancy_table[2][1][1], 3)) 0.0 >>> print(round(ibm5.non_head_vacancy_table[3][3][6], 3)) 1.0 >>> print(round(ibm5.fertility_table[2]['summarize'], 3)) 1.0 >>> print(round(ibm5.fertility_table[1]['book'], 3)) 1.0 >>> print(ibm5.p1) 0.033... 
>>> test_sentence = bitext[2] >>> test_sentence.words ['das', 'buch', 'ist', 'ja', 'klein'] >>> test_sentence.mots ['the', 'book', 'is', 'small'] >>> test_sentence.alignment Alignment([(0, 0), (1, 1), (2, 2), (3, None), (4, 3)]) """ MIN_SCORE_FACTOR = 0.2 """ Alignments with scores below this factor are pruned during sampling """ def __init__( self, sentence_aligned_corpus, iterations, source_word_classes, target_word_classes, probability_tables=None, ): """ Train on ``sentence_aligned_corpus`` and create a lexical translation model, vacancy models, a fertility model, and a model for generating NULL-aligned words. Translation direction is from ``AlignedSent.mots`` to ``AlignedSent.words``. :param sentence_aligned_corpus: Sentence-aligned parallel corpus :type sentence_aligned_corpus: list(AlignedSent) :param iterations: Number of iterations to run training algorithm :type iterations: int :param source_word_classes: Lookup table that maps a source word to its word class, the latter represented by an integer id :type source_word_classes: dict[str]: int :param target_word_classes: Lookup table that maps a target word to its word class, the latter represented by an integer id :type target_word_classes: dict[str]: int :param probability_tables: Optional. Use this to pass in custom probability values. If not specified, probabilities will be set to a uniform distribution, or some other sensible value. If specified, all the following entries must be present: ``translation_table``, ``alignment_table``, ``fertility_table``, ``p1``, ``head_distortion_table``, ``non_head_distortion_table``, ``head_vacancy_table``, ``non_head_vacancy_table``. See ``IBMModel``, ``IBMModel4``, and ``IBMModel5`` for the type and purpose of these tables. :type probability_tables: dict[str]: object """ super().__init__(sentence_aligned_corpus) self.reset_probabilities() self.src_classes = source_word_classes self.trg_classes = target_word_classes if probability_tables is None: # Get probabilities from IBM model 4 ibm4 = IBMModel4( sentence_aligned_corpus, iterations, source_word_classes, target_word_classes, ) self.translation_table = ibm4.translation_table self.alignment_table = ibm4.alignment_table self.fertility_table = ibm4.fertility_table self.p1 = ibm4.p1 self.head_distortion_table = ibm4.head_distortion_table self.non_head_distortion_table = ibm4.non_head_distortion_table self.set_uniform_probabilities(sentence_aligned_corpus) else: # Set user-defined probabilities self.translation_table = probability_tables["translation_table"] self.alignment_table = probability_tables["alignment_table"] self.fertility_table = probability_tables["fertility_table"] self.p1 = probability_tables["p1"] self.head_distortion_table = probability_tables["head_distortion_table"] self.non_head_distortion_table = probability_tables[ "non_head_distortion_table" ] self.head_vacancy_table = probability_tables["head_vacancy_table"] self.non_head_vacancy_table = probability_tables["non_head_vacancy_table"] for n in range(0, iterations): self.train(sentence_aligned_corpus) def reset_probabilities(self): super().reset_probabilities() self.head_vacancy_table = defaultdict( lambda: defaultdict(lambda: defaultdict(lambda: self.MIN_PROB)) ) """ dict[int][int][int]: float. Probability(vacancy difference | number of remaining valid positions,target word class). Values accessed as ``head_vacancy_table[dv][v_max][trg_class]``. 
""" self.non_head_vacancy_table = defaultdict( lambda: defaultdict(lambda: defaultdict(lambda: self.MIN_PROB)) ) """ dict[int][int][int]: float. Probability(vacancy difference | number of remaining valid positions,target word class). Values accessed as ``non_head_vacancy_table[dv][v_max][trg_class]``. """ def set_uniform_probabilities(self, sentence_aligned_corpus): """ Set vacancy probabilities uniformly to 1 / cardinality of vacancy difference values """ max_m = longest_target_sentence_length(sentence_aligned_corpus) # The maximum vacancy difference occurs when a word is placed in # the last available position m of the target sentence and the # previous word position has no vacancies. # The minimum is 1-max_v, when a word is placed in the first # available position and the previous word is placed beyond the # last available position. # Thus, the number of possible vacancy difference values is # (max_v) - (1-max_v) + 1 = 2 * max_v. if max_m > 0 and (1 / (2 * max_m)) < IBMModel.MIN_PROB: warnings.warn( "A target sentence is too long (" + str(max_m) + " words). Results may be less accurate." ) for max_v in range(1, max_m + 1): for dv in range(1, max_m + 1): initial_prob = 1 / (2 * max_v) self.head_vacancy_table[dv][max_v] = defaultdict(lambda: initial_prob) self.head_vacancy_table[-(dv - 1)][max_v] = defaultdict( lambda: initial_prob ) self.non_head_vacancy_table[dv][max_v] = defaultdict( lambda: initial_prob ) self.non_head_vacancy_table[-(dv - 1)][max_v] = defaultdict( lambda: initial_prob ) def train(self, parallel_corpus): counts = Model5Counts() for aligned_sentence in parallel_corpus: l = len(aligned_sentence.mots) m = len(aligned_sentence.words) # Sample the alignment space sampled_alignments, best_alignment = self.sample(aligned_sentence) # Record the most probable alignment aligned_sentence.alignment = Alignment( best_alignment.zero_indexed_alignment() ) # E step (a): Compute normalization factors to weigh counts total_count = self.prob_of_alignments(sampled_alignments) # E step (b): Collect counts for alignment_info in sampled_alignments: count = self.prob_t_a_given_s(alignment_info) normalized_count = count / total_count for j in range(1, m + 1): counts.update_lexical_translation( normalized_count, alignment_info, j ) slots = Slots(m) for i in range(1, l + 1): counts.update_vacancy( normalized_count, alignment_info, i, self.trg_classes, slots ) counts.update_null_generation(normalized_count, alignment_info) counts.update_fertility(normalized_count, alignment_info) # M step: Update probabilities with maximum likelihood estimates # If any probability is less than MIN_PROB, clamp it to MIN_PROB existing_alignment_table = self.alignment_table self.reset_probabilities() self.alignment_table = existing_alignment_table # don't retrain self.maximize_lexical_translation_probabilities(counts) self.maximize_vacancy_probabilities(counts) self.maximize_fertility_probabilities(counts) self.maximize_null_generation_probabilities(counts) def sample(self, sentence_pair): """ Sample the most probable alignments from the entire alignment space according to Model 4 Note that Model 4 scoring is used instead of Model 5 because the latter is too expensive to compute. First, determine the best alignment according to IBM Model 2. With this initial alignment, use hill climbing to determine the best alignment according to a IBM Model 4. Add this alignment and its neighbors to the sample set. Repeat this process with other initial alignments obtained by pegging an alignment point. 
Finally, prune alignments that have substantially lower Model 4 scores than the best alignment. :param sentence_pair: Source and target language sentence pair to generate a sample of alignments from :type sentence_pair: AlignedSent :return: A set of best alignments represented by their ``AlignmentInfo`` and the best alignment of the set for convenience :rtype: set(AlignmentInfo), AlignmentInfo """ sampled_alignments, best_alignment = super().sample(sentence_pair) return self.prune(sampled_alignments), best_alignment def prune(self, alignment_infos): """ Removes alignments from ``alignment_infos`` that have substantially lower Model 4 scores than the best alignment :return: Pruned alignments :rtype: set(AlignmentInfo) """ alignments = [] best_score = 0 for alignment_info in alignment_infos: score = IBMModel4.model4_prob_t_a_given_s(alignment_info, self) best_score = max(score, best_score) alignments.append((alignment_info, score)) threshold = IBMModel5.MIN_SCORE_FACTOR * best_score alignments = [a[0] for a in alignments if a[1] > threshold] return set(alignments) def hillclimb(self, alignment_info, j_pegged=None): """ Starting from the alignment in ``alignment_info``, look at neighboring alignments iteratively for the best one, according to Model 4 Note that Model 4 scoring is used instead of Model 5 because the latter is too expensive to compute. There is no guarantee that the best alignment in the alignment space will be found, because the algorithm might be stuck in a local maximum. :param j_pegged: If specified, the search will be constrained to alignments where ``j_pegged`` remains unchanged :type j_pegged: int :return: The best alignment found from hill climbing :rtype: AlignmentInfo """ alignment = alignment_info # alias with shorter name max_probability = IBMModel4.model4_prob_t_a_given_s(alignment, self) while True: old_alignment = alignment for neighbor_alignment in self.neighboring(alignment, j_pegged): neighbor_probability = IBMModel4.model4_prob_t_a_given_s( neighbor_alignment, self ) if neighbor_probability > max_probability: alignment = neighbor_alignment max_probability = neighbor_probability if alignment == old_alignment: # Until there are no better alignments break alignment.score = max_probability return alignment def prob_t_a_given_s(self, alignment_info): """ Probability of target sentence and an alignment given the source sentence """ probability = 1.0 MIN_PROB = IBMModel.MIN_PROB slots = Slots(len(alignment_info.trg_sentence) - 1) def null_generation_term(): # Binomial distribution: B(m - null_fertility, p1) value = 1.0 p1 = self.p1 p0 = 1 - p1 null_fertility = alignment_info.fertility_of_i(0) m = len(alignment_info.trg_sentence) - 1 value *= pow(p1, null_fertility) * pow(p0, m - 2 * null_fertility) if value < MIN_PROB: return MIN_PROB # Combination: (m - null_fertility) choose null_fertility for i in range(1, null_fertility + 1): value *= (m - null_fertility - i + 1) / i return value def fertility_term(): value = 1.0 src_sentence = alignment_info.src_sentence for i in range(1, len(src_sentence)): fertility = alignment_info.fertility_of_i(i) value *= ( factorial(fertility) * self.fertility_table[fertility][src_sentence[i]] ) if value < MIN_PROB: return MIN_PROB return value def lexical_translation_term(j): t = alignment_info.trg_sentence[j] i = alignment_info.alignment[j] s = alignment_info.src_sentence[i] return self.translation_table[t][s] def vacancy_term(i): value = 1.0 tablet = alignment_info.cepts[i] tablet_length = len(tablet) total_vacancies = 
slots.vacancies_at(len(slots)) # case 1: NULL-aligned words if tablet_length == 0: return value # case 2: head word j = tablet[0] previous_cept = alignment_info.previous_cept(j) previous_center = alignment_info.center_of_cept(previous_cept) dv = slots.vacancies_at(j) - slots.vacancies_at(previous_center) max_v = total_vacancies - tablet_length + 1 trg_class = self.trg_classes[alignment_info.trg_sentence[j]] value *= self.head_vacancy_table[dv][max_v][trg_class] slots.occupy(j) # mark position as occupied total_vacancies -= 1 if value < MIN_PROB: return MIN_PROB # case 3: non-head words for k in range(1, tablet_length): previous_position = tablet[k - 1] previous_vacancies = slots.vacancies_at(previous_position) j = tablet[k] dv = slots.vacancies_at(j) - previous_vacancies max_v = total_vacancies - tablet_length + k + 1 - previous_vacancies trg_class = self.trg_classes[alignment_info.trg_sentence[j]] value *= self.non_head_vacancy_table[dv][max_v][trg_class] slots.occupy(j) # mark position as occupied total_vacancies -= 1 if value < MIN_PROB: return MIN_PROB return value # end nested functions # Abort computation whenever probability falls below MIN_PROB at # any point, since MIN_PROB can be considered as zero probability *= null_generation_term() if probability < MIN_PROB: return MIN_PROB probability *= fertility_term() if probability < MIN_PROB: return MIN_PROB for j in range(1, len(alignment_info.trg_sentence)): probability *= lexical_translation_term(j) if probability < MIN_PROB: return MIN_PROB for i in range(1, len(alignment_info.src_sentence)): probability *= vacancy_term(i) if probability < MIN_PROB: return MIN_PROB return probability def maximize_vacancy_probabilities(self, counts): MIN_PROB = IBMModel.MIN_PROB head_vacancy_table = self.head_vacancy_table for dv, max_vs in counts.head_vacancy.items(): for max_v, trg_classes in max_vs.items(): for t_cls in trg_classes: estimate = ( counts.head_vacancy[dv][max_v][t_cls] / counts.head_vacancy_for_any_dv[max_v][t_cls] ) head_vacancy_table[dv][max_v][t_cls] = max(estimate, MIN_PROB) non_head_vacancy_table = self.non_head_vacancy_table for dv, max_vs in counts.non_head_vacancy.items(): for max_v, trg_classes in max_vs.items(): for t_cls in trg_classes: estimate = ( counts.non_head_vacancy[dv][max_v][t_cls] / counts.non_head_vacancy_for_any_dv[max_v][t_cls] ) non_head_vacancy_table[dv][max_v][t_cls] = max(estimate, MIN_PROB) class Model5Counts(Counts): """ Data object to store counts of various parameters during training. Includes counts for vacancies. """ def __init__(self): super().__init__() self.head_vacancy = defaultdict( lambda: defaultdict(lambda: defaultdict(lambda: 0.0)) ) self.head_vacancy_for_any_dv = defaultdict(lambda: defaultdict(lambda: 0.0)) self.non_head_vacancy = defaultdict( lambda: defaultdict(lambda: defaultdict(lambda: 0.0)) ) self.non_head_vacancy_for_any_dv = defaultdict(lambda: defaultdict(lambda: 0.0)) def update_vacancy(self, count, alignment_info, i, trg_classes, slots): """ :param count: Value to add to the vacancy counts :param alignment_info: Alignment under consideration :param i: Source word position under consideration :param trg_classes: Target word classes :param slots: Vacancy states of the slots in the target sentence. Output parameter that will be modified as new words are placed in the target sentence. 
""" tablet = alignment_info.cepts[i] tablet_length = len(tablet) total_vacancies = slots.vacancies_at(len(slots)) # case 1: NULL aligned words if tablet_length == 0: return # ignore zero fertility words # case 2: head word j = tablet[0] previous_cept = alignment_info.previous_cept(j) previous_center = alignment_info.center_of_cept(previous_cept) dv = slots.vacancies_at(j) - slots.vacancies_at(previous_center) max_v = total_vacancies - tablet_length + 1 trg_class = trg_classes[alignment_info.trg_sentence[j]] self.head_vacancy[dv][max_v][trg_class] += count self.head_vacancy_for_any_dv[max_v][trg_class] += count slots.occupy(j) # mark position as occupied total_vacancies -= 1 # case 3: non-head words for k in range(1, tablet_length): previous_position = tablet[k - 1] previous_vacancies = slots.vacancies_at(previous_position) j = tablet[k] dv = slots.vacancies_at(j) - previous_vacancies max_v = total_vacancies - tablet_length + k + 1 - previous_vacancies trg_class = trg_classes[alignment_info.trg_sentence[j]] self.non_head_vacancy[dv][max_v][trg_class] += count self.non_head_vacancy_for_any_dv[max_v][trg_class] += count slots.occupy(j) # mark position as occupied total_vacancies -= 1 class Slots: """ Represents positions in a target sentence. Used to keep track of which slot (position) is occupied. """ def __init__(self, target_sentence_length): self._slots = [False] * (target_sentence_length + 1) # 1-indexed def occupy(self, position): """ :return: Mark slot at ``position`` as occupied """ self._slots[position] = True def vacancies_at(self, position): """ :return: Number of vacant slots up to, and including, ``position`` """ vacancies = 0 for k in range(1, position + 1): if not self._slots[k]: vacancies += 1 return vacancies def __len__(self): return len(self._slots) - 1 # exclude dummy zeroeth element nltk-3.7/nltk/translate/ibm_model.py000066400000000000000000000467631420073152400176170ustar00rootroot00000000000000# Natural Language Toolkit: IBM Model Core # # Copyright (C) 2001-2022 NLTK Project # Author: Tah Wei Hoon # URL: # For license information, see LICENSE.TXT """ Common methods and classes for all IBM models. See ``IBMModel1``, ``IBMModel2``, ``IBMModel3``, ``IBMModel4``, and ``IBMModel5`` for specific implementations. The IBM models are a series of generative models that learn lexical translation probabilities, p(target language word|source language word), given a sentence-aligned parallel corpus. The models increase in sophistication from model 1 to 5. Typically, the output of lower models is used to seed the higher models. All models use the Expectation-Maximization (EM) algorithm to learn various probability tables. Words in a sentence are one-indexed. The first word of a sentence has position 1, not 0. Index 0 is reserved in the source sentence for the NULL token. The concept of position does not apply to NULL, but it is indexed at 0 by convention. Each target word is aligned to exactly one source word or the NULL token. References: Philipp Koehn. 2010. Statistical Machine Translation. Cambridge University Press, New York. Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and Robert L. Mercer. 1993. The Mathematics of Statistical Machine Translation: Parameter Estimation. Computational Linguistics, 19 (2), 263-311. 
""" from bisect import insort_left from collections import defaultdict from copy import deepcopy from math import ceil def longest_target_sentence_length(sentence_aligned_corpus): """ :param sentence_aligned_corpus: Parallel corpus under consideration :type sentence_aligned_corpus: list(AlignedSent) :return: Number of words in the longest target language sentence of ``sentence_aligned_corpus`` """ max_m = 0 for aligned_sentence in sentence_aligned_corpus: m = len(aligned_sentence.words) max_m = max(m, max_m) return max_m class IBMModel: """ Abstract base class for all IBM models """ # Avoid division by zero and precision errors by imposing a minimum # value for probabilities. Note that this approach is theoretically # incorrect, since it may create probabilities that sum to more # than 1. In practice, the contribution of probabilities with MIN_PROB # is tiny enough that the value of MIN_PROB can be treated as zero. MIN_PROB = 1.0e-12 # GIZA++ is more liberal and uses 1.0e-7 def __init__(self, sentence_aligned_corpus): self.init_vocab(sentence_aligned_corpus) self.reset_probabilities() def reset_probabilities(self): self.translation_table = defaultdict( lambda: defaultdict(lambda: IBMModel.MIN_PROB) ) """ dict[str][str]: float. Probability(target word | source word). Values accessed as ``translation_table[target_word][source_word]``. """ self.alignment_table = defaultdict( lambda: defaultdict( lambda: defaultdict(lambda: defaultdict(lambda: IBMModel.MIN_PROB)) ) ) """ dict[int][int][int][int]: float. Probability(i | j,l,m). Values accessed as ``alignment_table[i][j][l][m]``. Used in model 2 and hill climbing in models 3 and above """ self.fertility_table = defaultdict(lambda: defaultdict(lambda: self.MIN_PROB)) """ dict[int][str]: float. Probability(fertility | source word). Values accessed as ``fertility_table[fertility][source_word]``. Used in model 3 and higher. """ self.p1 = 0.5 """ Probability that a generated word requires another target word that is aligned to NULL. Used in model 3 and higher. """ def set_uniform_probabilities(self, sentence_aligned_corpus): """ Initialize probability tables to a uniform distribution Derived classes should implement this accordingly. """ pass def init_vocab(self, sentence_aligned_corpus): src_vocab = set() trg_vocab = set() for aligned_sentence in sentence_aligned_corpus: trg_vocab.update(aligned_sentence.words) src_vocab.update(aligned_sentence.mots) # Add the NULL token src_vocab.add(None) self.src_vocab = src_vocab """ set(str): All source language words used in training """ self.trg_vocab = trg_vocab """ set(str): All target language words used in training """ def sample(self, sentence_pair): """ Sample the most probable alignments from the entire alignment space First, determine the best alignment according to IBM Model 2. With this initial alignment, use hill climbing to determine the best alignment according to a higher IBM Model. Add this alignment and its neighbors to the sample set. Repeat this process with other initial alignments obtained by pegging an alignment point. Hill climbing may be stuck in a local maxima, hence the pegging and trying out of different alignments. 
:param sentence_pair: Source and target language sentence pair to generate a sample of alignments from :type sentence_pair: AlignedSent :return: A set of best alignments represented by their ``AlignmentInfo`` and the best alignment of the set for convenience :rtype: set(AlignmentInfo), AlignmentInfo """ sampled_alignments = set() l = len(sentence_pair.mots) m = len(sentence_pair.words) # Start from the best model 2 alignment initial_alignment = self.best_model2_alignment(sentence_pair) potential_alignment = self.hillclimb(initial_alignment) sampled_alignments.update(self.neighboring(potential_alignment)) best_alignment = potential_alignment # Start from other model 2 alignments, # with the constraint that j is aligned (pegged) to i for j in range(1, m + 1): for i in range(0, l + 1): initial_alignment = self.best_model2_alignment(sentence_pair, j, i) potential_alignment = self.hillclimb(initial_alignment, j) neighbors = self.neighboring(potential_alignment, j) sampled_alignments.update(neighbors) if potential_alignment.score > best_alignment.score: best_alignment = potential_alignment return sampled_alignments, best_alignment def best_model2_alignment(self, sentence_pair, j_pegged=None, i_pegged=0): """ Finds the best alignment according to IBM Model 2 Used as a starting point for hill climbing in Models 3 and above, because it is easier to compute than the best alignments in higher models :param sentence_pair: Source and target language sentence pair to be word-aligned :type sentence_pair: AlignedSent :param j_pegged: If specified, the alignment point of j_pegged will be fixed to i_pegged :type j_pegged: int :param i_pegged: Alignment point to j_pegged :type i_pegged: int """ src_sentence = [None] + sentence_pair.mots trg_sentence = ["UNUSED"] + sentence_pair.words # 1-indexed l = len(src_sentence) - 1 # exclude NULL m = len(trg_sentence) - 1 alignment = [0] * (m + 1) # init all alignments to NULL cepts = [[] for i in range(l + 1)] # init all cepts to empty list for j in range(1, m + 1): if j == j_pegged: # use the pegged alignment instead of searching for best one best_i = i_pegged else: best_i = 0 max_alignment_prob = IBMModel.MIN_PROB t = trg_sentence[j] for i in range(0, l + 1): s = src_sentence[i] alignment_prob = ( self.translation_table[t][s] * self.alignment_table[i][j][l][m] ) if alignment_prob >= max_alignment_prob: max_alignment_prob = alignment_prob best_i = i alignment[j] = best_i cepts[best_i].append(j) return AlignmentInfo( tuple(alignment), tuple(src_sentence), tuple(trg_sentence), cepts ) def hillclimb(self, alignment_info, j_pegged=None): """ Starting from the alignment in ``alignment_info``, look at neighboring alignments iteratively for the best one There is no guarantee that the best alignment in the alignment space will be found, because the algorithm might be stuck in a local maximum. 
:param j_pegged: If specified, the search will be constrained to alignments where ``j_pegged`` remains unchanged :type j_pegged: int :return: The best alignment found from hill climbing :rtype: AlignmentInfo """ alignment = alignment_info # alias with shorter name max_probability = self.prob_t_a_given_s(alignment) while True: old_alignment = alignment for neighbor_alignment in self.neighboring(alignment, j_pegged): neighbor_probability = self.prob_t_a_given_s(neighbor_alignment) if neighbor_probability > max_probability: alignment = neighbor_alignment max_probability = neighbor_probability if alignment == old_alignment: # Until there are no better alignments break alignment.score = max_probability return alignment def neighboring(self, alignment_info, j_pegged=None): """ Determine the neighbors of ``alignment_info``, obtained by moving or swapping one alignment point :param j_pegged: If specified, neighbors that have a different alignment point from j_pegged will not be considered :type j_pegged: int :return: A set neighboring alignments represented by their ``AlignmentInfo`` :rtype: set(AlignmentInfo) """ neighbors = set() l = len(alignment_info.src_sentence) - 1 # exclude NULL m = len(alignment_info.trg_sentence) - 1 original_alignment = alignment_info.alignment original_cepts = alignment_info.cepts for j in range(1, m + 1): if j != j_pegged: # Add alignments that differ by one alignment point for i in range(0, l + 1): new_alignment = list(original_alignment) new_cepts = deepcopy(original_cepts) old_i = original_alignment[j] # update alignment new_alignment[j] = i # update cepts insort_left(new_cepts[i], j) new_cepts[old_i].remove(j) new_alignment_info = AlignmentInfo( tuple(new_alignment), alignment_info.src_sentence, alignment_info.trg_sentence, new_cepts, ) neighbors.add(new_alignment_info) for j in range(1, m + 1): if j != j_pegged: # Add alignments that have two alignment points swapped for other_j in range(1, m + 1): if other_j != j_pegged and other_j != j: new_alignment = list(original_alignment) new_cepts = deepcopy(original_cepts) other_i = original_alignment[other_j] i = original_alignment[j] # update alignments new_alignment[j] = other_i new_alignment[other_j] = i # update cepts new_cepts[other_i].remove(other_j) insort_left(new_cepts[other_i], j) new_cepts[i].remove(j) insort_left(new_cepts[i], other_j) new_alignment_info = AlignmentInfo( tuple(new_alignment), alignment_info.src_sentence, alignment_info.trg_sentence, new_cepts, ) neighbors.add(new_alignment_info) return neighbors def maximize_lexical_translation_probabilities(self, counts): for t, src_words in counts.t_given_s.items(): for s in src_words: estimate = counts.t_given_s[t][s] / counts.any_t_given_s[s] self.translation_table[t][s] = max(estimate, IBMModel.MIN_PROB) def maximize_fertility_probabilities(self, counts): for phi, src_words in counts.fertility.items(): for s in src_words: estimate = counts.fertility[phi][s] / counts.fertility_for_any_phi[s] self.fertility_table[phi][s] = max(estimate, IBMModel.MIN_PROB) def maximize_null_generation_probabilities(self, counts): p1_estimate = counts.p1 / (counts.p1 + counts.p0) p1_estimate = max(p1_estimate, IBMModel.MIN_PROB) # Clip p1 if it is too large, because p0 = 1 - p1 should not be # smaller than MIN_PROB self.p1 = min(p1_estimate, 1 - IBMModel.MIN_PROB) def prob_of_alignments(self, alignments): probability = 0 for alignment_info in alignments: probability += self.prob_t_a_given_s(alignment_info) return probability def prob_t_a_given_s(self, alignment_info): """ 
Probability of target sentence and an alignment given the source sentence All required information is assumed to be in ``alignment_info`` and self. Derived classes should override this method """ return 0.0 class AlignmentInfo: """ Helper data object for training IBM Models 3 and up Read-only. For a source sentence and its counterpart in the target language, this class holds information about the sentence pair's alignment, cepts, and fertility. Warning: Alignments are one-indexed here, in contrast to nltk.translate.Alignment and AlignedSent, which are zero-indexed This class is not meant to be used outside of IBM models. """ def __init__(self, alignment, src_sentence, trg_sentence, cepts): if not isinstance(alignment, tuple): raise TypeError( "The alignment must be a tuple because it is used " "to uniquely identify AlignmentInfo objects." ) self.alignment = alignment """ tuple(int): Alignment function. ``alignment[j]`` is the position in the source sentence that is aligned to the position j in the target sentence. """ self.src_sentence = src_sentence """ tuple(str): Source sentence referred to by this object. Should include NULL token (None) in index 0. """ self.trg_sentence = trg_sentence """ tuple(str): Target sentence referred to by this object. Should have a dummy element in index 0 so that the first word starts from index 1. """ self.cepts = cepts """ list(list(int)): The positions of the target words, in ascending order, aligned to a source word position. For example, cepts[4] = (2, 3, 7) means that words in positions 2, 3 and 7 of the target sentence are aligned to the word in position 4 of the source sentence """ self.score = None """ float: Optional. Probability of alignment, as defined by the IBM model that assesses this alignment """ def fertility_of_i(self, i): """ Fertility of word in position ``i`` of the source sentence """ return len(self.cepts[i]) def is_head_word(self, j): """ :return: Whether the word in position ``j`` of the target sentence is a head word """ i = self.alignment[j] return self.cepts[i][0] == j def center_of_cept(self, i): """ :return: The ceiling of the average positions of the words in the tablet of cept ``i``, or 0 if ``i`` is None """ if i is None: return 0 average_position = sum(self.cepts[i]) / len(self.cepts[i]) return int(ceil(average_position)) def previous_cept(self, j): """ :return: The previous cept of ``j``, or None if ``j`` belongs to the first cept """ i = self.alignment[j] if i == 0: raise ValueError( "Words aligned to NULL cannot have a previous " "cept because NULL has no position" ) previous_cept = i - 1 while previous_cept > 0 and self.fertility_of_i(previous_cept) == 0: previous_cept -= 1 if previous_cept <= 0: previous_cept = None return previous_cept def previous_in_tablet(self, j): """ :return: The position of the previous word that is in the same tablet as ``j``, or None if ``j`` is the first word of the tablet """ i = self.alignment[j] tablet_position = self.cepts[i].index(j) if tablet_position == 0: return None return self.cepts[i][tablet_position - 1] def zero_indexed_alignment(self): """ :return: Zero-indexed alignment, suitable for use in external ``nltk.translate`` modules like ``nltk.translate.Alignment`` :rtype: list(tuple) """ zero_indexed_alignment = [] for j in range(1, len(self.trg_sentence)): i = self.alignment[j] - 1 if i < 0: i = None # alignment to NULL token zero_indexed_alignment.append((j - 1, i)) return zero_indexed_alignment def __eq__(self, other): return self.alignment == other.alignment def __ne__(self, 
other): return not self == other def __hash__(self): return hash(self.alignment) class Counts: """ Data object to store counts of various parameters during training """ def __init__(self): self.t_given_s = defaultdict(lambda: defaultdict(lambda: 0.0)) self.any_t_given_s = defaultdict(lambda: 0.0) self.p0 = 0.0 self.p1 = 0.0 self.fertility = defaultdict(lambda: defaultdict(lambda: 0.0)) self.fertility_for_any_phi = defaultdict(lambda: 0.0) def update_lexical_translation(self, count, alignment_info, j): i = alignment_info.alignment[j] t = alignment_info.trg_sentence[j] s = alignment_info.src_sentence[i] self.t_given_s[t][s] += count self.any_t_given_s[s] += count def update_null_generation(self, count, alignment_info): m = len(alignment_info.trg_sentence) - 1 fertility_of_null = alignment_info.fertility_of_i(0) self.p1 += fertility_of_null * count self.p0 += (m - 2 * fertility_of_null) * count def update_fertility(self, count, alignment_info): for i in range(0, len(alignment_info.src_sentence)): s = alignment_info.src_sentence[i] phi = alignment_info.fertility_of_i(i) self.fertility[phi][s] += count self.fertility_for_any_phi[s] += count nltk-3.7/nltk/translate/meteor_score.py000066400000000000000000000407671420073152400203540ustar00rootroot00000000000000# Natural Language Toolkit: Machine Translation # # Copyright (C) 2001-2022 NLTK Project # Author: Uday Krishna # Contributor: Tom Aarsen # URL: # For license information, see LICENSE.TXT from itertools import chain, product from typing import Callable, Iterable, List, Tuple from nltk.corpus import WordNetCorpusReader, wordnet from nltk.stem.api import StemmerI from nltk.stem.porter import PorterStemmer def _generate_enums( hypothesis: Iterable[str], reference: Iterable[str], preprocess: Callable[[str], str] = str.lower, ) -> Tuple[List[Tuple[int, str]], List[Tuple[int, str]]]: """ Takes in pre-tokenized inputs for hypothesis and reference and returns enumerated word lists for each of them :param hypothesis: pre-tokenized hypothesis :param reference: pre-tokenized reference :preprocess: preprocessing method (default str.lower) :return: enumerated words list """ if isinstance(hypothesis, str): raise TypeError( f'"hypothesis" expects pre-tokenized hypothesis (Iterable[str]): {hypothesis}' ) if isinstance(reference, str): raise TypeError( f'"reference" expects pre-tokenized reference (Iterable[str]): {reference}' ) enum_hypothesis_list = list(enumerate(map(preprocess, hypothesis))) enum_reference_list = list(enumerate(map(preprocess, reference))) return enum_hypothesis_list, enum_reference_list def exact_match( hypothesis: Iterable[str], reference: Iterable[str] ) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]: """ matches exact words in hypothesis and reference and returns a word mapping based on the enumerated word id between hypothesis and reference :param hypothesis: pre-tokenized hypothesis :param reference: pre-tokenized reference :return: enumerated matched tuples, enumerated unmatched hypothesis tuples, enumerated unmatched reference tuples """ enum_hypothesis_list, enum_reference_list = _generate_enums(hypothesis, reference) return _match_enums(enum_hypothesis_list, enum_reference_list) def _match_enums( enum_hypothesis_list: List[Tuple[int, str]], enum_reference_list: List[Tuple[int, str]], ) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]: """ matches exact words in hypothesis and reference and returns a word mapping between enum_hypothesis_list and enum_reference_list 
based on the enumerated word id. :param enum_hypothesis_list: enumerated hypothesis list :param enum_reference_list: enumerated reference list :return: enumerated matched tuples, enumerated unmatched hypothesis tuples, enumerated unmatched reference tuples """ word_match = [] for i in range(len(enum_hypothesis_list))[::-1]: for j in range(len(enum_reference_list))[::-1]: if enum_hypothesis_list[i][1] == enum_reference_list[j][1]: word_match.append( (enum_hypothesis_list[i][0], enum_reference_list[j][0]) ) enum_hypothesis_list.pop(i) enum_reference_list.pop(j) break return word_match, enum_hypothesis_list, enum_reference_list def _enum_stem_match( enum_hypothesis_list: List[Tuple[int, str]], enum_reference_list: List[Tuple[int, str]], stemmer: StemmerI = PorterStemmer(), ) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]: """ Stems each word and matches them in hypothesis and reference and returns a word mapping between enum_hypothesis_list and enum_reference_list based on the enumerated word id. The function also returns a enumerated list of unmatched words for hypothesis and reference. :param enum_hypothesis_list: enumerated hypothesis list :param enum_reference_list: enumerated reference list :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer()) :return: enumerated matched tuples, enumerated unmatched hypothesis tuples, enumerated unmatched reference tuples """ stemmed_enum_hypothesis_list = [ (word_pair[0], stemmer.stem(word_pair[1])) for word_pair in enum_hypothesis_list ] stemmed_enum_reference_list = [ (word_pair[0], stemmer.stem(word_pair[1])) for word_pair in enum_reference_list ] return _match_enums(stemmed_enum_hypothesis_list, stemmed_enum_reference_list) def stem_match( hypothesis: Iterable[str], reference: Iterable[str], stemmer: StemmerI = PorterStemmer(), ) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]: """ Stems each word and matches them in hypothesis and reference and returns a word mapping between hypothesis and reference :param hypothesis: pre-tokenized hypothesis :param reference: pre-tokenized reference :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer()) :return: enumerated matched tuples, enumerated unmatched hypothesis tuples, enumerated unmatched reference tuples """ enum_hypothesis_list, enum_reference_list = _generate_enums(hypothesis, reference) return _enum_stem_match(enum_hypothesis_list, enum_reference_list, stemmer=stemmer) def _enum_wordnetsyn_match( enum_hypothesis_list: List[Tuple[int, str]], enum_reference_list: List[Tuple[int, str]], wordnet: WordNetCorpusReader = wordnet, ) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]: """ Matches each word in reference to a word in hypothesis if any synonym of a hypothesis word is the exact match to the reference word. 
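As a rough sketch of how the synonym set for one hypothesis word is built
(this mirrors the loop body below; it requires the WordNet corpus to be
downloaded, and the example word is chosen arbitrarily)::

    from itertools import chain

    from nltk.corpus import wordnet

    word = 'dog'
    synonyms = set(
        chain.from_iterable(
            (lemma.name() for lemma in synset.lemmas()
             if lemma.name().find('_') < 0)
            for synset in wordnet.synsets(word)
        )
    ).union({word})
    # A reference word is matched if it appears in ``synonyms``.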
:param enum_hypothesis_list: enumerated hypothesis list :param enum_reference_list: enumerated reference list :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet) """ word_match = [] for i in range(len(enum_hypothesis_list))[::-1]: hypothesis_syns = set( chain.from_iterable( ( lemma.name() for lemma in synset.lemmas() if lemma.name().find("_") < 0 ) for synset in wordnet.synsets(enum_hypothesis_list[i][1]) ) ).union({enum_hypothesis_list[i][1]}) for j in range(len(enum_reference_list))[::-1]: if enum_reference_list[j][1] in hypothesis_syns: word_match.append( (enum_hypothesis_list[i][0], enum_reference_list[j][0]) ) enum_hypothesis_list.pop(i) enum_reference_list.pop(j) break return word_match, enum_hypothesis_list, enum_reference_list def wordnetsyn_match( hypothesis: Iterable[str], reference: Iterable[str], wordnet: WordNetCorpusReader = wordnet, ) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]: """ Matches each word in reference to a word in hypothesis if any synonym of a hypothesis word is the exact match to the reference word. :param hypothesis: pre-tokenized hypothesis :param reference: pre-tokenized reference :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet) :return: list of mapped tuples """ enum_hypothesis_list, enum_reference_list = _generate_enums(hypothesis, reference) return _enum_wordnetsyn_match( enum_hypothesis_list, enum_reference_list, wordnet=wordnet ) def _enum_align_words( enum_hypothesis_list: List[Tuple[int, str]], enum_reference_list: List[Tuple[int, str]], stemmer: StemmerI = PorterStemmer(), wordnet: WordNetCorpusReader = wordnet, ) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]: """ Aligns/matches words in the hypothesis to reference by sequentially applying exact match, stemmed match and wordnet based synonym match. in case there are multiple matches the match which has the least number of crossing is chosen. Takes enumerated list as input instead of string input :param enum_hypothesis_list: enumerated hypothesis list :param enum_reference_list: enumerated reference list :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer()) :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet) :return: sorted list of matched tuples, unmatched hypothesis list, unmatched reference list """ exact_matches, enum_hypothesis_list, enum_reference_list = _match_enums( enum_hypothesis_list, enum_reference_list ) stem_matches, enum_hypothesis_list, enum_reference_list = _enum_stem_match( enum_hypothesis_list, enum_reference_list, stemmer=stemmer ) wns_matches, enum_hypothesis_list, enum_reference_list = _enum_wordnetsyn_match( enum_hypothesis_list, enum_reference_list, wordnet=wordnet ) return ( sorted( exact_matches + stem_matches + wns_matches, key=lambda wordpair: wordpair[0] ), enum_hypothesis_list, enum_reference_list, ) def align_words( hypothesis: Iterable[str], reference: Iterable[str], stemmer: StemmerI = PorterStemmer(), wordnet: WordNetCorpusReader = wordnet, ) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]: """ Aligns/matches words in the hypothesis to reference by sequentially applying exact match, stemmed match and wordnet based synonym match. In case there are multiple matches the match which has the least number of crossing is chosen. 
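A minimal usage sketch (the token lists are invented; with the default
Porter stemmer, 'running'/'run' and 'cats'/'cat' are expected to be paired
at the stemming stage)::

    from nltk.translate.meteor_score import align_words

    matches, unmatched_hyp, unmatched_ref = align_words(
        ['running', 'cat'], ['run', 'cats']
    )
    # matches should come back as [(0, 0), (1, 1)], with both unmatched
    # lists empty.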
:param hypothesis: pre-tokenized hypothesis :param reference: pre-tokenized reference :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer()) :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet) :return: sorted list of matched tuples, unmatched hypothesis list, unmatched reference list """ enum_hypothesis_list, enum_reference_list = _generate_enums(hypothesis, reference) return _enum_align_words( enum_hypothesis_list, enum_reference_list, stemmer=stemmer, wordnet=wordnet ) def _count_chunks(matches: List[Tuple[int, int]]) -> int: """ Counts the fewest possible number of chunks such that matched unigrams of each chunk are adjacent to each other. This is used to calculate the fragmentation part of the metric. :param matches: list containing a mapping of matched words (output of align_words) :return: Number of chunks a sentence is divided into post alignment """ i = 0 chunks = 1 while i < len(matches) - 1: if (matches[i + 1][0] == matches[i][0] + 1) and ( matches[i + 1][1] == matches[i][1] + 1 ): i += 1 continue i += 1 chunks += 1 return chunks def single_meteor_score( reference: Iterable[str], hypothesis: Iterable[str], preprocess: Callable[[str], str] = str.lower, stemmer: StemmerI = PorterStemmer(), wordnet: WordNetCorpusReader = wordnet, alpha: float = 0.9, beta: float = 3.0, gamma: float = 0.5, ) -> float: """ Calculates METEOR score for single hypothesis and reference as per "Meteor: An Automatic Metric for MT Evaluation with HighLevels of Correlation with Human Judgments" by Alon Lavie and Abhaya Agarwal, in Proceedings of ACL. https://www.cs.cmu.edu/~alavie/METEOR/pdf/Lavie-Agarwal-2007-METEOR.pdf >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures', 'that', 'the', 'military', 'always', 'obeys', 'the', 'commands', 'of', 'the', 'party'] >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures', 'that', 'the', 'military', 'will', 'forever', 'heed', 'Party', 'commands'] >>> round(single_meteor_score(reference1, hypothesis1),4) 0.7398 If there is no words match during the alignment the method returns the score as 0. We can safely return a zero instead of raising a division by zero error as no match usually implies a bad translation. >>> round(meteor_score(['this', 'is', 'a', 'cat'], ['non', 'matching', 'hypothesis']),4) 0.0 :param reference: pre-tokenized reference :param hypothesis: pre-tokenized hypothesis :param preprocess: preprocessing function (default str.lower) :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer()) :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet) :param alpha: parameter for controlling relative weights of precision and recall. :param beta: parameter for controlling shape of penalty as a function of as a function of fragmentation. :param gamma: relative weight assigned to fragmentation penalty. :return: The sentence-level METEOR score. 
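The score is assembled from the aligned words as in the function body below;
a minimal sketch of that arithmetic with invented counts (10 matched
unigrams, a 12-word hypothesis, an 11-word reference, and the matches
falling into 3 chunks)::

    alpha, beta, gamma = 0.9, 3.0, 0.5
    matches, hyp_len, ref_len, chunks = 10, 12, 11, 3

    precision = matches / hyp_len
    recall = matches / ref_len
    fmean = (precision * recall) / (alpha * precision + (1 - alpha) * recall)
    penalty = gamma * (chunks / matches) ** beta
    score = (1 - penalty) * fmean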
""" enum_hypothesis, enum_reference = _generate_enums( hypothesis, reference, preprocess=preprocess ) translation_length = len(enum_hypothesis) reference_length = len(enum_reference) matches, _, _ = _enum_align_words( enum_hypothesis, enum_reference, stemmer=stemmer, wordnet=wordnet ) matches_count = len(matches) try: precision = float(matches_count) / translation_length recall = float(matches_count) / reference_length fmean = (precision * recall) / (alpha * precision + (1 - alpha) * recall) chunk_count = float(_count_chunks(matches)) frag_frac = chunk_count / matches_count except ZeroDivisionError: return 0.0 penalty = gamma * frag_frac ** beta return (1 - penalty) * fmean def meteor_score( references: Iterable[Iterable[str]], hypothesis: Iterable[str], preprocess: Callable[[str], str] = str.lower, stemmer: StemmerI = PorterStemmer(), wordnet: WordNetCorpusReader = wordnet, alpha: float = 0.9, beta: float = 3.0, gamma: float = 0.5, ) -> float: """ Calculates METEOR score for hypothesis with multiple references as described in "Meteor: An Automatic Metric for MT Evaluation with HighLevels of Correlation with Human Judgments" by Alon Lavie and Abhaya Agarwal, in Proceedings of ACL. https://www.cs.cmu.edu/~alavie/METEOR/pdf/Lavie-Agarwal-2007-METEOR.pdf In case of multiple references the best score is chosen. This method iterates over single_meteor_score and picks the best pair among all the references for a given hypothesis >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures', 'that', 'the', 'military', 'always', 'obeys', 'the', 'commands', 'of', 'the', 'party'] >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops', 'forever', 'hearing', 'the', 'activity', 'guidebook', 'that', 'party', 'direct'] >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures', 'that', 'the', 'military', 'will', 'forever', 'heed', 'Party', 'commands'] >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which', 'guarantees', 'the', 'military', 'forces', 'always', 'being', 'under', 'the', 'command', 'of', 'the', 'Party'] >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', 'army', 'always', 'to', 'heed', 'the', 'directions', 'of', 'the', 'party'] >>> round(meteor_score([reference1, reference2, reference3], hypothesis1),4) 0.7398 If there is no words match during the alignment the method returns the score as 0. We can safely return a zero instead of raising a division by zero error as no match usually implies a bad translation. >>> round(meteor_score([['this', 'is', 'a', 'cat']], ['non', 'matching', 'hypothesis']),4) 0.0 :param references: pre-tokenized reference sentences :param hypothesis: a pre-tokenized hypothesis sentence :param preprocess: preprocessing function (default str.lower) :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer()) :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet) :param alpha: parameter for controlling relative weights of precision and recall. :param beta: parameter for controlling shape of penalty as a function of as a function of fragmentation. :param gamma: relative weight assigned to fragmentation penalty. :return: The sentence-level METEOR score. 
""" return max( single_meteor_score( reference, hypothesis, preprocess=preprocess, stemmer=stemmer, wordnet=wordnet, alpha=alpha, beta=beta, gamma=gamma, ) for reference in references ) nltk-3.7/nltk/translate/metrics.py000066400000000000000000000027001420073152400173150ustar00rootroot00000000000000# Natural Language Toolkit: Translation metrics # # Copyright (C) 2001-2022 NLTK Project # Author: Will Zhang # Guan Gui # Steven Bird # URL: # For license information, see LICENSE.TXT def alignment_error_rate(reference, hypothesis, possible=None): """ Return the Alignment Error Rate (AER) of an alignment with respect to a "gold standard" reference alignment. Return an error rate between 0.0 (perfect alignment) and 1.0 (no alignment). >>> from nltk.translate import Alignment >>> ref = Alignment([(0, 0), (1, 1), (2, 2)]) >>> test = Alignment([(0, 0), (1, 2), (2, 1)]) >>> alignment_error_rate(ref, test) # doctest: +ELLIPSIS 0.6666666666666667 :type reference: Alignment :param reference: A gold standard alignment (sure alignments) :type hypothesis: Alignment :param hypothesis: A hypothesis alignment (aka. candidate alignments) :type possible: Alignment or None :param possible: A gold standard reference of possible alignments (defaults to *reference* if None) :rtype: float or None """ if possible is None: possible = reference else: assert reference.issubset(possible) # sanity check return 1.0 - (len(hypothesis & reference) + len(hypothesis & possible)) / float( len(hypothesis) + len(reference) ) nltk-3.7/nltk/translate/nist_score.py000066400000000000000000000174211420073152400200250ustar00rootroot00000000000000# Natural Language Toolkit: NIST Score # # Copyright (C) 2001-2022 NLTK Project # Authors: # Contributors: # URL: # For license information, see LICENSE.TXT """NIST score implementation.""" import fractions import math from collections import Counter from nltk.util import ngrams def sentence_nist(references, hypothesis, n=5): """ Calculate NIST score from George Doddington. 2002. "Automatic evaluation of machine translation quality using n-gram co-occurrence statistics." Proceedings of HLT. Morgan Kaufmann Publishers Inc. https://dl.acm.org/citation.cfm?id=1289189.1289273 DARPA commissioned NIST to develop an MT evaluation facility based on the BLEU score. The official script used by NIST to compute BLEU and NIST score is mteval-14.pl. The main differences are: - BLEU uses geometric mean of the ngram overlaps, NIST uses arithmetic mean. - NIST has a different brevity penalty - NIST score from mteval-14.pl has a self-contained tokenizer Note: The mteval-14.pl includes a smoothing function for BLEU score that is NOT used in the NIST score computation. >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', ... 'ensures', 'that', 'the', 'military', 'always', ... 'obeys', 'the', 'commands', 'of', 'the', 'party'] >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops', ... 'forever', 'hearing', 'the', 'activity', 'guidebook', ... 'that', 'party', 'direct'] >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', ... 'ensures', 'that', 'the', 'military', 'will', 'forever', ... 'heed', 'Party', 'commands'] >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which', ... 'guarantees', 'the', 'military', 'forces', 'always', ... 'being', 'under', 'the', 'command', 'of', 'the', ... 'Party'] >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', ... 'army', 'always', 'to', 'heed', 'the', 'directions', ... 
'of', 'the', 'party'] >>> sentence_nist([reference1, reference2, reference3], hypothesis1) # doctest: +ELLIPSIS 3.3709... >>> sentence_nist([reference1, reference2, reference3], hypothesis2) # doctest: +ELLIPSIS 1.4619... :param references: reference sentences :type references: list(list(str)) :param hypothesis: a hypothesis sentence :type hypothesis: list(str) :param n: highest n-gram order :type n: int """ return corpus_nist([references], [hypothesis], n) def corpus_nist(list_of_references, hypotheses, n=5): """ Calculate a single corpus-level NIST score (aka. system-level BLEU) for all the hypotheses and their respective references. :param references: a corpus of lists of reference sentences, w.r.t. hypotheses :type references: list(list(list(str))) :param hypotheses: a list of hypothesis sentences :type hypotheses: list(list(str)) :param n: highest n-gram order :type n: int """ # Before proceeding to compute NIST, perform sanity checks. assert len(list_of_references) == len( hypotheses ), "The number of hypotheses and their reference(s) should be the same" # Collect the ngram coounts from the reference sentences. ngram_freq = Counter() total_reference_words = 0 for ( references ) in list_of_references: # For each source sent, there's a list of reference sents. for reference in references: # For each order of ngram, count the ngram occurrences. for i in range(1, n + 1): ngram_freq.update(ngrams(reference, i)) total_reference_words += len(reference) # Compute the information weights based on the reference sentences. # Eqn 2 in Doddington (2002): # Info(w_1 ... w_n) = log_2 [ (# of occurrences of w_1 ... w_n-1) / (# of occurrences of w_1 ... w_n) ] information_weights = {} for _ngram in ngram_freq: # w_1 ... w_n _mgram = _ngram[:-1] # w_1 ... w_n-1 # From https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/mteval-v13a.pl#L546 # it's computed as such: # denominator = ngram_freq[_mgram] if _mgram and _mgram in ngram_freq else denominator = total_reference_words # information_weights[_ngram] = -1 * math.log(ngram_freq[_ngram]/denominator) / math.log(2) # # Mathematically, it's equivalent to the our implementation: if _mgram and _mgram in ngram_freq: numerator = ngram_freq[_mgram] else: numerator = total_reference_words information_weights[_ngram] = math.log(numerator / ngram_freq[_ngram], 2) # Micro-average. nist_precision_numerator_per_ngram = Counter() nist_precision_denominator_per_ngram = Counter() l_ref, l_sys = 0, 0 # For each order of ngram. for i in range(1, n + 1): # Iterate through each hypothesis and their corresponding references. for references, hypothesis in zip(list_of_references, hypotheses): hyp_len = len(hypothesis) # Find reference with the best NIST score. nist_score_per_ref = [] for reference in references: _ref_len = len(reference) # Counter of ngrams in hypothesis. hyp_ngrams = ( Counter(ngrams(hypothesis, i)) if len(hypothesis) >= i else Counter() ) ref_ngrams = ( Counter(ngrams(reference, i)) if len(reference) >= i else Counter() ) ngram_overlaps = hyp_ngrams & ref_ngrams # Precision part of the score in Eqn 3 _numerator = sum( information_weights[_ngram] * count for _ngram, count in ngram_overlaps.items() ) _denominator = sum(hyp_ngrams.values()) _precision = 0 if _denominator == 0 else _numerator / _denominator nist_score_per_ref.append( (_precision, _numerator, _denominator, _ref_len) ) # Best reference. 
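            # Tuples compare lexicographically, so max() selects the
            # reference with the highest precision (ties broken by the
            # numerator, then the denominator, then the reference length).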
precision, numerator, denominator, ref_len = max(nist_score_per_ref) nist_precision_numerator_per_ngram[i] += numerator nist_precision_denominator_per_ngram[i] += denominator l_ref += ref_len l_sys += hyp_len # Final NIST micro-average mean aggregation. nist_precision = 0 for i in nist_precision_numerator_per_ngram: precision = ( nist_precision_numerator_per_ngram[i] / nist_precision_denominator_per_ngram[i] ) nist_precision += precision # Eqn 3 in Doddington(2002) return nist_precision * nist_length_penalty(l_ref, l_sys) def nist_length_penalty(ref_len, hyp_len): """ Calculates the NIST length penalty, from Eq. 3 in Doddington (2002) penalty = exp( beta * log( min( len(hyp)/len(ref) , 1.0 ))) where, `beta` is chosen to make the brevity penalty factor = 0.5 when the no. of words in the system output (hyp) is 2/3 of the average no. of words in the reference translation (ref) The NIST penalty is different from BLEU's such that it minimize the impact of the score of small variations in the length of a translation. See Fig. 4 in Doddington (2002) """ ratio = hyp_len / ref_len if 0 < ratio < 1: ratio_x, score_x = 1.5, 0.5 beta = math.log(score_x) / math.log(ratio_x) ** 2 return math.exp(beta * math.log(ratio) ** 2) else: # ratio <= 0 or ratio >= 1 return max(min(ratio, 1.0), 0.0) nltk-3.7/nltk/translate/phrase_based.py000066400000000000000000000167631420073152400203050ustar00rootroot00000000000000# Natural Language Toolkit: Phrase Extraction Algorithm # # Copyright (C) 2001-2022 NLTK Project # Authors: Liling Tan, Fredrik Hedman, Petra Barancikova # URL: # For license information, see LICENSE.TXT def extract( f_start, f_end, e_start, e_end, alignment, f_aligned, srctext, trgtext, srclen, trglen, max_phrase_length, ): """ This function checks for alignment point consistency and extracts phrases using the chunk of consistent phrases. A phrase pair (e, f ) is consistent with an alignment A if and only if: (i) No English words in the phrase pair are aligned to words outside it. ∀e i ∈ e, (e i , f j ) ∈ A ⇒ f j ∈ f (ii) No Foreign words in the phrase pair are aligned to words outside it. ∀f j ∈ f , (e i , f j ) ∈ A ⇒ e i ∈ e (iii) The phrase pair contains at least one alignment point. ∃e i ∈ e ̄ , f j ∈ f ̄ s.t. (e i , f j ) ∈ A :type f_start: int :param f_start: Starting index of the possible foreign language phrases :type f_end: int :param f_end: End index of the possible foreign language phrases :type e_start: int :param e_start: Starting index of the possible source language phrases :type e_end: int :param e_end: End index of the possible source language phrases :type srctext: list :param srctext: The source language tokens, a list of string. :type trgtext: list :param trgtext: The target language tokens, a list of string. :type srclen: int :param srclen: The number of tokens in the source language tokens. :type trglen: int :param trglen: The number of tokens in the target language tokens. """ if f_end < 0: # 0-based indexing. return {} # Check if alignment points are consistent. for e, f in alignment: if (f_start <= f <= f_end) and (e < e_start or e > e_end): return {} # Add phrase pairs (incl. additional unaligned f) phrases = set() fs = f_start while True: fe = min(f_end, f_start + max_phrase_length - 1) while True: # add phrase pair ([e_start, e_end], [fs, fe]) to set E # Need to +1 in range to include the end-point. src_phrase = " ".join(srctext[e_start : e_end + 1]) trg_phrase = " ".join(trgtext[fs : fe + 1]) # Include more data for later ordering. 
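            # The enclosing loops then grow this foreign span over
            # *unaligned* words only: ``fe`` is pushed right and ``fs``
            # pushed left until an aligned word or a sentence boundary is
            # reached, so every consistent extension of the minimal span
            # is also emitted.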
phrases.add(((e_start, e_end + 1), (fs, fe + 1), src_phrase, trg_phrase)) fe += 1 if fe in f_aligned or fe >= trglen: break fs -= 1 if fs in f_aligned or fs < 0: break return phrases def phrase_extraction(srctext, trgtext, alignment, max_phrase_length=0): """ Phrase extraction algorithm extracts all consistent phrase pairs from a word-aligned sentence pair. The idea is to loop over all possible source language (e) phrases and find the minimal foreign phrase (f) that matches each of them. Matching is done by identifying all alignment points for the source phrase and finding the shortest foreign phrase that includes all the foreign counterparts for the source words. In short, a phrase alignment has to (a) contain all alignment points for all covered words (b) contain at least one alignment point >>> srctext = "michael assumes that he will stay in the house" >>> trgtext = "michael geht davon aus , dass er im haus bleibt" >>> alignment = [(0,0), (1,1), (1,2), (1,3), (2,5), (3,6), (4,9), ... (5,9), (6,7), (7,7), (8,8)] >>> phrases = phrase_extraction(srctext, trgtext, alignment) >>> for i in sorted(phrases): ... print(i) ... ((0, 1), (0, 1), 'michael', 'michael') ((0, 2), (0, 4), 'michael assumes', 'michael geht davon aus') ((0, 2), (0, 5), 'michael assumes', 'michael geht davon aus ,') ((0, 3), (0, 6), 'michael assumes that', 'michael geht davon aus , dass') ((0, 4), (0, 7), 'michael assumes that he', 'michael geht davon aus , dass er') ((0, 9), (0, 10), 'michael assumes that he will stay in the house', 'michael geht davon aus , dass er im haus bleibt') ((1, 2), (1, 4), 'assumes', 'geht davon aus') ((1, 2), (1, 5), 'assumes', 'geht davon aus ,') ((1, 3), (1, 6), 'assumes that', 'geht davon aus , dass') ((1, 4), (1, 7), 'assumes that he', 'geht davon aus , dass er') ((1, 9), (1, 10), 'assumes that he will stay in the house', 'geht davon aus , dass er im haus bleibt') ((2, 3), (4, 6), 'that', ', dass') ((2, 3), (5, 6), 'that', 'dass') ((2, 4), (4, 7), 'that he', ', dass er') ((2, 4), (5, 7), 'that he', 'dass er') ((2, 9), (4, 10), 'that he will stay in the house', ', dass er im haus bleibt') ((2, 9), (5, 10), 'that he will stay in the house', 'dass er im haus bleibt') ((3, 4), (6, 7), 'he', 'er') ((3, 9), (6, 10), 'he will stay in the house', 'er im haus bleibt') ((4, 6), (9, 10), 'will stay', 'bleibt') ((4, 9), (7, 10), 'will stay in the house', 'im haus bleibt') ((6, 8), (7, 8), 'in the', 'im') ((6, 9), (7, 9), 'in the house', 'im haus') ((8, 9), (8, 9), 'house', 'haus') :type srctext: str :param srctext: The sentence string from the source language. :type trgtext: str :param trgtext: The sentence string from the target language. :type alignment: list(tuple) :param alignment: The word alignment outputs as list of tuples, where the first elements of tuples are the source words' indices and second elements are the target words' indices. This is also the output format of nltk.translate.ibm1 :rtype: list(tuple) :return: A list of tuples, each element in a list is a phrase and each phrase is a tuple made up of (i) its source location, (ii) its target location, (iii) the source phrase and (iii) the target phrase. The phrase list of tuples represents all the possible phrases extracted from the word alignments. :type max_phrase_length: int :param max_phrase_length: maximal phrase length, if 0 or not specified it is set to a length of the longer sentence (srctext or trgtext). 
""" srctext = srctext.split() # e trgtext = trgtext.split() # f srclen = len(srctext) # len(e) trglen = len(trgtext) # len(f) # Keeps an index of which source/target words that are aligned. f_aligned = [j for _, j in alignment] max_phrase_length = max_phrase_length or max(srclen, trglen) # set of phrase pairs BP bp = set() for e_start in range(srclen): max_idx = min(srclen, e_start + max_phrase_length) for e_end in range(e_start, max_idx): # // find the minimally matching foreign phrase # (f start , f end ) = ( length(f), 0 ) # f_start ∈ [0, len(f) - 1]; f_end ∈ [0, len(f) - 1] f_start, f_end = trglen - 1, -1 # 0-based indexing for e, f in alignment: if e_start <= e <= e_end: f_start = min(f, f_start) f_end = max(f, f_end) # add extract (f start , f end , e start , e end ) to set BP phrases = extract( f_start, f_end, e_start, e_end, alignment, f_aligned, srctext, trgtext, srclen, trglen, max_phrase_length, ) if phrases: bp.update(phrases) return bp nltk-3.7/nltk/translate/ribes_score.py000066400000000000000000000326051420073152400201550ustar00rootroot00000000000000# Natural Language Toolkit: RIBES Score # # Copyright (C) 2001-2022 NLTK Project # Contributors: Katsuhito Sudoh, Liling Tan, Kasramvd, J.F.Sebastian # Mark Byers, ekhumoro, P. Ortiz # URL: # For license information, see LICENSE.TXT """ RIBES score implementation """ import math from itertools import islice from nltk.util import choose, ngrams def sentence_ribes(references, hypothesis, alpha=0.25, beta=0.10): """ The RIBES (Rank-based Intuitive Bilingual Evaluation Score) from Hideki Isozaki, Tsutomu Hirao, Kevin Duh, Katsuhito Sudoh and Hajime Tsukada. 2010. "Automatic Evaluation of Translation Quality for Distant Language Pairs". In Proceedings of EMNLP. https://www.aclweb.org/anthology/D/D10/D10-1092.pdf The generic RIBES scores used in shared task, e.g. Workshop for Asian Translation (WAT) uses the following RIBES calculations: RIBES = kendall_tau * (alpha**p1) * (beta**bp) Please note that this re-implementation differs from the official RIBES implementation and though it emulates the results as describe in the original paper, there are further optimization implemented in the official RIBES script. Users are encouraged to use the official RIBES script instead of this implementation when evaluating your machine translation system. Refer to https://www.kecl.ntt.co.jp/icl/lirg/ribes/ for the official script. :param references: a list of reference sentences :type references: list(list(str)) :param hypothesis: a hypothesis sentence :type hypothesis: list(str) :param alpha: hyperparameter used as a prior for the unigram precision. :type alpha: float :param beta: hyperparameter used as a prior for the brevity penalty. :type beta: float :return: The best ribes score from one of the references. :rtype: float """ best_ribes = -1.0 # Calculates RIBES for each reference and returns the best score. for reference in references: # Collects the *worder* from the ranked correlation alignments. worder = word_rank_alignment(reference, hypothesis) nkt = kendall_tau(worder) # Calculates the brevity penalty bp = min(1.0, math.exp(1.0 - len(reference) / len(hypothesis))) # Calculates the unigram precision, *p1* p1 = len(worder) / len(hypothesis) _ribes = nkt * (p1 ** alpha) * (bp ** beta) if _ribes > best_ribes: # Keeps the best score. 
best_ribes = _ribes return best_ribes def corpus_ribes(list_of_references, hypotheses, alpha=0.25, beta=0.10): """ This function "calculates RIBES for a system output (hypothesis) with multiple references, and returns "best" score among multi-references and individual scores. The scores are corpus-wise, i.e., averaged by the number of sentences." (c.f. RIBES version 1.03.1 code). Different from BLEU's micro-average precision, RIBES calculates the macro-average precision by averaging the best RIBES score for each pair of hypothesis and its corresponding references >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', ... 'ensures', 'that', 'the', 'military', 'always', ... 'obeys', 'the', 'commands', 'of', 'the', 'party'] >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', ... 'ensures', 'that', 'the', 'military', 'will', 'forever', ... 'heed', 'Party', 'commands'] >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which', ... 'guarantees', 'the', 'military', 'forces', 'always', ... 'being', 'under', 'the', 'command', 'of', 'the', 'Party'] >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', ... 'army', 'always', 'to', 'heed', 'the', 'directions', ... 'of', 'the', 'party'] >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was', ... 'interested', 'in', 'world', 'history'] >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history', ... 'because', 'he', 'read', 'the', 'book'] >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]] >>> hypotheses = [hyp1, hyp2] >>> round(corpus_ribes(list_of_references, hypotheses),4) 0.3597 :param references: a corpus of lists of reference sentences, w.r.t. hypotheses :type references: list(list(list(str))) :param hypotheses: a list of hypothesis sentences :type hypotheses: list(list(str)) :param alpha: hyperparameter used as a prior for the unigram precision. :type alpha: float :param beta: hyperparameter used as a prior for the brevity penalty. :type beta: float :return: The best ribes score from one of the references. :rtype: float """ corpus_best_ribes = 0.0 # Iterate through each hypothesis and their corresponding references. for references, hypothesis in zip(list_of_references, hypotheses): corpus_best_ribes += sentence_ribes(references, hypothesis, alpha, beta) return corpus_best_ribes / len(hypotheses) def position_of_ngram(ngram, sentence): """ This function returns the position of the first instance of the ngram appearing in a sentence. Note that one could also use string as follows but the code is a little convoluted with type casting back and forth: char_pos = ' '.join(sent)[:' '.join(sent).index(' '.join(ngram))] word_pos = char_pos.count(' ') Another way to conceive this is: return next(i for i, ng in enumerate(ngrams(sentence, len(ngram))) if ng == ngram) :param ngram: The ngram that needs to be searched :type ngram: tuple :param sentence: The list of tokens to search from. :type sentence: list(str) """ # Iterates through the ngrams in sentence. for i, sublist in enumerate(ngrams(sentence, len(ngram))): # Returns the index of the word when ngram matches. if ngram == sublist: return i def word_rank_alignment(reference, hypothesis, character_based=False): """ This is the word rank alignment algorithm described in the paper to produce the *worder* list, i.e. a list of word indices of the hypothesis word orders w.r.t. the list of reference words. Below is (H0, R0) example from the Isozaki et al. 
2010 paper, note the examples are indexed from 1 but the results here are indexed from 0: >>> ref = str('he was interested in world history because he ' ... 'read the book').split() >>> hyp = str('he read the book because he was interested in world ' ... 'history').split() >>> word_rank_alignment(ref, hyp) [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5] The (H1, R1) example from the paper, note the 0th index: >>> ref = 'John hit Bob yesterday'.split() >>> hyp = 'Bob hit John yesterday'.split() >>> word_rank_alignment(ref, hyp) [2, 1, 0, 3] Here is the (H2, R2) example from the paper, note the 0th index here too: >>> ref = 'the boy read the book'.split() >>> hyp = 'the book was read by the boy'.split() >>> word_rank_alignment(ref, hyp) [3, 4, 2, 0, 1] :param reference: a reference sentence :type reference: list(str) :param hypothesis: a hypothesis sentence :type hypothesis: list(str) """ worder = [] hyp_len = len(hypothesis) # Stores a list of possible ngrams from the reference sentence. # This is used for matching context window later in the algorithm. ref_ngrams = [] hyp_ngrams = [] for n in range(1, len(reference) + 1): for ng in ngrams(reference, n): ref_ngrams.append(ng) for ng in ngrams(hypothesis, n): hyp_ngrams.append(ng) for i, h_word in enumerate(hypothesis): # If word is not in the reference, continue. if h_word not in reference: continue # If we can determine one-to-one word correspondence for unigrams that # only appear once in both the reference and hypothesis. elif hypothesis.count(h_word) == reference.count(h_word) == 1: worder.append(reference.index(h_word)) else: max_window_size = max(i, hyp_len - i + 1) for window in range(1, max_window_size): if i + window < hyp_len: # If searching the right context is possible. # Retrieve the right context window. right_context_ngram = tuple(islice(hypothesis, i, i + window + 1)) num_times_in_ref = ref_ngrams.count(right_context_ngram) num_times_in_hyp = hyp_ngrams.count(right_context_ngram) # If ngram appears only once in both ref and hyp. if num_times_in_ref == num_times_in_hyp == 1: # Find the position of ngram that matched the reference. pos = position_of_ngram(right_context_ngram, reference) worder.append(pos) # Add the positions of the ngram. break if window <= i: # If searching the left context is possible. # Retrieve the left context window. left_context_ngram = tuple(islice(hypothesis, i - window, i + 1)) num_times_in_ref = ref_ngrams.count(left_context_ngram) num_times_in_hyp = hyp_ngrams.count(left_context_ngram) if num_times_in_ref == num_times_in_hyp == 1: # Find the position of ngram that matched the reference. pos = position_of_ngram(left_context_ngram, reference) # Add the positions of the ngram. worder.append(pos + len(left_context_ngram) - 1) break return worder def find_increasing_sequences(worder): """ Given the *worder* list, this function groups monotonic +1 sequences. 
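    Only runs of length two or more are yielded; isolated indices (such as
    the ``6`` in the example below) are dropped.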
>>> worder = [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5] >>> list(find_increasing_sequences(worder)) [(7, 8, 9, 10), (0, 1, 2, 3, 4, 5)] :param worder: The worder list output from word_rank_alignment :param type: list(int) """ items = iter(worder) a, b = None, next(items, None) result = [b] while b is not None: a, b = b, next(items, None) if b is not None and a + 1 == b: result.append(b) else: if len(result) > 1: yield tuple(result) result = [b] def kendall_tau(worder, normalize=True): """ Calculates the Kendall's Tau correlation coefficient given the *worder* list of word alignments from word_rank_alignment(), using the formula: tau = 2 * num_increasing_pairs / num_possible_pairs -1 Note that the no. of increasing pairs can be discontinuous in the *worder* list and each each increasing sequence can be tabulated as choose(len(seq), 2) no. of increasing pairs, e.g. >>> worder = [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5] >>> number_possible_pairs = choose(len(worder), 2) >>> round(kendall_tau(worder, normalize=False),3) -0.236 >>> round(kendall_tau(worder),3) 0.382 :param worder: The worder list output from word_rank_alignment :type worder: list(int) :param normalize: Flag to indicate normalization to between 0.0 and 1.0. :type normalize: boolean :return: The Kendall's Tau correlation coefficient. :rtype: float """ worder_len = len(worder) # With worder_len < 2, `choose(worder_len, 2)` will be 0. # As we divide by this, it will give a ZeroDivisionError. # To avoid this, we can just return the lowest possible score. if worder_len < 2: tau = -1 else: # Extract the groups of increasing/monotonic sequences. increasing_sequences = find_increasing_sequences(worder) # Calculate no. of increasing_pairs in *worder* list. num_increasing_pairs = sum(choose(len(seq), 2) for seq in increasing_sequences) # Calculate no. of possible pairs. num_possible_pairs = choose(worder_len, 2) # Kendall's Tau computation. tau = 2 * num_increasing_pairs / num_possible_pairs - 1 if normalize: # If normalized, the tau output falls between 0.0 to 1.0 return (tau + 1) / 2 else: # Otherwise, the tau outputs falls between -1.0 to +1.0 return tau def spearman_rho(worder, normalize=True): """ Calculates the Spearman's Rho correlation coefficient given the *worder* list of word alignment from word_rank_alignment(), using the formula: rho = 1 - sum(d**2) / choose(len(worder)+1, 3) Given that d is the sum of difference between the *worder* list of indices and the original word indices from the reference sentence. Using the (H0,R0) and (H5, R5) example from the paper >>> worder = [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5] >>> round(spearman_rho(worder, normalize=False), 3) -0.591 >>> round(spearman_rho(worder), 3) 0.205 :param worder: The worder list output from word_rank_alignment :param type: list(int) """ worder_len = len(worder) sum_d_square = sum((wi - i) ** 2 for wi, i in zip(worder, range(worder_len))) rho = 1 - sum_d_square / choose(worder_len + 1, 3) if normalize: # If normalized, the rho output falls between 0.0 to 1.0 return (rho + 1) / 2 else: # Otherwise, the rho outputs falls between -1.0 to +1.0 return rho nltk-3.7/nltk/translate/stack_decoder.py000066400000000000000000000470411420073152400204500ustar00rootroot00000000000000# Natural Language Toolkit: Stack decoder # # Copyright (C) 2001-2022 NLTK Project # Author: Tah Wei Hoon # URL: # For license information, see LICENSE.TXT """ A decoder that uses stacks to implement phrase-based translation. 
In phrase-based translation, the source sentence is segmented into phrases of one or more words, and translations for those phrases are used to build the target sentence. Hypothesis data structures are used to keep track of the source words translated so far and the partial output. A hypothesis can be expanded by selecting an untranslated phrase, looking up its translation in a phrase table, and appending that translation to the partial output. Translation is complete when a hypothesis covers all source words. The search space is huge because the source sentence can be segmented in different ways, the source phrases can be selected in any order, and there could be multiple translations for the same source phrase in the phrase table. To make decoding tractable, stacks are used to limit the number of candidate hypotheses by doing histogram and/or threshold pruning. Hypotheses with the same number of words translated are placed in the same stack. In histogram pruning, each stack has a size limit, and the hypothesis with the lowest score is removed when the stack is full. In threshold pruning, hypotheses that score below a certain threshold of the best hypothesis in that stack are removed. Hypothesis scoring can include various factors such as phrase translation probability, language model probability, length of translation, cost of remaining words to be translated, and so on. References: Philipp Koehn. 2010. Statistical Machine Translation. Cambridge University Press, New York. """ import warnings from collections import defaultdict from math import log class StackDecoder: """ Phrase-based stack decoder for machine translation >>> from nltk.translate import PhraseTable >>> phrase_table = PhraseTable() >>> phrase_table.add(('niemand',), ('nobody',), log(0.8)) >>> phrase_table.add(('niemand',), ('no', 'one'), log(0.2)) >>> phrase_table.add(('erwartet',), ('expects',), log(0.8)) >>> phrase_table.add(('erwartet',), ('expecting',), log(0.2)) >>> phrase_table.add(('niemand', 'erwartet'), ('one', 'does', 'not', 'expect'), log(0.1)) >>> phrase_table.add(('die', 'spanische', 'inquisition'), ('the', 'spanish', 'inquisition'), log(0.8)) >>> phrase_table.add(('!',), ('!',), log(0.8)) >>> # nltk.model should be used here once it is implemented >>> from collections import defaultdict >>> language_prob = defaultdict(lambda: -999.0) >>> language_prob[('nobody',)] = log(0.5) >>> language_prob[('expects',)] = log(0.4) >>> language_prob[('the', 'spanish', 'inquisition')] = log(0.2) >>> language_prob[('!',)] = log(0.1) >>> language_model = type('',(object,),{'probability_change': lambda self, context, phrase: language_prob[phrase], 'probability': lambda self, phrase: language_prob[phrase]})() >>> stack_decoder = StackDecoder(phrase_table, language_model) >>> stack_decoder.translate(['niemand', 'erwartet', 'die', 'spanische', 'inquisition', '!']) ['nobody', 'expects', 'the', 'spanish', 'inquisition', '!'] """ def __init__(self, phrase_table, language_model): """ :param phrase_table: Table of translations for source language phrases and the log probabilities for those translations. :type phrase_table: PhraseTable :param language_model: Target language model. Must define a ``probability_change`` method that calculates the change in log probability of a sentence, if a given string is appended to it. This interface is experimental and will likely be replaced with nltk.model once it is implemented. 
:type language_model: object """ self.phrase_table = phrase_table self.language_model = language_model self.word_penalty = 0.0 """ float: Influences the translation length exponentially. If positive, shorter translations are preferred. If negative, longer translations are preferred. If zero, no penalty is applied. """ self.beam_threshold = 0.0 """ float: Hypotheses that score below this factor of the best hypothesis in a stack are dropped from consideration. Value between 0.0 and 1.0. """ self.stack_size = 100 """ int: Maximum number of hypotheses to consider in a stack. Higher values increase the likelihood of a good translation, but increases processing time. """ self.__distortion_factor = 0.5 self.__compute_log_distortion() @property def distortion_factor(self): """ float: Amount of reordering of source phrases. Lower values favour monotone translation, suitable when word order is similar for both source and target languages. Value between 0.0 and 1.0. Default 0.5. """ return self.__distortion_factor @distortion_factor.setter def distortion_factor(self, d): self.__distortion_factor = d self.__compute_log_distortion() def __compute_log_distortion(self): # cache log(distortion_factor) so we don't have to recompute it # when scoring hypotheses if self.__distortion_factor == 0.0: self.__log_distortion_factor = log(1e-9) # 1e-9 is almost zero else: self.__log_distortion_factor = log(self.__distortion_factor) def translate(self, src_sentence): """ :param src_sentence: Sentence to be translated :type src_sentence: list(str) :return: Translated sentence :rtype: list(str) """ sentence = tuple(src_sentence) # prevent accidental modification sentence_length = len(sentence) stacks = [ _Stack(self.stack_size, self.beam_threshold) for _ in range(0, sentence_length + 1) ] empty_hypothesis = _Hypothesis() stacks[0].push(empty_hypothesis) all_phrases = self.find_all_src_phrases(sentence) future_score_table = self.compute_future_scores(sentence) for stack in stacks: for hypothesis in stack: possible_expansions = StackDecoder.valid_phrases( all_phrases, hypothesis ) for src_phrase_span in possible_expansions: src_phrase = sentence[src_phrase_span[0] : src_phrase_span[1]] for translation_option in self.phrase_table.translations_for( src_phrase ): raw_score = self.expansion_score( hypothesis, translation_option, src_phrase_span ) new_hypothesis = _Hypothesis( raw_score=raw_score, src_phrase_span=src_phrase_span, trg_phrase=translation_option.trg_phrase, previous=hypothesis, ) new_hypothesis.future_score = self.future_score( new_hypothesis, future_score_table, sentence_length ) total_words = new_hypothesis.total_translated_words() stacks[total_words].push(new_hypothesis) if not stacks[sentence_length]: warnings.warn( "Unable to translate all words. " "The source sentence contains words not in " "the phrase table" ) # Instead of returning empty output, perhaps a partial # translation could be returned return [] best_hypothesis = stacks[sentence_length].best() return best_hypothesis.translation_so_far() def find_all_src_phrases(self, src_sentence): """ Finds all subsequences in src_sentence that have a phrase translation in the translation table :type src_sentence: tuple(str) :return: Subsequences that have a phrase translation, represented as a table of lists of end positions. For example, if result[2] is [5, 6, 9], then there are three phrases starting from position 2 in ``src_sentence``, ending at positions 5, 6, and 9 exclusive. The list of ending positions are in ascending order. 
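        These spans are later filtered by ``valid_phrases`` against the
        untranslated positions of a hypothesis to obtain its candidate
        expansions.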
:rtype: list(list(int)) """ sentence_length = len(src_sentence) phrase_indices = [[] for _ in src_sentence] for start in range(0, sentence_length): for end in range(start + 1, sentence_length + 1): potential_phrase = src_sentence[start:end] if potential_phrase in self.phrase_table: phrase_indices[start].append(end) return phrase_indices def compute_future_scores(self, src_sentence): """ Determines the approximate scores for translating every subsequence in ``src_sentence`` Future scores can be used a look-ahead to determine the difficulty of translating the remaining parts of a src_sentence. :type src_sentence: tuple(str) :return: Scores of subsequences referenced by their start and end positions. For example, result[2][5] is the score of the subsequence covering positions 2, 3, and 4. :rtype: dict(int: (dict(int): float)) """ scores = defaultdict(lambda: defaultdict(lambda: float("-inf"))) for seq_length in range(1, len(src_sentence) + 1): for start in range(0, len(src_sentence) - seq_length + 1): end = start + seq_length phrase = src_sentence[start:end] if phrase in self.phrase_table: score = self.phrase_table.translations_for(phrase)[ 0 ].log_prob # pick best (first) translation # Warning: API of language_model is subject to change score += self.language_model.probability(phrase) scores[start][end] = score # check if a better score can be obtained by combining # two child subsequences for mid in range(start + 1, end): combined_score = scores[start][mid] + scores[mid][end] if combined_score > scores[start][end]: scores[start][end] = combined_score return scores def future_score(self, hypothesis, future_score_table, sentence_length): """ Determines the approximate score for translating the untranslated words in ``hypothesis`` """ score = 0.0 for span in hypothesis.untranslated_spans(sentence_length): score += future_score_table[span[0]][span[1]] return score def expansion_score(self, hypothesis, translation_option, src_phrase_span): """ Calculate the score of expanding ``hypothesis`` with ``translation_option`` :param hypothesis: Hypothesis being expanded :type hypothesis: _Hypothesis :param translation_option: Information about the proposed expansion :type translation_option: PhraseTableEntry :param src_phrase_span: Word position span of the source phrase :type src_phrase_span: tuple(int, int) """ score = hypothesis.raw_score score += translation_option.log_prob # The API of language_model is subject to change; it could accept # a string, a list of words, and/or some other type score += self.language_model.probability_change( hypothesis, translation_option.trg_phrase ) score += self.distortion_score(hypothesis, src_phrase_span) score -= self.word_penalty * len(translation_option.trg_phrase) return score def distortion_score(self, hypothesis, next_src_phrase_span): if not hypothesis.src_phrase_span: return 0.0 next_src_phrase_start = next_src_phrase_span[0] prev_src_phrase_end = hypothesis.src_phrase_span[1] distortion_distance = next_src_phrase_start - prev_src_phrase_end return abs(distortion_distance) * self.__log_distortion_factor @staticmethod def valid_phrases(all_phrases_from, hypothesis): """ Extract phrases from ``all_phrases_from`` that contains words that have not been translated by ``hypothesis`` :param all_phrases_from: Phrases represented by their spans, in the same format as the return value of ``find_all_src_phrases`` :type all_phrases_from: list(list(int)) :type hypothesis: _Hypothesis :return: A list of phrases, represented by their spans, that cover untranslated 
positions. :rtype: list(tuple(int, int)) """ untranslated_spans = hypothesis.untranslated_spans(len(all_phrases_from)) valid_phrases = [] for available_span in untranslated_spans: start = available_span[0] available_end = available_span[1] while start < available_end: for phrase_end in all_phrases_from[start]: if phrase_end > available_end: # Subsequent elements in all_phrases_from[start] # will also be > available_end, since the # elements are in ascending order break valid_phrases.append((start, phrase_end)) start += 1 return valid_phrases class _Hypothesis: """ Partial solution to a translation. Records the word positions of the phrase being translated, its translation, raw score, and the cost of the untranslated parts of the sentence. When the next phrase is selected to build upon the partial solution, a new _Hypothesis object is created, with a back pointer to the previous hypothesis. To find out which words have been translated so far, look at the ``src_phrase_span`` in the hypothesis chain. Similarly, the translation output can be found by traversing up the chain. """ def __init__( self, raw_score=0.0, src_phrase_span=(), trg_phrase=(), previous=None, future_score=0.0, ): """ :param raw_score: Likelihood of hypothesis so far. Higher is better. Does not account for untranslated words. :type raw_score: float :param src_phrase_span: Span of word positions covered by the source phrase in this hypothesis expansion. For example, (2, 5) means that the phrase is from the second word up to, but not including the fifth word in the source sentence. :type src_phrase_span: tuple(int) :param trg_phrase: Translation of the source phrase in this hypothesis expansion :type trg_phrase: tuple(str) :param previous: Previous hypothesis before expansion to this one :type previous: _Hypothesis :param future_score: Approximate score for translating the remaining words not covered by this hypothesis. Higher means that the remaining words are easier to translate. :type future_score: float """ self.raw_score = raw_score self.src_phrase_span = src_phrase_span self.trg_phrase = trg_phrase self.previous = previous self.future_score = future_score def score(self): """ Overall score of hypothesis after accounting for local and global features """ return self.raw_score + self.future_score def untranslated_spans(self, sentence_length): """ Starting from each untranslated word, find the longest continuous span of untranslated positions :param sentence_length: Length of source sentence being translated by the hypothesis :type sentence_length: int :rtype: list(tuple(int, int)) """ translated_positions = self.translated_positions() translated_positions.sort() translated_positions.append(sentence_length) # add sentinel position untranslated_spans = [] start = 0 # each untranslated span must end in one of the translated_positions for end in translated_positions: if start < end: untranslated_spans.append((start, end)) start = end + 1 return untranslated_spans def translated_positions(self): """ List of positions in the source sentence of words already translated. The list is not sorted. 
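        Positions are gathered by walking the ``previous`` chain and
        expanding each hypothesis's ``src_phrase_span``; e.g. if spans
        (0, 2) and then (3, 5) were expanded, the result is
        [3, 4, 0, 1] (most recent expansion first).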
:rtype: list(int) """ translated_positions = [] current_hypothesis = self while current_hypothesis.previous is not None: translated_span = current_hypothesis.src_phrase_span translated_positions.extend(range(translated_span[0], translated_span[1])) current_hypothesis = current_hypothesis.previous return translated_positions def total_translated_words(self): return len(self.translated_positions()) def translation_so_far(self): translation = [] self.__build_translation(self, translation) return translation def __build_translation(self, hypothesis, output): if hypothesis.previous is None: return self.__build_translation(hypothesis.previous, output) output.extend(hypothesis.trg_phrase) class _Stack: """ Collection of _Hypothesis objects """ def __init__(self, max_size=100, beam_threshold=0.0): """ :param beam_threshold: Hypotheses that score less than this factor of the best hypothesis are discarded from the stack. Value must be between 0.0 and 1.0. :type beam_threshold: float """ self.max_size = max_size self.items = [] if beam_threshold == 0.0: self.__log_beam_threshold = float("-inf") else: self.__log_beam_threshold = log(beam_threshold) def push(self, hypothesis): """ Add ``hypothesis`` to the stack. Removes lowest scoring hypothesis if the stack is full. After insertion, hypotheses that score less than ``beam_threshold`` times the score of the best hypothesis are removed. """ self.items.append(hypothesis) self.items.sort(key=lambda h: h.score(), reverse=True) while len(self.items) > self.max_size: self.items.pop() self.threshold_prune() def threshold_prune(self): if not self.items: return # log(score * beam_threshold) = log(score) + log(beam_threshold) threshold = self.items[0].score() + self.__log_beam_threshold for hypothesis in reversed(self.items): if hypothesis.score() < threshold: self.items.pop() else: break def best(self): """ :return: Hypothesis with the highest score in the stack :rtype: _Hypothesis """ if self.items: return self.items[0] return None def __iter__(self): return iter(self.items) def __contains__(self, hypothesis): return hypothesis in self.items def __bool__(self): return len(self.items) != 0 __nonzero__ = __bool__ nltk-3.7/nltk/tree/000077500000000000000000000000001420073152400142405ustar00rootroot00000000000000nltk-3.7/nltk/tree/__init__.py000066400000000000000000000020361420073152400163520ustar00rootroot00000000000000# Natural Language Toolkit: Machine Translation # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # Steven Bird # Peter Ljunglöf # Tom Aarsen <> # URL: # For license information, see LICENSE.TXT """ NLTK Tree Package This package may be used for representing hierarchical language structures, such as syntax trees and morphological trees. 
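A small usage sketch (``Tree.fromstring`` parses the bracketed notation
used throughout NLTK):

    >>> from nltk.tree import Tree
    >>> t = Tree.fromstring("(S (NP I) (VP (V saw) (NP him)))")
    >>> t.label()
    'S'
    >>> t.leaves()
    ['I', 'saw', 'him']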
""" # TODO: add LabelledTree (can be used for dependency trees) from nltk.tree.immutable import ( ImmutableMultiParentedTree, ImmutableParentedTree, ImmutableProbabilisticTree, ImmutableTree, ) from nltk.tree.parented import MultiParentedTree, ParentedTree from nltk.tree.parsing import bracket_parse, sinica_parse from nltk.tree.prettyprinter import TreePrettyPrinter from nltk.tree.probabilistic import ProbabilisticTree from nltk.tree.transforms import ( chomsky_normal_form, collapse_unary, un_chomsky_normal_form, ) from nltk.tree.tree import Tree nltk-3.7/nltk/tree/immutable.py000066400000000000000000000077261420073152400166050ustar00rootroot00000000000000# Natural Language Toolkit: Text Trees # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # Steven Bird # Peter Ljunglöf # Tom Aarsen <> # URL: # For license information, see LICENSE.TXT from nltk.probability import ProbabilisticMixIn from nltk.tree.parented import MultiParentedTree, ParentedTree from nltk.tree.tree import Tree class ImmutableTree(Tree): def __init__(self, node, children=None): super().__init__(node, children) # Precompute our hash value. This ensures that we're really # immutable. It also means we only have to calculate it once. try: self._hash = hash((self._label, tuple(self))) except (TypeError, ValueError) as e: raise ValueError( "%s: node value and children " "must be immutable" % type(self).__name__ ) from e def __setitem__(self, index, value): raise ValueError("%s may not be modified" % type(self).__name__) def __setslice__(self, i, j, value): raise ValueError("%s may not be modified" % type(self).__name__) def __delitem__(self, index): raise ValueError("%s may not be modified" % type(self).__name__) def __delslice__(self, i, j): raise ValueError("%s may not be modified" % type(self).__name__) def __iadd__(self, other): raise ValueError("%s may not be modified" % type(self).__name__) def __imul__(self, other): raise ValueError("%s may not be modified" % type(self).__name__) def append(self, v): raise ValueError("%s may not be modified" % type(self).__name__) def extend(self, v): raise ValueError("%s may not be modified" % type(self).__name__) def pop(self, v=None): raise ValueError("%s may not be modified" % type(self).__name__) def remove(self, v): raise ValueError("%s may not be modified" % type(self).__name__) def reverse(self): raise ValueError("%s may not be modified" % type(self).__name__) def sort(self): raise ValueError("%s may not be modified" % type(self).__name__) def __hash__(self): return self._hash def set_label(self, value): """ Set the node label. This will only succeed the first time the node label is set, which should occur in ImmutableTree.__init__(). 
""" if hasattr(self, "_label"): raise ValueError("%s may not be modified" % type(self).__name__) self._label = value class ImmutableProbabilisticTree(ImmutableTree, ProbabilisticMixIn): def __init__(self, node, children=None, **prob_kwargs): ImmutableTree.__init__(self, node, children) ProbabilisticMixIn.__init__(self, **prob_kwargs) self._hash = hash((self._label, tuple(self), self.prob())) # We have to patch up these methods to make them work right: def _frozen_class(self): return ImmutableProbabilisticTree def __repr__(self): return f"{Tree.__repr__(self)} [{self.prob()}]" def __str__(self): return f"{self.pformat(margin=60)} [{self.prob()}]" def copy(self, deep=False): if not deep: return type(self)(self._label, self, prob=self.prob()) else: return type(self).convert(self) @classmethod def convert(cls, val): if isinstance(val, Tree): children = [cls.convert(child) for child in val] if isinstance(val, ProbabilisticMixIn): return cls(val._label, children, prob=val.prob()) else: return cls(val._label, children, prob=1.0) else: return val class ImmutableParentedTree(ImmutableTree, ParentedTree): pass class ImmutableMultiParentedTree(ImmutableTree, MultiParentedTree): pass __all__ = [ "ImmutableProbabilisticTree", "ImmutableTree", "ImmutableParentedTree", "ImmutableMultiParentedTree", ] nltk-3.7/nltk/tree/parented.py000066400000000000000000000541121420073152400164170ustar00rootroot00000000000000# Natural Language Toolkit: Text Trees # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # Steven Bird # Peter Ljunglöf # Tom Aarsen <> # URL: # For license information, see LICENSE.TXT import warnings from abc import ABCMeta, abstractmethod from nltk.tree.tree import Tree from nltk.util import slice_bounds ###################################################################### ## Parented trees ###################################################################### class AbstractParentedTree(Tree, metaclass=ABCMeta): """ An abstract base class for a ``Tree`` that automatically maintains pointers to parent nodes. These parent pointers are updated whenever any change is made to a tree's structure. Two subclasses are currently defined: - ``ParentedTree`` is used for tree structures where each subtree has at most one parent. This class should be used in cases where there is no"sharing" of subtrees. - ``MultiParentedTree`` is used for tree structures where a subtree may have zero or more parents. This class should be used in cases where subtrees may be shared. Subclassing =========== The ``AbstractParentedTree`` class redefines all operations that modify a tree's structure to call two methods, which are used by subclasses to update parent information: - ``_setparent()`` is called whenever a new child is added. - ``_delparent()`` is called whenever a child is removed. """ def __init__(self, node, children=None): super().__init__(node, children) # If children is None, the tree is read from node, and # all parents will be set during parsing. if children is not None: # Otherwise we have to set the parent of the children. # Iterate over self, and *not* children, because children # might be an iterator. 
for i, child in enumerate(self): if isinstance(child, Tree): self._setparent(child, i, dry_run=True) for i, child in enumerate(self): if isinstance(child, Tree): self._setparent(child, i) # //////////////////////////////////////////////////////////// # Parent management # //////////////////////////////////////////////////////////// @abstractmethod def _setparent(self, child, index, dry_run=False): """ Update the parent pointer of ``child`` to point to ``self``. This method is only called if the type of ``child`` is ``Tree``; i.e., it is not called when adding a leaf to a tree. This method is always called before the child is actually added to the child list of ``self``. :type child: Tree :type index: int :param index: The index of ``child`` in ``self``. :raise TypeError: If ``child`` is a tree with an impropriate type. Typically, if ``child`` is a tree, then its type needs to match the type of ``self``. This prevents mixing of different tree types (single-parented, multi-parented, and non-parented). :param dry_run: If true, the don't actually set the child's parent pointer; just check for any error conditions, and raise an exception if one is found. """ @abstractmethod def _delparent(self, child, index): """ Update the parent pointer of ``child`` to not point to self. This method is only called if the type of ``child`` is ``Tree``; i.e., it is not called when removing a leaf from a tree. This method is always called before the child is actually removed from the child list of ``self``. :type child: Tree :type index: int :param index: The index of ``child`` in ``self``. """ # //////////////////////////////////////////////////////////// # Methods that add/remove children # //////////////////////////////////////////////////////////// # Every method that adds or removes a child must make # appropriate calls to _setparent() and _delparent(). def __delitem__(self, index): # del ptree[start:stop] if isinstance(index, slice): start, stop, step = slice_bounds(self, index, allow_step=True) # Clear all the children pointers. for i in range(start, stop, step): if isinstance(self[i], Tree): self._delparent(self[i], i) # Delete the children from our child list. super().__delitem__(index) # del ptree[i] elif isinstance(index, int): if index < 0: index += len(self) if index < 0: raise IndexError("index out of range") # Clear the child's parent pointer. if isinstance(self[index], Tree): self._delparent(self[index], index) # Remove the child from our child list. super().__delitem__(index) elif isinstance(index, (list, tuple)): # del ptree[()] if len(index) == 0: raise IndexError("The tree position () may not be deleted.") # del ptree[(i,)] elif len(index) == 1: del self[index[0]] # del ptree[i1, i2, i3] else: del self[index[0]][index[1:]] else: raise TypeError( "%s indices must be integers, not %s" % (type(self).__name__, type(index).__name__) ) def __setitem__(self, index, value): # ptree[start:stop] = value if isinstance(index, slice): start, stop, step = slice_bounds(self, index, allow_step=True) # make a copy of value, in case it's an iterator if not isinstance(value, (list, tuple)): value = list(value) # Check for any error conditions, so we can avoid ending # up in an inconsistent state if an error does occur. 
for i, child in enumerate(value): if isinstance(child, Tree): self._setparent(child, start + i * step, dry_run=True) # clear the child pointers of all parents we're removing for i in range(start, stop, step): if isinstance(self[i], Tree): self._delparent(self[i], i) # set the child pointers of the new children. We do this # after clearing *all* child pointers, in case we're e.g. # reversing the elements in a tree. for i, child in enumerate(value): if isinstance(child, Tree): self._setparent(child, start + i * step) # finally, update the content of the child list itself. super().__setitem__(index, value) # ptree[i] = value elif isinstance(index, int): if index < 0: index += len(self) if index < 0: raise IndexError("index out of range") # if the value is not changing, do nothing. if value is self[index]: return # Set the new child's parent pointer. if isinstance(value, Tree): self._setparent(value, index) # Remove the old child's parent pointer if isinstance(self[index], Tree): self._delparent(self[index], index) # Update our child list. super().__setitem__(index, value) elif isinstance(index, (list, tuple)): # ptree[()] = value if len(index) == 0: raise IndexError("The tree position () may not be assigned to.") # ptree[(i,)] = value elif len(index) == 1: self[index[0]] = value # ptree[i1, i2, i3] = value else: self[index[0]][index[1:]] = value else: raise TypeError( "%s indices must be integers, not %s" % (type(self).__name__, type(index).__name__) ) def append(self, child): if isinstance(child, Tree): self._setparent(child, len(self)) super().append(child) def extend(self, children): for child in children: if isinstance(child, Tree): self._setparent(child, len(self)) super().append(child) def insert(self, index, child): # Handle negative indexes. Note that if index < -len(self), # we do *not* raise an IndexError, unlike __getitem__. This # is done for consistency with list.__getitem__ and list.index. if index < 0: index += len(self) if index < 0: index = 0 # Set the child's parent, and update our child list. if isinstance(child, Tree): self._setparent(child, index) super().insert(index, child) def pop(self, index=-1): if index < 0: index += len(self) if index < 0: raise IndexError("index out of range") if isinstance(self[index], Tree): self._delparent(self[index], index) return super().pop(index) # n.b.: like `list`, this is done by equality, not identity! # To remove a specific child, use del ptree[i]. def remove(self, child): index = self.index(child) if isinstance(self[index], Tree): self._delparent(self[index], index) super().remove(child) # We need to implement __getslice__ and friends, even though # they're deprecated, because otherwise list.__getslice__ will get # called (since we're subclassing from list). Just delegate to # __getitem__ etc., but use max(0, start) and max(0, stop) because # because negative indices are already handled *before* # __getslice__ is called; and we don't want to double-count them. if hasattr(list, "__getslice__"): def __getslice__(self, start, stop): return self.__getitem__(slice(max(0, start), max(0, stop))) def __delslice__(self, start, stop): return self.__delitem__(slice(max(0, start), max(0, stop))) def __setslice__(self, start, stop, value): return self.__setitem__(slice(max(0, start), max(0, stop)), value) def __getnewargs__(self): """Method used by the pickle module when un-pickling. This method provides the arguments passed to ``__new__`` upon un-pickling. 
Without this method, ParentedTree instances cannot be pickled and unpickled in Python 3.7+ onwards. :return: Tuple of arguments for ``__new__``, i.e. the label and the children of this node. :rtype: Tuple[Any, List[AbstractParentedTree]] """ return (self._label, list(self)) class ParentedTree(AbstractParentedTree): """ A ``Tree`` that automatically maintains parent pointers for single-parented trees. The following are methods for querying the structure of a parented tree: ``parent``, ``parent_index``, ``left_sibling``, ``right_sibling``, ``root``, ``treeposition``. Each ``ParentedTree`` may have at most one parent. In particular, subtrees may not be shared. Any attempt to reuse a single ``ParentedTree`` as a child of more than one parent (or as multiple children of the same parent) will cause a ``ValueError`` exception to be raised. ``ParentedTrees`` should never be used in the same tree as ``Trees`` or ``MultiParentedTrees``. Mixing tree implementations may result in incorrect parent pointers and in ``TypeError`` exceptions. """ def __init__(self, node, children=None): self._parent = None """The parent of this Tree, or None if it has no parent.""" super().__init__(node, children) if children is None: # If children is None, the tree is read from node. # After parsing, the parent of the immediate children # will point to an intermediate tree, not self. # We fix this by brute force: for i, child in enumerate(self): if isinstance(child, Tree): child._parent = None self._setparent(child, i) def _frozen_class(self): from nltk.tree.immutable import ImmutableParentedTree return ImmutableParentedTree def copy(self, deep=False): if not deep: warnings.warn( f"{self.__class__.__name__} objects do not support shallow copies. Defaulting to a deep copy." ) return super().copy(deep=True) # ///////////////////////////////////////////////////////////////// # Methods # ///////////////////////////////////////////////////////////////// def parent(self): """The parent of this tree, or None if it has no parent.""" return self._parent def parent_index(self): """ The index of this tree in its parent. I.e., ``ptree.parent()[ptree.parent_index()] is ptree``. Note that ``ptree.parent_index()`` is not necessarily equal to ``ptree.parent.index(ptree)``, since the ``index()`` method returns the first child that is equal to its argument. """ if self._parent is None: return None for i, child in enumerate(self._parent): if child is self: return i assert False, "expected to find self in self._parent!" def left_sibling(self): """The left sibling of this tree, or None if it has none.""" parent_index = self.parent_index() if self._parent and parent_index > 0: return self._parent[parent_index - 1] return None # no left sibling def right_sibling(self): """The right sibling of this tree, or None if it has none.""" parent_index = self.parent_index() if self._parent and parent_index < (len(self._parent) - 1): return self._parent[parent_index + 1] return None # no right sibling def root(self): """ The root of this tree. I.e., the unique ancestor of this tree whose parent is None. If ``ptree.parent()`` is None, then ``ptree`` is its own root. """ root = self while root.parent() is not None: root = root.parent() return root def treeposition(self): """ The tree position of this tree, relative to the root of the tree. I.e., ``ptree.root[ptree.treeposition] is ptree``. 
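        (Since ``root`` and ``treeposition`` are methods, the invariant
        spelled out in full is ``ptree.root()[ptree.treeposition()] is
        ptree``.)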
""" if self.parent() is None: return () else: return self.parent().treeposition() + (self.parent_index(),) # ///////////////////////////////////////////////////////////////// # Parent Management # ///////////////////////////////////////////////////////////////// def _delparent(self, child, index): # Sanity checks assert isinstance(child, ParentedTree) assert self[index] is child assert child._parent is self # Delete child's parent pointer. child._parent = None def _setparent(self, child, index, dry_run=False): # If the child's type is incorrect, then complain. if not isinstance(child, ParentedTree): raise TypeError("Can not insert a non-ParentedTree into a ParentedTree") # If child already has a parent, then complain. if hasattr(child, "_parent") and child._parent is not None: raise ValueError("Can not insert a subtree that already has a parent.") # Set child's parent pointer & index. if not dry_run: child._parent = self class MultiParentedTree(AbstractParentedTree): """ A ``Tree`` that automatically maintains parent pointers for multi-parented trees. The following are methods for querying the structure of a multi-parented tree: ``parents()``, ``parent_indices()``, ``left_siblings()``, ``right_siblings()``, ``roots``, ``treepositions``. Each ``MultiParentedTree`` may have zero or more parents. In particular, subtrees may be shared. If a single ``MultiParentedTree`` is used as multiple children of the same parent, then that parent will appear multiple times in its ``parents()`` method. ``MultiParentedTrees`` should never be used in the same tree as ``Trees`` or ``ParentedTrees``. Mixing tree implementations may result in incorrect parent pointers and in ``TypeError`` exceptions. """ def __init__(self, node, children=None): self._parents = [] """A list of this tree's parents. This list should not contain duplicates, even if a parent contains this tree multiple times.""" super().__init__(node, children) if children is None: # If children is None, the tree is read from node. # After parsing, the parent(s) of the immediate children # will point to an intermediate tree, not self. # We fix this by brute force: for i, child in enumerate(self): if isinstance(child, Tree): child._parents = [] self._setparent(child, i) def _frozen_class(self): from nltk.tree.immutable import ImmutableMultiParentedTree return ImmutableMultiParentedTree # ///////////////////////////////////////////////////////////////// # Methods # ///////////////////////////////////////////////////////////////// def parents(self): """ The set of parents of this tree. If this tree has no parents, then ``parents`` is the empty set. To check if a tree is used as multiple children of the same parent, use the ``parent_indices()`` method. :type: list(MultiParentedTree) """ return list(self._parents) def left_siblings(self): """ A list of all left siblings of this tree, in any of its parent trees. A tree may be its own left sibling if it is used as multiple contiguous children of the same parent. A tree may appear multiple times in this list if it is the left sibling of this tree with respect to multiple parents. :type: list(MultiParentedTree) """ return [ parent[index - 1] for (parent, index) in self._get_parent_indices() if index > 0 ] def right_siblings(self): """ A list of all right siblings of this tree, in any of its parent trees. A tree may be its own right sibling if it is used as multiple contiguous children of the same parent. 
A tree may appear multiple times in this list if it is the right sibling of this tree with respect to multiple parents. :type: list(MultiParentedTree) """ return [ parent[index + 1] for (parent, index) in self._get_parent_indices() if index < (len(parent) - 1) ] def _get_parent_indices(self): return [ (parent, index) for parent in self._parents for index, child in enumerate(parent) if child is self ] def roots(self): """ The set of all roots of this tree. This set is formed by tracing all possible parent paths until trees with no parents are found. :type: list(MultiParentedTree) """ return list(self._get_roots_helper({}).values()) def _get_roots_helper(self, result): if self._parents: for parent in self._parents: parent._get_roots_helper(result) else: result[id(self)] = self return result def parent_indices(self, parent): """ Return a list of the indices where this tree occurs as a child of ``parent``. If this child does not occur as a child of ``parent``, then the empty list is returned. The following is always true:: for parent_index in ptree.parent_indices(parent): parent[parent_index] is ptree """ if parent not in self._parents: return [] else: return [index for (index, child) in enumerate(parent) if child is self] def treepositions(self, root): """ Return a list of all tree positions that can be used to reach this multi-parented tree starting from ``root``. I.e., the following is always true:: for treepos in ptree.treepositions(root): root[treepos] is ptree """ if self is root: return [()] else: return [ treepos + (index,) for parent in self._parents for treepos in parent.treepositions(root) for (index, child) in enumerate(parent) if child is self ] # ///////////////////////////////////////////////////////////////// # Parent Management # ///////////////////////////////////////////////////////////////// def _delparent(self, child, index): # Sanity checks assert isinstance(child, MultiParentedTree) assert self[index] is child assert len([p for p in child._parents if p is self]) == 1 # If the only copy of child in self is at index, then delete # self from child's parent list. for i, c in enumerate(self): if c is child and i != index: break else: child._parents.remove(self) def _setparent(self, child, index, dry_run=False): # If the child's type is incorrect, then complain. if not isinstance(child, MultiParentedTree): raise TypeError( "Can not insert a non-MultiParentedTree into a MultiParentedTree" ) # Add self as a parent pointer if it's not already listed. if not dry_run: for parent in child._parents: if parent is self: break else: child._parents.append(self) __all__ = [ "ParentedTree", "MultiParentedTree", ] nltk-3.7/nltk/tree/parsing.py000066400000000000000000000037411420073152400162620ustar00rootroot00000000000000# Natural Language Toolkit: Text Trees # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # Steven Bird # Peter Ljunglöf # Tom Aarsen <> # URL: # For license information, see LICENSE.TXT import re from nltk.tree.tree import Tree ###################################################################### ## Parsing ###################################################################### def bracket_parse(s): """ Use Tree.read(s, remove_empty_top_bracketing=True) instead. """ raise NameError("Use Tree.read(s, remove_empty_top_bracketing=True) instead.") def sinica_parse(s): """ Parse a Sinica Treebank string and return a tree. 
Trees are represented as nested brackettings, as shown in the following example (X represents a Chinese character): S(goal:NP(Head:Nep:XX)|theme:NP(Head:Nhaa:X)|quantity:Dab:X|Head:VL2:X)#0(PERIODCATEGORY) :return: A tree corresponding to the string representation. :rtype: Tree :param s: The string to be converted :type s: str """ tokens = re.split(r"([()| ])", s) for i in range(len(tokens)): if tokens[i] == "(": tokens[i - 1], tokens[i] = ( tokens[i], tokens[i - 1], ) # pull nonterminal inside parens elif ":" in tokens[i]: fields = tokens[i].split(":") if len(fields) == 2: # non-terminal tokens[i] = fields[1] else: tokens[i] = "(" + fields[-2] + " " + fields[-1] + ")" elif tokens[i] == "|": tokens[i] = "" treebank_string = " ".join(tokens) return Tree.fromstring(treebank_string, remove_empty_top_bracketing=True) # s = re.sub(r'^#[^\s]*\s', '', s) # remove leading identifier # s = re.sub(r'\w+:', '', s) # remove role tags # return s __all__ = [ "bracket_parse", "sinica_parse", ] nltk-3.7/nltk/tree/prettyprinter.py000066400000000000000000000606361420073152400175600ustar00rootroot00000000000000# Natural Language Toolkit: ASCII visualization of NLTK trees # # Copyright (C) 2001-2022 NLTK Project # Author: Andreas van Cranenburgh # Peter Ljunglöf # URL: # For license information, see LICENSE.TXT """ Pretty-printing of discontinuous trees. Adapted from the disco-dop project, by Andreas van Cranenburgh. https://github.com/andreasvc/disco-dop Interesting reference (not used for this code): T. Eschbach et al., Orth. Hypergraph Drawing, Journal of Graph Algorithms and Applications, 10(2) 141--157 (2006)149. https://jgaa.info/accepted/2006/EschbachGuentherBecker2006.10.2.pdf """ import re try: from html import escape except ImportError: from cgi import escape from collections import defaultdict from operator import itemgetter from nltk.tree.tree import Tree from nltk.util import OrderedDict ANSICOLOR = { "black": 30, "red": 31, "green": 32, "yellow": 33, "blue": 34, "magenta": 35, "cyan": 36, "white": 37, } class TreePrettyPrinter: """ Pretty-print a tree in text format, either as ASCII or Unicode. The tree can be a normal tree, or discontinuous. ``TreePrettyPrinter(tree, sentence=None, highlight=())`` creates an object from which different visualizations can be created. :param tree: a Tree object. :param sentence: a list of words (strings). If `sentence` is given, `tree` must contain integers as leaves, which are taken as indices in `sentence`. Using this you can display a discontinuous tree. :param highlight: Optionally, a sequence of Tree objects in `tree` which should be highlighted. Has the effect of only applying colors to nodes in this sequence (nodes should be given as Tree objects, terminals as indices). >>> from nltk.tree import Tree >>> tree = Tree.fromstring('(S (NP Mary) (VP walks))') >>> print(TreePrettyPrinter(tree).text()) ... # doctest: +NORMALIZE_WHITESPACE S ____|____ NP VP | | Mary walks """ def __init__(self, tree, sentence=None, highlight=()): if sentence is None: leaves = tree.leaves() if ( leaves and all(len(a) > 0 for a in tree.subtrees()) and all(isinstance(a, int) for a in leaves) ): sentence = [str(a) for a in leaves] else: # this deals with empty nodes (frontier non-terminals) # and multiple/mixed terminals under non-terminals. 
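                # (In the block below, every non-Tree leaf is replaced by its
                # integer position in the rebuilt ``sentence`` list (tuple
                # leaves are joined with "/"), and every empty subtree gets a
                # placeholder None token, so the layout code can treat all
                # leaves uniformly as indices into ``sentence``.)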
tree = tree.copy(True) sentence = [] for a in tree.subtrees(): if len(a) == 0: a.append(len(sentence)) sentence.append(None) elif any(not isinstance(b, Tree) for b in a): for n, b in enumerate(a): if not isinstance(b, Tree): a[n] = len(sentence) if type(b) == tuple: b = "/".join(b) sentence.append("%s" % b) self.nodes, self.coords, self.edges, self.highlight = self.nodecoords( tree, sentence, highlight ) def __str__(self): return self.text() def __repr__(self): return "" % len(self.nodes) @staticmethod def nodecoords(tree, sentence, highlight): """ Produce coordinates of nodes on a grid. Objective: - Produce coordinates for a non-overlapping placement of nodes and horizontal lines. - Order edges so that crossing edges cross a minimal number of previous horizontal lines (never vertical lines). Approach: - bottom up level order traversal (start at terminals) - at each level, identify nodes which cannot be on the same row - identify nodes which cannot be in the same column - place nodes into a grid at (row, column) - order child-parent edges with crossing edges last Coordinates are (row, column); the origin (0, 0) is at the top left; the root node is on row 0. Coordinates do not consider the size of a node (which depends on font, &c), so the width of a column of the grid should be automatically determined by the element with the greatest width in that column. Alternatively, the integer coordinates could be converted to coordinates in which the distances between adjacent nodes are non-uniform. Produces tuple (nodes, coords, edges, highlighted) where: - nodes[id]: Tree object for the node with this integer id - coords[id]: (n, m) coordinate where to draw node with id in the grid - edges[id]: parent id of node with this id (ordered dictionary) - highlighted: set of ids that should be highlighted """ def findcell(m, matrix, startoflevel, children): """ Find vacant row, column index for node ``m``. Iterate over current rows for this level (try lowest first) and look for cell between first and last child of this node, add new row to level if no free row available. 
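            Returns a free ``(row, column)`` cell of ``matrix``; the caller
            (the bottom-up placement loop below) assigns node ``m`` to that
            cell and records it as a child column of ``m``'s parent.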
""" candidates = [a for _, a in children[m]] minidx, maxidx = min(candidates), max(candidates) leaves = tree[m].leaves() center = scale * sum(leaves) // len(leaves) # center of gravity if minidx < maxidx and not minidx < center < maxidx: center = sum(candidates) // len(candidates) if max(candidates) - min(candidates) > 2 * scale: center -= center % scale # round to unscaled coordinate if minidx < maxidx and not minidx < center < maxidx: center += scale if ids[m] == 0: startoflevel = len(matrix) for rowidx in range(startoflevel, len(matrix) + 1): if rowidx == len(matrix): # need to add a new row matrix.append( [ vertline if a not in (corner, None) else None for a in matrix[-1] ] ) row = matrix[rowidx] i = j = center if len(children[m]) == 1: # place unaries directly above child return rowidx, next(iter(children[m]))[1] elif all( a is None or a == vertline for a in row[min(candidates) : max(candidates) + 1] ): # find free column for n in range(scale): i = j = center + n while j > minidx or i < maxidx: if i < maxidx and ( matrix[rowidx][i] is None or i in candidates ): return rowidx, i elif j > minidx and ( matrix[rowidx][j] is None or j in candidates ): return rowidx, j i += scale j -= scale raise ValueError( "could not find a free cell for:\n%s\n%s" "min=%d; max=%d" % (tree[m], minidx, maxidx, dumpmatrix()) ) def dumpmatrix(): """Dump matrix contents for debugging purposes.""" return "\n".join( "%2d: %s" % (n, " ".join(("%2r" % i)[:2] for i in row)) for n, row in enumerate(matrix) ) leaves = tree.leaves() if not all(isinstance(n, int) for n in leaves): raise ValueError("All leaves must be integer indices.") if len(leaves) != len(set(leaves)): raise ValueError("Indices must occur at most once.") if not all(0 <= n < len(sentence) for n in leaves): raise ValueError( "All leaves must be in the interval 0..n " "with n=len(sentence)\ntokens: %d indices: " "%r\nsentence: %s" % (len(sentence), tree.leaves(), sentence) ) vertline, corner = -1, -2 # constants tree = tree.copy(True) for a in tree.subtrees(): a.sort(key=lambda n: min(n.leaves()) if isinstance(n, Tree) else n) scale = 2 crossed = set() # internal nodes and lexical nodes (no frontiers) positions = tree.treepositions() maxdepth = max(map(len, positions)) + 1 childcols = defaultdict(set) matrix = [[None] * (len(sentence) * scale)] nodes = {} ids = {a: n for n, a in enumerate(positions)} highlighted_nodes = { n for a, n in ids.items() if not highlight or tree[a] in highlight } levels = {n: [] for n in range(maxdepth - 1)} terminals = [] for a in positions: node = tree[a] if isinstance(node, Tree): levels[maxdepth - node.height()].append(a) else: terminals.append(a) for n in levels: levels[n].sort(key=lambda n: max(tree[n].leaves()) - min(tree[n].leaves())) terminals.sort() positions = set(positions) for m in terminals: i = int(tree[m]) * scale assert matrix[0][i] is None, (matrix[0][i], m, i) matrix[0][i] = ids[m] nodes[ids[m]] = sentence[tree[m]] if nodes[ids[m]] is None: nodes[ids[m]] = "..." highlighted_nodes.discard(ids[m]) positions.remove(m) childcols[m[:-1]].add((0, i)) # add other nodes centered on their children, # if the center is already taken, back off # to the left and right alternately, until an empty cell is found. 
for n in sorted(levels, reverse=True): nodesatdepth = levels[n] startoflevel = len(matrix) matrix.append( [vertline if a not in (corner, None) else None for a in matrix[-1]] ) for m in nodesatdepth: # [::-1]: if n < maxdepth - 1 and childcols[m]: _, pivot = min(childcols[m], key=itemgetter(1)) if { a[:-1] for row in matrix[:-1] for a in row[:pivot] if isinstance(a, tuple) } & { a[:-1] for row in matrix[:-1] for a in row[pivot:] if isinstance(a, tuple) }: crossed.add(m) rowidx, i = findcell(m, matrix, startoflevel, childcols) positions.remove(m) # block positions where children of this node branch out for _, x in childcols[m]: matrix[rowidx][x] = corner # assert m == () or matrix[rowidx][i] in (None, corner), ( # matrix[rowidx][i], m, str(tree), ' '.join(sentence)) # node itself matrix[rowidx][i] = ids[m] nodes[ids[m]] = tree[m] # add column to the set of children for its parent if len(m) > 0: childcols[m[:-1]].add((rowidx, i)) assert len(positions) == 0 # remove unused columns, right to left for m in range(scale * len(sentence) - 1, -1, -1): if not any(isinstance(row[m], (Tree, int)) for row in matrix): for row in matrix: del row[m] # remove unused rows, reverse matrix = [ row for row in reversed(matrix) if not all(a is None or a == vertline for a in row) ] # collect coordinates of nodes coords = {} for n, _ in enumerate(matrix): for m, i in enumerate(matrix[n]): if isinstance(i, int) and i >= 0: coords[i] = n, m # move crossed edges last positions = sorted( (a for level in levels.values() for a in level), key=lambda a: a[:-1] in crossed, ) # collect edges from node to node edges = OrderedDict() for i in reversed(positions): for j, _ in enumerate(tree[i]): edges[ids[i + (j,)]] = ids[i] return nodes, coords, edges, highlighted_nodes def text( self, nodedist=1, unicodelines=False, html=False, ansi=False, nodecolor="blue", leafcolor="red", funccolor="green", abbreviate=None, maxwidth=16, ): """ :return: ASCII art for a discontinuous tree. :param unicodelines: whether to use Unicode line drawing characters instead of plain (7-bit) ASCII. :param html: whether to wrap output in html code (default plain text). :param ansi: whether to produce colors with ANSI escape sequences (only effective when html==False). :param leafcolor, nodecolor: specify colors of leaves and phrasal nodes; effective when either html or ansi is True. :param abbreviate: if True, abbreviate labels longer than 5 characters. If integer, abbreviate labels longer than `abbr` characters. :param maxwidth: maximum number of characters before a label starts to wrap; pass None to disable. """ if abbreviate == True: abbreviate = 5 if unicodelines: horzline = "\u2500" leftcorner = "\u250c" rightcorner = "\u2510" vertline = " \u2502 " tee = horzline + "\u252C" + horzline bottom = horzline + "\u2534" + horzline cross = horzline + "\u253c" + horzline ellipsis = "\u2026" else: horzline = "_" leftcorner = rightcorner = " " vertline = " | " tee = 3 * horzline cross = bottom = "_|_" ellipsis = "." 
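            # (The strings above are the drawing primitives used further
            # down: ``tee``, ``bottom`` and ``cross`` build the horizontal
            # branch row linking a parent to its children, ``vertline``
            # continues an edge through intervening rows, and ``ellipsis``
            # marks labels truncated by ``abbreviate``.  The else-branch
            # supplies plain-ASCII fallbacks when ``unicodelines`` is False.)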
def crosscell(cur, x=vertline): """Overwrite center of this cell with a vertical branch.""" splitl = len(cur) - len(cur) // 2 - len(x) // 2 - 1 lst = list(cur) lst[splitl : splitl + len(x)] = list(x) return "".join(lst) result = [] matrix = defaultdict(dict) maxnodewith = defaultdict(lambda: 3) maxnodeheight = defaultdict(lambda: 1) maxcol = 0 minchildcol = {} maxchildcol = {} childcols = defaultdict(set) labels = {} wrapre = re.compile( "(.{%d,%d}\\b\\W*|.{%d})" % (maxwidth - 4, maxwidth, maxwidth) ) # collect labels and coordinates for a in self.nodes: row, column = self.coords[a] matrix[row][column] = a maxcol = max(maxcol, column) label = ( self.nodes[a].label() if isinstance(self.nodes[a], Tree) else self.nodes[a] ) if abbreviate and len(label) > abbreviate: label = label[:abbreviate] + ellipsis if maxwidth and len(label) > maxwidth: label = wrapre.sub(r"\1\n", label).strip() label = label.split("\n") maxnodeheight[row] = max(maxnodeheight[row], len(label)) maxnodewith[column] = max(maxnodewith[column], max(map(len, label))) labels[a] = label if a not in self.edges: continue # e.g., root parent = self.edges[a] childcols[parent].add((row, column)) minchildcol[parent] = min(minchildcol.get(parent, column), column) maxchildcol[parent] = max(maxchildcol.get(parent, column), column) # bottom up level order traversal for row in sorted(matrix, reverse=True): noderows = [ ["".center(maxnodewith[col]) for col in range(maxcol + 1)] for _ in range(maxnodeheight[row]) ] branchrow = ["".center(maxnodewith[col]) for col in range(maxcol + 1)] for col in matrix[row]: n = matrix[row][col] node = self.nodes[n] text = labels[n] if isinstance(node, Tree): # draw horizontal branch towards children for this node if n in minchildcol and minchildcol[n] < maxchildcol[n]: i, j = minchildcol[n], maxchildcol[n] a, b = (maxnodewith[i] + 1) // 2 - 1, maxnodewith[j] // 2 branchrow[i] = ((" " * a) + leftcorner).ljust( maxnodewith[i], horzline ) branchrow[j] = (rightcorner + (" " * b)).rjust( maxnodewith[j], horzline ) for i in range(minchildcol[n] + 1, maxchildcol[n]): if i == col and any(a == i for _, a in childcols[n]): line = cross elif i == col: line = bottom elif any(a == i for _, a in childcols[n]): line = tee else: line = horzline branchrow[i] = line.center(maxnodewith[i], horzline) else: # if n and n in minchildcol: branchrow[col] = crosscell(branchrow[col]) text = [a.center(maxnodewith[col]) for a in text] color = nodecolor if isinstance(node, Tree) else leafcolor if isinstance(node, Tree) and node.label().startswith("-"): color = funccolor if html: text = [escape(a, quote=False) for a in text] if n in self.highlight: text = [f"{a}" for a in text] elif ansi and n in self.highlight: text = ["\x1b[%d;1m%s\x1b[0m" % (ANSICOLOR[color], a) for a in text] for x in range(maxnodeheight[row]): # draw vertical lines in partially filled multiline node # labels, but only if it's not a frontier node. noderows[x][col] = ( text[x] if x < len(text) else (vertline if childcols[n] else " ").center( maxnodewith[col], " " ) ) # for each column, if there is a node below us which has a parent # above us, draw a vertical branch in that column. 
if row != max(matrix): for n, (childrow, col) in self.coords.items(): if n > 0 and self.coords[self.edges[n]][0] < row < childrow: branchrow[col] = crosscell(branchrow[col]) if col not in matrix[row]: for noderow in noderows: noderow[col] = crosscell(noderow[col]) branchrow = [ a + ((a[-1] if a[-1] != " " else b[0]) * nodedist) for a, b in zip(branchrow, branchrow[1:] + [" "]) ] result.append("".join(branchrow)) result.extend( (" " * nodedist).join(noderow) for noderow in reversed(noderows) ) return "\n".join(reversed(result)) + "\n" def svg(self, nodecolor="blue", leafcolor="red", funccolor="green"): """ :return: SVG representation of a tree. """ fontsize = 12 hscale = 40 vscale = 25 hstart = vstart = 20 width = max(col for _, col in self.coords.values()) height = max(row for row, _ in self.coords.values()) result = [ '' % ( width * 3, height * 2.5, -hstart, -vstart, width * hscale + 3 * hstart, height * vscale + 3 * vstart, ) ] children = defaultdict(set) for n in self.nodes: if n: children[self.edges[n]].add(n) # horizontal branches from nodes to children for node in self.nodes: if not children[node]: continue y, x = self.coords[node] x *= hscale y *= vscale x += hstart y += vstart + fontsize // 2 childx = [self.coords[c][1] for c in children[node]] xmin = hstart + hscale * min(childx) xmax = hstart + hscale * max(childx) result.append( '\t' % (xmin, y, xmax, y) ) result.append( '\t' % (x, y, x, y - fontsize // 3) ) # vertical branches from children to parents for child, parent in self.edges.items(): y, _ = self.coords[parent] y *= vscale y += vstart + fontsize // 2 childy, childx = self.coords[child] childx *= hscale childy *= vscale childx += hstart childy += vstart - fontsize result += [ '\t' % (childx, childy, childx, y + 5), '\t' % (childx, childy, childx, y), ] # write nodes with coordinates for n, (row, column) in self.coords.items(): node = self.nodes[n] x = column * hscale + hstart y = row * vscale + vstart if n in self.highlight: color = nodecolor if isinstance(node, Tree) else leafcolor if isinstance(node, Tree) and node.label().startswith("-"): color = funccolor else: color = "black" result += [ '\t%s' % ( color, fontsize, x, y, escape( node.label() if isinstance(node, Tree) else node, quote=False ), ) ] result += [""] return "\n".join(result) def test(): """Do some tree drawing tests.""" def print_tree(n, tree, sentence=None, ansi=True, **xargs): print() print('{}: "{}"'.format(n, " ".join(sentence or tree.leaves()))) print(tree) print() drawtree = TreePrettyPrinter(tree, sentence) try: print(drawtree.text(unicodelines=ansi, ansi=ansi, **xargs)) except (UnicodeDecodeError, UnicodeEncodeError): print(drawtree.text(unicodelines=False, ansi=False, **xargs)) from nltk.corpus import treebank for n in [0, 1440, 1591, 2771, 2170]: tree = treebank.parsed_sents()[n] print_tree(n, tree, nodedist=2, maxwidth=8) print() print("ASCII version:") print(TreePrettyPrinter(tree).text(nodedist=2)) tree = Tree.fromstring( "(top (punct 8) (smain (noun 0) (verb 1) (inf (verb 5) (inf (verb 6) " "(conj (inf (pp (prep 2) (np (det 3) (noun 4))) (verb 7)) (inf (verb 9)) " "(vg 10) (inf (verb 11)))))) (punct 12))", read_leaf=int, ) sentence = ( "Ze had met haar moeder kunnen gaan winkelen ," " zwemmen of terrassen .".split() ) print_tree("Discontinuous tree", tree, sentence, nodedist=2) __all__ = ["TreePrettyPrinter"] if __name__ == "__main__": test() nltk-3.7/nltk/tree/probabilistic.py000066400000000000000000000046621420073152400174500ustar00rootroot00000000000000# Natural Language Toolkit: Text 
Trees # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # Steven Bird # Peter Ljunglöf # Tom Aarsen <> # URL: # For license information, see LICENSE.TXT from nltk.internals import raise_unorderable_types from nltk.probability import ProbabilisticMixIn from nltk.tree.immutable import ImmutableProbabilisticTree from nltk.tree.tree import Tree ###################################################################### ## Probabilistic trees ###################################################################### class ProbabilisticTree(Tree, ProbabilisticMixIn): def __init__(self, node, children=None, **prob_kwargs): Tree.__init__(self, node, children) ProbabilisticMixIn.__init__(self, **prob_kwargs) # We have to patch up these methods to make them work right: def _frozen_class(self): return ImmutableProbabilisticTree def __repr__(self): return f"{Tree.__repr__(self)} (p={self.prob()!r})" def __str__(self): return f"{self.pformat(margin=60)} (p={self.prob():.6g})" def copy(self, deep=False): if not deep: return type(self)(self._label, self, prob=self.prob()) else: return type(self).convert(self) @classmethod def convert(cls, val): if isinstance(val, Tree): children = [cls.convert(child) for child in val] if isinstance(val, ProbabilisticMixIn): return cls(val._label, children, prob=val.prob()) else: return cls(val._label, children, prob=1.0) else: return val def __eq__(self, other): return ( self.__class__ is other.__class__ and ( self._label, list(self), self.prob(), ) == (other._label, list(other), other.prob()) ) def __lt__(self, other): if not isinstance(other, Tree): raise_unorderable_types("<", self, other) if self.__class__ is other.__class__: return (self._label, list(self), self.prob()) < ( other._label, list(other), other.prob(), ) else: return self.__class__.__name__ < other.__class__.__name__ __all__ = ["ProbabilisticTree"] nltk-3.7/nltk/tree/transforms.py000066400000000000000000000320471420073152400170160ustar00rootroot00000000000000# Natural Language Toolkit: Tree Transformations # # Copyright (C) 2005-2007 Oregon Graduate Institute # Author: Nathan Bodenstab # URL: # For license information, see LICENSE.TXT r""" A collection of methods for tree (grammar) transformations used in parsing natural language. Although many of these methods are technically grammar transformations (ie. Chomsky Norm Form), when working with treebanks it is much more natural to visualize these modifications in a tree structure. Hence, we will do all transformation directly to the tree itself. Transforming the tree directly also allows us to do parent annotation. A grammar can then be simply induced from the modified tree. The following is a short tutorial on the available transformations. 1. Chomsky Normal Form (binarization) It is well known that any grammar has a Chomsky Normal Form (CNF) equivalent grammar where CNF is defined by every production having either two non-terminals or one terminal on its right hand side. When we have hierarchically structured data (ie. a treebank), it is natural to view this in terms of productions where the root of every subtree is the head (left hand side) of the production and all of its children are the right hand side constituents. In order to convert a tree into CNF, we simply need to ensure that every subtree has either two subtrees as children (binarization), or one leaf node (non-terminal). In order to binarize a subtree with more than two children, we must introduce artificial nodes. 
There are two popular methods to convert a tree into CNF: left factoring and right factoring. The following example demonstrates the difference between them. Example:: Original Right-Factored Left-Factored A A A / | \ / \ / \ B C D ==> B A| OR A| D / \ / \ C D B C 2. Parent Annotation In addition to binarizing the tree, there are two standard modifications to node labels we can do in the same traversal: parent annotation and Markov order-N smoothing (or sibling smoothing). The purpose of parent annotation is to refine the probabilities of productions by adding a small amount of context. With this simple addition, a CYK (inside-outside, dynamic programming chart parse) can improve from 74% to 79% accuracy. A natural generalization from parent annotation is to grandparent annotation and beyond. The tradeoff becomes accuracy gain vs. computational complexity. We must also keep in mind data sparcity issues. Example:: Original Parent Annotation A A^ / | \ / \ B C D ==> B^
    A|^ where ? is the / \ parent of A C^ D^ 3. Markov order-N smoothing Markov smoothing combats data sparcity issues as well as decreasing computational requirements by limiting the number of children included in artificial nodes. In practice, most people use an order 2 grammar. Example:: Original No Smoothing Markov order 1 Markov order 2 etc. __A__ A A A / /|\ \ / \ / \ / \ B C D E F ==> B A| ==> B A| ==> B A| / \ / \ / \ C ... C ... C ... Annotation decisions can be thought about in the vertical direction (parent, grandparent, etc) and the horizontal direction (number of siblings to keep). Parameters to the following functions specify these values. For more information see: Dan Klein and Chris Manning (2003) "Accurate Unlexicalized Parsing", ACL-03. https://www.aclweb.org/anthology/P03-1054 4. Unary Collapsing Collapse unary productions (ie. subtrees with a single child) into a new non-terminal (Tree node). This is useful when working with algorithms that do not allow unary productions, yet you do not wish to lose the parent information. Example:: A | B ==> A+B / \ / \ C D C D """ from nltk.tree.tree import Tree def chomsky_normal_form( tree, factor="right", horzMarkov=None, vertMarkov=0, childChar="|", parentChar="^" ): # assume all subtrees have homogeneous children # assume all terminals have no siblings # A semi-hack to have elegant looking code below. As a result, # any subtree with a branching factor greater than 999 will be incorrectly truncated. if horzMarkov is None: horzMarkov = 999 # Traverse the tree depth-first keeping a list of ancestor nodes to the root. # I chose not to use the tree.treepositions() method since it requires # two traversals of the tree (one to get the positions, one to iterate # over them) and node access time is proportional to the height of the node. # This method is 7x faster which helps when parsing 40,000 sentences. nodeList = [(tree, [tree.label()])] while nodeList != []: node, parent = nodeList.pop() if isinstance(node, Tree): # parent annotation parentString = "" originalNode = node.label() if vertMarkov != 0 and node != tree and isinstance(node[0], Tree): parentString = "{}<{}>".format(parentChar, "-".join(parent)) node.set_label(node.label() + parentString) parent = [originalNode] + parent[: vertMarkov - 1] # add children to the agenda before we mess with them for child in node: nodeList.append((child, parent)) # chomsky normal form factorization if len(node) > 2: childNodes = [child.label() for child in node] nodeCopy = node.copy() node[0:] = [] # delete the children curNode = node numChildren = len(nodeCopy) for i in range(1, numChildren - 1): if factor == "right": newHead = "{}{}<{}>{}".format( originalNode, childChar, "-".join( childNodes[i : min([i + horzMarkov, numChildren])] ), parentString, ) # create new head newNode = Tree(newHead, []) curNode[0:] = [nodeCopy.pop(0), newNode] else: newHead = "{}{}<{}>{}".format( originalNode, childChar, "-".join( childNodes[max([numChildren - i - horzMarkov, 0]) : -i] ), parentString, ) newNode = Tree(newHead, []) curNode[0:] = [newNode, nodeCopy.pop()] curNode = newNode curNode[0:] = [child for child in nodeCopy] def un_chomsky_normal_form( tree, expandUnary=True, childChar="|", parentChar="^", unaryChar="+" ): # Traverse the tree-depth first keeping a pointer to the parent for modification purposes. 
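    # Each agenda entry pairs a node with its parent, so that artificial
    # nodes (labels containing ``childChar``) can be spliced out by
    # re-attaching their two children to that parent.  Parent annotations
    # (everything after ``parentChar``) are stripped from labels, and, when
    # ``expandUnary`` is set, labels joined by ``unaryChar`` are expanded
    # back into unary subtrees.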
nodeList = [(tree, [])] while nodeList != []: node, parent = nodeList.pop() if isinstance(node, Tree): # if the node contains the 'childChar' character it means that # it is an artificial node and can be removed, although we still need # to move its children to its parent childIndex = node.label().find(childChar) if childIndex != -1: nodeIndex = parent.index(node) parent.remove(parent[nodeIndex]) # Generated node was on the left if the nodeIndex is 0 which # means the grammar was left factored. We must insert the children # at the beginning of the parent's children if nodeIndex == 0: parent.insert(0, node[0]) parent.insert(1, node[1]) else: parent.extend([node[0], node[1]]) # parent is now the current node so the children of parent will be added to the agenda node = parent else: parentIndex = node.label().find(parentChar) if parentIndex != -1: # strip the node name of the parent annotation node.set_label(node.label()[:parentIndex]) # expand collapsed unary productions if expandUnary == True: unaryIndex = node.label().find(unaryChar) if unaryIndex != -1: newNode = Tree( node.label()[unaryIndex + 1 :], [i for i in node] ) node.set_label(node.label()[:unaryIndex]) node[0:] = [newNode] for child in node: nodeList.append((child, node)) def collapse_unary(tree, collapsePOS=False, collapseRoot=False, joinChar="+"): """ Collapse subtrees with a single child (ie. unary productions) into a new non-terminal (Tree node) joined by 'joinChar'. This is useful when working with algorithms that do not allow unary productions, and completely removing the unary productions would require loss of useful information. The Tree is modified directly (since it is passed by reference) and no value is returned. :param tree: The Tree to be collapsed :type tree: Tree :param collapsePOS: 'False' (default) will not collapse the parent of leaf nodes (ie. Part-of-Speech tags) since they are always unary productions :type collapsePOS: bool :param collapseRoot: 'False' (default) will not modify the root production if it is unary. For the Penn WSJ treebank corpus, this corresponds to the TOP -> productions. :type collapseRoot: bool :param joinChar: A string used to connect collapsed node values (default = "+") :type joinChar: str """ if collapseRoot == False and isinstance(tree, Tree) and len(tree) == 1: nodeList = [tree[0]] else: nodeList = [tree] # depth-first traversal of tree while nodeList != []: node = nodeList.pop() if isinstance(node, Tree): if ( len(node) == 1 and isinstance(node[0], Tree) and (collapsePOS == True or isinstance(node[0, 0], Tree)) ): node.set_label(node.label() + joinChar + node[0].label()) node[0:] = [child for child in node[0]] # since we assigned the child's children to the current node, # evaluate the current node again nodeList.append(node) else: for child in node: nodeList.append(child) ################################################################# # Demonstration ################################################################# def demo(): """ A demonstration showing how each tree transform can be used. """ from copy import deepcopy from nltk.draw.tree import draw_trees from nltk.tree.tree import Tree # original tree from WSJ bracketed text sentence = """(TOP (S (S (VP (VBN Turned) (ADVP (RB loose)) (PP (IN in) (NP (NP (NNP Shane) (NNP Longman) (POS 's)) (NN trading) (NN room))))) (, ,) (NP (DT the) (NN yuppie) (NNS dealers)) (VP (AUX do) (NP (NP (RB little)) (ADJP (RB right)))) (. 
.)))""" t = Tree.fromstring(sentence, remove_empty_top_bracketing=True) # collapse subtrees with only one child collapsedTree = deepcopy(t) collapse_unary(collapsedTree) # convert the tree to CNF cnfTree = deepcopy(collapsedTree) chomsky_normal_form(cnfTree) # convert the tree to CNF with parent annotation (one level) and horizontal smoothing of order two parentTree = deepcopy(collapsedTree) chomsky_normal_form(parentTree, horzMarkov=2, vertMarkov=1) # convert the tree back to its original form (used to make CYK results comparable) original = deepcopy(parentTree) un_chomsky_normal_form(original) # convert tree back to bracketed text sentence2 = original.pprint() print(sentence) print(sentence2) print("Sentences the same? ", sentence == sentence2) draw_trees(t, collapsedTree, cnfTree, parentTree, original) if __name__ == "__main__": demo() __all__ = ["chomsky_normal_form", "un_chomsky_normal_form", "collapse_unary"] nltk-3.7/nltk/tree/tree.py000066400000000000000000001053111420073152400155520ustar00rootroot00000000000000# Natural Language Toolkit: Text Trees # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # Steven Bird # Peter Ljunglöf # Nathan Bodenstab (tree transforms) # Eric Kafe (Tree.fromlist()) # Mohaned mashaly (Deprecating methods) # URL: # For license information, see LICENSE.TXT """ Class for representing hierarchical language structures, such as syntax trees and morphological trees. """ import re import sys from nltk.grammar import Nonterminal, Production from nltk.internals import deprecated ###################################################################### ## Trees ###################################################################### class Tree(list): r""" A Tree represents a hierarchical grouping of leaves and subtrees. For example, each constituent in a syntax tree is represented by a single Tree. A tree's children are encoded as a list of leaves and subtrees, where a leaf is a basic (non-tree) value; and a subtree is a nested Tree. >>> from nltk.tree import Tree >>> print(Tree(1, [2, Tree(3, [4]), 5])) (1 2 (3 4) 5) >>> vp = Tree('VP', [Tree('V', ['saw']), ... Tree('NP', ['him'])]) >>> s = Tree('S', [Tree('NP', ['I']), vp]) >>> print(s) (S (NP I) (VP (V saw) (NP him))) >>> print(s[1]) (VP (V saw) (NP him)) >>> print(s[1,1]) (NP him) >>> t = Tree.fromstring("(S (NP I) (VP (V saw) (NP him)))") >>> s == t True >>> t[1][1].set_label('X') >>> t[1][1].label() 'X' >>> print(t) (S (NP I) (VP (V saw) (X him))) >>> t[0], t[1,1] = t[1,1], t[0] >>> print(t) (S (X him) (VP (V saw) (NP I))) The length of a tree is the number of children it has. >>> len(t) 2 The set_label() and label() methods allow individual constituents to be labeled. For example, syntax trees use this label to specify phrase tags, such as "NP" and "VP". Several Tree methods use "tree positions" to specify children or descendants of a tree. Tree positions are defined as follows: - The tree position *i* specifies a Tree's *i*\ th child. - The tree position ``()`` specifies the Tree itself. - If *p* is the tree position of descendant *d*, then *p+i* specifies the *i*\ th child of *d*. I.e., every tree position is either a single index *i*, specifying ``tree[i]``; or a sequence *i1, i2, ..., iN*, specifying ``tree[i1][i2]...[iN]``. Construct a new tree. This constructor can be called in one of two ways: - ``Tree(label, children)`` constructs a new tree with the specified label and list of children. - ``Tree.fromstring(s)`` constructs a new tree by parsing the string ``s``. 
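    For instance, the following two calls should build equal trees:

        >>> Tree('NP', [Tree('D', ['the']), Tree('N', ['dog'])]) == Tree.fromstring('(NP (D the) (N dog))')
        True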
""" def __init__(self, node, children=None): if children is None: raise TypeError( "%s: Expected a node value and child list " % type(self).__name__ ) elif isinstance(children, str): raise TypeError( "%s() argument 2 should be a list, not a " "string" % type(self).__name__ ) else: list.__init__(self, children) self._label = node # //////////////////////////////////////////////////////////// # Comparison operators # //////////////////////////////////////////////////////////// def __eq__(self, other): return self.__class__ is other.__class__ and (self._label, list(self)) == ( other._label, list(other), ) def __lt__(self, other): if not isinstance(other, Tree): # raise_unorderable_types("<", self, other) # Sometimes children can be pure strings, # so we need to be able to compare with non-trees: return self.__class__.__name__ < other.__class__.__name__ elif self.__class__ is other.__class__: return (self._label, list(self)) < (other._label, list(other)) else: return self.__class__.__name__ < other.__class__.__name__ # @total_ordering doesn't work here, since the class inherits from a builtin class __ne__ = lambda self, other: not self == other __gt__ = lambda self, other: not (self < other or self == other) __le__ = lambda self, other: self < other or self == other __ge__ = lambda self, other: not self < other # //////////////////////////////////////////////////////////// # Disabled list operations # //////////////////////////////////////////////////////////// def __mul__(self, v): raise TypeError("Tree does not support multiplication") def __rmul__(self, v): raise TypeError("Tree does not support multiplication") def __add__(self, v): raise TypeError("Tree does not support addition") def __radd__(self, v): raise TypeError("Tree does not support addition") # //////////////////////////////////////////////////////////// # Indexing (with support for tree positions) # //////////////////////////////////////////////////////////// def __getitem__(self, index): if isinstance(index, (int, slice)): return list.__getitem__(self, index) elif isinstance(index, (list, tuple)): if len(index) == 0: return self elif len(index) == 1: return self[index[0]] else: return self[index[0]][index[1:]] else: raise TypeError( "%s indices must be integers, not %s" % (type(self).__name__, type(index).__name__) ) def __setitem__(self, index, value): if isinstance(index, (int, slice)): return list.__setitem__(self, index, value) elif isinstance(index, (list, tuple)): if len(index) == 0: raise IndexError("The tree position () may not be " "assigned to.") elif len(index) == 1: self[index[0]] = value else: self[index[0]][index[1:]] = value else: raise TypeError( "%s indices must be integers, not %s" % (type(self).__name__, type(index).__name__) ) def __delitem__(self, index): if isinstance(index, (int, slice)): return list.__delitem__(self, index) elif isinstance(index, (list, tuple)): if len(index) == 0: raise IndexError("The tree position () may not be deleted.") elif len(index) == 1: del self[index[0]] else: del self[index[0]][index[1:]] else: raise TypeError( "%s indices must be integers, not %s" % (type(self).__name__, type(index).__name__) ) # //////////////////////////////////////////////////////////// # Basic tree operations # //////////////////////////////////////////////////////////// @deprecated("Use label() instead") def _get_node(self): """Outdated method to access the node value; use the label() method instead.""" @deprecated("Use set_label() instead") def _set_node(self, value): """Outdated method to set the 
node value; use the set_label() method instead.""" node = property(_get_node, _set_node) def label(self): """ Return the node label of the tree. >>> t = Tree.fromstring('(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))') >>> t.label() 'S' :return: the node label (typically a string) :rtype: any """ return self._label def set_label(self, label): """ Set the node label of the tree. >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))") >>> t.set_label("T") >>> print(t) (T (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat)))) :param label: the node label (typically a string) :type label: any """ self._label = label def leaves(self): """ Return the leaves of the tree. >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))") >>> t.leaves() ['the', 'dog', 'chased', 'the', 'cat'] :return: a list containing this tree's leaves. The order reflects the order of the leaves in the tree's hierarchical structure. :rtype: list """ leaves = [] for child in self: if isinstance(child, Tree): leaves.extend(child.leaves()) else: leaves.append(child) return leaves def flatten(self): """ Return a flat version of the tree, with all non-root non-terminals removed. >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))") >>> print(t.flatten()) (S the dog chased the cat) :return: a tree consisting of this tree's root connected directly to its leaves, omitting all intervening non-terminal nodes. :rtype: Tree """ return Tree(self.label(), self.leaves()) def height(self): """ Return the height of the tree. >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))") >>> t.height() 5 >>> print(t[0,0]) (D the) >>> t[0,0].height() 2 :return: The height of this tree. The height of a tree containing no children is 1; the height of a tree containing only leaves is 2; and the height of any other tree is one plus the maximum of its children's heights. :rtype: int """ max_child_height = 0 for child in self: if isinstance(child, Tree): max_child_height = max(max_child_height, child.height()) else: max_child_height = max(max_child_height, 1) return 1 + max_child_height def treepositions(self, order="preorder"): """ >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))") >>> t.treepositions() # doctest: +ELLIPSIS [(), (0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0), (1,), (1, 0), (1, 0, 0), ...] >>> for pos in t.treepositions('leaves'): ... t[pos] = t[pos][::-1].upper() >>> print(t) (S (NP (D EHT) (N GOD)) (VP (V DESAHC) (NP (D EHT) (N TAC)))) :param order: One of: ``preorder``, ``postorder``, ``bothorder``, ``leaves``. """ positions = [] if order in ("preorder", "bothorder"): positions.append(()) for i, child in enumerate(self): if isinstance(child, Tree): childpos = child.treepositions(order) positions.extend((i,) + p for p in childpos) else: positions.append((i,)) if order in ("postorder", "bothorder"): positions.append(()) return positions def subtrees(self, filter=None): """ Generate all the subtrees of this tree, optionally restricted to trees matching the filter function. >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))") >>> for s in t.subtrees(lambda t: t.height() == 2): ... 
print(s) (D the) (N dog) (V chased) (D the) (N cat) :type filter: function :param filter: the function to filter all local trees """ if not filter or filter(self): yield self for child in self: if isinstance(child, Tree): yield from child.subtrees(filter) def productions(self): """ Generate the productions that correspond to the non-terminal nodes of the tree. For each subtree of the form (P: C1 C2 ... Cn) this produces a production of the form P -> C1 C2 ... Cn. >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))") >>> t.productions() # doctest: +NORMALIZE_WHITESPACE [S -> NP VP, NP -> D N, D -> 'the', N -> 'dog', VP -> V NP, V -> 'chased', NP -> D N, D -> 'the', N -> 'cat'] :rtype: list(Production) """ if not isinstance(self._label, str): raise TypeError( "Productions can only be generated from trees having node labels that are strings" ) prods = [Production(Nonterminal(self._label), _child_names(self))] for child in self: if isinstance(child, Tree): prods += child.productions() return prods def pos(self): """ Return a sequence of pos-tagged words extracted from the tree. >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))") >>> t.pos() [('the', 'D'), ('dog', 'N'), ('chased', 'V'), ('the', 'D'), ('cat', 'N')] :return: a list of tuples containing leaves and pre-terminals (part-of-speech tags). The order reflects the order of the leaves in the tree's hierarchical structure. :rtype: list(tuple) """ pos = [] for child in self: if isinstance(child, Tree): pos.extend(child.pos()) else: pos.append((child, self._label)) return pos def leaf_treeposition(self, index): """ :return: The tree position of the ``index``-th leaf in this tree. I.e., if ``tp=self.leaf_treeposition(i)``, then ``self[tp]==self.leaves()[i]``. :raise IndexError: If this tree contains fewer than ``index+1`` leaves, or if ``index<0``. """ if index < 0: raise IndexError("index must be non-negative") stack = [(self, ())] while stack: value, treepos = stack.pop() if not isinstance(value, Tree): if index == 0: return treepos else: index -= 1 else: for i in range(len(value) - 1, -1, -1): stack.append((value[i], treepos + (i,))) raise IndexError("index must be less than or equal to len(self)") def treeposition_spanning_leaves(self, start, end): """ :return: The tree position of the lowest descendant of this tree that dominates ``self.leaves()[start:end]``. :raise ValueError: if ``end <= start`` """ if end <= start: raise ValueError("end must be greater than start") # Find the tree positions of the start & end leaves, and # take the longest common subsequence. start_treepos = self.leaf_treeposition(start) end_treepos = self.leaf_treeposition(end - 1) # Find the first index where they mismatch: for i in range(len(start_treepos)): if i == len(end_treepos) or start_treepos[i] != end_treepos[i]: return start_treepos[:i] return start_treepos # //////////////////////////////////////////////////////////// # Transforms # //////////////////////////////////////////////////////////// def chomsky_normal_form( self, factor="right", horzMarkov=None, vertMarkov=0, childChar="|", parentChar="^", ): """ This method can modify a tree in three ways: 1. Convert a tree into its Chomsky Normal Form (CNF) equivalent -- Every subtree has either two non-terminals or one terminal as its children. This process requires the creation of more"artificial" non-terminal nodes. 2. Markov (vertical) smoothing of children in new artificial nodes 3. 
Horizontal (parent) annotation of nodes :param factor: Right or left factoring method (default = "right") :type factor: str = [left|right] :param horzMarkov: Markov order for sibling smoothing in artificial nodes (None (default) = include all siblings) :type horzMarkov: int | None :param vertMarkov: Markov order for parent smoothing (0 (default) = no vertical annotation) :type vertMarkov: int | None :param childChar: A string used in construction of the artificial nodes, separating the head of the original subtree from the child nodes that have yet to be expanded (default = "|") :type childChar: str :param parentChar: A string used to separate the node representation from its vertical annotation :type parentChar: str """ from nltk.tree.transforms import chomsky_normal_form chomsky_normal_form(self, factor, horzMarkov, vertMarkov, childChar, parentChar) def un_chomsky_normal_form( self, expandUnary=True, childChar="|", parentChar="^", unaryChar="+" ): """ This method modifies the tree in three ways: 1. Transforms a tree in Chomsky Normal Form back to its original structure (branching greater than two) 2. Removes any parent annotation (if it exists) 3. (optional) expands unary subtrees (if previously collapsed with collapseUnary(...) ) :param expandUnary: Flag to expand unary or not (default = True) :type expandUnary: bool :param childChar: A string separating the head node from its children in an artificial node (default = "|") :type childChar: str :param parentChar: A string separating the node label from its parent annotation (default = "^") :type parentChar: str :param unaryChar: A string joining two non-terminals in a unary production (default = "+") :type unaryChar: str """ from nltk.tree.transforms import un_chomsky_normal_form un_chomsky_normal_form(self, expandUnary, childChar, parentChar, unaryChar) def collapse_unary(self, collapsePOS=False, collapseRoot=False, joinChar="+"): """ Collapse subtrees with a single child (ie. unary productions) into a new non-terminal (Tree node) joined by 'joinChar'. This is useful when working with algorithms that do not allow unary productions, and completely removing the unary productions would require loss of useful information. The Tree is modified directly (since it is passed by reference) and no value is returned. :param collapsePOS: 'False' (default) will not collapse the parent of leaf nodes (ie. Part-of-Speech tags) since they are always unary productions :type collapsePOS: bool :param collapseRoot: 'False' (default) will not modify the root production if it is unary. For the Penn WSJ treebank corpus, this corresponds to the TOP -> productions. :type collapseRoot: bool :param joinChar: A string used to connect collapsed node values (default = "+") :type joinChar: str """ from nltk.tree.transforms import collapse_unary collapse_unary(self, collapsePOS, collapseRoot, joinChar) # //////////////////////////////////////////////////////////// # Convert, copy # //////////////////////////////////////////////////////////// @classmethod def convert(cls, tree): """ Convert a tree between different subtypes of Tree. ``cls`` determines which class will be used to encode the new tree. :type tree: Tree :param tree: The tree that should be converted. :return: The new Tree. 
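        For example (an illustrative conversion; any ``Tree`` subclass can
        serve as ``cls`` in the same way):

            >>> from nltk.tree import ParentedTree
            >>> tree = Tree.fromstring('(S (NP I) (VP (V saw) (NP him)))')
            >>> type(ParentedTree.convert(tree)).__name__
            'ParentedTree'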
""" if isinstance(tree, Tree): children = [cls.convert(child) for child in tree] return cls(tree._label, children) else: return tree def __copy__(self): return self.copy() def __deepcopy__(self, memo): return self.copy(deep=True) def copy(self, deep=False): if not deep: return type(self)(self._label, self) else: return type(self).convert(self) def _frozen_class(self): from nltk.tree.immutable import ImmutableTree return ImmutableTree def freeze(self, leaf_freezer=None): frozen_class = self._frozen_class() if leaf_freezer is None: newcopy = frozen_class.convert(self) else: newcopy = self.copy(deep=True) for pos in newcopy.treepositions("leaves"): newcopy[pos] = leaf_freezer(newcopy[pos]) newcopy = frozen_class.convert(newcopy) hash(newcopy) # Make sure the leaves are hashable. return newcopy # //////////////////////////////////////////////////////////// # Parsing # //////////////////////////////////////////////////////////// @classmethod def fromstring( cls, s, brackets="()", read_node=None, read_leaf=None, node_pattern=None, leaf_pattern=None, remove_empty_top_bracketing=False, ): """ Read a bracketed tree string and return the resulting tree. Trees are represented as nested brackettings, such as:: (S (NP (NNP John)) (VP (V runs))) :type s: str :param s: The string to read :type brackets: str (length=2) :param brackets: The bracket characters used to mark the beginning and end of trees and subtrees. :type read_node: function :type read_leaf: function :param read_node, read_leaf: If specified, these functions are applied to the substrings of ``s`` corresponding to nodes and leaves (respectively) to obtain the values for those nodes and leaves. They should have the following signature: read_node(str) -> value For example, these functions could be used to process nodes and leaves whose values should be some type other than string (such as ``FeatStruct``). Note that by default, node strings and leaf strings are delimited by whitespace and brackets; to override this default, use the ``node_pattern`` and ``leaf_pattern`` arguments. :type node_pattern: str :type leaf_pattern: str :param node_pattern, leaf_pattern: Regular expression patterns used to find node and leaf substrings in ``s``. By default, both nodes patterns are defined to match any sequence of non-whitespace non-bracket characters. :type remove_empty_top_bracketing: bool :param remove_empty_top_bracketing: If the resulting tree has an empty node label, and is length one, then return its single child instead. This is useful for treebank trees, which sometimes contain an extra level of bracketing. :return: A tree corresponding to the string representation ``s``. If this class method is called using a subclass of Tree, then it will return a tree of that type. :rtype: Tree """ if not isinstance(brackets, str) or len(brackets) != 2: raise TypeError("brackets must be a length-2 string") if re.search(r"\s", brackets): raise TypeError("whitespace brackets not allowed") # Construct a regexp that will tokenize the string. open_b, close_b = brackets open_pattern, close_pattern = (re.escape(open_b), re.escape(close_b)) if node_pattern is None: node_pattern = fr"[^\s{open_pattern}{close_pattern}]+" if leaf_pattern is None: leaf_pattern = fr"[^\s{open_pattern}{close_pattern}]+" token_re = re.compile( r"%s\s*(%s)?|%s|(%s)" % (open_pattern, node_pattern, close_pattern, leaf_pattern) ) # Walk through each token, updating a stack of trees. 
stack = [(None, [])] # list of (node, children) tuples for match in token_re.finditer(s): token = match.group() # Beginning of a tree/subtree if token[0] == open_b: if len(stack) == 1 and len(stack[0][1]) > 0: cls._parse_error(s, match, "end-of-string") label = token[1:].lstrip() if read_node is not None: label = read_node(label) stack.append((label, [])) # End of a tree/subtree elif token == close_b: if len(stack) == 1: if len(stack[0][1]) == 0: cls._parse_error(s, match, open_b) else: cls._parse_error(s, match, "end-of-string") label, children = stack.pop() stack[-1][1].append(cls(label, children)) # Leaf node else: if len(stack) == 1: cls._parse_error(s, match, open_b) if read_leaf is not None: token = read_leaf(token) stack[-1][1].append(token) # check that we got exactly one complete tree. if len(stack) > 1: cls._parse_error(s, "end-of-string", close_b) elif len(stack[0][1]) == 0: cls._parse_error(s, "end-of-string", open_b) else: assert stack[0][0] is None assert len(stack[0][1]) == 1 tree = stack[0][1][0] # If the tree has an extra level with node='', then get rid of # it. E.g.: "((S (NP ...) (VP ...)))" if remove_empty_top_bracketing and tree._label == "" and len(tree) == 1: tree = tree[0] # return the tree. return tree @classmethod def _parse_error(cls, s, match, expecting): """ Display a friendly error message when parsing a tree string fails. :param s: The string we're parsing. :param match: regexp match of the problem token. :param expecting: what we expected to see instead. """ # Construct a basic error message if match == "end-of-string": pos, token = len(s), "end-of-string" else: pos, token = match.start(), match.group() msg = "%s.read(): expected %r but got %r\n%sat index %d." % ( cls.__name__, expecting, token, " " * 12, pos, ) # Add a display showing the error token itsels: s = s.replace("\n", " ").replace("\t", " ") offset = pos if len(s) > pos + 10: s = s[: pos + 10] + "..." if pos > 10: s = "..." + s[pos - 10 :] offset = 13 msg += '\n{}"{}"\n{}^'.format(" " * 16, s, " " * (17 + offset)) raise ValueError(msg) @classmethod def fromlist(cls, l): """ :type l: list :param l: a tree represented as nested lists :return: A tree corresponding to the list representation ``l``. :rtype: Tree Convert nested lists to a NLTK Tree """ if type(l) == list and len(l) > 0: label = repr(l[0]) if len(l) > 1: return Tree(label, [cls.fromlist(child) for child in l[1:]]) else: return label # //////////////////////////////////////////////////////////// # Visualization & String Representation # //////////////////////////////////////////////////////////// def draw(self): """ Open a new window containing a graphical diagram of this tree. """ from nltk.draw.tree import draw_trees draw_trees(self) def pretty_print(self, sentence=None, highlight=(), stream=None, **kwargs): """ Pretty-print this tree as ASCII or Unicode art. For explanation of the arguments, see the documentation for `nltk.tree.prettyprinter.TreePrettyPrinter`. 
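# A hedged example of the visualization helpers above; the exact ASCII layout
# depends on the TreePrettyPrinter defaults, so only the calls are shown.
from nltk.tree import Tree

t = Tree.fromstring("(S (NP (DT the) (NN dog)) (VP (VBD barked)))")
t.pretty_print()             # ASCII-art rendering of the tree on stdout
print(t.pformat(margin=20))  # a small margin forces the bracketed form onto several lines
# t.draw()                   # opens a Tkinter window, so it needs a display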
""" from nltk.tree.prettyprinter import TreePrettyPrinter print(TreePrettyPrinter(self, sentence, highlight).text(**kwargs), file=stream) def __repr__(self): childstr = ", ".join(repr(c) for c in self) return "{}({}, [{}])".format( type(self).__name__, repr(self._label), childstr, ) def _repr_svg_(self): from svgling import draw_tree return draw_tree(self)._repr_svg_() def __str__(self): return self.pformat() def pprint(self, **kwargs): """ Print a string representation of this Tree to 'stream' """ if "stream" in kwargs: stream = kwargs["stream"] del kwargs["stream"] else: stream = None print(self.pformat(**kwargs), file=stream) def pformat(self, margin=70, indent=0, nodesep="", parens="()", quotes=False): """ :return: A pretty-printed string representation of this tree. :rtype: str :param margin: The right margin at which to do line-wrapping. :type margin: int :param indent: The indentation level at which printing begins. This number is used to decide how far to indent subsequent lines. :type indent: int :param nodesep: A string that is used to separate the node from the children. E.g., the default value ``':'`` gives trees like ``(S: (NP: I) (VP: (V: saw) (NP: it)))``. """ # Try writing it on one line. s = self._pformat_flat(nodesep, parens, quotes) if len(s) + indent < margin: return s # If it doesn't fit on one line, then write it on multi-lines. if isinstance(self._label, str): s = f"{parens[0]}{self._label}{nodesep}" else: s = f"{parens[0]}{repr(self._label)}{nodesep}" for child in self: if isinstance(child, Tree): s += ( "\n" + " " * (indent + 2) + child.pformat(margin, indent + 2, nodesep, parens, quotes) ) elif isinstance(child, tuple): s += "\n" + " " * (indent + 2) + "/".join(child) elif isinstance(child, str) and not quotes: s += "\n" + " " * (indent + 2) + "%s" % child else: s += "\n" + " " * (indent + 2) + repr(child) return s + parens[1] def pformat_latex_qtree(self): r""" Returns a representation of the tree compatible with the LaTeX qtree package. This consists of the string ``\Tree`` followed by the tree represented in bracketed notation. For example, the following result was generated from a parse tree of the sentence ``The announcement astounded us``:: \Tree [.I'' [.N'' [.D The ] [.N' [.N announcement ] ] ] [.I' [.V'' [.V' [.V astounded ] [.N'' [.N' [.N us ] ] ] ] ] ] ] See https://www.ling.upenn.edu/advice/latex.html for the LaTeX style file for the qtree package. :return: A latex qtree representation of this tree. 
:rtype: str """ reserved_chars = re.compile(r"([#\$%&~_\{\}])") pformat = self.pformat(indent=6, nodesep="", parens=("[.", " ]")) return r"\Tree " + re.sub(reserved_chars, r"\\\1", pformat) def _pformat_flat(self, nodesep, parens, quotes): childstrs = [] for child in self: if isinstance(child, Tree): childstrs.append(child._pformat_flat(nodesep, parens, quotes)) elif isinstance(child, tuple): childstrs.append("/".join(child)) elif isinstance(child, str) and not quotes: childstrs.append("%s" % child) else: childstrs.append(repr(child)) if isinstance(self._label, str): return "{}{}{} {}{}".format( parens[0], self._label, nodesep, " ".join(childstrs), parens[1], ) else: return "{}{}{} {}{}".format( parens[0], repr(self._label), nodesep, " ".join(childstrs), parens[1], ) def _child_names(tree): names = [] for child in tree: if isinstance(child, Tree): names.append(Nonterminal(child._label)) else: names.append(child) return names ###################################################################### ## Demonstration ###################################################################### def demo(): """ A demonstration showing how Trees and Trees can be used. This demonstration creates a Tree, and loads a Tree from the Treebank corpus, and shows the results of calling several of their methods. """ from nltk import ProbabilisticTree, Tree # Demonstrate tree parsing. s = "(S (NP (DT the) (NN cat)) (VP (VBD ate) (NP (DT a) (NN cookie))))" t = Tree.fromstring(s) print("Convert bracketed string into tree:") print(t) print(t.__repr__()) print("Display tree properties:") print(t.label()) # tree's constituent type print(t[0]) # tree's first child print(t[1]) # tree's second child print(t.height()) print(t.leaves()) print(t[1]) print(t[1, 1]) print(t[1, 1, 0]) # Demonstrate tree modification. the_cat = t[0] the_cat.insert(1, Tree.fromstring("(JJ big)")) print("Tree modification:") print(t) t[1, 1, 1] = Tree.fromstring("(NN cake)") print(t) print() # Tree transforms print("Collapse unary:") t.collapse_unary() print(t) print("Chomsky normal form:") t.chomsky_normal_form() print(t) print() # Demonstrate probabilistic trees. pt = ProbabilisticTree("x", ["y", "z"], prob=0.5) print("Probabilistic Tree:") print(pt) print() # Demonstrate parsing of treebank output format. t = Tree.fromstring(t.pformat()) print("Convert tree to bracketed string and back again:") print(t) print() # Demonstrate LaTeX output print("LaTeX output:") print(t.pformat_latex_qtree()) print() # Demonstrate Productions print("Production output:") print(t.productions()) print() # Demonstrate tree nodes containing objects other than strings t.set_label(("test", 3)) print(t) __all__ = [ "Tree", ] nltk-3.7/nltk/treeprettyprinter.py000066400000000000000000000016631420073152400174740ustar00rootroot00000000000000# Natural Language Toolkit: ASCII visualization of NLTK trees # # Copyright (C) 2001-2022 NLTK Project # Author: Andreas van Cranenburgh # Peter Ljunglöf # URL: # For license information, see LICENSE.TXT """ Pretty-printing of discontinuous trees. Adapted from the disco-dop project, by Andreas van Cranenburgh. https://github.com/andreasvc/disco-dop Interesting reference (not used for this code): T. Eschbach et al., Orth. Hypergraph Drawing, Journal of Graph Algorithms and Applications, 10(2) 141--157 (2006)149. 
https://jgaa.info/accepted/2006/EschbachGuentherBecker2006.10.2.pdf """ from nltk.internals import Deprecated from nltk.tree.prettyprinter import TreePrettyPrinter as TPP class TreePrettyPrinter(Deprecated, TPP): """Import `TreePrettyPrinter` using `from nltk.tree import TreePrettyPrinter` instead.""" __all__ = ["TreePrettyPrinter"] nltk-3.7/nltk/treetransforms.py000066400000000000000000000120521420073152400167310ustar00rootroot00000000000000# Natural Language Toolkit: Tree Transformations # # Copyright (C) 2005-2007 Oregon Graduate Institute # Author: Nathan Bodenstab # URL: # For license information, see LICENSE.TXT r""" A collection of methods for tree (grammar) transformations used in parsing natural language. Although many of these methods are technically grammar transformations (ie. Chomsky Normal Form), when working with treebanks it is much more natural to visualize these modifications in a tree structure. Hence, we will do all transformations directly to the tree itself. Transforming the tree directly also allows us to do parent annotation. A grammar can then be simply induced from the modified tree. The following is a short tutorial on the available transformations. 1. Chomsky Normal Form (binarization) It is well known that any grammar has a Chomsky Normal Form (CNF) equivalent grammar where CNF is defined by every production having either two non-terminals or one terminal on its right hand side. When we have hierarchically structured data (ie. a treebank), it is natural to view this in terms of productions where the root of every subtree is the head (left hand side) of the production and all of its children are the right hand side constituents. In order to convert a tree into CNF, we simply need to ensure that every subtree has either two subtrees as children (binarization), or one leaf node (a terminal). In order to binarize a subtree with more than two children, we must introduce artificial nodes. There are two popular methods to convert a tree into CNF: left factoring and right factoring. The following example demonstrates the difference between them. Example:: Original Right-Factored Left-Factored A A A / | \ / \ / \ B C D ==> B A|<C-D> OR A|<B-C> D / \ / \ C D B C 2. Parent Annotation In addition to binarizing the tree, there are two standard modifications to node labels we can do in the same traversal: parent annotation and Markov order-N smoothing (or sibling smoothing). The purpose of parent annotation is to refine the probabilities of productions by adding a small amount of context. With this simple addition, a CYK (inside-outside, dynamic programming chart parse) can improve from 74% to 79% accuracy. A natural generalization from parent annotation is to grandparent annotation and beyond. The tradeoff becomes accuracy gain vs. computational complexity. We must also keep in mind data sparsity issues. Example:: Original Parent Annotation A A^<?> / | \ / \ B C D ==> B^<A> A|<A>^<?> where ? is the / \ parent of A C^<A> D^<A> 3. Markov order-N smoothing Markov smoothing combats data sparsity issues as well as decreasing computational requirements by limiting the number of children included in artificial nodes. In practice, most people use an order 2 grammar. Example:: Original No Smoothing Markov order 1 Markov order 2 etc. __A__ A A A / /|\ \ / \ / \ / \ B C D E F ==> B A|<C-D-E-F> ==> B A|<C> ==> B A|<C-D> / \ / \ / \ C ... C ... C ... Annotation decisions can be thought about in the vertical direction (parent, grandparent, etc) and the horizontal direction (number of siblings to keep). 
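# A small sketch of the annotations described in this tutorial; the label
# shapes in the comments are indicative of what the parameters below produce.
from nltk.tree import Tree

t = Tree.fromstring("(S (NP (DT the) (JJ big) (NN dog)) (VP (VBD barked)))")
t.chomsky_normal_form(horzMarkov=1, vertMarkov=1)
print(t)
# With horzMarkov=1 the artificial NP node remembers only one sibling,
# e.g. NP|<JJ>, and with vertMarkov=1 labels carry a parent annotation
# such as NP^<S>.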
Parameters to the following functions specify these values. For more information see: Dan Klein and Chris Manning (2003) "Accurate Unlexicalized Parsing", ACL-03. https://www.aclweb.org/anthology/P03-1054 4. Unary Collapsing Collapse unary productions (ie. subtrees with a single child) into a new non-terminal (Tree node). This is useful when working with algorithms that do not allow unary productions, yet you do not wish to lose the parent information. Example:: A | B ==> A+B / \ / \ C D C D """ from nltk.internals import deprecated from nltk.tree.transforms import chomsky_normal_form as cnf from nltk.tree.transforms import collapse_unary as cu from nltk.tree.transforms import un_chomsky_normal_form as ucnf chomsky_normal_form = deprecated( "Import using `from nltk.tree import chomsky_normal_form` instead." )(cnf) un_chomsky_normal_form = deprecated( "Import using `from nltk.tree import un_chomsky_normal_form` instead." )(ucnf) collapse_unary = deprecated( "Import using `from nltk.tree import collapse_unary` instead." )(cu) __all__ = ["chomsky_normal_form", "un_chomsky_normal_form", "collapse_unary"] nltk-3.7/nltk/twitter/000077500000000000000000000000001420073152400150035ustar00rootroot00000000000000nltk-3.7/nltk/twitter/__init__.py000066400000000000000000000014201420073152400171110ustar00rootroot00000000000000# Natural Language Toolkit: Twitter # # Copyright (C) 2001-2022 NLTK Project # Author: Ewan Klein # URL: # For license information, see LICENSE.TXT """ NLTK Twitter Package This package contains classes for retrieving Tweet documents using the Twitter API. """ try: import twython except ImportError: import warnings warnings.warn( "The twython library has not been installed. " "Some functionality from the twitter package will not be available." ) else: from nltk.twitter.util import Authenticate, credsfromfile from nltk.twitter.twitterclient import ( Streamer, Query, Twitter, TweetViewer, TweetWriter, ) from nltk.twitter.common import json2csv nltk-3.7/nltk/twitter/api.py000066400000000000000000000107031420073152400161270ustar00rootroot00000000000000# Natural Language Toolkit: Twitter API # # Copyright (C) 2001-2022 NLTK Project # Author: Ewan Klein # Lorenzo Rubio # URL: # For license information, see LICENSE.TXT """ This module provides an interface for TweetHandlers, and support for timezone handling. """ import time as _time from abc import ABCMeta, abstractmethod from datetime import datetime, timedelta, timezone, tzinfo class LocalTimezoneOffsetWithUTC(tzinfo): """ This is not intended to be a general purpose class for dealing with the local timezone. In particular: * it assumes that the date passed has been created using `datetime(..., tzinfo=Local)`, where `Local` is an instance of the object `LocalTimezoneOffsetWithUTC`; * for such an object, it returns the offset with UTC, used for date comparisons. Reference: https://docs.python.org/3/library/datetime.html """ STDOFFSET = timedelta(seconds=-_time.timezone) if _time.daylight: DSTOFFSET = timedelta(seconds=-_time.altzone) else: DSTOFFSET = STDOFFSET def utcoffset(self, dt): """ Access the relevant time offset. """ return self.DSTOFFSET LOCAL = LocalTimezoneOffsetWithUTC() class BasicTweetHandler(metaclass=ABCMeta): """ Minimal implementation of `TweetHandler`. Counts the number of Tweets and decides when the client should stop fetching them. 
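# An offline sketch of how the counting logic above behaves; no Twitter access
# is involved, and the "tweets" are plain dictionaries made up here.
from nltk.twitter.api import BasicTweetHandler

handler = BasicTweetHandler(limit=3)
for fake_tweet in ({"text": "a"}, {"text": "b"}, {"text": "c"}, {"text": "d"}):
    if not handler.do_continue():
        break                 # stop once the limit of 3 items is reached
    handler.counter += 1      # clients such as Streamer increment this counter
print(handler.counter)        # 3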
""" def __init__(self, limit=20): self.limit = limit self.counter = 0 """ A flag to indicate to the client whether to stop fetching data given some condition (e.g., reaching a date limit). """ self.do_stop = False """ Stores the id of the last fetched Tweet to handle pagination. """ self.max_id = None def do_continue(self): """ Returns `False` if the client should stop fetching Tweets. """ return self.counter < self.limit and not self.do_stop class TweetHandlerI(BasicTweetHandler): """ Interface class whose subclasses should implement a handle method that Twitter clients can delegate to. """ def __init__(self, limit=20, upper_date_limit=None, lower_date_limit=None): """ :param int limit: The number of data items to process in the current\ round of processing. :param tuple upper_date_limit: The date at which to stop collecting\ new data. This should be entered as a tuple which can serve as the\ argument to `datetime.datetime`.\ E.g. `date_limit=(2015, 4, 1, 12, 40)` for 12:30 pm on April 1 2015. :param tuple lower_date_limit: The date at which to stop collecting\ new data. See `upper_data_limit` for formatting. """ BasicTweetHandler.__init__(self, limit) self.upper_date_limit = None self.lower_date_limit = None if upper_date_limit: self.upper_date_limit = datetime(*upper_date_limit, tzinfo=LOCAL) if lower_date_limit: self.lower_date_limit = datetime(*lower_date_limit, tzinfo=LOCAL) self.startingup = True @abstractmethod def handle(self, data): """ Deal appropriately with data returned by the Twitter API """ @abstractmethod def on_finish(self): """ Actions when the tweet limit has been reached """ def check_date_limit(self, data, verbose=False): """ Validate date limits. """ if self.upper_date_limit or self.lower_date_limit: date_fmt = "%a %b %d %H:%M:%S +0000 %Y" tweet_date = datetime.strptime(data["created_at"], date_fmt).replace( tzinfo=timezone.utc ) if (self.upper_date_limit and tweet_date > self.upper_date_limit) or ( self.lower_date_limit and tweet_date < self.lower_date_limit ): if self.upper_date_limit: message = "earlier" date_limit = self.upper_date_limit else: message = "later" date_limit = self.lower_date_limit if verbose: print( "Date limit {} is {} than date of current tweet {}".format( date_limit, message, tweet_date ) ) self.do_stop = True nltk-3.7/nltk/twitter/common.py000066400000000000000000000231721420073152400166520ustar00rootroot00000000000000# Natural Language Toolkit: Twitter client # # Copyright (C) 2001-2022 NLTK Project # Author: Ewan Klein # Lorenzo Rubio # URL: # For license information, see LICENSE.TXT """ Utility functions for the `twitterclient` module which do not require the `twython` library to have been installed. """ import csv import gzip import json from nltk.internals import deprecated HIER_SEPARATOR = "." def extract_fields(tweet, fields): """ Extract field values from a full tweet and return them as a list :param json tweet: The tweet in JSON format :param list fields: The fields to be extracted from the tweet :rtype: list(str) """ out = [] for field in fields: try: _add_field_to_out(tweet, field, out) except TypeError as e: raise RuntimeError( "Fatal error when extracting fields. 
Cannot find field ", field ) from e return out def _add_field_to_out(json, field, out): if _is_composed_key(field): key, value = _get_key_value_composed(field) _add_field_to_out(json[key], value, out) else: out += [json[field]] def _is_composed_key(field): return HIER_SEPARATOR in field def _get_key_value_composed(field): out = field.split(HIER_SEPARATOR) # there could be up to 3 levels key = out[0] value = HIER_SEPARATOR.join(out[1:]) return key, value def _get_entity_recursive(json, entity): if not json: return None elif isinstance(json, dict): for key, value in json.items(): if key == entity: return value # 'entities' and 'extended_entities' are wrappers in Twitter json # structure that contain other Twitter objects. See: # https://dev.twitter.com/overview/api/entities-in-twitter-objects if key == "entities" or key == "extended_entities": candidate = _get_entity_recursive(value, entity) if candidate is not None: return candidate return None elif isinstance(json, list): for item in json: candidate = _get_entity_recursive(item, entity) if candidate is not None: return candidate return None else: return None def json2csv( fp, outfile, fields, encoding="utf8", errors="replace", gzip_compress=False ): """ Extract selected fields from a file of line-separated JSON tweets and write to a file in CSV format. This utility function allows a file of full tweets to be easily converted to a CSV file for easier processing. For example, just TweetIDs or just the text content of the Tweets can be extracted. Additionally, the function allows combinations of fields of other Twitter objects (mainly the users, see below). For Twitter entities (e.g. hashtags of a Tweet), and for geolocation, see `json2csv_entities` :param str infile: The name of the file containing full tweets :param str outfile: The name of the text file where results should be\ written :param list fields: The list of fields to be extracted. Useful examples\ are 'id_str' for the tweetID and 'text' for the text of the tweet. See\ for a full list of fields.\ e. g.: ['id_str'], ['id', 'text', 'favorite_count', 'retweet_count']\ Additionally, it allows IDs from other Twitter objects, e. g.,\ ['id', 'text', 'user.id', 'user.followers_count', 'user.friends_count'] :param error: Behaviour for encoding errors, see\ https://docs.python.org/3/library/codecs.html#codec-base-classes :param gzip_compress: if `True`, output files are compressed with gzip """ (writer, outf) = _outf_writer(outfile, encoding, errors, gzip_compress) # write the list of fields as header writer.writerow(fields) # process the file for line in fp: tweet = json.loads(line) row = extract_fields(tweet, fields) writer.writerow(row) outf.close() @deprecated("Use open() and csv.writer() directly instead.") def outf_writer_compat(outfile, encoding, errors, gzip_compress=False): """Get a CSV writer with optional compression.""" return _outf_writer(outfile, encoding, errors, gzip_compress) def _outf_writer(outfile, encoding, errors, gzip_compress=False): if gzip_compress: outf = gzip.open(outfile, "wt", newline="", encoding=encoding, errors=errors) else: outf = open(outfile, "w", newline="", encoding=encoding, errors=errors) writer = csv.writer(outf) return (writer, outf) def json2csv_entities( tweets_file, outfile, main_fields, entity_type, entity_fields, encoding="utf8", errors="replace", gzip_compress=False, ): """ Extract selected fields from a file of line-separated JSON tweets and write to a file in CSV format. 
This utility function allows a file of full Tweets to be easily converted to a CSV file for easier processing of Twitter entities. For example, the hashtags or media elements of a tweet can be extracted. It returns one line per entity of a Tweet, e.g. if a tweet has two hashtags there will be two lines in the output file, one per hashtag :param tweets_file: the file-like object containing full Tweets :param str outfile: The path of the text file where results should be\ written :param list main_fields: The list of fields to be extracted from the main\ object, usually the tweet. Useful examples: 'id_str' for the tweetID. See\ for a full list of fields. e. g.: ['id_str'], ['id', 'text', 'favorite_count', 'retweet_count'] If `entity_type` is expressed with hierarchy, then it is the list of\ fields of the object that corresponds to the key of the entity_type,\ (e.g., for entity_type='user.urls', the fields in the main_fields list\ belong to the user object; for entity_type='place.bounding_box', the\ files in the main_field list belong to the place object of the tweet). :param list entity_type: The name of the entity: 'hashtags', 'media',\ 'urls' and 'user_mentions' for the tweet object. For a user object,\ this needs to be expressed with a hierarchy: `'user.urls'`. For the\ bounding box of the Tweet location, use `'place.bounding_box'`. :param list entity_fields: The list of fields to be extracted from the\ entity. E.g. `['text']` (of the Tweet) :param error: Behaviour for encoding errors, see\ https://docs.python.org/3/library/codecs.html#codec-base-classes :param gzip_compress: if `True`, output files are compressed with gzip """ (writer, outf) = _outf_writer(outfile, encoding, errors, gzip_compress) header = get_header_field_list(main_fields, entity_type, entity_fields) writer.writerow(header) for line in tweets_file: tweet = json.loads(line) if _is_composed_key(entity_type): key, value = _get_key_value_composed(entity_type) object_json = _get_entity_recursive(tweet, key) if not object_json: # this can happen in the case of "place" continue object_fields = extract_fields(object_json, main_fields) items = _get_entity_recursive(object_json, value) _write_to_file(object_fields, items, entity_fields, writer) else: tweet_fields = extract_fields(tweet, main_fields) items = _get_entity_recursive(tweet, entity_type) _write_to_file(tweet_fields, items, entity_fields, writer) outf.close() def get_header_field_list(main_fields, entity_type, entity_fields): if _is_composed_key(entity_type): key, value = _get_key_value_composed(entity_type) main_entity = key sub_entity = value else: main_entity = None sub_entity = entity_type if main_entity: output1 = [HIER_SEPARATOR.join([main_entity, x]) for x in main_fields] else: output1 = main_fields output2 = [HIER_SEPARATOR.join([sub_entity, x]) for x in entity_fields] return output1 + output2 def _write_to_file(object_fields, items, entity_fields, writer): if not items: # it could be that the entity is just not present for the tweet # e.g. tweet hashtag is always present, even as [], however # tweet media may not be present return if isinstance(items, dict): # this happens e.g. 
for "place" of a tweet row = object_fields # there might be composed keys in de list of required fields entity_field_values = [x for x in entity_fields if not _is_composed_key(x)] entity_field_composed = [x for x in entity_fields if _is_composed_key(x)] for field in entity_field_values: value = items[field] if isinstance(value, list): row += value else: row += [value] # now check required dictionaries for d in entity_field_composed: kd, vd = _get_key_value_composed(d) json_dict = items[kd] if not isinstance(json_dict, dict): raise RuntimeError( """Key {} does not contain a dictionary in the json file""".format( kd ) ) row += [json_dict[vd]] writer.writerow(row) return # in general it is a list for item in items: row = object_fields + extract_fields(item, entity_fields) writer.writerow(row) nltk-3.7/nltk/twitter/twitter_demo.py000066400000000000000000000175031420073152400200710ustar00rootroot00000000000000# Natural Language Toolkit: Twitter client # # Copyright (C) 2001-2022 NLTK Project # Author: Ewan Klein # Lorenzo Rubio # URL: # For license information, see LICENSE.TXT """ Examples to demo the :py:mod:`twitterclient` code. These demo functions should all run, with the following caveats: * You must have obtained API keys from Twitter, and installed them according to the instructions in the `twitter HOWTO `_. * If you are on a slow network, some of the calls to the Twitter API may timeout. * If you are being rate limited while searching, you will receive a 420 error response. * Your terminal window / console must be able to display UTF-8 encoded characters. For documentation about the Twitter APIs, see `The Streaming APIs Overview `_ and `The REST APIs Overview `_. For error codes see Twitter's `Error Codes and Responses ` """ import datetime import json from functools import wraps from io import StringIO from nltk.twitter import ( Query, Streamer, TweetViewer, TweetWriter, Twitter, credsfromfile, ) SPACER = "###################################" def verbose(func): """Decorator for demo functions""" @wraps(func) def with_formatting(*args, **kwargs): print() print(SPACER) print("Using %s" % (func.__name__)) print(SPACER) return func(*args, **kwargs) return with_formatting def yesterday(): """ Get yesterday's datetime as a 5-tuple. """ date = datetime.datetime.now() date -= datetime.timedelta(days=1) date_tuple = date.timetuple()[:6] return date_tuple def setup(): """ Initialize global variables for the demos. """ global USERIDS, FIELDS USERIDS = ["759251", "612473", "15108702", "6017542", "2673523800"] # UserIDs corresponding to\ # @CNN, @BBCNews, @ReutersLive, @BreakingNews, @AJELive FIELDS = ["id_str"] @verbose def twitterclass_demo(): """ Use the simplified :class:`Twitter` class to write some tweets to a file. """ tw = Twitter() print("Track from the public stream\n") tw.tweets(keywords="love, hate", limit=10) # public stream print(SPACER) print("Search past Tweets\n") tw = Twitter() tw.tweets(keywords="love, hate", stream=False, limit=10) # search past tweets print(SPACER) print( "Follow two accounts in the public stream" + " -- be prepared to wait a few minutes\n" ) tw = Twitter() tw.tweets(follow=["759251", "6017542"], stream=True, limit=5) # public stream @verbose def sampletoscreen_demo(limit=20): """ Sample from the Streaming API and send output to terminal. 
""" oauth = credsfromfile() client = Streamer(**oauth) client.register(TweetViewer(limit=limit)) client.sample() @verbose def tracktoscreen_demo(track="taylor swift", limit=10): """ Track keywords from the public Streaming API and send output to terminal. """ oauth = credsfromfile() client = Streamer(**oauth) client.register(TweetViewer(limit=limit)) client.filter(track=track) @verbose def search_demo(keywords="nltk"): """ Use the REST API to search for past tweets containing a given keyword. """ oauth = credsfromfile() client = Query(**oauth) for tweet in client.search_tweets(keywords=keywords, limit=10): print(tweet["text"]) @verbose def tweets_by_user_demo(user="NLTK_org", count=200): """ Use the REST API to search for past tweets by a given user. """ oauth = credsfromfile() client = Query(**oauth) client.register(TweetWriter()) client.user_tweets(user, count) @verbose def lookup_by_userid_demo(): """ Use the REST API to convert a userID to a screen name. """ oauth = credsfromfile() client = Query(**oauth) user_info = client.user_info_from_id(USERIDS) for info in user_info: name = info["screen_name"] followers = info["followers_count"] following = info["friends_count"] print(f"{name}, followers: {followers}, following: {following}") @verbose def followtoscreen_demo(limit=10): """ Using the Streaming API, select just the tweets from a specified list of userIDs. This is will only give results in a reasonable time if the users in question produce a high volume of tweets, and may even so show some delay. """ oauth = credsfromfile() client = Streamer(**oauth) client.register(TweetViewer(limit=limit)) client.statuses.filter(follow=USERIDS) @verbose def streamtofile_demo(limit=20): """ Write 20 tweets sampled from the public Streaming API to a file. """ oauth = credsfromfile() client = Streamer(**oauth) client.register(TweetWriter(limit=limit, repeat=False)) client.statuses.sample() @verbose def limit_by_time_demo(keywords="nltk"): """ Query the REST API for Tweets about NLTK since yesterday and send the output to terminal. This example makes the assumption that there are sufficient Tweets since yesterday for the date to be an effective cut-off. """ date = yesterday() dt_date = datetime.datetime(*date) oauth = credsfromfile() client = Query(**oauth) client.register(TweetViewer(limit=100, lower_date_limit=date)) print(f"Cutoff date: {dt_date}\n") for tweet in client.search_tweets(keywords=keywords): print("{} ".format(tweet["created_at"]), end="") client.handler.handle(tweet) @verbose def corpusreader_demo(): """ Use `TwitterCorpusReader` tp read a file of tweets, and print out * some full tweets in JSON format; * some raw strings from the tweets (i.e., the value of the `text` field); and * the result of tokenising the raw strings. """ from nltk.corpus import twitter_samples as tweets print() print("Complete tweet documents") print(SPACER) for tweet in tweets.docs("tweets.20150430-223406.json")[:1]: print(json.dumps(tweet, indent=1, sort_keys=True)) print() print("Raw tweet strings:") print(SPACER) for text in tweets.strings("tweets.20150430-223406.json")[:15]: print(text) print() print("Tokenized tweet strings:") print(SPACER) for toks in tweets.tokenized("tweets.20150430-223406.json")[:15]: print(toks) @verbose def expand_tweetids_demo(): """ Given a file object containing a list of Tweet IDs, fetch the corresponding full Tweets, if available. 
""" ids_f = StringIO( """\ 588665495492124672 588665495487909888 588665495508766721 588665495513006080 588665495517200384 588665495487811584 588665495525588992 588665495487844352 588665495492014081 588665495512948737""" ) oauth = credsfromfile() client = Query(**oauth) hydrated = client.expand_tweetids(ids_f) for tweet in hydrated: id_str = tweet["id_str"] print(f"id: {id_str}") text = tweet["text"] if text.startswith("@null"): text = "[Tweet not available]" print(text + "\n") ALL = [ twitterclass_demo, sampletoscreen_demo, tracktoscreen_demo, search_demo, tweets_by_user_demo, lookup_by_userid_demo, followtoscreen_demo, streamtofile_demo, limit_by_time_demo, corpusreader_demo, expand_tweetids_demo, ] """ Select demo functions to run. E.g. replace the following line with "DEMOS = ALL[8:]" to execute only the final three demos. """ DEMOS = ALL[:] if __name__ == "__main__": setup() for demo in DEMOS: demo() print("\n" + SPACER) print("All demos completed") print(SPACER) nltk-3.7/nltk/twitter/twitterclient.py000066400000000000000000000456431420073152400202720ustar00rootroot00000000000000# Natural Language Toolkit: Twitter client # # Copyright (C) 2001-2022 NLTK Project # Author: Ewan Klein # Lorenzo Rubio # URL: # For license information, see LICENSE.TXT """ NLTK Twitter client This module offers methods for collecting and processing Tweets. Most of the functionality depends on access to the Twitter APIs, and this is handled via the third party Twython library. If one of the methods below returns an integer, it is probably a `Twitter error code `_. For example, the response of '420' means that you have reached the limit of the requests you can currently make to the Twitter API. Currently, `rate limits for the search API `_ are divided into 15 minute windows. """ import datetime import gzip import itertools import json import os import time import requests from twython import Twython, TwythonStreamer from twython.exceptions import TwythonError, TwythonRateLimitError from nltk.twitter.api import BasicTweetHandler, TweetHandlerI from nltk.twitter.util import credsfromfile, guess_path class Streamer(TwythonStreamer): """ Retrieve data from the Twitter Streaming API. The streaming API requires `OAuth 1.0 `_ authentication. """ def __init__(self, app_key, app_secret, oauth_token, oauth_token_secret): self.handler = None self.do_continue = True TwythonStreamer.__init__( self, app_key, app_secret, oauth_token, oauth_token_secret ) def register(self, handler): """ Register a method for handling Tweets. :param TweetHandlerI handler: method for viewing """ self.handler = handler def on_success(self, data): """ :param data: response from Twitter API """ if self.do_continue: if self.handler is not None: if "text" in data: self.handler.counter += 1 self.handler.handle(data) self.do_continue = self.handler.do_continue() else: raise ValueError("No data handler has been registered.") else: self.disconnect() self.handler.on_finish() def on_error(self, status_code, data): """ :param status_code: The status code returned by the Twitter API :param data: The response from Twitter API """ print(status_code) def sample(self): """ Wrapper for 'statuses / sample' API call """ while self.do_continue: # Stream in an endless loop until limit is reached. 
See twython # issue 288: https://github.com/ryanmcgrath/twython/issues/288 # colditzjb commented on 9 Dec 2014 try: self.statuses.sample() except requests.exceptions.ChunkedEncodingError as e: if e is not None: print(f"Error (stream will continue): {e}") continue def filter(self, track="", follow="", lang="en"): """ Wrapper for 'statuses / filter' API call """ while self.do_continue: # Stream in an endless loop until limit is reached try: if track == "" and follow == "": msg = "Please supply a value for 'track', 'follow'" raise ValueError(msg) self.statuses.filter(track=track, follow=follow, lang=lang) except requests.exceptions.ChunkedEncodingError as e: if e is not None: print(f"Error (stream will continue): {e}") continue class Query(Twython): """ Retrieve data from the Twitter REST API. """ def __init__(self, app_key, app_secret, oauth_token, oauth_token_secret): """ :param app_key: (optional) Your applications key :param app_secret: (optional) Your applications secret key :param oauth_token: (optional) When using **OAuth 1**, combined with oauth_token_secret to make authenticated calls :param oauth_token_secret: (optional) When using **OAuth 1** combined with oauth_token to make authenticated calls """ self.handler = None self.do_continue = True Twython.__init__(self, app_key, app_secret, oauth_token, oauth_token_secret) def register(self, handler): """ Register a method for handling Tweets. :param TweetHandlerI handler: method for viewing or writing Tweets to a file. """ self.handler = handler def expand_tweetids(self, ids_f, verbose=True): """ Given a file object containing a list of Tweet IDs, fetch the corresponding full Tweets from the Twitter API. The API call `statuses/lookup` will fail to retrieve a Tweet if the user has deleted it. This call to the Twitter API is rate-limited. See for details. :param ids_f: input file object consisting of Tweet IDs, one to a line :return: iterable of Tweet objects in JSON format """ ids = [line.strip() for line in ids_f if line] if verbose: print(f"Counted {len(ids)} Tweet IDs in {ids_f}.") # The Twitter endpoint takes lists of up to 100 ids, so we chunk the # ids. id_chunks = [ids[i : i + 100] for i in range(0, len(ids), 100)] chunked_tweets = (self.lookup_status(id=chunk) for chunk in id_chunks) return itertools.chain.from_iterable(chunked_tweets) def _search_tweets(self, keywords, limit=100, lang="en"): """ Assumes that the handler has been informed. Fetches Tweets from search_tweets generator output and passses them to handler :param str keywords: A list of query terms to search for, written as\ a comma-separated string. :param int limit: Number of Tweets to process :param str lang: language """ while True: tweets = self.search_tweets( keywords=keywords, limit=limit, lang=lang, max_id=self.handler.max_id ) for tweet in tweets: self.handler.handle(tweet) if not (self.handler.do_continue() and self.handler.repeat): break self.handler.on_finish() def search_tweets( self, keywords, limit=100, lang="en", max_id=None, retries_after_twython_exception=0, ): """ Call the REST API ``'search/tweets'`` endpoint with some plausible defaults. See `the Twitter search documentation `_ for more information about admissible search parameters. 
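# A hedged sketch of driving the Query client directly; it assumes the twython
# library is installed, that credsfromfile() can find valid credentials, and
# that network access is available, so it is not run here.
from nltk.twitter import Query, credsfromfile

oauth = credsfromfile()
client = Query(**oauth)
for tweet in client.search_tweets(keywords="nltk", limit=10):
    print(tweet["text"])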
:param str keywords: A list of query terms to search for, written as\ a comma-separated string :param int limit: Number of Tweets to process :param str lang: language :param int max_id: id of the last tweet fetched :param int retries_after_twython_exception: number of retries when\ searching Tweets before raising an exception :rtype: python generator """ if not self.handler: # if no handler is provided, `BasicTweetHandler` provides minimum # functionality for limiting the number of Tweets retrieved self.handler = BasicTweetHandler(limit=limit) count_from_query = 0 if max_id: self.handler.max_id = max_id else: results = self.search( q=keywords, count=min(100, limit), lang=lang, result_type="recent" ) count = len(results["statuses"]) if count == 0: print("No Tweets available through REST API for those keywords") return count_from_query = count self.handler.max_id = results["statuses"][count - 1]["id"] - 1 for result in results["statuses"]: yield result self.handler.counter += 1 if self.handler.do_continue() == False: return # Pagination loop: keep fetching Tweets until the desired count is # reached while dealing with Twitter rate limits. retries = 0 while count_from_query < limit: try: mcount = min(100, limit - count_from_query) results = self.search( q=keywords, count=mcount, lang=lang, max_id=self.handler.max_id, result_type="recent", ) except TwythonRateLimitError as e: print(f"Waiting for 15 minutes -{e}") time.sleep(15 * 60) # wait 15 minutes continue except TwythonError as e: print(f"Fatal error in Twython request -{e}") if retries_after_twython_exception == retries: raise e retries += 1 count = len(results["statuses"]) if count == 0: print("No more Tweets available through rest api") return count_from_query += count # the max_id is also present in the Tweet metadata # results['search_metadata']['next_results'], but as part of a # query and difficult to fetch. This is doing the equivalent # (last tweet id minus one) self.handler.max_id = results["statuses"][count - 1]["id"] - 1 for result in results["statuses"]: yield result self.handler.counter += 1 if self.handler.do_continue() == False: return def user_info_from_id(self, userids): """ Convert a list of userIDs into a variety of information about the users. See . :param list userids: A list of integer strings corresponding to Twitter userIDs :rtype: list(json) """ return [self.show_user(user_id=userid) for userid in userids] def user_tweets(self, screen_name, limit, include_rts="false"): """ Return a collection of the most recent Tweets posted by the user :param str user: The user's screen name; the initial '@' symbol\ should be omitted :param int limit: The number of Tweets to recover; 200 is the maximum allowed :param str include_rts: Whether to include statuses which have been\ retweeted by the user; possible values are 'true' and 'false' """ data = self.get_user_timeline( screen_name=screen_name, count=limit, include_rts=include_rts ) for item in data: self.handler.handle(item) class Twitter: """ Wrapper class with restricted functionality and fewer options. """ def __init__(self): self._oauth = credsfromfile() self.streamer = Streamer(**self._oauth) self.query = Query(**self._oauth) def tweets( self, keywords="", follow="", to_screen=True, stream=True, limit=100, date_limit=None, lang="en", repeat=False, gzip_compress=False, ): """ Process some Tweets in a simple manner. 
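# A minimal sketch of the simplified wrapper described above; like the other
# client examples it assumes credentials and network access are available.
from nltk.twitter import Twitter

tw = Twitter()
tw.tweets(keywords="love, hate", stream=False, limit=10)              # search past Tweets
tw.tweets(follow=["759251"], stream=True, limit=5, to_screen=False)   # write live Tweets to a file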
:param str keywords: Keywords to use for searching or filtering :param list follow: UserIDs to use for filtering Tweets from the public stream :param bool to_screen: If `True`, display the tweet texts on the screen,\ otherwise print to a file :param bool stream: If `True`, use the live public stream,\ otherwise search past public Tweets :param int limit: The number of data items to process in the current\ round of processing. :param tuple date_limit: The date at which to stop collecting\ new data. This should be entered as a tuple which can serve as the\ argument to `datetime.datetime`.\ E.g. `date_limit=(2015, 4, 1, 12, 40)` for 12:40 pm on April 1 2015. Note that, in the case of streaming, this is the maximum date, i.e.\ a date in the future; if not, it is the minimum date, i.e. a date\ in the past :param str lang: language :param bool repeat: A flag to determine whether multiple files should\ be written. If `True`, the length of each file will be set by the\ value of `limit`. Use only if `to_screen` is `False`. See also :py:func:`handle`. :param gzip_compress: if `True`, output files are compressed with gzip. """ if stream: upper_date_limit = date_limit lower_date_limit = None else: upper_date_limit = None lower_date_limit = date_limit if to_screen: handler = TweetViewer( limit=limit, upper_date_limit=upper_date_limit, lower_date_limit=lower_date_limit, ) else: handler = TweetWriter( limit=limit, upper_date_limit=upper_date_limit, lower_date_limit=lower_date_limit, repeat=repeat, gzip_compress=gzip_compress, ) if stream: self.streamer.register(handler) if keywords == "" and follow == "": self.streamer.sample() else: self.streamer.filter(track=keywords, follow=follow, lang=lang) else: self.query.register(handler) if keywords == "": raise ValueError("Please supply at least one keyword to search for.") else: self.query._search_tweets(keywords, limit=limit, lang=lang) class TweetViewer(TweetHandlerI): """ Handle data by sending it to the terminal. """ def handle(self, data): """ Direct data to `sys.stdout` :return: return ``False`` if processing should cease, otherwise return ``True``. :rtype: bool :param data: Tweet object returned by Twitter API """ text = data["text"] print(text) self.check_date_limit(data) if self.do_stop: return def on_finish(self): print(f"Written {self.counter} Tweets") class TweetWriter(TweetHandlerI): """ Handle data by writing it to a file. """ def __init__( self, limit=2000, upper_date_limit=None, lower_date_limit=None, fprefix="tweets", subdir="twitter-files", repeat=False, gzip_compress=False, ): """ The difference between the upper and lower date limits depends on whether Tweets are coming in an ascending date order (i.e. when streaming) or descending date order (i.e. when searching past Tweets). :param int limit: number of data items to process in the current\ round of processing. :param tuple upper_date_limit: The date at which to stop collecting new\ data. This should be entered as a tuple which can serve as the\ argument to `datetime.datetime`. E.g. `upper_date_limit=(2015, 4, 1, 12,\ 40)` for 12:40 pm on April 1 2015. :param tuple lower_date_limit: The date at which to stop collecting new\ data. 
See `upper_data_limit` for formatting. :param str fprefix: The prefix to use in creating file names for Tweet\ collections. :param str subdir: The name of the directory where Tweet collection\ files should be stored. :param bool repeat: flag to determine whether multiple files should be\ written. If `True`, the length of each file will be set by the value\ of `limit`. See also :py:func:`handle`. :param gzip_compress: if `True`, output files are compressed with gzip. """ self.fprefix = fprefix self.subdir = guess_path(subdir) self.gzip_compress = gzip_compress self.fname = self.timestamped_file() self.repeat = repeat self.output = None TweetHandlerI.__init__(self, limit, upper_date_limit, lower_date_limit) def timestamped_file(self): """ :return: timestamped file name :rtype: str """ subdir = self.subdir fprefix = self.fprefix if subdir: if not os.path.exists(subdir): os.mkdir(subdir) fname = os.path.join(subdir, fprefix) fmt = "%Y%m%d-%H%M%S" timestamp = datetime.datetime.now().strftime(fmt) if self.gzip_compress: suffix = ".gz" else: suffix = "" outfile = f"{fname}.{timestamp}.json{suffix}" return outfile def handle(self, data): """ Write Twitter data as line-delimited JSON into one or more files. :return: return `False` if processing should cease, otherwise return `True`. :param data: tweet object returned by Twitter API """ if self.startingup: if self.gzip_compress: self.output = gzip.open(self.fname, "w") else: self.output = open(self.fname, "w") print(f"Writing to {self.fname}") json_data = json.dumps(data) if self.gzip_compress: self.output.write((json_data + "\n").encode("utf-8")) else: self.output.write(json_data + "\n") self.check_date_limit(data) if self.do_stop: return self.startingup = False def on_finish(self): print(f"Written {self.counter} Tweets") if self.output: self.output.close() def do_continue(self): if self.repeat == False: return TweetHandlerI.do_continue(self) if self.do_stop: # stop for a functional cause (e.g. date limit) return False if self.counter == self.limit: # repeat is True, thus close output file and # create a new one self._restart_file() return True def _restart_file(self): self.on_finish() self.fname = self.timestamped_file() self.startingup = True self.counter = 0 nltk-3.7/nltk/twitter/util.py000066400000000000000000000104571420073152400163410ustar00rootroot00000000000000# Natural Language Toolkit: Twitter client # # Copyright (C) 2001-2022 NLTK Project # Author: Ewan Klein # Lorenzo Rubio # URL: # For license information, see LICENSE.TXT """ Authentication utilities to accompany `twitterclient`. """ import os import pprint from twython import Twython def credsfromfile(creds_file=None, subdir=None, verbose=False): """ Convenience function for authentication """ return Authenticate().load_creds( creds_file=creds_file, subdir=subdir, verbose=verbose ) class Authenticate: """ Methods for authenticating with Twitter. """ def __init__(self): self.creds_file = "credentials.txt" self.creds_fullpath = None self.oauth = {} try: self.twitter_dir = os.environ["TWITTER"] self.creds_subdir = self.twitter_dir except KeyError: self.twitter_dir = None self.creds_subdir = None def load_creds(self, creds_file=None, subdir=None, verbose=False): """ Read OAuth credentials from a text file. File format for OAuth 1:: app_key=YOUR_APP_KEY app_secret=YOUR_APP_SECRET oauth_token=OAUTH_TOKEN oauth_token_secret=OAUTH_TOKEN_SECRET File format for OAuth 2:: app_key=YOUR_APP_KEY app_secret=YOUR_APP_SECRET access_token=ACCESS_TOKEN :param str file_name: File containing credentials. 
``None`` (default) reads data from `TWITTER/'credentials.txt'` """ if creds_file is not None: self.creds_file = creds_file if subdir is None: if self.creds_subdir is None: msg = ( "Supply a value to the 'subdir' parameter or" + " set the TWITTER environment variable." ) raise ValueError(msg) else: self.creds_subdir = subdir self.creds_fullpath = os.path.normpath( os.path.join(self.creds_subdir, self.creds_file) ) if not os.path.isfile(self.creds_fullpath): raise OSError(f"Cannot find file {self.creds_fullpath}") with open(self.creds_fullpath) as infile: if verbose: print(f"Reading credentials file {self.creds_fullpath}") for line in infile: if "=" in line: name, value = line.split("=", 1) self.oauth[name.strip()] = value.strip() self._validate_creds_file(verbose=verbose) return self.oauth def _validate_creds_file(self, verbose=False): """Check validity of a credentials file.""" oauth1 = False oauth1_keys = ["app_key", "app_secret", "oauth_token", "oauth_token_secret"] oauth2 = False oauth2_keys = ["app_key", "app_secret", "access_token"] if all(k in self.oauth for k in oauth1_keys): oauth1 = True elif all(k in self.oauth for k in oauth2_keys): oauth2 = True if not (oauth1 or oauth2): msg = f"Missing or incorrect entries in {self.creds_file}\n" msg += pprint.pformat(self.oauth) raise ValueError(msg) elif verbose: print(f'Credentials file "{self.creds_file}" looks good') def add_access_token(creds_file=None): """ For OAuth 2, retrieve an access token for an app and append it to a credentials file. """ if creds_file is None: path = os.path.dirname(__file__) creds_file = os.path.join(path, "credentials2.txt") oauth2 = credsfromfile(creds_file=creds_file) app_key = oauth2["app_key"] app_secret = oauth2["app_secret"] twitter = Twython(app_key, app_secret, oauth_version=2) access_token = twitter.obtain_access_token() tok = f"access_token={access_token}\n" with open(creds_file, "a") as infile: print(tok, file=infile) def guess_path(pth): """ If the path is not absolute, guess that it is a subdirectory of the user's home directory. :param str pth: The pathname of the directory where files of tweets should be written """ if os.path.isabs(pth): return pth else: return os.path.expanduser(os.path.join("~", pth)) nltk-3.7/nltk/util.py000066400000000000000000001175611420073152400146430ustar00rootroot00000000000000# Natural Language Toolkit: Utility functions # # Copyright (C) 2001-2022 NLTK Project # Author: Steven Bird # Eric Kafe (acyclic closures) # URL: # For license information, see LICENSE.TXT import bisect import inspect import locale import os import pydoc import re import sys import textwrap import types import warnings from collections import defaultdict, deque from itertools import chain, combinations, islice, tee from pprint import pprint from urllib.request import ( HTTPPasswordMgrWithDefaultRealm, ProxyBasicAuthHandler, ProxyDigestAuthHandler, ProxyHandler, build_opener, getproxies, install_opener, ) from nltk.collections import * from nltk.internals import deprecated, raise_unorderable_types, slice_bounds ###################################################################### # Short usage message ###################################################################### @deprecated("Use help(obj) instead.") def usage(obj): str(obj) # In case it's lazy, this will load it. 
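# A sketch of the credentials handling documented above; it assumes twython is
# installed, and the directory and key values below are placeholders only.
from nltk.twitter.util import credsfromfile

# credentials.txt (OAuth 1 layout, as described in load_creds):
#     app_key=YOUR_APP_KEY
#     app_secret=YOUR_APP_SECRET
#     oauth_token=OAUTH_TOKEN
#     oauth_token_secret=OAUTH_TOKEN_SECRET
oauth = credsfromfile(subdir="/path/to/twitter-files", verbose=True)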
if not isinstance(obj, type): obj = obj.__class__ print(f"{obj.__name__} supports the following operations:") for (name, method) in sorted(pydoc.allmethods(obj).items()): if name.startswith("_"): continue if getattr(method, "__deprecated__", False): continue try: sig = str(inspect.signature(method)) except ValueError as e: # builtins sometimes don't support introspection if "builtin" in str(e): continue else: raise args = sig.lstrip("(").rstrip(")").split(", ") meth = inspect.getattr_static(obj, name) if isinstance(meth, (classmethod, staticmethod)): name = f"cls.{name}" elif args and args[0] == "self": name = f"self.{name}" args.pop(0) print( textwrap.fill( f"{name}({', '.join(args)})", initial_indent=" - ", subsequent_indent=" " * (len(name) + 5), ) ) ########################################################################## # IDLE ########################################################################## def in_idle(): """ Return True if this function is run within idle. Tkinter programs that are run in idle should never call ``Tk.mainloop``; so this function should be used to gate all calls to ``Tk.mainloop``. :warning: This function works by checking ``sys.stdin``. If the user has modified ``sys.stdin``, then it may return incorrect results. :rtype: bool """ import sys return sys.stdin.__class__.__name__ in ("PyShell", "RPCProxy") ########################################################################## # PRETTY PRINTING ########################################################################## def pr(data, start=0, end=None): """ Pretty print a sequence of data items :param data: the data stream to print :type data: sequence or iter :param start: the start position :type start: int :param end: the end position :type end: int """ pprint(list(islice(data, start, end))) def print_string(s, width=70): """ Pretty print a string, breaking lines on whitespace :param s: the string to print, consisting of words and spaces :type s: str :param width: the display width :type width: int """ print("\n".join(textwrap.wrap(s, width=width))) def tokenwrap(tokens, separator=" ", width=70): """ Pretty print a list of text tokens, breaking lines on whitespace :param tokens: the tokens to print :type tokens: list :param separator: the string to use to separate tokens :type separator: str :param width: the display width (default=70) :type width: int """ return "\n".join(textwrap.wrap(separator.join(tokens), width=width)) ########################################################################## # Indexing ########################################################################## class Index(defaultdict): def __init__(self, pairs): defaultdict.__init__(self, list) for key, value in pairs: self[key].append(value) ###################################################################### ## Regexp display (thanks to David Mertz) ###################################################################### def re_show(regexp, string, left="{", right="}"): """ Return a string with markers surrounding the matched substrings. Search str for substrings matching ``regexp`` and wrap the matches with braces. This is convenient for learning about regular expressions. :param regexp: The regular expression. :type regexp: str :param string: The string being matched. 
:type string: str :param left: The left delimiter (printed before the matched substring) :type left: str :param right: The right delimiter (printed after the matched substring) :type right: str :rtype: str """ print(re.compile(regexp, re.M).sub(left + r"\g<0>" + right, string.rstrip())) ########################################################################## # READ FROM FILE OR STRING ########################################################################## # recipe from David Mertz def filestring(f): if hasattr(f, "read"): return f.read() elif isinstance(f, str): with open(f) as infile: return infile.read() else: raise ValueError("Must be called with a filename or file-like object") ########################################################################## # Breadth-First Search ########################################################################## def breadth_first(tree, children=iter, maxdepth=-1): """Traverse the nodes of a tree in breadth-first order. (No check for cycles.) The first argument should be the tree root; children should be a function taking as argument a tree node and returning an iterator of the node's children. """ queue = deque([(tree, 0)]) while queue: node, depth = queue.popleft() yield node if depth != maxdepth: try: queue.extend((c, depth + 1) for c in children(node)) except TypeError: pass ########################################################################## # Graph Drawing ########################################################################## def edge_closure(tree, children=iter, maxdepth=-1, verbose=False): """Yield the edges of a graph in breadth-first order, discarding eventual cycles. The first argument should be the start node; children should be a function taking as argument a graph node and returning an iterator of the node's children. >>> from nltk.util import edge_closure >>> print(list(edge_closure('A', lambda node:{'A':['B','C'], 'B':'C', 'C':'B'}[node]))) [('A', 'B'), ('A', 'C'), ('B', 'C'), ('C', 'B')] """ traversed = set() edges = set() queue = deque([(tree, 0)]) while queue: node, depth = queue.popleft() traversed.add(node) if depth != maxdepth: try: for child in children(node): if child not in traversed: queue.append((child, depth + 1)) else: if verbose: warnings.warn( f"Discarded redundant search for {child} at depth {depth + 1}", stacklevel=2, ) edge = (node, child) if edge not in edges: yield edge edges.add(edge) except TypeError: pass def edges2dot(edges, shapes=None, attr=None): """ :param edges: the set (or list) of edges of a directed graph. :return dot_string: a representation of 'edges' as a string in the DOT graph language, which can be converted to an image by the 'dot' program from the Graphviz package, or nltk.parse.dependencygraph.dot2img(dot_string). :param shapes: dictionary of strings that trigger a specified shape. 
:param attr: dictionary with global graph attributes >>> import nltk >>> from nltk.util import edges2dot >>> print(edges2dot([('A', 'B'), ('A', 'C'), ('B', 'C'), ('C', 'B')])) digraph G { "A" -> "B"; "A" -> "C"; "B" -> "C"; "C" -> "B"; } """ if not shapes: shapes = dict() if not attr: attr = dict() dot_string = "digraph G {\n" for pair in attr.items(): dot_string += f"{pair[0]} = {pair[1]};\n" for edge in edges: for shape in shapes.items(): for node in range(2): if shape[0] in repr(edge[node]): dot_string += f'"{edge[node]}" [shape = {shape[1]}];\n' dot_string += f'"{edge[0]}" -> "{edge[1]}";\n' dot_string += "}\n" return dot_string def unweighted_minimum_spanning_digraph(tree, children=iter, shapes=None, attr=None): """ Build a Minimum Spanning Tree (MST) of an unweighted graph, by traversing the nodes of a tree in breadth-first order, discarding eventual cycles. Return a representation of this MST as a string in the DOT graph language, which can be converted to an image by the 'dot' program from the Graphviz package, or nltk.parse.dependencygraph.dot2img(dot_string). The first argument should be the tree root; children should be a function taking as argument a tree node and returning an iterator of the node's children. >>> import nltk >>> wn=nltk.corpus.wordnet >>> from nltk.util import unweighted_minimum_spanning_digraph as umsd >>> print(umsd(wn.synset('bound.a.01'), lambda s:s.also_sees())) digraph G { "Synset('bound.a.01')" -> "Synset('unfree.a.02')"; "Synset('unfree.a.02')" -> "Synset('confined.a.02')"; "Synset('unfree.a.02')" -> "Synset('dependent.a.01')"; "Synset('unfree.a.02')" -> "Synset('restricted.a.01')"; "Synset('restricted.a.01')" -> "Synset('classified.a.02')"; } """ return edges2dot( edge_closure( tree, lambda node: unweighted_minimum_spanning_dict(tree, children)[node] ), shapes, attr, ) ########################################################################## # Breadth-First / Depth-first Searches with Cycle Detection ########################################################################## def acyclic_breadth_first(tree, children=iter, maxdepth=-1): """Traverse the nodes of a tree in breadth-first order, discarding eventual cycles. The first argument should be the tree root; children should be a function taking as argument a tree node and returning an iterator of the node's children. """ traversed = set() queue = deque([(tree, 0)]) while queue: node, depth = queue.popleft() yield node traversed.add(node) if depth != maxdepth: try: for child in children(node): if child not in traversed: queue.append((child, depth + 1)) else: warnings.warn( "Discarded redundant search for {} at depth {}".format( child, depth + 1 ), stacklevel=2, ) except TypeError: pass def acyclic_depth_first(tree, children=iter, depth=-1, cut_mark=None, traversed=None): """Traverse the nodes of a tree in depth-first order, discarding eventual cycles within any branch, adding cut_mark (when specified) if cycles were truncated. The first argument should be the tree root; children should be a function taking as argument a tree node and returning an iterator of the node's children. 
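    A minimal sketch with a plain dictionary graph (illustrative data only,
    not part of the original doctests), showing how the edge leading back to
    node 1 is discarded (a warning is emitted on stderr):

    >>> from nltk.util import acyclic_depth_first
    >>> graph = {1: [2, 3], 2: [1], 3: []}
    >>> acyclic_depth_first(1, children=lambda n: graph[n])
    [1, [2], [3]]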
Catches all cycles: >>> import nltk >>> from nltk.util import acyclic_depth_first as acyclic_tree >>> wn=nltk.corpus.wordnet >>> from pprint import pprint >>> pprint(acyclic_tree(wn.synset('dog.n.01'), lambda s:s.hypernyms(),cut_mark='...')) [Synset('dog.n.01'), [Synset('canine.n.02'), [Synset('carnivore.n.01'), [Synset('placental.n.01'), [Synset('mammal.n.01'), [Synset('vertebrate.n.01'), [Synset('chordate.n.01'), [Synset('animal.n.01'), [Synset('organism.n.01'), [Synset('living_thing.n.01'), [Synset('whole.n.02'), [Synset('object.n.01'), [Synset('physical_entity.n.01'), [Synset('entity.n.01')]]]]]]]]]]]]], [Synset('domestic_animal.n.01'), "Cycle(Synset('animal.n.01'),-3,...)"]] """ if traversed is None: traversed = {tree} out_tree = [tree] if depth != 0: try: for child in children(tree): if child not in traversed: # Recurse with a common "traversed" set for all children: traversed.add(child) out_tree += [ acyclic_depth_first( child, children, depth - 1, cut_mark, traversed ) ] else: warnings.warn( "Discarded redundant search for {} at depth {}".format( child, depth - 1 ), stacklevel=3, ) if cut_mark: out_tree += [f"Cycle({child},{depth - 1},{cut_mark})"] except TypeError: pass elif cut_mark: out_tree += [cut_mark] return out_tree def acyclic_branches_depth_first( tree, children=iter, depth=-1, cut_mark=None, traversed=None ): """Traverse the nodes of a tree in depth-first order, discarding eventual cycles within the same branch, but keep duplicate paths in different branches. Add cut_mark (when defined) if cycles were truncated. The first argument should be the tree root; children should be a function taking as argument a tree node and returning an iterator of the node's children. Catches only only cycles within the same branch, but keeping cycles from different branches: >>> import nltk >>> from nltk.util import acyclic_branches_depth_first as tree >>> wn=nltk.corpus.wordnet >>> from pprint import pprint >>> pprint(tree(wn.synset('certified.a.01'), lambda s:s.also_sees(), cut_mark='...', depth=4)) [Synset('certified.a.01'), [Synset('authorized.a.01'), [Synset('lawful.a.01'), [Synset('legal.a.01'), "Cycle(Synset('lawful.a.01'),0,...)", [Synset('legitimate.a.01'), '...']], [Synset('straight.a.06'), [Synset('honest.a.01'), '...'], "Cycle(Synset('lawful.a.01'),0,...)"]], [Synset('legitimate.a.01'), "Cycle(Synset('authorized.a.01'),1,...)", [Synset('legal.a.01'), [Synset('lawful.a.01'), '...'], "Cycle(Synset('legitimate.a.01'),0,...)"], [Synset('valid.a.01'), "Cycle(Synset('legitimate.a.01'),0,...)", [Synset('reasonable.a.01'), '...']]], [Synset('official.a.01'), "Cycle(Synset('authorized.a.01'),1,...)"]], [Synset('documented.a.01')]] """ if traversed is None: traversed = {tree} out_tree = [tree] if depth != 0: try: for child in children(tree): if child not in traversed: # Recurse with a different "traversed" set for each child: out_tree += [ acyclic_branches_depth_first( child, children, depth - 1, cut_mark, traversed.union({child}), ) ] else: warnings.warn( "Discarded redundant search for {} at depth {}".format( child, depth - 1 ), stacklevel=3, ) if cut_mark: out_tree += [f"Cycle({child},{depth - 1},{cut_mark})"] except TypeError: pass elif cut_mark: out_tree += [cut_mark] return out_tree def acyclic_dic2tree(node, dic): """Convert acyclic dictionary 'dic', where the keys are nodes, and the values are lists of children, to output tree suitable for pprint(), starting at root 'node', with subtrees as nested lists.""" return [node] + [acyclic_dic2tree(child, dic) for child in dic[node]] def 
unweighted_minimum_spanning_dict(tree, children=iter): """ Output a dictionary representing a Minimum Spanning Tree (MST) of an unweighted graph, by traversing the nodes of a tree in breadth-first order, discarding eventual cycles. The first argument should be the tree root; children should be a function taking as argument a tree node and returning an iterator of the node's children. >>> import nltk >>> from nltk.corpus import wordnet as wn >>> from nltk.util import unweighted_minimum_spanning_dict as umsd >>> from pprint import pprint >>> pprint(umsd(wn.synset('bound.a.01'), lambda s:s.also_sees())) {Synset('bound.a.01'): [Synset('unfree.a.02')], Synset('classified.a.02'): [], Synset('confined.a.02'): [], Synset('dependent.a.01'): [], Synset('restricted.a.01'): [Synset('classified.a.02')], Synset('unfree.a.02'): [Synset('confined.a.02'), Synset('dependent.a.01'), Synset('restricted.a.01')]} """ traversed = set() # Empty set of traversed nodes queue = deque([tree]) # Initialize queue agenda = {tree} # Set of all nodes ever queued mstdic = {} # Empty MST dictionary while queue: node = queue.popleft() # Node is not yet in the MST dictionary, mstdic[node] = [] # so add it with an empty list of children if node not in traversed: # Avoid cycles traversed.add(node) for child in children(node): if child not in agenda: # Queue nodes only once mstdic[node].append(child) # Add child to the MST queue.append(child) # Add child to queue agenda.add(child) return mstdic def unweighted_minimum_spanning_tree(tree, children=iter): """ Output a Minimum Spanning Tree (MST) of an unweighted graph, by traversing the nodes of a tree in breadth-first order, discarding eventual cycles. The first argument should be the tree root; children should be a function taking as argument a tree node and returning an iterator of the node's children. >>> import nltk >>> from nltk.util import unweighted_minimum_spanning_tree as mst >>> wn=nltk.corpus.wordnet >>> from pprint import pprint >>> pprint(mst(wn.synset('bound.a.01'), lambda s:s.also_sees())) [Synset('bound.a.01'), [Synset('unfree.a.02'), [Synset('confined.a.02')], [Synset('dependent.a.01')], [Synset('restricted.a.01'), [Synset('classified.a.02')]]]] """ return acyclic_dic2tree(tree, unweighted_minimum_spanning_dict(tree, children)) ########################################################################## # Guess Character Encoding ########################################################################## # adapted from io.py in the docutils extension module (https://docutils.sourceforge.io/) # http://www.pyzine.com/Issue008/Section_Articles/article_Encodings.html def guess_encoding(data): """ Given a byte string, attempt to decode it. Tries the standard 'UTF8' and 'latin-1' encodings, Plus several gathered from locale information. The calling program *must* first call:: locale.setlocale(locale.LC_ALL, '') If successful it returns ``(decoded_unicode, successful_encoding)``. If unsuccessful it raises a ``UnicodeError``. 
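    A minimal usage sketch (marked SKIP because both the locale value shown
    and the encoding that is chosen depend on the machine's locale settings;
    the byte string is an illustrative example only):

    >>> import locale
    >>> from nltk.util import guess_encoding
    >>> locale.setlocale(locale.LC_ALL, '')            # doctest: +SKIP
    'en_US.UTF-8'
    >>> guess_encoding('café'.encode('latin-1'))       # doctest: +SKIP
    ('café', 'latin-1')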
""" successful_encoding = None # we make 'utf-8' the first encoding encodings = ["utf-8"] # # next we add anything we can learn from the locale try: encodings.append(locale.nl_langinfo(locale.CODESET)) except AttributeError: pass try: encodings.append(locale.getlocale()[1]) except (AttributeError, IndexError): pass try: encodings.append(locale.getdefaultlocale()[1]) except (AttributeError, IndexError): pass # # we try 'latin-1' last encodings.append("latin-1") for enc in encodings: # some of the locale calls # may have returned None if not enc: continue try: decoded = str(data, enc) successful_encoding = enc except (UnicodeError, LookupError): pass else: break if not successful_encoding: raise UnicodeError( "Unable to decode input data. " "Tried the following encodings: %s." % ", ".join([repr(enc) for enc in encodings if enc]) ) else: return (decoded, successful_encoding) ########################################################################## # Remove repeated elements from a list deterministcally ########################################################################## def unique_list(xs): seen = set() # not seen.add(x) here acts to make the code shorter without using if statements, seen.add(x) always returns None. return [x for x in xs if x not in seen and not seen.add(x)] ########################################################################## # Invert a dictionary ########################################################################## def invert_dict(d): inverted_dict = defaultdict(list) for key in d: if hasattr(d[key], "__iter__"): for term in d[key]: inverted_dict[term].append(key) else: inverted_dict[d[key]] = key return inverted_dict ########################################################################## # Utilities for directed graphs: transitive closure, and inversion # The graph is represented as a dictionary of sets ########################################################################## def transitive_closure(graph, reflexive=False): """ Calculate the transitive closure of a directed graph, optionally the reflexive transitive closure. The algorithm is a slight modification of the "Marking Algorithm" of Ioannidis & Ramakrishnan (1998) "Efficient Transitive Closure Algorithms". :param graph: the initial graph, represented as a dictionary of sets :type graph: dict(set) :param reflexive: if set, also make the closure reflexive :type reflexive: bool :rtype: dict(set) """ if reflexive: base_set = lambda k: {k} else: base_set = lambda k: set() # The graph U_i in the article: agenda_graph = {k: graph[k].copy() for k in graph} # The graph M_i in the article: closure_graph = {k: base_set(k) for k in graph} for i in graph: agenda = agenda_graph[i] closure = closure_graph[i] while agenda: j = agenda.pop() closure.add(j) closure |= closure_graph.setdefault(j, base_set(j)) agenda |= agenda_graph.get(j, base_set(j)) agenda -= closure return closure_graph def invert_graph(graph): """ Inverts a directed graph. 
:param graph: the graph, represented as a dictionary of sets :type graph: dict(set) :return: the inverted graph :rtype: dict(set) """ inverted = {} for key in graph: for value in graph[key]: inverted.setdefault(value, set()).add(key) return inverted ########################################################################## # HTML Cleaning ########################################################################## def clean_html(html): raise NotImplementedError( "To remove HTML markup, use BeautifulSoup's get_text() function" ) def clean_url(url): raise NotImplementedError( "To remove HTML markup, use BeautifulSoup's get_text() function" ) ########################################################################## # FLATTEN LISTS ########################################################################## def flatten(*args): """ Flatten a list. >>> from nltk.util import flatten >>> flatten(1, 2, ['b', 'a' , ['c', 'd']], 3) [1, 2, 'b', 'a', 'c', 'd', 3] :param args: items and lists to be combined into a single list :rtype: list """ x = [] for l in args: if not isinstance(l, (list, tuple)): l = [l] for item in l: if isinstance(item, (list, tuple)): x.extend(flatten(item)) else: x.append(item) return x ########################################################################## # Ngram iteration ########################################################################## def pad_sequence( sequence, n, pad_left=False, pad_right=False, left_pad_symbol=None, right_pad_symbol=None, ): """ Returns a padded sequence of items before ngram extraction. >>> list(pad_sequence([1,2,3,4,5], 2, pad_left=True, pad_right=True, left_pad_symbol='', right_pad_symbol='')) ['', 1, 2, 3, 4, 5, ''] >>> list(pad_sequence([1,2,3,4,5], 2, pad_left=True, left_pad_symbol='')) ['', 1, 2, 3, 4, 5] >>> list(pad_sequence([1,2,3,4,5], 2, pad_right=True, right_pad_symbol='')) [1, 2, 3, 4, 5, ''] :param sequence: the source data to be padded :type sequence: sequence or iter :param n: the degree of the ngrams :type n: int :param pad_left: whether the ngrams should be left-padded :type pad_left: bool :param pad_right: whether the ngrams should be right-padded :type pad_right: bool :param left_pad_symbol: the symbol to use for left padding (default is None) :type left_pad_symbol: any :param right_pad_symbol: the symbol to use for right padding (default is None) :type right_pad_symbol: any :rtype: sequence or iter """ sequence = iter(sequence) if pad_left: sequence = chain((left_pad_symbol,) * (n - 1), sequence) if pad_right: sequence = chain(sequence, (right_pad_symbol,) * (n - 1)) return sequence # add a flag to pad the sequence so we get peripheral ngrams? def ngrams(sequence, n, **kwargs): """ Return the ngrams generated from a sequence of items, as an iterator. For example: >>> from nltk.util import ngrams >>> list(ngrams([1,2,3,4,5], 3)) [(1, 2, 3), (2, 3, 4), (3, 4, 5)] Wrap with list for a list version of this function. Set pad_left or pad_right to true in order to get additional ngrams: >>> list(ngrams([1,2,3,4,5], 2, pad_right=True)) [(1, 2), (2, 3), (3, 4), (4, 5), (5, None)] >>> list(ngrams([1,2,3,4,5], 2, pad_right=True, right_pad_symbol='
</s>')) [(1, 2), (2, 3), (3, 4), (4, 5), (5, '</s>
    ')] >>> list(ngrams([1,2,3,4,5], 2, pad_left=True, left_pad_symbol='')) [('', 1), (1, 2), (2, 3), (3, 4), (4, 5)] >>> list(ngrams([1,2,3,4,5], 2, pad_left=True, pad_right=True, left_pad_symbol='', right_pad_symbol='')) [('', 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, '')] :param sequence: the source data to be converted into ngrams :type sequence: sequence or iter :param n: the degree of the ngrams :type n: int :param pad_left: whether the ngrams should be left-padded :type pad_left: bool :param pad_right: whether the ngrams should be right-padded :type pad_right: bool :param left_pad_symbol: the symbol to use for left padding (default is None) :type left_pad_symbol: any :param right_pad_symbol: the symbol to use for right padding (default is None) :type right_pad_symbol: any :rtype: sequence or iter """ sequence = pad_sequence(sequence, n, **kwargs) # Creates the sliding window, of n no. of items. # `iterables` is a tuple of iterables where each iterable is a window of n items. iterables = tee(sequence, n) for i, sub_iterable in enumerate(iterables): # For each window, for _ in range(i): # iterate through every order of ngrams next(sub_iterable, None) # generate the ngrams within the window. return zip(*iterables) # Unpack and flattens the iterables. def bigrams(sequence, **kwargs): """ Return the bigrams generated from a sequence of items, as an iterator. For example: >>> from nltk.util import bigrams >>> list(bigrams([1,2,3,4,5])) [(1, 2), (2, 3), (3, 4), (4, 5)] Use bigrams for a list version of this function. :param sequence: the source data to be converted into bigrams :type sequence: sequence or iter :rtype: iter(tuple) """ yield from ngrams(sequence, 2, **kwargs) def trigrams(sequence, **kwargs): """ Return the trigrams generated from a sequence of items, as an iterator. For example: >>> from nltk.util import trigrams >>> list(trigrams([1,2,3,4,5])) [(1, 2, 3), (2, 3, 4), (3, 4, 5)] Use trigrams for a list version of this function. :param sequence: the source data to be converted into trigrams :type sequence: sequence or iter :rtype: iter(tuple) """ yield from ngrams(sequence, 3, **kwargs) def everygrams( sequence, min_len=1, max_len=-1, pad_left=False, pad_right=False, **kwargs ): """ Returns all possible ngrams generated from a sequence of items, as an iterator. >>> sent = 'a b c'.split() New version outputs for everygrams. >>> list(everygrams(sent)) [('a',), ('a', 'b'), ('a', 'b', 'c'), ('b',), ('b', 'c'), ('c',)] Old version outputs for everygrams. >>> sorted(everygrams(sent), key=len) [('a',), ('b',), ('c',), ('a', 'b'), ('b', 'c'), ('a', 'b', 'c')] >>> list(everygrams(sent, max_len=2)) [('a',), ('a', 'b'), ('b',), ('b', 'c'), ('c',)] :param sequence: the source data to be converted into ngrams. If max_len is not provided, this sequence will be loaded into memory :type sequence: sequence or iter :param min_len: minimum length of the ngrams, aka. n-gram order/degree of ngram :type min_len: int :param max_len: maximum length of the ngrams (set to length of sequence by default) :type max_len: int :param pad_left: whether the ngrams should be left-padded :type pad_left: bool :param pad_right: whether the ngrams should be right-padded :type pad_right: bool :rtype: iter(tuple) """ # Get max_len for padding. if max_len == -1: try: max_len = len(sequence) except TypeError: sequence = list(sequence) max_len = len(sequence) # Pad if indicated using max_len. sequence = pad_sequence(sequence, max_len, pad_left, pad_right, **kwargs) # Sliding window to store grams. 
history = list(islice(sequence, max_len)) # Yield ngrams from sequence. while history: for ngram_len in range(min_len, len(history) + 1): yield tuple(history[:ngram_len]) # Append element to history if sequence has more items. try: history.append(next(sequence)) except StopIteration: pass del history[0] def skipgrams(sequence, n, k, **kwargs): """ Returns all possible skipgrams generated from a sequence of items, as an iterator. Skipgrams are ngrams that allows tokens to be skipped. Refer to http://homepages.inf.ed.ac.uk/ballison/pdf/lrec_skipgrams.pdf >>> sent = "Insurgents killed in ongoing fighting".split() >>> list(skipgrams(sent, 2, 2)) [('Insurgents', 'killed'), ('Insurgents', 'in'), ('Insurgents', 'ongoing'), ('killed', 'in'), ('killed', 'ongoing'), ('killed', 'fighting'), ('in', 'ongoing'), ('in', 'fighting'), ('ongoing', 'fighting')] >>> list(skipgrams(sent, 3, 2)) [('Insurgents', 'killed', 'in'), ('Insurgents', 'killed', 'ongoing'), ('Insurgents', 'killed', 'fighting'), ('Insurgents', 'in', 'ongoing'), ('Insurgents', 'in', 'fighting'), ('Insurgents', 'ongoing', 'fighting'), ('killed', 'in', 'ongoing'), ('killed', 'in', 'fighting'), ('killed', 'ongoing', 'fighting'), ('in', 'ongoing', 'fighting')] :param sequence: the source data to be converted into trigrams :type sequence: sequence or iter :param n: the degree of the ngrams :type n: int :param k: the skip distance :type k: int :rtype: iter(tuple) """ # Pads the sequence as desired by **kwargs. if "pad_left" in kwargs or "pad_right" in kwargs: sequence = pad_sequence(sequence, n, **kwargs) # Note when iterating through the ngrams, the pad_right here is not # the **kwargs padding, it's for the algorithm to detect the SENTINEL # object on the right pad to stop inner loop. SENTINEL = object() for ngram in ngrams(sequence, n + k, pad_right=True, right_pad_symbol=SENTINEL): head = ngram[:1] tail = ngram[1:] for skip_tail in combinations(tail, n - 1): if skip_tail[-1] is SENTINEL: continue yield head + skip_tail ###################################################################### # Binary Search in a File ###################################################################### # inherited from pywordnet, by Oliver Steele def binary_search_file(file, key, cache=None, cacheDepth=-1): """ Return the line from the file with first word key. Searches through a sorted file using the binary search algorithm. :type file: file :param file: the file to be searched through. :type key: str :param key: the identifier we are searching for. 
""" key = key + " " keylen = len(key) start = 0 currentDepth = 0 if hasattr(file, "name"): end = os.stat(file.name).st_size - 1 else: file.seek(0, 2) end = file.tell() - 1 file.seek(0) if cache is None: cache = {} while start < end: lastState = start, end middle = (start + end) // 2 if cache.get(middle): offset, line = cache[middle] else: line = "" while True: file.seek(max(0, middle - 1)) if middle > 0: file.discard_line() offset = file.tell() line = file.readline() if line != "": break # at EOF; try to find start of the last line middle = (start + middle) // 2 if middle == end - 1: return None if currentDepth < cacheDepth: cache[middle] = (offset, line) if offset > end: assert end != middle - 1, "infinite loop" end = middle - 1 elif line[:keylen] == key: return line elif line > key: assert end != middle - 1, "infinite loop" end = middle - 1 elif line < key: start = offset + len(line) - 1 currentDepth += 1 thisState = start, end if lastState == thisState: # Detects the condition where we're searching past the end # of the file, which is otherwise difficult to detect return None return None ###################################################################### # Proxy configuration ###################################################################### def set_proxy(proxy, user=None, password=""): """ Set the HTTP proxy for Python to download through. If ``proxy`` is None then tries to set proxy from environment or system settings. :param proxy: The HTTP proxy server to use. For example: 'http://proxy.example.com:3128/' :param user: The username to authenticate with. Use None to disable authentication. :param password: The password to authenticate with. """ if proxy is None: # Try and find the system proxy settings try: proxy = getproxies()["http"] except KeyError as e: raise ValueError("Could not detect default proxy settings") from e # Set up the proxy handler proxy_handler = ProxyHandler({"https": proxy, "http": proxy}) opener = build_opener(proxy_handler) if user is not None: # Set up basic proxy authentication if provided password_manager = HTTPPasswordMgrWithDefaultRealm() password_manager.add_password(realm=None, uri=proxy, user=user, passwd=password) opener.add_handler(ProxyBasicAuthHandler(password_manager)) opener.add_handler(ProxyDigestAuthHandler(password_manager)) # Override the existing url opener install_opener(opener) ###################################################################### # ElementTree pretty printing from https://www.effbot.org/zone/element-lib.htm ###################################################################### def elementtree_indent(elem, level=0): """ Recursive function to indent an ElementTree._ElementInterface used for pretty printing. Run indent on elem and then output in the normal way. :param elem: element to be indented. will be modified. 
:type elem: ElementTree._ElementInterface :param level: level of indentation for this element :type level: nonnegative integer :rtype: ElementTree._ElementInterface :return: Contents of elem indented to reflect its structure """ i = "\n" + level * " " if len(elem): if not elem.text or not elem.text.strip(): elem.text = i + " " for elem in elem: elementtree_indent(elem, level + 1) if not elem.tail or not elem.tail.strip(): elem.tail = i else: if level and (not elem.tail or not elem.tail.strip()): elem.tail = i ###################################################################### # Mathematical approximations ###################################################################### def choose(n, k): """ This function is a fast way to calculate binomial coefficients, commonly known as nCk, i.e. the number of combinations of n things taken k at a time. (https://en.wikipedia.org/wiki/Binomial_coefficient). This is the *scipy.special.comb()* with long integer computation but this approximation is faster, see https://github.com/nltk/nltk/issues/1181 >>> choose(4, 2) 6 >>> choose(6, 2) 15 :param n: The number of things. :type n: int :param r: The number of times a thing is taken. :type r: int """ if 0 <= k <= n: ntok, ktok = 1, 1 for t in range(1, min(k, n - k) + 1): ntok *= n ktok *= t n -= 1 return ntok // ktok else: return 0 ###################################################################### # Iteration utilities ###################################################################### def pairwise(iterable): """s -> (s0,s1), (s1,s2), (s2, s3), ...""" a, b = tee(iterable) next(b, None) return zip(a, b) ###################################################################### # Parallelization. ###################################################################### def parallelize_preprocess(func, iterator, processes, progress_bar=False): from joblib import Parallel, delayed from tqdm import tqdm iterator = tqdm(iterator) if progress_bar else iterator if processes <= 1: return map(func, iterator) return Parallel(n_jobs=processes)(delayed(func)(line) for line in iterator) nltk-3.7/nltk/wsd.py000066400000000000000000000033121420073152400144470ustar00rootroot00000000000000# Natural Language Toolkit: Word Sense Disambiguation Algorithms # # Authors: Liling Tan , # Dmitrijs Milajevs # # Copyright (C) 2001-2022 NLTK Project # URL: # For license information, see LICENSE.TXT from nltk.corpus import wordnet def lesk(context_sentence, ambiguous_word, pos=None, synsets=None): """Return a synset for an ambiguous word in a context. :param iter context_sentence: The context sentence where the ambiguous word occurs, passed as an iterable of words. :param str ambiguous_word: The ambiguous word that requires WSD. :param str pos: A specified Part-of-Speech (POS). :param iter synsets: Possible synsets of the ambiguous word. :return: ``lesk_sense`` The Synset() object with the highest signature overlaps. This function is an implementation of the original Lesk algorithm (1986) [1]. Usage example:: >>> lesk(['I', 'went', 'to', 'the', 'bank', 'to', 'deposit', 'money', '.'], 'bank', 'n') Synset('savings_bank.n.02') [1] Lesk, Michael. "Automatic sense disambiguation using machine readable dictionaries: how to tell a pine cone from an ice cream cone." Proceedings of the 5th Annual International Conference on Systems Documentation. ACM, 1986. 
https://dl.acm.org/citation.cfm?id=318728 """ context = set(context_sentence) if synsets is None: synsets = wordnet.synsets(ambiguous_word) if pos: synsets = [ss for ss in synsets if str(ss.pos()) == pos] if not synsets: return None _, sense = max( (len(context.intersection(ss.definition().split())), ss) for ss in synsets ) return sense nltk-3.7/pip-req.txt000066400000000000000000000003531420073152400144500ustar00rootroot00000000000000pytest>=6.0.1 tox>=1.6.1 pylint>=1.1.0 numpy>=1.8.0 scipy>=0.13.2 matplotlib>=1.3.1 scikit-learn>=0.14.1 python-crfsuite>=0.8.2 pyparsing>=2.0.3 twython>=3.2.0 regex>=2021.8.3 click>=7.1.2 joblib>=1.0.1 tqdm>=4.59.0 pre-commit>=2.13.0 nltk-3.7/requirements-ci.txt000066400000000000000000000001341420073152400162040ustar00rootroot00000000000000gensim>=4.0.0 matplotlib pytest pytest-mock pytest-xdist[psutil] regex scikit-learn twython nltk-3.7/requirements-test.txt000066400000000000000000000000701420073152400165670ustar00rootroot00000000000000pylint pytest>=6.0.1 pytest-cov>=2.10.1 pytest-mock tox nltk-3.7/setup.cfg000066400000000000000000000001511420073152400141470ustar00rootroot00000000000000[metadata] license_files = LICENSE.txt AUTHORS.md README.md [build_sphinx] source-dir = web nltk-3.7/setup.py000066400000000000000000000071561420073152400140540ustar00rootroot00000000000000#!/usr/bin/env python # # Setup script for the Natural Language Toolkit # # Copyright (C) 2001-2022 NLTK Project # Author: NLTK Team # URL: # For license information, see LICENSE.TXT # Work around mbcs bug in distutils. # https://bugs.python.org/issue10945 import codecs try: codecs.lookup("mbcs") except LookupError: ascii = codecs.lookup("ascii") func = lambda name, enc=ascii: {True: enc}.get(name == "mbcs") codecs.register(func) import os # Use the VERSION file to get NLTK version version_file = os.path.join(os.path.dirname(__file__), "nltk", "VERSION") with open(version_file) as fh: nltk_version = fh.read().strip() # setuptools from setuptools import find_packages, setup # Specify groups of optional dependencies extras_require = { "machine_learning": [ "numpy", "python-crfsuite", "scikit-learn", "scipy", ], "plot": ["matplotlib"], "tgrep": ["pyparsing"], "twitter": ["twython"], "corenlp": ["requests"], } # Add a group made up of all optional dependencies extras_require["all"] = { package for group in extras_require.values() for package in group } # Adds CLI commands console_scripts = """ [console_scripts] nltk=nltk.cli:cli """ _project_homepage = "https://www.nltk.org/" setup( name="nltk", description="Natural Language Toolkit", version=nltk_version, url=_project_homepage, project_urls={ "Documentation": _project_homepage, "Source Code": "https://github.com/nltk/nltk", "Issue Tracker": "https://github.com/nltk/nltk/issues", }, long_description="""\ The Natural Language Toolkit (NLTK) is a Python package for natural language processing. 
NLTK requires Python 3.7, 3.8, 3.9 or 3.10.""", license="Apache License, Version 2.0", keywords=[ "NLP", "CL", "natural language processing", "computational linguistics", "parsing", "tagging", "tokenizing", "syntax", "linguistics", "language", "natural language", "text analytics", ], maintainer="NLTK Team", maintainer_email="nltk.team@gmail.com", author="NLTK Team", author_email="nltk.team@gmail.com", classifiers=[ "Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", "Intended Audience :: Education", "Intended Audience :: Information Technology", "Intended Audience :: Science/Research", "License :: OSI Approved :: Apache Software License", "Operating System :: OS Independent", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Topic :: Scientific/Engineering", "Topic :: Scientific/Engineering :: Artificial Intelligence", "Topic :: Scientific/Engineering :: Human Machine Interfaces", "Topic :: Scientific/Engineering :: Information Analysis", "Topic :: Text Processing", "Topic :: Text Processing :: Filters", "Topic :: Text Processing :: General", "Topic :: Text Processing :: Indexing", "Topic :: Text Processing :: Linguistic", ], package_data={"nltk": ["test/*.doctest", "VERSION"]}, python_requires=">=3.7", install_requires=[ "click", "joblib", "regex>=2021.8.3", "tqdm", ], extras_require=extras_require, packages=find_packages(), zip_safe=False, # since normal files will be present too? entry_points=console_scripts, ) nltk-3.7/tools/000077500000000000000000000000001420073152400134715ustar00rootroot00000000000000nltk-3.7/tools/find_deprecated.py000077500000000000000000000210651420073152400171520ustar00rootroot00000000000000#!/usr/bin/env python # # Natural Language Toolkit: Deprecated Function & Class Finder # # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # URL: # For license information, see LICENSE.TXT """ This command-line tool takes a list of python files or directories, and searches them for calls to deprecated NLTK functions, or uses of deprecated NLTK classes. For each use of a deprecated object it finds, it will print out a warning containing the offending line, as well as its line number and containing file name. If the terminal has color support (and if epydoc is installed), then the offending identifier will be highlighted in red. """ ###################################################################### # Imports ###################################################################### import os import re import sys import textwrap import tokenize from doctest import DocTestParser, register_optionflag from cStringIO import StringIO import nltk.corpus from nltk import defaultdict ###################################################################### # Regexps ###################################################################### #: A little over-simplified, but it'll do. STRING_PAT = ( r"\s*[ur]{0,2}(?:" r'"""[\s\S]*?"""|' '"[^"\n]+?"|' r"'''[\s\S]*?'''|" "'[^'\n]+?'" r")\s*" ) STRING_RE = re.compile(STRING_PAT) STRINGS_PAT = f"{STRING_PAT}(?:[+]?{STRING_PAT})*" STRINGS_RE = re.compile(STRINGS_PAT) # Define a regexp to search for deprecated definitions. 
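# The pattern below is intended to match either a decorated function, e.g.
#
#     @deprecated("Use new_name() instead.")
#     def old_name(x): ...
#
# or a class that lists Deprecated among its bases, e.g.
#
#     class OldName(Deprecated, object): ...
#
# (Both snippets are illustrative examples, not code from the NLTK sources.)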
DEPRECATED_DEF_PAT = ( fr"^\s*@deprecated\s*\(\s*({STRINGS_PAT})\s*\)\s*\n+" + r"\s*def\s*(\w+).*" + r"|" + r"^\s*class\s+(\w+)\s*\(.*Deprecated.*\):\s*" ) DEPRECATED_DEF_RE = re.compile(DEPRECATED_DEF_PAT, re.MULTILINE) CORPUS_READ_METHOD_RE = re.compile( r"({})\.read\(".format("|".join(re.escape(n) for n in dir(nltk.corpus))) ) CLASS_DEF_RE = re.compile(r"^\s*class\s+(\w+)\s*[:\(]") ###################################################################### # Globals ###################################################################### # Yes, it's bad programming practice, but this is a little hack # script. :) These get initialized by find_deprecated_defs. deprecated_funcs = defaultdict(set) deprecated_classes = defaultdict(set) deprecated_methods = defaultdict(set) try: from epydoc.cli import TerminalController except ImportError: class TerminalController: def __getattr__(self, attr): return "" term = TerminalController() ###################################################################### # Code ###################################################################### def strip_quotes(s): s = s.strip() while s and (s[0] in "ur") and (s[-1] in "'\""): s = s[1:] while s and (s[0] in "'\"" and (s[0] == s[-1])): s = s[1:-1] s = s.strip() return s def find_class(s, index): lines = s[:index].split("\n") while lines: m = CLASS_DEF_RE.match(lines[-1]) if m: return m.group(1) + "." lines.pop() return "?." def find_deprecated_defs(pkg_dir): """ Return a list of all functions marked with the @deprecated decorator, and classes with an immediate Deprecated base class, in all Python files in the given directory. """ # Walk through the directory, finding python files. for root, dirs, files in os.walk(pkg_dir): for filename in files: if filename.endswith(".py"): # Search the file for any deprecated definitions. s = open(os.path.join(root, filename)).read() for m in DEPRECATED_DEF_RE.finditer(s): if m.group(2): name = m.group(2) msg = " ".join( strip_quotes(s) for s in STRING_RE.findall(m.group(1)) ) msg = " ".join(msg.split()) if m.group()[0] in " \t": cls = find_class(s, m.start()) deprecated_methods[name].add((msg, cls, "()")) else: deprecated_funcs[name].add((msg, "", "()")) else: name = m.group(3) m2 = STRING_RE.match(s, m.end()) if m2: msg = strip_quotes(m2.group()) else: msg = "" msg = " ".join(msg.split()) deprecated_classes[name].add((msg, "", "")) def print_deprecated_uses(paths): dep_names = set() dep_files = set() for path in sorted(paths): if os.path.isdir(path): dep_names.update( print_deprecated_uses([os.path.join(path, f) for f in os.listdir(path)]) ) elif path.endswith(".py"): print_deprecated_uses_in(open(path).readline, path, dep_files, dep_names, 0) elif path.endswith(".doctest") or path.endswith(".txt"): for example in DocTestParser().get_examples(open(path).read()): ex = StringIO(example.source) try: print_deprecated_uses_in( ex.readline, path, dep_files, dep_names, example.lineno ) except tokenize.TokenError: print( term.RED + "Caught TokenError -- " "malformatted doctest?" + term.NORMAL ) return dep_names def print_deprecated_uses_in(readline, path, dep_files, dep_names, lineno_offset): tokiter = tokenize.generate_tokens(readline) context = [""] for (typ, tok, start, end, line) in tokiter: # Remember the previous line -- it might contain # the @deprecated decorator. if line is not context[-1]: context.append(line) if len(context) > 10: del context[0] esctok = re.escape(tok) # Ignore all tokens except deprecated names. 
if not ( tok in deprecated_classes or (tok in deprecated_funcs and re.search(fr"\b{esctok}\s*\(", line)) or ( tok in deprecated_methods and re.search(fr"(?!<\bself)[.]\s*{esctok}\s*\(", line) ) ): continue # Hack: only complain about read if it's used after a corpus. if tok == "read" and not CORPUS_READ_METHOD_RE.search(line): continue # Ignore deprecated definitions: if DEPRECATED_DEF_RE.search("".join(context)): continue # Print a header for the first use in a file: if path not in dep_files: print("\n" + term.BOLD + path + term.NORMAL) print(f" {term.YELLOW}linenum{term.NORMAL}") dep_files.add(path) # Mark the offending token. dep_names.add(tok) if term.RED: sub = term.RED + tok + term.NORMAL elif term.BOLD: sub = term.BOLD + tok + term.NORMAL else: sub = "<<" + tok + ">>" line = re.sub(fr"\b{esctok}\b", sub, line) # Print the offending line. print( " {}[{:5d}]{} {}".format( term.YELLOW, start[0] + lineno_offset, term.NORMAL, line.rstrip() ) ) def main(): paths = sys.argv[1:] or ["."] print("Importing nltk...") try: import nltk except ImportError: print("Unable to import nltk -- check your PYTHONPATH.") sys.exit(-1) print("Finding definitions of deprecated functions & classes in nltk...") find_deprecated_defs(nltk.__path__[0]) print("Looking for possible uses of deprecated funcs & classes...") dep_names = print_deprecated_uses(paths) if not dep_names: print("No deprecated funcs or classes found!") else: print("\n" + term.BOLD + "What you should use instead:" + term.NORMAL) for name in sorted(dep_names): msgs = ( deprecated_funcs[name] .union(deprecated_classes[name]) .union(deprecated_methods[name]) ) for msg, prefix, suffix in msgs: print( textwrap.fill( term.RED + prefix + name + suffix + term.NORMAL + ": " + msg, width=75, initial_indent=" " * 2, subsequent_indent=" " * 6, ) ) if __name__ == "__main__": main() nltk-3.7/tools/github_actions/000077500000000000000000000000001420073152400164735ustar00rootroot00000000000000nltk-3.7/tools/github_actions/third-party.sh000066400000000000000000000111551420073152400213010ustar00rootroot00000000000000#!/bin/bash # This install script is used in our GitHub Actions CI. # See .github/workflows/ci.yaml # Installing the third-party software and the appropriate env variables. pushd ${HOME} [[ ! -d 'third' ]] && mkdir 'third' pushd 'third' # Download nltk stanford dependencies # Downloaded to ~/third/stanford-corenlp # stanford_corenlp_package_zip_name=$(curl -s 'https://stanfordnlp.github.io/CoreNLP/' | grep -o 'stanford-corenlp-full-.*\.zip' | head -n1) stanford_corenlp_package_zip_name="stanford-corenlp-full-2018-10-05.zip" [[ ${stanford_corenlp_package_zip_name} =~ (.+)\.zip ]] stanford_corenlp_package_name=${BASH_REMATCH[1]} if [[ ! -d ${stanford_corenlp_package_name} ]]; then curl -L "https://nlp.stanford.edu/software/$stanford_corenlp_package_zip_name" -o ${stanford_corenlp_package_zip_name} # wget -nv "https://nlp.stanford.edu/software/$stanford_corenlp_package_zip_name" unzip -q ${stanford_corenlp_package_zip_name} rm ${stanford_corenlp_package_zip_name} mv ${stanford_corenlp_package_name} 'stanford-corenlp' fi # Downloaded to ~/third/stanford-parser #stanford_parser_package_zip_name=$(curl -s 'https://nlp.stanford.edu/software/lex-parser.shtml' | grep -o 'stanford-parser-full-.*\.zip' | head -n1) stanford_parser_package_zip_name="stanford-parser-full-2018-10-17.zip" [[ ${stanford_parser_package_zip_name} =~ (.+)\.zip ]] stanford_parser_package_name=${BASH_REMATCH[1]} if [[ ! 
-d ${stanford_parser_package_name} ]]; then curl -L "https://nlp.stanford.edu/software/$stanford_parser_package_zip_name" -o ${stanford_parser_package_zip_name} # wget -nv "https://nlp.stanford.edu/software/$stanford_parser_package_zip_name" unzip -q ${stanford_parser_package_zip_name} rm ${stanford_parser_package_zip_name} mv ${stanford_parser_package_name} 'stanford-parser' fi # Downloaded to ~/third/stanford-postagger #stanford_tagger_package_zip_name=$(curl -s 'https://nlp.stanford.edu/software/tagger.shtml' | grep -o 'stanford-postagger-full-.*\.zip' | head -n1) stanford_tagger_package_zip_name="stanford-postagger-full-2018-10-16.zip" [[ ${stanford_tagger_package_zip_name} =~ (.+)\.zip ]] stanford_tagger_package_name=${BASH_REMATCH[1]} if [[ ! -d ${stanford_tagger_package_name} ]]; then curl -L "https://nlp.stanford.edu/software/$stanford_tagger_package_zip_name" -o ${stanford_tagger_package_zip_name} # wget -nv "https://nlp.stanford.edu/software/$stanford_tagger_package_zip_name" unzip -q ${stanford_tagger_package_zip_name} rm ${stanford_tagger_package_zip_name} mv ${stanford_tagger_package_name} 'stanford-postagger' fi # Download SENNA to ~/third/senna senna_file_name=$(curl -s 'https://ronan.collobert.com/senna/download.html' | grep -o 'senna-v.*.tgz' | head -n1) senna_folder_name='senna' if [[ ! -d $senna_folder_name ]]; then curl -L "https://ronan.collobert.com/senna/$senna_file_name" -o ${senna_file_name} # wget -nv "https://ronan.collobert.com/senna/$senna_file_name" tar -xzf ${senna_file_name} rm ${senna_file_name} fi # Download PROVER9 to ~/third/prover9 prover9_file_name="p9m4-v05.tar.gz" [[ ${prover9_file_name} =~ (.+)\.tar\.gz ]] prover9_folder_name=${BASH_REMATCH[1]} if [[ ! -d ${prover9_folder_name} ]]; then curl -L "https://www.cs.unm.edu/~mccune/prover9/gui/$prover9_file_name" -o ${prover9_file_name} tar -xzf ${prover9_file_name} mv ${prover9_folder_name} 'prover9' rm ${prover9_file_name} fi # Download MEGAM to ~/third/megam megam_file_name="megam_i686.opt.gz" [[ ${megam_file_name} =~ (.+)\.gz ]] megam_folder_name=${BASH_REMATCH[1]} if [[ ! -d ${megam_folder_name} ]]; then curl -L "http://hal3.name/megam/$megam_file_name" -o ${megam_file_name} gunzip -vf ${megam_file_name} mkdir -p "megam" mv ${megam_folder_name} "megam/${megam_folder_name}" chmod -R 711 "megam/$megam_folder_name" fi # TADM requires `libtaopetsc.so` from PETSc v2.3.3, and likely has more # tricky to install requirements, so we don't run tests for it. # Download TADM to ~/third/tadm # tadm_file_name="tadm-0.9.8.tgz" # [[ ${tadm_file_name} =~ (.+)\.tgz ]] # tadm_folder_name=${BASH_REMATCH[1]} # if [[ ! -d ${tadm_folder_name} ]]; then # curl -L "https://master.dl.sourceforge.net/project/tadm/tadm/tadm%200.9.8/$tadm_file_name?viasf=1" -o ${tadm_file_name} # tar -xvzf ${tadm_file_name} # rm ${tadm_file_name} # chmod -R 711 "./tadm/bin/tadm" # fi # Download MaltParser to ~/third/maltparser malt_file_name="maltparser-1.9.2.tar.gz" [[ ${malt_file_name} =~ (.+)\.tar\.gz ]] malt_folder_name=${BASH_REMATCH[1]} if [[ ! 
-d ${malt_folder_name} ]]; then curl -L "http://maltparser.org/dist/$malt_file_name" -o ${malt_file_name} tar -xzf ${malt_file_name} mv ${malt_folder_name} 'maltparser' rm ${malt_file_name} fi ls ~/third popd popd nltk-3.7/tools/global_replace.py000077500000000000000000000032051420073152400170010ustar00rootroot00000000000000#!/usr/bin/env python # # Natural Language Toolkit: substitute a pattern with # a replacement in every file # Copyright (C) 2001-2022 NLTK Project # Author: Edward Loper # Steven Bird # URL: # For license information, see LICENSE.TXT # NB Should work on all platforms, # http://www.python.org/doc/2.5.2/lib/os-file-dir.html import os import stat import sys def update(file, pattern, replacement): try: # make sure we can write the file old_perm = os.stat(file)[0] if not os.access(file, os.W_OK): os.chmod(file, old_perm | stat.S_IWRITE) # write the file s = open(file, "rb").read().decode("utf-8") t = s.replace(pattern, replacement) out = open(file, "wb") out.write(t.encode("utf-8")) out.close() # restore permissions os.chmod(file, old_perm) return s != t except Exception: exc_type, exc_obj, exc_tb = sys.exc_info() print(f"Unable to check {file:s} {str(exc_type):s}") return 0 if __name__ == "__main__": if len(sys.argv) != 3: exit("Usage: %s " % sys.argv[0]) pattern = sys.argv[1] replacement = sys.argv[2] count = 0 for root, dirs, files in os.walk("."): if not ("/.git" in root or "/.tox" in root): for file in files: path = os.path.join(root, file) if update(path, pattern, replacement): print("Updated:", path) count += 1 print(f"Updated {count} files") nltk-3.7/tools/jenkins/000077500000000000000000000000001420073152400151325ustar00rootroot00000000000000nltk-3.7/tools/jenkins/pre-install.sh000066400000000000000000000026271420073152400177270ustar00rootroot00000000000000#!/bin/bash # This install script is pre-installation environment setup for old jenkins CI: # https://nltk.ci.cloudbees.com/job/pull_request_tests/configure set -e # Exit immediately if a command exits with a non-zero status. set -x # Print all executed commands to the terminal. # The following are from Jenkins "Build Environment > Properties Content": # https://nltk.ci.cloudbees.com/job/pull_request_tests/configure PATH=$PATH:/scratch/jenkins/python/python-3.5.4-x86_64/bin PATH=$PATH:/scratch/jenkins/python/python-3.6.4-x86_64/bin PATH=$PATH:/opt/local/bin:/opt/local/sbin PATH=$PATH:/usr/bin:/bin:/usr/sbin PATH=$PATH:/s # Heh? Not sure why does this exist in CI?? Leaving it here first. PATH=$PATH:/usr/local/bin JAVA_HOME=/opt/jdk/jdk8.latest/bin BLAS=/home/jenkins/lib/ LAPACK=/home/jenkins/lib/ # Checking Java and Python version. java -version python --version # More Jenkins related commands from "Build Environment > Script Content": # https://nltk.ci.cloudbees.com/job/pull_request_tests/configure curl -s -o use-python https://repository-cloudbees.forge.cloudbees.com/distributions/ci-addons/python/use-python chmod u+x use-python export PYTHON_VERSION=${PYV} . 
./use-python mkdir -p /home/jenkins/lib [ -f /home/jenkins/lib/libblas.so ] || ln -sf /usr/lib64/libblas.so.3 /home/jenkins/lib/libblas.so [ -f /home/jenkins/lib/liblapack.so ] || ln -sf /usr/lib64/liblapack.so.3 /home/jenkins/lib/liblapack.so nltk-3.7/tools/nltk_term_index.py000077500000000000000000000061721420073152400172420ustar00rootroot00000000000000import re import sys import epydoc.cli import epydoc.docbuilder from epydoc import log import nltk STOPLIST = "../../tools/nltk_term_index.stoplist" FILENAMES = ["ch%02d.xml" % n for n in range(13)] TARGET_DIR = "nlp/" # FILENAMES = ['../doc/book/ll.xml'] logger = epydoc.cli.ConsoleLogger(0) logger._verbosity = 5 log.register_logger(logger) def find_all_names(stoplist): ROOT = ["nltk"] logger._verbosity = 0 docindex = epydoc.docbuilder.build_doc_index(ROOT, add_submodules=True) valdocs = sorted( docindex.reachable_valdocs( imports=False, # packages=False, bases=False, submodules=False, # subclasses=False, private=False, ) ) logger._verbosity = 5 names = nltk.defaultdict(list) n = 0 for valdoc in valdocs: name = valdoc.canonical_name if name is not epydoc.apidoc.UNKNOWN and name is not None and name[0] == "nltk": n += 1 for i in range(len(name)): key = str(name[i:]) if len(key) == 1: continue if key in stoplist: continue names[key].append(valdoc) log.info(f"Found {len(names)} names from {n} objects") return names SCAN_RE1 = r"[\s\S]*?" SCAN_RE2 = r"[\s\S]*?" SCAN_RE = re.compile(f"({SCAN_RE1})|({SCAN_RE2})") TOKEN_RE = re.compile(r"[\w\.]+") LINE_RE = re.compile(".*") INDEXTERM = '%s' def scan_xml(filenames, names): fdist = nltk.FreqDist() def linesub(match): line = match.group() for token in TOKEN_RE.findall(line): if token in names: targets = names[token] fdist.inc(token) if len(targets) > 1: log.warning( "{} is ambiguous: {}".format( token, ", ".join(str(v.canonical_name) for v in names[token]), ) ) line += INDEXTERM % token # line += INDEXTERM % names[token][0].canonical_name return line def scansub(match): return LINE_RE.sub(linesub, match.group()) for filename in filenames: log.info(f" {filename}") src = open(filename, "rb").read() src = SCAN_RE.sub(scansub, src) # out = open(filename[:-4]+'.li.xml', 'wb') out = open(TARGET_DIR + filename, "wb") out.write(src) out.close() for word in fdist: namestr = ("\n" + 38 * " ").join( [str(v.canonical_name[:-1]) for v in names[word][:1]] ) print("[%3d] %-30s %s" % (fdist[word], word, namestr)) sys.stdout.flush() def main(): log.info("Loading stoplist...") stoplist = open(STOPLIST).read().split() log.info(f" Stoplist contains {len(stoplist)} words") log.info("Running epydoc to build a name index...") names = find_all_names(stoplist) log.info("Scanning xml files...") scan_xml(FILENAMES, names) main() nltk-3.7/tools/nltk_term_index.stoplist000066400000000000000000000012101420073152400204540ustar00rootroot00000000000000__init__ Comment Plot about add all analysis args book bubble categories close concatenate contains copy coverage defaultdict demo describe dict discourse doctype documents dump end ends fileids files find first free goal groups help incorrect insert instances items join key labels lhs line lines list lookup matches max means min missed name next nltk nltk.book open pairs play plot pop pos pp pprint prev process purge put quick raw read reader readings readme repr rhs root run second see select sentences sents set simple size sorted span start step stop str table test text texts trace type update verbs view vocab walk wav width words write 
nltk-3.7/tools/run_doctests.py000077500000000000000000000006521420073152400165650ustar00rootroot00000000000000#!/usr/bin/python3 """ run doctests """ import os import subprocess import sys for root, dirs, filenames in os.walk("."): for filename in filenames: if filename.endswith(".py"): path = os.path.join(root, filename) for pyver in ["python3.5", "python3.6", "python3.7"]: print(pyver, filename, file=sys.stderr) subprocess.call([pyver, "-m", "doctest", path]) nltk-3.7/tools/svnmime.py000077500000000000000000000024551420073152400155320ustar00rootroot00000000000000#!/usr/bin/env python # NB, this wouldn't be needed if everyone had .subversion/config # configured to automatically set mime types # http://code.google.com/p/support/wiki/FAQ import os import sys types_map = { "ai": "application/postscript", "coverage": "text/plain", "css": "text/css", "eps": "application/postscript", "exe": "application/octet-stream", "errs": "text/plain", "gif": "image/gif", "htm": "text/html", "html": "text/html", "jpeg": "image/jpeg", "jpg": "image/jpeg", "js": "application/x-javascript", "pbm": "image/x-portable-bitmap", "pdf": "application/pdf", "pgm": "image/x-portable-graymap", "pnm": "image/x-portable-anymap", "png": "image/png", "ppm": "image/x-portable-pixmap", "py": "text/x-python", "ps": "application/postscript", "rst": "text/plain", "tex": "application/x-tex", "txt": "text/plain", "xml": "text/xml", "xsl": "text/plain", "zip": "application/zip", } def usage(): exit("Usage: svnmime files") for file in sys.argv[1:]: if "." in file: extension = file.rsplit(".", 1)[1] if extension in types_map: os.system(f"svn propset svn:mime-type {types_map[extension]} {file}") else: print("Unrecognized extension", extension) nltk-3.7/tools/travis/000077500000000000000000000000001420073152400150015ustar00rootroot00000000000000nltk-3.7/tools/travis/coverage-pylint.sh000066400000000000000000000011731420073152400204470ustar00rootroot00000000000000#!/usr/bin/env bash set -e # Exit immediately if a command exits with a non-zero status. set -x # Print all executed commands to the terminal. # Paranoid checks. # Checking Java and Python version. java -version python --version # Which Python / pip which python which pip pip -V echo "$(pwd)" # Know which directory tox is running this shell from. #coverage rm -f coverage_scrubbed.xml pytest --cov=nltk --cov-report xml iconv -c -f utf-8 -t utf-8 coverage.xml > coverage_scrubbed.xml # Create a default pylint configuration file. ##touch $HOME/.pylintrc ##pylint -f parseable nltk > pylintoutput #script always succeeds true nltk-3.7/tools/travis/install.sh000066400000000000000000000006401420073152400170030ustar00rootroot00000000000000#!/bin/bash # This install script is used by the "install" step defined in travis.yml # See https://docs.travis-ci.com/user/installing-dependencies/ # Install the requirements. pip install --upgrade -r pip-req.txt pip install --upgrade https://github.com/PyCQA/pylint/archive/master.zip #download nltk data packages python -c "import nltk; nltk.download('all', force=True)" || echo "NLTK data download failed: $?" nltk-3.7/tools/travis/pre-install.sh000066400000000000000000000006011420073152400175640ustar00rootroot00000000000000#!/bin/bash # This install script is used by the "install" step defined in travis.yml # See https://docs.travis-ci.com/user/installing-dependencies/ set -x # Print all executed commands to the terminal. # Set JAVA env variable. JAVA_HOME=/opt/jdk/jdk8.latest/bin # Checking Java and Python version. 
java -version python --version # Which Python / pip which python which pip pip -V nltk-3.7/tools/travis/third-party.sh000066400000000000000000000057471420073152400176210ustar00rootroot00000000000000#!/bin/bash # This install script is used by the "install" step defined in travis.yml # See https://docs.travis-ci.com/user/installing-dependencies/ # Installing the third-party software and the appropriate env variables. pushd ${HOME} [[ ! -d 'third' ]] && mkdir 'third' pushd 'third' # Download nltk stanford dependencies #stanford_corenlp_package_zip_name=$(curl -s 'https://stanfordnlp.github.io/CoreNLP/' | grep -o 'stanford-corenlp-full-.*\.zip' | head -n1) stanford_corenlp_package_zip_name="stanford-corenlp-full-2017-06-09.zip" [[ ${stanford_corenlp_package_zip_name} =~ (.+)\.zip ]] stanford_corenlp_package_name=${BASH_REMATCH[1]} if [[ ! -d ${stanford_corenlp_package_name} ]]; then wget -nv "https://nlp.stanford.edu/software/$stanford_corenlp_package_zip_name" unzip ${stanford_corenlp_package_zip_name} rm ${stanford_corenlp_package_zip_name} ln -sf ${stanford_corenlp_package_name} 'stanford-corenlp' fi #stanford_parser_package_zip_name=$(curl -s 'https://nlp.stanford.edu/software/lex-parser.shtml' | grep -o 'stanford-parser-full-.*\.zip' | head -n1) stanford_parser_package_zip_name="stanford-parser-full-2017-06-09.zip" [[ ${stanford_parser_package_zip_name} =~ (.+)\.zip ]] stanford_parser_package_name=${BASH_REMATCH[1]} if [[ ! -d ${stanford_parser_package_name} ]]; then wget -nv "https://nlp.stanford.edu/software/$stanford_parser_package_zip_name" unzip ${stanford_parser_package_zip_name} rm ${stanford_parser_package_zip_name} ln -sf ${stanford_parser_package_name} 'stanford-parser' fi #stanford_tagger_package_zip_name=$(curl -s 'https://nlp.stanford.edu/software/tagger.shtml' | grep -o 'stanford-postagger-full-.*\.zip' | head -n1) stanford_tagger_package_zip_name="stanford-postagger-full-2017-06-09.zip" [[ ${stanford_tagger_package_zip_name} =~ (.+)\.zip ]] stanford_tagger_package_name=${BASH_REMATCH[1]} if [[ ! -d ${stanford_tagger_package_name} ]]; then wget -nv "https://nlp.stanford.edu/software/$stanford_tagger_package_zip_name" unzip ${stanford_tagger_package_zip_name} rm ${stanford_tagger_package_zip_name} ln -sf ${stanford_tagger_package_name} 'stanford-postagger' fi # Download SENNA senna_file_name=$(curl -s 'https://ronan.collobert.com/senna/download.html' | grep -o 'senna-v.*.tgz' | head -n1) senna_folder_name='senna' if [[ ! 
-d $senna_folder_name ]]; then wget -nv "https://ronan.collobert.com/senna/$senna_file_name" tar -xvzf ${senna_file_name} rm ${senna_file_name} fi # Setup the Environment variable export CLASSPATH=$(pwd)"/${stanford_corenlp_package_name}" export CLASSPATH=${CLASSPATH}:$(pwd)"/${stanford_parser_package_name}" export CLASSPATH=${CLASSPATH}:$(pwd)"/${stanford_tagger_package_name}" export STANFORD_CORENLP=$(pwd)'/stanford-corenlp' export STANFORD_PARSER=$(pwd)'/stanford-parser' export STANFORD_MODELS=$(pwd)'/stanford-postagger/models' export STANFORD_POSTAGGER=$(pwd)'/stanford-postagger' export SENNA=$(pwd)'/senna' popd popd echo "---- CLASSPATH: ----" echo $CLASSPATH echo "---- MODELS: ----" echo $STANFORD_MODELS nltk-3.7/tools/travis/travis-ci-commenter.sh000066400000000000000000000004001420073152400212170ustar00rootroot00000000000000#!/bin/bash if [ "$TRAVIS_PULL_REQUEST" != "false" ] ; then curl -H "Authorization: token ${GITHUB_TOKEN}" -X POST \ -d "{\"body\": \"[CI: retest]\"}" \ "https://api.github.com/repos/${TRAVIS_REPO_SLUG}/issues/${TRAVIS_PULL_REQUEST}/comments" nltk-3.7/tox.ini000066400000000000000000000043071420073152400136500ustar00rootroot00000000000000[tox] envlist = py{37,38,39,310} pypy py{37,38,39,310}-nodeps py{37,38,39,310}-jenkins py-travis [testenv] ; simplify numpy installation setenv = LAPACK= ATLAS=None PYTHONWARNINGS=ignore ; Copy all environment variables to the tox test environment passenv = * deps = numpy text-unidecode twython pyparsing pytest pytest-cov pytest-mock python-crfsuite regex click joblib tqdm matplotlib changedir = nltk/test commands = ; scipy and scikit-learn requires numpy even to run setup.py so ; they can't be installed in one command pip install scipy scikit-learn ; pytest --cov=nltk --cov-report html:{envdir}/docs nltk/test/ pytest [testenv:pypy] ; numpy is bundled with pypy; coverage is extra slow and ; the coverage results are not that different from CPython. deps = pytest pytest-mock twython commands = pytest [testenv:py37-nodeps] basepython = python3.7 deps = pytest pytest-mock commands = pytest [testenv:py38-nodeps] basepython = python3.8 deps = pytest pytest-mock commands = pytest [testenv:py39-nodeps] basepython = python3.9 deps = pytest pytest-mock commands = pytest [testenv:py310-nodeps] basepython = python3.10 deps = pytest pytest-mock commands = pytest # Use minor version agnostic basepython, but specify testenv # control Python2/3 versions using jenkins' user-defined matrix instead. # Available Python versions: http://repository-cloudbees.forge.cloudbees.com/distributions/ci-addons/python/fc25/ [testenv:py-travis] extras = all setenv = NLTK_DATA = {homedir}/nltk_data/ commands = {toxinidir}/tools/travis/coverage-pylint.sh [testenv:py-travis-third-party] extras = all setenv = STANFORD_MODELS = {homedir}/third/stanford-parser/ STANFORD_PARSER = {homedir}/third/stanford-parser/ STANFORD_POSTAGGER = {homedir}/third/stanford-postagger/ NLTK_DATA = {homedir}/nltk_data/ commands = {toxinidir}/tools/travis/third-party.sh {toxinidir}/tools/travis/coverage-pylint.sh [testenv:py3-runtime-check] ; nltk should be runnable in an env with nothing installed basepython = python3 deps = commands = python -c "import nltk" [isort] profile=black nltk-3.7/web/000077500000000000000000000000001420073152400131065ustar00rootroot00000000000000nltk-3.7/web/Makefile000066400000000000000000000131671420073152400145560ustar00rootroot00000000000000# Makefile for Sphinx documentation # # You can set these variables from the command line. 
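# For example (illustrative invocations only):
#
#     make web SPHINXOPTS=-q
#     make latexpdf PAPER=a4
#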
SPHINXOPTS = SPHINXBUILD = sphinx-build PAPER = BUILDDIR = _build WEB = ../../nltk.github.com # Internal variables. PAPEROPT_a4 = -D latex_paper_size=a4 PAPEROPT_letter = -D latex_paper_size=letter ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . # the i18n builder cannot share the environment and doctrees with the others I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext web: clean_api sphinx-apidoc -o api ../nltk $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(WEB) @echo @echo "Build finished. The HTML pages are in $(WEB)." without_api: clean_api $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(WEB) clean_api: rm -f api/modules.rst api/nltk.*.rst help: @echo "Please use \`make ' where is one of" @echo " html to make standalone HTML files" @echo " dirhtml to make HTML files named index.html in directories" @echo " singlehtml to make a single large HTML file" @echo " pickle to make pickle files" @echo " json to make JSON files" @echo " htmlhelp to make HTML files and a HTML help project" @echo " qthelp to make HTML files and a qthelp project" @echo " devhelp to make HTML files and a Devhelp project" @echo " epub to make an epub" @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" @echo " latexpdf to make LaTeX files and run them through pdflatex" @echo " text to make text files" @echo " man to make manual pages" @echo " texinfo to make Texinfo files" @echo " info to make Texinfo files and run them through makeinfo" @echo " gettext to make PO message catalogs" @echo " changes to make an overview of all changed/added/deprecated items" @echo " linkcheck to check all external links for integrity" @echo " doctest to run all doctests embedded in the documentation (if enabled)" clean: -rm -rf $(BUILDDIR)/* # $(WEB)/* dirhtml: $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." singlehtml: $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml @echo @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." pickle: $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle @echo @echo "Build finished; now you can process the pickle files." json: $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json @echo @echo "Build finished; now you can process the JSON files." htmlhelp: $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp @echo @echo "Build finished; now you can run HTML Help Workshop with the" \ ".hhp project file in $(BUILDDIR)/htmlhelp." qthelp: $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp @echo @echo "Build finished; now you can run "qcollectiongenerator" with the" \ ".qhcp project file in $(BUILDDIR)/qthelp, like this:" @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/NLTK.qhcp" @echo "To view the help file:" @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/NLTK.qhc" devhelp: $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp @echo @echo "Build finished." @echo "To view the help file:" @echo "# mkdir -p $$HOME/.local/share/devhelp/NLTK" @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/NLTK" @echo "# devhelp" epub: $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub @echo @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 
latex: $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." @echo "Run \`make' in that directory to run these through (pdf)latex" \ "(use \`make latexpdf' here to do that automatically)." latexpdf: $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo "Running LaTeX files through pdflatex..." $(MAKE) -C $(BUILDDIR)/latex all-pdf @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." text: $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text @echo @echo "Build finished. The text files are in $(BUILDDIR)/text." man: $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man @echo @echo "Build finished. The manual pages are in $(BUILDDIR)/man." texinfo: $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo @echo @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." @echo "Run \`make' in that directory to run these through makeinfo" \ "(use \`make info' here to do that automatically)." info: $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo @echo "Running Texinfo files through makeinfo..." make -C $(BUILDDIR)/texinfo info @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." gettext: $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale @echo @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." changes: $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes @echo @echo "The overview file is in $(BUILDDIR)/changes." linkcheck: $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck @echo @echo "Link check complete; look for any errors in the above output " \ "or in $(BUILDDIR)/linkcheck/output.txt." doctest: $(SPHINXBUILD) -b nltk-doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest @echo "Testing of doctests in the sources finished, look at the " \ "results in $(BUILDDIR)/doctest/output.txt." nltk-3.7/web/_static/000077500000000000000000000000001420073152400145345ustar00rootroot00000000000000nltk-3.7/web/_static/css/000077500000000000000000000000001420073152400153245ustar00rootroot00000000000000nltk-3.7/web/_static/css/team.css000066400000000000000000000013011420073152400167570ustar00rootroot00000000000000.member-card { background-color: #E6E6E6; margin-bottom: 2.5rem; } .member-header { background-color: #2D2D2D; color: #EBEBEB; padding: 0.5em; font-weight: 500; } .member-header:hover .headerlink { display: inline-block; } .member-body { display: flex; /* Padding on all sides except the bottom, as the right-panel takes care of this padding */ padding: 1em 1em 0em 1em; } .member-left-panel { /* This panel only exists if there is an image to fill it. 
*/ padding-right: 1em; } .member-right-panel { /* This panel extends as far as it can */ flex: 1; /* padding-left: 1em; */ } .github-link { font-weight: 200; float: right; } nltk-3.7/web/_static/images/000077500000000000000000000000001420073152400160015ustar00rootroot00000000000000nltk-3.7/web/_static/images/book.gif000066400000000000000000000160021420073152400174210ustar00rootroot00000000000000[binary GIF image data omitted]
nltk-3.7/web/_static/images/tree.gif000066400000000000000000000340411420073152400174310ustar00rootroot00000000000000[binary GIF image data omitted]
nltk-3.7/web/_templates/000077500000000000000000000000001420073152400152435ustar00rootroot00000000000000nltk-3.7/web/_templates/doctest.rst000066400000000000000000000010471420073152400174440ustar00rootroot00000000000000{# The :autogenerated: tag is picked up by breadcrumbs.html to suppress "Edit on Github" link #} :autogenerated: .. This file is autogenerated by `generate_custom_files` in conf.py, and ought to be generated via building the documentation through Sphinx, e.g. with `sphinx-build ./web ./build` from the root directory. {% for item in range(17 + module_name|length) -%}#{%- endfor %} Sample usage for {{ module_name }} {% for item in range(17 + module_name|length) -%}#{%- endfor %} .. include:: ../../nltk/test/{{ module_name }}.doctest nltk-3.7/web/_templates/module.rst000066400000000000000000000010661420073152400172650ustar00rootroot00000000000000{# The :autogenerated: tag is picked up by breadcrumbs.html to suppress "Edit on Github" link #} :autogenerated: .. This file is autogenerated by `generate_custom_files` in conf.py, and ought to be generated via building the documentation through Sphinx, e.g. with `sphinx-build ./web ./build` from the root directory. {{ fullname }} module {% for item in range(7 + fullname|length) -%}={%- endfor %} .. currentmodule:: {{ fullname }} .. automodule:: {{ fullname }} :members: :undoc-members: :show-inheritance: :member-order: bysource nltk-3.7/web/_templates/package.rst000066400000000000000000000015721420073152400173750ustar00rootroot00000000000000{# The :autogenerated: tag is picked up by breadcrumbs.html to suppress "Edit on Github" link #} :autogenerated: .. This file is autogenerated by `generate_custom_files` in conf.py, and ought to be generated via building the documentation through Sphinx, e.g. with `sphinx-build ./web ./build` from the root directory. {{ fullname }} package {% for item in range(8 + fullname|length) -%}={%- endfor %} .. automodule:: {{ fullname }} :members: :undoc-members: :show-inheritance: {% if subpackages %} *********** Subpackages *********** .. toctree:: :maxdepth: 1 {% for item in subpackages %} {{ fullname }}.{{ item }} {%- endfor %} {%- endif %} {% if submodules %} ********** Submodules ********** ..
toctree:: :maxdepth: 1 {% for item in submodules %} {{ fullname }}.{{ item }} {%- endfor %} {%- endif -%} nltk-3.7/web/_templates/team.html000066400000000000000000000040221420073152400170550ustar00rootroot00000000000000 {% for member in members -%} {% endfor %} nltk-3.7/web/api/000077500000000000000000000000001420073152400136575ustar00rootroot00000000000000nltk-3.7/web/api/.gitkeep000066400000000000000000000001051420073152400153040ustar00rootroot00000000000000# This file exists to keep this directory from being removed by git. nltk-3.7/web/conf.py000066400000000000000000000275041420073152400144150ustar00rootroot00000000000000# # NLTK documentation build configuration file, created by # sphinx-quickstart on Wed Nov 2 17:02:59 2011. # # This file is execfile()d with the current directory set to its containing dir. # # Note that not all possible configuration values are present in this # autogenerated file. # # All configuration values have a default; values that are commented out # serve to show the default. import os import sys # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # build docs using nltk from the upper dir, not the installed version sys.path.insert(0, os.path.abspath("..")) # -- General configuration ----------------------------------------------------- # If your documentation needs a minimal Sphinx version, state it here. # needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be extensions # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. extensions = [ "sphinx.ext.autodoc", "sphinx.ext.coverage", "sphinx.ext.imgmath", "sphinx.ext.viewcode", ] def run_apidoc(app): """Generage API documentation""" import better_apidoc better_apidoc.APP = app better_apidoc.main( [ "better-apidoc", "-t", os.path.join(".", "web", "_templates"), "--force", "--separate", "-o", os.path.join(".", "web", "api"), os.path.join(".", "nltk"), ] ) def generate_custom_files(): """Generating contents in the ``howto`` folder, based on the ``ntlk/test/*.doctest`` files, as well as contents in the ``team`` folder, based on ``team.json``. """ import glob import json import re from jinja2 import Template modules = [] web_folder = os.path.dirname(os.path.abspath(__file__)) howto_folder = os.path.join(web_folder, "howto") if not os.path.exists(howto_folder): os.makedirs(howto_folder) # Load jinja template with open( os.path.join(web_folder, "_templates", "doctest.rst"), encoding="utf8" ) as f: doctest_template = Template(f.read()) print("Generating HOWTO pages...") # Iterate over .doctest files, and find the module_name. pattern = re.compile(r"(\w+)\.doctest$") for path in glob.glob(os.path.join(web_folder, "..", "nltk", "test", "*.doctest")): match = pattern.search(path) module_name = match.group(1) # Ignore index.doctest, we already have an index, i.e. howto.rst if module_name == "index": continue # Write .rst files based on the doctest_template. 
doctest_template.stream(module_name=module_name).dump( os.path.join(howto_folder, f"{module_name}.rst") ) modules.append(module_name) print(f"Generated {len(modules)} HOWTO pages.") # Load the team JSON data with open(os.path.join(web_folder, "team", "team.json"), encoding="utf8") as f: full_data = json.load(f) print("Team data loaded!") # Load the team jinja template with open( os.path.join(web_folder, "_templates", "team.html"), encoding="utf8" ) as f: team_template = Template(f.read()) for members_type, members_data in full_data.items(): team_template.stream(members=members_data).dump( os.path.join(web_folder, "team", f"{members_type}_team.html") ) print(f"{members_type.title()} team HTML page written!") # Build the Team & HOWTO page before creating the Sphinx build generate_custom_files() # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] # The suffix of source filenames. source_suffix = ".rst" # The encoding of source files. # source_encoding = 'utf-8-sig' # The master toctree document. master_doc = "index" # General information about the project. project = "NLTK" copyright = "2022, NLTK Project" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. version = "3.7" # The full version, including alpha/beta/rc tags. release = "3.7" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. # language = None # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: # today = '' # Else, today_fmt is used as the format for a strftime call. # today_fmt = '%B %d, %Y' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. exclude_patterns = ["_build", "api/modules.rst", "dev/*.rst"] # The reST default role (used for this markup: `text`) to use for all documents. # default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. # add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). # add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. # show_authors = False # The name of the Pygments (syntax highlighting) style to use. pygments_style = "sphinx" # A list of ignored prefixes for module index sorting. modindex_common_prefix = ["nltk."] # -- Options for HTML output --------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. html_theme = "nltk_theme" def setup(app): app.connect("builder-inited", run_apidoc) # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. html_theme_options = {"navigation_depth": 1} # Required for the theme, used for linking to a specific tag in the website footer html_context = {"github_user": "nltk", "github_repo": "nltk"} # Add any paths that contain custom themes here, relative to this directory. # html_theme_path = [] # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". # html_title = None # A shorter title for the navigation bar. Default is the same as html_title. 
# html_short_title = None # The name of an image file (relative to this directory) to place at the top # of the sidebar. # html_logo = None # The name of an image file (within the static path) to use as favicon of the # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. # html_favicon = None # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ["_static"] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. html_last_updated_fmt = "%b %d, %Y" # html_last_updated_fmt = "%d %b %Y" # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. html_use_smartypants = True # Custom sidebar templates, maps document names to template names. # html_sidebars = {} # Additional templates that should be rendered to pages, maps page names to # template names. # html_additional_pages = {} # If false, no module index is generated. html_domain_indices = True # If false, no index is generated. # We don't use the genindex. html_use_index = False # If true, the index is split into individual pages for each letter. # html_split_index = False # If true, links to the reST sources are added to the pages. # html_show_sourcelink = True # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. # html_show_sphinx = True # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. # html_show_copyright = True # If true, an OpenSearch description file will be output, and all pages will # contain a tag referring to it. The value of this option must be the # base URL from which the finished HTML is served. # html_use_opensearch = '' # This is the file name suffix for HTML files (e.g. ".xhtml"). # html_file_suffix = None # Output file base name for HTML help builder. htmlhelp_basename = "NLTKdoc" # -- Options for LaTeX output -------------------------------------------------- latex_elements = { # The paper size ('letterpaper' or 'a4paper'). #'papersize': 'letterpaper', # The font size ('10pt', '11pt' or '12pt'). #'pointsize': '10pt', # Additional stuff for the LaTeX preamble. #'preamble': '', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, author, documentclass [howto/manual]). latex_documents = [("index", "NLTK.tex", "NLTK Documentation", "Steven Bird", "manual")] # The name of an image file (relative to this directory) to place at the top of # the title page. # latex_logo = None # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. # latex_use_parts = False # If true, show page references after internal links. # latex_show_pagerefs = False # If true, show URL addresses after external links. # latex_show_urls = False # Documents to append as an appendix to all manuals. # latex_appendices = [] # If false, no module index is generated. # latex_domain_indices = True # -- Options for manual page output -------------------------------------------- # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [("index", "nltk", "NLTK Documentation", ["Steven Bird"], 1)] # If true, show URL addresses after external links. 
# man_show_urls = False # -- Options for Texinfo output ------------------------------------------------ # Grouping the document tree into Texinfo files. List of tuples # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ ( "index", "NLTK", "NLTK Documentation", "Steven Bird", "NLTK", "One line description of project.", "Miscellaneous", ) ] # Documents to append as an appendix to all manuals. # texinfo_appendices = [] # If false, no module index is generated. # texinfo_domain_indices = True # How to display URL addresses: 'footnote', 'no', or 'inline'. # texinfo_show_urls = 'footnote' # -- Options for Autodoc output ------------------------------------------------ # If it's "mixed", then the documentation for each parameter isn't listed # e.g. nltk.tokenize.casual.TweetTokenizer(preserve_case=True, reduce_len=False, strip_handles=False, match_phone_numbers=True) # and that's it. # With "seperated": # nltk.tokenize.casual.TweetTokenizer # ... # __init__(preserve_case=True, reduce_len=False, strip_handles=False, match_phone_numbers=True) # Create a TweetTokenizer instance with settings for use in the tokenize method. # Parameters # preserve_case (bool) – Flag indicating whether to preserve the casing (capitalisation) of text used in the tokenize method. Defaults to True. # reduce_len (bool) – Flag indicating whether to replace repeated character sequences of length 3 or greater with sequences of length 3. Defaults to False. # strip_handles (bool) – Flag indicating whether to remove Twitter handles of text used in the tokenize method. Defaults to False. # match_phone_numbers (bool) – Flag indicating whether the tokenize method should look for phone numbers. Defaults to True. autodoc_class_signature = "separated" # Put the Python 3.5+ type hint in the signature and also at the Parameters list autodoc_typehints = "both" nltk-3.7/web/contribute.rst000066400000000000000000000013011420073152400160110ustar00rootroot00000000000000Contributing to NLTK ==================== The Natural Language Toolkit exists thanks to the efforts of dozens of voluntary developers who have contributed functionality and bugfixes since the project began in 2000 (`contributors `_). Information for contributors: * `contributing to NLTK `_ * `desired enhancements `_ * `contribute a corpus `_ * `nltk-dev mailing list `_ * `GitHub Project `_ nltk-3.7/web/data.rst000066400000000000000000000077071420073152400145640ustar00rootroot00000000000000Installing NLTK Data ==================== NLTK comes with many corpora, toy grammars, trained models, etc. A complete list is posted at: https://www.nltk.org/nltk_data/ To install the data, first install NLTK (see https://www.nltk.org/install.html), then use NLTK's data downloader as described below. Apart from individual data packages, you can download the entire collection (using "all"), or just the data required for the examples and exercises in the book (using "book"), or just the corpora and no grammars or trained models (using "all-corpora"). Interactive installer --------------------- *For central installation on a multi-user machine, do the following from an administrator account.* Run the Python interpreter and type the commands: >>> import nltk >>> nltk.download() A new window should open, showing the NLTK Downloader. Click on the File menu and select Change Download Directory. For central installation, set this to ``C:\nltk_data`` (Windows), ``/usr/local/share/nltk_data`` (Mac), or ``/usr/share/nltk_data`` (Unix). 
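If you prefer to script this step rather than use the downloader GUI, the same download can be done programmatically. The snippet below is a minimal sketch only: it assumes the Unix central location mentioned above (substitute the path for your platform), uses the Brown Corpus purely as an example package, and the ``download_dir`` argument plays the same role as the ``-d`` flag described under command line installation:

    >>> import nltk
    >>> # Requires write permission for the chosen directory.
    >>> nltk.download('brown', download_dir='/usr/local/share/nltk_data')  # doctest: +SKIP
    True
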
Next, select the packages or collections you want to download. If you did not install the data to one of the above central locations, you will need to set the ``NLTK_DATA`` environment variable to specify the location of the data. (On a Windows machine, right click on "My Computer" then select ``Properties > Advanced > Environment Variables > User Variables > New...``) Test that the data has been installed as follows. (This assumes you downloaded the Brown Corpus): >>> from nltk.corpus import brown >>> brown.words() ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...] Installing via a proxy web server ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ If your web connection uses a proxy server, you should specify the proxy address as follows. In the case of an authenticating proxy, specify a username and password. If the proxy is set to None then this function will attempt to detect the system proxy. >>> nltk.set_proxy('http://proxy.example.com:3128', ('USERNAME', 'PASSWORD')) >>> nltk.download() Command line installation ------------------------- The downloader will search for an existing ``nltk_data`` directory to install NLTK data. If one does not exist it will attempt to create one in a central location (when using an administrator account) or otherwise in the user's filespace. If necessary, run the download command from an administrator account, or using sudo. The recommended system location is ``C:\nltk_data`` (Windows); ``/usr/local/share/nltk_data`` (Mac); and ``/usr/share/nltk_data`` (Unix). You can use the ``-d`` flag to specify a different location (but if you do this, be sure to set the ``NLTK_DATA`` environment variable accordingly). Run the command ``python -m nltk.downloader all``. To ensure central installation, run the command ``sudo python -m nltk.downloader -d /usr/local/share/nltk_data all``. Windows: Use the "Run..." option on the Start menu. Windows Vista users need to first turn on this option, using ``Start -> Properties -> Customize`` to check the box to activate the "Run..." option. Test the installation: Check that the user environment and privileges are set correctly by logging in to a user account, starting the Python interpreter, and accessing the Brown Corpus (see the previous section). Manual installation ------------------- Create a folder ``nltk_data``, e.g. ``C:\nltk_data``, or ``/usr/local/share/nltk_data``, and subfolders ``chunkers``, ``grammars``, ``misc``, ``sentiment``, ``taggers``, ``corpora``, ``help``, ``models``, ``stemmers``, ``tokenizers``. Download individual packages from ``https://www.nltk.org/nltk_data/`` (see the "download" links). Unzip them to the appropriate subfolder. For example, the Brown Corpus, found at: ``https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/brown.zip`` is to be unzipped to ``nltk_data/corpora/brown``. Set your ``NLTK_DATA`` environment variable to point to your top level ``nltk_data`` folder. nltk-3.7/web/dev/000077500000000000000000000000001420073152400136645ustar00rootroot00000000000000nltk-3.7/web/dev/jenkins.rst000066400000000000000000000100501420073152400160530ustar00rootroot00000000000000NLTK c-i setup ============== This is an overview of how our `continuous integration`_ setup works. It includes a quick introduction to the tasks it runs, and the later sections detail the process of setting up these tasks. Our continuous integration is currently hosted at `Shining Panda`_, free thanks to their FLOSS program. 
The setup is not specific to their solutions, it could be moved to any `Jenkins`_ instance. The URL of our current instance is https://jenkins.shiningpanda.com/nltk/ .. _`continuous integration`: https://en.wikipedia.org/wiki/Continuous_integration .. _`Shining Panda`: http://shiningpanda.com .. _`Jenkins`: https://jenkins-ci.org Base tasks ---------- The base tasks of the c-i instance is as follows: * Check out the NLTK project when VCS changes occur * Build the project using setup.py * Run our test suite * Make packages for all platforms * Build these web pages Because the NLTK build environment is highly customized, we only run tests on one configuration - the lowest version supported. NLTK 2 supports python down to version 2.5, so all tests are run using a python2.5 virtualenv. The virtualenv configuration is slightly simplified on ShiningPanda machines by their having compiled all relevant python versions and making virtualenv use these versions in their custom virtualenv builders. VCS setup/integration --------------------- All operations are done against the `NLTK repos on Github`_. The Jenkins instance on ShiningPanda has a limit to the build time it can use each day. Because of this, it only polls the main NLTK repo once a day, using the `Poll SCM` option in Jenkins. Against the main code repo it uses public access only, and for pushing to the nltk.github.com repo it uses the key of the user nltk-webdeploy. .. _`NLTK repos on Github`: https://github.com/nltk/ The base build -------------- To build the project, the following tasks are run: 1. Create a VERSION file A VERSION file is created using ``git describe --tags --match '*.*.*' > nltk/VERSION``. This makes the most recent VCS tag available in nltk.__version__ etc. 2. ``python setup.py build`` This essentially copies the files that are required to run NLTK into build/ The test suite -------------- The tests require that all dependencies be installed. These have all been installed beforehand, and to make them run a series of extra environment variables are initialized. These dependencies will not be detailed until the last section. The test suite itself consists of doctests and unittests. Doctests are found in each module as docstrings, and in all the .doctest files under the test folder in the nltk repo. We run these tests using pytest_, find code coverage using `pytest-cov`_ and check for `PEP-8`_ etc. standard violations using `pylint`_. All these tools are easily installable through pip your favourite OS' software packaging system. For testing, you can install the requirements with ``pip install -r requirements-test.txt`` The results of these programs are parsed and published by the jenkins instance, giving us pretty graphs :) .. _pytest: https://docs.pytest.org/ .. _`pytest-cov`: https://pytest-cov.readthedocs.io/ .. _`PEP-8`: https://www.python.org/dev/peps/pep-0008/ .. _`pylint`: https://pylint.org/ The builds ---------- The packages are built using ``make dist``. The outputted builds are all placed `in our jenkins workspace`_ and should be safe to distribute. Builds specifically for mac are not available. File names are made based on the ``__version__`` string, so they change every build. .. _`in our jenkins workspace`: https://example.com/ Web page builder ---------------- The web page is built using Sphinx_. It fetches all code documentation directly from the code's docstrings. After building the page using ``make web`` it pushes it to the `nltk.github.com repo on github`_. 
To push it, it needs access to the repo – because this cannot be done using a deploy key, it has the ssh key of the ``nltk-webdeploy`` user. .. _Sphinx: https://www.sphinx-doc.org .. _`nltk.github.com repo on github`: https://github.com/nltk/nltk.github.com nltk-3.7/web/dev/local_testing.rst000066400000000000000000000120411420073152400172430ustar00rootroot00000000000000NLTK testing ============ 1. Obtain nltk source code; 2. install virtualenv and tox:: pip install virtualenv pip install tox 3. make sure currently supported python versions and pypy executables are in system PATH. It is OK not to have all the executables, tests will be executed for available interpreters. 4. Make sure all NLTK data is downloaded (see ``nltk.download()``); 5. run 'tox' command from the root nltk folder. It will install dependencies and run ``pytest`` for all available interpreters. You may also pass any pytest options here (for example, `-v` for verbose). It may take a long time at first run, but the subsequent runs will be much faster. Please consult https://tox.wiki for more info about the tox tool. Examples -------- Run tests for python 3.7 in verbose mode; executing only tests that failed in the last test run:: tox -e py37 -- -v --failed Run tree doctests for all available interpreters:: tox -- tree.doctest Run a selected unit test for Python 3.7:: tox -e py37 -- -v nltk.test.unit.test_seekable_unicode_stream_reader By default, numpy, scipy and scikit-learn are installed in tox virtualenvs. This is slow, requires working build toolchain and is not always feasible. In order to skip numpy & friends, use ``..-nodeps`` environments:: tox -e py37-nodeps,py37,pypy It is also possible to run tests without tox. This way NLTK would be tested only under single interpreter, but it may be easier to have numpy and other libraries installed this way. In order to run tests without tox, make sure to ``pip install -r test-requirements.txt`` and run ``pytest``:: pytest nltk/test/ Writing tests ------------- Unlike most open-source projects, NLTK test suite is doctest-based. This format is very expressive, and doctests are usually read as documentation. We don't want to rewrite them to unittests; if you're contributing code to NLTK please prefer doctests for testing. Doctests are located at ``nltk/test/*.doctest`` text files and in docstrings for modules, classes, methods and functions. That said, doctests have their limitations and sometimes it is better to use unittests. Test should be written as unittest if some of the following apply: * test deals with non-ascii unicode and Python 2.x support is required; * test is a regression test that is not necessary for documentational purposes. Unittests currently reside in ``nltk/test/unit/test_*.py`` files; pytest is used for test running. If a test should be written as unittest but also has a documentational value then it should be duplicated as doctest, but with a "# doctest: +SKIP" option. There are some gotchas with NLTK doctests (and with doctests in general): * Use ``print("foo")``, not ``print "foo"``: NLTK doctests act like ``from __future__ import print_functions`` is in use. * Don't write ``+ELLIPSIS``, ``+NORMALIZE_WHITESPACE``, ``+IGNORE_EXCEPTION_DETAIL`` flags (they are already ON by default in NLTK). * Do not write doctests that have non-ascii output (they are not supported in Python 2.x). Incorrect:: >>> greeting u'Привет' The proper way is to rewrite such a doctest as a unittest. 
* In order to conditionally skip a doctest in a separate ``nltk/test/foo.doctest`` file, create a ``nltk/test/foo_fixt.py`` file from the following template:: # def setup_module(module): import pytest if some_condition: pytest.skip("foo.doctest is skipped because <...>") * In order to conditionally skip all doctests from the module/class/function docstrings, put the following function in a top-level module namespace:: # def setup_module(module): import pytest if some_condition: pytest.skip("doctests from nltk.<module> are skipped because <...>") A good idea is to define ``__all__`` in such module and omit ``setup_module`` from ``__all__``. It is not possible to conditionally skip only some doctests from a module. * Do not expect the exact float output; this may fail on some machines:: >>> some_float_constant 0.867 Use ellipsis in this case to make the test robust (or compare the values):: >>> some_float_constant 0.867... >>> abs(some_float_constant - 0.867) < 1e-6 True * Do not rely on dictionary or set item order. Incorrect:: >>> some_dict {"x": 10, "y": 20} The proper way is to sort the items and print them:: >>> for key, value in sorted(some_dict.items()): ... print(key, value) x 10 y 20 If the code requires some external dependencies, then * tests for this code should be skipped if the dependencies are not available: use ``setup_module`` for doctests (as described above) and ``@pytest.mark.skipif / @pytest.mark.skip`` decorators or ``pytest.skip`` exception for unittests; * if the dependency is a Python package, it should be added to tox.ini (but not to ..-nodeps environments). nltk-3.7/web/howto.rst000066400000000000000000000001601420073152400147750ustar00rootroot00000000000000Example usage of NLTK modules ============================= .. toctree:: :titlesonly: :glob: howto/* nltk-3.7/web/index.rst000066400000000000000000000071071420073152400147540ustar00rootroot00000000000000Natural Language Toolkit ======================== NLTK is a leading platform for building Python programs to work with human language data. It provides easy-to-use interfaces to `over 50 corpora and lexical resources `_ such as WordNet, along with a suite of text processing libraries for classification, tokenization, stemming, tagging, parsing, and semantic reasoning, wrappers for industrial-strength NLP libraries, and an active `discussion forum `_. Thanks to a hands-on guide introducing programming fundamentals alongside topics in computational linguistics, plus comprehensive API documentation, NLTK is suitable for linguists, engineers, students, educators, researchers, and industry users alike. NLTK is available for Windows, Mac OS X, and Linux. Best of all, NLTK is a free, open source, community-driven project. NLTK has been called "a wonderful tool for teaching, and working in, computational linguistics using Python," and "an amazing library to play with natural language." `Natural Language Processing with Python `_ provides a practical introduction to programming for language processing. Written by the creators of NLTK, it guides the reader through the fundamentals of writing Python programs, working with corpora, categorizing text, analyzing linguistic structure, and more. The online version of the book has been updated for Python 3 and NLTK 3. (The original Python 2 version is still available at `https://www.nltk.org/book_1ed `_.)
Some simple things you can do with NLTK --------------------------------------- Tokenize and tag some text: >>> import nltk >>> sentence = """At eight o'clock on Thursday morning ... Arthur didn't feel very good.""" >>> tokens = nltk.word_tokenize(sentence) >>> tokens ['At', 'eight', "o'clock", 'on', 'Thursday', 'morning', 'Arthur', 'did', "n't", 'feel', 'very', 'good', '.'] >>> tagged = nltk.pos_tag(tokens) >>> tagged[0:6] [('At', 'IN'), ('eight', 'CD'), ("o'clock", 'JJ'), ('on', 'IN'), ('Thursday', 'NNP'), ('morning', 'NN')] Identify named entities: >>> entities = nltk.chunk.ne_chunk(tagged) >>> entities Tree('S', [('At', 'IN'), ('eight', 'CD'), ("o'clock", 'JJ'), ('on', 'IN'), ('Thursday', 'NNP'), ('morning', 'NN'), Tree('PERSON', [('Arthur', 'NNP')]), ('did', 'VBD'), ("n't", 'RB'), ('feel', 'VB'), ('very', 'RB'), ('good', 'JJ'), ('.', '.')]) Display a parse tree: >>> from nltk.corpus import treebank >>> t = treebank.parsed_sents('wsj_0001.mrg')[0] >>> t.draw() .. image:: _static/images/tree.gif NB. If you publish work that uses NLTK, please cite the NLTK book as follows: Bird, Steven, Edward Loper and Ewan Klein (2009), *Natural Language Processing with Python*. O'Reilly Media Inc. Next Steps ---------- * `Sign up for release announcements `_ * `Join in the discussion `_ .. toctree:: :maxdepth: 1 :hidden: :caption: NLTK Documentation API Reference Example Usage Module Index Wiki FAQ Open Issues NLTK on GitHub .. toctree:: :maxdepth: 1 :hidden: :caption: Installation install data .. toctree:: :maxdepth: 1 :hidden: :caption: More news contribute team nltk-3.7/web/install.rst000066400000000000000000000040371420073152400153120ustar00rootroot00000000000000Installing NLTK =============== NLTK requires Python versions 3.7, 3.8, 3.9 or 3.10 For Windows users, it is strongly recommended that you go through this guide to install Python 3 successfully https://docs.python-guide.org/starting/install3/win/#install3-windows Setting up a Python Environment (Mac/Unix/Windows) -------------------------------------------------- Please go through this guide to learn how to manage your virtual environment managers before you install NLTK, https://docs.python-guide.org/dev/virtualenvs/ Alternatively, you can use the Anaconda distribution installer that comes "batteries included" https://www.anaconda.com/distribution/ Mac/Unix -------- #. Install NLTK: run ``pip install --user -U nltk`` #. Install Numpy (optional): run ``pip install --user -U numpy`` #. Test installation: run ``python`` then type ``import nltk`` For older versions of Python it might be necessary to install setuptools (see https://pypi.python.org/pypi/setuptools) and to install pip (``sudo easy_install pip``). Windows ------- These instructions assume that you do not already have Python installed on your machine. 32-bit binary installation ~~~~~~~~~~~~~~~~~~~~~~~~~~ #. Install Python 3.8: https://www.python.org/downloads/ (avoid the 64-bit versions) #. Install Numpy (optional): https://www.scipy.org/scipylib/download.html #. Install NLTK: https://pypi.python.org/pypi/nltk #. Test installation: ``Start>Python38``, then type ``import nltk`` Installing Third-Party Software ------------------------------- Please see: https://github.com/nltk/nltk/wiki/Installing-Third-Party-Software Installing NLTK Data ------------------------------- After installing the NLTK package, please do install the necessary datasets/models for specific functions to work. 
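For example, the following minimal sketch downloads a single resource and checks that it is usable; ``punkt`` is used here only because it is one commonly required resource (``nltk.word_tokenize`` depends on it), not because it is the only one you may need:

    >>> import nltk
    >>> nltk.download('punkt')  # doctest: +SKIP
    True
    >>> nltk.word_tokenize("NLTK is ready to use.")  # doctest: +SKIP
    ['NLTK', 'is', 'ready', 'to', 'use', '.']
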
If you're unsure of which datasets/models you'll need, you can install the "popular" subset of NLTK data, on the command line type ``python -m nltk.downloader popular``, or in the Python interpreter ``import nltk; nltk.download('popular')`` For details, see https://www.nltk.org/data.html nltk-3.7/web/news.rst000066400000000000000000000663171420073152400146310ustar00rootroot00000000000000Release Notes ============= 2022 ---- NLTK 3.7 release: February 2022: improve and update the NLTK team page on nltk.org drop support for Python 3.6 add support for Python 3.10 2021 ---- NLTK 3.6.7 release: December 2021: resolve issue with `word_tokenize` and `sent_tokenize` NLTK 3.6.6 release: December 2021: add precision, recall, F-measure, confusion matrix to Taggers support alternative Wordnet versions (#2860) support OMW 1.4, use Multilingual Wordnet Data from OMW with newer Wordnet versions add multi Bleu functionality allow empty string in CFG's + more fix several TreebankWordTokenizer and NLTKWordTokenizer bugs fix levenstein distance for duplicated letters modernize `nltk.org/howto` pages update third party tools to newer versions NLTK 3.6.5 release: October 2021: support emoji ZJW sequences and skin tone modifiers in TweetTokenizer METEOR evaluation now requires pre-tokenized input code linting and type hinting avoid re.Pattern and regex.Pattern which fail for Python 3.6, 3.7 NLTK 3.6.4 release: October 2021 improved phone number recognition in tweet tokenizer resolved ReDoS vulnerability in Corpus Reader refactored CISTEM stemmer for German NLTK 3.6.3 release: September 2021 Drop support for Python 3.5, added pre-commit hooks (isort, pyupgrade, black), improvements to WordNet visualization, RIBES score, edit_distance, METEOR score, Punkt, language model package, TweetTokenizer, code and comment cleanups, CI tests now also run on Windows, moved from Travis CI to GitHub Actions NLTK 3.6.2 release: April 2021 Minor enhancements NLTK 3.6 release: April 2021 Add support for Python 3.9 Minor enhancements, bug fixes, code cleanups, efficiency improvements 2020 ---- NLTK 3.5 release: April 2020 Add support for Python 3.8, drop support for Python 2 2019 ---- NLTK 3.4.5 release: August 2019 Fixed security bug in downloader: Zip slip vulnerability - for the unlikely situation where a user configures their downloader to use a compromised server https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-14751) NLTK 3.4.4 release: July 2019 Fix bug in plot function (probability.py) Add improved PanLex Swadesh corpus reader NLTK 3.4.3 release: June 2019 Add Text.generate(), QuadgramAssocMeasures Add SSP to tokenizers Return confidence of best tag from AveragedPerceptron Make plot methods return Axes objects Minor bug fixes Update installation instructions NLTK 3.4.1 release: April 2019 Add chomsky_normal_form for CFGs Add meteor score Add minimum edit/Levenshtein distance based alignment function Allow access to collocation list via text.collocation_list() Support corenlp server options Drop support for Python 3.4 Other minor fixes 2018 ---- NLTK 3.4 release: November 2018 Support Python 3.7, New Language Modeling package, Cistem Stemmer for German, Support Russian National Corpus incl POS tag model, Krippendorf Alpha inter-rater reliability test, Comprehensive code clean-ups, Switch continuous integration from Jenkins to Travis NLTK 3.3 release: May 2018 Support Python 3.6, New interface to CoreNLP, Support synset retrieval by sense key, Minor fixes to CoNLL Corpus Reader, AlignedSent, Fixed minor 
inconsistencies in APIs and API documentation, Better conformance to PEP8, Drop Moses Tokenizer (incompatible license) 2017 ---- NLTK 3.2.5 release: September 2017 Arabic stemmers (ARLSTem, Snowball), NIST MT evaluation metric and added NIST international_tokenize, Moses tokenizer, Document Russian tagger, Fix to Stanford segmenter, Improve treebank detokenizer, VerbNet, Vader, Misc code and documentation cleanups, Implement fixes suggested by LGTM NLTK 3.2.4 released: May 2017 Remove load-time dependency on Python requests library, Add support for Arabic in StanfordSegmenter NLTK 3.2.3 released: May 2017 Interface to Stanford CoreNLP Web API, improved Lancaster stemmer, improved Treebank tokenizer, support custom tab files for extending WordNet, speed up TnT tagger, speed up FreqDist and ConditionalFreqDist, new corpus reader for MWA subset of PPDB; improvements to testing framework 2016 ---- NLTK 3.2.2 released: December 2016 Support for Aline, ChrF and GLEU MT evaluation metrics, Russian POS tagger model, Moses detokenizer, rewrite Porter Stemmer and FrameNet corpus reader, update FrameNet Corpus to version 1.7, fixes: stanford_segmenter.py, SentiText, CoNLL Corpus Reader, BLEU, naivebayes, Krippendorff's alpha, Punkt, Moses tokenizer, TweetTokenizer, ToktokTokenizer; improvements to testing framework NLTK 3.2.1 released: April 2016 Support for CCG semantics, Stanford segmenter, VADER lexicon; Fixes to BLEU score calculation, CHILDES corpus reader. NLTK 3.2 released: March 2016 Fixes for Python 3.5, code cleanups now Python 2.6 is no longer supported, support for PanLex, support for third party download locations for NLTK data, new support for RIBES score, BLEU smoothing, corpus-level BLEU, improvements to TweetTokenizer, updates for Stanford API, add mathematical operators to ConditionalFreqDist, fix bug in sentiwordnet for adjectives, improvements to documentation, code cleanups, consistent handling of file paths for cross-platform operation. 2015 ---- NLTK 3.1 released: October 2015 Add support for Python 3.5, drop support for Python 2.6, sentiment analysis package and several corpora, improved POS tagger, Twitter package, multi-word expression tokenizer, wrapper for Stanford Neural Dependency Parser, improved translation/alignment module including stack decoder, skipgram and everygram methods, Multext East Corpus and MTECorpusReader, minor bugfixes and enhancements NLTK 3.0.5 released: September 2015 New Twitter package; updates to IBM models 1-3, new models 4 and 5, minor bugfixes and enhancements NLTK 3.0.4 released: July 2015 Minor bugfixes and enhancements. NLTK 3.0.3 released: June 2015 PanLex Swadesh Corpus, tgrep tree search, minor bugfixes. NLTK 3.0.2 released: March 2015 Senna, BLLIP, python-crfsuite interfaces, transition-based dependency parsers, dependency graph visualization, NKJP corpus reader, minor bugfixes and clean-ups. NLTK 3.0.1 released: January 2015 Minor packaging update. 2014 ---- NLTK 3.0.0 released: September 2014 Minor bugfixes. NLTK 3.0.0b2 released: August 2014 Minor bugfixes and clean-ups. NLTK Book Updates: July 2014 The NLTK book is being updated for Python 3 and NLTK 3 `here `__. The original Python 2 edition is still available `here `__. 
NLTK 3.0.0b1 released: July 2014 FrameNet, SentiWordNet, universal tagset, misc efficiency improvements and bugfixes Several API changes, see https://github.com/nltk/nltk/wiki/Porting-your-code-to-NLTK-3.0 NLTK 3.0a4 released: June 2014 FrameNet, universal tagset, misc efficiency improvements and bugfixes Several API changes, see https://github.com/nltk/nltk/wiki/Porting-your-code-to-NLTK-3.0 For full details see: https://github.com/nltk/nltk/blob/develop/ChangeLog http://nltk.org/nltk3-alpha/ 2013 ---- NLTK Book Updates: October 2013 We are updating the NLTK book for Python 3 and NLTK 3; please see https://www.nltk.org/book/ NLTK 3.0a2 released: July 2013 Misc efficiency improvements and bugfixes; for details see https://github.com/nltk/nltk/blob/develop/ChangeLog http://nltk.org/nltk3-alpha/ NLTK 3.0a1 released: February 2013 This version adds support for NLTK's graphical user interfaces. http://nltk.org/nltk3-alpha/ NLTK 3.0a0 released: January 2013 The first alpha release of NLTK 3.0 is now available for testing. This version of NLTK works with Python 2.6, 2.7, and Python 3. http://nltk.org/nltk3-alpha/ 2012 ---- Python Grant: November 2012 The Python Software Foundation is sponsoring Mikhail Korobov's work on porting NLTK to Python 3. https://pyfound.blogspot.hu/2012/11/grants-to-assist-kivy-nltk-in-porting.html NLTK 2.0.4 released: November 2012 Minor fix to remove numpy dependency. NLTK 2.0.3 released: September 2012 This release contains minor improvements and bugfixes. This is the final release compatible with Python 2.5. NLTK 2.0.2 released: July 2012 This release contains minor improvements and bugfixes. NLTK 2.0.1 released: May 2012 The final release of NLTK 2. NLTK 2.0.1rc4 released: February 2012 The fourth release candidate for NLTK 2. NLTK 2.0.1rc3 released: January 2012 The third release candidate for NLTK 2. 2011 ---- NLTK 2.0.1rc2 released: December 2011 The second release candidate for NLTK 2. For full details see the ChangeLog. NLTK development moved to GitHub: October 2011 The development site for NLTK has moved from GoogleCode to GitHub: https://github.com/nltk NLTK 2.0.1rc1 released: April 2011 The first release candidate for NLTK 2. For full details see the ChangeLog. 2010 ---- Python Text Processing with NLTK 2.0 Cookbook: December 2010 Jacob Perkins has written a 250-page cookbook full of great recipes for text processing using Python and NLTK, published by Packt Publishing. Some of the royalties are being donated to the NLTK project. Japanese translation of NLTK book: November 2010 Masato Hagiwara has translated the NLTK book into Japanese, along with an extra chapter on particular issues with Japanese language process. See https://www.oreilly.co.jp/books/9784873114705/. NLTK 2.0b9 released: July 2010 The last beta release before 2.0 final. For full details see the ChangeLog. NLTK in Ubuntu 10.4 (Lucid Lynx): February 2010 NLTK is now in the latest LTS version of Ubuntu, thanks to the efforts of Robin Munn. See https://packages.ubuntu.com/lucid/python/python-nltk NLTK 2.0b? released: June 2009 - February 2010 Bugfix releases in preparation for 2.0 final. For full details see the ChangeLog. 2009 ---- NLTK Book in second printing: December 2009 The second print run of Natural Language Processing with Python will go on sale in January. We've taken the opportunity to make about 40 minor corrections. The online version has been updated. 
NLTK Book published: June 2009 Natural Language Processing with Python, by Steven Bird, Ewan Klein and Edward Loper, has been published by O'Reilly Media Inc. It can be purchased in hardcopy, ebook, PDF or for online access, at https://oreilly.com/catalog/9780596516499/. For information about sellers and prices, see https://isbndb.com/d/book/natural_language_processing_with_python/prices.html. Version 0.9.9 released: May 2009 This version finalizes NLTK's API ahead of the 2.0 release and the publication of the NLTK book. There have been dozens of minor enhancements and bugfixes. Many names of the form nltk.foo.Bar are now available as nltk.Bar. There is expanded functionality in the decision tree, collocations, and Toolbox modules. A new translation toy nltk.misc.babelfish has been added. A new module nltk.help gives access to tagset documentation. Fixed imports so NLTK will build and install without Tkinter (for running on servers). New data includes a maximum entropy chunker model and updated grammars. NLTK Contrib includes updates to the coreference package (Joseph Frazee) and the ISRI Arabic stemmer (Hosam Algasaier). The book has undergone substantial editorial corrections ahead of final publication. For full details see the ChangeLog. Version 0.9.8 released: February 2009 This version contains a new off-the-shelf tokenizer, POS tagger, and named-entity tagger. A new metrics package includes inter-annotator agreement scores and various distance and word association measures (Tom Lippincott and Joel Nothman). There's a new collocations package (Joel Nothman). There are many improvements to the WordNet package and browser (Steven Bethard, Jordan Boyd-Graber, Paul Bone), and to the semantics and inference packages (Dan Garrette). The NLTK corpus collection now includes the PE08 Parser Evaluation data, and the CoNLL 2007 Basque and Catalan Dependency Treebanks. We have added an interface for dependency treebanks. Many chapters of the book have been revised in response to feedback from readers. For full details see the ChangeLog. NB some method names have been changed for consistency and simplicity. Use of old names will generate deprecation warnings that indicate the correct name to use. 2008 ---- Version 0.9.7 released: December 2008 This version contains fixes to the corpus downloader (see instructions) enabling NLTK corpora to be released independently of the software, and to be stored in compressed format. There are improvements in the grammars, chart parsers, probability distributions, sentence segmenter, text classifiers and RTE classifier. There are many further improvements to the book. For full details see the ChangeLog. Version 0.9.6 released: December 2008 This version has an incremental corpus downloader (see instructions) enabling NLTK corpora to be released independently of the software. A new WordNet interface has been developed by Steven Bethard (details). NLTK now has support for dependency parsing, developed by Jason Narad (sponsored by Google Summer of Code). There are many enhancements to the semantics and inference packages, contributed by Dan Garrette. The frequency distribution classes have new support for tabulation and plotting. The Brown Corpus reader has human readable category labels instead of letters. A new Swadesh Corpus containing comparative wordlists has been added. NLTK-Contrib includes a TIGERSearch implementation for searching treebanks (Torsten Marek). Most chapters of the book have been substantially revised. 
The NLTK Project has moved: November 2008 The NLTK project has moved to Google Sites, Google Code and Google Groups. Content for users and the nltk.org domain is hosted on Google Sites. The home of NLTK development is now Google Code. All discussion lists are at Google Groups. Our old site at nltk.sourceforge.net will continue to be available while we complete this transition. Old releases are still available via our SourceForge release page. We're grateful to SourceForge for hosting our project since its inception in 2001. Version 0.9.5 released: August 2008 This version contains several low-level changes to facilitate installation, plus updates to several NLTK-Contrib projects. A new text module gives easy access to text corpora for newcomers to NLP. For full details see the ChangeLog. Version 0.9.4 released: August 2008 This version contains a substantially expanded semantics package contributed by Dan Garrette, improvements to the chunk, tag, wordnet, tree and feature-structure modules, Mallet interface, ngram language modeling, new GUI tools (WordNet browser, chunking, POS-concordance). The data distribution includes the new NPS Chat Corpus. NLTK-Contrib includes the following new packages (still undergoing active development): NLG package (Petro Verkhogliad), dependency parsers (Jason Narad), coreference (Joseph Frazee), CCG parser (Graeme Gange), and a first order resolution theorem prover (Dan Garrette). For full details see the ChangeLog. NLTK presented at ACL conference: June 2008 A paper on teaching courses using NLTK will be presented at the ACL conference: Multidisciplinary Instruction with the Natural Language Toolkit Version 0.9.3 released: June 2008 This version contains an improved WordNet similarity module using pre-built information content files (included in the corpus distribution), new/improved interfaces to Weka, MEGAM and Prover9/Mace4 toolkits, improved Unicode support for corpus readers, a BNC corpus reader, and a rewrite of the Punkt sentence segmenter contributed by Joel Nothman. NLTK-Contrib includes an implementation of an incremental algorithm for generating referring expressions contributed by Margaret Mitchell. For full details see the ChangeLog. NLTK presented at LinuxFest Northwest: April 2008 Sean Boisen presented NLTK at LinuxFest Northwest, which took place in Bellingham, Washington. His presentation slides are available at: https://semanticbible.com/other/talks/2008/nltk/main.html NLTK in Google Summer of Code: April 2008 Google Summer of Code will sponsor two NLTK projects. Jason Narad won funding for a project on dependency parsers in NLTK (mentored by Sebastian Riedel and Jason Baldridge). Petro Verkhogliad won funding for a project on natural language generation in NLTK (mentored by Robert Dale and Edward Loper). Python Software Foundation adopts NLTK for Google Summer of Code application: March 2008 The Python Software Foundation has listed NLTK projects for sponsorship from the 2008 Google Summer of Code program. For details please see https://wiki.python.org/moin/SummerOfCode. Version 0.9.2 released: March 2008 This version contains a new inference module linked to the Prover9/Mace4 theorem-prover and model checker (Dan Garrette, Ewan Klein). It also includes the VerbNet and PropBank corpora along with corpus readers. A bug in the Reuters corpus reader has been fixed. NLTK-Contrib includes new work on the WordNet browser (Jussi Salmela).
For full details see the ChangeLog. Youtube video about NLTK: January 2008 The video of the NLTK talk at the Bay Area Python Interest Group last July has been posted at https://www.youtube.com/watch?v=keXW_5-llD0 (1h15m) Version 0.9.1 released: January 2008 This version contains new support for accessing text categorization corpora, along with several corpora categorized for topic, genre, question type, or sentiment. It includes several new corpora: Question classification data (Li & Roth), Reuters 21578 Corpus, Movie Reviews corpus (Pang & Lee), Recognising Textual Entailment (RTE) Challenges. NLTK-Contrib includes expanded support for semantics (Dan Garrette), readability scoring (Thomas Jakobsen, Thomas Skardal), and SIL Toolbox (Greg Aumann). The book contains many improvements in early chapters in response to reader feedback. For full details see the ChangeLog. 2007 ---- NLTK-Lite 0.9 released: October 2007 This version is substantially revised and expanded from version 0.8. The entire toolkit can be accessed via a single import statement "import nltk", and there is a more convenient naming scheme. Calling deprecated functions generates messages that help programmers update their code. The corpus, tagger, and classifier modules have been redesigned. All functionality of the old NLTK 1.4.3 is now covered by NLTK-Lite 0.9. The book has been revised and expanded. A new data package incorporates the existing corpus collection and contains new sections for pre-specified grammars and pre-computed models. Several new corpora have been added, including treebanks for Portuguese, Spanish, Catalan and Dutch. A Macintosh distribution is provided. For full details see the ChangeLog. NLTK-Lite 0.9b2 released: September 2007 This version is substantially revised and expanded from version 0.8. The entire toolkit can be accessed via a single import statement "import nltk", and many common NLP functions accessed directly, e.g. nltk.PorterStemmer, nltk.ShiftReduceParser. The corpus, tagger, and classifier modules have been redesigned. The book has been revised and expanded, and the chapters have been reordered. NLTK has a new data package incorporating the existing corpus collection and adding new sections for pre-specified grammars and pre-computed models. The Floresta Portuguese Treebank has been added. Release 0.9b2 fixes several minor problems with 0.9b1 and removes the numpy dependency. It includes a new corpus and corpus reader for Brazilian Portuguese news text (Mac-Morpho) and an improved corpus reader for the Sinica Treebank, and a trained model for Portuguese sentence segmentation. NLTK-Lite 0.9b1 released: August 2007 This version is substantially revised and expanded from version 0.8. The entire toolkit can be accessed via a single import statement "import nltk", and many common NLP functions accessed directly, e.g. nltk.PorterStemmer, nltk.ShiftReduceParser. The corpus, tagger, and classifier modules have been redesigned. The book has been revised and expanded, and the chapters have been reordered. NLTK has a new data package incorporating the existing corpus collection and adding new sections for pre-specified grammars and pre-computed models. The Floresta Portuguese Treebank has been added. For full details see the ChangeLog. NLTK talks in São Paulo: August 2007 Steven Bird will present NLTK in a series of talks at the First Brazilian School on Computational Linguistics, at the University of São Paulo in the first week of September.
NLTK talk in Bay Area: July 2007 Steven Bird, Ewan Klein, and Edward Loper will present NLTK at the Bay Area Python Interest Group, at Google on Thursday 12 July. NLTK-Lite 0.8 released: July 2007 This version is substantially revised and expanded from version 0.7. The code now includes improved interfaces to corpora, chunkers, grammars, frequency distributions, full integration with WordNet 3.0 and WordNet similarity measures. The book contains substantial revision of Part I (tokenization, tagging, chunking) and Part II (grammars and parsing). NLTK has several new corpora including the Switchboard Telephone Speech Corpus transcript sample (Talkbank Project), CMU Problem Reports Corpus sample, CONLL2002 POS+NER data, Patient Information Leaflet corpus sample, Indian POS-Tagged data (Bangla, Hindi, Marathi, Telugu), Shakespeare XML corpus sample, and the Universal Declaration of Human Rights corpus with text samples in 300+ languages. NLTK features in Language Documentation and Conservation article: July 2007 An article Managing Fieldwork Data with Toolbox and the Natural Language Toolkit by Stuart Robinson, Greg Aumann, and Steven Bird appears in the inaugural issue of ''Language Documentation and Conservation''. It discusses several small Python programs for manipulating field data. NLTK features in ACM Crossroads article: May 2007 An article Getting Started on Natural Language Processing with Python by Nitin Madnani will appear in ''ACM Crossroads'', the ACM Student Journal. It discusses NLTK in detail, and provides several helpful examples including an entertaining free word association program. NLTK-Lite 0.7.5 released: May 2007 This version contains improved interfaces for WordNet 3.0 and WordNet-Similarity, the Lancaster Stemmer (contributed by Steven Tomcavage), and several new corpora including the Switchboard Telephone Speech Corpus transcript sample (Talkbank Project), CMU Problem Reports Corpus sample, CONLL2002 POS+NER data, Patient Information Leaflet corpus sample and WordNet 3.0 data files. With this distribution WordNet no longer needs to be separately installed. NLTK-Lite 0.7.4 released: May 2007 This release contains new corpora and corpus readers for Indian POS-Tagged data (Bangla, Hindi, Marathi, Telugu), and the Sinica Treebank, and substantial revision of Part II of the book on structured programming, grammars and parsing. NLTK-Lite 0.7.3 released: April 2007 This release contains improved chunker and PCFG interfaces, the Shakespeare XML corpus sample and corpus reader, improved tutorials and improved formatting of code samples, and categorization of problem sets by difficulty. NLTK-Lite 0.7.2 released: March 2007 This release contains new text classifiers (Cosine, NaiveBayes, Spearman), contributed by Sam Huston, simple feature detectors, the UDHR corpus with text samples in 300+ languages and a corpus interface; improved tutorials (340 pages in total); additions to contrib area including Kimmo finite-state morphology system, Lambek calculus system, and a demonstration of text classifiers for language identification. NLTK-Lite 0.7.1 released: January 2007 This release contains bugfixes in the WordNet and HMM modules. 2006 ---- NLTK-Lite 0.7 released: December 2006 This release contains: new semantic interpretation package (Ewan Klein), new support for SIL Toolbox format (Greg Aumann), new chunking package including cascaded chunking (Steven Bird), new interface to WordNet 
2.1 and Wordnet similarity measures (David Ormiston Smith), new support for Penn Treebank format (Yoav Goldberg), bringing the codebase to 48,000 lines; substantial new chapters on semantic interpretation and chunking, and substantial revisions to several other chapters, bringing the textbook documentation to 280 pages; NLTK-Lite 0.7b1 released: December 2006 This release contains: new semantic interpretation package (Ewan Klein), new support for SIL Toolbox format (Greg Aumann), new chunking package including cascaded chunking, wordnet package updated for version 2.1 of Wordnet, and prototype wordnet similarity measures (David Ormiston Smith), bringing the codebase to 48,000 lines; substantial new chapters on semantic interpretation and chunking, and substantial revisions to several other chapters, bringing the textbook documentation to 270 pages; NLTK-Lite 0.6.6 released: October 2006 This release contains bugfixes, improvements to Shoebox file format support, and expanded tutorial discussions of programming and feature-based grammars. NLTK-Lite 0.6.5 released: July 2006 This release contains improvements to Shoebox file format support (by Stuart Robinson and Greg Aumann); an implementation of hole semantics (by Peter Wang); improvements to lambda calculus and semantic interpretation modules (by Ewan Klein); a new corpus (Sinica Treebank sample); and expanded tutorial discussions of trees, feature-based grammar, unification, PCFGs, and more exercises. NLTK-Lite passes 10k download milestone: May 2006 We have now had 10,000 downloads of NLTK-Lite in the nine months since it was first released. NLTK-Lite 0.6.4 released: April 2006 This release contains new corpora (Senseval 2, TIMIT sample), a clusterer, cascaded chunker, and several substantially revised tutorials. 2005 ---- NLTK 1.4 no longer supported: December 2005 The main development has switched to NLTK-Lite. The latest version of NLTK can still be downloaded; see the installation page for instructions. NLTK-Lite 0.6 released: November 2005 contains bug-fixes, PDF versions of tutorials, expanded fieldwork tutorial, PCFG grammar induction (by Nathan Bodenstab), and prototype concordance and paradigm display tools (by Peter Spiller and Will Hardy). NLTK-Lite 0.5 released: September 2005 contains bug-fixes, improved tutorials, more project suggestions, and a pronunciation dictionary. NLTK-Lite 0.4 released: September 2005 contains bug-fixes, improved tutorials, more project suggestions, and probabilistic parsers. NLTK-Lite 0.3 released: August 2005 contains bug-fixes, documentation clean-up, project suggestions, and the chart parser demos including one for Earley parsing by Jean Mark Gawron. NLTK-Lite 0.2 released: July 2005 contains bug-fixes, documentation clean-up, and some translations of tutorials into Brazilian Portuguese by Tiago Tresoldi. NLTK-Lite 0.1 released: July 2005 substantially simplified and streamlined version of NLTK has been released Brazilian Portuguese Translation: April 2005 top-level pages of this website have been translated into Brazilian Portuguese by Tiago Tresoldi; translations of the tutorials are in preparation http://hermes.sourceforge.net/nltk-br/ 1.4.3 Release: February 2005 NLTK 1.4.3 has been released; this is the first version which is compatible with Python 2.4. nltk-3.7/web/py-modindex.rst000066400000000000000000000003671420073152400161030ustar00rootroot00000000000000Index ===== .. This file is a hack to allow linking to the Documentation Index in a Table of Contents tree. 
That said, you're not really supposed to: https://www.sphinx-doc.org/en/master/usage/restructuredtext/directives.html#special-names nltk-3.7/web/team.rst000066400000000000000000000002461420073152400145700ustar00rootroot00000000000000NLTK Team ========= .. raw:: html :file: team/current_team.html Former NLTK Team Members ------------------------ .. raw:: html :file: team/former_team.html nltk-3.7/web/team/000077500000000000000000000000001420073152400140345ustar00rootroot00000000000000nltk-3.7/web/team/current_team.html000066400000000000000000000255261420073152400174240ustar00rootroot00000000000000
    Tom Aarsen @tomaarsen
    Location

    Nijmegen, Netherlands

    Role

    Core Maintainer. Rewrote nltk.org, frequent issue triaging, bug fixing, and enhancements.

    Active Range

    2020-

    Joel Nothman @jnothman
    Location

    Sydney, Australia

    Role

    Metrics package maintainer, also responsible for nltk.tokenize.punkt, i.a.

    Active Range

    2008-

    Steven Bird @stevenbird
    Location

    Darwin, Australia

    Role

    Project lead, manager of NLTK Releases.

    Active Range

    2001-

    Alexis Dimitriadis @alexisdimi
    Location

    Utrecht, Netherlands

    Role

    Manager of the NLTK mailing lists.

    Danny Sepler @dannysepler
    Role

    Maintainer, primarily involved with tests and quality assurance.

    Active Range

    2020-

    Dmitrijs Milajevs @dimazest
    Location

    Oakland, USA

    Role

    Maintainer, particularly involved with WSD, CoreNLP and DependencyGraphs.

    Active Range

    2014-

    Francis Bond @fcbond
    Role

    Domain expert of WordNet.

    Active Range

    2015-

    Ilia Kurenkov @iliakur
    Role

    Language models package maintainer. Also involved with tests and code quality.

    Active Range

    2012-

    purificant @purificant
    Active Range

    2018-

    Liling Tan @alvations
    Location

    Saarbrucken, Germany

    Active Range

    2014-

    nltk-3.7/web/team/former_team.html000066400000000000000000000265521420073152400172340ustar00rootroot00000000000000
    Dan Garrette @dhgarrette
    Location

    Austin, USA

    Role

    Semantics (nltk.sem, nltk.inference)

    Active Range

    2007-2016

    Peter Ljunglöf @heatherleaf
    Location

    Gothenburg, Sweden

    Role

    Parsing package maintainer: nltk.parse, nltk.featstruct, i.a.

    Active Range

    2008-2015

    Mikhail Korobov @kmike
    Location

    Ekaterinburg, Russia

    Role

    Porting NLTK to Python 3.

    Active Range

    2012-2017

    Alex Rudnick @alexrudnick
    Active Range

    2010-2014

    Edward Loper @edloper
    Location

    Baltimore, USA

    Active Range

    2001-2013

    Ewan Klein @ewan-klein
    Location

    Edinburgh, UK

    Active Range

    2001-2016

    Trevor Cohn @trevorcohn
    Location

    Melbourne, Australia

    Role

    parsing, classification

    Active Range

    2003-2005

    Pierpaolo Pantone @fievelk
    Location

    Berlin, Germany

    Active Range

    2014-2020

    Nathan Schneider @nschneid
    Location

    Georgetown, USA

    Active Range

    2012-2018

    Álvaro Justen @turicas
    Active Range

    2012-2015

    Morten M. Neergaard @xim
    Active Range

    2011-2014

    nltk-3.7/web/team/team.json000066400000000000000000000136021420073152400156570ustar00rootroot00000000000000{ "current": [ { "full_name": "Tom Aarsen", "github_username": "tomaarsen", "personal_website_url": "https://www.tomaarsen.com", "location": "Nijmegen, Netherlands", "role": "Core Maintainer. Rewrote nltk.org, frequent issue triaging, bug fixing, and enhancements.", "active_range": "2020-" }, { "full_name": "Joel Nothman", "github_username": "jnothman", "personal_website_url": null, "location": "Sydney, Australia", "role": "Metrics package maintainer, also responsible for nltk.tokenize.punkt, i.a.", "active_range": "2008-" }, { "full_name": "Steven Bird", "github_username": "stevenbird", "personal_website_url": "http://www.stevenbird.net/", "location": "Darwin, Australia", "role": "Project lead, manager of NLTK Releases.", "active_range": "2001-" }, { "full_name": "Alexis Dimitradis", "github_username": "alexisdimi", "personal_website_url": null, "location": "Utrecht, Netherlands", "role": "Manager of the NLTK mailing lists." }, { "full_name": "Danny Sepler", "github_username": "dannysepler", "personal_website_url": null, "location": "", "role": "Maintainer, primarily involved with tests and quality assurance.", "active_range": "2020-" }, { "full_name": "Dmitrijs Milajevs", "github_username": "dimazest", "personal_website_url": null, "location": "Oakland, USA", "role": "Maintainer, particularly involved with WSD, CoreNLP and DependencyGraphs.", "active_range": "2014-" }, { "full_name": "Francis Bond", "github_username": "fcbond", "personal_website_url": null, "location": "", "role": "Domain expert of WordNet.", "active_range": "2015-" }, { "full_name": "Ilia Kurenkov", "github_username": "iliakur", "personal_website_url": null, "location": "", "role": "Language models package maintainer. 
Also involved with tests and code quality.", "active_range": "2012-" }, { "full_name": "purificant", "github_username": "purificant", "personal_website_url": null, "location": "", "role": "", "active_range": "2018-" }, { "full_name": "Liling Tan", "github_username": "alvations", "personal_website_url": "http://alvations.com", "location": "Saarbrucken, Germany", "role": "", "active_range": "2014-" } ], "former": [ { "full_name": "Dan Garrette", "github_username": "dhgarrette", "personal_website_url": null, "location": "Austin, USA", "role": "Semantics (nltk.sem, nltk.inference)", "active_range": "2007-2016" }, { "full_name": "Peter Ljunglöf", "github_username": "heatherleaf", "personal_website_url": "https://www.cse.chalmers.se/~peb/", "location": "Gothenburg, Sweden", "role": "Parsing package maintainer: nltk.parse, nltk.featstruct, i.a.", "active_range": "2008-2015" }, { "full_name": "Mikhail Korobov", "github_username": "kmike", "personal_website_url": "kmike.ru", "location": "Ekaterinburg, Russia", "role": "Porting NLTK to Python 3.", "active_range": "2012-2017" }, { "full_name": "Alex Rudnick", "github_username": "alexrudnick", "personal_website_url": null, "location": "", "role": "", "active_range": "2010-2014" }, { "full_name": "Edward Loper", "github_username": "edloper", "personal_website_url": null, "location": "Baltimore, USA", "role": "", "active_range": "2001-2013" }, { "full_name": "Ewan Klein", "github_username": "ewan-klein", "personal_website_url": null, "location": "Edinburgh, UK", "role": "", "active_range": "2001-2016" }, { "full_name": "Trevor Cohn", "github_username": "trevorcohn", "personal_website_url": "https://people.eng.unimelb.edu.au/tcohn/", "location": "Melbourne, Australia", "role": "parsing, classification", "active_range": "2003-2005" }, { "full_name": "Pierpaolo Pantone", "github_username": "fievelk", "personal_website_url": null, "location": "Berlin, Germany", "role": "", "active_range": "2014-2020" }, { "full_name": "Nathan Schneider", "github_username": "nschneid", "personal_website_url": null, "location": "Georgetown, USA", "role": "", "active_range": "2012-2018" }, { "full_name": "Álvaro Justen", "github_username": "turicas", "personal_website_url": null, "location": "", "role": "", "active_range": "2012-2015" }, { "full_name": "Morten M. Neergaard", "github_username": "xim", "personal_website_url": null, "location": "", "role": "", "active_range": "2011-2014" } ] }
    {%- if member.location %}
    Location

    {{ member.location }}

    {%- endif %} {%- if member.role %}
    Role

    {{ member.role }}

    {%- endif %} {%- if member.active_range %}
    Active Range

    {{ member.active_range }}

    {%- endif %}
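The records in team.json and the Jinja2 member template fragment above are the two pieces behind current_team.html and former_team.html: each member object is rendered once through the template, and the {%- if %} guards skip fields (location, role, active range) that are empty in the JSON. As a minimal sketch of how the two HTML fragments could be regenerated from the JSON — assuming Jinja2 is installed; the script itself and the template filename member.html are hypothetical and not part of the NLTK sources::

    # Hypothetical helper: render web/team/team.json through a per-member
    # Jinja2 template (like the fragment above) into the two team pages.
    import json
    from pathlib import Path

    from jinja2 import Template  # assumption: jinja2 is available

    TEAM_DIR = Path("web/team")

    # "member.html" is a placeholder name for the per-member template fragment.
    member_template = Template((TEAM_DIR / "member.html").read_text(encoding="utf-8"))
    team = json.loads((TEAM_DIR / "team.json").read_text(encoding="utf-8"))

    for group, outfile in [("current", "current_team.html"), ("former", "former_team.html")]:
        # Render every member in the group and concatenate the fragments.
        html = "\n".join(member_template.render(member=member) for member in team[group])
        (TEAM_DIR / outfile).write_text(html, encoding="utf-8")
        print(f"wrote {outfile} ({len(team[group])} members)")

Keeping the roster in team.json and generating the HTML this way means the membership data lives in one place, and empty fields in the JSON simply drop out of the rendered pages.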