pax_global_header 0000666 0000000 0000000 00000000064 14376573757 0014542 g ustar 00root root 0000000 0000000 52 comment=35298b5c24e9d3fa62aa65ef50da8f155eee9cd7
camelot-py-0.11.0/ 0000775 0000000 0000000 00000000000 14376573757 0013673 5 ustar 00root root 0000000 0000000 camelot-py-0.11.0/.coveragerc 0000664 0000000 0000000 00000000024 14376573757 0016010 0 ustar 00root root 0000000 0000000 [run]
branch = True
camelot-py-0.11.0/.github/ 0000775 0000000 0000000 00000000000 14376573757 0015233 5 ustar 00root root 0000000 0000000 camelot-py-0.11.0/.github/FUNDING.yml 0000664 0000000 0000000 00000000031 14376573757 0017042 0 ustar 00root root 0000000 0000000 open_collective: camelot
camelot-py-0.11.0/.github/ISSUE_TEMPLATE/ 0000775 0000000 0000000 00000000000 14376573757 0017416 5 ustar 00root root 0000000 0000000 camelot-py-0.11.0/.github/ISSUE_TEMPLATE/bug_report.md 0000664 0000000 0000000 00000002176 14376573757 0022116 0 ustar 00root root 0000000 0000000 ---
name: Bug report
about: Please follow this template to submit bug reports.
title: ''
labels: bug
assignees: ''
---
**Describe the bug**
**Steps to reproduce the bug**
**Expected behavior**
**Code**
```
import camelot
# add your code here
```
**PDF**
**Screenshots**
**Environment**
- OS: [e.g. macOS]
- Python version:
- Numpy version:
- OpenCV version:
- Ghostscript version:
- Camelot version:
**Additional context**
camelot-py-0.11.0/.github/workflows/ 0000775 0000000 0000000 00000000000 14376573757 0017270 5 ustar 00root root 0000000 0000000 camelot-py-0.11.0/.github/workflows/tests.yml 0000664 0000000 0000000 00000002067 14376573757 0021162 0 ustar 00root root 0000000 0000000 name: tests
# Run the test suite on every pull request.
on: [pull_request]

jobs:
  # Ubuntu-only job covering the older Python versions.
  test:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Versions are quoted so YAML keeps them as strings; an unquoted
        # 3.10 would be read as the float 3.1.
        python-version: ["3.6", "3.7", "3.8"]
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install camelot with dependencies
        run: |
          make install
      - name: Test with pytest
        run: |
          make test

  # Cross-platform job (Linux, macOS, Windows) on the newest Python version.
  test_latest:
    # The job name is derived from the matrix so it cannot drift from the
    # version that is actually installed.
    name: Test on ${{ matrix.os }} with Python ${{ matrix.python-version }}
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-latest, macos-latest, windows-latest]
        python-version: ["3.9"]
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install camelot with dependencies
        run: |
          make install
      - name: Test with pytest
        run: |
          make test
camelot-py-0.11.0/.gitignore 0000664 0000000 0000000 00000000233 14376573757 0015661 0 ustar 00root root 0000000 0000000 fontconfig/
__pycache__/
*.py[cod]
*.so
build/
dist/
*.egg-info/
.eggs/
.coverage
coverage.xml
.pytest_cache/
_build/
.venv/
htmlcov/
# vscode
.vscode
camelot-py-0.11.0/.readthedocs.yml 0000664 0000000 0000000 00000001122 14376573757 0016755 0 ustar 00root root 0000000 0000000 # .readthedocs.yml
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
# Required
version: 2
# Build documentation in the docs/ directory with Sphinx
sphinx:
configuration: docs/conf.py
# Build documentation with MkDocs
#mkdocs:
# configuration: mkdocs.yml
# Optionally build your docs in additional formats such as PDF
formats:
- pdf
# Optionally set the version of Python and requirements required to build your docs
python:
version: 3.8
install:
- method: pip
path: .
extra_requirements:
- dev
camelot-py-0.11.0/CODE_OF_CONDUCT.md 0000664 0000000 0000000 00000000160 14376573757 0016467 0 ustar 00root root 0000000 0000000 Be cordial or be on your way. --Kenneth Reitz
https://www.kennethreitz.org/essays/be-cordial-or-be-on-your-way
camelot-py-0.11.0/CONTRIBUTING.md 0000664 0000000 0000000 00000015306 14376573757 0016131 0 ustar 00root root 0000000 0000000 # Contributor's Guide
If you're reading this, you're probably looking to contribute to Camelot. *Time is the only real currency*, and the fact that you're considering spending some here is *very* generous of you. Thank you very much!
This document will help you get started with contributing documentation, code, testing and filing issues. If you have any questions, feel free to reach out to [Vinayak Mehta](https://vinayak-mehta.github.io), the author and maintainer.
## Code Of Conduct
The following quote sums up the **Code Of Conduct**.
> Be cordial or be on your way. --Kenneth Reitz
Kenneth Reitz has also written an [essay](https://www.kennethreitz.org/essays/be-cordial-or-be-on-your-way) on this topic, which you should read.
As the [Requests Code Of Conduct](http://docs.python-requests.org/en/master/dev/contributing/#be-cordial) states, **all contributions are welcome**, as long as everyone involved is treated with respect.
## Your first contribution
A great way to start contributing to Camelot is to pick an issue tagged with the [help wanted](https://github.com/camelot-dev/camelot/labels/help%20wanted) tag or the [good first issue](https://github.com/camelot-dev/camelot/labels/good%20first%20issue) tag. If you're unable to find a good first issue, feel free to contact the maintainer.
## Setting up a development environment
To install the dependencies needed for development, you can use pip:
$ pip install "camelot-py[dev]"
Alternatively, you can clone the project repository, and install using pip:
$ pip install ".[dev]"
## Pull Requests
### Submit a pull request
The preferred workflow for contributing to Camelot is to fork the [project repository](https://github.com/camelot-dev/camelot) on GitHub, clone, develop on a branch and then finally submit a pull request. Here are the steps:
1. Fork the project repository. Click on the ‘Fork’ button near the top of the page. This creates a copy of the code under your account on GitHub.
2. Clone your fork of Camelot from your GitHub account:
$ git clone https://www.github.com/[username]/camelot
3. Create a branch to hold your changes:
$ git checkout -b my-feature
Always branch out from `master` to work on your contribution. It's good practice to never work on the `master` branch!
**Protip: `git stash` is a great way to save the work that you haven't committed yet, to move between branches.**
4. Work on your contribution. Add changed files using `git add` and then `git commit` them:
$ git add modified_files
$ git commit
5. Finally, push them to your GitHub fork:
$ git push -u origin my-feature
Now it's time to go to your fork of Camelot and create a pull request! You can [follow these instructions](https://help.github.com/articles/creating-a-pull-request-from-a-fork/) to do this.
### Work on your pull request
We recommend that your pull request complies with the following rules:
- Make sure your code follows [pep8](http://pep8.org).
- In case your pull request contains function docstrings, make sure you follow the [numpydoc](https://numpydoc.readthedocs.io/en/latest/format.html) format. All function docstrings in Camelot follow this format. Moreover, following the format will make sure that the API documentation is generated flawlessly.
- Make sure your commit messages follow [the seven rules of a great git commit message](https://chris.beams.io/posts/git-commit/):
- Separate subject from body with a blank line
- Limit the subject line to 50 characters
- Capitalize the subject line
- Do not end the subject line with a period
- Use the imperative mood in the subject line
- Wrap the body at 72 characters
- Use the body to explain what and why vs. how
- Please prefix the title of your pull request with [MRG] (Ready for Merge), if the contribution is complete and ready for a detailed review. An incomplete pull request's title should be prefixed with [WIP] (to indicate a work in progress), and changed to [MRG] when it's complete. A good [task list](https://blog.github.com/2013-01-09-task-lists-in-gfm-issues-pulls-comments/) in the PR description will ensure that other people get a better idea of what it proposes to do, which will also increase collaboration.
- If contributing new functionality, make sure that you add a unit test for it, while making sure that all previous tests pass. Camelot uses [pytest](https://docs.pytest.org/en/latest/) for testing. Tests can be run using:
$ python setup.py test
## Writing Documentation
Writing documentation, function docstrings, examples and tutorials is a great way to start contributing to open-source software! The documentation is present inside the `docs/` directory of the project repository.
It is written in [reStructuredText](https://en.wikipedia.org/wiki/ReStructuredText), with [Sphinx](http://www.sphinx-doc.org/en/master/) used to generate these lovely HTML files that you're currently reading (unless you're reading this on GitHub). You can edit the documentation using any text editor and then generate the HTML output by running `make html` in the `docs/` directory.
The function docstrings are written using the [numpydoc](https://numpydoc.readthedocs.io/en/latest/format.html) extension for Sphinx. Make sure you check out its format guidelines before you start writing one.
## Filing Issues
We use [GitHub issues](https://github.com/camelot-dev/camelot/issues) to keep track of all issues and pull requests. Before opening an issue (which asks a question or reports a bug), please use GitHub search to look for existing issues (both open and closed) that may be similar.
### Questions
Please don't use GitHub issues for support questions. A better place for them would be [Stack Overflow](http://stackoverflow.com). Make sure you tag them using the `python-camelot` tag.
### Bug Reports
In bug reports, make sure you include:
- Your operating system type and Python version number, along with the version numbers of NumPy, OpenCV and Camelot. You can use the following code snippet to find this information:
import platform; print(platform.platform())
import sys; print('Python', sys.version)
import numpy; print('NumPy', numpy.__version__)
import cv2; print('OpenCV', cv2.__version__)
import camelot; print('Camelot', camelot.__version__)
- The complete traceback. Just adding the exception message or a part of the traceback won't help us fix your issue sooner.
- Steps to reproduce the bug, using code snippets. See [Creating and highlighting code blocks](https://help.github.com/articles/creating-and-highlighting-code-blocks/).
- A link to the PDF document that you were trying to extract tables from, telling us what you expected the code to do and what actually happened.
camelot-py-0.11.0/HISTORY.md 0000775 0000000 0000000 00000037060 14376573757 0015367 0 ustar 00root root 0000000 0000000 Release History
===============
master
------
0.11.0 (2023-02-26)
------------------
- Replace `PdfFileReader` with `PdfReader` and pin PyPDF to `>=3.0.0`. [#307](https://github.com/camelot-dev/camelot/pull/307) by [Martin Thoma](https://github.com/MartinThoma).
0.10.1 (2021-07-11)
------------------
- Change extra requirements from `cv` to `base`. You can use `pip install "camelot-py[base]"` to install everything required to run camelot.
0.10.0 (2021-07-11)
------------------
**Improvements**
- Add support for multiple image conversion backends. [#198](https://github.com/camelot-dev/camelot/pull/198) and [#253](https://github.com/camelot-dev/camelot/pull/253) by Vinayak Mehta.
- Add markdown export format. [#222](https://github.com/camelot-dev/camelot/pull/222/) by [Lucas Cimon](https://github.com/Lucas-C).
**Documentation**
- Add faq section. [#216](https://github.com/camelot-dev/camelot/pull/216) by [Stefano Fiorucci](https://github.com/anakin87).
0.9.0 (2021-06-15)
------------------
**Bugfixes**
- Fix use of resolution argument to generate image with ghostscript. [#231](https://github.com/camelot-dev/camelot/pull/231) by [Tiago Samaha Cordeiro](https://github.com/tiagosamaha).
- [#15](https://github.com/camelot-dev/camelot/issues/15) Fix duplicate strings being assigned to the same cell. [#206](https://github.com/camelot-dev/camelot/pull/206) by [Eduardo Gonzalez Lopez de Murillas](https://github.com/edugonza).
- Save plot when filename is specified. [#121](https://github.com/camelot-dev/camelot/pull/121) by [Jens Diemer](https://github.com/jedie).
- Close file streams explicitly. [#202](https://github.com/camelot-dev/camelot/pull/202) by [Martin Abente Lahaye](https://github.com/tchx84).
- Use correct re.sub signature. [#186](https://github.com/camelot-dev/camelot/pull/186) by [pevisscher](https://github.com/pevisscher).
- [#183](https://github.com/camelot-dev/camelot/issues/183) Fix UnicodeEncodeError when using Stream flavor by adding encoding kwarg to `to_html`. [#188](https://github.com/camelot-dev/camelot/pull/188) by [Stefano Fiorucci](https://github.com/anakin87).
- [#179](https://github.com/camelot-dev/camelot/issues/179) Fix `max() arg is an empty sequence` error on PDFs with blank pages. [#189](https://github.com/camelot-dev/camelot/pull/189) by Vinayak Mehta.
**Improvements**
- Add `line_overlap` and `boxes_flow` to `LAParams`. [#219](https://github.com/camelot-dev/camelot/pull/219) by [Arnie97](https://github.com/Arnie97).
- [Add bug report template.](https://github.com/camelot-dev/camelot/commit/0a3944e54d133b701edfe9c7546ff11289301ba8)
- Move from [Travis to GitHub Actions](https://github.com/camelot-dev/camelot/pull/241).
- Update `.readthedocs.yml` and [remove requirements.txt](https://github.com/camelot-dev/camelot/commit/7ab5db39d07baa4063f975e9e00f6073340e04c1#diff-cde814ef2f549dc093f5b8fc533b7e8f47e7b32a8081e0760e57d5c25a1139d9)
**Documentation**
- [#193](https://github.com/camelot-dev/camelot/issues/193) Add better checks to confirm proper installation of ghostscript. [#196](https://github.com/camelot-dev/camelot/pull/196) by [jimhall](https://github.com/jimhall).
- Update `advanced.rst` plotting examples. [#119](https://github.com/camelot-dev/camelot/pull/119) by [Jens Diemer](https://github.com/jedie).
0.8.2 (2020-07-27)
------------------
* Revert the changes in `0.8.1`.
0.8.1 (2020-07-21)
------------------
**Bugfixes**
* [#169](https://github.com/camelot-dev/camelot/issues/169) Fix import error caused by `pdfminer.six==20200720`. [#171](https://github.com/camelot-dev/camelot/pull/171) by Vinayak Mehta.
0.8.0 (2020-05-24)
------------------
**Improvements**
* Drop Python 2 support!
* Remove Python 2.7 and 3.5 support.
* Replace all instances of `.format` with f-strings.
* Remove all `__future__` imports.
* Fix HTTP 403 forbidden exception in read_pdf(url) and remove Python 2 urllib support.
* Fix test data.
**Bugfixes**
* Fix library discovery on Windows. [#32](https://github.com/camelot-dev/camelot/pull/32) by [KOLANICH](https://github.com/KOLANICH).
* Fix calling convention of callback functions. [#34](https://github.com/camelot-dev/camelot/pull/34) by [KOLANICH](https://github.com/KOLANICH).
0.7.3 (2019-07-07)
------------------
**Improvements**
* Camelot now follows the Black code style! [#1](https://github.com/camelot-dev/camelot/pull/1) and [#3](https://github.com/camelot-dev/camelot/pull/3).
**Bugfixes**
* Fix Click.HelpFormatter monkey-patch. [#5](https://github.com/camelot-dev/camelot/pull/5) by [Dimiter Naydenov](https://github.com/dimitern).
* Fix strip_text argument getting ignored. [#4](https://github.com/camelot-dev/camelot/pull/4) by [Dimiter Naydenov](https://github.com/dimitern).
* [#25](https://github.com/camelot-dev/camelot/issues/25) edge_tol skipped in read_pdf. [#26](https://github.com/camelot-dev/camelot/pull/26) by Vinayak Mehta.
* Fix pytest deprecation warning. [#2](https://github.com/camelot-dev/camelot/pull/2) by Vinayak Mehta.
* [#293](https://github.com/socialcopsdev/camelot/issues/293) Split text ignores all text to the right of last cut. [#294](https://github.com/socialcopsdev/camelot/pull/294) by Vinayak Mehta.
* [#277](https://github.com/socialcopsdev/camelot/issues/277) Sort TableList by order of tables in PDF. [#283](https://github.com/socialcopsdev/camelot/pull/283) by [Sym Roe](https://github.com/symroe).
* [#312](https://github.com/socialcopsdev/camelot/issues/312) `table_regions` throws `ValueError` when `flavor='stream'`. [#332](https://github.com/socialcopsdev/camelot/pull/332) by Vinayak Mehta.
0.7.2 (2019-01-10)
------------------
**Bugfixes**
* [#245](https://github.com/socialcopsdev/camelot/issues/245) Fix AttributeError for encrypted files. [#251](https://github.com/socialcopsdev/camelot/pull/251) by Yatin Taluja.
0.7.1 (2019-01-06)
------------------
**Bugfixes**
* Move ghostscript import to inside the function so Anaconda builds don't fail.
0.7.0 (2019-01-05)
------------------
**Improvements**
* [#240](https://github.com/socialcopsdev/camelot/issues/209) Add support to analyze only certain page regions to look for tables. [#243](https://github.com/socialcopsdev/camelot/pull/243) by Vinayak Mehta.
* You can use `table_regions` in `read_pdf()` to specify approximate page regions which may contain tables.
* Kwarg `line_size_scaling` is now called `line_scale`.
* [#212](https://github.com/socialcopsdev/camelot/issues/212) Add support to export as sqlite database. [#244](https://github.com/socialcopsdev/camelot/pull/244) by Vinayak Mehta.
* [#239](https://github.com/socialcopsdev/camelot/issues/239) Raise warning if PDF is image-based. [#240](https://github.com/socialcopsdev/camelot/pull/240) by Vinayak Mehta.
**Documentation**
* Remove mention of old mesh kwarg from docs. [#241](https://github.com/socialcopsdev/camelot/pull/241) by [fte10kso](https://github.com/fte10kso).
**Note**: The python wrapper to Ghostscript's C API is now vendorized under the `ext` module. This was done due to unavailability of the [ghostscript](https://pypi.org/project/ghostscript/) package on Anaconda. The code should be removed after we submit a recipe for it to conda-forge. With this release, the user doesn't need to ensure that the Ghostscript executable is available on the PATH variable.
0.6.0 (2018-12-24)
------------------
**Improvements**
* [#91](https://github.com/socialcopsdev/camelot/issues/91) Add support to read from url. [#236](https://github.com/socialcopsdev/camelot/pull/236) by Vinayak Mehta.
* [#229](https://github.com/socialcopsdev/camelot/issues/229), [#230](https://github.com/socialcopsdev/camelot/issues/230) and [#233](https://github.com/socialcopsdev/camelot/issues/233) New configuration parameters. [#234](https://github.com/socialcopsdev/camelot/pull/234) by Vinayak Mehta.
* `strip_text`: To define characters that should be stripped from each string.
* `edge_tol`: Tolerance parameter for extending textedges vertically.
* `resolution`: Resolution used for PDF to PNG conversion.
* Check out the [advanced docs](https://camelot-py.readthedocs.io/en/master/user/advanced.html#strip-characters-from-text) for usage details.
* [#170](https://github.com/socialcopsdev/camelot/issues/170) Add option to pass pdfminer layout kwargs. [#232](https://github.com/socialcopsdev/camelot/pull/232) by Vinayak Mehta.
* Keyword arguments for [pdfminer.layout.LAParams](https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33) can now be passed using `layout_kwargs` in `read_pdf()`.
* The `margins` keyword argument in `read_pdf()` is now deprecated.
0.5.0 (2018-12-13)
------------------
**Improvements**
* [#207](https://github.com/socialcopsdev/camelot/issues/207) Add a plot type for Stream text edges and detected table areas. [#224](https://github.com/socialcopsdev/camelot/pull/224) by Vinayak Mehta.
* [#204](https://github.com/socialcopsdev/camelot/issues/204) `suppress_warnings` is now called `suppress_stdout`. [#225](https://github.com/socialcopsdev/camelot/pull/225) by Vinayak Mehta.
**Bugfixes**
* [#217](https://github.com/socialcopsdev/camelot/issues/217) Fix IndexError when scale is large.
* [#105](https://github.com/socialcopsdev/camelot/issues/105), [#192](https://github.com/socialcopsdev/camelot/issues/192) and [#215](https://github.com/socialcopsdev/camelot/issues/215) in [#227](https://github.com/socialcopsdev/camelot/pull/227) by Vinayak Mehta.
**Documentation**
* Add pdfplumber comparison and update Tabula (stream) comparison. Check out the [wiki page](https://github.com/socialcopsdev/camelot/wiki/Comparison-with-other-PDF-Table-Extraction-libraries-and-tools).
0.4.1 (2018-12-05)
------------------
**Bugfixes**
* Add chardet to `install_requires` to fix [#210](https://github.com/socialcopsdev/camelot/issues/210). More details in [pdfminer.six#213](https://github.com/pdfminer/pdfminer.six/issues/213).
0.4.0 (2018-11-23)
------------------
**Improvements**
* [#102](https://github.com/socialcopsdev/camelot/issues/102) Detect tables automatically when Stream is used. [#206](https://github.com/socialcopsdev/camelot/pull/206) Add implementation of Anssi Nurminen's table detection algorithm by Vinayak Mehta.
0.3.2 (2018-11-04)
------------------
**Improvements**
* [#186](https://github.com/socialcopsdev/camelot/issues/186) Add `_bbox` attribute to table. [#193](https://github.com/socialcopsdev/camelot/pull/193) by Vinayak Mehta.
* You can use `table._bbox` to get coordinates of the detected table.
0.3.1 (2018-11-02)
------------------
**Improvements**
* Matplotlib is now an optional requirement. [#190](https://github.com/socialcopsdev/camelot/pull/190) by Vinayak Mehta.
* You can install it using `$ pip install camelot-py[plot]`.
* [#127](https://github.com/socialcopsdev/camelot/issues/127) Add tests for plotting. Coverage is now at 87%! [#179](https://github.com/socialcopsdev/camelot/pull/179) by [Suyash Behera](https://github.com/Suyash458).
0.3.0 (2018-10-28)
------------------
**Improvements**
* [#162](https://github.com/socialcopsdev/camelot/issues/162) Add password keyword argument. [#180](https://github.com/socialcopsdev/camelot/pull/180) by [rbares](https://github.com/rbares).
* An encrypted PDF can now be decrypted by passing `password='<PASSWORD>'` to `read_pdf` or `--password <PASSWORD>` to the command-line interface. (Limited encryption algorithm support from PyPDF2.)
* [#139](https://github.com/socialcopsdev/camelot/issues/139) Add suppress_warnings keyword argument. [#155](https://github.com/socialcopsdev/camelot/pull/155) by [Jonathan Lloyd](https://github.com/jonathanlloyd).
* Warnings raised by Camelot can now be suppressed by passing `suppress_warnings=True` to `read_pdf` or `--quiet` to the command-line interface.
* [#154](https://github.com/socialcopsdev/camelot/issues/154) The CLI can now be run using `python -m`. Try `python -m camelot --help`. [#159](https://github.com/socialcopsdev/camelot/pull/159) by [Parth P Panchal](https://github.com/pqrth).
* [#165](https://github.com/socialcopsdev/camelot/issues/165) Rename `table_area` to `table_areas`. [#171](https://github.com/socialcopsdev/camelot/pull/171) by [Parth P Panchal](https://github.com/pqrth).
**Bugfixes**
* Raise error if the ghostscript executable is not on the PATH variable. [#166](https://github.com/socialcopsdev/camelot/pull/166) by Vinayak Mehta.
* Convert filename to lowercase to check for PDF extension. [#169](https://github.com/socialcopsdev/camelot/pull/169) by [Vinicius Mesel](https://github.com/vmesel).
**Files**
* [#114](https://github.com/socialcopsdev/camelot/issues/114) Add Makefile and make codecov run only once. [#132](https://github.com/socialcopsdev/camelot/pull/132) by [Vaibhav Mule](https://github.com/vaibhavmule).
* Add .editorconfig. [#151](https://github.com/socialcopsdev/camelot/pull/151) by [KOLANICH](https://github.com/KOLANICH).
* Downgrade numpy version from 1.15.2 to 1.13.3.
* Add requirements.txt for readthedocs.
**Documentation**
* Add "Using conda" section to installation instructions.
* Add readthedocs badge.
0.2.3 (2018-10-08)
------------------
* Remove hard dependencies on requirements versions.
0.2.2 (2018-10-08)
------------------
**Bugfixes**
* Move opencv-python to extra\_requires. [#134](https://github.com/socialcopsdev/camelot/pull/134) by Vinayak Mehta.
0.2.1 (2018-10-05)
------------------
**Bugfixes**
* [#121](https://github.com/socialcopsdev/camelot/issues/121) Fix ghostscript subprocess call for Windows. [#124](https://github.com/socialcopsdev/camelot/pull/124) by Vinayak Mehta.
**Improvements**
* [#123](https://github.com/socialcopsdev/camelot/issues/123) Make PEP8 compatible. [#125](https://github.com/socialcopsdev/camelot/pull/125) by [Oshawk](https://github.com/Oshawk).
* [#110](https://github.com/socialcopsdev/camelot/issues/110) Add more tests. Coverage is now at 84%!
* Add tests for `__repr__`. [#128](https://github.com/socialcopsdev/camelot/pull/128) by [Vaibhav Mule](https://github.com/vaibhavmule).
* Add tests for CLI. [#122](https://github.com/socialcopsdev/camelot/pull/122) by [Vaibhav Mule](https://github.com/vaibhavmule) and [#117](https://github.com/socialcopsdev/camelot/pull/117) by Vinayak Mehta.
* Add tests for errors/warnings. [#113](https://github.com/socialcopsdev/camelot/pull/113) by Vinayak Mehta.
* Add tests for output formats and parser kwargs. [#126](https://github.com/socialcopsdev/camelot/pull/126) by Vinayak Mehta.
* Add Python 3.5 and 3.7 support. [#119](https://github.com/socialcopsdev/camelot/pull/119) by Vinayak Mehta.
* Add logging and warnings.
**Documentation**
* Copyedit all documentation. [#112](https://github.com/socialcopsdev/camelot/pull/112) by [Christine Garcia](https://github.com/christinegarcia).
* [#115](https://github.com/socialcopsdev/camelot/issues/115) Update issue labels in contributor's guide. [#116](https://github.com/socialcopsdev/camelot/pull/116) by [Johnny Metz](https://github.com/johnnymetz).
* Update installation instructions for Windows. [#124](https://github.com/socialcopsdev/camelot/pull/124) by Vinayak Mehta.
**Note**: This release also bumps the version for numpy from 1.13.3 to 1.15.2 and adds a MANIFEST.in. Also, openpyxl==2.5.8 is a new requirement and pytest-cov==2.6.0 is a new dev requirement.
0.2.0 (2018-09-28)
------------------
**Improvements**
* [#81](https://github.com/socialcopsdev/camelot/issues/81) Add Python 3.6 support. [#109](https://github.com/socialcopsdev/camelot/pull/109) by Vinayak Mehta.
0.1.2 (2018-09-25)
------------------
**Improvements**
* [#85](https://github.com/socialcopsdev/camelot/issues/85) Add Travis and Codecov.
0.1.1 (2018-09-24)
------------------
**Documentation**
* Add documentation fixes.
0.1.0 (2018-09-24)
------------------
* Rebirth!
camelot-py-0.11.0/LICENSE 0000664 0000000 0000000 00000002157 14376573757 0014705 0 ustar 00root root 0000000 0000000 MIT License
Copyright (c) 2019-2021 Camelot Developers
Copyright (c) 2018-2019 Peeply Private Ltd (Singapore)
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
camelot-py-0.11.0/MANIFEST.in 0000664 0000000 0000000 00000000104 14376573757 0015424 0 ustar 00root root 0000000 0000000 include MANIFEST.in README.md HISTORY.md LICENSE setup.py setup.cfg
camelot-py-0.11.0/Makefile 0000664 0000000 0000000 00000001301 14376573757 0015326 0 ustar 00root root 0000000 0000000 .PHONY: docs
# `docs` is already declared phony at the top of this file. The other
# command-style targets are declared here so that a file or directory with the
# same name can never make them look up to date.
.PHONY: install test publish

# INSTALL is the command used by the `install` target to set up the system
# dependencies (tk and ghostscript). It is selected from the kernel name
# reported by `uname -s`; anything other than Linux or Darwin only prints a
# reminder.
UNAME_S := $(shell uname -s)
ifeq ($(UNAME_S),Linux)
INSTALL := @sudo apt install python-tk python3-tk ghostscript
else ifeq ($(UNAME_S),Darwin)
INSTALL := @brew install tcl-tk ghostscript
else
INSTALL := @echo "Please install tk and ghostscript"
endif

# Install the system dependencies, upgrade pip, then install camelot from the
# working tree together with its "dev" extras.
install:
	$(INSTALL)
	pip install --upgrade pip
	pip install ".[dev]"

# Run pytest with coverage of the camelot package (terminal and XML reports),
# using .coveragerc as the coverage configuration.
test:
	pytest --verbose --cov-config .coveragerc --cov-report term --cov-report xml --cov=camelot --mpl

# Build the HTML documentation. $(MAKE) is used instead of a literal `make` so
# that flags such as -n and -j are passed on to the sub-make. printf is used
# for the banner because escape handling in `echo` differs between shells.
docs:
	$(MAKE) -C docs html
	@printf '\033[95m\n\nBuild successful! View the docs homepage at docs/_build/html/index.html.\n\033[0m\n'

# Build a source distribution, upload it with twine, then remove the build
# artefacts.
publish:
	pip install twine
	python setup.py sdist
	twine upload dist/*
	rm -fr build dist .egg camelot_py.egg-info
camelot-py-0.11.0/README.md 0000664 0000000 0000000 00000014126 14376573757 0015156 0 ustar 00root root 0000000 0000000
# Camelot: PDF Table Extraction for Humans
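[![tests](https://github.com/camelot-dev/camelot/actions/workflows/tests.yml/badge.svg)](https://github.com/camelot-dev/camelot/actions/workflows/tests.yml) [![Documentation Status](https://readthedocs.org/projects/camelot-py/badge/?version=master)](https://camelot-py.readthedocs.io/en/master/)
[![codecov.io](https://codecov.io/github/camelot-dev/camelot/badge.svg?branch=master&service=github)](https://codecov.io/github/camelot-dev/camelot?branch=master)
[![PyPI version](https://img.shields.io/pypi/v/camelot-py.svg)](https://pypi.org/project/camelot-py/) [![License](https://img.shields.io/pypi/l/camelot-py.svg)](https://pypi.org/project/camelot-py/) [![Python versions](https://img.shields.io/pypi/pyversions/camelot-py.svg)](https://pypi.org/project/camelot-py/) [![Gitter chat](https://badges.gitter.im/camelot-dev/Lobby.png)](https://gitter.im/camelot-dev/Lobby)
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/ambv/black)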
**Camelot** is a Python library that can help you extract tables from PDFs!
**Note:** You can also check out [Excalibur](https://github.com/camelot-dev/excalibur), the web interface to Camelot!
---
**Here's how you can extract tables from PDFs.** You can check out the PDF used in this example [here](https://github.com/camelot-dev/camelot/blob/master/docs/_static/pdf/foo.pdf).
>>> import camelot
>>> tables = camelot.read_pdf('foo.pdf')
>>> tables
<TableList n=1>
>>> tables.export('foo.csv', f='csv', compress=True) # json, excel, html, markdown, sqlite
>>> tables[0]
<Table shape=(7, 7)>
>>> tables[0].parsing_report
{
'accuracy': 99.02,
'whitespace': 12.24,
'order': 1,
'page': 1
}
>>> tables[0].to_csv('foo.csv') # to_json, to_excel, to_html, to_markdown, to_sqlite
>>> tables[0].df # get a pandas DataFrame!
| Cycle Name | KI (1/km) | Distance (mi) | Percent Fuel Savings | | | |
|------------|-----------|---------------|----------------------|-----------------|-----------------|----------------|
| | | | Improved Speed | Decreased Accel | Eliminate Stops | Decreased Idle |
| 2012_2 | 3.30 | 1.3 | 5.9% | 9.5% | 29.2% | 17.4% |
| 2145_1 | 0.68 | 11.2 | 2.4% | 0.1% | 9.5% | 2.7% |
| 4234_1 | 0.59 | 58.7 | 8.5% | 1.3% | 8.5% | 3.3% |
| 2032_2 | 0.17 | 57.8 | 21.7% | 0.3% | 2.7% | 1.2% |
| 4171_1 | 0.07 | 173.9 | 58.1% | 1.6% | 2.1% | 0.5% |
Camelot also comes packaged with a [command-line interface](https://camelot-py.readthedocs.io/en/master/user/cli.html)!
**Note:** Camelot only works with text-based PDFs and not scanned documents. (As Tabula [explains](https://github.com/tabulapdf/tabula#why-tabula), "If you can click and drag to select text in your table in a PDF viewer, then your PDF is text-based".)
You can check out some frequently asked questions [here](https://camelot-py.readthedocs.io/en/master/user/faq.html).
## Why Camelot?
- **Configurability**: Camelot gives you control over the table extraction process with [tweakable settings](https://camelot-py.readthedocs.io/en/master/user/advanced.html).
- **Metrics**: You can discard bad tables based on metrics like accuracy and whitespace, without having to manually look at each table.
- **Output**: Each table is extracted into a **pandas DataFrame**, which seamlessly integrates into [ETL and data analysis workflows](https://gist.github.com/vinayak-mehta/e5949f7c2410a0e12f25d3682dc9e873). You can also export tables to multiple formats, which include CSV, JSON, Excel, HTML, Markdown, and Sqlite.
See [comparison with similar libraries and tools](https://github.com/camelot-dev/camelot/wiki/Comparison-with-other-PDF-Table-Extraction-libraries-and-tools).
## Support the development
If Camelot has helped you, please consider supporting its development with a one-time or monthly donation [on OpenCollective](https://opencollective.com/camelot).
## Installation
### Using conda
The easiest way to install Camelot is with [conda](https://conda.io/docs/), which is a package manager and environment management system for the [Anaconda](http://docs.continuum.io/anaconda/) distribution.
$ conda install -c conda-forge camelot-py
### Using pip
After [installing the dependencies](https://camelot-py.readthedocs.io/en/master/user/install-deps.html) ([tk](https://packages.ubuntu.com/bionic/python/python-tk) and [ghostscript](https://www.ghostscript.com/)), you can also just use pip to install Camelot:
$ pip install "camelot-py[base]"
### From the source code
After [installing the dependencies](https://camelot-py.readthedocs.io/en/master/user/install.html#using-pip), clone the repo using:
$ git clone https://www.github.com/camelot-dev/camelot
and install Camelot using pip:
$ cd camelot
$ pip install ".[base]"
## Documentation
The documentation is available at [http://camelot-py.readthedocs.io/](http://camelot-py.readthedocs.io/).
## Wrappers
- [camelot-php](https://github.com/randomstate/camelot-php) provides a [PHP](https://www.php.net/) wrapper on Camelot.
## Contributing
The [Contributor's Guide](https://camelot-py.readthedocs.io/en/master/dev/contributing.html) has detailed information about contributing issues, documentation, code, and tests.
## Versioning
Camelot uses [Semantic Versioning](https://semver.org/). For the available versions, see the tags on this repository. For the changelog, you can check out [HISTORY.md](https://github.com/camelot-dev/camelot/blob/master/HISTORY.md).
## License
This project is licensed under the MIT License, see the [LICENSE](https://github.com/camelot-dev/camelot/blob/master/LICENSE) file for details.
camelot-py-0.11.0/camelot/ 0000775 0000000 0000000 00000000000 14376573757 0015317 5 ustar 00root root 0000000 0000000 camelot-py-0.11.0/camelot/__init__.py 0000664 0000000 0000000 00000000727 14376573757 0017436 0 ustar 00root root 0000000 0000000 # -*- coding: utf-8 -*-
import logging
from .__version__ import __version__
from .io import read_pdf
from .plotting import PlotMethods
# set up logging
# Fetch the package-level "camelot" logger and attach a stream handler whose
# records are formatted as "<timestamp> - <level> - <message>", with the
# timestamp rendered as YYYY-MM-DDTHH:MM:SS.
logger = logging.getLogger("camelot")
format_string = "%(asctime)s - %(levelname)s - %(message)s"
formatter = logging.Formatter(format_string, datefmt="%Y-%m-%dT%H:%M:%S")
handler = logging.StreamHandler()
handler.setFormatter(formatter)
logger.addHandler(handler)
# instantiate plot method
# Module-level PlotMethods instance, bound to the name `plot` in this package.
plot = PlotMethods()
camelot-py-0.11.0/camelot/__main__.py 0000775 0000000 0000000 00000000217 14376573757 0017414 0 ustar 00root root 0000000 0000000 # -*- coding: utf-8 -*-
__all__ = ("main",)


def main():
    """Run the camelot click command group.

    The CLI module is imported at call time rather than at module import.
    """
    from camelot.cli import cli as run_cli

    run_cli()


if __name__ == "__main__":
    main()
camelot-py-0.11.0/camelot/__version__.py 0000664 0000000 0000000 00000001303 14376573757 0020147 0 ustar 00root root 0000000 0000000 # -*- coding: utf-8 -*-
VERSION = (0, 11, 0)
PRERELEASE = None # alpha, beta or rc
REVISION = None
def generate_version(version, prerelease=None, revision=None):
version_parts = [".".join(map(str, version))]
if prerelease is not None:
version_parts.append(f"-{prerelease}")
if revision is not None:
version_parts.append(f".{revision}")
return "".join(version_parts)
__title__ = "camelot-py"
__description__ = "PDF Table Extraction for Humans."
__url__ = "http://camelot-py.readthedocs.io/"
__version__ = generate_version(VERSION, prerelease=PRERELEASE, revision=REVISION)
__author__ = "Vinayak Mehta"
__author_email__ = "vmehta94@gmail.com"
__license__ = "MIT License"
camelot-py-0.11.0/camelot/backends/ 0000775 0000000 0000000 00000000000 14376573757 0017071 5 ustar 00root root 0000000 0000000 camelot-py-0.11.0/camelot/backends/__init__.py 0000664 0000000 0000000 00000000116 14376573757 0021200 0 ustar 00root root 0000000 0000000 # -*- coding: utf-8 -*-
from .image_conversion import ImageConversionBackend
camelot-py-0.11.0/camelot/backends/ghostscript_backend.py 0000664 0000000 0000000 00000002260 14376573757 0023463 0 ustar 00root root 0000000 0000000 # -*- coding: utf-8 -*-
import sys
import ctypes
from ctypes.util import find_library
def installed_posix():
    """Return True when ``find_library`` can locate a library named "gs"."""
    return find_library("gs") is not None
def installed_windows():
    """Return True when ``find_library`` can locate the Ghostscript DLL.

    The DLL name is ``gsdll<bits>.dll`` where ``<bits>`` is the size of a
    C void pointer in bits (e.g. ``gsdll64.dll`` when pointers are 8 bytes).
    """
    pointer_bits = ctypes.sizeof(ctypes.c_voidp) * 8
    dll_name = "gsdll" + str(pointer_bits) + ".dll"
    return find_library(dll_name) is not None
class GhostscriptBackend(object):
    """PDF to PNG conversion through the ``ghostscript`` Python module."""

    def installed(self):
        """Report whether the Ghostscript library can be located.

        "win32" uses the DLL lookup; every other ``sys.platform`` value
        uses the posix lookup.
        """
        if sys.platform == "win32":
            return installed_windows()
        return installed_posix()

    def convert(self, pdf_path, png_path, resolution=300):
        """Render ``pdf_path`` to ``png_path`` with the ``png16m`` device.

        Parameters
        ----------
        pdf_path : str
            Input PDF path, passed as the last Ghostscript argument.
        png_path : str
            Output path, passed after ``-o``.
        resolution : int, optional (default: 300)
            Value passed to Ghostscript as ``-r<resolution>``.

        Raises
        ------
        OSError
            If :meth:`installed` returns a falsy value.
        """
        if not self.installed():
            raise OSError(
                "Ghostscript is not installed. You can install it using the instructions"
                " here: https://camelot-py.readthedocs.io/en/master/user/install-deps.html"
            )

        # Imported only once the library is known to be present.
        import ghostscript

        ghostscript.Ghostscript(
            "gs",
            "-q",
            "-sDEVICE=png16m",
            "-o",
            png_path,
            f"-r{resolution}",
            pdf_path,
        )
camelot-py-0.11.0/camelot/backends/image_conversion.py 0000664 0000000 0000000 00000002746 14376573757 0023003 0 ustar 00root root 0000000 0000000 # -*- coding: utf-8 -*-
from .poppler_backend import PopplerBackend
from .ghostscript_backend import GhostscriptBackend
# Registry of image conversion backend classes, keyed by backend name.
BACKENDS = {"poppler": PopplerBackend, "ghostscript": GhostscriptBackend}
class ImageConversionBackend(object):
    """Converts a PDF to PNG with a primary backend and optional fallbacks.

    Parameters
    ----------
    backend : str, optional (default: 'poppler')
        Key of the primary backend in ``BACKENDS``.
    use_fallback : bool, optional (default: True)
        Whether the other registered backends are tried when the primary
        backend raises.

    Raises
    ------
    ValueError
        If ``backend`` is not a key of ``BACKENDS``.
    """

    def __init__(self, backend="poppler", use_fallback=True):
        if backend not in BACKENDS:
            raise ValueError(f"Image conversion backend '{backend}' not supported")

        self.backend = backend
        self.use_fallback = use_fallback
        # every registered backend except the primary one, in registry order
        self.fallbacks = [name for name in BACKENDS if name != backend]

    @staticmethod
    def _annotate(error, backend_name):
        """Rebuild ``error`` with the failing backend's name appended to its message."""
        return type(error)(
            str(error) + f" with image conversion backend '{backend_name}'"
        ).with_traceback(error.__traceback__)

    def convert(self, pdf_path, png_path):
        """Convert ``pdf_path`` to ``png_path``.

        The primary backend is tried first. If it raises and ``use_fallback``
        is set, each fallback is tried in turn until one succeeds. If every
        attempt fails, the last error is re-raised (same exception type) with
        the name of the backend that produced it appended to the message.
        """
        try:
            BACKENDS[self.backend]().convert(pdf_path, png_path)
            return
        except Exception as e:
            if not self.use_fallback:
                raise self._annotate(e, self.backend)
            failure, failed_backend = e, self.backend

        for fallback in self.fallbacks:
            try:
                BACKENDS[fallback]().convert(pdf_path, png_path)
                return
            except Exception as e:
                # remember the failure and move on to the next fallback
                failure, failed_backend = e, fallback

        # nothing succeeded: surface the most recent failure instead of
        # returning as if the conversion had worked
        raise self._annotate(failure, failed_backend) from failure
camelot-py-0.11.0/camelot/backends/poppler_backend.py 0000664 0000000 0000000 00000001250 14376573757 0022571 0 ustar 00root root 0000000 0000000 # -*- coding: utf-8 -*-
import shutil
import subprocess
class PopplerBackend(object):
    """PDF to PNG conversion through the ``pdftopng`` executable."""

    def convert(self, pdf_path, png_path):
        """Run ``pdftopng <pdf_path> <png_path>``.

        Parameters
        ----------
        pdf_path : str
            Input PDF path.
        png_path : str
            Output path argument handed to ``pdftopng``.

        Raises
        ------
        OSError
            If ``pdftopng`` is not found on PATH.
        ValueError
            If ``pdftopng`` exits with a non-zero status; the message is the
            captured stdout/stderr output.
        """
        pdftopng_executable = shutil.which("pdftopng")
        if pdftopng_executable is None:
            raise OSError(
                "pdftopng is not installed. You can install it using the 'pip install pdftopng' command."
            )

        # Pass the arguments as a list without a shell, so paths containing
        # spaces or shell metacharacters reach pdftopng unchanged.
        pdftopng_command = [pdftopng_executable, pdf_path, png_path]

        try:
            subprocess.check_output(pdftopng_command, stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError as e:
            raise ValueError(e.output) from e
camelot-py-0.11.0/camelot/cli.py 0000664 0000000 0000000 00000020311 14376573757 0016435 0 ustar 00root root 0000000 0000000 # -*- coding: utf-8 -*-
import logging
import click
# matplotlib is optional: record whether the import worked so the commands
# below can raise ImportError before `plt` is used.
try:
import matplotlib.pyplot as plt
except ImportError:
_HAS_MPL = False
else:
_HAS_MPL = True
from . import __version__, read_pdf, plot
# The "camelot" logger is set to INFO when this module is imported.
logger = logging.getLogger("camelot")
logger.setLevel(logging.INFO)
class Config(object):
    """Plain key/value holder for options parsed by the top-level command."""

    def __init__(self):
        self.config = dict()

    def set_config(self, key, value):
        """Store ``value`` under ``key``, replacing any previous value."""
        self.config.update({key: value})
# Decorator built by click for the Config type; used below on the
# `lattice` and `stream` commands, which receive the object as `c`.
pass_config = click.make_pass_decorator(Config)
@click.group(name="camelot")
@click.version_option(version=__version__)
# --quiet is a boolean switch: as a valued option, "-q" would consume the
# following argument (e.g. the subcommand name) as its value.
@click.option("-q", "--quiet", is_flag=True, help="Suppress logs and warnings.")
@click.option(
    "-p",
    "--pages",
    default="1",
    help="Comma-separated page numbers." " Example: 1,3,4 or 1,4-end or all.",
)
@click.option("-pw", "--password", help="Password for decryption.")
@click.option("-o", "--output", help="Output file path.")
@click.option(
    "-f",
    "--format",
    type=click.Choice(["csv", "excel", "html", "json", "markdown", "sqlite"]),
    help="Output file format.",
)
@click.option("-z", "--zip", is_flag=True, help="Create ZIP archive.")
@click.option(
    "-split",
    "--split_text",
    is_flag=True,
    help="Split text that spans across multiple cells.",
)
@click.option(
    "-flag",
    "--flag_size",
    is_flag=True,
    help="Flag text based on" " font size. Useful to detect super/subscripts.",
)
@click.option(
    "-strip",
    "--strip_text",
    help="Characters that should be stripped from a string before"
    " assigning it to a cell.",
)
@click.option(
    "-M",
    "--margins",
    nargs=3,
    default=(1.0, 0.5, 0.1),
    help="PDFMiner char_margin, line_margin and word_margin.",
)
@click.pass_context
def cli(ctx, *args, **kwargs):
    """Camelot: PDF Table Extraction for Humans"""
    # Stash every group-level option on a Config object so the subcommands
    # (which receive it through `pass_config`) can read them.
    ctx.obj = Config()
    for key, value in kwargs.items():
        ctx.obj.set_config(key, value)
# `lattice` subcommand: parses its own options, merges them with the options
# stored by the parent `cli` group, calls read_pdf(flavor="lattice") and then
# either plots the tables or exports them.
@cli.command("lattice")
@click.option(
"-R",
"--table_regions",
default=[],
multiple=True,
help="Page regions to analyze. Example: x1,y1,x2,y2"
" where x1, y1 -> left-top and x2, y2 -> right-bottom.",
)
@click.option(
"-T",
"--table_areas",
default=[],
multiple=True,
help="Table areas to process. Example: x1,y1,x2,y2"
" where x1, y1 -> left-top and x2, y2 -> right-bottom.",
)
@click.option(
"-back", "--process_background", is_flag=True, help="Process background lines."
)
@click.option(
"-scale",
"--line_scale",
default=15,
help="Line size scaling factor. The larger the value,"
" the smaller the detected lines.",
)
@click.option(
"-copy",
"--copy_text",
default=[],
type=click.Choice(["h", "v"]),
multiple=True,
help="Direction in which text in a spanning cell" " will be copied over.",
)
@click.option(
"-shift",
"--shift_text",
default=["l", "t"],
type=click.Choice(["", "l", "r", "t", "b"]),
multiple=True,
help="Direction in which text in a spanning cell will flow.",
)
@click.option(
"-l",
"--line_tol",
default=2,
help="Tolerance parameter used to merge close vertical" " and horizontal lines.",
)
@click.option(
"-j",
"--joint_tol",
default=2,
help="Tolerance parameter used to decide whether"
" the detected lines and points lie close to each other.",
)
@click.option(
"-block",
"--threshold_blocksize",
default=15,
help="For adaptive thresholding, size of a pixel"
" neighborhood that is used to calculate a threshold value for"
" the pixel. Example: 3, 5, 7, and so on.",
)
@click.option(
"-const",
"--threshold_constant",
default=-2,
help="For adaptive thresholding, constant subtracted"
" from the mean or weighted mean. Normally, it is positive but"
" may be zero or negative as well.",
)
@click.option(
"-I",
"--iterations",
default=0,
help="Number of times for erosion/dilation will be applied.",
)
@click.option(
"-res",
"--resolution",
default=300,
help="Resolution used for PDF to PNG conversion.",
)
@click.option(
"-plot",
"--plot_type",
type=click.Choice(["text", "grid", "contour", "joint", "line"]),
help="Plot elements found on PDF page for visual debugging.",
)
@click.argument("filepath", type=click.Path(exists=True))
@pass_config
def lattice(c, *args, **kwargs):
"""Use lines between text to parse the table."""
# Options stored by the parent `cli` group. pop() removes the ones handled
# here from the shared dict; whatever remains (password, split_text,
# flag_size, strip_text, margins) is merged into kwargs for read_pdf.
conf = c.config
pages = conf.pop("pages")
output = conf.pop("output")
f = conf.pop("format")
compress = conf.pop("zip")
quiet = conf.pop("quiet")
plot_type = kwargs.pop("plot_type")
filepath = kwargs.pop("filepath")
kwargs.update(conf)
# Multi-value options are converted to lists; an empty selection becomes
# None (shift_text is always passed as a list).
table_regions = list(kwargs["table_regions"])
kwargs["table_regions"] = None if not table_regions else table_regions
table_areas = list(kwargs["table_areas"])
kwargs["table_areas"] = None if not table_areas else table_areas
copy_text = list(kwargs["copy_text"])
kwargs["copy_text"] = None if not copy_text else copy_text
kwargs["shift_text"] = list(kwargs["shift_text"])
# Plotting needs matplotlib; exporting needs both --output and --format.
if plot_type is not None:
if not _HAS_MPL:
raise ImportError("matplotlib is required for plotting.")
else:
if output is None:
raise click.UsageError("Please specify output file path using --output")
if f is None:
raise click.UsageError("Please specify output file format using --format")
tables = read_pdf(
filepath, pages=pages, flavor="lattice", suppress_stdout=quiet, **kwargs
)
click.echo(f"Found {tables.n} tables")
# NOTE(review): indentation is lost in this copy -- confirm whether
# plt.show() runs once per table (inside the loop) or once after it.
if plot_type is not None:
for table in tables:
plot(table, kind=plot_type)
plt.show()
else:
tables.export(output, f=f, compress=compress)
# `stream` subcommand: parses its own options, merges them with the options
# stored by the parent `cli` group, calls read_pdf(flavor="stream") and then
# either plots the tables or exports them.
@cli.command("stream")
@click.option(
"-R",
"--table_regions",
default=[],
multiple=True,
help="Page regions to analyze. Example: x1,y1,x2,y2"
" where x1, y1 -> left-top and x2, y2 -> right-bottom.",
)
@click.option(
"-T",
"--table_areas",
default=[],
multiple=True,
help="Table areas to process. Example: x1,y1,x2,y2"
" where x1, y1 -> left-top and x2, y2 -> right-bottom.",
)
@click.option(
"-C",
"--columns",
default=[],
multiple=True,
help="X coordinates of column separators.",
)
@click.option(
"-e",
"--edge_tol",
default=50,
help="Tolerance parameter" " for extending textedges vertically.",
)
@click.option(
"-r",
"--row_tol",
default=2,
help="Tolerance parameter" " used to combine text vertically, to generate rows.",
)
@click.option(
"-c",
"--column_tol",
default=0,
help="Tolerance parameter"
" used to combine text horizontally, to generate columns.",
)
@click.option(
"-plot",
"--plot_type",
type=click.Choice(["text", "grid", "contour", "textedge"]),
help="Plot elements found on PDF page for visual debugging.",
)
@click.argument("filepath", type=click.Path(exists=True))
@pass_config
def stream(c, *args, **kwargs):
"""Use spaces between text to parse the table."""
# Options stored by the parent `cli` group. pop() removes the ones handled
# here from the shared dict; whatever remains (password, split_text,
# flag_size, strip_text, margins) is merged into kwargs for read_pdf.
conf = c.config
pages = conf.pop("pages")
output = conf.pop("output")
f = conf.pop("format")
compress = conf.pop("zip")
quiet = conf.pop("quiet")
plot_type = kwargs.pop("plot_type")
filepath = kwargs.pop("filepath")
kwargs.update(conf)
# Multi-value options are converted to lists; an empty selection becomes None.
table_regions = list(kwargs["table_regions"])
kwargs["table_regions"] = None if not table_regions else table_regions
table_areas = list(kwargs["table_areas"])
kwargs["table_areas"] = None if not table_areas else table_areas
columns = list(kwargs["columns"])
kwargs["columns"] = None if not columns else columns
# Plotting needs matplotlib; exporting needs both --output and --format.
if plot_type is not None:
if not _HAS_MPL:
raise ImportError("matplotlib is required for plotting.")
else:
if output is None:
raise click.UsageError("Please specify output file path using --output")
if f is None:
raise click.UsageError("Please specify output file format using --format")
tables = read_pdf(
filepath, pages=pages, flavor="stream", suppress_stdout=quiet, **kwargs
)
click.echo(f"Found {tables.n} tables")
# NOTE(review): indentation is lost in this copy -- confirm whether
# plt.show() runs once per table (inside the loop) or once after it.
if plot_type is not None:
for table in tables:
plot(table, kind=plot_type)
plt.show()
else:
tables.export(output, f=f, compress=compress)
camelot-py-0.11.0/camelot/core.py 0000664 0000000 0000000 00000057535 14376573757 0016640 0 ustar 00root root 0000000 0000000 # -*- coding: utf-8 -*-
import os
import sqlite3
import zipfile
import tempfile
from itertools import chain
from operator import itemgetter
import numpy as np
import pandas as pd
# minimum number of vertical textline intersections for a textedge
# to be considered valid
# (TextEdge.update_coords marks an edge valid once its intersection count
# is strictly greater than this value)
TEXTEDGE_REQUIRED_ELEMENTS = 4
# padding added to table area on the left, right and bottom
# (the top is padded separately, by a multiple of the average row height)
TABLE_AREA_PADDING = 10
class TextEdge(object):
    """Defines a text edge coordinates relative to a left-bottom
    origin. (PDF coordinate space)

    Parameters
    ----------
    x : float
        x-coordinate of the text edge.
    y0 : float
        y-coordinate of bottommost point.
    y1 : float
        y-coordinate of topmost point.
    align : string, optional (default: 'left')
        {'left', 'right', 'middle'}

    Attributes
    ----------
    intersections: int
        Number of intersections with horizontal text rows.
    is_valid: bool
        A text edge is valid once its intersection count exceeds
        TEXTEDGE_REQUIRED_ELEMENTS.

    """

    def __init__(self, x, y0, y1, align="left"):
        self.x = x
        self.y0 = y0
        self.y1 = y1
        self.align = align
        self.intersections = 0
        self.is_valid = False

    def __repr__(self):
        # coordinates are rounded for readability only
        x = round(self.x, 2)
        y0 = round(self.y0, 2)
        y1 = round(self.y1, 2)
        return (
            f"<{self.__class__.__name__} x={x} y0={y0} y1={y1}"
            f" align={self.align} valid={self.is_valid}>"
        )

    def update_coords(self, x, y0, edge_tol=50):
        """Updates the text edge's x and bottom y coordinates and sets
        the is_valid attribute.

        The update only happens when ``y0`` lies within ``edge_tol`` of the
        edge's current bottom; otherwise the edge is left untouched.
        """
        if np.isclose(self.y0, y0, atol=edge_tol):
            # running mean of the x positions seen so far
            self.x = (self.intersections * self.x + x) / float(self.intersections + 1)
            self.y0 = y0
            self.intersections += 1
            # a textedge is valid only if it extends uninterrupted
            # over a required number of textlines
            if self.intersections > TEXTEDGE_REQUIRED_ELEMENTS:
                self.is_valid = True
class TextEdges(object):
    """Defines a dict of left, right and middle text edges found on
    the PDF page. The dict has three keys based on the alignments,
    and each key's value is a list of camelot.core.TextEdge objects.

    Parameters
    ----------
    edge_tol : int, optional (default: 50)
        Tolerance forwarded to ``TextEdge.update_coords``.
    """

    def __init__(self, edge_tol=50):
        self.edge_tol = edge_tol
        self._textedges = {"left": [], "right": [], "middle": []}

    @staticmethod
    def get_x_coord(textline, align):
        """Returns the x coordinate of a text row based on the
        specified alignment.
        """
        x_left = textline.x0
        x_right = textline.x1
        x_middle = x_left + (x_right - x_left) / 2.0
        x_coord = {"left": x_left, "middle": x_middle, "right": x_right}
        return x_coord[align]

    def find(self, x_coord, align):
        """Returns the index of an existing text edge using
        the specified x coordinate and alignment, or None when no
        edge lies within 0.5 of ``x_coord``.
        """
        for i, te in enumerate(self._textedges[align]):
            if np.isclose(te.x, x_coord, atol=0.5):
                return i
        return None

    def add(self, textline, align):
        """Adds a new text edge to the current dict."""
        x = self.get_x_coord(textline, align)
        y0 = textline.y0
        y1 = textline.y1
        te = TextEdge(x, y0, y1, align=align)
        self._textedges[align].append(te)

    def update(self, textline):
        """Updates an existing text edge in the current dict, or adds
        a new one when no edge matches the textline's x coordinate.
        """
        for align in ["left", "right", "middle"]:
            x_coord = self.get_x_coord(textline, align)
            idx = self.find(x_coord, align)
            if idx is None:
                self.add(textline, align)
            else:
                self._textedges[align][idx].update_coords(
                    x_coord, textline.y0, edge_tol=self.edge_tol
                )

    def generate(self, textlines):
        """Generates the text edges dict based on horizontal text
        rows.
        """
        for tl in textlines:
            if len(tl.get_text().strip()) > 1:  # TODO: hacky
                self.update(tl)

    def get_relevant(self):
        """Returns the list of relevant text edges (all share the same
        alignment) based on which list intersects horizontal text rows
        the most.
        """
        # only valid edges contribute; key order decides ties (left first)
        intersections_sum = {
            align: sum(te.intersections for te in self._textedges[align] if te.is_valid)
            for align in ("left", "right", "middle")
        }

        # TODO: naive
        # get vertical textedges that intersect maximum number of
        # times with horizontal textlines
        relevant_align = max(intersections_sum.items(), key=itemgetter(1))[0]
        return self._textedges[relevant_align]

    def get_table_areas(self, textlines, relevant_textedges):
        """Returns a dict of interesting table areas on the PDF page
        calculated using relevant text edges.

        Keys are ``(x0, y0, x1, y1)`` tuples and values are None. Note that
        ``relevant_textedges`` is sorted in place. An empty ``textlines``
        is accepted (the average row height is then taken as 0).
        """

        def pad(area, average_row_height):
            x0 = area[0] - TABLE_AREA_PADDING
            y0 = area[1] - TABLE_AREA_PADDING
            x1 = area[2] + TABLE_AREA_PADDING
            # add a constant since table headers can be relatively up
            y1 = area[3] + average_row_height * 5
            return (x0, y0, x1, y1)

        # sort relevant textedges in reading order
        relevant_textedges.sort(key=lambda te: (-te.y0, te.x))

        table_areas = {}
        for te in relevant_textedges:
            if not te.is_valid:
                continue
            if not table_areas:
                table_areas[(te.x, te.y0, te.x, te.y1)] = None
                continue
            found = None
            for area in table_areas:
                # check for overlap
                if te.y1 >= area[1] and te.y0 <= area[3]:
                    found = area
                    break
            if found is None:
                table_areas[(te.x, te.y0, te.x, te.y1)] = None
            else:
                # grow the overlapping area to include this edge
                table_areas.pop(found)
                updated_area = (
                    found[0],
                    min(te.y0, found[1]),
                    max(found[2], te.x),
                    max(found[3], te.y1),
                )
                table_areas[updated_area] = None

        # extend table areas based on textlines that overlap
        # vertically. it's possible that these textlines were
        # eliminated during textedges generation since numbers and
        # chars/words/sentences are often aligned differently.
        # drawback: table areas that have paragraphs on their sides
        # will include the paragraphs too.
        sum_textline_height = 0
        for tl in textlines:
            sum_textline_height += tl.y1 - tl.y0
            found = None
            for area in table_areas:
                # check for overlap
                if tl.y0 >= area[1] and tl.y1 <= area[3]:
                    found = area
                    break
            if found is not None:
                table_areas.pop(found)
                updated_area = (
                    min(tl.x0, found[0]),
                    min(tl.y0, found[1]),
                    max(found[2], tl.x1),
                    max(found[3], tl.y1),
                )
                table_areas[updated_area] = None

        # guard against a page without textlines (would divide by zero)
        if textlines:
            average_textline_height = sum_textline_height / float(len(textlines))
        else:
            average_textline_height = 0

        # add some padding to table areas
        table_areas_padded = {}
        for area in table_areas:
            table_areas_padded[pad(area, average_textline_height)] = None

        return table_areas_padded
class Cell(object):
    """Defines a cell in a table with coordinates relative to a
    left-bottom origin. (PDF coordinate space)

    Parameters
    ----------
    x1 : float
        x-coordinate of left-bottom point.
    y1 : float
        y-coordinate of left-bottom point.
    x2 : float
        x-coordinate of right-top point.
    y2 : float
        y-coordinate of right-top point.

    Attributes
    ----------
    lb : tuple
        Tuple representing left-bottom coordinates.
    lt : tuple
        Tuple representing left-top coordinates.
    rb : tuple
        Tuple representing right-bottom coordinates.
    rt : tuple
        Tuple representing right-top coordinates.
    left : bool
        Whether or not cell is bounded on the left.
    right : bool
        Whether or not cell is bounded on the right.
    top : bool
        Whether or not cell is bounded on the top.
    bottom : bool
        Whether or not cell is bounded on the bottom.
    hspan : bool
        Whether or not cell spans horizontally.
    vspan : bool
        Whether or not cell spans vertically.
    text : string
        Text assigned to cell. Assigning to this attribute appends to
        the existing text rather than replacing it.

    """

    def __init__(self, x1, y1, x2, y2):
        self.x1 = x1
        self.y1 = y1
        self.x2 = x2
        self.y2 = y2
        self.lb = (x1, y1)
        self.lt = (x1, y2)
        self.rb = (x2, y1)
        self.rt = (x2, y2)
        self.left = False
        self.right = False
        self.top = False
        self.bottom = False
        self.hspan = False
        self.vspan = False
        self._text = ""

    def __repr__(self):
        # coordinates are rounded to integers for readability only
        x1 = round(self.x1)
        y1 = round(self.y1)
        x2 = round(self.x2)
        y2 = round(self.y2)
        return f"<{self.__class__.__name__} x1={x1} y1={y1} x2={x2} y2={y2}>"

    @property
    def text(self):
        return self._text

    @text.setter
    def text(self, t):
        # accumulate: each assignment is concatenated onto the current text
        self._text = "".join([self._text, t])

    @property
    def bound(self):
        """The number of sides on which the cell is bounded."""
        return self.top + self.bottom + self.left + self.right
class Table(object):
    """Defines a table with coordinates relative to a left-bottom
    origin. (PDF coordinate space)

    Parameters
    ----------
    cols : list
        List of tuples representing column x-coordinates in increasing
        order.
    rows : list
        List of tuples representing row y-coordinates in decreasing
        order.

    Attributes
    ----------
    df : :class:`pandas.DataFrame`
    shape : tuple
        Shape of the table.
    accuracy : float
        Accuracy with which text was assigned to the cell.
    whitespace : float
        Percentage of whitespace in the table.
    order : int
        Table number on PDF page.
    page : int
        PDF page number.

    """

    def __init__(self, cols, rows):
        self.cols = cols
        self.rows = rows
        # one Cell per (row, column) pair; a row tuple is (top, bottom)
        self.cells = [[Cell(c[0], r[1], c[1], r[0]) for c in cols] for r in rows]
        self.df = None
        self.shape = (0, 0)
        self.accuracy = 0
        self.whitespace = 0
        self.order = None
        self.page = None

    def __repr__(self):
        return f"<{self.__class__.__name__} shape={self.shape}>"

    def __lt__(self, other):
        """Order tables by page, then by their order on the page."""
        if self.page == other.page:
            return self.order < other.order
        return self.page < other.page

    @property
    def data(self):
        """Returns two-dimensional list of strings in table."""
        return [[cell.text.strip() for cell in row] for row in self.cells]

    @property
    def parsing_report(self):
        """Returns a parsing report with %accuracy, %whitespace,
        table number on page and page number.
        """
        # pretty?
        report = {
            "accuracy": round(self.accuracy, 2),
            "whitespace": round(self.whitespace, 2),
            "order": self.order,
            "page": self.page,
        }
        return report

    def set_all_edges(self):
        """Sets all table edges to True."""
        for row in self.cells:
            for cell in row:
                cell.left = cell.right = cell.top = cell.bottom = True
        return self

    def set_edges(self, vertical, horizontal, joint_tol=2):
        """Sets a cell's edges to True depending on whether the cell's
        coordinates overlap with the line's coordinates within a
        tolerance.

        Parameters
        ----------
        vertical : list
            List of detected vertical lines.
        horizontal : list
            List of detected horizontal lines.
        joint_tol : int, optional (default: 2)
            Absolute tolerance used when matching line coordinates to
            row/column coordinates.

        Returns
        -------
        self
        """
        for v in vertical:
            # find closest x coord
            # iterate over y coords and find closest start and end points
            i = [
                idx
                for idx, t in enumerate(self.cols)
                if np.isclose(v[0], t[0], atol=joint_tol)
            ]
            j = [
                idx
                for idx, t in enumerate(self.rows)
                if np.isclose(v[3], t[0], atol=joint_tol)
            ]
            k = [
                idx
                for idx, t in enumerate(self.rows)
                if np.isclose(v[1], t[0], atol=joint_tol)
            ]
            if not j:
                continue
            # the line covers rows [start, stop); without a matching end
            # row it runs to the last row
            start = j[0]
            stop = k[0] if k else len(self.rows)
            if i == [0]:  # only left edge
                for r in range(start, stop):
                    self.cells[r][0].left = True
            elif i == []:  # only right edge
                last_col = len(self.cols) - 1
                for r in range(start, stop):
                    self.cells[r][last_col].right = True
            else:  # both left and right edges
                col = i[0]
                for r in range(start, stop):
                    self.cells[r][col].left = True
                    self.cells[r][col - 1].right = True

        for h in horizontal:
            # find closest y coord
            # iterate over x coords and find closest start and end points
            i = [
                idx
                for idx, t in enumerate(self.rows)
                if np.isclose(h[1], t[0], atol=joint_tol)
            ]
            j = [
                idx
                for idx, t in enumerate(self.cols)
                if np.isclose(h[0], t[0], atol=joint_tol)
            ]
            k = [
                idx
                for idx, t in enumerate(self.cols)
                if np.isclose(h[2], t[0], atol=joint_tol)
            ]
            if not j:
                continue
            # the line covers columns [start, stop); without a matching end
            # column it runs to the last column
            start = j[0]
            stop = k[0] if k else len(self.cols)
            if i == [0]:  # only top edge
                for c in range(start, stop):
                    self.cells[0][c].top = True
            elif i == []:  # only bottom edge
                last_row = len(self.rows) - 1
                for c in range(start, stop):
                    self.cells[last_row][c].bottom = True
            else:  # both top and bottom edges
                row = i[0]
                for c in range(start, stop):
                    self.cells[row][c].top = True
                    self.cells[row - 1][c].bottom = True

        return self

    def set_border(self):
        """Sets table border edges to True."""
        last_row = len(self.rows) - 1
        last_col = len(self.cols) - 1
        for r in range(len(self.rows)):
            self.cells[r][0].left = True
            self.cells[r][last_col].right = True
        for c in range(len(self.cols)):
            self.cells[0][c].top = True
            self.cells[last_row][c].bottom = True
        return self

    def set_span(self):
        """Sets a cell's hspan or vspan attribute to True depending
        on whether the cell spans horizontally or vertically.
        """
        for row in self.cells:
            for cell in row:
                left = cell.left
                right = cell.right
                top = cell.top
                bottom = cell.bottom
                if cell.bound == 4:
                    continue
                elif cell.bound == 3:
                    # exactly one side is open: the cell spans across it
                    if not left and (right and top and bottom):
                        cell.hspan = True
                    elif not right and (left and top and bottom):
                        cell.hspan = True
                    elif not top and (left and right and bottom):
                        cell.vspan = True
                    elif not bottom and (left and right and top):
                        cell.vspan = True
                elif cell.bound == 2:
                    if left and right and (not top and not bottom):
                        cell.vspan = True
                    elif top and bottom and (not left and not right):
                        cell.hspan = True
                elif cell.bound in [0, 1]:
                    cell.vspan = True
                    cell.hspan = True
        return self

    def to_csv(self, path, **kwargs):
        """Writes Table to a comma-separated values (csv) file.

        For kwargs, check :meth:`pandas.DataFrame.to_csv`.

        Parameters
        ----------
        path : str
            Output filepath.

        """
        kw = {"encoding": "utf-8", "index": False, "header": False, "quoting": 1}
        kw.update(kwargs)
        self.df.to_csv(path, **kw)

    def to_json(self, path, **kwargs):
        """Writes Table to a JSON file.

        For kwargs, check :meth:`pandas.DataFrame.to_json`.

        Parameters
        ----------
        path : str
            Output filepath.

        """
        kw = {"orient": "records"}
        kw.update(kwargs)
        json_string = self.df.to_json(**kw)
        # explicit encoding, matching to_html / to_markdown
        with open(path, "w", encoding="utf-8") as f:
            f.write(json_string)

    def to_excel(self, path, **kwargs):
        """Writes Table to an Excel file.

        For kwargs, check :meth:`pandas.DataFrame.to_excel`.

        Parameters
        ----------
        path : str
            Output filepath.

        """
        kw = {"sheet_name": f"page-{self.page}-table-{self.order}"}
        kw.update(kwargs)
        # the context manager saves and closes the workbook on exit
        with pd.ExcelWriter(path) as writer:
            self.df.to_excel(writer, **kw)

    def to_html(self, path, **kwargs):
        """Writes Table to an HTML file.

        For kwargs, check :meth:`pandas.DataFrame.to_html`.

        Parameters
        ----------
        path : str
            Output filepath.

        """
        html_string = self.df.to_html(**kwargs)
        with open(path, "w", encoding="utf-8") as f:
            f.write(html_string)

    def to_markdown(self, path, **kwargs):
        """Writes Table to a Markdown file.

        For kwargs, check :meth:`pandas.DataFrame.to_markdown`.

        Parameters
        ----------
        path : str
            Output filepath.

        """
        md_string = self.df.to_markdown(**kwargs)
        with open(path, "w", encoding="utf-8") as f:
            f.write(md_string)

    def to_sqlite(self, path, **kwargs):
        """Writes Table to sqlite database.

        For kwargs, check :meth:`pandas.DataFrame.to_sql`.

        Parameters
        ----------
        path : str
            Output filepath.

        """
        kw = {"if_exists": "replace", "index": False}
        kw.update(kwargs)
        conn = sqlite3.connect(path)
        try:
            table_name = f"page-{self.page}-table-{self.order}"
            self.df.to_sql(table_name, conn, **kw)
            conn.commit()
        finally:
            # release the database handle even when the write fails
            conn.close()
class TableList(object):
    """Defines a list of camelot.core.Table objects. Each table can
    be accessed using its index.

    Attributes
    ----------
    n : int
        Number of tables in the list.

    """

    def __init__(self, tables):
        self._tables = tables

    def __repr__(self):
        return f"<{self.__class__.__name__} n={self.n}>"

    def __len__(self):
        return len(self._tables)

    def __getitem__(self, idx):
        return self._tables[idx]

    @staticmethod
    def _format_func(table, f):
        # resolve the table's writer method, e.g. "csv" -> table.to_csv
        return getattr(table, f"to_{f}")

    @property
    def n(self):
        return len(self)

    def _write_file(self, f=None, **kwargs):
        """Write every table to ``<dirname>/<root>-page-P-table-T<ext>``."""
        dirname = kwargs.get("dirname")
        root = kwargs.get("root")
        ext = kwargs.get("ext")
        for table in self._tables:
            filename = f"{root}-page-{table.page}-table-{table.order}{ext}"
            filepath = os.path.join(dirname, filename)
            to_format = self._format_func(table, f)
            to_format(filepath)

    def _compress_dir(self, **kwargs):
        """Zip the per-table files from ``dirname`` into ``<root>.zip`` next to ``path``."""
        path = kwargs.get("path")
        dirname = kwargs.get("dirname")
        root = kwargs.get("root")
        ext = kwargs.get("ext")
        zipname = os.path.join(os.path.dirname(path), root) + ".zip"
        with zipfile.ZipFile(zipname, "w", allowZip64=True) as z:
            for table in self._tables:
                filename = f"{root}-page-{table.page}-table-{table.order}{ext}"
                filepath = os.path.join(dirname, filename)
                z.write(filepath, os.path.basename(filepath))

    def _export_to(self, f, compress, **kwargs):
        """Write the tables into ``kwargs['dirname']`` and zip them if requested."""
        path = kwargs["path"]
        dirname = kwargs["dirname"]
        root = kwargs["root"]
        basename = os.path.basename(path)

        if f in ["csv", "html", "json", "markdown"]:
            self._write_file(f=f, **kwargs)
            if compress:
                self._compress_dir(**kwargs)
            return

        if f == "excel":
            filepath = os.path.join(dirname, basename)
            # one workbook, one sheet per table; saved when the block exits
            with pd.ExcelWriter(filepath) as writer:
                for table in self._tables:
                    sheet_name = f"page-{table.page}-table-{table.order}"
                    table.df.to_excel(writer, sheet_name=sheet_name)
        elif f == "sqlite":
            filepath = os.path.join(dirname, basename)
            for table in self._tables:
                table.to_sqlite(filepath)
        else:
            # other format names write nothing
            return

        if compress:
            zipname = os.path.join(os.path.dirname(path), root) + ".zip"
            with zipfile.ZipFile(zipname, "w", allowZip64=True) as z:
                z.write(filepath, os.path.basename(filepath))

    def export(self, path, f="csv", compress=False):
        """Exports the list of tables to specified file format.

        Parameters
        ----------
        path : str
            Output filepath.
        f : str
            File format. Can be csv, excel, html, json, markdown or sqlite.
        compress : bool
            Whether or not to add files to a ZIP archive.

        """
        dirname = os.path.dirname(path)
        basename = os.path.basename(path)
        root, ext = os.path.splitext(basename)

        if not compress:
            self._export_to(
                f, False, path=path, dirname=dirname, root=root, ext=ext
            )
            return

        # files are staged in a temporary directory that is removed once
        # the archive has been written
        with tempfile.TemporaryDirectory() as staging:
            self._export_to(
                f, True, path=path, dirname=staging, root=root, ext=ext
            )
camelot-py-0.11.0/camelot/handlers.py 0000664 0000000 0000000 00000013702 14376573757 0017474 0 ustar 00root root 0000000 0000000 # -*- coding: utf-8 -*-
import os
import sys
from pypdf import PdfReader, PdfWriter
from .core import TableList
from .parsers import Stream, Lattice
from .utils import (
TemporaryDirectory,
get_page_layout,
get_text_objects,
get_rotation,
is_url,
download_url,
)
class PDFHandler(object):
"""Handles all operations like temp directory creation, splitting
file into single page PDFs, parsing each PDF and then removing the
temp directory.
Parameters
----------
filepath : str
Filepath or URL of the PDF file.
pages : str, optional (default: '1')
Comma-separated page numbers.
Example: '1,3,4' or '1,4-end' or 'all'.
password : str, optional (default: None)
Password for decryption.
"""
def __init__(self, filepath, pages="1", password=None):
    """Resolve the input file, store the password and parse ``pages``.

    A URL is downloaded first; the resulting path must end in ".pdf"
    (case-insensitive), otherwise NotImplementedError is raised.
    """
    if is_url(filepath):
        filepath = download_url(filepath)
    self.filepath = filepath
    if not filepath.lower().endswith(".pdf"):
        raise NotImplementedError("File format not supported")

    # a missing password is stored as the empty string
    self.password = "" if password is None else password
    self.pages = self._get_pages(pages)
def _get_pages(self, pages):
"""Converts pages string to list of ints.
Parameters
----------
filepath : str
Filepath or URL of the PDF file.
pages : str, optional (default: '1')
Comma-separated page numbers.
Example: '1,3,4' or '1,4-end' or 'all'.
Returns
-------
P : list
List of int page numbers.
"""
page_numbers = []
if pages == "1":
page_numbers.append({"start": 1, "end": 1})
else:
with open(self.filepath, "rb") as f:
infile = PdfReader(f, strict=False)
if infile.is_encrypted:
infile.decrypt(self.password)
if pages == "all":
page_numbers.append({"start": 1, "end": len(infile.pages)})
else:
for r in pages.split(","):
if "-" in r:
a, b = r.split("-")
if b == "end":
b = len(infile.pages)
page_numbers.append({"start": int(a), "end": int(b)})
else:
page_numbers.append({"start": int(r), "end": int(r)})
P = []
for p in page_numbers:
P.extend(range(p["start"], p["end"] + 1))
return sorted(set(P))
def _save_page(self, filepath, page, temp):
"""Saves specified page from PDF into a temporary directory.
Parameters
----------
filepath : str
Filepath or URL of the PDF file.
page : int
Page number.
temp : str
Tmp directory.
"""
with open(filepath, "rb") as fileobj:
infile = PdfReader(fileobj, strict=False)
if infile.is_encrypted:
infile.decrypt(self.password)
fpath = os.path.join(temp, f"page-{page}.pdf")
froot, fext = os.path.splitext(fpath)
p = infile.pages[page - 1]
outfile = PdfWriter()
outfile.add_page(p)
with open(fpath, "wb") as f:
outfile.write(f)
layout, dim = get_page_layout(fpath)
# fix rotated PDF
chars = get_text_objects(layout, ltype="char")
horizontal_text = get_text_objects(layout, ltype="horizontal_text")
vertical_text = get_text_objects(layout, ltype="vertical_text")
rotation = get_rotation(chars, horizontal_text, vertical_text)
if rotation != "":
fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])
os.rename(fpath, fpath_new)
instream = open(fpath_new, "rb")
infile = PdfReader(instream, strict=False)
if infile.is_encrypted:
infile.decrypt(self.password)
outfile = PdfWriter()
p = infile.pages[0]
if rotation == "anticlockwise":
p.rotate(90)
elif rotation == "clockwise":
p.rotate(-90)
outfile.add_page(p)
with open(fpath, "wb") as f:
outfile.write(f)
instream.close()
def parse(
self, flavor="lattice", suppress_stdout=False, layout_kwargs={}, **kwargs
):
"""Extracts tables by calling parser.get_tables on all single
page PDFs.
Parameters
----------
flavor : str (default: 'lattice')
The parsing method to use ('lattice' or 'stream').
Lattice is used by default.
suppress_stdout : str (default: False)
Suppress logs and warnings.
layout_kwargs : dict, optional (default: {})
A dict of `pdfminer.layout.LAParams `_ kwargs.
kwargs : dict
See camelot.read_pdf kwargs.
Returns
-------
tables : camelot.core.TableList
List of tables found in PDF.
"""
tables = []
with TemporaryDirectory() as tempdir:
for p in self.pages:
self._save_page(self.filepath, p, tempdir)
pages = [os.path.join(tempdir, f"page-{p}.pdf") for p in self.pages]
parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
for p in pages:
t = parser.extract_tables(
p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
)
tables.extend(t)
return TableList(sorted(tables))
camelot-py-0.11.0/camelot/image_processing.py 0000664 0000000 0000000 00000017052 14376573757 0021214 0 ustar 00root root 0000000 0000000 # -*- coding: utf-8 -*-
import cv2
import numpy as np
def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
    """Thresholds an image using OpenCV's adaptiveThreshold.

    Parameters
    ----------
    imagename : string
        Path to image file.
    process_background : bool, optional (default: False)
        Whether or not to process lines that are in background.
        When False, the grayscale image is inverted before thresholding.
    blocksize : int, optional (default: 15)
        Size of a pixel neighborhood that is used to calculate a
        threshold value for the pixel: 3, 5, 7, and so on.
        Passed straight to OpenCV's adaptiveThreshold.
    c : int, optional (default: -2)
        Constant subtracted from the mean or weighted mean.
        Normally, it is positive but may be zero or negative as well.
        Passed straight to OpenCV's adaptiveThreshold.

    Returns
    -------
    img : object
        numpy.ndarray representing the original image.
    threshold : object
        numpy.ndarray representing the thresholded image.

    """
    img = cv2.imread(imagename)
    grayscale = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Only the input differs between the two modes; every other argument
    # of the thresholding call is the same.
    source = grayscale if process_background else np.invert(grayscale)
    threshold = cv2.adaptiveThreshold(
        source,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        blocksize,
        c,
    )
    return img, threshold
def find_lines(
    threshold, regions=None, direction="horizontal", line_scale=15, iterations=0
):
    """Finds horizontal and vertical lines by applying morphological
    transformations on an image.

    Parameters
    ----------
    threshold : object
        numpy.ndarray representing the thresholded image.
    regions : list, optional (default: None)
        List of page regions that may contain tables. Each entry is
        unpacked as (x, y, w, h) in image coordinate space; pixels
        outside all regions are zeroed before line detection.
    direction : string, optional (default: 'horizontal')
        Specifies whether to find vertical or horizontal lines.
    line_scale : int, optional (default: 15)
        Factor by which the page dimensions will be divided to get
        smallest length of lines that should be detected.
        The larger this value, smaller the detected lines. Making it
        too large will lead to text being detected as lines.
    iterations : int, optional (default: 0)
        Number of times for erosion/dilation is applied.
        Passed to OpenCV's dilate when building the returned mask.

    Returns
    -------
    dmask : object
        numpy.ndarray representing pixels where vertical/horizontal
        lines lie.
    lines : list
        List of tuples representing vertical/horizontal lines with
        coordinates relative to a left-top origin in
        image coordinate space.

    Raises
    ------
    ValueError
        If direction is neither 'vertical' nor 'horizontal'.

    """
    # Validate up front. Previously only ``None`` was rejected and any
    # other invalid value failed later with an unbound-variable error.
    if direction not in ("vertical", "horizontal"):
        raise ValueError("Specify direction as either 'vertical' or 'horizontal'")

    # A 1-pixel-wide structuring element that is as long as the shortest
    # line we want to keep.
    if direction == "vertical":
        size = threshold.shape[0] // line_scale
        el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))
    else:
        size = threshold.shape[1] // line_scale
        el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))

    if regions is not None:
        region_mask = np.zeros(threshold.shape)
        for region in regions:
            x, y, w, h = region
            region_mask[y : y + h, x : x + w] = 1
        threshold = np.multiply(threshold, region_mask)

    # Erode then dilate with the same element (a morphological opening):
    # only strokes at least as long as the element survive.
    threshold = cv2.erode(threshold, el)
    threshold = cv2.dilate(threshold, el)
    dmask = cv2.dilate(threshold, el, iterations=iterations)

    # findContours returns (image, contours, hierarchy) on OpenCV 3.x and
    # (contours, hierarchy) on 2.x/4.x; the contours are always second from
    # last. Indexing avoids running the detection a second time, which the
    # former try/except unpacking did on 2.x/4.x.
    contours = cv2.findContours(
        threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )[-2]

    lines = []
    for c in contours:
        x, y, w, h = cv2.boundingRect(c)
        x1, x2 = x, x + w
        y1, y2 = y, y + h
        if direction == "vertical":
            # vertical line through the horizontal middle of the bounding box
            lines.append(((x1 + x2) // 2, y2, (x1 + x2) // 2, y1))
        else:
            # horizontal line through the vertical middle of the bounding box
            lines.append((x1, (y1 + y2) // 2, x2, (y1 + y2) // 2))

    return dmask, lines
def find_contours(vertical, horizontal):
    """Finds table boundaries using OpenCV's findContours.

    Parameters
    ----------
    vertical : object
        numpy.ndarray representing pixels where vertical lines lie.
    horizontal : object
        numpy.ndarray representing pixels where horizontal lines lie.

    Returns
    -------
    cont : list
        List of tuples representing table boundaries. Each tuple is of
        the form (x, y, w, h) where (x, y) -> left-top, w -> width and
        h -> height in image coordinate space. At most 10 boundaries
        (the largest by contour area) are returned.

    """
    mask = vertical + horizontal

    # The contours are the second-from-last element of findContours' return
    # value on every OpenCV major version (3.x prepends the image). Indexing
    # replaces the former try/except unpacking, which executed the whole
    # detection twice on OpenCV 2.x/4.x.
    contours = cv2.findContours(
        mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )[-2]

    # sort in reverse based on contour area and use first 10 contours
    contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]

    cont = []
    for c in contours:
        c_poly = cv2.approxPolyDP(c, 3, True)
        x, y, w, h = cv2.boundingRect(c_poly)
        cont.append((x, y, w, h))
    return cont
def find_joints(contours, vertical, horizontal):
    """Finds joints/intersections present inside each table boundary.

    Parameters
    ----------
    contours : list
        List of tuples representing table boundaries. Each tuple is of
        the form (x, y, w, h) where (x, y) -> left-top, w -> width and
        h -> height in image coordinate space.
    vertical : object
        numpy.ndarray representing pixels where vertical lines lie.
    horizontal : object
        numpy.ndarray representing pixels where horizontal lines lie.

    Returns
    -------
    tables : dict
        Dict with table boundaries as keys and list of intersections
        in that boundary as their value.
        Keys are of the form (x1, y1, x2, y2) where (x1, y1) -> lb
        and (x2, y2) -> rt in image coordinate space.

    """
    # A pixel is a joint only where both masks are non-zero.
    joints = np.multiply(vertical, horizontal)
    tables = {}
    for x, y, w, h in contours:
        roi = joints[y : y + h, x : x + w]
        # Contours are second-from-last in findContours' return value on
        # every OpenCV major version; indexing avoids the former try/except
        # fallback that ran the detection twice on OpenCV 2.x/4.x.
        jc = cv2.findContours(
            roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE
        )[-2]
        if len(jc) <= 4:  # skip boundaries with 4 or fewer joints
            continue
        joint_coords = []
        for j in jc:
            jx, jy, jw, jh = cv2.boundingRect(j)
            # centre of the joint's bounding box, shifted from ROI-relative
            # back to full-image coordinates
            c1, c2 = x + (2 * jx + jw) // 2, y + (2 * jy + jh) // 2
            joint_coords.append((c1, c2))
        tables[(x, y + h, x + w, y)] = joint_coords

    return tables
camelot-py-0.11.0/camelot/io.py 0000664 0000000 0000000 00000011275 14376573757 0016306 0 ustar 00root root 0000000 0000000 # -*- coding: utf-8 -*-
import warnings
from .handlers import PDFHandler
from .utils import validate_input, remove_extra
def read_pdf(
    filepath,
    pages="1",
    password=None,
    flavor="lattice",
    suppress_stdout=False,
    layout_kwargs={},
    **kwargs
):
    """Read PDF and return extracted tables.

    Note: kwargs annotated with ^ can only be used with flavor='stream'
    and kwargs annotated with * can only be used with flavor='lattice'.

    Parameters
    ----------
    filepath : str
        Filepath or URL of the PDF file.
    pages : str, optional (default: '1')
        Comma-separated page numbers.
        Example: '1,3,4' or '1,4-end' or 'all'.
    password : str, optional (default: None)
        Password for decryption.
    flavor : str (default: 'lattice')
        The parsing method to use ('lattice' or 'stream').
        Lattice is used by default.
    suppress_stdout : bool, optional (default: False)
        Suppress logs and warnings.
    layout_kwargs : dict, optional (default: {})
        A dict of pdfminer.layout.LAParams kwargs.
    table_areas : list, optional (default: None)
        List of table area strings of the form x1,y1,x2,y2
        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
        in PDF coordinate space.
    columns^ : list, optional (default: None)
        List of column x-coordinates strings where the coordinates
        are comma-separated.
    split_text : bool, optional (default: False)
        Split text that spans across multiple cells.
    flag_size : bool, optional (default: False)
        Flag text based on font size. Useful to detect
        super/subscripts. Flagged text is wrapped in markup.
    strip_text : str, optional (default: '')
        Characters that should be stripped from a string before
        assigning it to a cell.
    row_tol^ : int, optional (default: 2)
        Tolerance parameter used to combine text vertically,
        to generate rows.
    column_tol^ : int, optional (default: 0)
        Tolerance parameter used to combine text horizontally,
        to generate columns.
    process_background* : bool, optional (default: False)
        Process background lines.
    line_scale* : int, optional (default: 15)
        Line size scaling factor. The larger the value the smaller
        the detected lines. Making it very large will lead to text
        being detected as lines.
    copy_text* : list, optional (default: None)
        {'h', 'v'}
        Direction in which text in a spanning cell will be copied
        over.
    shift_text* : list, optional (default: ['l', 't'])
        {'l', 'r', 't', 'b'}
        Direction in which text in a spanning cell will flow.
    line_tol* : int, optional (default: 2)
        Tolerance parameter used to merge close vertical and horizontal
        lines.
    joint_tol* : int, optional (default: 2)
        Tolerance parameter used to decide whether the detected lines
        and points lie close to each other.
    threshold_blocksize* : int, optional (default: 15)
        Size of a pixel neighborhood that is used to calculate a
        threshold value for the pixel: 3, 5, 7, and so on.
        See OpenCV's adaptiveThreshold.
    threshold_constant* : int, optional (default: -2)
        Constant subtracted from the mean or weighted mean.
        Normally, it is positive but may be zero or negative as well.
        See OpenCV's adaptiveThreshold.
    iterations* : int, optional (default: 0)
        Number of times for erosion/dilation is applied.
        See OpenCV's dilate.
    resolution* : int, optional (default: 300)
        Resolution used for PDF to PNG conversion.

    Returns
    -------
    tables : camelot.core.TableList

    Raises
    ------
    NotImplementedError
        If flavor is neither 'lattice' nor 'stream'.

    """
    if flavor not in ("lattice", "stream"):
        raise NotImplementedError(
            "Unknown flavor specified. Use either 'lattice' or 'stream'"
        )

    # Warning filters changed below are restored when the context exits.
    with warnings.catch_warnings():
        if suppress_stdout:
            warnings.simplefilter("ignore")

        validate_input(kwargs, flavor=flavor)
        handler = PDFHandler(filepath, pages=pages, password=password)
        flavor_kwargs = remove_extra(kwargs, flavor=flavor)
        return handler.parse(
            flavor=flavor,
            suppress_stdout=suppress_stdout,
            layout_kwargs=layout_kwargs,
            **flavor_kwargs
        )
camelot-py-0.11.0/camelot/parsers/ 0000775 0000000 0000000 00000000000 14376573757 0016776 5 ustar 00root root 0000000 0000000 camelot-py-0.11.0/camelot/parsers/__init__.py 0000664 0000000 0000000 00000000121 14376573757 0021101 0 ustar 00root root 0000000 0000000 # -*- coding: utf-8 -*-
from .stream import Stream
from .lattice import Lattice
camelot-py-0.11.0/camelot/parsers/base.py 0000664 0000000 0000000 00000001412 14376573757 0020260 0 ustar 00root root 0000000 0000000 # -*- coding: utf-8 -*-
import os
from ..utils import get_page_layout, get_text_objects
class BaseParser(object):
    """Defines a base parser."""

    def _generate_layout(self, filename, layout_kwargs):
        """Analyse the layout of a single-page PDF and cache the results.

        Stores the layout, the page dimensions, the image and text objects
        found in the layout, and the paths derived from ``filename``
        (root name and the name of the matching ``.png``) on the instance.

        Parameters
        ----------
        filename : str
            Path to the PDF file to analyse.
        layout_kwargs : dict
            Keyword arguments forwarded to get_page_layout.

        """
        self.filename = filename
        self.layout_kwargs = layout_kwargs

        layout, dimensions = get_page_layout(filename, **layout_kwargs)
        self.layout = layout
        self.dimensions = dimensions

        for attr, ltype in (
            ("images", "image"),
            ("horizontal_text", "horizontal_text"),
            ("vertical_text", "vertical_text"),
        ):
            setattr(self, attr, get_text_objects(self.layout, ltype=ltype))

        self.pdf_width, self.pdf_height = self.dimensions
        self.rootname = os.path.splitext(self.filename)[0]
        self.imagename = self.rootname + ".png"
camelot-py-0.11.0/camelot/parsers/lattice.py 0000664 0000000 0000000 00000037663 14376573757 0021014 0 ustar 00root root 0000000 0000000 # -*- coding: utf-8 -*-
import os
import sys
import copy
import locale
import logging
import warnings
import numpy as np
import pandas as pd
from .base import BaseParser
from ..core import Table
from ..utils import (
scale_image,
scale_pdf,
segments_in_bbox,
text_in_bbox,
merge_close_lines,
get_table_index,
compute_accuracy,
compute_whitespace,
)
from ..image_processing import (
adaptive_threshold,
find_lines,
find_contours,
find_joints,
)
from ..backends.image_conversion import BACKENDS
logger = logging.getLogger("camelot")
class Lattice(BaseParser):
    """Lattice method of parsing looks for lines between text
    to parse the table.

    Parameters
    ----------
    table_regions : list, optional (default: None)
        List of page regions that may contain tables of the form x1,y1,x2,y2
        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
        in PDF coordinate space.
    table_areas : list, optional (default: None)
        List of table area strings of the form x1,y1,x2,y2
        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
        in PDF coordinate space.
    process_background : bool, optional (default: False)
        Process background lines.
    line_scale : int, optional (default: 15)
        Line size scaling factor. The larger the value the smaller
        the detected lines. Making it very large will lead to text
        being detected as lines.
    copy_text : list, optional (default: None)
        {'h', 'v'}
        Direction in which text in a spanning cell will be copied
        over.
    shift_text : list, optional (default: ['l', 't'])
        {'l', 'r', 't', 'b'}
        Direction in which text in a spanning cell will flow.
    split_text : bool, optional (default: False)
        Split text that spans across multiple cells.
    flag_size : bool, optional (default: False)
        Flag text based on font size. Useful to detect
        super/subscripts. Flagged text is wrapped in markup.
    strip_text : str, optional (default: '')
        Characters that should be stripped from a string before
        assigning it to a cell.
    line_tol : int, optional (default: 2)
        Tolerance parameter used to merge close vertical and horizontal
        lines.
    joint_tol : int, optional (default: 2)
        Tolerance parameter used to decide whether the detected lines
        and points lie close to each other.
    threshold_blocksize : int, optional (default: 15)
        Size of a pixel neighborhood that is used to calculate a
        threshold value for the pixel: 3, 5, 7, and so on.
        See OpenCV's adaptiveThreshold.
    threshold_constant : int, optional (default: -2)
        Constant subtracted from the mean or weighted mean.
        Normally, it is positive but may be zero or negative as well.
        See OpenCV's adaptiveThreshold.
    iterations : int, optional (default: 0)
        Number of times for erosion/dilation is applied.
        See OpenCV's dilate.
    resolution : int, optional (default: 300)
        Resolution used for PDF to PNG conversion.
    backend : str or object, optional (default: 'ghostscript')
        Name of a registered image conversion backend, or an object
        exposing a ``convert`` attribute.

    """

    def __init__(
        self,
        table_regions=None,
        table_areas=None,
        process_background=False,
        line_scale=15,
        copy_text=None,
        shift_text=["l", "t"],
        split_text=False,
        flag_size=False,
        strip_text="",
        line_tol=2,
        joint_tol=2,
        threshold_blocksize=15,
        threshold_constant=-2,
        iterations=0,
        resolution=300,
        backend="ghostscript",
        **kwargs,
    ):
        # Extra keyword arguments are accepted and ignored.
        self.table_regions = table_regions
        self.table_areas = table_areas
        self.process_background = process_background
        self.line_scale = line_scale
        self.copy_text = copy_text
        # The default list is shared between instances; this class only
        # reads it and never modifies it.
        self.shift_text = shift_text
        self.split_text = split_text
        self.flag_size = flag_size
        self.strip_text = strip_text
        self.line_tol = line_tol
        self.joint_tol = joint_tol
        self.threshold_blocksize = threshold_blocksize
        self.threshold_constant = threshold_constant
        self.iterations = iterations
        self.resolution = resolution
        self.backend = Lattice._get_backend(backend)

    @staticmethod
    def _get_backend(backend):
        """Resolves the image conversion backend.

        Parameters
        ----------
        backend : str or object
            Either a key of BACKENDS, or an object that has a ``convert``
            attribute.

        Returns
        -------
        backend : object
            A new instance of the registered backend when a name was
            given, otherwise the object that was passed in.

        Raises
        ------
        NotImplementedError
            If the name is not registered, or the object has no
            ``convert`` attribute.

        """
        if isinstance(backend, str):
            if backend not in BACKENDS:
                raise NotImplementedError(
                    f"Unknown backend '{backend}' specified. Please use either 'poppler' or 'ghostscript'."
                )

            if backend == "ghostscript":
                warnings.warn(
                    "'ghostscript' will be replaced by 'poppler' as the default image conversion"
                    " backend in v0.12.0. You can try out 'poppler' with backend='poppler'.",
                    DeprecationWarning,
                )

            return BACKENDS[backend]()

        # Custom backend object: only the presence of a 'convert' attribute
        # is checked, same as before.
        if "convert" not in dir(backend):
            raise NotImplementedError(
                f"'{backend}' must implement a 'convert' method"
            )
        return backend

    @staticmethod
    def _reduce_index(t, idx, shift_text):
        """Reduces index of a text object if it lies within a spanning
        cell.

        Parameters
        ----------
        t : camelot.core.Table
        idx : list
            List of tuples of the form (r_idx, c_idx, text).
        shift_text : list
            {'l', 'r', 't', 'b'}
            Select one or more strings from above and pass them as a
            list to specify where the text in a spanning cell should
            flow.

        Returns
        -------
        indices : list
            List of tuples of the form (r_idx, c_idx, text) where
            r_idx and c_idx are new row and column indices for text.

        """
        indices = []
        for r_idx, c_idx, text in idx:
            # Directions are applied in the given order; each one walks the
            # index until it reaches the cell that has that edge set.
            for d in shift_text:
                if d == "l":
                    if t.cells[r_idx][c_idx].hspan:
                        while not t.cells[r_idx][c_idx].left:
                            c_idx -= 1
                if d == "r":
                    if t.cells[r_idx][c_idx].hspan:
                        while not t.cells[r_idx][c_idx].right:
                            c_idx += 1
                if d == "t":
                    if t.cells[r_idx][c_idx].vspan:
                        while not t.cells[r_idx][c_idx].top:
                            r_idx -= 1
                if d == "b":
                    if t.cells[r_idx][c_idx].vspan:
                        while not t.cells[r_idx][c_idx].bottom:
                            r_idx += 1
            indices.append((r_idx, c_idx, text))
        return indices

    @staticmethod
    def _copy_spanning_text(t, copy_text=None):
        """Copies over text in empty spanning cells.

        Parameters
        ----------
        t : camelot.core.Table
        copy_text : list, optional (default: None)
            {'h', 'v'}
            Select one or more strings from above and pass them as a list
            to specify the direction in which text should be copied over
            when a cell spans multiple rows or columns. None copies
            nothing.

        Returns
        -------
        t : camelot.core.Table
            The same table object, modified in place.

        """
        # `copy_text or ()` makes the documented default (None) a no-op
        # instead of a TypeError when iterating.
        for f in copy_text or ():
            if f == "h":
                for i in range(len(t.cells)):
                    for j in range(len(t.cells[i])):
                        if t.cells[i][j].text.strip() == "":
                            if t.cells[i][j].hspan and not t.cells[i][j].left:
                                t.cells[i][j].text = t.cells[i][j - 1].text
            elif f == "v":
                for i in range(len(t.cells)):
                    for j in range(len(t.cells[i])):
                        if t.cells[i][j].text.strip() == "":
                            if t.cells[i][j].vspan and not t.cells[i][j].top:
                                t.cells[i][j].text = t.cells[i - 1][j].text
        return t

    def _generate_table_bbox(self):
        """Detects table boundaries and line segments on the page image.

        Sets ``image``, ``threshold``, ``table_bbox_unscaled``,
        ``table_bbox``, ``vertical_segments`` and ``horizontal_segments``
        on the instance.
        """

        def scale_areas(areas):
            # 'x1,y1,x2,y2' strings in PDF space -> (x, y, w, h) tuples in
            # image space.
            scaled_areas = []
            for area in areas:
                x1, y1, x2, y2 = area.split(",")
                x1 = float(x1)
                y1 = float(y1)
                x2 = float(x2)
                y2 = float(y2)
                x1, y1, x2, y2 = scale_pdf((x1, y1, x2, y2), image_scalers)
                scaled_areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
            return scaled_areas

        self.image, self.threshold = adaptive_threshold(
            self.imagename,
            process_background=self.process_background,
            blocksize=self.threshold_blocksize,
            c=self.threshold_constant,
        )

        image_width = self.image.shape[1]
        image_height = self.image.shape[0]
        image_width_scaler = image_width / float(self.pdf_width)
        image_height_scaler = image_height / float(self.pdf_height)
        pdf_width_scaler = self.pdf_width / float(image_width)
        pdf_height_scaler = self.pdf_height / float(image_height)
        image_scalers = (image_width_scaler, image_height_scaler, self.pdf_height)
        pdf_scalers = (pdf_width_scaler, pdf_height_scaler, image_height)

        # Regions restrict line detection only when no explicit table areas
        # were given; with table_areas the whole page is searched.
        regions = None
        if self.table_areas is None and self.table_regions is not None:
            regions = scale_areas(self.table_regions)

        vertical_mask, vertical_segments = find_lines(
            self.threshold,
            regions=regions,
            direction="vertical",
            line_scale=self.line_scale,
            iterations=self.iterations,
        )
        horizontal_mask, horizontal_segments = find_lines(
            self.threshold,
            regions=regions,
            direction="horizontal",
            line_scale=self.line_scale,
            iterations=self.iterations,
        )

        if self.table_areas is None:
            # auto-detect table boundaries from the line masks
            boundaries = find_contours(vertical_mask, horizontal_mask)
        else:
            # user-supplied boundaries
            boundaries = scale_areas(self.table_areas)
        table_bbox = find_joints(boundaries, vertical_mask, horizontal_mask)

        # Keep an image-space copy before converting to PDF space.
        self.table_bbox_unscaled = copy.deepcopy(table_bbox)

        self.table_bbox, self.vertical_segments, self.horizontal_segments = scale_image(
            table_bbox, vertical_segments, horizontal_segments, pdf_scalers
        )

    def _generate_columns_and_rows(self, table_idx, tk):
        """Builds the column and row intervals for one table boundary.

        Parameters
        ----------
        table_idx : int
            Index of the table on the page (not used here).
        tk : tuple
            Table boundary key from ``self.table_bbox``.

        Returns
        -------
        cols : list
            List of (left, right) column x-coordinate tuples.
        rows : list
            List of row y-coordinate tuples, in descending y order.
        v_s : list
            Vertical segments inside the boundary.
        h_s : list
            Horizontal segments inside the boundary.

        """
        # select elements which lie within table_bbox
        t_bbox = {}
        v_s, h_s = segments_in_bbox(
            tk, self.vertical_segments, self.horizontal_segments
        )
        t_bbox["horizontal"] = text_in_bbox(tk, self.horizontal_text)
        t_bbox["vertical"] = text_in_bbox(tk, self.vertical_text)

        t_bbox["horizontal"].sort(key=lambda x: (-x.y0, x.x0))
        t_bbox["vertical"].sort(key=lambda x: (x.x0, -x.y0))

        self.t_bbox = t_bbox

        # joints give the candidate grid coordinates; the boundary itself
        # contributes the outer edges
        cols, rows = zip(*self.table_bbox[tk])
        cols, rows = list(cols), list(rows)
        cols.extend([tk[0], tk[2]])
        rows.extend([tk[1], tk[3]])
        # sort horizontal and vertical segments
        cols = merge_close_lines(sorted(cols), line_tol=self.line_tol)
        rows = merge_close_lines(sorted(rows, reverse=True), line_tol=self.line_tol)
        # make grid using x and y coord of shortlisted rows and cols
        cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
        rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)]

        return cols, rows, v_s, h_s

    def _generate_table(self, table_idx, cols, rows, **kwargs):
        """Builds a Table from the grid and fills it with the page text.

        Parameters
        ----------
        table_idx : int
            Zero-based index of the table on the page.
        cols : list
            List of column x-coordinate tuples.
        rows : list
            List of row y-coordinate tuples.
        v_s, h_s : list (keyword arguments, required)
            Vertical and horizontal segments inside the table boundary.

        Returns
        -------
        table : camelot.core.Table

        Raises
        ------
        ValueError
            If v_s or h_s is missing or None.

        """
        v_s = kwargs.get("v_s")
        h_s = kwargs.get("h_s")
        if v_s is None or h_s is None:
            raise ValueError("No segments found on {}".format(self.rootname))

        table = Table(cols, rows)
        # set table edges to True using ver+hor lines
        table = table.set_edges(v_s, h_s, joint_tol=self.joint_tol)
        # set table border edges to True
        table = table.set_border()
        # set spanning cells to True
        table = table.set_span()

        pos_errors = []
        # TODO: have a single list in place of two directional ones?
        # sorted on x-coordinate based on reading order i.e. LTR or RTL
        for direction in ["vertical", "horizontal"]:
            for t in self.t_bbox[direction]:
                indices, error = get_table_index(
                    table,
                    t,
                    direction,
                    split_text=self.split_text,
                    flag_size=self.flag_size,
                    strip_text=self.strip_text,
                )
                # NOTE(review): `indices` is iterated below as a list of
                # (r_idx, c_idx, text) tuples, so comparing its slice with a
                # tuple looks always true - confirm against get_table_index.
                if indices[:2] != (-1, -1):
                    pos_errors.append(error)
                    indices = Lattice._reduce_index(
                        table, indices, shift_text=self.shift_text
                    )
                    for r_idx, c_idx, text in indices:
                        table.cells[r_idx][c_idx].text = text
        accuracy = compute_accuracy([[100, pos_errors]])

        if self.copy_text is not None:
            table = Lattice._copy_spanning_text(table, copy_text=self.copy_text)

        data = table.data
        table.df = pd.DataFrame(data)
        table.shape = table.df.shape

        whitespace = compute_whitespace(data)
        table.flavor = "lattice"
        table.accuracy = accuracy
        table.whitespace = whitespace
        table.order = table_idx + 1
        # the page number is taken from the 'page-<n>' file name
        table.page = int(os.path.basename(self.rootname).replace("page-", ""))

        # for plotting
        _text = []
        _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
        _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
        table._text = _text
        table._image = (self.image, self.table_bbox_unscaled)
        table._segments = (self.vertical_segments, self.horizontal_segments)
        table._textedges = None

        return table

    def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}):
        """Extracts all tables from a single-page PDF.

        Parameters
        ----------
        filename : str
            Path to the single-page PDF.
        suppress_stdout : bool, optional (default: False)
            When True, the 'Processing ...' log message is not emitted.
        layout_kwargs : dict, optional (default: {})
            Keyword arguments forwarded to the layout analysis; not
            modified here.

        Returns
        -------
        tables : list
            List of camelot.core.Table, ordered by descending y-coordinate
            of the table boundary. Empty if the page has no horizontal
            text.

        """
        self._generate_layout(filename, layout_kwargs)
        if not suppress_stdout:
            logger.info("Processing {}".format(os.path.basename(self.rootname)))

        if not self.horizontal_text:
            if self.images:
                warnings.warn(
                    "{} is image-based, camelot only works on"
                    " text-based pages.".format(os.path.basename(self.rootname))
                )
            else:
                warnings.warn(
                    "No tables found on {}".format(os.path.basename(self.rootname))
                )
            return []

        # render the page to an image, then detect lines on it
        self.backend.convert(self.filename, self.imagename)

        self._generate_table_bbox()

        _tables = []
        # sort tables based on y-coord
        for table_idx, tk in enumerate(
            sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
        ):
            cols, rows, v_s, h_s = self._generate_columns_and_rows(table_idx, tk)
            table = self._generate_table(table_idx, cols, rows, v_s=v_s, h_s=h_s)
            table._bbox = tk
            _tables.append(table)

        return _tables
camelot-py-0.11.0/camelot/parsers/stream.py 0000664 0000000 0000000 00000041051 14376573757 0020644 0 ustar 00root root 0000000 0000000 # -*- coding: utf-8 -*-
import os
import logging
import warnings
import numpy as np
import pandas as pd
from .base import BaseParser
from ..core import TextEdges, Table
from ..utils import text_in_bbox, get_table_index, compute_accuracy, compute_whitespace
logger = logging.getLogger("camelot")
class Stream(BaseParser):
"""Stream method of parsing looks for spaces between text
to parse the table.
If you want to specify columns when specifying multiple table
areas, make sure that the length of both lists are equal.
Parameters
----------
table_regions : list, optional (default: None)
List of page regions that may contain tables of the form x1,y1,x2,y2
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
in PDF coordinate space.
table_areas : list, optional (default: None)
List of table area strings of the form x1,y1,x2,y2
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
in PDF coordinate space.
columns : list, optional (default: None)
List of column x-coordinates strings where the coordinates
are comma-separated.
split_text : bool, optional (default: False)
Split text that spans across multiple cells.
flag_size : bool, optional (default: False)
Flag text based on font size. Useful to detect
super/subscripts. Adds around flagged text.
strip_text : str, optional (default: '')
Characters that should be stripped from a string before
assigning it to a cell.
edge_tol : int, optional (default: 50)
Tolerance parameter for extending textedges vertically.
row_tol : int, optional (default: 2)
Tolerance parameter used to combine text vertically,
to generate rows.
column_tol : int, optional (default: 0)
Tolerance parameter used to combine text horizontally,
to generate columns.
"""
def __init__(
self,
table_regions=None,
table_areas=None,
columns=None,
split_text=False,
flag_size=False,
strip_text="",
edge_tol=50,
row_tol=2,
column_tol=0,
**kwargs,
):
self.table_regions = table_regions
self.table_areas = table_areas
self.columns = columns
self._validate_columns()
self.split_text = split_text
self.flag_size = flag_size
self.strip_text = strip_text
self.edge_tol = edge_tol
self.row_tol = row_tol
self.column_tol = column_tol
@staticmethod
def _text_bbox(t_bbox):
"""Returns bounding box for the text present on a page.
Parameters
----------
t_bbox : dict
Dict with two keys 'horizontal' and 'vertical' with lists of
LTTextLineHorizontals and LTTextLineVerticals respectively.
Returns
-------
text_bbox : tuple
Tuple (x0, y0, x1, y1) in pdf coordinate space.
"""
xmin = min([t.x0 for direction in t_bbox for t in t_bbox[direction]])
ymin = min([t.y0 for direction in t_bbox for t in t_bbox[direction]])
xmax = max([t.x1 for direction in t_bbox for t in t_bbox[direction]])
ymax = max([t.y1 for direction in t_bbox for t in t_bbox[direction]])
text_bbox = (xmin, ymin, xmax, ymax)
return text_bbox
@staticmethod
def _group_rows(text, row_tol=2):
"""Groups PDFMiner text objects into rows vertically
within a tolerance.
Parameters
----------
text : list
List of PDFMiner text objects.
row_tol : int, optional (default: 2)
Returns
-------
rows : list
Two-dimensional list of text objects grouped into rows.
"""
row_y = 0
rows = []
temp = []
for t in text:
# is checking for upright necessary?
# if t.get_text().strip() and all([obj.upright for obj in t._objs if
# type(obj) is LTChar]):
if t.get_text().strip():
if not np.isclose(row_y, t.y0, atol=row_tol):
rows.append(sorted(temp, key=lambda t: t.x0))
temp = []
row_y = t.y0
temp.append(t)
rows.append(sorted(temp, key=lambda t: t.x0))
if len(rows) > 1:
__ = rows.pop(0) # TODO: hacky
return rows
@staticmethod
def _merge_columns(l, column_tol=0):
"""Merges column boundaries horizontally if they overlap
or lie within a tolerance.
Parameters
----------
l : list
List of column x-coordinate tuples.
column_tol : int, optional (default: 0)
Returns
-------
merged : list
List of merged column x-coordinate tuples.
"""
merged = []
for higher in l:
if not merged:
merged.append(higher)
else:
lower = merged[-1]
if column_tol >= 0:
if higher[0] <= lower[1] or np.isclose(
higher[0], lower[1], atol=column_tol
):
upper_bound = max(lower[1], higher[1])
lower_bound = min(lower[0], higher[0])
merged[-1] = (lower_bound, upper_bound)
else:
merged.append(higher)
elif column_tol < 0:
if higher[0] <= lower[1]:
if np.isclose(higher[0], lower[1], atol=abs(column_tol)):
merged.append(higher)
else:
upper_bound = max(lower[1], higher[1])
lower_bound = min(lower[0], higher[0])
merged[-1] = (lower_bound, upper_bound)
else:
merged.append(higher)
return merged
@staticmethod
def _join_rows(rows_grouped, text_y_max, text_y_min):
"""Makes row coordinates continuous.
Parameters
----------
rows_grouped : list
Two-dimensional list of text objects grouped into rows.
text_y_max : int
text_y_min : int
Returns
-------
rows : list
List of continuous row y-coordinate tuples.
"""
row_mids = [
sum([(t.y0 + t.y1) / 2 for t in r]) / len(r) if len(r) > 0 else 0
for r in rows_grouped
]
rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))]
rows.insert(0, text_y_max)
rows.append(text_y_min)
rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)]
return rows
@staticmethod
def _add_columns(cols, text, row_tol):
"""Adds columns to existing list by taking into account
the text that lies outside the current column x-coordinates.
Parameters
----------
cols : list
List of column x-coordinate tuples.
text : list
List of PDFMiner text objects.
ytol : int
Returns
-------
cols : list
Updated list of column x-coordinate tuples.
"""
if text:
text = Stream._group_rows(text, row_tol=row_tol)
elements = [len(r) for r in text]
new_cols = [
(t.x0, t.x1) for r in text if len(r) == max(elements) for t in r
]
cols.extend(Stream._merge_columns(sorted(new_cols)))
return cols
@staticmethod
def _join_columns(cols, text_x_min, text_x_max):
"""Makes column coordinates continuous.
Parameters
----------
cols : list
List of column x-coordinate tuples.
text_x_min : int
text_y_max : int
Returns
-------
cols : list
Updated list of column x-coordinate tuples.
"""
cols = sorted(cols)
cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))]
cols.insert(0, text_x_min)
cols.append(text_x_max)
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
return cols
def _validate_columns(self):
if self.table_areas is not None and self.columns is not None:
if len(self.table_areas) != len(self.columns):
raise ValueError("Length of table_areas and columns" " should be equal")
def _nurminen_table_detection(self, textlines):
    """A general implementation of the table detection algorithm
    described by Anssi Nurminen's master's thesis.
    Link: https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3

    Assumes that tables are situated relatively far apart
    vertically.

    ``textlines`` is sorted in place, and the relevant text edges are
    appended to ``self.textedges``. When no table area is found the
    whole page ``(0, 0, pdf_width, pdf_height)`` is returned as the
    only area.
    """
    # TODO: add support for arabic text #141
    # reading order: top to bottom, then left to right
    textlines.sort(key=lambda line: (-line.y0, line.x0))

    # generate left, middle and right textedges
    edges = TextEdges(edge_tol=self.edge_tol)
    edges.generate(textlines)

    # keep only the relevant edges and remember them on the parser
    relevant = edges.get_relevant()
    self.textedges.extend(relevant)

    # guess table areas using textlines and relevant edges
    areas = edges.get_table_areas(textlines, relevant)
    if len(areas) == 0:
        # treat whole page as table area if no table areas found
        areas = {(0, 0, self.pdf_width, self.pdf_height): None}
    return areas
def _generate_table_bbox(self):
self.textedges = []
if self.table_areas is None:
hor_text = self.horizontal_text
if self.table_regions is not None:
# filter horizontal text
hor_text = []
for region in self.table_regions:
x1, y1, x2, y2 = region.split(",")
x1 = float(x1)
y1 = float(y1)
x2 = float(x2)
y2 = float(y2)
region_text = text_in_bbox((x1, y2, x2, y1), self.horizontal_text)
hor_text.extend(region_text)
# find tables based on nurminen's detection algorithm
table_bbox = self._nurminen_table_detection(hor_text)
else:
table_bbox = {}
for area in self.table_areas:
x1, y1, x2, y2 = area.split(",")
x1 = float(x1)
y1 = float(y1)
x2 = float(x2)
y2 = float(y2)
table_bbox[(x1, y2, x2, y1)] = None
self.table_bbox = table_bbox
def _generate_columns_and_rows(self, table_idx, tk):
"""Computes column and row boundaries for the table area ``tk``.

The horizontal and vertical text lying inside ``tk`` is stored on
``self.t_bbox``. Rows come from grouping the horizontal text with
``self.row_tol``. Columns come either from the user-supplied
``self.columns[table_idx]`` string or, when that is missing or empty,
from a guess based on the most common number of text objects per row.

Parameters
----------
table_idx : int
Index of the table; selects the entry of ``self.columns`` and is
used (plus one) in the warning message.
tk : tuple
Table bounding box, passed to ``text_in_bbox``.

Returns
-------
cols : list
List of column x-coordinate tuples.
rows : list
List of row y-coordinate tuples.
"""
# select elements which lie within table_bbox
t_bbox = {}
t_bbox["horizontal"] = text_in_bbox(tk, self.horizontal_text)
t_bbox["vertical"] = text_in_bbox(tk, self.vertical_text)
# horizontal text: top to bottom, then left to right;
# vertical text: left to right, then top to bottom
t_bbox["horizontal"].sort(key=lambda x: (-x.y0, x.x0))
t_bbox["vertical"].sort(key=lambda x: (x.x0, -x.y0))
self.t_bbox = t_bbox
text_x_min, text_y_min, text_x_max, text_y_max = self._text_bbox(self.t_bbox)
rows_grouped = self._group_rows(self.t_bbox["horizontal"], row_tol=self.row_tol)
rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
# number of text objects found in each row
elements = [len(r) for r in rows_grouped]
if self.columns is not None and self.columns[table_idx] != "":
# user has to input boundary columns too
# take (0, pdf_width) by default
# similar to else condition
# len can't be 1
# NOTE(review): as written, the text extent (text_x_min, text_x_max) is
# added as the outer boundaries around the user-supplied separators
cols = self.columns[table_idx].split(",")
cols = [float(c) for c in cols]
cols.insert(0, text_x_min)
cols.append(text_x_max)
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
else:
# calculate mode of the list of number of elements in
# each row to guess the number of columns
if not len(elements):
# no rows at all: a single column spanning the text extent
cols = [(text_x_min, text_x_max)]
else:
ncols = max(set(elements), key=elements.count)
if ncols == 1:
# if mode is 1, the page usually contains not tables
# but there can be cases where the list can be skewed,
# try to remove all 1s from list in this case and
# see if the list contains elements, if yes, then use
# the mode after removing 1s
elements = list(filter(lambda x: x != 1, elements))
if len(elements):
ncols = max(set(elements), key=elements.count)
else:
# only a warning: processing continues with ncols == 1
warnings.warn(f"No tables found in table area {table_idx + 1}")
# x-extents of every text object in the rows that have exactly ncols objects
cols = [
(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r
]
cols = self._merge_columns(sorted(cols), column_tol=self.column_tol)
# text lying strictly inside a gap between two neighbouring columns
inner_text = []
for i in range(1, len(cols)):
left = cols[i - 1][1]
right = cols[i][0]
inner_text.extend(
[
t
for direction in self.t_bbox
for t in self.t_bbox[direction]
if t.x0 > left and t.x1 < right
]
)
# text lying entirely to the right of the last column or to the left of the first
outer_text = [
t
for direction in self.t_bbox
for t in self.t_bbox[direction]
if t.x0 > cols[-1][1] or t.x1 < cols[0][0]
]
inner_text.extend(outer_text)
# derive extra columns from that uncovered text, then make the columns contiguous
cols = self._add_columns(cols, inner_text, self.row_tol)
cols = self._join_columns(cols, text_x_min, text_x_max)
return cols, rows
def _generate_table(self, table_idx, cols, rows, **kwargs):
    """Builds a Table from column/row boundaries and fills its cells.

    Every text object in ``self.t_bbox`` is located with
    ``get_table_index`` and written to the matching cell; the
    per-object assignment errors feed the accuracy score.

    Parameters
    ----------
    table_idx : int
        Zero-based index of the table; stored one-based as
        ``table.order``.
    cols : list
        List of column x-coordinate tuples.
    rows : list
        List of row y-coordinate tuples.
    **kwargs
        Accepted but not used by this method.

    Returns
    -------
    table : Table
        Table with text, dataframe, accuracy/whitespace metrics and
        plotting attributes set.

    """
    table = Table(cols, rows)
    table = table.set_all_edges()

    pos_errors = []
    # TODO: have a single list in place of two directional ones?
    # sorted on x-coordinate based on reading order i.e. LTR or RTL
    for direction in ["vertical", "horizontal"]:
        for t in self.t_bbox[direction]:
            indices, error = get_table_index(
                table,
                t,
                direction,
                split_text=self.split_text,
                flag_size=self.flag_size,
                strip_text=self.strip_text,
            )
            # ``indices`` is a list of (r_idx, c_idx, text) tuples, so the
            # "not placed" marker must be read from its first entry. The
            # previous ``indices[:2] != (-1, -1)`` compared a list with a
            # tuple, which is always unequal, so unplaced text was written
            # to the last cell (index -1, -1) and counted in the errors.
            if indices and indices[0][:2] == (-1, -1):
                continue
            pos_errors.append(error)
            for r_idx, c_idx, text in indices:
                table.cells[r_idx][c_idx].text = text
    accuracy = compute_accuracy([[100, pos_errors]])

    data = table.data
    table.df = pd.DataFrame(data)
    table.shape = table.df.shape

    whitespace = compute_whitespace(data)
    table.flavor = "stream"
    table.accuracy = accuracy
    table.whitespace = whitespace
    table.order = table_idx + 1
    # rootname's basename is expected to look like "page-<n>"
    table.page = int(os.path.basename(self.rootname).replace("page-", ""))

    # for plotting
    _text = []
    _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
    _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
    table._text = _text
    table._image = None
    table._segments = None
    table._textedges = self.textedges

    return table
def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}):
    """Extracts all tables from one page.

    Parameters
    ----------
    filename : str
        Passed to ``self._generate_layout``.
    suppress_stdout : bool, optional (default: False)
        When True, the "Processing ..." log line is not emitted.
    layout_kwargs : dict, optional (default: {})
        Passed to ``self._generate_layout``.

    Returns
    -------
    tables : list
        One table per detected area, ordered by the second coordinate
        of the area's bounding box, largest first. Empty (after a
        warning) when the page has no horizontal text.

    """
    self._generate_layout(filename, layout_kwargs)
    page_name = os.path.basename(self.rootname)

    if not suppress_stdout:
        logger.info(f"Processing {page_name}")

    if not self.horizontal_text:
        if self.images:
            warnings.warn(
                f"{page_name} is image-based, camelot only works on text-based pages."
            )
        else:
            warnings.warn(f"No tables found on {page_name}")
        return []

    self._generate_table_bbox()

    # sort tables based on y-coord
    ordered_bboxes = sorted(
        self.table_bbox.keys(), key=lambda bbox: bbox[1], reverse=True
    )

    tables = []
    for table_idx, tk in enumerate(ordered_bboxes):
        cols, rows = self._generate_columns_and_rows(table_idx, tk)
        table = self._generate_table(table_idx, cols, rows)
        table._bbox = tk
        tables.append(table)
    return tables
camelot-py-0.11.0/camelot/plotting.py 0000664 0000000 0000000 00000014451 14376573757 0017536 0 ustar 00root root 0000000 0000000 # -*- coding: utf-8 -*-
# matplotlib is optional: record whether it could be imported so that
# PlotMethods.__call__ can raise a clear ImportError when it is missing.
try:
import matplotlib.pyplot as plt
import matplotlib.patches as patches
except ImportError:
_HAS_MPL = False
else:
_HAS_MPL = True
class PlotMethods(object):
    """Callable that draws debugging plots for a parsed table."""

    def __call__(self, table, kind="text", filename=None):
        """Plot elements found on PDF page based on kind
        specified, useful for debugging and playing with different
        parameters to get the best output.

        Parameters
        ----------
        table : camelot.core.Table
            A Camelot Table.
        kind : str, optional (default: 'text')
            {'text', 'grid', 'contour', 'joint', 'line', 'textedge'}
            The element type for which a plot should be generated; it is
            looked up as a method name on this object.
        filename : str, optional (default: None)
            Path for saving the generated plot.

        Returns
        -------
        fig : matplotlib.figure.Figure
            The figure, or None when ``filename`` is given (the figure
            is saved instead).

        Raises
        ------
        ImportError
            If matplotlib could not be imported.
        NotImplementedError
            If ``kind`` is not supported for the table's flavor.

        """
        if not _HAS_MPL:
            raise ImportError("matplotlib is required for plotting.")

        flavor = table.flavor
        if flavor == "lattice" and kind == "textedge":
            raise NotImplementedError(f"Lattice flavor does not support kind='{kind}'")
        elif flavor == "stream" and kind in ("joint", "line"):
            raise NotImplementedError(f"Stream flavor does not support kind='{kind}'")

        figure = getattr(self, kind)(table)
        if filename is None:
            return figure
        figure.savefig(filename)
        return None

    def text(self, table):
        """Generates a plot for all text elements present
        on the PDF page.

        Parameters
        ----------
        table : camelot.core.Table

        Returns
        -------
        fig : matplotlib.figure.Figure

        """
        fig = plt.figure()
        ax = fig.add_subplot(111, aspect="equal")
        xs, ys = [], []
        for box in table._text:
            xs += [box[0], box[2]]
            ys += [box[1], box[3]]
            rect = patches.Rectangle(
                (box[0], box[1]), box[2] - box[0], box[3] - box[1]
            )
            ax.add_patch(rect)
        # pad the view by 10 units around the text extent
        ax.set_xlim(min(xs) - 10, max(xs) + 10)
        ax.set_ylim(min(ys) - 10, max(ys) + 10)
        return fig

    def grid(self, table):
        """Generates a plot for the detected table grids
        on the PDF page.

        Parameters
        ----------
        table : camelot.core.Table

        Returns
        -------
        fig : matplotlib.figure.Figure

        """
        fig = plt.figure()
        ax = fig.add_subplot(111, aspect="equal")
        for row in table.cells:
            for cell in row:
                # (edge flag, start corner, end corner), drawn in the
                # order left, right, top, bottom
                edges = (
                    (cell.left, cell.lb, cell.lt),
                    (cell.right, cell.rb, cell.rt),
                    (cell.top, cell.lt, cell.rt),
                    (cell.bottom, cell.lb, cell.rb),
                )
                for present, start, end in edges:
                    if present:
                        ax.plot([start[0], end[0]], [start[1], end[1]])
        return fig

    def contour(self, table):
        """Generates a plot for all table boundaries present
        on the PDF page.

        Parameters
        ----------
        table : camelot.core.Table

        Returns
        -------
        fig : matplotlib.figure.Figure

        """
        # table._image unpacks to (image, table_bbox) when it is set; when it
        # cannot be unpacked (TypeError, e.g. None) fall back to the table's
        # own bounding box and draw text boxes instead of the image.
        try:
            img, table_bbox = table._image
            for_lattice = True
        except TypeError:
            img, table_bbox = (None, {table._bbox: None})
            for_lattice = False

        fig = plt.figure()
        ax = fig.add_subplot(111, aspect="equal")
        xs, ys = [], []

        if not for_lattice:
            for box in table._text:
                xs += [box[0], box[2]]
                ys += [box[1], box[3]]
                rect = patches.Rectangle(
                    (box[0], box[1]), box[2] - box[0], box[3] - box[1], color="blue"
                )
                ax.add_patch(rect)

        for bbox in table_bbox:
            outline = patches.Rectangle(
                (bbox[0], bbox[1]),
                bbox[2] - bbox[0],
                bbox[3] - bbox[1],
                fill=False,
                color="red",
            )
            ax.add_patch(outline)
            if not for_lattice:
                xs += [bbox[0], bbox[2]]
                ys += [bbox[1], bbox[3]]
                ax.set_xlim(min(xs) - 10, max(xs) + 10)
                ax.set_ylim(min(ys) - 10, max(ys) + 10)

        if for_lattice:
            ax.imshow(img)
        return fig

    def textedge(self, table):
        """Generates a plot for relevant textedges.

        Parameters
        ----------
        table : camelot.core.Table

        Returns
        -------
        fig : matplotlib.figure.Figure

        """
        fig = plt.figure()
        ax = fig.add_subplot(111, aspect="equal")
        xs, ys = [], []
        for box in table._text:
            xs += [box[0], box[2]]
            ys += [box[1], box[3]]
            rect = patches.Rectangle(
                (box[0], box[1]), box[2] - box[0], box[3] - box[1], color="blue"
            )
            ax.add_patch(rect)
        ax.set_xlim(min(xs) - 10, max(xs) + 10)
        ax.set_ylim(min(ys) - 10, max(ys) + 10)

        # every text edge is drawn as a vertical line at x from y0 to y1
        for edge in table._textedges:
            ax.plot([edge.x, edge.x], [edge.y0, edge.y1])
        return fig

    def joint(self, table):
        """Generates a plot for all line intersections present
        on the PDF page.

        Parameters
        ----------
        table : camelot.core.Table

        Returns
        -------
        fig : matplotlib.figure.Figure

        """
        img, table_bbox = table._image
        fig = plt.figure()
        ax = fig.add_subplot(111, aspect="equal")
        x_coord, y_coord = [], []
        for joints in table_bbox.values():
            for coord in joints:
                x_coord.append(coord[0])
                y_coord.append(coord[1])
        ax.plot(x_coord, y_coord, "ro")
        ax.imshow(img)
        return fig

    def line(self, table):
        """Generates a plot for all line segments present
        on the PDF page.

        Parameters
        ----------
        table : camelot.core.Table

        Returns
        -------
        fig : matplotlib.figure.Figure

        """
        fig = plt.figure()
        ax = fig.add_subplot(111, aspect="equal")
        vertical, horizontal = table._segments
        # vertical segments first, then horizontal ones
        for segments in (vertical, horizontal):
            for seg in segments:
                ax.plot([seg[0], seg[2]], [seg[1], seg[3]])
        return fig
camelot-py-0.11.0/camelot/utils.py 0000664 0000000 0000000 00000064631 14376573757 0017043 0 ustar 00root root 0000000 0000000 # -*- coding: utf-8 -*-
import os
import re
import random
import shutil
import string
import tempfile
import warnings
from itertools import groupby
from operator import itemgetter
import numpy as np
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import (
LAParams,
LTAnno,
LTChar,
LTTextLineHorizontal,
LTTextLineVertical,
LTImage,
)
from urllib.request import Request, urlopen
from urllib.parse import urlparse as parse_url
from urllib.parse import uses_relative, uses_netloc, uses_params
_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
_VALID_URLS.discard("")
# https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py
def is_url(url):
"""Check to see if a URL has a valid protocol.
Parameters
----------
url : str or unicode
Returns
-------
isurl : bool
If url has a valid protocol return True otherwise False.
"""
try:
return parse_url(url).scheme in _VALID_URLS
except Exception:
return False
def random_string(length):
    """Returns a random string of ASCII digits and letters.

    Parameters
    ----------
    length : int
        Number of characters. Zero or a negative value yields an empty
        string (the previous countdown loop never terminated for
        negative or fractional values).

    Returns
    -------
    ret : str

    """
    alphabet = string.digits + string.ascii_lowercase + string.ascii_uppercase
    return "".join(random.choice(alphabet) for _ in range(length))
def download_url(url):
    """Download file from specified URL.

    Parameters
    ----------
    url : str or unicode

    Returns
    -------
    filepath : str or unicode
        Temporary filepath.

    Raises
    ------
    NotImplementedError
        If the response content type is not ``application/pdf``.

    """
    headers = {"User-Agent": "Mozilla/5.0"}
    request = Request(url, None, headers)

    # Close the response deterministically, and validate the content type
    # before any temporary file exists so a rejected download leaves
    # nothing behind on disk.
    with urlopen(request) as response:
        content_type = response.info().get_content_type()
        if content_type != "application/pdf":
            raise NotImplementedError("File format not supported")
        payload = response.read()

    filename = f"{random_string(6)}.pdf"
    with tempfile.NamedTemporaryFile("wb", delete=False) as f:
        f.write(payload)
    # give the file a ".pdf" name inside the same temporary directory
    filepath = os.path.join(os.path.dirname(f.name), filename)
    shutil.move(f.name, filepath)
    return filepath
# keyword arguments that only make sense for flavor='stream'
stream_kwargs = ["columns", "edge_tol", "row_tol", "column_tol"]
# keyword arguments that only make sense for flavor='lattice'
lattice_kwargs = [
    "process_background",
    "line_scale",
    "copy_text",
    "shift_text",
    "line_tol",
    "joint_tol",
    "threshold_blocksize",
    "threshold_constant",
    "iterations",
    "resolution",
]


def validate_input(kwargs, flavor="lattice"):
    """Rejects keyword arguments that belong to the other flavor.

    Parameters
    ----------
    kwargs : dict
        Keyword arguments supplied by the caller.
    flavor : str, optional (default: 'lattice')
        'lattice' checks against the stream-only names; any other value
        checks against the lattice-only names.

    Raises
    ------
    ValueError
        If ``kwargs`` contains a name reserved for the other flavor.

    """

    def check_intersection(parser_kwargs, input_kwargs):
        isec = set(parser_kwargs).intersection(set(input_kwargs.keys()))
        if isec:
            raise ValueError(
                f"{','.join(sorted(isec))} cannot be used with flavor='{flavor}'"
            )

    if flavor == "lattice":
        check_intersection(stream_kwargs, kwargs)
    else:
        check_intersection(lattice_kwargs, kwargs)


def remove_extra(kwargs, flavor="lattice"):
    """Removes, in place, keyword arguments that belong to the other flavor.

    Parameters
    ----------
    kwargs : dict
        Keyword arguments supplied by the caller; modified in place.
    flavor : str, optional (default: 'lattice')
        'lattice' drops the stream-only names; any other value drops the
        lattice-only names.

    Returns
    -------
    kwargs : dict
        The same dict object, without the unsupported names.

    """
    unsupported = stream_kwargs if flavor == "lattice" else lattice_kwargs
    # Iterate over a snapshot of the keys: popping while iterating the live
    # key view raises "dictionary changed size during iteration".
    for key in list(kwargs):
        if key in unsupported:
            kwargs.pop(key)
    return kwargs
# https://stackoverflow.com/a/22726782
class TemporaryDirectory(object):
    """Context manager yielding the path of a fresh temporary directory.

    The directory and everything in it is removed on exit.
    """

    def __enter__(self):
        path = tempfile.mkdtemp()
        self.name = path
        return path

    def __exit__(self, exc_type, exc_value, traceback):
        shutil.rmtree(self.name)
def translate(x1, x2):
    """Translates x2 by x1.

    Parameters
    ----------
    x1 : float
    x2 : float

    Returns
    -------
    x2 : float
        ``x2 + x1`` as a new value; the argument itself is not modified.

    """
    # A plain sum instead of ``x2 += x1``: the augmented form would modify
    # a mutable argument (e.g. a numpy array) owned by the caller.
    return x2 + x1
def scale(x, s):
    """Scales x by scaling factor s.

    Parameters
    ----------
    x : float
    s : float

    Returns
    -------
    x : float
        ``x * s`` as a new value; the argument itself is not modified.

    """
    # A plain product instead of ``x *= s``: the augmented form would modify
    # a mutable argument owned by the caller, and fails for an integer
    # numpy array scaled by a float.
    return x * s
def scale_pdf(k, factors):
    """Translates and scales pdf coordinate space to image
    coordinate space.

    Parameters
    ----------
    k : tuple
        Tuple (x1, y1, x2, y2) representing table bounding box where
        (x1, y1) -> lt and (x2, y2) -> rb in PDFMiner coordinate
        space.
    factors : tuple
        Tuple (scaling_factor_x, scaling_factor_y, pdf_y) where the
        first two elements are scaling factors and pdf_y is height of
        pdf.

    Returns
    -------
    knew : tuple
        Tuple (x1, y1, x2, y2) representing table bounding box where
        (x1, y1) -> lt and (x2, y2) -> rb in OpenCV coordinate
        space. All four values are truncated to int.

    """
    left, top, right, bottom = k
    factor_x, factor_y, page_height = factors

    def flip_and_scale(y):
        # distance from the top of the page, scaled to image units
        return abs(y - page_height) * factor_y

    return (
        int(left * factor_x),
        int(flip_and_scale(top)),
        int(right * factor_x),
        int(flip_and_scale(bottom)),
    )
def scale_image(tables, v_segments, h_segments, factors):
    """Translates and scales image coordinate space to pdf
    coordinate space.

    Parameters
    ----------
    tables : dict
        Dict with table boundaries as keys and list of intersections
        in that boundary as value.
    v_segments : list
        List of vertical line segments.
    h_segments : list
        List of horizontal line segments.
    factors : tuple
        Tuple (scaling_factor_x, scaling_factor_y, img_y) where the
        first two elements are scaling factors and img_y is height of
        image.

    Returns
    -------
    tables_new : dict
        Scaled table boundaries mapped to a list of scaled (x, y)
        intersections.
    v_segments_new : list
        Scaled vertical segments as (x1, y1, x2, y2) tuples.
    h_segments_new : list
        Scaled horizontal segments as (x1, y1, x2, y2) tuples.

    """
    scaling_factor_x, scaling_factor_y, img_y = factors

    def to_pdf_x(x):
        return x * scaling_factor_x

    def to_pdf_y(y):
        # flip the axis around the image height, then scale
        return abs(y - img_y) * scaling_factor_y

    def convert_segments(segments):
        return [
            (to_pdf_x(s[0]), to_pdf_y(s[1]), to_pdf_x(s[2]), to_pdf_y(s[3]))
            for s in segments
        ]

    tables_new = {}
    for bbox, joints in tables.items():
        x1, y1, x2, y2 = bbox
        key = (to_pdf_x(x1), to_pdf_y(y1), to_pdf_x(x2), to_pdf_y(y2))
        # A real list rather than a one-shot ``zip`` iterator: the joints
        # can be iterated more than once, and an empty joint list no longer
        # fails while unpacking.
        tables_new[key] = [(to_pdf_x(j_x), to_pdf_y(j_y)) for j_x, j_y in joints]

    v_segments_new = convert_segments(v_segments)
    h_segments_new = convert_segments(h_segments)

    return tables_new, v_segments_new, h_segments_new
def get_rotation(chars, horizontal_text, vertical_text):
    """Detects if text in table is rotated or not using the current
    transformation matrix (CTM) and returns its orientation.

    The text is treated as rotated only when there are fewer non-blank
    horizontal text lines than non-blank vertical ones; the direction is
    then decided by counting characters whose ``matrix[1]`` and
    ``matrix[2]`` have opposite signs.

    Parameters
    ----------
    chars : list
        List of PDFMiner LTChar objects.
    horizontal_text : list
        List of PDFMiner LTTextLineHorizontal objects.
    vertical_text : list
        List of PDFMiner LTTextLineVertical objects.

    Returns
    -------
    rotation : string
        '' if text in table is upright, 'anticlockwise' if
        rotated 90 degree anticlockwise and 'clockwise' if
        rotated 90 degree clockwise.

    """
    hlen = len([line for line in horizontal_text if line.get_text().strip()])
    vlen = len([line for line in vertical_text if line.get_text().strip()])
    if hlen >= vlen:
        return ""

    clockwise = sum(1 for ch in chars if ch.matrix[1] < 0 and ch.matrix[2] > 0)
    anticlockwise = sum(1 for ch in chars if ch.matrix[1] > 0 and ch.matrix[2] < 0)
    if clockwise < anticlockwise:
        return "anticlockwise"
    return "clockwise"
def segments_in_bbox(bbox, v_segments, h_segments):
    """Returns all line segments present inside a bounding box.

    A tolerance of 2 units is applied on every side of the box.

    Parameters
    ----------
    bbox : tuple
        Tuple (x1, y1, x2, y2) representing a bounding box where
        (x1, y1) -> lb and (x2, y2) -> rt in PDFMiner coordinate
        space.
    v_segments : list
        List of vertical line segments.
    h_segments : list
        List of horizontal line segments.

    Returns
    -------
    v_s : list
        List of vertical line segments that lie inside table.
    h_s : list
        List of horizontal line segments that lie inside table.

    """
    x_min, y_min = bbox[0], bbox[1]
    x_max, y_max = bbox[2], bbox[3]

    v_s = []
    for seg in v_segments:
        within_height = seg[1] > y_min - 2 and seg[3] < y_max + 2
        if within_height and x_min - 2 <= seg[0] <= x_max + 2:
            v_s.append(seg)

    h_s = []
    for seg in h_segments:
        within_width = seg[0] > x_min - 2 and seg[2] < x_max + 2
        if within_width and y_min - 2 <= seg[1] <= y_max + 2:
            h_s.append(seg)

    return v_s, h_s
def text_in_bbox(bbox, text):
    """Returns all text objects present inside a bounding box.

    A text object is selected when the midpoint of its box lies inside
    ``bbox`` (with a tolerance of 2 units). Among selected objects, one
    is dropped when another object covers more than 80% of its area and
    is at least as wide.

    Parameters
    ----------
    bbox : tuple
        Tuple (x1, y1, x2, y2) representing a bounding box where
        (x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate
        space.
    text : List of PDFMiner text objects.

    Returns
    -------
    t_bbox : list
        List of PDFMiner text objects that lie inside table, discarding
        the overlapping ones. The objects keep the order they had in
        ``text``.

    """
    lb = (bbox[0], bbox[1])
    rt = (bbox[2], bbox[3])
    t_bbox = [
        t
        for t in text
        if lb[0] - 2 <= (t.x0 + t.x1) / 2.0 <= rt[0] + 2
        and lb[1] - 2 <= (t.y0 + t.y1) / 2.0 <= rt[1] + 2
    ]

    # Avoid duplicate text by discarding overlapping boxes. A dict is used as
    # an insertion-ordered set so the result order is deterministic instead
    # of depending on set iteration order.
    rest = dict.fromkeys(t_bbox)
    for ba in t_bbox:
        area = bbox_area(ba)
        if area <= 0:
            # an overlap ratio is undefined for a zero-area box; keep it
            continue
        for bb in list(rest):
            if ba == bb:
                continue
            if not bbox_intersect(ba, bb):
                continue
            # if the intersection is larger than 80% of ba's size, we keep the longest
            if (bbox_intersection_area(ba, bb) / area) > 0.8 and bbox_longer(bb, ba):
                rest.pop(ba, None)
                break
    return list(rest)
def bbox_intersection_area(ba, bb) -> float:
    """Returns area of the intersection of the bounding boxes of two PDFMiner objects.

    Parameters
    ----------
    ba : PDFMiner text object
    bb : PDFMiner text object

    Returns
    -------
    intersection_area : float
        Area of the intersection of the bounding boxes of both objects,
        0.0 when the boxes do not overlap.

    """
    overlap_width = min(ba.x1, bb.x1) - max(ba.x0, bb.x0)
    overlap_height = min(ba.y1, bb.y1) - max(ba.y0, bb.y0)
    if overlap_width < 0 or overlap_height < 0:
        return 0.0
    return overlap_width * overlap_height
def bbox_area(bb) -> float:
    """Returns area of the bounding box of a PDFMiner object.

    Parameters
    ----------
    bb : PDFMiner text object

    Returns
    -------
    area : float
        Area of the bounding box of the object

    """
    width = bb.x1 - bb.x0
    height = bb.y1 - bb.y0
    return width * height
def bbox_intersect(ba, bb) -> bool:
    """Returns True if the bounding boxes of two PDFMiner objects intersect.

    Boxes that merely touch along an edge or corner count as intersecting.

    Parameters
    ----------
    ba : PDFMiner text object
    bb : PDFMiner text object

    Returns
    -------
    overlaps : bool
        True if the bounding boxes intersect

    """
    overlaps_x = ba.x1 >= bb.x0 and bb.x1 >= ba.x0
    overlaps_y = ba.y1 >= bb.y0 and bb.y1 >= ba.y0
    return overlaps_x and overlaps_y
def bbox_longer(ba, bb) -> bool:
    """Returns True if the bounding box of the first PDFMiner object is longer or equal to the second.

    Length is measured along the x-axis (``x1 - x0``).

    Parameters
    ----------
    ba : PDFMiner text object
    bb : PDFMiner text object

    Returns
    -------
    longer : bool
        True if the bounding box of the first object is longer or equal

    """
    first_width = ba.x1 - ba.x0
    second_width = bb.x1 - bb.x0
    return first_width >= second_width
def merge_close_lines(ar, line_tol=2):
    """Merges lines which are within a tolerance by calculating a
    moving mean, based on their x or y axis projections.

    Each value is compared with the last merged value only; when the two
    are close (``np.isclose`` with ``atol=line_tol``) the last merged
    value is replaced by their average.

    Parameters
    ----------
    ar : list
    line_tol : int, optional (default: 2)

    Returns
    -------
    ret : list

    """
    merged = []
    for value in ar:
        if merged and np.isclose(merged[-1], value, atol=line_tol):
            merged[-1] = (merged[-1] + value) / 2.0
        else:
            merged.append(value)
    return merged
def text_strip(text, strip=""):
    """Strips any characters in `strip` that are present in `text`.

    The characters are removed wherever they occur, not only at the
    ends of the string.

    Parameters
    ----------
    text : str
        Text to process and strip.
    strip : str, optional (default: '')
        Characters that should be stripped from `text`.

    Returns
    -------
    stripped : str

    """
    if not strip:
        return text

    # character class built from the individually escaped characters
    escaped = "".join(re.escape(ch) for ch in strip)
    pattern = "[" + escaped + "]"
    return re.sub(pattern, "", text, flags=re.UNICODE)
# TODO: combine the following functions into a TextProcessor class which
# applies corresponding transformations sequentially
# (inspired from sklearn.pipeline.Pipeline)
def flag_font_size(textline, direction, strip_text=""):
    """Flags super/subscripts in text by enclosing them with <s></s>.
    May give false positives.

    Consecutive characters are grouped by size (height for horizontal
    text, width for vertical text, rounded to 6 decimals). When more
    than one size occurs, every non-blank group having the smallest
    size is wrapped in the markers and blank groups are dropped.

    Parameters
    ----------
    textline : list
        List of PDFMiner LTChar objects.
    direction : string
        Direction of the PDFMiner LTTextLine object, 'horizontal' or
        'vertical'.
    strip_text : str, optional (default: '')
        Characters that should be stripped from a string before
        assigning it to a cell.

    Returns
    -------
    fstring : string

    Raises
    ------
    ValueError
        If ``direction`` is neither 'horizontal' nor 'vertical'
        (previously this surfaced as an unbound local variable).

    """
    if direction == "horizontal":
        size_attr = "height"
    elif direction == "vertical":
        size_attr = "width"
    else:
        raise ValueError(f"Unknown text direction: {direction!r}")

    # (character, rounded size) pairs; LTAnno objects are skipped here
    d = [
        (t.get_text(), np.round(getattr(t, size_attr), decimals=6))
        for t in textline
        if not isinstance(t, LTAnno)
    ]
    sizes = [size for _, size in d]

    if len(set(sizes)) > 1:
        flist = []
        min_size = min(sizes)
        for key, chars in groupby(d, itemgetter(1)):
            run = "".join(ch for ch, _ in chars)
            if not run.strip():
                # whitespace-only runs are dropped
                continue
            if key == min_size:
                # The markers had been reduced to empty strings, which made
                # flagging a no-op; restore the visible delimiters.
                run = "<s>" + run + "</s>"
            flist.append(run)
        fstring = "".join(flist)
    else:
        fstring = "".join([t.get_text() for t in textline])
    return text_strip(fstring, strip_text)
def split_textline(table, textline, direction, flag_size=False, strip_text=""):
"""Splits PDFMiner LTTextLine into substrings if it spans across
multiple rows/columns.
Parameters
----------
table : camelot.core.Table
textline : object
PDFMiner LTTextLine object.
direction : string
Direction of the PDFMiner LTTextLine object.
flag_size : bool, optional (default: False)
Whether or not to flag a substring whose size is different from
the rest of the string; delegated to flag_font_size. (Useful for
super and subscripts.)
strip_text : str, optional (default: '')
Characters that should be stripped from a string before
assigning it to a cell.
Returns
-------
grouped_chars : list
List of tuples of the form (r_idx, c_idx, text) where r_idx and
c_idx are the row and column indices and text is the textline
substring assigned to that cell. [(-1, -1, text)] is returned
when an IndexError occurs while locating the cells.
"""
# NOTE(review): idx is assigned but never read in this function
idx = 0
# collected (row index, column index, character object) triples
cut_text = []
bbox = textline.bbox
try:
if direction == "horizontal" and not textline.is_empty():
# columns whose x-range overlaps the textline's bounding box
x_overlap = [
i
for i, x in enumerate(table.cols)
if x[0] <= bbox[2] and bbox[0] <= x[1]
]
# rows that contain the textline's vertical midpoint
r_idx = [
j
for j, r in enumerate(table.rows)
if r[1] <= (bbox[1] + bbox[3]) / 2 <= r[0]
]
# raises IndexError (handled below) when no row matches
r = r_idx[0]
# (column index, cell x2) for overlapping cells that have a right edge
x_cuts = [
(c, table.cells[r][c].x2) for c in x_overlap if table.cells[r][c].right
]
if not x_cuts:
# no right edge found: single cut at the last cell's x2
x_cuts = [(x_overlap[0], table.cells[r][-1].x2)]
for obj in textline._objs:
row = table.rows[r]
for cut in x_cuts:
if isinstance(obj, LTChar):
# first cut whose x lies at or beyond the character's midpoint
if (
row[1] <= (obj.y0 + obj.y1) / 2 <= row[0]
and (obj.x0 + obj.x1) / 2 <= cut[1]
):
cut_text.append((r, cut[0], obj))
break
else:
# TODO: add test
# no cut matched: place the character one column after the last cut
if cut == x_cuts[-1]:
cut_text.append((r, cut[0] + 1, obj))
elif isinstance(obj, LTAnno):
# NOTE(review): there is no break here, so an LTAnno is appended once per cut
cut_text.append((r, cut[0], obj))
elif direction == "vertical" and not textline.is_empty():
# rows whose y-range overlaps the textline's bounding box
y_overlap = [
j
for j, y in enumerate(table.rows)
if y[1] <= bbox[3] and bbox[1] <= y[0]
]
# columns that contain the textline's horizontal midpoint
c_idx = [
i
for i, c in enumerate(table.cols)
if c[0] <= (bbox[0] + bbox[2]) / 2 <= c[1]
]
# raises IndexError (handled below) when no column matches
c = c_idx[0]
# (row index, cell y1) for overlapping cells that have a bottom edge
y_cuts = [
(r, table.cells[r][c].y1) for r in y_overlap if table.cells[r][c].bottom
]
if not y_cuts:
# no bottom edge found: single cut at the last row's y1
y_cuts = [(y_overlap[0], table.cells[-1][c].y1)]
for obj in textline._objs:
col = table.cols[c]
for cut in y_cuts:
if isinstance(obj, LTChar):
# first cut whose y lies at or below the character's midpoint
if (
col[0] <= (obj.x0 + obj.x1) / 2 <= col[1]
and (obj.y0 + obj.y1) / 2 >= cut[1]
):
cut_text.append((cut[0], c, obj))
break
else:
# TODO: add test
# no cut matched: place the character one row before the last cut
if cut == y_cuts[-1]:
cut_text.append((cut[0] - 1, c, obj))
elif isinstance(obj, LTAnno):
# NOTE(review): there is no break here, so an LTAnno is appended once per cut
cut_text.append((cut[0], c, obj))
except IndexError:
# the cell could not be located: return the whole text with the (-1, -1) marker
return [(-1, -1, textline.get_text())]
grouped_chars = []
# consecutive characters that landed in the same (row, column) form one substring
for key, chars in groupby(cut_text, itemgetter(0, 1)):
if flag_size:
grouped_chars.append(
(
key[0],
key[1],
flag_font_size(
[t[2] for t in chars], direction, strip_text=strip_text
),
)
)
else:
gchars = [t[2].get_text() for t in chars]
grouped_chars.append(
(key[0], key[1], text_strip("".join(gchars), strip_text))
)
return grouped_chars
def get_table_index(
table, t, direction, split_text=False, flag_size=False, strip_text=""
):
"""Gets indices of the table cell where given text object lies by
comparing their y and x-coordinates.
Parameters
----------
table : camelot.core.Table
t : object
PDFMiner LTTextLine object.
direction : string
Direction of the PDFMiner LTTextLine object.
split_text : bool, optional (default: False)
Whether or not to split a text line if it spans across
multiple cells.
flag_size : bool, optional (default: False)
Whether or not to flag a substring whose size is different from
the rest of the string; delegated to flag_font_size. (Useful for
super and subscripts)
strip_text : str, optional (default: '')
Characters that should be stripped from a string before
assigning it to a cell.
Returns
-------
indices : list
List of tuples of the form (r_idx, c_idx, text) where r_idx
and c_idx are row and column indices. Both are -1 when no row
contains the text's vertical midpoint.
error : float
Assignment error, percentage of text area that lies outside
a cell.
+-------+
| |
| [Text bounding box]
| |
+-------+
"""
# -1 marks "not found"; both stay -1 when no row matches below
r_idx, c_idx = [-1] * 2
for r in range(len(table.rows)):
# the text's vertical midpoint must lie strictly inside the row
if (t.y0 + t.y1) / 2.0 < table.rows[r][0] and (t.y0 + t.y1) / 2.0 > table.rows[
r
][1]:
# per column: overlap width divided by column width, or -1 when there is no overlap
lt_col_overlap = []
for c in table.cols:
if c[0] <= t.x1 and c[1] >= t.x0:
left = t.x0 if c[0] <= t.x0 else c[0]
right = t.x1 if c[1] >= t.x1 else c[1]
lt_col_overlap.append(abs(left - right) / abs(c[0] - c[1]))
else:
lt_col_overlap.append(-1)
if len(list(filter(lambda x: x != -1, lt_col_overlap))) == 0:
# the text overlaps no column at all: warn, then fall through
text = t.get_text().strip("\n")
text_range = (t.x0, t.x1)
col_range = (table.cols[0][0], table.cols[-1][1])
warnings.warn(
f"{text} {text_range} does not lie in column range {col_range}"
)
r_idx = r
# column with the largest overlap ratio; the first column when all entries are -1
c_idx = lt_col_overlap.index(max(lt_col_overlap))
break
# error calculation
# NOTE: when r_idx / c_idx are still -1 the lookups below use the last row / column
y0_offset, y1_offset, x0_offset, x1_offset = [0] * 4
if t.y0 > table.rows[r_idx][0]:
y0_offset = abs(t.y0 - table.rows[r_idx][0])
if t.y1 < table.rows[r_idx][1]:
y1_offset = abs(t.y1 - table.rows[r_idx][1])
if t.x0 < table.cols[c_idx][0]:
x0_offset = abs(t.x0 - table.cols[c_idx][0])
if t.x1 > table.cols[c_idx][1]:
x1_offset = abs(t.x1 - table.cols[c_idx][1])
# a zero width or height is replaced by 1.0 so the division below is defined
X = 1.0 if abs(t.x0 - t.x1) == 0.0 else abs(t.x0 - t.x1)
Y = 1.0 if abs(t.y0 - t.y1) == 0.0 else abs(t.y0 - t.y1)
charea = X * Y
error = ((X * (y0_offset + y1_offset)) + (Y * (x0_offset + x1_offset))) / charea
if split_text:
# the indices come from split_textline; r_idx / c_idx above only feed the error
return (
split_textline(
table, t, direction, flag_size=flag_size, strip_text=strip_text
),
error,
)
else:
if flag_size:
return (
[
(
r_idx,
c_idx,
flag_font_size(t._objs, direction, strip_text=strip_text),
)
],
error,
)
else:
return [(r_idx, c_idx, text_strip(t.get_text(), strip_text))], error
def compute_accuracy(error_weights):
    """Calculates a score based on weights assigned to various
    parameters and their error percentages.

    Every weight is split evenly across its list of errors and each
    error contributes ``share * (1 - error)`` to the score.

    Parameters
    ----------
    error_weights : list
        Two-dimensional list of the form [[p1, e1], [p2, e2], ...]
        where pn is the weight assigned to list of errors en.
        Sum of pn should be equal to 100.

    Returns
    -------
    score : float
        0 when any list of errors is empty.

    Raises
    ------
    ValueError
        If the weights do not add up to 100.

    """
    required_total = 100
    if sum(entry[0] for entry in error_weights) != required_total:
        raise ValueError("Sum of weights should be equal to 100.")

    score = 0
    try:
        for entry in error_weights:
            errors = entry[1]
            share = entry[0] / len(errors)
            for error_percentage in errors:
                score = score + share * (1 - error_percentage)
    except ZeroDivisionError:
        # an empty error list resets the whole score
        score = 0
    return score
def compute_whitespace(d):
    """Calculates the percentage of empty strings in a
    two-dimensional list.

    Parameters
    ----------
    d : list
        Two-dimensional list of strings.

    Returns
    -------
    whitespace : float
        Percentage of empty cells; a cell is empty when it contains only
        whitespace. 0.0 when there are no cells at all.

    """
    # Count the cells that actually exist, so an empty table cannot divide
    # by zero (or index a missing first row).
    total_cells = sum(len(row) for row in d)
    if total_cells == 0:
        return 0.0
    empty_cells = sum(1 for row in d for cell in row if cell.strip() == "")
    return 100 * (empty_cells / float(total_cells))
def get_page_layout(
    filename,
    line_overlap=0.5,
    char_margin=1.0,
    line_margin=0.5,
    word_margin=0.1,
    boxes_flow=0.5,
    detect_vertical=True,
    all_texts=True,
):
    """Returns a PDFMiner LTPage object and page dimension of a single
    page pdf. To get the definitions of kwargs, see
    https://pdfminersix.rtfd.io/en/latest/reference/composable.html.

    Every page of the document is processed in turn; the layout and
    dimension returned are those of the page processed last.

    Parameters
    ----------
    filename : string
        Path to pdf file.
    line_overlap : float
    char_margin : float
    line_margin : float
    word_margin : float
    boxes_flow : float
    detect_vertical : bool
    all_texts : bool

    Returns
    -------
    layout : object
        PDFMiner LTPage object.
    dim : tuple
        Dimension of pdf page in the form (width, height).

    Raises
    ------
    PDFTextExtractionNotAllowed
        If the document is not marked as extractable.

    """
    with open(filename, "rb") as pdf_file:
        document = PDFDocument(PDFParser(pdf_file))
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed(
                f"Text extraction is not allowed: {filename}"
            )

        layout_options = {
            "line_overlap": line_overlap,
            "char_margin": char_margin,
            "line_margin": line_margin,
            "word_margin": word_margin,
            "boxes_flow": boxes_flow,
            "detect_vertical": detect_vertical,
            "all_texts": all_texts,
        }
        resources = PDFResourceManager()
        aggregator = PDFPageAggregator(resources, laparams=LAParams(**layout_options))
        interpreter = PDFPageInterpreter(resources, aggregator)

        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = aggregator.get_result()
            # the page bbox ends with (width, height)
            dim = (layout.bbox[2], layout.bbox[3])
        return layout, dim
def get_text_objects(layout, ltype="char", t=None):
    """Recursively parses pdf layout to get a list of
    PDFMiner text objects.

    Parameters
    ----------
    layout : object
        PDFMiner LTPage object.
    ltype : string
        Specify 'char', 'image', 'horizontal_text' or 'vertical_text'
        to get LTChar, LTImage, LTTextLineHorizontal and
        LTTextLineVertical objects respectively.
    t : list
        Optional list to which the objects found are appended.

    Returns
    -------
    t : list
        List of PDFMiner text objects.

    Raises
    ------
    ValueError
        If ``ltype`` is not one of the supported names (previously this
        surfaced as an unbound local variable).

    """
    if ltype == "char":
        LTObject = LTChar
    elif ltype == "image":
        LTObject = LTImage
    elif ltype == "horizontal_text":
        LTObject = LTTextLineHorizontal
    elif ltype == "vertical_text":
        LTObject = LTTextLineVertical
    else:
        raise ValueError(f"Unknown ltype: {ltype!r}")

    if t is None:
        t = []

    # objects without children have no ``_objs``; that ends the recursion
    try:
        children = layout._objs
    except AttributeError:
        return t

    for obj in children:
        if isinstance(obj, LTObject):
            t.append(obj)
        else:
            t += get_text_objects(obj, ltype=ltype)
    return t
camelot-py-0.11.0/docs/ 0000775 0000000 0000000 00000000000 14376573757 0014623 5 ustar 00root root 0000000 0000000 camelot-py-0.11.0/docs/Makefile 0000664 0000000 0000000 00000016637 14376573757 0016300 0 ustar 00root root 0000000 0000000 # Makefile for Sphinx documentation
#
# You can set these variables from the command line.
# Extra options appended to every sphinx-build invocation (empty by default).
SPHINXOPTS =
# Program run by the builder targets.
SPHINXBUILD = sphinx-build
# Selects one of the PAPEROPT_* values below through $(PAPEROPT_$(PAPER));
# the help text names a4 and letter.
PAPER =
# Output root: every builder target writes to a subdirectory of it and
# `clean` removes its contents.
BUILDDIR = _build
# Internal variables.
PAPEROPT_a4 = -D latex_paper_size=a4
PAPEROPT_letter = -D latex_paper_size=letter
# Common sphinx-build arguments: doctree cache directory, paper option,
# user options, and the current directory as the source directory.
ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
# the i18n builder cannot share the environment and doctrees with the others
I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
# Lists the available builder targets. The first message had lost its
# <target> placeholders, leaving an unreadable sentence.
.PHONY: help
help:
	@echo "Please use \`make <target>' where <target> is one of"
	@echo " html to make standalone HTML files"
	@echo " dirhtml to make HTML files named index.html in directories"
	@echo " singlehtml to make a single large HTML file"
	@echo " pickle to make pickle files"
	@echo " json to make JSON files"
	@echo " htmlhelp to make HTML files and a HTML help project"
	@echo " qthelp to make HTML files and a qthelp project"
	@echo " applehelp to make an Apple Help Book"
	@echo " devhelp to make HTML files and a Devhelp project"
	@echo " epub to make an epub"
	@echo " epub3 to make an epub3"
	@echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
	@echo " latexpdf to make LaTeX files and run them through pdflatex"
	@echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
	@echo " text to make text files"
	@echo " man to make manual pages"
	@echo " texinfo to make Texinfo files"
	@echo " info to make Texinfo files and run them through makeinfo"
	@echo " gettext to make PO message catalogs"
	@echo " changes to make an overview of all changed/added/deprecated items"
	@echo " xml to make Docutils-native XML files"
	@echo " pseudoxml to make pseudoxml-XML files for display purposes"
	@echo " linkcheck to check all external links for integrity"
	@echo " doctest to run all doctests embedded in the documentation (if enabled)"
	@echo " coverage to run coverage check of the documentation (if enabled)"
	@echo " dummy to check syntax errors of document sources"
# Remove everything under $(BUILDDIR).
# The guard aborts when BUILDDIR is empty (e.g. `make BUILDDIR= clean`),
# because the glob below would otherwise expand to `rm -rf /*`.
.PHONY: clean
clean:
	$(if $(strip $(BUILDDIR)),,$(error BUILDDIR is empty - refusing to run rm -rf /*))
	rm -rf $(BUILDDIR)/*
# Standalone HTML pages. $@ is the target name, which doubles as the
# Sphinx builder name and the output subdirectory.
.PHONY: html
html:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
# HTML pages named index.html in per-page directories. $@ supplies both
# the builder name and the output subdirectory.
.PHONY: dirhtml
dirhtml:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
# One large HTML file. $@ supplies both the builder name and the output
# subdirectory.
.PHONY: singlehtml
singlehtml:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
# Pickle files. $@ supplies both the builder name and the output
# subdirectory.
.PHONY: pickle
pickle:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished; now you can process the pickle files."
# JSON files. $@ supplies both the builder name and the output
# subdirectory.
.PHONY: json
json:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished; now you can process the JSON files."
# HTML files plus an HTML Help project. $@ supplies both the builder name
# and the output subdirectory.
.PHONY: htmlhelp
htmlhelp:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished; now you can run HTML Help Workshop with the" \
	      ".hhp project file in $(BUILDDIR)/htmlhelp."
# HTML files plus a qthelp project, followed by usage hints.
# The inner double quotes around qcollectiongenerator are escaped so the
# shell prints them instead of treating them as string delimiters.
.PHONY: qthelp
qthelp:
	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
	@echo
	@echo "Build finished; now you can run \"qcollectiongenerator\" with the" \
	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/camelot.qhcp"
	@echo "To view the help file:"
	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/camelot.qhc"
# Apple Help Book. $@ supplies both the builder name and the output
# subdirectory.
.PHONY: applehelp
applehelp:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished. The help book is in $(BUILDDIR)/applehelp."
	@echo "N.B. You won't be able to view it unless you put it in" \
	      "~/Library/Documentation/Help or install it in your application" \
	      "bundle."
# HTML files plus a Devhelp project, followed by usage hints.
# $$HOME is escaped so the literal shell variable reaches echo.
.PHONY: devhelp
devhelp:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished."
	@echo "To view the help file:"
	@echo "# mkdir -p $$HOME/.local/share/devhelp/camelot"
	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/camelot"
	@echo "# devhelp"
# EPUB output. $@ supplies both the builder name and the output
# subdirectory.
.PHONY: epub
epub:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
# EPUB 3 output. $@ supplies both the builder name and the output
# subdirectory.
.PHONY: epub3
epub3:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished. The epub3 file is in $(BUILDDIR)/epub3."
# LaTeX sources only; see latexpdf for the PDF step. $@ supplies both the
# builder name and the output subdirectory.
.PHONY: latex
latex:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
	@echo "Run \`make' in that directory to run these through (pdf)latex" \
	      "(use \`make latexpdf' here to do that automatically)."
# Generate LaTeX sources, then run the all-pdf target of the makefile in
# $(BUILDDIR)/latex. $(MAKE) is used for the recursive call so that
# command-line flags propagate to the sub-make.
.PHONY: latexpdf
latexpdf:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo "Running LaTeX files through pdflatex..."
	$(MAKE) -C $(BUILDDIR)/latex all-pdf
	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
# Generate LaTeX sources, then run the all-pdf-ja target of the makefile
# in $(BUILDDIR)/latex (the platex/dvipdfmx route).
.PHONY: latexpdfja
latexpdfja:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo "Running LaTeX files through platex and dvipdfmx..."
	$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
# Plain-text output. $@ supplies both the builder name and the output
# subdirectory.
.PHONY: text
text:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished. The text files are in $(BUILDDIR)/text."
# Manual pages. $@ supplies both the builder name and the output
# subdirectory.
.PHONY: man
man:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
# Texinfo sources only; see info for the makeinfo step. $@ supplies both
# the builder name and the output subdirectory.
.PHONY: texinfo
texinfo:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
	@echo "Run \`make' in that directory to run these through makeinfo" \
	      "(use \`make info' here to do that automatically)."
# Generate Texinfo sources, then run the info target of the makefile in
# $(BUILDDIR)/texinfo.
# The recursive call goes through $(MAKE) rather than a literal `make` so
# that -j/jobserver, -n and command-line variables reach the sub-make,
# matching the latexpdf and latexpdfja targets.
.PHONY: info
info:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo "Running Texinfo files through makeinfo..."
	$(MAKE) -C $(BUILDDIR)/texinfo info
	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
# PO message catalogs. Uses I18NSPHINXOPTS (no shared doctree directory)
# and writes to $(BUILDDIR)/locale; $@ supplies only the builder name.
.PHONY: gettext
gettext:
	$(SPHINXBUILD) -b $@ $(I18NSPHINXOPTS) $(BUILDDIR)/locale
	@echo
	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
# Overview of changed/added/deprecated items. $@ supplies both the builder
# name and the output subdirectory.
.PHONY: changes
changes:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "The overview file is in $(BUILDDIR)/changes."
# Check external links. $@ supplies both the builder name and the output
# subdirectory.
.PHONY: linkcheck
linkcheck:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Link check complete; look for any errors in the above output " \
	      "or in $(BUILDDIR)/linkcheck/output.txt."
# Run doctests embedded in the documentation. $@ supplies both the builder
# name and the output subdirectory.
.PHONY: doctest
doctest:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo "Testing of doctests in the sources finished, look at the " \
	      "results in $(BUILDDIR)/doctest/output.txt."
# Documentation coverage check. $@ supplies both the builder name and the
# output subdirectory.
.PHONY: coverage
coverage:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo "Testing of coverage in the sources finished, look at the " \
	      "results in $(BUILDDIR)/coverage/python.txt."
# Docutils-native XML files. $@ supplies both the builder name and the
# output subdirectory.
.PHONY: xml
xml:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished. The XML files are in $(BUILDDIR)/xml."
# Pseudo-XML files for display purposes. $@ supplies both the builder name
# and the output subdirectory.
.PHONY: pseudoxml
pseudoxml:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
# Syntax check of the document sources; the dummy builder writes no output
# files. $@ supplies both the builder name and the output subdirectory.
.PHONY: dummy
dummy:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished. Dummy builder generates no files."
camelot-py-0.11.0/docs/_static/ 0000775 0000000 0000000 00000000000 14376573757 0016251 5 ustar 00root root 0000000 0000000 camelot-py-0.11.0/docs/_static/camelot.png 0000664 0000000 0000000 00000013264 14376573757 0020411 0 ustar 00root root 0000000 0000000 PNG
IHDR E P _h\ pHYs )I fIDATxQy:uTTQb^>aKؕRUT)^ºSb#'
/fjڴ"⼌塊yhlVn*XSC0uusw=3;Ds=7s9G"N jYkxLA24f
COqJ@SE`EDUQl}tO |