pax_global_header00006660000000000000000000000064145016055620014516gustar00rootroot0000000000000052 comment=ca191ba776743fe10a1c3dd99eaa82626ee4edc1 pyocr-0.8.5/000077500000000000000000000000001450160556200126645ustar00rootroot00000000000000pyocr-0.8.5/.git_archival.txt000066400000000000000000000001411450160556200161330ustar00rootroot00000000000000ref-names: HEAD -> master, tag: 0.8.5, refs/keep-around/ca191ba776743fe10a1c3dd99eaa82626ee4edc1 pyocr-0.8.5/.gitattributes000066400000000000000000000000371450160556200155570ustar00rootroot00000000000000.git_archival.txt export-subst pyocr-0.8.5/.gitignore000066400000000000000000000001201450160556200146450ustar00rootroot00000000000000*.pyc *~ build/ dist/ pyocr.egg-info/ venv*/ .tox/ .egg*/ _version.py .coverage pyocr-0.8.5/.gitlab-ci.yml000066400000000000000000000017261450160556200153260ustar00rootroot00000000000000image: debian:bullseye .apt: &apt before_script: - apt-get update # WORKAROUND: manpages is required to install some of the openjdk-jre # and openjdk-jre is required for plantuml, which is required to generate # documentation - mkdir -p /usr/share/man/man1 - apt-get install -y -qq git - apt-get install -y -qq make - apt-get install -y -qq python3 - apt-get install -y -qq python3-pip check: only: - branches@World/OpenPaperwork/pyocr - tags@World/OpenPaperwork/pyocr <<: *apt script: - pip install ".[lint]" - make check test: only: - branches@World/OpenPaperwork/pyocr - tags@World/OpenPaperwork/pyocr <<: *apt script: # required for Pillow - apt-get install -y -qq zlib1g-dev - apt-get install -y -qq libjpeg-dev - apt-get install -y -qq libpng-dev - apt-get install -y -qq build-essential gcc - apt-get install -y -qq python3-dev - pip install ".[dev]" - make test pyocr-0.8.5/AUTHORS000066400000000000000000000206271450160556200137430ustar00rootroot00000000000000$ git quick-stats -T >| AUTHORS # some authors appears many times with different names. 
They have been # merged together # sorted by line inserted Contribution stats (by author): Jerome Flesch : insertions: 12901 deletions: 2973 files: 461 (46%) commits: 304 lines changed: 15874 first commit: Mon Sep 19 20:16:37 2011 +0200 last commit: Fri Mar 4 18:49:00 2022 +0100 Thomas Perret : insertions: 4742 (24%) deletions: 6361 (56%) files: 145 (14%) commits: 24 (6%) lines changed: 11103 (36%) first commit: Thu Nov 8 16:28:21 2018 +0100 last commit: Thu Dec 6 11:31:09 2018 +0100 Elliott Sales de Andrade : insertions: 478 (2%) deletions: 911 (8%) files: 73 (7%) commits: 23 (5%) lines changed: 1389 (4%) first commit: Wed Aug 1 17:11:12 2018 -0400 last commit: Thu May 16 05:26:00 2019 -0400 Samuel Hoffstaetter : insertions: 310 (2%) deletions: 115 (1%) files: 22 (2%) commits: 15 (3%) lines changed: 425 (1%) first commit: Tue Jun 23 21:59:22 2009 -0400 last commit: Wed Jun 24 19:15:50 2009 -0400 Benjamin Nguyen-Van-Yen : insertions: 595 (3%) deletions: 600 (5%) files: 30 (3%) commits: 9 (2%) lines changed: 1195 (4%) first commit: Tue Nov 15 23:30:49 2016 +0100 last commit: Tue Nov 22 21:02:32 2016 +0100 Paulo Miguel Almeida : insertions: 70 (0%) deletions: 51 (0%) files: 6 (1%) commits: 6 (1%) lines changed: 121 (0%) first commit: Thu May 7 15:30:51 2015 -0300 last commit: Sun May 31 16:41:57 2015 -0300 Adriano Pagano : insertions: 62 (0%) deletions: 21 (0%) files: 8 (1%) commits: 6 (1%) lines changed: 83 (0%) first commit: Wed Nov 15 15:35:56 2017 +0100 last commit: Thu Nov 30 16:27:59 2017 +0100 ZoranPavlovic : insertions: 23 (0%) deletions: 32 (0%) files: 5 (0%) commits: 5 (1%) lines changed: 55 (0%) first commit: Tue Jan 2 14:14:13 2018 +0200 last commit: Tue Jan 2 20:34:42 2018 +0200 Matthias Kraus : insertions: 195 (1%) deletions: 77 (1%) files: 7 (1%) commits: 4 (1%) lines changed: 272 (1%) first commit: Sat Apr 18 19:02:52 2020 +0200 last commit: Wed Jul 29 20:05:48 2020 +0200 Teis : insertions: 3 (0%) deletions: 2 (0%) files: 3 (0%) commits: 3 (1%) lines 
changed: 5 (0%) first commit: Mon May 2 12:37:53 2016 +0200 last commit: Mon May 2 14:48:40 2016 +0200 Chanwoong Kim : insertions: 26 (0%) deletions: 1 (0%) files: 2 (0%) commits: 2 (0%) lines changed: 27 (0%) first commit: Sun Feb 14 19:32:38 2016 +0900 last commit: Sun Feb 14 20:19:56 2016 +0900 Marian Skrip : insertions: 175 (1%) deletions: 11 (0%) files: 6 (1%) commits: 2 (0%) lines changed: 186 (1%) first commit: Wed Aug 30 14:45:38 2017 +0200 last commit: Wed Sep 13 09:34:13 2017 +0200 Marián Skrip : insertions: 13 (0%) deletions: 2 (0%) files: 2 (0%) commits: 2 (0%) lines changed: 15 (0%) first commit: Tue Oct 23 16:13:49 2018 +0200 last commit: Sat Feb 23 17:33:35 2019 +0100 Bernard Cafarelli : insertions: 1 (0%) deletions: 1 (0%) files: 1 (0%) commits: 1 (0%) lines changed: 2 (0%) first commit: Mon May 18 13:51:00 2015 +0200 last commit: Mon May 18 13:51:00 2015 +0200 Ashish Kulkarni : insertions: 16 (0%) deletions: 0 (0%) files: 2 (0%) commits: 2 (0%) lines changed: 16 (0%) first commit: Wed May 31 11:49:56 2017 +0530 last commit: Wed May 31 11:52:14 2017 +0530 Ross Vandegrift : insertions: 13 (0%) deletions: 1 (0%) files: 2 (0%) commits: 2 (0%) lines changed: 14 (0%) first commit: Sat Mar 22 21:56:45 2014 +0000 last commit: Sat Mar 22 21:48:16 2014 +0000 Fjup : insertions: 8 (0%) deletions: 3 (0%) files: 2 (0%) commits: 2 (0%) lines changed: 11 (0%) first commit: Sat May 7 20:22:34 2016 +0200 last commit: Sat May 7 20:32:30 2016 +0200 David Martin : insertions: 42 (0%) deletions: 25 (0%) files: 10 (1%) commits: 8 (2%) lines changed: 67 (0%) first commit: Sat May 13 20:17:38 2017 +1000 last commit: Wed Jun 13 21:04:33 2018 +1000 Yorun/悠然 : insertions: 1 (0%) deletions: 1 (0%) files: 1 (0%) commits: 1 (0%) lines changed: 2 (0%) first commit: Tue Mar 10 16:53:01 2020 +0000 last commit: Tue Mar 10 16:53:01 2020 +0000 oda : insertions: 19 (0%) deletions: 14 (0%) files: 1 (0%) commits: 1 (0%) lines changed: 33 (0%) first commit: Wed Apr 13 11:36:20 2022 +0900 
last commit: Wed Apr 13 11:36:20 2022 +0900 Bernhard Liebl : insertions: 9 (0%) deletions: 21 (0%) files: 1 (0%) commits: 1 (0%) lines changed: 30 (0%) first commit: Tue Aug 23 16:47:06 2016 +0200 last commit: Tue Aug 23 16:47:06 2016 +0200 Gian Luca Dalla Torre : insertions: 56 (0%) deletions: 1 (0%) files: 4 (0%) commits: 1 (0%) lines changed: 57 (0%) first commit: Mon Aug 4 10:22:03 2014 +0200 last commit: Mon Aug 4 10:22:03 2014 +0200 Yada : insertions: 1 (0%) deletions: 1 (0%) files: 1 (0%) commits: 1 (0%) lines changed: 2 (0%) first commit: Sat Mar 19 22:52:34 2016 +0900 last commit: Sat Mar 19 22:52:34 2016 +0900 Daniel Quinn : insertions: 1 (0%) deletions: 1 (0%) files: 1 (0%) commits: 1 (0%) lines changed: 2 (0%) first commit: Sat Apr 28 12:17:29 2018 +0100 last commit: Sat Apr 28 12:17:29 2018 +0100 Pedro-Juan Ferrer : insertions: 2 (0%) deletions: 1 (0%) files: 1 (0%) commits: 1 (0%) lines changed: 3 (0%) first commit: Wed Apr 16 15:11:26 2014 +0200 last commit: Wed Apr 16 15:11:26 2014 +0200 Jakub Semerák : insertions: 6 (0%) deletions: 0 (0%) files: 1 (0%) commits: 1 (0%) lines changed: 6 (0%) first commit: Mon Aug 15 09:24:01 2016 +0200 last commit: Mon Aug 15 09:24:01 2016 +0200 tom : insertions: 1 (0%) deletions: 1 (0%) files: 1 (0%) commits: 1 (0%) lines changed: 2 (0%) first commit: Tue Sep 6 17:51:27 2016 +0200 last commit: Tue Sep 6 17:51:27 2016 +0200 Jon Debonis : insertions: 1 (0%) deletions: 0 (0%) files: 1 (0%) commits: 1 (0%) lines changed: 1 (0%) first commit: Thu Sep 25 15:34:21 2014 -0700 last commit: Thu Sep 25 15:34:21 2014 -0700 aszlig : insertions: 101 (1%) deletions: 46 (0%) files: 5 (0%) commits: 1 (0%) lines changed: 147 (0%) first commit: Sun Apr 9 22:24:17 2017 +0200 last commit: Sun Apr 9 22:24:17 2017 +0200 Ramon Navarro Bosch : insertions: 1 (0%) deletions: 0 (0%) files: 1 (0%) commits: 1 (0%) lines changed: 1 (0%) first commit: Mon Jun 14 09:18:56 2021 +0000 last commit: Mon Jun 14 09:18:56 2021 +0000 jbochi : insertions: 
10 (0%) deletions: 3 (0%) files: 1 (0%) commits: 1 (0%) lines changed: 13 (0%) first commit: Wed Oct 27 21:09:50 2010 -0200 last commit: Wed Oct 27 21:09:50 2010 -0200 total: insertions: 19882 (100%) deletions: 11277 (100%) files: 1006 (100%) commits: 433 (100%) pyocr-0.8.5/COPYING000066400000000000000000001045131450160556200137230ustar00rootroot00000000000000 GNU GENERAL PUBLIC LICENSE Version 3, 29 June 2007 Copyright (C) 2007 Free Software Foundation, Inc. Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The GNU General Public License is a free, copyleft license for software and other kinds of works. The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users. We, the Free Software Foundation, use the GNU General Public License for most of our software; it applies also to any other work released this way by its authors. You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. To protect your rights, we need to prevent others from denying you these rights or asking you to surrender the rights. Therefore, you have certain responsibilities if you distribute copies of the software, or if you modify it: responsibilities to respect the freedom of others. 
For example, if you distribute copies of such a program, whether gratis or for a fee, you must pass on to the recipients the same freedoms that you received. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. Developers that use the GNU GPL protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License giving you legal permission to copy, distribute and/or modify it. For the developers' and authors' protection, the GPL clearly explains that there is no warranty for this free software. For both users' and authors' sake, the GPL requires that modified versions be marked as changed, so that their problems will not be attributed erroneously to authors of previous versions. Some devices are designed to deny users access to install or run modified versions of the software inside them, although the manufacturer can do so. This is fundamentally incompatible with the aim of protecting users' freedom to change the software. The systematic pattern of such abuse occurs in the area of products for individuals to use, which is precisely where it is most unacceptable. Therefore, we have designed this version of the GPL to prohibit the practice for those products. If such problems arise substantially in other domains, we stand ready to extend this provision to those domains in future versions of the GPL, as needed to protect the freedom of users. Finally, every program is threatened constantly by software patents. States should not allow patents to restrict development and use of software on general-purpose computers, but in those that do, we wish to avoid the special danger that patents applied to a free program could make it effectively proprietary. To prevent this, the GPL assures that patents cannot be used to render the program non-free. The precise terms and conditions for copying, distribution and modification follow. TERMS AND CONDITIONS 0. 
Definitions. "This License" refers to version 3 of the GNU General Public License. "Copyright" also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. "The Program" refers to any copyrightable work licensed under this License. Each licensee is addressed as "you". "Licensees" and "recipients" may be individuals or organizations. To "modify" a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a "modified version" of the earlier work or a work "based on" the earlier work. A "covered work" means either the unmodified Program or a work based on the Program. To "propagate" a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. To "convey" a work means any kind of propagation that enables other parties to make or receive copies. Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. An interactive user interface displays "Appropriate Legal Notices" to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. 1. Source Code. The "source code" for a work means the preferred form of the work for making modifications to it. 
"Object code" means any non-source form of a work. A "Standard Interface" means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. The "System Libraries" of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A "Major Component", in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. The "Corresponding Source" for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. The Corresponding Source for a work in source code form is that same work. 2. 
Basic Permissions. All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. 3. Protecting Users' Legal Rights From Anti-Circumvention Law. No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. 
When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. 4. Conveying Verbatim Copies. You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. 5. Conveying Modified Source Versions. You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: a) The work must carry prominent notices stating that you modified it, and giving a relevant date. b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to "keep intact all notices". c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. 
This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an "aggregate" if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. 6. Conveying Non-Source Forms. You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. 
b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. 
A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. A "User Product" is either (1) a "consumer product", which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, "normally used" refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. "Installation Information" for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. 
But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. 7. Additional Terms. "Additional permissions" are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. 
Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or d) Limiting the use for publicity purposes of names of licensors or authors of the material; or e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. All other non-permissive additional terms are considered "further restrictions" within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. 
If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. 8. Termination. You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. 9. Acceptance Not Required for Having Copies. You are not required to accept this License in order to receive or run a copy of the Program. 
Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. 10. Automatic Licensing of Downstream Recipients. Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. An "entity transaction" is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. 11. Patents. A "contributor" is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. 
The work thus licensed is called the contributor's "contributor version". A contributor's "essential patent claims" are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, "control" includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. In the following three paragraphs, a "patent license" is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To "grant" such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. 
"Knowingly relying" means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. A patent license is "discriminatory" if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. 12. No Surrender of Others' Freedom. 
If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. 13. Use with the GNU Affero General Public License. Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU Affero General Public License into a single combined work, and to convey the resulting work. The terms of this License will continue to apply to the part which is the covered work, but the special requirements of the GNU Affero General Public License, section 13, concerning interaction through a network will apply to the combination as such. 14. Revised Versions of this License. The Free Software Foundation may publish revised and/or new versions of the GNU General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU General Public License, you may choose any version ever published by the Free Software Foundation. 
If the Program specifies that a proxy can decide which future versions of the GNU General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. 15. Disclaimer of Warranty. THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. Limitation of Liability. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 17. Interpretation of Sections 15 and 16. 
If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. Copyright (C) This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . Also add information on how to contact you by electronic and paper mail. If the program does terminal interaction, make it output a short notice like this when it starts in an interactive mode: Copyright (C) This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. 
The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, your program's commands might be different; for a GUI interface, you would use an "about box". You should also get your employer (if you work as a programmer) or school, if any, to sign a "copyright disclaimer" for the program, if necessary. For more information on this, and how to apply and follow the GNU GPL, see . The GNU General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. But first, please read . pyocr-0.8.5/ChangeLog000066400000000000000000000165031450160556200144430ustar00rootroot0000000000000017/09/2023 - 0.8.5: - Add build-system.build-backend to pyproject.toml 16/09/2023 - 0.8.4: - Fix LineBoxBuilder: Take into account headers and footers too, not just the body. - switch from setup.py to pyproject.toml - switch from tox to pytest 26/06/2022 - 0.8.3: - Workaround https://github.com/pypa/setuptools_scm/issues/727 15/04/2022 - 0.8.2: - Add support for Tesseract 5 + Linux - Fix file descriptor leak (thanks to oda) 05/12/2021 - 0.8.1: - Make the dependency on setuptools_scm optional 01/01/2020 - 0.8.0: - Replaced libtesseract.image_to_pdf() by an object-oriented API that allows creating PDF with more than 1 page (thanks to Matthias Kraus).
- Tesseract 4 + sys.frozen=True: Fix TESSDATA_PREFIX: starting with Tesseract 4, the path must include tessdata/ 22/06/2019 - 0.7.2: - Fix setup.py on Windows 22/06/2019 - 0.7.1: - tesseract.can_detect_orientation(): only returns True if 'osd' data files are installed - setup.py: Fix installation in MSYS2 12/05/2019 - 0.7: - Drop support for Python <= 2.7 - Fix: Make sure the builder objects can be used to parse box files even if Tesseract is not installed. - PyOCR version is now automatically set in the module by setuptools_scm instead of PyOCR's Makefile (except on Windows) - Tesseract: optim: keep the result of get_version() in memory instead of calling Tesseract every time (get_version() is called by psm_parameter(), which is called each time a box file is parsed ...) 18/02/2019 - 0.6: - Complete rewrite of unit tests (thanks to Thomas Perret) - Libtesseract 4.0: Fix segfault when running orientation detection (thanks to Marián Skrip) - Libtesseract 4.0: Add a workaround: Tesseract needs the locale to be set to 'C' (thanks to Thomas Perret) - Libtesseract: Specify DPI of the image to Libtesseract (thanks to Thomas Perret) - Tesseract 4.0: Improve Tesseract version parsing 09/04/2018 - 0.5.3: - Really fix tesseract 4.0 support (thanks to David Martin) - Tests: switch from nose to pytest (thanks to Elliott Sales de Andrade) 25/07/2018 - 0.5.2: - Fix tesseract 4.0 support: Use option '--psm' instead of '-psm' - tesseract.detect_orientation(): Fix exception generation 01/03/2017 - 0.5.1: - libtesseract/Windows: Add possible DLL names for libtesseract - libtesseract: Keep track of library-loading errors in pyocr.libtesseract.lib_load_errors (useful for debugging) - Build method has been changed: Now use "make install" instead of "python3 ./setup.py install" - cosmetic: builders/WordHTMLParser: Message "OCR confidence not found" floods the logs when working with old documents --> switch to debug instead of info.
14/12/2017 - 0.5: - Tesseract/Libtesseract + LineBoxBuilder: Add confidence scores to every word boxes and to hOCR files (thanks to Adriano Pagano) - Tesseract 4 (shell): Add '--oem 0' to specify legacy model when doing orientation detection as orientation detection does not work yet with Tesseract 4 (thanks to Adriano Pagano) - Libtesseract: Fix multi-language support - Tesseract (shell) + Windows: Never let the cmd window appear - Libtesseract: Implements image_to_pdf() (thanks to Marian Skrip) - Libtesseract: Hide debug messages (thanks to Ashish Kulkarni) 13/05/2017 - 0.4.7: - Tesseract 4.00.00alpha: - Version parsing: Ignore suffix (so '4.00.00alpha' == (4, 0, 0)) - Libtesseract: Load libtesseract.so.4 instead of libtesseract.so.3 if available - Support for Tesseract 3.05.00: - Builders: Split field 'tess_conf' into 'tess_flags' and 'tess_conf' - Libtesseract: If available, use TessBaseAPIDetectOrientationScript() instead of TessBaseAPIDetectOS - Libtesseract: Workaround: Prevents possible segfault in image_to_string() when the target language is not available 26/01/2017 - 0.4.6: - hOCR outputs: Generate valid XHTML files 10/01/2017 - 0.4.5: - Clean up exceptions raised when OCR fails: - Now, all tools raise only exceptions inheriting from pyocr.PyocrException - There is now one and only one TesseractError (shared between pyocr.libtesseract and pyocr.tesseract) 08/12/2016 - 0.4.4: - Fix Python 2.7 support (broken import) 06/12/2016 - 0.4.3: - (temporary) Use tesseract-sh by default instead of libtesseract. Some people have reported crashes with Paperwork+libtesseract. 
It needs more stress-testing - DigitBuilder is now available in 'pyocr.builders' (can be used with libtesseract and cuneiform) - New builder: DigitLineBoxBuilder - Windows: Fix pyinstaller packaging support: env variable TESSDATA_PREFIX wasn't set correctly - Windows: Tesseract-sh: Prevent CMD windows from appearing 05/10/2016 - 0.4.2: * Tesseract: orientation detection: Ignore errors printed by Leptonica on stderr (thanks to TeisD) * Tesseract: Fix support of dev builds (thanks to Fjup) * Libtesseract: Fix support of dev builds (thanks to Jakub Semerák) * Tesseract: Use '--list-langs' to get the available languages instead of looking for the data directory (thanks to Bernhard Liebl) 06/04/2016 - 0.4.1: * Disable 'libtesseract' with Tesseract <= 3.03. It tends to segfault. Libtesseract: Disable it with Tesseract <= 3.03. It tends to segfault. Note: the segfault may not actually be related to Libtesseract. It may be due to other things in Debian stable (jessie). Anyway, Paperwork cannot work on Debian stable because of that --> disabled just to be safe 13/03/2013 - 0.4.0: * New module: 'libtesseract'. Use the C API of Tesseract for OCR. This module is more efficient and cleaner than the old 'tesseract' module (no more fork + exec + sh, less image manipulation, etc). Note that with this module the images are just loaded and uncompressed by Pillow. With 'tesseract', they were loaded, uncompressed, re-compressed and saved by Pillow, then reloaded by Leptonica. So the results may vary slightly. * Tesseract: Add support for Win32 * Tesseract: Fix orientation detection for version >= 3.04.01 0.3.1: * tesseract.detect_orientation(): Use a temporary file instead of stdin to transmit the image to Tesseract. Tesseract 3.04 doesn't support stdin + "-psm 0" (regression ?)
* tesseract.detect_orientation(): Improve output parsing reliability * optim: Avoid unnecessary convert to RGB and allow using image formats different from PNG * TextBuilder + Cuneiform: add extra settings for Cuneiform (cuneiform_dotmatrix, cuneiform_fax=False, cuneiform_singlecolumn) 0.3.0: * New API: pyocr..can_detect_orientation() and pyocr..detect_orientation() 0.2.4: * Tesseract : add digit-only support * Tesseract : add support for Tesseract subsets of layout analysis (-psm) 0.2.3: * Strip the alpha channel from images before running the OCR. It's basically useless and can prevent the tool from working correctly. * Make hOCR parsing more resistant (handle extra data around box positions) * Fix: Take into account that new versions of Tesseract uses the file extension .hocr instead of .html 0.2.2: * Fix Python 3 support * Add support for Tesseract on Heroku 0.2.1: * Make it possible to use 'import pyocr' instead of 'from pyocr import pyocr'. 'from pyocr import pyocr' still works but is obsolete. * Fix dependency list: depends on Pillow (it's untested with PIL) * Fix pyocr.VERSION 0.2.0: * Python 3.x support 0.1.2: * Tesseract: Fix version parsing * Tesseract: Fix Tesseract 3.02.01's hOCR format support 0.1.1: * hOCR: Parse lines as well as words * tesseract.get_available_languages() : Fix fedora support * Fix UTF-8 support pyocr-0.8.5/Makefile000066400000000000000000000026221450160556200143260ustar00rootroot00000000000000PYTHON = python3 build: echo "Run make build_py" echo "or make install" install: install_py install_c uninstall: uninstall_py build_py: pip install build build build_c: doc: install_py (cd doc && make html) cp doc/index.html doc/build/index.html check: flake8 src/ tests/ test: pytest -xv tests/ linux_exe: windows_exe: release: ifeq (${RELEASE}, ) @echo "You must specify a release version (make release RELEASE=1.2.3)" else @echo "Will release: ${RELEASE}" @echo "Checking release is in ChangeLog ..." 
grep ${RELEASE} ChangeLog | grep -v "/xx" @echo "Releasing ..." git tag -a ${RELEASE} -m ${RELEASE} git push origin ${RELEASE} make clean ${PYTHON} ./setup.py sdist twine upload dist/pyocr-${RELEASE}.tar.gz @echo "All done" endif clean: rm -rf doc/build rm -rf build dist *.egg-info rm -rf src/pyocr/__pycache__ rm -f src/pyocr/_version.py install_py: ${PYTHON} -m pip install ${PIP_ARGS} . install_c: uninstall_py: pip3 uninstall -y pyocr uninstall_c: help: @echo "make build || make build_py" @echo "make check" @echo "make doc" @echo "make help: display this message" @echo "make install || make install_py" @echo "make release" @echo "make test" @echo "make uninstall || make uninstall_py" .PHONY: \ build \ build_c \ build_py \ check \ doc \ linux_exe \ windows_exe \ help \ install \ install_c \ install_py \ release \ test \ uninstall \ uninstall_c pyocr-0.8.5/README.md000066400000000000000000000210751450160556200141500ustar00rootroot00000000000000# PyOCR PyOCR is an optical character recognition (OCR) tool wrapper for python. That is, it helps using various OCR tools from a Python program. It has been tested only on GNU/Linux systems. It should also work on similar systems (*BSD, etc). It may or may not work on Windows, MacOSX, etc. ## Supported OCR tools * Libtesseract (Python bindings for the C API) * Tesseract (wrapper: fork + exec) * Cuneiform (wrapper: fork + exec) ## Features * Supports all the image formats supported by [Pillow](https://github.com/python-imaging/Pillow), including jpeg, png, gif, bmp, tiff and others * Various output types: text only, bounding boxes, etc. * Orientation detection (Tesseract and libtesseract only) * Can focus on digits only (Tesseract and libtesseract only) * Can save and reload boxes in hOCR format * PDF generation (libtesseract only) ## Limitations * hOCR: Only a subset of the specification is supported. For instance, pages and paragraph positions are not stored. 
## Installation ```sh sudo pip3 install pyocr # Python 3.X ``` or the manual way: ```sh mkdir -p ~/git ; cd ~/git git clone https://gitlab.gnome.org/World/OpenPaperwork/pyocr.git cd pyocr make install # will run 'python3 -m pip install .' ``` ## Usage ### Initialization ```Python from PIL import Image import sys import pyocr import pyocr.builders tools = pyocr.get_available_tools() if len(tools) == 0: print("No OCR tool found") sys.exit(1) # The tools are returned in the recommended order of usage tool = tools[0] print("Will use tool '%s'" % (tool.get_name())) # Ex: Will use tool 'libtesseract' langs = tool.get_available_languages() print("Available languages: %s" % ", ".join(langs)) lang = langs[0] print("Will use lang '%s'" % (lang)) # Ex: Will use lang 'fra' # Note that languages are NOT sorted in any way. Please refer # to the system locale settings for the default language # to use. ``` ### Image to text ```Python txt = tool.image_to_string( Image.open('test.png'), lang=lang, builder=pyocr.builders.TextBuilder() ) # txt is a Python string word_boxes = tool.image_to_string( Image.open('test.png'), lang="eng", builder=pyocr.builders.WordBoxBuilder() ) # list of box objects. For each box object: # box.content is the word in the box # box.position is its position on the page (in pixels) # # Beware that some OCR tools (Tesseract for instance) # may return empty boxes line_and_word_boxes = tool.image_to_string( Image.open('test.png'), lang="fra", builder=pyocr.builders.LineBoxBuilder() ) # list of line objects. For each line object: # line.word_boxes is a list of word boxes (the individual words in the line) # line.content is the whole text of the line # line.position is the position of the whole line on the page (in pixels) # # Each word box object has an attribute 'confidence' giving the confidence # score provided by the OCR tool. Confidence score depends entirely on # the OCR tool. Only supported with Tesseract and Libtesseract (always 0 # with Cuneiform).
# # Beware that some OCR tools (Tesseract for instance) may return boxes # with an empty content. # Digits - Tesseract and libtesseract only digits = tool.image_to_string( Image.open('test-digits.png'), lang=lang, builder=pyocr.builders.DigitBuilder() ) # digits is a python string ``` Argument 'lang' is optional. The default value depends on the tool used. Argument 'builder' is optional. Default value is builders.TextBuilder(). If the OCR fails, an exception ```pyocr.PyocrException``` will be raised. An exception MAY be raised if the input image contains no text at all (depends on the OCR tool behavior). ### Orientation detection Currently only available with Tesseract or Libtesseract. ```Python if tool.can_detect_orientation(): try: orientation = tool.detect_orientation( Image.open('test.png'), lang='fra' ) except pyocr.PyocrException as exc: print("Orientation detection failed: {}".format(exc)) return print("Orientation: {}".format(orientation)) # Ex: Orientation: { # 'angle': 90, # 'confidence': 123.4, # } ``` Angles are given in degrees (range: [0-360[). Exact possible values depend on the tool used. Tesseract only returns angles = 0, 90, 180, 270. Confidence is a score arbitrarily defined by the tool. It MAY not be returned. detect_orientation() MAY raise an exception if there is no text detected in the image.
### Writing and reading text files Writing: ```Python import codecs import pyocr import pyocr.builders tool = pyocr.get_available_tools()[0] builder = pyocr.builders.TextBuilder() txt = tool.image_to_string( Image.open('test.png'), lang=lang, builder=builder ) # txt is a Python string with codecs.open("toto.txt", 'w', encoding='utf-8') as file_descriptor: builder.write_file(file_descriptor, txt) # toto.txt is a simple text file, encoded in utf-8 ``` Reading: ```Python import codecs import pyocr.builders builder = pyocr.builders.TextBuilder() with codecs.open("toto.txt", 'r', encoding='utf-8') as file_descriptor: txt = builder.read_file(file_descriptor) # txt is a Python string ``` ### Writing and reading hOCR files Writing: ```Python import codecs import pyocr import pyocr.builders tool = pyocr.get_available_tools()[0] builder = pyocr.builders.LineBoxBuilder() line_boxes = tool.image_to_string( Image.open('test.png'), lang=lang, builder=builder ) # list of LineBox (each box points to a list of word boxes) with codecs.open("toto.html", 'w', encoding='utf-8') as file_descriptor: builder.write_file(file_descriptor, line_boxes) # toto.html is a valid XHTML file ``` Reading: ```Python import codecs import pyocr.builders builder = pyocr.builders.LineBoxBuilder() with codecs.open("toto.html", 'r', encoding='utf-8') as file_descriptor: line_boxes = builder.read_file(file_descriptor) # list of LineBox (each box points to a list of word boxes) ``` ### Generating PDF file from an image With libtesseract >= 4, it's possible to generate a PDF from an image: ```Python import PIL.Image import pyocr image = PIL.Image.open("image.jpg") builder = pyocr.libtesseract.LibtesseractPdfBuilder() builder.add_image(image) # multiple images are added as separate pages builder.set_lang("deu") # optional builder.set_output_file("output_filename") # .pdf will be appended builder.build() ``` #### Add text layer to PDF ```Python import pyocr import pdf2image images = 
pdf2image.convert_from_path("file.pdf", dpi=200, fmt='jpg') builder = pyocr.libtesseract.LibtesseractPdfBuilder() for image in images: builder.add_image(image) builder.set_output_file("output") # .pdf will be appended builder.build() ``` Beware this code hasn't been adapted to libtesseract 3 yet. ## Dependencies * PyOCR requires Python 3.4 or later. * You will need [Pillow](https://github.com/python-imaging/Pillow) or Python Imaging Library (PIL). Under Debian/Ubuntu, Pillow is in the package ```python-pil``` (```python3-pil``` for the Python 3 version). * Install an OCR: * [libtesseract](http://code.google.com/p/tesseract-ocr/) ('libtesseract3' + 'tesseract-ocr-<lang>' in Debian). * or [tesseract-ocr](http://code.google.com/p/tesseract-ocr/) ('tesseract-ocr' + 'tesseract-ocr-<lang>' in Debian). You must be able to invoke the tesseract command as "tesseract". PyOCR is tested with Tesseract >= 3.01 only. * or Cuneiform ## Tests ```sh make check # requires flake8 make test # requires pytest and python3 ``` Tests are made to be run without external dependencies (no Tesseract or Cuneiform needed). ## OCR on natural scenes If you want to run OCR on natural scenes (photos, etc), you will have to filter the image first. Many algorithms can do that. One of those that give the best results is [Stroke Width Transform](https://gitlab.gnome.org/World/OpenPaperwork/libpillowfight#stroke-width-transformation). ## Contact * [Forum](https://forum.openpaper.work/) * [Bug tracker](https://gitlab.gnome.org/World/OpenPaperwork/pyocr/issues) ## Applications that use PyOCR * [Mayan EDMS](http://mayan-edms.com/) * [Paperless](https://github.com/danielquinn/paperless#readme) * [Paperwork](https://gitlab.gnome.org/World/OpenPaperwork/paperwork#readme) If you know of any other applications that use PyOCR, please [tell us](https://forum.openpaper.work/) :-) ## Copyright PyOCR is released under the GPL v3+.
Copyright belongs to the authors of each piece of code (see the file AUTHORS for the contributors list, and ```git blame``` to know which lines belong to which author). https://gitlab.gnome.org/World/OpenPaperwork/pyocr pyocr-0.8.5/doc/000077500000000000000000000000001450160556200134315ustar00rootroot00000000000000pyocr-0.8.5/doc/Makefile000066400000000000000000000011441450160556200150710ustar00rootroot00000000000000# Minimal makefile for Sphinx documentation # # You can set these variables from the command line. SPHINXOPTS = SPHINXBUILD = python3 -m sphinx SPHINXPROJ = PyOCR SOURCEDIR = source BUILDDIR = build # Put it first so that "make" without argument is like "make help". help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) .PHONY: help Makefile # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) pyocr-0.8.5/doc/index.html000066400000000000000000000005031450160556200154240ustar00rootroot00000000000000 PyOCR documentation pyocr-0.8.5/doc/source/000077500000000000000000000000001450160556200147315ustar00rootroot00000000000000pyocr-0.8.5/doc/source/conf.py000066400000000000000000000112301450160556200162250ustar00rootroot00000000000000#!/usr/bin/env python3 # -*- coding: utf-8 -*- # # PyOCR documentation build configuration file, created by # sphinx-quickstart on Fri Feb 9 08:39:36 2018. # # This file is execfile()d with the current directory set to its # containing dir. # # Note that not all possible configuration values are present in this # autogenerated file. # # All configuration values have a default; values that are commented out # serve to show the default. # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. 
If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # # import os # import sys # sys.path.insert(0, os.path.abspath('.')) # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. # # needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ 'sphinx.ext.autodoc', 'sphinx.ext.doctest', 'sphinx.ext.todo', 'sphinx.ext.viewcode' ] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # # source_suffix = ['.rst', '.md'] source_suffix = '.rst' # The master toctree document. master_doc = 'index' # General information about the project. project = 'PyOCR' copyright = '2018, OpenPaper.work' author = 'OpenPaper.work' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. version = '' # The full version, including alpha/beta/rc tags. release = '' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. language = None # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This patterns also effect to html_static_path and html_extra_path exclude_patterns = [] # The name of the Pygments (syntax highlighting) style to use. pygments_style = 'sphinx' # If true, `todo` and `todoList` produce output, else they produce nothing. 
todo_include_todos = True # -- Options for HTML output ---------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # html_theme = 'alabaster' # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. # # html_theme_options = {} # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] # -- Options for HTMLHelp output ------------------------------------------ # Output file base name for HTML help builder. htmlhelp_basename = 'PyOCRdoc' # -- Options for LaTeX output --------------------------------------------- latex_elements = { # The paper size ('letterpaper' or 'a4paper'). # # 'papersize': 'letterpaper', # The font size ('10pt', '11pt' or '12pt'). # # 'pointsize': '10pt', # Additional stuff for the LaTeX preamble. # # 'preamble': '', # Latex figure (float) alignment # # 'figure_align': 'htbp', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ (master_doc, 'PyOCR.tex', 'PyOCR Documentation', 'OpenPaper.work', 'manual'), ] # -- Options for manual page output --------------------------------------- # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ (master_doc, 'pyocr', 'PyOCR Documentation', [author], 1) ] # -- Options for Texinfo output ------------------------------------------- # Grouping the document tree into Texinfo files. 
List of tuples # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ (master_doc, 'PyOCR', 'PyOCR Documentation', author, 'PyOCR', 'One line description of project.', 'Miscellaneous'), ] pyocr-0.8.5/doc/source/index.rst000066400000000000000000000007771450160556200166050ustar00rootroot00000000000000.. PyOCR documentation master file, created by sphinx-quickstart on Fri Feb 9 08:39:36 2018. You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. Welcome to PyOCR's documentation! ================================= .. toctree:: :maxdepth: 2 :caption: Contents: PyOCR Python API ================ .. automodule:: pyocr :members: Indices and tables ================== * :ref:`genindex` * :ref:`modindex` * :ref:`search` pyocr-0.8.5/pyproject.toml000066400000000000000000000011101450160556200155710ustar00rootroot00000000000000[project] name = "pyocr" description = "A Python wrapper for OCR engines (Tesseract, Cuneiform, etc)" dynamic = ["version"] authors = [ {name = "Jerome Flesch", email = "jflesch@openpaper.work" }, ] license = {text = "GPL-3.0-or-later"} readme = {file = "README.md", content-type = "text/markdown"} dependencies = [ "Pillow", ] [project.optional-dependencies] dev = [ "pytest", ] lint = [ "flake8", ] [tool.setuptools_scm] write_to = "src/pyocr/_version.py" [build-system] requires = ["setuptools>=45", "setuptools_scm[toml]>=6.2"] build-backend = "setuptools.build_meta" pyocr-0.8.5/src/pyocr/__init__.py000066400000000000000000000012721450160556200167220ustar00rootroot00000000000000# NOTE: This file must remain Python 2 compatible for the foreseeable future, # to ensure that we error out properly for existing editable installs.
def _compare_positions(box, other):
    """
    Three-way comparison of two boxes based on their positions only.

    Coordinates are compared in this order: min y, max y, min x, max x.
    Returns -1, 0 or 1. When `other` is None or has no usable `position`
    attribute, -1 is returned.
    """
    if other is None or getattr(other, "position", None) is None:
        return -1
    mine = box.position
    theirs = other.position
    for (point, axis) in ((0, 1), (1, 1), (0, 0), (1, 0)):
        if mine[point][axis] < theirs[point][axis]:
            return -1
        if mine[point][axis] > theirs[point][axis]:
            return 1
    return 0


def _hash_position(position):
    """
    Hash the four coordinates of a box position.

    Equality between boxes only looks at the position, so the hash must not
    depend on anything else: objects that compare equal must hash equal.
    """
    return hash((
        position[0][0], position[0][1],
        position[1][0], position[1][1],
    ))


class Box(object):
    """
    Boxes are rectangles around each individual element recognized in the
    image. Elements are either char or word depending on the builder that
    was used.

    Comparisons, equality and hashing only take the position into account,
    never the content or the confidence.
    """

    def __init__(self, content, position, confidence=0):
        """
        Arguments:
            content --- a single string
            position --- the position of the box on the image. Given as a
                tuple of tuple:
                ((box_pt_min_x, box_pt_min_y), (box_pt_max_x, box_pt_max_y))
            confidence --- confidence value attached to the content
                (written as `x_wconf` by get_xml_tag())
        """
        self.content = content
        self.position = position
        self.confidence = confidence

    def get_xml_tag(self, parent_doc):
        """
        Return an `ocrx_word` span element, created with `parent_doc`,
        holding the bounding box, the confidence and the content.
        """
        span_tag = parent_doc.createElement("span")
        span_tag.setAttribute("class", "ocrx_word")
        span_tag.setAttribute("title", ("bbox %d %d %d %d; x_wconf %d" % (
            (self.position[0][0], self.position[0][1],
             self.position[1][0], self.position[1][1],
             self.confidence))))
        txt = xml.dom.minidom.Text()
        txt.data = self.content
        span_tag.appendChild(txt)
        return span_tag

    def __str__(self):
        return "{} {} {} {} {}".format(
            self.content,
            self.position[0][0],
            self.position[0][1],
            self.position[1][0],
            self.position[1][1],
        )

    def __lt__(self, other):
        return _compare_positions(self, other) < 0

    def __gt__(self, other):
        return _compare_positions(self, other) > 0

    def __eq__(self, other):
        return _compare_positions(self, other) == 0

    def __le__(self, other):
        return _compare_positions(self, other) <= 0

    def __ge__(self, other):
        return _compare_positions(self, other) >= 0

    def __ne__(self, other):
        return _compare_positions(self, other) != 0

    def __hash__(self):
        # Position only: this is what __eq__ compares.
        return _hash_position(self.position)


class LineBox(object):
    """
    Boxes are rectangles around each individual element recognized in the
    image. LineBox are boxes around lines. LineBox contains Box.

    Comparisons, equality and hashing only take the position into account,
    never the word boxes.
    """

    def __init__(self, word_boxes, position):
        """
        Arguments:
            word_boxes --- a list of Box objects
            position --- the position of the box on the image. Given as a
                tuple of tuple:
                ((width_pt_x, height_pt_x), (width_pt_y, height_pt_y))
        """
        self.word_boxes = word_boxes
        self.position = position

    @property
    def content(self):
        """
        Content of all the word boxes, separated by single spaces and
        stripped of leading and trailing whitespace.
        """
        return u" ".join(box.content for box in self.word_boxes).strip()

    def get_xml_tag(self, parent_doc):
        """
        Return an `ocr_line` span element, created with `parent_doc`,
        containing the XML tag of each word box separated by spaces.
        """
        span_tag = parent_doc.createElement("span")
        span_tag.setAttribute("class", "ocr_line")
        span_tag.setAttribute("title", ("bbox %d %d %d %d" % (
            (self.position[0][0], self.position[0][1],
             self.position[1][0], self.position[1][1]))))
        for box_idx, box in enumerate(self.word_boxes):
            if box_idx:
                # separator between two consecutive words
                space = xml.dom.minidom.Text()
                space.data = " "
                span_tag.appendChild(space)
            box_xml = box.get_xml_tag(parent_doc)
            span_tag.appendChild(box_xml)
        return span_tag

    def __str__(self):
        txt = "[\n"
        for box in self.word_boxes:
            txt += " {} {} {} {} {}\n".format(
                box.content,
                box.position[0][0],
                box.position[0][1],
                box.position[1][0],
                box.position[1][1],
            )
        return "{}] {} {} {} {}".format(
            txt,
            self.position[0][0],
            self.position[0][1],
            self.position[1][0],
            self.position[1][1],
        )

    def __repr__(self):
        return f"LineBox({str(self)})"

    def __contains__(self, text: str):
        return text in self.content

    def __lt__(self, other):
        return _compare_positions(self, other) < 0

    def __gt__(self, other):
        return _compare_positions(self, other) > 0

    def __eq__(self, other):
        return _compare_positions(self, other) == 0

    def __le__(self, other):
        return _compare_positions(self, other) <= 0

    def __ge__(self, other):
        return _compare_positions(self, other) >= 0

    def __ne__(self, other):
        return _compare_positions(self, other) != 0

    def __hash__(self):
        # The content must not take part in the hash: two lines at the same
        # position compare equal whatever their words are.
        return _hash_position(self.position)
""" raise NotImplementedError("Implement in subclasses") # used with Libtesseract def start_line(self, box): # pragma: no cover """ Start a new line of output. """ raise NotImplementedError("Implement in subclasses") def add_word(self, word, box, confidence=0): # pragma: no cover """ Add a word to output. """ raise NotImplementedError("Implement in subclasses") def end_line(self): # pragma: no cover """ End a line in output. """ raise NotImplementedError("Implement in subclasses") def get_output(self): # pragma: no cover """ Return the output that has been built so far. """ raise NotImplementedError("Implement in subclasses") class TextBuilder(BaseBuilder): """ If passed to image_to_string(), image_to_string() will return a simple string. This string will be the output of the OCR tool, as-is. In other words, the raw text as produced by the tool. Warning: The returned string is encoded in UTF-8 """ def __init__(self, tesseract_layout=3, cuneiform_dotmatrix=False, cuneiform_fax=False, cuneiform_singlecolumn=False): from .tesseract import psm_parameter tess_flags = [psm_parameter(), str(tesseract_layout)] file_ext = ["txt"] cun_args = ["-f", "text"] # Add custom cuneiform parameters if needed for par, arg in [(cuneiform_dotmatrix, "--dotmatrix"), (cuneiform_fax, "--fax"), (cuneiform_singlecolumn, "--singlecolumn")]: if par: cun_args.append(arg) super(TextBuilder, self).__init__(file_ext, tess_flags, [], cun_args) self.tesseract_layout = tesseract_layout self.built_text = [] @staticmethod def read_file(file_descriptor): """ Read a file and extract the content as a string. """ return file_descriptor.read().strip() @staticmethod def write_file(file_descriptor, text): """ Write a string in a file. 
class _WordHTMLParser(HTMLParser):
    """
    Tesseract style: Tesseract provides handy but non-standard hOCR tags:
    ocrx_word

    After feed(), `boxes` holds one Box per word span and `lines` holds one
    LineBox per line span (with the word boxes closed inside that line).
    """

    WORD_TAG_TYPES = {'ocr_word', 'ocrx_word'}
    LINE_TAG_TYPES = {'ocr_header', 'ocr_footer', 'ocr_line'}

    def __init__(self):
        HTMLParser.__init__(self)

        # One entry per currently opened <span>, so that handle_endtag()
        # knows which kind of span is being closed.
        self.__tag_types = []

        self.__current_box_position = None
        self.__current_box_text = None
        self.__current_box_confidence = None
        self.boxes = []

        self.__current_line_position = None
        self.__current_line_content = []
        self.lines = []

    @staticmethod
    def __parse_confidence(title):
        """
        Extract the `x_wconf` value from a hOCR title attribute.
        Returns 0 when the title has no `x_wconf` entry.
        """
        for piece in title.split("; "):
            piece = piece.strip()
            if not piece.startswith("x_wconf"):
                continue
            confidence = piece.split(" ")[1]
            return int(confidence)
        logger.debug("OCR confidence measure not found. Assuming 0.")
        return 0

    @staticmethod
    def __parse_position(title):
        """
        Extract the `bbox` entry from a hOCR title attribute.
        Returns ((min_x, min_y), (max_x, max_y)); raises Exception when the
        title has no `bbox` entry.
        """
        for piece in title.split("; "):
            piece = piece.strip()
            if not piece.startswith("bbox"):
                continue
            piece = piece.split(" ")
            position = ((int(piece[1]), int(piece[2])),
                        (int(piece[3]), int(piece[4])))
            return position
        raise Exception("Invalid hocr position: %s" % title)

    def handle_starttag(self, tag, attrs):
        if tag != "span":
            return
        position = None
        tag_type = None
        for attr in attrs:
            if attr[0] == 'class':
                tag_type = attr[1]
            if attr[0] == 'title':
                position = attr[1]
        if position is None or tag_type is None:
            # handle_endtag() pops one entry for every </span>: a span we
            # do not care about must still be recorded, otherwise the stack
            # gets out of sync with the document.
            self.__tag_types.append("ignore")
            return
        if tag_type in self.WORD_TAG_TYPES:
            try:
                confidence = self.__parse_confidence(position)
                position = self.__parse_position(position)
                self.__current_box_confidence = confidence
                self.__current_box_position = position
            except Exception:
                # invalid position --> old format --> we ignore this tag
                self.__tag_types.append("ignore")
                return
            self.__current_box_text = ""
        elif tag_type in self.LINE_TAG_TYPES:
            self.__current_line_position = self.__parse_position(position)
            self.__current_line_content = []
        self.__tag_types.append(tag_type)

    def handle_data(self, data):
        # text is only collected while a word span is opened
        if self.__current_box_text is None:
            return
        self.__current_box_text += data

    def handle_endtag(self, tag):
        if tag != 'span':
            return
        if not self.__tag_types:
            # stray </span> without a matching opening tag
            return
        tag_type = self.__tag_types.pop()
        if tag_type in self.WORD_TAG_TYPES:
            if self.__current_box_text is None:
                return
            box_position = self.__current_box_position
            box = Box(self.__current_box_text, box_position,
                      self.__current_box_confidence)
            self.boxes.append(box)
            self.__current_line_content.append(box)
            self.__current_box_text = None
            return
        elif tag_type in self.LINE_TAG_TYPES:
            line = LineBox(
                self.__current_line_content,
                self.__current_line_position
            )
            self.lines.append(line)
            self.__current_line_content = []
            return

    def __str__(self):  # pragma: no cover
        return "WordHTMLParser"
Spaces have "-1 -1 -1 -1" for position". """ TAG_TYPE_CONTENT = 0 TAG_TYPE_POSITIONS = 1 def __init__(self): HTMLParser.__init__(self) self.boxes = [] self.__line_text = None self.__char_positions = None def handle_starttag(self, tag, attrs): if (tag != "span"): return tag_type = -1 for attr in attrs: if attr[0] == 'class': if attr[1] == 'ocr_line': tag_type = self.TAG_TYPE_CONTENT elif attr[1] == 'ocr_cinfo': tag_type = self.TAG_TYPE_POSITIONS if tag_type == self.TAG_TYPE_CONTENT: self.__line_text = "" self.__char_positions = [] return elif tag_type == self.TAG_TYPE_POSITIONS: for attr in attrs: if attr[0] == 'title': self.__char_positions = attr[1].split(" ") # strip x_bboxes self.__char_positions = self.__char_positions[1:] if self.__char_positions[-1] == "": self.__char_positions = self.__char_positions[:-1] try: while True: self.__char_positions.remove("-1") except ValueError: pass def handle_data(self, data): if self.__line_text is None: return self.__line_text += data def handle_endtag(self, tag): if self.__line_text is None or self.__char_positions == []: return words = self.__line_text.split(" ") for word in words: if word == "": continue positions = self.__char_positions[0:4 * len(word)] self.__char_positions = self.__char_positions[4 * len(word):] left_pos = min([int(positions[x]) for x in range(0, 4 * len(word), 4)]) top_pos = min([int(positions[x]) for x in range(1, 4 * len(word), 4)]) right_pos = max([int(positions[x]) for x in range(2, 4 * len(word), 4)]) bottom_pos = max([int(positions[x]) for x in range(3, 4 * len(word), 4)]) box_pos = ((left_pos, top_pos), (right_pos, bottom_pos)) box = Box(word, box_pos) self.boxes.append(box) self.__line_text = None def __str__(self): # pragma: no cover return "LineHTMLParser" class WordBoxBuilder(BaseBuilder): """ If passed to image_to_string(), image_to_string() will return an array of Box. Each box contains a word recognized in the image. 
""" def __init__(self, tesseract_layout=1): from .tesseract import psm_parameter tess_flags = [psm_parameter(), str(tesseract_layout)] file_ext = ["html", "hocr"] tess_conf = ["hocr"] cun_args = ["-f", "hocr"] super(WordBoxBuilder, self).__init__(file_ext, tess_flags, tess_conf, cun_args) self.word_boxes = [] self.tesseract_layout = tesseract_layout def read_file(self, file_descriptor): """ Extract of set of Box from the lines of 'file_descriptor' Return: An array of Box. """ parsers = [_WordHTMLParser(), _LineHTMLParser()] html_str = file_descriptor.read() for p in parsers: p.feed(html_str) if len(p.boxes) > 0: last_box = p.boxes[-1] if last_box.content == "": # some parser leave an empty box at the end p.boxes.pop(-1) return p.boxes return [] @staticmethod def write_file(file_descriptor, boxes): """ Write boxes in a box file. Output is a *very* *simplified* version of hOCR. Warning: The file_descriptor must support UTF-8 ! (see module 'codecs') """ global _XHTML_HEADER impl = xml.dom.minidom.getDOMImplementation() newdoc = impl.createDocument(None, "root", None) file_descriptor.write(_XHTML_HEADER) file_descriptor.write("\n") for box in boxes: xml_str = box.get_xml_tag(newdoc).toxml() file_descriptor.write("

" + xml_str + "

\n") file_descriptor.write("\n\n") def start_line(self, box): pass def add_word(self, word, box, confidence=0): self.word_boxes.append(Box(word, box, confidence)) def end_line(self): pass def get_output(self): return self.word_boxes def __str__(self): return "Word boxes" class LineBoxBuilder(BaseBuilder): """ If passed to image_to_string(), image_to_string() will return an array of LineBox. Each LineBox contains a list of word boxes. """ def __init__(self, tesseract_layout=1): from .tesseract import psm_parameter tess_flags = [psm_parameter(), str(tesseract_layout)] file_ext = ["html", "hocr"] tess_conf = ["hocr"] cun_args = ["-f", "hocr"] super(LineBoxBuilder, self).__init__(file_ext, tess_flags, tess_conf, cun_args) self.lines = [] self.tesseract_layout = tesseract_layout def read_file(self, file_descriptor): """ Extract of set of Box from the lines of 'file_descriptor' Return: An array of LineBox. """ parsers = [ (_WordHTMLParser(), lambda parser: parser.lines), (_LineHTMLParser(), lambda parser: [LineBox([box], box.position) for box in parser.boxes]), ] html_str = file_descriptor.read() for (parser, convertion) in parsers: parser.feed(html_str) if len(parser.boxes) > 0: last_box = parser.boxes[-1] if last_box.content == "": # some parser leave an empty box at the end parser.boxes.pop(-1) return convertion(parser) return [] @staticmethod def write_file(file_descriptor, boxes): """ Write boxes in a box file. Output is a *very* *simplified* version of hOCR. Warning: The file_descriptor must support UTF-8 ! (see module 'codecs') """ global _XHTML_HEADER impl = xml.dom.minidom.getDOMImplementation() newdoc = impl.createDocument(None, "root", None) file_descriptor.write(_XHTML_HEADER) file_descriptor.write("\n") for box in boxes: xml_str = box.get_xml_tag(newdoc).toxml() file_descriptor.write("

" + xml_str + "

class DigitLineBoxBuilder(LineBoxBuilder):
    """
    Variant of LineBoxBuilder asking the OCR tool for digits.

    If passed to image_to_string(), image_to_string() will return an array
    of LineBox. Each box contains a word recognized in the image with nearly
    only numeric characters [0-9.], depending on the tool.

    The only difference with LineBoxBuilder is the extra "digits" entry in
    `tesseract_configs`. `image_to_string` raises NotImplementedError with
    some tools (Cuneiform) unable to process the input this way.
    """

    def __init__(self, tesseract_layout=1):
        super().__init__(tesseract_layout)
        self.tesseract_configs.append("digits")

    def __str__(self):
        return "Digit line boxes"
import builders from .error import CuneiformError # CHANGE THIS IF CUNEIFORM IS NOT IN YOUR PATH, OR IS NAMED DIFFERENTLY CUNEIFORM_CMD = 'cuneiform' CUNEIFORM_DATA_POSSIBLE_PATHS = [ "/usr/local/share/cuneiform", "/usr/share/cuneiform", ] LANGUAGES_LINE_PREFIX = "Supported languages: " LANGUAGES_SPLIT_RE = re.compile("[^a-z]") VERSION_LINE_RE = re.compile(r"Cuneiform for \w+ (\d+).(\d+).(\d+)") __all__ = [ 'can_detect_orientation', 'get_available_builders', 'get_available_languages', 'get_name', 'get_version', 'image_to_string', 'is_available', 'CuneiformError', ] def can_detect_orientation(): return False def get_name(): return "Cuneiform (sh)" def get_available_builders(): return [ builders.TextBuilder, builders.WordBoxBuilder, builders.LineBoxBuilder, ] def temp_file(suffix): ''' Returns a temporary file ''' return tempfile.NamedTemporaryFile(prefix='cuneiform_', suffix=suffix) def image_to_string(image, lang=None, builder=None): if builder is None: builder = builders.TextBuilder() if "digits" in builder.tesseract_configs: raise NotImplementedError( "Numerical only : This option is not available with Cuneiform" ) with temp_file(builder.file_extensions[0]) as output_file: cmd = [CUNEIFORM_CMD] if lang is not None: cmd += ["-l", lang] cmd += builder.cuneiform_args cmd += ["-o", output_file.name] cmd += ["-"] # stdin if image.mode != "RGB": image = image.convert("RGB") img_data = BytesIO() image.save(img_data, format="BMP") proc = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) proc.stdin.write(img_data.getvalue()) proc.stdin.close() output = proc.stdout.read().decode('utf-8') retcode = proc.wait() if retcode: raise CuneiformError(retcode, output) with codecs.open(output_file.name, 'r', encoding='utf-8', errors='replace') as file_desc: results = builder.read_file(file_desc) return results def is_available(): return shutil.which(CUNEIFORM_CMD) is not None def get_available_languages(): proc = 
subprocess.Popen([CUNEIFORM_CMD, "-l"], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) output = proc.stdout.read().decode('utf-8') proc.wait() languages = [] for line in output.split("\n"): if not line.startswith(LANGUAGES_LINE_PREFIX): continue line = line[len(LANGUAGES_LINE_PREFIX):] for language in LANGUAGES_SPLIT_RE.split(line): if language == "": continue languages.append(language) return languages def get_version(): proc = subprocess.Popen([CUNEIFORM_CMD], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) output = proc.stdout.read().decode('utf-8') proc.wait() for line in output.split("\n"): m = VERSION_LINE_RE.match(line) if m is not None: g = m.groups() ver = (int(g[0]), int(g[1]), int(g[2])) return ver return None pyocr-0.8.5/src/pyocr/error.py000066400000000000000000000010571450160556200163150ustar00rootroot00000000000000class PyocrException(Exception): pass class TesseractError(PyocrException): """ Obsolete. You should look for PyocrException """ def __init__(self, status, message): PyocrException.__init__(self, message) self.status = status self.message = message self.args = (status, message) class CuneiformError(PyocrException): def __init__(self, status, message): PyocrException.__init__(self, message) self.status = status self.message = message self.args = (status, message) pyocr-0.8.5/src/pyocr/libtesseract/000077500000000000000000000000001450160556200172735ustar00rootroot00000000000000pyocr-0.8.5/src/pyocr/libtesseract/__init__.py000066400000000000000000000233761450160556200214170ustar00rootroot00000000000000''' libtesseract/ is a wrapper for google's Tesseract-OCR C API ( http://code.google.com/p/tesseract-ocr/ ). USAGE: > from PIL import Image > from pyocr.libtesseract import image_to_string > print(image_to_string(Image.open('test.png'))) > print(image_to_string(Image.open('test-european.jpg'), lang='fra')) COPYRIGHT: PyOCR is released under the GPL v3. 
Copyright (c) Jerome Flesch, 2011-2016 https://gitlab.gnome.org/World/OpenPaperwork/pyocr#readme ''' from os import devnull from .. import builders from . import tesseract_raw from ..error import TesseractError from ..util import digits_only import logging logger = logging.getLogger(__name__) __all__ = [ 'can_detect_orientation', 'detect_orientation', 'get_available_builders', 'get_available_languages', 'get_name', 'get_version', 'image_to_string', 'is_available', 'TesseractError', ] def can_detect_orientation(): langs = get_available_languages() return 'osd' in langs def detect_orientation(image, lang=None): # C-API with Tesseract 4 segfaults if running OSD_ONLY # psm mode with other than osd language # lang argument left purely for compatibility reasons # tested on 4.0.0-rc2 handle = tesseract_raw.init(lang='osd') try: tesseract_raw.set_page_seg_mode( handle, tesseract_raw.PageSegMode.OSD_ONLY ) tesseract_raw.set_image(handle, image) os = tesseract_raw.detect_os(handle) if os['confidence'] <= 0: raise TesseractError( "no script", "no script detected" ) orientation = { tesseract_raw.Orientation.PAGE_UP: 0, tesseract_raw.Orientation.PAGE_RIGHT: 90, tesseract_raw.Orientation.PAGE_DOWN: 180, tesseract_raw.Orientation.PAGE_LEFT: 270, }[os['orientation']] return { 'angle': orientation, 'confidence': os['confidence'] } finally: tesseract_raw.cleanup(handle) def get_name(): return "Tesseract (C-API)" def get_available_builders(): return [ builders.TextBuilder, builders.WordBoxBuilder, builders.DigitBuilder, builders.LineBoxBuilder, builders.DigitLineBoxBuilder, ] def _tess_box_to_pyocr_box(box): return ( (box[0], box[1]), (box[2], box[3]), ) def image_to_string(image, lang=None, builder=None): if builder is None: builder = builders.TextBuilder() handle = tesseract_raw.init(lang=lang) lvl_line = tesseract_raw.PageIteratorLevel.TEXTLINE lvl_word = tesseract_raw.PageIteratorLevel.WORD try: # XXX(Jflesch): Issue #51: # Tesseract TessBaseAPIRecognize() may segfault when the 
target # language is not available clang = lang if lang else "eng" for lang_item in clang.split("+"): if lang_item not in tesseract_raw.get_available_languages(handle): raise TesseractError( "no lang", "language {} is not available".format(lang_item) ) tesseract_raw.set_page_seg_mode( handle, builder.tesseract_layout ) tesseract_raw.set_debug_file(handle, devnull) tesseract_raw.set_image(handle, image) if "digits" in builder.tesseract_configs: tesseract_raw.set_is_numeric(handle, True) # XXX(JFlesch): PageIterator and ResultIterator are actually the # very same thing. If it changes, we are screwed. tesseract_raw.recognize(handle) res_iterator = tesseract_raw.get_iterator(handle) if res_iterator is None: raise TesseractError( "no script", "no script detected" ) page_iterator = tesseract_raw.result_iterator_get_page_iterator( res_iterator ) while True: if tesseract_raw.page_iterator_is_at_beginning_of( page_iterator, lvl_line): (r, box) = tesseract_raw.page_iterator_bounding_box( page_iterator, lvl_line ) assert r box = _tess_box_to_pyocr_box(box) builder.start_line(box) last_word_in_line = ( tesseract_raw.page_iterator_is_at_final_element( page_iterator, lvl_line, lvl_word ) ) word = tesseract_raw.result_iterator_get_utf8_text( res_iterator, lvl_word ) confidence = tesseract_raw.result_iterator_get_confidence( res_iterator, lvl_word ) if word is not None and confidence is not None and word != "": (r, box) = tesseract_raw.page_iterator_bounding_box( page_iterator, lvl_word ) assert r box = _tess_box_to_pyocr_box(box) builder.add_word(word, box, confidence) if last_word_in_line: builder.end_line() if not tesseract_raw.page_iterator_next(page_iterator, lvl_word): break finally: tesseract_raw.cleanup(handle) return builder.get_output() def image_to_pdf(image, output_file, lang=None, input_file="stdin", textonly=False): ''' Creates pdf file with embeded text based on OCR from an image Args: image: image to be converted output_file: path to the file that will be created, 
`.pdf` extension should not be specified lang: three letter language code. For available languages see https://github.com/tesseract-ocr/tesseract/blob/master/doc/tesseract.1.asc#languages. Defaults to None. input_file: path to the image file that should be beneath the text in output pdf. If not specified (stdin, incorrect file) output pdf is correct but tesseract writes some errors about not being able to open the file. Defaults to stdin. textonly: create pdf with only one invisible text layer. Defaults to False. ''' LibtesseractPdfBuilder()\ .set_lang(lang)\ .set_output_file(output_file)\ .set_text_only(textonly)\ .add_image(image)\ .build() class LibtesseractPdfBuilder(object): ''' Creates a pdf file with embeded text based on OCR from one or more images. ''' def __init__(self): self.images = [] self.output_file = None self.lang = None self.text_only = False def set_lang(self, lang): ''' Language to be used for ocr. :param lang: three letter language code. For available languages see https://github.com/tesseract-ocr/tesseract/blob/master/doc/tesseract.1.asc#languages. Defaults to None. ''' self.lang = lang return self def set_output_file(self, output_file): self.output_file = output_file return self def set_text_only(self, text_only): ''' :param text_only: create pdf with only one invisible text layer. Defaults to False. ''' self.text_only = text_only return self def add_image(self, img): ''' Add an image to be converted to a page in the pdf :param img: image to convert ''' self.images.append(img) # or something else return self def __validate(self): if len(self.images) < 1: raise ValueError( "At least one image is required to build the pdf!" ) if self.output_file is None: raise ValueError("An output-file is required to build the pdf!") def build(self): ''' Create and write PDF file. 
''' self.__validate() handle = tesseract_raw.init(lang=self.lang) renderer = None try: tesseract_raw.set_page_seg_mode( handle, tesseract_raw.PageSegMode.AUTO_OSD ) renderer = tesseract_raw.init_pdf_renderer( handle, self.output_file, self.text_only ) assert renderer tesseract_raw.begin_document(renderer, "") for image in self.images: tesseract_raw.set_image(handle, image) # tesseract_raw.set_input_name(handle, input_file) tesseract_raw.recognize(handle) tesseract_raw.add_renderer_image(handle, renderer) tesseract_raw.end_document(renderer) finally: tesseract_raw.cleanup(handle) if renderer: tesseract_raw.cleanup(renderer) def is_available(): available = tesseract_raw.is_available() if not available: return False version = get_version() # C-API with Tesseract <= 3.02 segfaults sometimes # (seen with Debian stable + Paperwork) # not tested with 3.03 if (version[0] < 3 or (version[0] == 3 and version[1] < 4)): logger.warning("Unsupported version [%s]" % ".".join( [str(r) for r in version] )) return False return True def get_available_languages(): handle = tesseract_raw.init() try: return tesseract_raw.get_available_languages(handle) finally: tesseract_raw.cleanup(handle) def get_version(): version = tesseract_raw.get_version() version = version.split(" ", 1)[0] # cut off "dev" string if exists for proper int conversion index = version.find("dev") if index != -1: version = version[:index] version = version.split(".") major = digits_only(version[0]) minor = digits_only(version[1]) upd = 0 if len(version) >= 3: upd = digits_only(version[2]) return (major, minor, upd) pyocr-0.8.5/src/pyocr/libtesseract/tesseract_raw.py000066400000000000000000000505121450160556200225160ustar00rootroot00000000000000import ctypes import locale import logging import os import sys from ..error import TesseractError logger = logging.getLogger(__name__) TESSDATA_PREFIX = os.getenv('TESSDATA_PREFIX', None) libnames = [] # 70 is the minimum credible dpi for tesseract and force it to compute an # 
estimate of the image dpi DPI_DEFAULT = 70 if getattr(sys, 'frozen', False) and hasattr(sys, '_MEIPASS'): # Pyinstaller integration libnames += [os.path.join(sys._MEIPASS, "libtesseract-4.dll")] libnames += [os.path.join(sys._MEIPASS, "libtesseract-3.dll")] tessdata = os.path.join(sys._MEIPASS, "data") if not os.path.exists(os.path.join(tessdata, "tessdata")): logger.warning( "Running from container, but no tessdata ({}) found !".format( tessdata ) ) else: TESSDATA_PREFIX = os.path.join(tessdata, "tessdata") if sys.platform[:3] == "win": # pragma: no cover libnames += [ # Jflesch> Don't they have the equivalent of LD_LIBRARY_PATH on # Windows ? "../vs2010/DLL_Release/libtesseract302.dll", # prefer the most recent first "libtesseract305.dll", "libtesseract304.dll", "libtesseract303.dll", "libtesseract302.dll", "libtesseract400.dll", # Tesseract 4 is still in alpha stage "libtesseract.dll", "C:\\Program Files (x86)\\Tesseract-OCR\\libtesseract-4.dll", "C:\\Program Files (x86)\\Tesseract-OCR\\libtesseract-3.dll", ] else: libnames += [ "libtesseract.so.5", "libtesseract.so.4", "libtesseract.so.3", "libtesseract.5.dylib", "libtesseract.4.dylib", ] g_libtesseract = None lib_load_errors = [] for libname in libnames: # pragma: no branch try: g_libtesseract = ctypes.cdll.LoadLibrary(libname) lib_load_errors = [] break except OSError as ex: # pragma: no cover if hasattr(ex, 'message'): # python 2 lib_load_errors.append((libname, ex.message)) else: # python 3 lib_load_errors.append((libname, str(ex))) class PageSegMode(object): OSD_ONLY = 0 AUTO_OSD = 1 AUTO_ONLY = 2 AUTO = 3 SINGLE_COLUMN = 4 SINGLE_BLOCK_VERT_TEXT = 5 SINGLE_BLOCK = 6 SINGLE_LINE = 7 SINGLE_WORD = 8 CIRCLE_WORD = 9 SINGLE_CHAR = 10 SPARSE_TEXT = 11 SPARSE_TEXT_OSD = 12 PSM_RAW_LINE = 13 COUNT = 14 class Orientation(object): PAGE_UP = 0 PAGE_RIGHT = 1 PAGE_DOWN = 2 PAGE_LEFT = 3 class PageIteratorLevel(object): BLOCK = 0 PARA = 1 TEXTLINE = 2 WORD = 3 SYMBOL = 4 class PolyBlockType(object): UNKNOWN = 0 
FLOWING_TEXT = 1 HEADING_TEXT = 2 PULLOUT_TEXT = 3 TABLE = 4 VERTICAL_TEXT = 5 CAPTION_TEXT = 6 FLOWING_IMAGE = 7 HEADING_IMAGE = 8 PULLOUT_IMAGE = 9 HORZ_LINE = 10 VERT_LINE = 11 NOISE = 12 COUNT = 13 class OSResults(ctypes.Structure): _fields_ = [ ("orientations", ctypes.c_float * 4), ("scripts_na", ctypes.c_float * 4 * (116 + 1 + 2 + 1)), ("unicharset", ctypes.c_void_p), ("best_orientation_id", ctypes.c_int), ("best_script_id", ctypes.c_int), ("best_sconfidence", ctypes.c_float), ("best_oconfidence", ctypes.c_float), # extra padding in case the structure is extended later ("padding", ctypes.c_char * 512), ] if g_libtesseract: # pragma: no cover g_libtesseract.TessVersion.argtypes = [] g_libtesseract.TessVersion.restype = ctypes.c_char_p g_libtesseract.TessBaseAPICreate.argtypes = [] g_libtesseract.TessBaseAPICreate.restype = ctypes.c_void_p # TessBaseAPI* g_libtesseract.TessBaseAPIDelete.argtypes = [ ctypes.c_void_p, # TessBaseAPI* ] g_libtesseract.TessBaseAPIDelete.argtypes = None g_libtesseract.TessBaseAPIGetDatapath.argtypes = [ ctypes.c_void_p, # TessBaseAPI* ] g_libtesseract.TessBaseAPIGetDatapath.restype = ctypes.POINTER( ctypes.c_char) g_libtesseract.TessBaseAPIInit1.argtypes = [ ctypes.c_void_p, # TessBaseAPI* ctypes.c_char_p, # datapath ctypes.c_char_p, # language ctypes.c_int, # TessOcrEngineMode ctypes.POINTER(ctypes.c_char_p), # configs ctypes.c_int, # configs_size ] g_libtesseract.TessBaseAPIInit1.restype = ctypes.c_int g_libtesseract.TessBaseAPIInit3.argtypes = [ ctypes.c_void_p, # TessBaseAPI* ctypes.c_char_p, # datapath ctypes.c_char_p, # language ] g_libtesseract.TessBaseAPIInit3.restype = ctypes.c_int g_libtesseract.TessBaseAPISetSourceResolution.argtypes = [ ctypes.c_void_p, # TessBaseAPI* ctypes.c_int, # PPI ] g_libtesseract.TessBaseAPISetSourceResolution.restype = None g_libtesseract.TessBaseAPISetVariable.argtypes = [ ctypes.c_void_p, # TessBaseAPI* ctypes.c_char_p, # name ctypes.c_char_p, # value ] 
g_libtesseract.TessBaseAPISetVariable.restype = ctypes.c_bool g_libtesseract.TessBaseAPIGetAvailableLanguagesAsVector.argtypes = [ ctypes.c_void_p # TessBaseAPI* ] g_libtesseract.TessBaseAPIGetAvailableLanguagesAsVector.restype = \ ctypes.POINTER(ctypes.c_char_p) g_libtesseract.TessBaseAPISetPageSegMode.argtypes = [ ctypes.c_void_p, # TessBaseAPI* ctypes.c_int, # See PageSegMode ] g_libtesseract.TessBaseAPISetPageSegMode.restype = None g_libtesseract.TessBaseAPIInitForAnalysePage.argtypes = [ ctypes.c_void_p, # TessBaseAPI* ] g_libtesseract.TessBaseAPIInitForAnalysePage.restype = None g_libtesseract.TessBaseAPISetImage.argtypes = [ ctypes.c_void_p, # TessBaseAPI* ctypes.POINTER(ctypes.c_char), # imagedata ctypes.c_int, # width ctypes.c_int, # height ctypes.c_int, # bytes_per_pixel ctypes.c_int, # bytes_per_line ] g_libtesseract.TessBaseAPISetImage.restype = None g_libtesseract.TessResultRendererAddImage.argtypes = [ ctypes.c_void_p, # TessResultRenderer* renderer ctypes.c_void_p # TessBaseAPI* api ] g_libtesseract.TessResultRendererAddImage.restype = ctypes.c_bool g_libtesseract.TessBaseAPISetInputName.argtypes = [ ctypes.c_void_p, # TessBaseAPI* handle ctypes.c_char_p # const char* name ] g_libtesseract.TessBaseAPISetInputName.restype = None g_libtesseract.TessResultRendererBeginDocument.argtypes = [ ctypes.c_void_p, # TessResultRenderer* renderer ctypes.c_char_p # const char* title ] g_libtesseract.TessResultRendererBeginDocument.restype = ctypes.c_bool g_libtesseract.TessResultRendererEndDocument.argtypes = [ ctypes.c_void_p # TessResultRenderer* renderer ] g_libtesseract.TessResultRendererEndDocument.restype = ctypes.c_bool g_libtesseract.TessPDFRendererCreate.argtypes = [ ctypes.c_char_p, # const char* outputbase ctypes.c_char_p, # const char* datadir ctypes.c_bool # BOOL textonly ] g_libtesseract.TessPDFRendererCreate.restype = ctypes.c_void_p g_libtesseract.TessBaseAPIRecognize.argtypes = [ ctypes.c_void_p, # TessBaseAPI* ctypes.c_void_p, # ETEXT_DESC* ] 
g_libtesseract.TessBaseAPIRecognize.restype = ctypes.c_int g_libtesseract.TessBaseAPIGetIterator.argtypes = [ ctypes.c_void_p, # TessBaseAPI* ] g_libtesseract.TessBaseAPIGetIterator.restype = \ ctypes.c_void_p # TessResultIterator g_libtesseract.TessBaseAPIAnalyseLayout.argtypes = [ ctypes.c_void_p, # TessBaseAPI* ] g_libtesseract.TessBaseAPIAnalyseLayout.restype = \ ctypes.c_void_p # TessPageIterator* g_libtesseract.TessBaseAPIGetUTF8Text.argtypes = [ ctypes.c_void_p, # TessBaseAPI* ] g_libtesseract.TessBaseAPIGetUTF8Text.restype = ctypes.c_void_p g_libtesseract.TessPageIteratorDelete.argtypes = [ ctypes.c_void_p, # TessPageIterator* ] g_libtesseract.TessPageIteratorDelete.restype = None g_libtesseract.TessPageIteratorOrientation.argtypes = [ ctypes.c_void_p, # TessPageIterator* ctypes.POINTER(ctypes.c_int), # TessOrientation* ctypes.POINTER(ctypes.c_int), # TessWritingDirection* ctypes.POINTER(ctypes.c_int), # TessTextlineOrder* ctypes.POINTER(ctypes.c_float), # deskew_angle ] g_libtesseract.TessPageIteratorOrientation.restype = None g_libtesseract.TessPageIteratorNext.argtypes = [ ctypes.c_void_p, # TessPageIterator* ctypes.c_int, # TessPageIteratorLevel ] g_libtesseract.TessPageIteratorNext.restype = ctypes.c_bool g_libtesseract.TessPageIteratorIsAtBeginningOf.argtypes = [ ctypes.c_void_p, # TessPageIterator* ctypes.c_int, # TessPageIteratorLevel ] g_libtesseract.TessPageIteratorIsAtBeginningOf.restype = ctypes.c_bool g_libtesseract.TessPageIteratorIsAtFinalElement.argtypes = [ ctypes.c_void_p, # TessPageIterator* ctypes.c_int, # TessPageIteratorLevel (level) ctypes.c_int, # TessPageIteratorLevel (element) ] g_libtesseract.TessPageIteratorIsAtFinalElement.restype = ctypes.c_bool g_libtesseract.TessPageIteratorBlockType.argtypes = [ ctypes.c_void_p, # TessPageIterator* ] g_libtesseract.TessPageIteratorBlockType.restype = \ ctypes.c_int # PolyBlockType g_libtesseract.TessPageIteratorBoundingBox.args = [ ctypes.c_void_p, # TessPageIterator* ctypes.c_int, # 
TessPageIteratorLevel (level) ctypes.POINTER(ctypes.c_int), # left ctypes.POINTER(ctypes.c_int), # top ctypes.POINTER(ctypes.c_int), # right ctypes.POINTER(ctypes.c_int), # bottom ] g_libtesseract.TessPageIteratorBoundingBox.restype = ctypes.c_bool g_libtesseract.TessResultIteratorGetPageIterator.argtypes = [ ctypes.c_void_p, # TessResultIterator* ] g_libtesseract.TessResultIteratorGetPageIterator.restype = \ ctypes.c_void_p # TessPageIterator* g_libtesseract.TessResultIteratorGetUTF8Text.argtypes = [ ctypes.c_void_p, # TessResultIterator* ctypes.c_int, # TessPageIteratorLevel (level) ] g_libtesseract.TessResultIteratorGetUTF8Text.restype = \ ctypes.c_void_p g_libtesseract.TessResultIteratorConfidence.argtypes = [ ctypes.c_void_p, ctypes.c_int, ] g_libtesseract.TessResultIteratorConfidence.restype = ctypes.c_float g_libtesseract.TessDeleteText.argtypes = [ ctypes.c_void_p ] g_libtesseract.TessDeleteText.restype = None if hasattr(g_libtesseract, 'TessBaseAPIDetectOrientationScript'): g_libtesseract.TessBaseAPIDetectOrientationScript.argtypes = [ ctypes.c_void_p, # TessBaseAPI* ctypes.POINTER(ctypes.c_int), # orient_deg ctypes.POINTER(ctypes.c_float), # orient_conf ctypes.POINTER(ctypes.c_char_p), # script_name ctypes.POINTER(ctypes.c_float), # script_conf ] g_libtesseract.TessBaseAPIDetectOrientationScript.restype = \ ctypes.c_bool else: g_libtesseract.TessBaseAPIDetectOS.argtypes = [ ctypes.c_void_p, # TessBaseAPI* ctypes.POINTER(OSResults), ] g_libtesseract.TessBaseAPIDetectOS.restype = ctypes.c_bool def init(lang=None): assert g_libtesseract # Tesseract 4 workaround if get_version() == "4.0.0": locale.setlocale(locale.LC_ALL, "C") handle = g_libtesseract.TessBaseAPICreate() try: if lang: lang = lang.encode("utf-8") prefix = None if TESSDATA_PREFIX: # pragma: no cover prefix = TESSDATA_PREFIX.encode("utf-8") g_libtesseract.TessBaseAPIInit3( ctypes.c_void_p(handle), ctypes.c_char_p(prefix), ctypes.c_char_p(lang) ) g_libtesseract.TessBaseAPISetVariable( 
ctypes.c_void_p(handle), b"tessedit_zero_rejection", b"F" ) except: # noqa: E722 g_libtesseract.TessBaseAPIDelete(ctypes.c_void_p(handle)) raise return handle def cleanup(handle): assert g_libtesseract is not None g_libtesseract.TessBaseAPIDelete(ctypes.c_void_p(handle)) def is_available(): return g_libtesseract is not None def get_version(): assert g_libtesseract is not None return g_libtesseract.TessVersion().decode("utf-8") def get_available_languages(handle): assert g_libtesseract is not None langs = [] c_langs = g_libtesseract.TessBaseAPIGetAvailableLanguagesAsVector( ctypes.c_void_p(handle) ) i = 0 while c_langs[i]: langs.append(c_langs[i].decode("utf-8")) i += 1 return langs def set_is_numeric(handle, mode): assert g_libtesseract is not None if mode: wl = b"0123456789." else: wl = b"" g_libtesseract.TessBaseAPISetVariable( ctypes.c_void_p(handle), b"tessedit_char_whitelist", wl ) def set_debug_file(handle, filename): assert g_libtesseract is not None if not isinstance(filename, bytes): filename = filename.encode('utf-8') g_libtesseract.TessBaseAPISetVariable( ctypes.c_void_p(handle), b"debug_file", filename ) def set_page_seg_mode(handle, mode): assert g_libtesseract is not None g_libtesseract.TessBaseAPISetPageSegMode( ctypes.c_void_p(handle), ctypes.c_int(mode) ) def init_for_analyse_page(handle): assert g_libtesseract is not None g_libtesseract.TessBaseAPIInitForAnalysePage(ctypes.c_void_p(handle)) def set_image(handle, image): assert g_libtesseract is not None image = image.convert("RGB") image.load() imgdata = image.tobytes("raw", "RGB") g_libtesseract.TessBaseAPISetImage( ctypes.c_void_p(handle), imgdata, ctypes.c_int(image.width), ctypes.c_int(image.height), ctypes.c_int(3), # RGB = 3 * 8 ctypes.c_int(image.width * 3) ) dpi = image.info.get("dpi", [DPI_DEFAULT])[0] g_libtesseract.TessBaseAPISetSourceResolution(ctypes.c_void_p(handle), dpi) def recognize(handle): assert g_libtesseract is not None return g_libtesseract.TessBaseAPIRecognize( 
ctypes.c_void_p(handle), ctypes.c_void_p(None) ) def analyse_layout(handle): assert g_libtesseract is not None return g_libtesseract.TessBaseAPIAnalyseLayout(ctypes.c_void_p(handle)) def get_utf8_text(handle): assert g_libtesseract is not None ptr = g_libtesseract.TessBaseAPIGetUTF8Text(ctypes.c_void_p(handle)) val = ctypes.cast(ptr, ctypes.c_char_p).value.decode("utf-8") g_libtesseract.TessDeleteText(ptr) return val def page_iterator_delete(iterator): assert g_libtesseract is not None return g_libtesseract.TessPageIteratorDelete(ctypes.c_void_p(iterator)) def page_iterator_next(iterator, level): assert g_libtesseract is not None return g_libtesseract.TessPageIteratorNext(ctypes.c_void_p(iterator), level) def page_iterator_is_at_beginning_of(iterator, level): assert g_libtesseract is not None return g_libtesseract.TessPageIteratorIsAtBeginningOf( ctypes.c_void_p(iterator), level ) def page_iterator_is_at_final_element(iterator, level, element): assert g_libtesseract is not None return g_libtesseract.TessPageIteratorIsAtFinalElement( ctypes.c_void_p(iterator), level, element ) def page_iterator_block_type(iterator): assert g_libtesseract is not None return g_libtesseract.TessPageIteratorBlockType( ctypes.c_void_p(iterator) ) def page_iterator_bounding_box(iterator, level): assert g_libtesseract is not None left = ctypes.c_int(0) left_p = ctypes.pointer(left) top = ctypes.c_int(0) top_p = ctypes.pointer(top) right = ctypes.c_int(0) right_p = ctypes.pointer(right) bottom = ctypes.c_int(0) bottom_p = ctypes.pointer(bottom) r = g_libtesseract.TessPageIteratorBoundingBox( ctypes.c_void_p(iterator), level, left_p, top_p, right_p, bottom_p ) if not r: return (False, (0, 0, 0, 0)) return (True, (left.value, top.value, right.value, bottom.value)) def page_iterator_orientation(iterator): assert g_libtesseract is not None orientation = ctypes.c_int(0) writing_direction = ctypes.c_int(0) textline_order = ctypes.c_int(0) deskew_angle = ctypes.c_float(0.0) 
g_libtesseract.TessPageIteratorOrientation( ctypes.c_void_p(iterator), ctypes.pointer(orientation), ctypes.pointer(writing_direction), ctypes.pointer(textline_order), ctypes.pointer(deskew_angle) ) return { "orientation": orientation.value, "writing_direction": writing_direction.value, "textline_order": textline_order.value, "deskew_angle": deskew_angle.value, } def get_iterator(handle): assert g_libtesseract is not None i = g_libtesseract.TessBaseAPIGetIterator(ctypes.c_void_p(handle)) return i def result_iterator_get_page_iterator(res_iterator): assert g_libtesseract is not None return g_libtesseract.TessResultIteratorGetPageIterator( ctypes.c_void_p(res_iterator) ) def result_iterator_get_utf8_text(iterator, level): assert g_libtesseract is not None ptr = g_libtesseract.TessResultIteratorGetUTF8Text( ctypes.c_void_p(iterator), level ) if ptr is None: return None val = ctypes.cast(ptr, ctypes.c_char_p).value.decode("utf-8") g_libtesseract.TessDeleteText(ptr) return val def result_iterator_get_confidence(iterator, level): assert g_libtesseract is not None ptr = g_libtesseract.TessResultIteratorConfidence( ctypes.c_void_p(iterator), level ) if ptr is None: return None val = ctypes.c_float(ptr).value return val def detect_os(handle): assert g_libtesseract is not None # Use the new API function if it is available, because since Tesseract # 3.05.00 the old API function _always_ returns False. 
if hasattr(g_libtesseract, 'TessBaseAPIDetectOrientationScript'): orientation_deg = ctypes.c_int(0) orientation_confidence = ctypes.c_float(0.0) r = g_libtesseract.TessBaseAPIDetectOrientationScript( ctypes.c_void_p(handle), ctypes.byref(orientation_deg), ctypes.byref(orientation_confidence), None, # script_name None # script_confidence ) if not r: raise TesseractError("detect_orientation failed", "TessBaseAPIDetectOrientationScript() failed") return { "orientation": round(orientation_deg.value / 90), "confidence": orientation_confidence.value, } else: # old API (before Tesseract 3.05.00) results = OSResults() r = g_libtesseract.TessBaseAPIDetectOS( ctypes.c_void_p(handle), ctypes.pointer(results) ) if not r: raise TesseractError("detect_orientation failed", "TessBaseAPIDetectOS() failed") return { "orientation": results.best_orientation_id, "confidence": results.best_oconfidence, } def set_input_name(handle, input_file): assert g_libtesseract is not None g_libtesseract.TessBaseAPISetInputName( ctypes.c_void_p(handle), input_file.encode() ) def init_pdf_renderer(handle, output_file, textonly): assert g_libtesseract is not None tessdata_dir = g_libtesseract.TessBaseAPIGetDatapath(handle) renderer = g_libtesseract.TessPDFRendererCreate( output_file.encode(), tessdata_dir, ctypes.c_bool(textonly) ) return renderer def begin_document(renderer, doc_name): assert g_libtesseract is not None g_libtesseract.TessResultRendererBeginDocument( ctypes.c_void_p(renderer), doc_name.encode() ) def add_renderer_image(handle, renderer): assert g_libtesseract is not None g_libtesseract.TessResultRendererAddImage( ctypes.c_void_p(renderer), ctypes.c_void_p(handle) ) def end_document(renderer): assert g_libtesseract is not None g_libtesseract.TessResultRendererEndDocument( ctypes.c_void_p(renderer) ) pyocr-0.8.5/src/pyocr/pyocr.py000066400000000000000000000042721450160556200163220ustar00rootroot00000000000000""" Wrapper for various OCR tools. 
USAGE: from PIL import Image import sys from pyocr import pyocr tools = pyocr.get_available_tools()[:] if len(tools) == 0: print("No OCR tool found") sys.exit(1) print("Using '%s'" % (tools[0].get_name())) tools[0].image_to_string(Image.open('test.png'), lang='fra', builder=TextBuilder()) DETAILS: Each module wrapping an OCR tool provides the following functions: - get_name(): Return the name of the tool - is_available(): Returns True if the tool is installed. False else. - get_version(): Return a tuple containing the version of the tool (if installed) - get_available_builders(): Returns a list of builders that can be used with this tool (see image_to_string()) - get_available_languages(): Returns a list of languages supported by this tool. Languages are usually written using ISO 3 letters country codes - image_to_string(): Takes 3 arguments: - an image (see python Imaging "Image" module) (mandatory) - lang= (see get_available_languages()) (optional) - builder= (see get_available_builders() or the classes in the module 'pyocr.builders') (optional: default is pyocr.builders.TextBuilder) Returned value depends of the specified builder. COPYRIGHT: Pyocr is released under the GPL v3. Copyright (c) Jerome Flesch, 2011-2016 Tesseract module: Copyright (c) Samuel Hoffstaetter, 2009 WEBSITE: https://gitlab.gnome.org/World/OpenPaperwork/pyocr#readme """ from . import _version from . import cuneiform from . import libtesseract from . import tesseract __all__ = [ 'get_available_tools', 'TOOLS', 'VERSION', ] TOOLS = [ # in preference order tesseract, libtesseract, cuneiform, ] try: # seal it VERSION = tuple(_version.version_tuple[:3]) except Exception as exc: print("WARNING: Failed to parse PyOCR version: " + str(_version.version)) print("WARNING: Exception was: " + str(exc)) VERSION = (0, 0, 0) def get_available_tools(): """ Return a list of OCR tools available on the local system. 
""" available = [] for tool in TOOLS: if tool.is_available(): available.append(tool) return available pyocr-0.8.5/src/pyocr/tesseract.py000066400000000000000000000357061450160556200171710ustar00rootroot00000000000000''' tesseract.py is a wrapper for google's Tesseract-OCR ( http://code.google.com/p/tesseract-ocr/ ). USAGE: > from PIL import Image > from pyocr.tesseract import image_to_string > print(image_to_string(Image.open('test.png'))) > print(image_to_string(Image.open('test-european.jpg'), lang='fra')) COPYRIGHT: PyOCR is released under the GPL v3. Copyright (c) Samuel Hoffstaetter, 2009 Copyright (c) Jerome Flesch, 2011-2016 https://gitlab.gnome.org/World/OpenPaperwork/pyocr#readme ''' import codecs import logging import os import shutil import subprocess import sys import tempfile from . import builders from .builders import DigitBuilder # backward compatibility from .error import TesseractError # backward compatibility from .util import digits_only # CHANGE THIS IF TESSERACT IS NOT IN YOUR PATH, OR IS NAMED DIFFERENTLY TESSERACT_CMD = 'tesseract.exe' if os.name == 'nt' else 'tesseract' TESSDATA_EXTENSION = ".traineddata" logger = logging.getLogger(__name__) g_subprocess_startup_info = None g_creation_flags = 0 g_version = None __all__ = [ 'CharBoxBuilder', 'DigitBuilder', 'can_detect_orientation', 'detect_orientation', 'get_available_builders', 'get_available_languages', 'get_name', 'get_version', 'image_to_string', 'is_available', 'TesseractError', ] class CharBoxBuilder(builders.BaseBuilder): """ If passed to image_to_string(), image_to_string() will return an array of Box. Each box correspond to a character recognized in the image. 
""" def __init__(self): file_ext = ["box"] tess_flags = [] tess_conf = ["batch.nochop", "makebox"] cun_args = [] super(CharBoxBuilder, self).__init__(file_ext, tess_flags, tess_conf, cun_args) self.tesseract_layout = 1 @staticmethod def read_file(file_descriptor): """ Extract of set of Box from the lines of 'file_descriptor' Return: An array of Box. """ boxes = [] # note that the order of the boxes may matter to the caller for line in file_descriptor.readlines(): line = line.strip() if line == "": continue elements = line.split(" ") if len(elements) < 6: continue position = ((int(elements[1]), int(elements[2])), (int(elements[3]), int(elements[4]))) box = builders.Box(elements[0], position) boxes.append(box) return boxes @staticmethod def write_file(file_descriptor, boxes): """ Write boxes in a box file. Output is in a the same format than tesseract's one. Warning: The file_descriptor must support UTF-8 ! (see module 'codecs') """ for box in boxes: file_descriptor.write(str(box) + " 0\n") def __str__(self): return "Character boxes" def _set_environment(): global g_subprocess_startup_info global g_creation_flags if os.name == "nt": # pragma: no cover g_subprocess_startup_info = subprocess.STARTUPINFO() g_subprocess_startup_info.wShowWindow = subprocess.SW_HIDE g_subprocess_startup_info.dwFlags |= subprocess.STARTF_USESHOWWINDOW g_creation_flags = 0x08000000 # CREATE_NO_WINDOW if getattr(sys, 'frozen', False): # pragma: no cover # Pyinstaller support if 'TESSDATA_PREFIX' in os.environ: # already changed return tesspath = os.path.join(sys._MEIPASS, "tesseract") tessprefix = os.path.join(sys._MEIPASS, "data") logger.info("Running in packaged environment") if not os.path.exists(tesspath): logger.warning( "Running from container, but no tesseract ({}) found !".format( tesspath ) ) else: logger.info("[{}] added to PATH".format(tesspath)) os.environ['PATH'] = ( tesspath + os.pathsep + os.environ['PATH'] ) if not os.path.exists(os.path.join(tessprefix, "tessdata")): 
logger.warning( "Running from container, but no tessdata ({}) found !".format( tessprefix ) ) else: version = get_version(set_env=False) if version[0] > 3: tessprefix = os.path.join(tessprefix, "tessdata") logger.info("TESSDATA_PREFIX set to [{}]".format(tessprefix)) os.environ['TESSDATA_PREFIX'] = tessprefix def can_detect_orientation(): version = get_version() langs = get_available_languages() return ( version[0] > 3 or (version[0] == 3 and version[1] >= 3) ) and 'osd' in langs def psm_parameter(): """Return the psm option string depending on the Tesseract version.""" try: version = get_version() return "--psm" if version[0] > 3 else "-psm" except Exception as exc: logger.warning( "psm_parameter(): failed to get Tesseract version. Assuming" "Tesseract >= 4 --> using option '--psm'", exc_info=exc ) return "--psm" def detect_orientation(image, lang=None): """ Arguments: image --- Pillow image to analyze lang --- lang to specify to tesseract Returns: { 'angle': 90, 'confidence': 23.73, } Raises: TesseractError --- if no script detected on the image """ _set_environment() with tempfile.TemporaryDirectory() as tmpdir: command = [TESSERACT_CMD, "input.bmp", 'stdout', psm_parameter(), "0"] version = get_version() if lang is not None: if version[0] < 4: command += ['-l', lang] else: command += ['-l', 'osd'] if image.mode != "RGB": image = image.convert("RGB") image.save(os.path.join(tmpdir, "input.bmp")) proc = subprocess.Popen(command, stdin=subprocess.PIPE, shell=False, startupinfo=g_subprocess_startup_info, creationflags=g_creation_flags, cwd=tmpdir, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) with proc: proc.stdin.close() original_output = proc.stdout.read() proc.wait() original_output = original_output.decode("utf-8") original_output = original_output.strip() if "Could not initialize tesseract" in original_output: raise TesseractError(-1, "Error initializing tesseract: %s" % original_output) try: output = original_output.split("\n") output = [line.split(": ", 
1) for line in output if (": " in line)] output = {x: y for (x, y) in output} angle = int(output.get('Rotate', output['Orientation in degrees'])) # Tesseract reports the angle in the opposite direction the one we # want angle = (360 - angle) % 360 return { 'angle': angle, 'confidence': float(output['Orientation confidence']), } except Exception as ex: raise TesseractError(-1, "No script found in image (%s - %s)" % (str(ex), original_output)) def get_name(): return "Tesseract (sh)" def get_available_builders(): return [ builders.LineBoxBuilder, builders.TextBuilder, builders.WordBoxBuilder, CharBoxBuilder, builders.DigitBuilder, builders.DigitLineBoxBuilder, ] def run_tesseract(input_filename, output_filename_base, cwd=None, lang=None, flags=None, configs=None): ''' Runs Tesseract: `TESSERACT_CMD` \ `input_filename` \ `output_filename_base` \ [-l `lang`] \ [`configs`] Arguments: input_filename --- image to read output_filename_base --- file name in which must be stored the result (without the extension) cwd --- Run Tesseract in the specified working directory or use current one if None lang --- Tesseract language to use (if None, none will be specified) config --- List of Tesseract configs to use (if None, none will be specified) Returns: Returns (the exit status of Tesseract, Tesseract's output) ''' _set_environment() command = [TESSERACT_CMD, input_filename, output_filename_base] if lang is not None: command += ['-l', lang] if flags is not None: command += flags if configs is not None: command += configs proc = subprocess.Popen(command, cwd=cwd, startupinfo=g_subprocess_startup_info, creationflags=g_creation_flags, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # Beware that in some cases, tesseract may print more on stderr than # allowed by the buffer of subprocess.Popen.stderr. So we must read stderr # asap or Tesseract will remain stuck when trying to write again on stderr. 
# In the end, we just have to make sure that proc.stderr.read() is called # before proc.wait() with proc: errors = proc.stdout.read() ret = proc.wait() return (ret, errors) def cleanup(filename): ''' Tries to remove the given filename. Ignores non-existent files ''' try: os.remove(filename) except OSError: # pragma: no cover pass class ReOpenableTempfile(object): # pragma: no cover """ On Windows, `tempfile.NamedTemporaryFile` occur Permission denied Error when file is still open. It returns `tempfile.NamedTemporaryFile` compatible object. """ def __init__(self, suffix): self.name = None with tempfile.NamedTemporaryFile(prefix='tess_', suffix=suffix, delete=False) as fp: self.name = fp.name def __enter__(self): return self def __exit__(self, type, value, traceback): self.close() def close(self): if self.name is not None: os.remove(self.name) self.name = None def image_to_string(image, lang=None, builder=None): ''' Runs tesseract on the specified image. First, the image is written to disk, and then the tesseract command is run on the image. Tesseract's result is read, and the temporary files are erased. Arguments: image --- image to OCR. lang --- tesseract language to use. builder --- builder used to configure Tesseract and read its result. The builder is used to specify the type of output expected. Possible builders are TextBuilder or CharBoxBuilder. If builder == None, the builder used will be TextBuilder. Returns: Depends of the specified builder. By default, it will return a simple string. 
''' if builder is None: builder = builders.TextBuilder() with tempfile.TemporaryDirectory() as tmpdir: if image.mode != "RGB": image = image.convert("RGB") image.save(os.path.join(tmpdir, "input.bmp")) (status, errors) = run_tesseract("input.bmp", "output", cwd=tmpdir, lang=lang, flags=builder.tesseract_flags, configs=builder.tesseract_configs) if status: raise TesseractError(status, errors) tested_files = [] output_file_name = "ERROR" for file_extension in builder.file_extensions: output_file_name = ('%s.%s' % (os.path.join(tmpdir, "output"), file_extension)) tested_files.append(output_file_name) try: with codecs.open(output_file_name, 'r', encoding='utf-8', errors='replace') as file_desc: return builder.read_file(file_desc) except FileNotFoundError: continue finally: cleanup(output_file_name) raise TesseractError( -1, "Unable to find output file (tested {})".format(tested_files) ) def is_available(): _set_environment() return shutil.which(TESSERACT_CMD) is not None def get_available_languages(): """ Returns the list of languages that Tesseract knows how to handle. Returns: An array of strings. Note that most languages name conform to ISO 639 terminology, but not all. Most of the time, truncating the language name name returned by this function to 3 letters should do the trick. """ _set_environment() proc = subprocess.Popen([TESSERACT_CMD, "--list-langs"], startupinfo=g_subprocess_startup_info, creationflags=g_creation_flags, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) with proc: langs = proc.stdout.read().decode('utf-8').splitlines(False) ret = proc.wait() if ret != 0: raise TesseractError(ret, "unable to get languages") return [lang for lang in langs if lang and lang[-1] != ':'] def get_version(set_env=True): """ Returns Tesseract version. 
Returns: A tuple corresponding to the version (for instance, (3, 0, 1) for 3.01) Exception: TesseractError --- Unable to run tesseract or to parse the version """ global g_version if g_version is not None: return g_version if set_env: _set_environment() command = [TESSERACT_CMD, "-v"] proc = subprocess.Popen(command, startupinfo=g_subprocess_startup_info, creationflags=g_creation_flags, stdout=subprocess.PIPE) with proc: ver_string = proc.stdout.read() ver_string = ver_string.decode('utf-8') ret = proc.wait() if ret not in (0, 1): raise TesseractError(ret, ver_string) try: ver_string = ver_string.split(" ")[1] els = ver_string.split(".") els = [digits_only(x) for x in els] major = els[0] minor = els[1] upd = 0 if len(els) >= 3: upd = els[2] version = (major, minor, upd) if version == (0, 0, 0): raise TesseractError( ret, ("Unable to parse Tesseract version (not a number): [%s]" % (ver_string))) g_version = version return version except IndexError: raise TesseractError( ret, ("Unable to parse Tesseract version (spliting failed): [%s]" % (ver_string))) pyocr-0.8.5/src/pyocr/util.py000066400000000000000000000003351450160556200161370ustar00rootroot00000000000000import re def digits_only(string): """Return all digits that the given string starts with.""" match = re.match(r'\D*(?P\d+)', string) if match: return int(match.group('digits')) return 0 pyocr-0.8.5/tests/000077500000000000000000000000001450160556200140265ustar00rootroot00000000000000pyocr-0.8.5/tests/__init__.py000066400000000000000000000000111450160556200161270ustar00rootroot00000000000000# module pyocr-0.8.5/tests/data/000077500000000000000000000000001450160556200147375ustar00rootroot00000000000000pyocr-0.8.5/tests/data/boxes000066400000000000000000000155341450160556200160120ustar00rootroot00000000000000T 105 705 130 734 0 h 136 704 155 734 0 e 161 703 178 725 0 ( 205 696 216 732 0 q 222 694 241 725 0 u 247 702 266 723 0 i 273 703 281 733 0 c 288 701 305 724 0 k 310 702 329 732 0 ) 335 695 347 730 0 [ 376 693 
386 731 0 b 391 700 411 731 0 r 416 701 431 722 0 o 434 699 454 722 0 w 459 699 487 721 0 n 493 699 513 721 0 ] 518 691 528 729 0 { 559 691 572 728 0 f 580 698 598 729 0 o 597 697 617 720 0 x 622 698 642 718 0 } 650 690 663 727 0 j 687 688 700 727 0 u 706 695 727 717 0 m 732 696 764 718 0 p 768 687 789 717 0 s 794 694 809 717 0 ! 817 694 823 726 0 O 104 654 133 685 0 v 137 654 157 675 0 e 162 653 179 675 0 r 185 654 199 675 0 t 224 652 235 679 0 h 241 653 261 683 0 e 266 652 283 674 0 $ 310 649 328 683 0 4 334 652 353 681 0 3 360 651 376 681 0 , 385 645 392 655 0 4 399 651 418 680 0 5 425 650 442 680 0 6 449 650 468 679 0 . 476 649 481 654 0 7 490 650 508 679 0 8 515 649 533 679 0 < 561 653 583 674 0 l 589 648 598 679 0 a 604 647 622 670 0 z 627 647 645 668 0 y 649 638 668 668 0 > 673 652 696 673 0 # 722 646 742 677 0 9 748 646 766 676 0 0 772 646 791 675 0 d 818 644 838 675 0 o 843 644 862 666 0 g 867 635 887 666 0 & 103 604 134 635 0 d 160 603 180 634 0 u 186 603 205 624 0 c 210 602 227 625 0 k 232 603 252 633 0 / 257 602 269 633 0 g 273 594 293 624 0 o 298 601 318 624 0 o 322 601 342 623 0 s 347 601 361 623 0 e 366 600 383 623 0 , 389 595 396 605 0 a 424 599 442 622 0 s 448 599 463 622 0 1 493 599 504 629 0 2 514 599 532 629 0 . 540 598 545 603 0 5 554 599 572 628 0 % 578 597 614 629 0 o 638 596 658 619 0 f 662 597 680 628 0 E 700 596 725 626 0 - 731 604 744 610 0 m 749 595 781 618 0 a 785 594 803 617 0 i 809 595 819 626 0 l 825 595 835 626 0 f 103 555 120 585 0 r 121 555 135 576 0 o 138 553 158 576 0 m 164 554 194 575 0 a 220 552 238 575 0 s 244 552 259 574 0 p 263 544 284 574 0 a 288 551 306 574 0 m 312 552 344 573 0 m 349 551 381 573 0 e 385 550 403 572 0 r 408 551 423 572 0 @ 427 540 464 581 0 w 469 549 498 570 0 e 502 548 520 571 0 b 524 548 544 579 0 s 549 548 564 570 0 i 569 548 579 579 0 t 585 547 596 574 0 e 600 547 618 570 0 . 
625 547 630 552 0 c 637 546 655 569 0 o 659 546 679 569 0 m 684 546 716 568 0 i 742 546 752 577 0 s 758 545 773 567 0 s 799 544 814 567 0 p 818 536 839 567 0 a 843 543 862 566 0 m 867 544 899 566 0 . 905 543 911 548 0 D 102 505 130 534 0 e 136 503 153 526 0 r 159 504 173 525 0 , 198 498 205 508 0 , 207 498 214 508 0 s 221 502 235 525 0 c 240 502 257 524 0 h 263 502 283 533 0 n 289 502 309 524 0 e 314 501 331 523 0 l 337 502 346 532 0 l 353 501 362 532 0 e 368 500 385 523 0 ” 390 521 406 531 0 b 433 499 453 531 0 r 459 500 473 521 0 a 477 499 495 521 0 u 500 498 520 520 0 n 526 499 546 521 0 e 551 498 568 520 0 F 594 498 617 528 0 u 621 497 641 518 0 c 646 496 663 519 0 h 669 497 689 527 0 s 694 496 709 518 0 s 735 495 750 518 0 p 754 487 775 517 0 r 780 495 795 517 0 i 799 495 809 526 0 n 815 495 836 517 0 g 840 486 861 516 0 t 866 493 877 520 0 fi 102 454 121 482 0 b 126 454 145 485 0 e 150 453 167 476 0 r 173 454 187 475 0 d 212 452 232 483 0 e 237 452 254 474 0 n 260 453 280 474 0 f 306 452 323 482 0 a 323 451 341 473 0 u 346 451 366 472 0 l 372 451 381 481 0 e 387 450 404 472 0 n 410 450 430 472 0 H 456 450 485 480 0 u 489 449 509 470 0 n 515 449 536 471 0 d 540 448 560 479 0 . 
567 448 572 453 0 L 601 448 625 478 0 e 630 446 648 469 0 r 674 447 689 469 0 e 692 446 710 468 0 n 715 446 736 468 0 a 740 445 759 468 0 r 764 446 779 467 0 d 782 444 803 476 0 b 827 444 848 475 0 r 853 444 868 466 0 u 871 443 892 465 0 n 897 444 918 466 0 < 101 406 110 425 0 < 110 406 119 425 0 r 127 404 141 426 0 a 144 403 162 426 0 p 167 395 188 425 0 i 194 403 202 434 0 d 208 402 228 433 0 e 233 402 251 424 0 ) 256 404 265 423 0 ) 265 404 274 423 0 s 302 401 317 424 0 a 321 401 339 423 0 u 345 400 364 422 0 t 370 400 380 427 0 e 385 400 403 422 0 p 428 391 449 422 0 a 453 399 471 422 0 r 477 400 492 421 0 - 496 407 509 413 0 d 513 398 534 429 0 e 538 398 556 420 0 s 561 397 576 420 0 s 581 397 596 420 0 u 600 397 621 419 0 s 626 396 641 419 0 l 667 397 677 428 0 e 682 396 700 419 0 c 725 395 743 418 0 h 748 396 769 426 0 i 774 395 784 426 0 e 789 394 807 417 0 n 812 395 833 417 0 p 100 346 120 376 0 a 125 354 143 376 0 r 149 354 163 375 0 e 167 353 184 375 0 s 190 353 204 375 0 s 210 352 224 375 0 e 229 352 246 374 0 u 251 352 271 373 0 x 276 352 296 373 0 . 303 351 308 356 0 L 337 351 361 381 0 a 366 350 384 373 0 v 409 350 429 371 0 o 434 349 453 372 0 l 459 350 468 380 0 p 473 341 494 371 0 e 499 348 516 371 0 m 543 348 574 370 0 a 579 347 597 370 0 r 602 348 617 369 0 r 621 347 636 369 0 o 639 346 659 369 0 n 664 347 685 369 0 e 690 345 707 368 0 r 733 346 748 368 0 a 751 345 770 368 0 p 774 336 795 367 0 i 800 345 810 376 0 d 815 344 837 375 0 a 840 343 859 366 0 s 100 304 114 326 0 a 119 304 137 326 0 l 143 304 152 334 0 t 159 303 169 330 0 a 174 303 192 325 0 s 219 302 233 325 0 o 238 302 257 324 0 p 262 293 282 324 0 r 288 302 302 323 0 a 306 301 324 323 0 i 351 301 360 332 0 ] 367 301 376 332 0 c 403 300 420 322 0 a 425 299 443 322 0 n 449 300 469 322 0 e 474 299 491 321 0 p 517 290 538 321 0 i 543 299 553 329 0 g 558 289 579 320 0 r 584 298 599 320 0 o 602 297 622 320 0 . 
628 297 633 302 0 E 662 297 687 327 0 l 693 296 703 327 0 z 729 296 748 317 0 o 752 295 772 318 0 r 776 295 792 317 0 r 795 295 811 317 0 o 814 294 834 317 0 m 99 254 130 276 0 a 135 253 153 276 0 r 159 254 174 275 0 r 178 254 193 275 0 é 196 252 216 284 0 n 221 253 242 274 0 r 268 252 283 274 0 é 286 251 304 283 0 p 309 243 330 273 0 i 335 251 345 282 0 d 350 250 371 281 0 o 375 250 395 273 0 s 421 249 436 272 0 a 440 249 459 272 0 l 464 250 474 280 0 t 479 249 490 276 0 a 495 248 513 271 0 s 540 248 555 270 0 o 559 247 579 270 0 b 582 247 603 279 0 r 608 248 623 269 0 e 626 246 644 269 0 e 669 246 687 269 0 l 692 246 702 277 0 p 728 237 749 268 0 e 753 245 771 267 0 r 776 245 791 267 0 r 795 245 810 267 0 o 813 244 833 267 0 p 98 196 118 226 0 e 123 203 140 226 0 r 146 204 160 225 0 e 164 203 181 225 0 z 186 203 204 224 0 o 209 202 228 225 0 s 233 202 248 224 0 o 253 202 272 224 0 . 279 202 284 207 0 A 313 202 342 232 0 r 369 201 383 222 0 a 387 200 405 222 0 p 409 191 430 222 0 o 435 199 454 222 0 s 459 199 474 221 0 a 478 198 497 221 0 m 523 199 555 221 0 a 560 197 578 220 0 r 583 198 598 220 0 r 602 198 617 219 0 o 621 196 640 219 0 m 645 197 677 219 0 r 703 196 718 218 0 é 721 195 740 227 0 p 744 187 765 218 0 i 770 195 780 226 0 d 785 194 806 226 0 a 810 194 829 217 0 s 98 154 112 176 0 a 117 154 135 176 0 l 141 154 150 184 0 t 157 153 167 180 0 a 172 153 190 175 0 s 217 152 231 175 0 o 236 152 255 174 0 b 259 152 279 183 0 r 285 152 299 174 0 e 303 151 320 173 0 0 346 150 366 173 0 C 391 150 408 172 0 5 413 149 431 179 0 0 436 149 456 172 0 p 481 140 502 171 0 r 507 149 522 171 0 e 525 148 543 171 0 g 547 139 568 170 0 u 572 147 593 169 0 i 598 148 608 179 0 e 613 140 631 169 0 o 635 146 655 169 0 s 660 146 675 169 0 o 679 146 699 169 0 . 
705 146 710 151 0 pyocr-0.8.5/tests/data/boxes_empty_lines000066400000000000000000000011561450160556200204150ustar00rootroot00000000000000T 105 705 130 734 0 h 136 704 155 734 0 e 161 703 178 725 0 ( 205 696 216 732 0 q 222 694 241 725 0 u 247 702 266 723 0 i 273 703 281 733 0 c 288 701 305 724 0 k 310 702 329 732 0 ) 335 695 347 730 0 [ 376 693 386 731 0 b 391 700 411 731 0 r 416 701 431 722 0 o 434 699 454 722 0 w 459 699 487 721 0 n 493 699 513 721 0 ] 518 691 528 729 0 { 559 691 572 728 0 f 580 698 598 729 0 o 597 697 617 720 0 x 622 698 642 718 0 } 650 690 663 727 0 j 687 688 700 727 0 u 706 695 727 717 0 m 732 696 764 718 0 p 768 687 789 717 0 s 794 694 809 717 0 ! 817 694 823 726 0 O 104 654 133 685 0 v 137 654 157 675 0 e 162 653 179 675 0 pyocr-0.8.5/tests/data/boxes_short_lines000066400000000000000000000011701450160556200204120ustar00rootroot00000000000000T 105 705 130 734 0 h 136 704 155 734 0 e 161 703 178 725 0 ( 205 696 216 732 0 q 222 694 241 725 0 u 247 702 266 723 0 i 273 703 281 733 0 c 288 701 305 724 0 k 310 702 329 732 0 ) 335 695 347 730 0 [ 376 693 386 731 0 b 391 700 411 731 0 r 416 701 431 722 0 o 434 699 454 722 0 w 459 699 487 721 0 n 493 699 513 721 0 ] 518 691 528 729 0 { 559 691 572 728 0 f 580 698 598 729 0 o 597 697 617 720 0 x 622 698 642 718 0 } 650 690 663 727 0 j 687 688 700 727 0 u 706 695 727 717 0 m 732 696 764 718 0 p 768 687 789 717 0 s 794 694 809 717 0 ! 817 694 823 726 0 O 104 654 133 685 0 v 137 654 157 675 0 e 162 653 179 675 0 # 1 2 3 4 pyocr-0.8.5/tests/data/cuneiform.lines000066400000000000000000000211451450160556200177650ustar00rootroot00000000000000

The (qui ck) [brown] { fox ) jumps!
Over the $43,456.78 <lazy> ¹90 dog
Ec duck/goose, as 12.5'lo of E-mail
from aspammerQawebsite.corn is spam.
Der „schnelle" braune Fuchs springt
uber den faulen Hund. Le renard brun
«rapide» saute par-dessus le chien
paresseux. La volpe marrone rapida
salta sopra il cane pigro. El zorro
marron rapido salta sobre el perro
perezoso. A raposa marrom rapida
salta sobre o cao preguiqoso.

pyocr-0.8.5/tests/data/cuneiform.words000066400000000000000000000211451450160556200200110ustar00rootroot00000000000000

The (qui ck) [brown] { fox ) jumps!
Over the $43,456.78 <lazy> ¹90 dog
Ec duck/goose, as 12.5'lo of E-mail
from aspammerQawebsite.corn is spam.
Der „schnelle" braune Fuchs springt
uber den faulen Hund. Le renard brun
«rapide» saute par-dessus le chien
paresseux. La volpe marrone rapida
salta sopra il cane pigro. El zorro
marron rapido salta sobre el perro
perezoso. A raposa marrom rapida
salta sobre o cao preguiqoso.

pyocr-0.8.5/tests/data/digits000066400000000000000000000000131450160556200161370ustar00rootroot000000000000003355456544 pyocr-0.8.5/tests/data/digits.lines000066400000000000000000000020331450160556200172540ustar00rootroot00000000000000

3355456544

pyocr-0.8.5/tests/data/lohnsteuerbescheinigung.hocr000066400000000000000000000301141450160556200225360ustar00rootroot00000000000000

Lohnsteuerbescheinigung für das Kalenderjahr 2020

Nachstehende Daten wurden maschinell an die Finanzverwaltung übertragen.

ITZBund, Postfach 30 16 45, 53196 Bonn

06 42C1 DECO 05 6000 304A DV 08.20 0,80 Deutsche Post

*K4000* *30076300*000772*

1. Dauer des Dienstverhältnisses:

01.01. bis 31.07.

2. Zeiträume ohne Anspruch auf Arbeitslohn

Anzahl "U"

Großbuchstaben (S, M, F, FR)

3. Bruttoarbeitsiohn einschl. Sachbezüge ohne 9. und 10.

4. Einbehaltene Lohnsteuer von 3.

pyocr-0.8.5/tests/data/tesseract.lines000066400000000000000000000225551450160556200200010ustar00rootroot00000000000000

The (quick) [brown] {fox} jumps! Over the $43,456.78 <lazy> #90 dog & duck/goose, as 12.5% of E-mail from aspammer@website.com is spam. Der ,,schnelle” braune Fuchs springt fiber den faulen Hund. Le renard brun «rapide» saute par-dessus le chien paresseux. La volpe marrone rapida salta sopra i] cane pigro. El zorro marrén répido salta sobre el perro perezoso. A raposa marrom répida salta sobre 0 C50 preguieoso.

pyocr-0.8.5/tests/data/text000066400000000000000000000000631450160556200156450ustar00rootroot00000000000000Phrase en français. Avec des accents. Éphémère pyocr-0.8.5/tests/data/words000066400000000000000000000114621450160556200160240ustar00rootroot00000000000000 The
(quick)
[brown]
{fox}
jumps!
Over
the
$43,456.78
<lazy>
#90
dog
&
duck/goose,
as
12.5%
of
E-mail
from
aspammer@website.com
is
spam.
Der
,,schnelle”
braune
Fuchs
springt
fiber
den
faulen
Hund.
Le
renard
brun
«rapide»
saute
par-dessus
le
chien
paresseux.
La
volpe
marrone
rapida
salta
sopra
i]
cane
pigro.
El
zorro
marrén
répido
salta
sobre
el
perro
perezoso.
A
raposa
marrom
répida
salta
sobre
0
C50
preguieoso.
pyocr-0.8.5/tests/data/words_bbox000066400000000000000000000114611450160556200170350ustar00rootroot00000000000000 The
(quick)
[brown]
{fox}
jumps!
Over
the
$43,456.78
<lazy>
#90
dog
&
duck/goose,
as
12.5%
of
E-mail
from
aspammer@website.com
is
spam.
Der
,,schnelle”
braune
Fuchs
springt
fiber
den
faulen
Hund.
Le
renard
brun
«rapide»
saute
par-dessus
le
chien
paresseux.
La
volpe
marrone
rapida
salta
sopra
i]
cane
pigro.
El
zorro
marrén
répido
salta
sobre
el
perro
perezoso.
A
raposa
marrom
répida
salta
sobre
0
C50
preguieoso.
pyocr-0.8.5/tests/test_base.py000066400000000000000000000013061450160556200163510ustar00rootroot00000000000000import os import unittest class BaseTest(unittest.TestCase): tool = None def setUp(self): self.file_descriptors = [] def tearDown(self): for fd in self.file_descriptors: fd.close() def _get_file_handle(self, filename): fd = open(os.path.join( os.path.dirname(os.path.abspath(__file__)), "data", filename ), encoding="utf-8") self.file_descriptors.append(fd) return fd def _get_file_content(self, filename): with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), "data", filename), encoding="utf-8") as fh: content = fh.read() return content pyocr-0.8.5/tests/test_box.py000066400000000000000000000120541450160556200162310ustar00rootroot00000000000000import unittest import xml.dom.minidom from pyocr import builders class TestBox(unittest.TestCase): """ These tests ensure the Box features are what they should. """ def setUp(self): super().setUp() self.box1 = builders.Box("word1", ((15, 22), (23, 42))) self.box1_bis = builders.Box("word1_bis", ((15, 22), (23, 42))) self.box2 = builders.Box("word2", ((30, 5), (40, 15)), 95) self.box_unicode = builders.Box("\xe9", ((1, 2), (3, 4))) def test_init(self): self.assertEqual(self.box1.content, "word1") self.assertSequenceEqual(self.box1.position, ((15, 22), (23, 42))) self.assertEqual(self.box1.confidence, 0) self.assertEqual(self.box2.confidence, 95) def test_get_xml_tag(self): impl = xml.dom.minidom.getDOMImplementation() doc = impl.createDocument(None, "root", None) tag = self.box1.get_xml_tag(doc) self.assertEqual(len(tag.childNodes), 1) self.assertEqual(tag.getAttribute("class"), "ocrx_word") self.assertEqual(tag.getAttribute("title"), "bbox 15 22 23 42; x_wconf 0") self.assertEqual(tag.firstChild.data, "word1") def test_str_method(self): self.assertEqual(str(self.box1), "word1 15 22 23 42") def test_str_unicode(self): self.assertEqual(str(self.box_unicode), "\xe9 1 2 3 4") def test_box_not_equal_none(self): 
self.assertNotEqual(self.box1, None) def test_box_equal(self): self.assertEqual(self.box1, self.box1_bis) def test_box_not_equal(self): self.assertNotEqual(self.box1, self.box2) def test_box_lower(self): self.assertLess(self.box2, self.box1) self.assertLessEqual(self.box2, self.box1) def test_box_greater(self): self.assertGreater(self.box1, self.box2) self.assertGreaterEqual(self.box1, self.box2) def test_box_equal_not_box(self): self.assertNotEqual(self.box1, 0) self.assertNotEqual(self.box1, []) def test_hash(self): self.assertEqual(hash(self.box1), hash(self.box1_bis)) self.assertNotEqual(hash(self.box1), hash(self.box2)) class TestLineBox(unittest.TestCase): def setUp(self): super().setUp() box1 = builders.Box("word1", ((15, 22), (23, 30))) box2 = builders.Box("word2", ((25, 23), (30, 32))) box3 = builders.Box("word3", ((32, 25), (40, 32)), 95) box4 = builders.Box("word4", ((41, 18), (44, 33)), 98) box_unicode = builders.Box("\xe9", ((1, 2), (3, 4)), 98) self.line1 = builders.LineBox( [box1, box2, box3, box4], ((14, 15), (45, 33)) ) self.line1_bis = builders.LineBox( [box1, box2], ((14, 15), (45, 33)) ) self.line2 = builders.LineBox( [box3, box4], ((30, 5), (53, 20)) ) self.line1_dupl = builders.LineBox( [box1, box2, box3, box4], ((14, 15), (45, 33)) ) self.line_unicode = builders.LineBox( [box1, box_unicode], ((1, 2), (3, 4)) ) def test_init(self): self.assertEqual(len(self.line1.word_boxes), 4) self.assertSequenceEqual(self.line1.position, ((14, 15), (45, 33))) self.assertEqual(self.line1.content, "word1 word2 word3 word4") def test_get_xml_tag(self): impl = xml.dom.minidom.getDOMImplementation() doc = impl.createDocument(None, "root", None) tag = self.line1.get_xml_tag(doc) self.assertEqual(len(tag.childNodes), 2 * len(self.line1.word_boxes) - 1) self.assertEqual(tag.getAttribute("class"), "ocr_line") self.assertEqual(tag.getAttribute("title"), "bbox 14 15 45 33") self.assertEqual(tag.firstChild.firstChild.data, "word1") 
self.assertEqual(tag.lastChild.firstChild.data, "word4") def test_line_str(self): expected = "[\n" for box in self.line1.word_boxes: expected += " " + box.__str__() + "\n" expected += "] 14 15 45 33" self.assertEqual(str(self.line1), expected) def test_str_unicode(self): self.assertEqual( str(self.line_unicode), "[\n word1 15 22 23 30\n \xe9 1 2 3 4\n] 1 2 3 4" ) def test_line_not_equal_none(self): self.assertNotEqual(self.line1, None) def test_box_equal(self): self.assertEqual(self.line1, self.line1_bis) def test_box_not_equal(self): self.assertNotEqual(self.line1, self.line2) def test_line_lower(self): self.assertLess(self.line2, self.line1) self.assertLessEqual(self.line2, self.line1) def test_line_greater(self): self.assertGreater(self.line1, self.line2) self.assertGreaterEqual(self.line1, self.line2) def test_hash(self): self.assertIsNotNone(hash(self.line1)) self.assertNotEqual(hash(self.line1), hash(self.line1_bis)) self.assertNotEqual(hash(self.line1), hash(self.line2)) self.assertEqual(hash(self.line1), hash(self.line1_dupl)) pyocr-0.8.5/tests/test_builder.py000066400000000000000000000326641450160556200171000ustar00rootroot00000000000000import unittest from io import StringIO from itertools import product from random import randint from unittest.mock import patch from pyocr import builders from .test_base import BaseTest class TestTextBuilder(unittest.TestCase): @patch("pyocr.tesseract.get_version") def setUp(self, get_version): super().setUp() get_version.return_value = (4, 0, 0) self.builder = builders.TextBuilder() def test_init(self): self.assertListEqual(self.builder.file_extensions, ["txt"]) self.assertListEqual(self.builder.tesseract_flags, ["--psm", "3"]) self.assertListEqual(self.builder.tesseract_configs, []) self.assertEqual(self.builder.tesseract_layout, 3) self.assertListEqual(self.builder.cuneiform_args, ["-f", "text"]) @patch("pyocr.tesseract.get_version") def test_init_cuneiform_params(self, get_version): get_version.return_value = (4, 0, 
0) # XXX Maybe overkill? # this check all combinations of parameters for cun_dotmat, cun_fax, cun_sglcol in product(*((False, True),) * 3): builder = builders.TextBuilder( cuneiform_dotmatrix=cun_dotmat, cuneiform_fax=cun_fax, cuneiform_singlecolumn=cun_sglcol ) if cun_dotmat: self.assertIn("--dotmatrix", builder.cuneiform_args) else: self.assertNotIn("--dotmatrix", builder.cuneiform_args) if cun_fax: self.assertIn("--fax", builder.cuneiform_args) else: self.assertNotIn("--fax", builder.cuneiform_args) if cun_sglcol: self.assertIn("--singlecolumn", builder.cuneiform_args) else: self.assertNotIn("--singlecolumn", builder.cuneiform_args) def test_read_file(self): txt = "first line\nsecond line\n0123456789\n🖨 " input_fh = StringIO(txt) output = self.builder.read_file(input_fh) self.assertEqual(output, txt.strip()) def test_write_file(self): output = StringIO() txt = "first line\nsecond line\n0123456789\n🖨 " self.builder.write_file(output, txt) output.seek(0) self.assertEqual(output.read(), txt) def test_start_line(self): box = builders.Box("word", ((10, 11), (12, 13))) self.builder.start_line(box) self.assertListEqual(self.builder.built_text, [""]) def test_add_word_no_line(self): box = builders.Box("word", ((10, 11), (12, 13))) with self.assertRaises(IndexError): self.builder.add_word(box.content, box) def test_add_word(self): box = builders.Box("word", ((10, 11), (12, 13))) self.builder.start_line(box) self.builder.add_word(box.content, box) self.assertEqual(self.builder.built_text[0], box.content) self.builder.add_word(box.content, box) self.assertEqual(self.builder.built_text[0], box.content + " " + box.content) def test_end_line(self): before = list(self.builder.built_text) self.builder.end_line() self.assertEqual(self.builder.built_text, before) def test_get_output(self): box = builders.Box("word", ((10, 11), (12, 13))) self.builder.start_line(box) self.builder.add_word("word1", box) self.builder.add_word("word2", box) self.builder.start_line(box) 
class TestWordBoxBuilder(BaseTest):
    """Unit tests for ``builders.WordBoxBuilder``.

    ``pyocr.tesseract.get_version`` is patched whenever a builder is
    instantiated so that each test pins the Tesseract version it expects.
    """

    @patch("pyocr.tesseract.get_version")
    def setUp(self, get_version):
        """Create a fresh builder while reporting Tesseract 4.0.0."""
        super().setUp()
        get_version.return_value = (4, 0, 0)
        self.builder = builders.WordBoxBuilder()

    @patch("pyocr.tesseract.get_version")
    def test_init_tesseract_version_3(self, get_version):
        """Tesseract 3.x builders use the single-dash ``-psm`` flag."""
        for version in range(6):
            get_version.return_value = (3, version, 0)
            builder = builders.WordBoxBuilder()
            self.assertListEqual(builder.tesseract_flags, ["-psm", "1"])
            self.assertListEqual(builder.file_extensions, ["html", "hocr"])
            self.assertListEqual(builder.tesseract_configs, ["hocr"])
            self.assertListEqual(builder.cuneiform_args, ["-f", "hocr"])
            self.assertListEqual(builder.word_boxes, [])
            self.assertEqual(builder.tesseract_layout, 1)

    @patch("pyocr.tesseract.get_version")
    def test_init_tesseract_version_4(self, get_version):
        """Tesseract 4 builders use the double-dash ``--psm`` flag."""
        get_version.return_value = (4, 0, 0)
        builder = builders.WordBoxBuilder()
        self.assertListEqual(builder.tesseract_flags, ["--psm", "1"])
        self.assertListEqual(builder.file_extensions, ["html", "hocr"])
        self.assertListEqual(builder.tesseract_configs, ["hocr"])
        self.assertListEqual(builder.cuneiform_args, ["-f", "hocr"])
        self.assertListEqual(builder.word_boxes, [])
        self.assertEqual(builder.tesseract_layout, 1)

    def test_read_file(self):
        """Every item parsed from the "words" fixture is a ``Box``."""
        words = self.builder.read_file(self._get_file_handle("words"))
        for word in words:
            self.assertIsInstance(word, builders.Box)

    def test_empty_read_file(self):
        """Reading an empty stream yields an empty list."""
        output = StringIO()
        self.assertListEqual(self.builder.read_file(output), [])

    def test_read_file_bbox(self):
        """The "words_bbox" fixture parses to boxes; check its last word."""
        words = self.builder.read_file(self._get_file_handle("words_bbox"))
        for word in words:
            self.assertIsInstance(word, builders.Box)
        self.assertNotEqual(words[-1].content, "preguieoso.")

    def test_write_file(self):
        """Content, coordinates and confidence of each box get written."""
        output = StringIO()
        boxes = [
            builders.Box("word1", ((10, 11), (12, 13)), 95),
            builders.Box("word2", ((11, 12), (13, 14))),
            builders.Box("word3", ((12, 13), (14, 15))),
            builders.Box("word4", ((13, 14), (15, 16)), 87),
        ]
        self.builder.write_file(output, boxes)
        output.seek(0)
        written = output.read()
        for box in boxes:
            self.assertIn(box.content, written)
            self.assertIn("{} {} {} {}".format(
                box.position[0][0],
                box.position[0][1],
                box.position[1][0],
                box.position[1][1],
            ), written)
            self.assertIn(str(box.confidence), written)

    def test_start_line(self):
        """Starting a line leaves the collected word boxes unchanged."""
        box = builders.Box("word", ((1, 2), (3, 4)))
        before = list(self.builder.word_boxes)
        self.builder.start_line(box)
        self.assertEqual(self.builder.word_boxes, before)

    def test_add_word(self):
        """An added word is stored as a ``Box`` equal to the one given."""
        expected = builders.Box("word", ((1, 2), (3, 4)), 42)
        self.builder.add_word(
            expected.content, expected.position, expected.confidence
        )
        # The loop variable must not reuse the name of the expected box:
        # doing so made the final comparison check the stored box against
        # itself, so the assertion could never fail.
        for stored in self.builder.word_boxes:
            self.assertIsInstance(stored, builders.Box)
        self.assertEqual(self.builder.word_boxes[0], expected)

    def test_end_line(self):
        """Ending a line leaves the collected word boxes unchanged."""
        before = list(self.builder.word_boxes)
        self.builder.end_line()
        self.assertEqual(self.builder.word_boxes, before)

    def test_get_output(self):
        """``get_output()`` returns the added boxes, in insertion order."""
        boxes = [
            builders.Box("word1", ((10, 11), (12, 13)), 95),
            builders.Box("word2", ((11, 12), (13, 14))),
            builders.Box("word3", ((12, 13), (14, 15))),
            builders.Box("word4", ((13, 14), (15, 16)), 87),
        ]
        for box in boxes:
            self.builder.add_word(box.content, box.position, box.confidence)
        output = self.builder.get_output()
        for box, box_expected in zip(output, boxes):
            self.assertIsInstance(box, builders.Box)
            self.assertEqual(box, box_expected)
            self.assertEqual(box.content, box_expected.content)

    def test_str_method(self):
        """The builder's string form is its human-readable name."""
        self.assertEqual(str(self.builder), "Word boxes")
def test_write_file(self):
    """Content, coordinates and confidence of every word get written.

    Four lines of four boxes each (random confidences) are written through
    the builder, then the produced text is searched for each box's data.
    """
    lines = []
    for line_idx in range(4):
        first = 4 * line_idx
        boxes = []
        for offset in range(4):
            n = first + offset
            corners = ((n, n + 1), (n + 2, n + 3))
            boxes.append(
                builders.Box("word" + str(n), corners, randint(0, 100))
            )
        line_corners = ((first, first + 4), (first + 2, first + 6))
        lines.append(builders.LineBox(boxes, line_corners))

    output_fh = StringIO()
    self.builder.write_file(output_fh, lines)
    output_fh.seek(0)
    written = output_fh.read()

    for box in (b for line in lines for b in line.word_boxes):
        top_left, bottom_right = box.position[0], box.position[1]
        coordinates = "{} {} {} {}".format(
            top_left[0], top_left[1], bottom_right[0], bottom_right[1]
        )
        self.assertIn(box.content, written)
        self.assertIn(coordinates, written)
        self.assertIn(str(box.confidence), written)
class TestDigitBuilder(unittest.TestCase):
    """Unit tests for ``builders.DigitBuilder``."""

    @patch("pyocr.tesseract.get_version")
    def setUp(self, get_version):
        """Create the builder while reporting Tesseract 4.0.0."""
        super().setUp()
        get_version.return_value = (4, 0, 0)
        self.builder = builders.DigitBuilder()

    def test_init(self):
        """The "digits" config is part of the Tesseract configs."""
        configs = self.builder.tesseract_configs
        self.assertIn("digits", configs)

    def test_str_method(self):
        """The builder's string form is its human-readable name."""
        description = str(self.builder)
        self.assertEqual(description, "Digits raw text")
class TestCuneiform(BaseTest):
    """
    Checks of the module-level helpers of the cuneiform wrapper
    (availability, version, languages, name, supported builders).
    """

    @patch("shutil.which")
    def test_available(self, which):
        """The tool counts as available when ``shutil.which`` finds it."""
        # XXX is it useful?
        which.return_value = True
        available = cuneiform.is_available()
        self.assertTrue(available)
        which.assert_called_once_with("cuneiform")

    @patch("subprocess.Popen")
    def test_version(self, popen):
        """The version is read from the banner printed by the tool."""
        banner = (
            b"Cuneiform for Linux 1.1.0\n"
            b"Usage: cuneiform [-l languagename -f format --dotmatrix --fax"
            b" --singlecolumn -o result_file] imagefile"
        )
        process = MagicMock()
        process.stdout.read.return_value = banner
        popen.return_value = process
        self.assertSequenceEqual(cuneiform.get_version(), (1, 1, 0))

    @patch("subprocess.Popen")
    def test_version_error(self, popen):
        """Output without a version number yields ``None``."""
        process = MagicMock()
        process.stdout.read.return_value = b"\n"
        popen.return_value = process
        self.assertIsNone(cuneiform.get_version())

    @patch("subprocess.Popen")
    def test_langs(self, popen):
        """Languages are parsed from the output of ``cuneiform -l``."""
        listing = (
            b"Cuneiform for Linux 1.1.0\n"
            b"Supported languages: eng ger fra rus swe spa ita ruseng ukr srp "
            b"hrv pol dan por dut cze rum hun bul slv lav lit est tur."
        )
        process = MagicMock()
        process.stdout.read.return_value = listing
        popen.return_value = process

        langs = cuneiform.get_available_languages()

        for code in ("eng", "fra"):
            self.assertIn(code, langs)
        popen.assert_called_once_with(
            ["cuneiform", "-l"],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
        )

    def test_name(self):
        """The tool reports its display name."""
        name = cuneiform.get_name()
        self.assertEqual(name, "Cuneiform (sh)")

    def test_can_detect_orientation(self):
        """Orientation detection is reported as unsupported."""
        self.assertFalse(cuneiform.can_detect_orientation())

    def test_available_builders(self):
        """Text, word-box and line-box builders are offered, in order."""
        expected = [
            builders.TextBuilder,
            builders.WordBoxBuilder,
            builders.LineBoxBuilder,
        ]
        self.assertListEqual(cuneiform.get_available_builders(), expected)
@patch("subprocess.Popen")
def test_text_error(self, popen):
    """A failing process raises ``CuneiformError`` with status and output.

    The mocked process returns 1 from ``wait()``; the raised error must
    carry that status and the decoded process output as its message.
    """
    message = (
        "Cuneiform for Linux 1.1.0\n"
        "Magick: Improper image header (example.png) reported by "
        "coders/png.c:2932 (ReadPNGImage)\n"
    )
    process = self.stdout
    process.stdout.read.return_value = message.encode()
    process.wait.return_value = 1
    popen.return_value = process

    with self.assertRaises(cuneiform.CuneiformError) as caught:
        cuneiform.image_to_string(self.image, builder=self.builder)

    error = caught.exception
    self.assertEqual(error.status, 1)
    self.assertEqual(error.message, message)
class TestCuneiformDigits(BaseTest):
    """``DigitBuilder`` must be refused with ``NotImplementedError``."""

    @patch("pyocr.tesseract.get_version")
    def setUp(self, get_version):
        """Prepare a digit builder and a 1x1 RGB image."""
        super().setUp()
        get_version.return_value = (4, 0, 0)
        self.builder = builders.DigitBuilder()
        self.image = Image.new("RGB", (1, 1))

    def test_digits_not_implemented(self):
        """Recognition with a ``DigitBuilder`` is not implemented."""
        self.assertRaises(
            NotImplementedError,
            cuneiform.image_to_string, self.image, builder=self.builder,
        )

    def test_digits_box_not_implemented(self):
        """Same expectation as ``test_digits_not_implemented``."""
        # NOTE(review): this test uses the very same DigitBuilder as the
        # previous one; judging by its name it was presumably meant to
        # exercise a digit *box* builder -- confirm and adjust.
        self.assertRaises(
            NotImplementedError,
            cuneiform.image_to_string, self.image, builder=self.builder,
        )
class TestCuneiformLineBox(BaseTest):
    """
    Line-box recognition through the cuneiform wrapper, with the process,
    the temporary file and the output file all replaced by mocks.
    """

    @patch("pyocr.tesseract.get_version")
    def setUp(self, get_version):
        """Build the builder, a 1x1 image and the mocks shared by tests."""
        super().setUp()
        get_version.return_value = (4, 0, 0)
        self.builder = builders.LineBoxBuilder()
        self.image = Image.new("RGB", (1, 1))
        self.text_file = StringIO(self._get_file_content("cuneiform.lines"))

        # Fake process: prints the banner and exits with status 0.
        process = MagicMock()
        process.stdout.read.return_value = b"Cuneiform for Linux 1.1.0\n"
        process.wait.return_value = 0
        self.stdout = process

        # Fake context manager whose entered value exposes a ``name``.
        # ``name`` is special in the Mock constructor, hence configure_mock.
        self.tmp_filename = "/tmp/cuneiform_n0qfk87otxt"
        tmp_handle = MagicMock()
        tmp_handle.configure_mock(name=self.tmp_filename)
        self.enter = MagicMock()
        self.enter.__enter__.return_value = tmp_handle

    @patch("pyocr.cuneiform.temp_file")
    @patch("codecs.open")
    @patch("subprocess.Popen")
    def test_line(self, popen, copen, temp_file):
        """hOCR output is requested and parsed into ``LineBox`` objects."""
        temp_file.return_value = self.enter
        copen.return_value = self.text_file
        popen.return_value = self.stdout

        output = cuneiform.image_to_string(self.image, builder=self.builder)

        expected_cmd = [
            "cuneiform", "-f", "hocr", "-o", self.tmp_filename, "-",
        ]
        popen.assert_called_once_with(
            expected_cmd,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
        )
        for line_box in output:
            self.assertIsInstance(line_box, builders.LineBox)

    @patch("subprocess.Popen")
    def test_line_error(self, popen):
        """A failing process raises ``CuneiformError`` (status, output)."""
        message = (
            "Cuneiform for Linux 1.1.0\n"
            "Magick: Improper image header (example.png) reported by "
            "coders/png.c:2932 (ReadPNGImage)\n"
        )
        process = self.stdout
        process.stdout.read.return_value = message.encode()
        process.wait.return_value = 1
        popen.return_value = process

        with self.assertRaises(cuneiform.CuneiformError) as caught:
            cuneiform.image_to_string(self.image, builder=self.builder)

        error = caught.exception
        self.assertEqual(error.status, 1)
        self.assertEqual(error.message, message)
@patch("pyocr.libtesseract.tesseract_raw.g_libtesseract")
def test_version(self, libtess):
    """Raw ``TessVersion`` strings are parsed into three-integer tuples."""
    # (raw value returned by the library, expected parsed version);
    # the last two carry a non-numeric suffix that must be ignored.
    cases = (
        (b"3.05", (3, 5, 0)),
        (b"3.2.1", (3, 2, 1)),
        (b"4.0.0", (4, 0, 0)),
        (b"4.0.0aplha", (4, 0, 0)),
        (b"3.5.1dev1", (3, 5, 1)),
    )
    for raw_version, parsed in cases:
        libtess.TessVersion.return_value = raw_version
        self.assertEqual(libtesseract.get_version(), parsed)

    # One library call per parsed version, always without arguments.
    libtess.TessVersion.assert_called_with()
    self.assertEqual(libtess.TessVersion.call_count, 5)
@patch("locale.setlocale")
@patch("pyocr.libtesseract.tesseract_raw.g_libtesseract")
def test_init_tesseract4(self, libtess, setlocale):
    """``init()`` on Tesseract 4 sets the C locale and configures the API.

    For each language (including ``None``) the handle must be created,
    passed to ``TessBaseAPIInit3`` with the encoded language, and
    ``tessedit_zero_rejection`` must be set to ``F``.
    """
    libtess.TessVersion.return_value = b"4.0.0"
    libtess.TessBaseAPICreate.return_value = self.handle

    for lang in (None, "eng", "fra", "jpn", "osd"):
        expected_lang = lang.encode() if lang else None

        api = tesseract_raw.init(lang)
        self.assertEqual(api, self.handle)
        libtess.TessBaseAPICreate.assert_called_once_with()

        init_mock = libtess.TessBaseAPIInit3
        self.assertEqual(init_mock.call_count, 1)
        init_args = init_mock.call_args[0]
        self.assertEqual(len(init_args), 3)
        self.assertEqual(init_args[0].value, self.handle)
        self.assertEqual(init_args[1].value, None)
        self.assertEqual(init_args[2].value, expected_lang)

        set_var_mock = libtess.TessBaseAPISetVariable
        self.assertEqual(set_var_mock.call_count, 1)
        var_args = set_var_mock.call_args[0]
        self.assertEqual(len(var_args), 3)
        self.assertEqual(var_args[0].value, self.handle)
        self.assertEqual(var_args[1], b"tessedit_zero_rejection")
        self.assertEqual(var_args[2], b"F")

        setlocale.assert_called_once_with(locale.LC_ALL, "C")

        # Clear recorded calls so the "called once" checks hold for the
        # next language as well.
        libtess.reset_mock()
        setlocale.reset_mock()
@patch("pyocr.libtesseract.tesseract_raw.g_libtesseract")
def test_set_debug_file(self, libtess):
    """``str`` and ``bytes`` file names both reach the library as bytes."""
    set_variable = libtess.TessBaseAPISetVariable
    for filename in ("file", b"file"):
        tesseract_raw.set_debug_file(self.handle, filename)

        self.assertEqual(set_variable.call_count, 1)
        positional = set_variable.call_args[0]
        self.assertEqual(len(positional), 3)
        self.assertEqual(positional[0].value, self.handle)
        self.assertEqual(positional[1], b"debug_file")
        self.assertEqual(positional[2], b"file")

        # Forget this call before trying the next file name type.
        libtess.reset_mock()
@patch("pyocr.libtesseract.tesseract_raw.g_libtesseract")
def test_get_utf8_text(self, libtess):
    """The C string returned by the library is decoded to ``str``."""
    text = "Test text for get utf8"
    get_text = libtess.TessBaseAPIGetUTF8Text
    get_text.return_value = c_char_p(text.encode())

    decoded = tesseract_raw.get_utf8_text(self.handle)

    self.assertEqual(decoded, text)
    self.assertEqual(get_text.call_count, 1)
    positional = get_text.call_args[0]
    self.assertEqual(len(positional), 1)
    self.assertEqual(positional[0].value, self.handle)
@patch("pyocr.libtesseract.tesseract_raw.g_libtesseract")
def test_page_iterator_final(self, libtess):
    """The final-element check forwards iterator, level and element."""
    word_level = tesseract_raw.PageIteratorLevel.WORD
    element = randint(0, 2**32-1)
    is_final = libtess.TessPageIteratorIsAtFinalElement
    is_final.return_value = True

    answer = tesseract_raw.page_iterator_is_at_final_element(
        self.iterator, word_level, element
    )

    self.assertTrue(answer)
    self.assertEqual(is_final.call_count, 1)
    positional = is_final.call_args[0]
    self.assertEqual(len(positional), 3)
    self.assertEqual(positional[0].value, self.iterator)
    self.assertEqual(positional[1], word_level)
    self.assertEqual(positional[2], element)
@patch("pyocr.libtesseract.tesseract_raw.g_libtesseract")
def test_page_iterator_box(self, libtess):
    """The bounding-box call returns the library result and the box.

    The mocked library never writes to the four output integers, so the
    box is expected to stay at ``(0, 0, 0, 0)`` for both results.
    """
    level = tesseract_raw.PageIteratorLevel.WORD
    bounding_box = libtess.TessPageIteratorBoundingBox
    for res in (True, False):
        bounding_box.return_value = res

        result, box = tesseract_raw.page_iterator_bounding_box(
            self.iterator, level
        )

        self.assertEqual(result, res)
        self.assertSequenceEqual(box, (0, 0, 0, 0))
        self.assertEqual(bounding_box.call_count, 1)
        positional = bounding_box.call_args[0]
        self.assertEqual(len(positional), 6)
        self.assertEqual(positional[0].value, self.iterator)
        self.assertEqual(positional[1], level)
        # The last four arguments are the int output parameters.
        for out_param in positional[2:]:
            self.assertEqual(
                cast(out_param, POINTER(c_int)).contents.value, 0
            )
        libtess.reset_mock()
@patch("pyocr.libtesseract.tesseract_raw.g_libtesseract")
def test_result_iterator_page(self, libtess):
    """The page iterator of a result iterator is returned unchanged."""
    get_page_iterator = libtess.TessResultIteratorGetPageIterator
    get_page_iterator.return_value = self.iterator

    page_iterator = tesseract_raw.result_iterator_get_page_iterator(
        self.iterator
    )

    self.assertEqual(page_iterator, self.iterator)
    self.assertEqual(get_page_iterator.call_count, 1)
    positional = get_page_iterator.call_args[0]
    self.assertEqual(len(positional), 1)
    self.assertEqual(positional[0].value, self.iterator)
test_result_iterator_confidence(self, libtess): level = tesseract_raw.PageIteratorLevel.WORD libtess.TessResultIteratorConfidence.return_value = 95 self.assertEqual( tesseract_raw.result_iterator_get_confidence(self.iterator, level), 95 ) self.assertEqual( libtess.TessResultIteratorConfidence.call_count, 1 ) args = libtess.TessResultIteratorConfidence.call_args[0] self.assertEqual(len(args), 2) self.assertEqual(args[0].value, self.iterator) self.assertEqual(args[1], level) libtess.reset_mock() libtess.TessResultIteratorConfidence.return_value = None self.assertIsNone( tesseract_raw.result_iterator_get_confidence(self.iterator, level) ) self.assertEqual( libtess.TessResultIteratorConfidence.call_count, 1 ) args = libtess.TessResultIteratorConfidence.call_args[0] self.assertEqual(len(args), 2) self.assertEqual(args[0].value, self.iterator) self.assertEqual(args[1], level) @patch("pyocr.libtesseract.tesseract_raw.g_libtesseract") def test_detect_os(self, libtess): libtess.TessBaseAPIDetectOrientationScript.return_value = True self.assertEqual( tesseract_raw.detect_os(self.handle), { "orientation": 0, "confidence": 0, } ) self.assertEqual( libtess.TessBaseAPIDetectOrientationScript.call_count, 1 ) args = libtess.TessBaseAPIDetectOrientationScript.call_args[0] self.assertEqual(len(args), 5) self.assertEqual(args[0].value, self.handle) self.assertEqual(cast(args[1], POINTER(c_int)).contents.value, 0) self.assertEqual(cast(args[2], POINTER(c_int)).contents.value, 0) self.assertIsNone(args[3]) self.assertIsNone(args[4]) @patch("pyocr.libtesseract.tesseract_raw.g_libtesseract") def test_detect_os_error(self, libtess): libtess.TessBaseAPIDetectOrientationScript.return_value = False with self.assertRaises(TesseractError) as te: tesseract_raw.detect_os(self.handle) self.assertEqual(te.exception.status, "detect_orientation failed") self.assertEqual(te.exception.message, "TessBaseAPIDetectOrientationScript() failed") self.assertEqual( 
libtess.TessBaseAPIDetectOrientationScript.call_count, 1 ) args = libtess.TessBaseAPIDetectOrientationScript.call_args[0] self.assertEqual(len(args), 5) self.assertEqual(args[0].value, self.handle) self.assertEqual(cast(args[1], POINTER(c_int)).contents.value, 0) self.assertEqual(cast(args[2], POINTER(c_int)).contents.value, 0) self.assertIsNone(args[3]) self.assertIsNone(args[4]) @patch("pyocr.libtesseract.tesseract_raw.g_libtesseract") def test_detect_os_old(self, libtess): del libtess.TessBaseAPIDetectOrientationScript libtess.TessBaseAPIDetectOS.return_value = True self.assertEqual( tesseract_raw.detect_os(self.handle), { "orientation": 0, "confidence": 0, } ) self.assertEqual( libtess.TessBaseAPIDetectOS.call_count, 1 ) args = libtess.TessBaseAPIDetectOS.call_args[0] self.assertEqual(len(args), 2) self.assertEqual(args[0].value, self.handle) self.assertIsInstance( cast(args[1], POINTER(tesseract_raw.OSResults)).contents, tesseract_raw.OSResults ) @patch("pyocr.libtesseract.tesseract_raw.g_libtesseract") def test_detect_os_old_error(self, libtess): del libtess.TessBaseAPIDetectOrientationScript libtess.TessBaseAPIDetectOS.return_value = False with self.assertRaises(TesseractError) as te: tesseract_raw.detect_os(self.handle) self.assertEqual(te.exception.status, "detect_orientation failed") self.assertEqual(te.exception.message, "TessBaseAPIDetectOS() failed") self.assertEqual( libtess.TessBaseAPIDetectOS.call_count, 1 ) args = libtess.TessBaseAPIDetectOS.call_args[0] self.assertEqual(len(args), 2) self.assertEqual(args[0].value, self.handle) self.assertIsInstance( cast(args[1], POINTER(tesseract_raw.OSResults)).contents, tesseract_raw.OSResults ) @patch("pyocr.libtesseract.tesseract_raw.g_libtesseract") def test_set_input_name(self, libtess): input_file = "file" tesseract_raw.set_input_name(self.handle, input_file) self.assertEqual( libtess.TessBaseAPISetInputName.call_count, 1 ) args = libtess.TessBaseAPISetInputName.call_args[0] self.assertEqual(len(args), 2) 
self.assertEqual(args[0].value, self.handle) self.assertEqual(args[1], input_file.encode()) @patch("pyocr.libtesseract.tesseract_raw.g_libtesseract") def test_init_pdf(self, libtess): output_file = "file" renderer = randint(0, 2**32-1) tessdata_dir = "/path/to/tess/data" libtess.TessBaseAPIGetDatapath.return_value = tessdata_dir libtess.TessPDFRendererCreate.return_value = renderer self.assertEqual( tesseract_raw.init_pdf_renderer(self.handle, output_file, True), renderer ) libtess.TessBaseAPIGetDatapath.assert_called_once_with(self.handle) self.assertEqual( libtess.TessPDFRendererCreate.call_count, 1 ) args = libtess.TessPDFRendererCreate.call_args[0] self.assertEqual(len(args), 3) self.assertEqual(args[0], output_file.encode()) self.assertEqual(args[1], tessdata_dir) self.assertEqual(args[2].value, True) @patch("pyocr.libtesseract.tesseract_raw.g_libtesseract") def test_begin_doc(self, libtess): renderer = randint(0, 2**32-1) doc_name = "doc" tesseract_raw.begin_document(renderer, doc_name) self.assertEqual( libtess.TessResultRendererBeginDocument.call_count, 1 ) args = libtess.TessResultRendererBeginDocument.call_args[0] self.assertEqual(len(args), 2) self.assertEqual(args[0].value, renderer) self.assertEqual(args[1], doc_name.encode()) @patch("pyocr.libtesseract.tesseract_raw.g_libtesseract") def test_add_renderer_image(self, libtess): renderer = randint(0, 2**32-1) tesseract_raw.add_renderer_image(self.handle, renderer) self.assertEqual( libtess.TessResultRendererAddImage.call_count, 1 ) args = libtess.TessResultRendererAddImage.call_args[0] self.assertEqual(len(args), 2) self.assertEqual(args[0].value, renderer) self.assertEqual(args[1].value, self.handle) @patch("pyocr.libtesseract.tesseract_raw.g_libtesseract") def test_end_doc(self, libtess): renderer = randint(0, 2**32-1) tesseract_raw.end_document(renderer) self.assertEqual( libtess.TessResultRendererEndDocument.call_count, 1 ) args = libtess.TessResultRendererEndDocument.call_args[0] 
self.assertEqual(len(args), 1) self.assertEqual(args[0].value, renderer) class TestLibTesseractText(BaseTest): @patch("pyocr.tesseract.get_version") def setUp(self, get_version): super().setUp() get_version.return_value = (4, 0, 0) self.image = Image.new(mode="RGB", size=(1, 1)) self.builder = builders.TextBuilder() self.handle = randint(0, 2**32-1) self.iterator = randint(0, 2**32-1) @patch("pyocr.tesseract.get_version") @patch("pyocr.libtesseract.tesseract_raw") def test_image_to_string_defaults_to_text_buidler(self, raw, get_version): get_version.return_value = (4, 0, 0) raw.init.return_value = self.handle raw.get_iterator.return_value = self.iterator raw.result_iterator_get_page_iterator.return_value = self.iterator raw.get_available_languages.return_value = ["eng", "fra", "jpn", "osd"] raw.page_iterator_next.side_effect = (True, True, False) raw.page_iterator_bounding_box.return_value = (True, (0, 0, 0, 0)) raw.result_iterator_get_utf8_text.side_effect = ("word1", "word2", "word3") raw.page_iterator_is_at_beginning_of.side_effect = (True, False, False) raw.page_iterator_is_at_final_element.side_effect = (False, False, True) self.assertEqual( libtesseract.image_to_string(self.image), "word1 word2 word3" ) raw.init.assert_called_once_with(lang=None) raw.get_available_languages.assert_called_once_with(self.handle) raw.set_page_seg_mode.assert_called_once_with( self.handle, self.builder.tesseract_layout) raw.set_debug_file.assert_called_once_with(self.handle, os.devnull) raw.set_image.assert_called_once_with(self.handle, self.image) self.assertFalse(raw.set_is_numeric.called) raw.recognize.assert_called_once_with(self.handle) raw.get_iterator.assert_called_once_with(self.handle) raw.result_iterator_get_page_iterator.assert_called_once_with( self.iterator ) self.assertEqual(raw.page_iterator_is_at_beginning_of.call_count, 3) raw.page_iterator_is_at_beginning_of.assert_called_with( self.iterator, raw.PageIteratorLevel.TEXTLINE) # called first at beginning and three 
times for each word self.assertEqual(raw.page_iterator_bounding_box.call_count, 4) self.assertListEqual( raw.page_iterator_bounding_box.call_args_list, [ call(self.iterator, raw.PageIteratorLevel.TEXTLINE), call(self.iterator, raw.PageIteratorLevel.WORD), call(self.iterator, raw.PageIteratorLevel.WORD), call(self.iterator, raw.PageIteratorLevel.WORD), ] ) self.assertEqual(raw.page_iterator_is_at_final_element.call_count, 3) raw.page_iterator_is_at_final_element.assert_called_with( self.iterator, raw.PageIteratorLevel.TEXTLINE, raw.PageIteratorLevel.WORD ) self.assertEqual(raw.result_iterator_get_utf8_text.call_count, 3) raw.result_iterator_get_utf8_text.assert_called_with( self.iterator, raw.PageIteratorLevel.WORD) self.assertEqual(raw.result_iterator_get_confidence.call_count, 3) raw.result_iterator_get_confidence.assert_called_with( self.iterator, raw.PageIteratorLevel.WORD) self.assertEqual(raw.page_iterator_next.call_count, 3) raw.page_iterator_next.assert_called_with( self.iterator, raw.PageIteratorLevel.WORD) raw.cleanup.assert_called_once_with(self.handle) @patch("pyocr.libtesseract.tesseract_raw") def test_lang(self, raw): raw.init.return_value = self.handle raw.get_iterator.return_value = self.iterator raw.result_iterator_get_page_iterator.return_value = self.iterator raw.get_available_languages.return_value = ["eng", "fra", "jpn", "osd"] raw.page_iterator_next.side_effect = (True, True, False) raw.page_iterator_bounding_box.return_value = (True, (0, 0, 0, 0)) raw.result_iterator_get_utf8_text.side_effect = ("word1", "word2", "word3") raw.page_iterator_is_at_beginning_of.side_effect = (True, False, False) raw.page_iterator_is_at_final_element.side_effect = (False, False, True) self.assertEqual( libtesseract.image_to_string(self.image, lang="eng", builder=self.builder), "word1 word2 word3" ) raw.init.assert_called_once_with(lang="eng") raw.get_available_languages.assert_called_once_with(self.handle) raw.set_page_seg_mode.assert_called_once_with( 
self.handle, self.builder.tesseract_layout) raw.set_debug_file.assert_called_once_with(self.handle, os.devnull) raw.set_image.assert_called_once_with(self.handle, self.image) self.assertFalse(raw.set_is_numeric.called) raw.recognize.assert_called_once_with(self.handle) raw.get_iterator.assert_called_once_with(self.handle) raw.result_iterator_get_page_iterator.assert_called_once_with( self.iterator ) self.assertEqual(raw.page_iterator_is_at_beginning_of.call_count, 3) raw.page_iterator_is_at_beginning_of.assert_called_with( self.iterator, raw.PageIteratorLevel.TEXTLINE) # called first at beginning and three times for each word self.assertEqual(raw.page_iterator_bounding_box.call_count, 4) self.assertListEqual( raw.page_iterator_bounding_box.call_args_list, [ call(self.iterator, raw.PageIteratorLevel.TEXTLINE), call(self.iterator, raw.PageIteratorLevel.WORD), call(self.iterator, raw.PageIteratorLevel.WORD), call(self.iterator, raw.PageIteratorLevel.WORD), ] ) self.assertEqual(raw.page_iterator_is_at_final_element.call_count, 3) raw.page_iterator_is_at_final_element.assert_called_with( self.iterator, raw.PageIteratorLevel.TEXTLINE, raw.PageIteratorLevel.WORD ) self.assertEqual(raw.result_iterator_get_utf8_text.call_count, 3) raw.result_iterator_get_utf8_text.assert_called_with( self.iterator, raw.PageIteratorLevel.WORD) self.assertEqual(raw.result_iterator_get_confidence.call_count, 3) raw.result_iterator_get_confidence.assert_called_with( self.iterator, raw.PageIteratorLevel.WORD) self.assertEqual(raw.page_iterator_next.call_count, 3) raw.page_iterator_next.assert_called_with( self.iterator, raw.PageIteratorLevel.WORD) raw.cleanup.assert_called_once_with(self.handle) @patch("pyocr.libtesseract.tesseract_raw") def test_lang_error(self, raw): raw.init.return_value = self.handle raw.get_available_languages.return_value = ["eng", "jpn", "osd"] with self.assertRaises(TesseractError) as te: libtesseract.image_to_string(self.image, lang="fra", builder=self.builder) 
self.assertEqual(te.exception.status, "no lang") self.assertEqual(te.exception.message, "language fra is not available") raw.init.assert_called_once_with(lang="fra") raw.get_available_languages.assert_called_once_with(self.handle) raw.cleanup.assert_called_once_with(self.handle) @patch("pyocr.libtesseract.tesseract_raw") def test_text(self, raw): raw.init.return_value = self.handle raw.get_iterator.return_value = self.iterator raw.result_iterator_get_page_iterator.return_value = self.iterator raw.get_available_languages.return_value = ["eng", "fra", "jpn", "osd"] raw.page_iterator_next.side_effect = (True, True, True, False) raw.page_iterator_bounding_box.return_value = (True, (0, 0, 0, 0)) raw.result_iterator_get_utf8_text.side_effect = ("word1", "word2", None, "word3") raw.page_iterator_is_at_beginning_of.side_effect = (True, False, False, False) raw.page_iterator_is_at_final_element.side_effect = (False, False, False, True) self.assertEqual( libtesseract.image_to_string(self.image, builder=self.builder), "word1 word2 word3" ) raw.init.assert_called_once_with(lang=None) raw.get_available_languages.assert_called_once_with(self.handle) raw.set_page_seg_mode.assert_called_once_with( self.handle, self.builder.tesseract_layout) raw.set_debug_file.assert_called_once_with(self.handle, os.devnull) raw.set_image.assert_called_once_with(self.handle, self.image) self.assertFalse(raw.set_is_numeric.called) raw.recognize.assert_called_once_with(self.handle) raw.get_iterator.assert_called_once_with(self.handle) raw.result_iterator_get_page_iterator.assert_called_once_with( self.iterator ) self.assertEqual(raw.page_iterator_is_at_beginning_of.call_count, 4) raw.page_iterator_is_at_beginning_of.assert_called_with( self.iterator, raw.PageIteratorLevel.TEXTLINE) # called first at beginning and three times for each word self.assertEqual(raw.page_iterator_bounding_box.call_count, 4) self.assertListEqual( raw.page_iterator_bounding_box.call_args_list, [ call(self.iterator, 
raw.PageIteratorLevel.TEXTLINE), call(self.iterator, raw.PageIteratorLevel.WORD), call(self.iterator, raw.PageIteratorLevel.WORD), call(self.iterator, raw.PageIteratorLevel.WORD), ] ) self.assertEqual(raw.page_iterator_is_at_final_element.call_count, 4) raw.page_iterator_is_at_final_element.assert_called_with( self.iterator, raw.PageIteratorLevel.TEXTLINE, raw.PageIteratorLevel.WORD ) self.assertEqual(raw.result_iterator_get_utf8_text.call_count, 4) raw.result_iterator_get_utf8_text.assert_called_with( self.iterator, raw.PageIteratorLevel.WORD) self.assertEqual(raw.result_iterator_get_confidence.call_count, 4) raw.result_iterator_get_confidence.assert_called_with( self.iterator, raw.PageIteratorLevel.WORD) self.assertEqual(raw.page_iterator_next.call_count, 4) raw.page_iterator_next.assert_called_with( self.iterator, raw.PageIteratorLevel.WORD) raw.cleanup.assert_called_once_with(self.handle) @patch("pyocr.libtesseract.tesseract_raw") def test_text_error(self, raw): raw.init.return_value = self.handle raw.get_iterator.return_value = None raw.result_iterator_get_page_iterator.return_value = self.iterator raw.get_available_languages.return_value = ["eng", "fra", "jpn", "osd"] raw.page_iterator_next.side_effect = (True, True, False) raw.page_iterator_bounding_box.return_value = (True, (0, 0, 0, 0)) raw.result_iterator_get_utf8_text.side_effect = ("word1", "word2", "word3") raw.page_iterator_is_at_beginning_of.side_effect = (True, False, False) raw.page_iterator_is_at_final_element.side_effect = (False, False, True) with self.assertRaises(TesseractError) as te: libtesseract.image_to_string(self.image, builder=self.builder) self.assertEqual(te.exception.status, "no script") self.assertEqual(te.exception.message, "no script detected") raw.init.assert_called_once_with(lang=None) raw.get_available_languages.assert_called_once_with(self.handle) raw.set_page_seg_mode.assert_called_once_with( self.handle, self.builder.tesseract_layout) 
raw.set_debug_file.assert_called_once_with(self.handle, os.devnull) raw.set_image.assert_called_once_with(self.handle, self.image) self.assertFalse(raw.set_is_numeric.called) raw.recognize.assert_called_once_with(self.handle) raw.get_iterator.assert_called_once_with(self.handle) class TestLibTesseractDigits(BaseTest): @patch("pyocr.tesseract.get_version") def setUp(self, get_version): super().setUp() get_version.return_value = (4, 0, 0) self.builder = builders.DigitBuilder() self.image = Image.new(mode="RGB", size=(1, 1)) self.handle = randint(0, 2**32-1) self.iterator = randint(0, 2**32-1) @patch("pyocr.libtesseract.tesseract_raw") def test_digits(self, raw): raw.init.return_value = self.handle raw.get_iterator.return_value = self.iterator raw.result_iterator_get_page_iterator.return_value = self.iterator raw.get_available_languages.return_value = ["eng", "fra", "jpn", "osd"] raw.page_iterator_next.side_effect = (True, True, False) raw.page_iterator_bounding_box.return_value = (True, (0, 0, 0, 0)) raw.result_iterator_get_utf8_text.side_effect = ("1", "2", "42") raw.page_iterator_is_at_beginning_of.side_effect = (True, False, False) raw.page_iterator_is_at_final_element.side_effect = (False, False, True) self.assertEqual( libtesseract.image_to_string(self.image, builder=self.builder), "1 2 42" ) raw.init.assert_called_once_with(lang=None) raw.get_available_languages.assert_called_once_with(self.handle) raw.set_page_seg_mode.assert_called_once_with( self.handle, self.builder.tesseract_layout) raw.set_debug_file.assert_called_once_with(self.handle, os.devnull) raw.set_image.assert_called_once_with(self.handle, self.image) raw.set_is_numeric.assert_called_once_with(self.handle, True) raw.recognize.assert_called_once_with(self.handle) raw.get_iterator.assert_called_once_with(self.handle) raw.result_iterator_get_page_iterator.assert_called_once_with( self.iterator ) self.assertEqual(raw.page_iterator_is_at_beginning_of.call_count, 3) 
raw.page_iterator_is_at_beginning_of.assert_called_with( self.iterator, raw.PageIteratorLevel.TEXTLINE) # called first at beginning and three times for each word self.assertEqual(raw.page_iterator_bounding_box.call_count, 4) self.assertListEqual( raw.page_iterator_bounding_box.call_args_list, [ call(self.iterator, raw.PageIteratorLevel.TEXTLINE), call(self.iterator, raw.PageIteratorLevel.WORD), call(self.iterator, raw.PageIteratorLevel.WORD), call(self.iterator, raw.PageIteratorLevel.WORD), ] ) self.assertEqual(raw.page_iterator_is_at_final_element.call_count, 3) raw.page_iterator_is_at_final_element.assert_called_with( self.iterator, raw.PageIteratorLevel.TEXTLINE, raw.PageIteratorLevel.WORD ) self.assertEqual(raw.result_iterator_get_utf8_text.call_count, 3) raw.result_iterator_get_utf8_text.assert_called_with( self.iterator, raw.PageIteratorLevel.WORD) self.assertEqual(raw.result_iterator_get_confidence.call_count, 3) raw.result_iterator_get_confidence.assert_called_with( self.iterator, raw.PageIteratorLevel.WORD) self.assertEqual(raw.page_iterator_next.call_count, 3) raw.page_iterator_next.assert_called_with( self.iterator, raw.PageIteratorLevel.WORD) raw.cleanup.assert_called_once_with(self.handle) class TestLibTesseractWordBox(BaseTest): @patch("pyocr.tesseract.get_version") def setUp(self, get_version): super().setUp() get_version.return_value = (4, 0, 0) self.builder = builders.WordBoxBuilder() self.image = Image.new("RGB", size=(1, 1)) self.handle = randint(0, 2**32-1) self.iterator = randint(0, 2**32-1) @patch("pyocr.libtesseract.tesseract_raw") def test_word(self, raw): raw.init.return_value = self.handle raw.get_iterator.return_value = self.iterator raw.result_iterator_get_page_iterator.return_value = self.iterator raw.get_available_languages.return_value = ["eng", "fra", "jpn", "osd"] raw.page_iterator_next.side_effect = (True, True, False) raw.page_iterator_bounding_box.return_value = (True, (0, 0, 0, 0)) raw.result_iterator_get_utf8_text.side_effect = 
("word1", "word2", "word3") raw.page_iterator_is_at_beginning_of.side_effect = (True, False, False) raw.page_iterator_is_at_final_element.side_effect = (False, False, True) self.assertListEqual( libtesseract.image_to_string(self.image, builder=self.builder), [ builders.Box("word1", ((0, 0), (0, 0))), builders.Box("word2", ((0, 0), (0, 0))), builders.Box("word3", ((0, 0), (0, 0))), ] ) raw.init.assert_called_once_with(lang=None) raw.get_available_languages.assert_called_once_with(self.handle) raw.set_page_seg_mode.assert_called_once_with( self.handle, self.builder.tesseract_layout) raw.set_debug_file.assert_called_once_with(self.handle, os.devnull) raw.set_image.assert_called_once_with(self.handle, self.image) self.assertFalse(raw.set_is_numeric.called) raw.recognize.assert_called_once_with(self.handle) raw.get_iterator.assert_called_once_with(self.handle) raw.result_iterator_get_page_iterator.assert_called_once_with( self.iterator ) self.assertEqual(raw.page_iterator_is_at_beginning_of.call_count, 3) raw.page_iterator_is_at_beginning_of.assert_called_with( self.iterator, raw.PageIteratorLevel.TEXTLINE) # called first at beginning and three times for each word self.assertEqual(raw.page_iterator_bounding_box.call_count, 4) self.assertListEqual( raw.page_iterator_bounding_box.call_args_list, [ call(self.iterator, raw.PageIteratorLevel.TEXTLINE), call(self.iterator, raw.PageIteratorLevel.WORD), call(self.iterator, raw.PageIteratorLevel.WORD), call(self.iterator, raw.PageIteratorLevel.WORD), ] ) self.assertEqual(raw.page_iterator_is_at_final_element.call_count, 3) raw.page_iterator_is_at_final_element.assert_called_with( self.iterator, raw.PageIteratorLevel.TEXTLINE, raw.PageIteratorLevel.WORD ) self.assertEqual(raw.result_iterator_get_utf8_text.call_count, 3) raw.result_iterator_get_utf8_text.assert_called_with( self.iterator, raw.PageIteratorLevel.WORD) self.assertEqual(raw.result_iterator_get_confidence.call_count, 3) 
raw.result_iterator_get_confidence.assert_called_with( self.iterator, raw.PageIteratorLevel.WORD) self.assertEqual(raw.page_iterator_next.call_count, 3) raw.page_iterator_next.assert_called_with( self.iterator, raw.PageIteratorLevel.WORD) raw.cleanup.assert_called_once_with(self.handle) @patch("pyocr.libtesseract.tesseract_raw") def test_word_error(self, raw): raw.init.return_value = self.handle raw.get_iterator.return_value = None raw.result_iterator_get_page_iterator.return_value = self.iterator raw.get_available_languages.return_value = ["eng", "fra", "jpn", "osd"] raw.page_iterator_next.side_effect = (True, True, False) raw.page_iterator_bounding_box.return_value = (True, (0, 0, 0, 0)) raw.result_iterator_get_utf8_text.side_effect = ("word1", "word2", "word3") raw.page_iterator_is_at_beginning_of.side_effect = (True, False, False) raw.page_iterator_is_at_final_element.side_effect = (False, False, True) with self.assertRaises(TesseractError) as te: libtesseract.image_to_string(self.image, builder=self.builder) self.assertEqual(te.exception.status, "no script") self.assertEqual(te.exception.message, "no script detected") raw.init.assert_called_once_with(lang=None) raw.get_available_languages.assert_called_once_with(self.handle) raw.set_page_seg_mode.assert_called_once_with( self.handle, self.builder.tesseract_layout) raw.set_debug_file.assert_called_once_with(self.handle, os.devnull) raw.set_image.assert_called_once_with(self.handle, self.image) self.assertFalse(raw.set_is_numeric.called) raw.recognize.assert_called_once_with(self.handle) raw.get_iterator.assert_called_once_with(self.handle) class TestLibTesseractLineBox(BaseTest): @patch("pyocr.tesseract.get_version") def setUp(self, get_version): super().setUp() get_version.return_value = (4, 0, 0) self.image = Image.new(mode="RGB", size=(1, 1)) self.builder = builders.LineBoxBuilder() self.handle = randint(0, 2**32-1) self.iterator = randint(0, 2**32-1) @patch("pyocr.libtesseract.tesseract_raw") def 
test_line(self, raw): raw.init.return_value = self.handle raw.get_iterator.return_value = self.iterator raw.result_iterator_get_page_iterator.return_value = self.iterator raw.get_available_languages.return_value = ["eng", "fra", "jpn", "osd"] raw.page_iterator_next.side_effect = (True, True, False) raw.page_iterator_bounding_box.return_value = (True, (0, 0, 0, 0)) raw.result_iterator_get_utf8_text.side_effect = ("word1", "word2", "word3") raw.page_iterator_is_at_beginning_of.side_effect = (True, False, False) raw.page_iterator_is_at_final_element.side_effect = (False, False, True) self.assertListEqual( libtesseract.image_to_string(self.image, builder=self.builder), [ builders.LineBox([ builders.Box("word1", ((0, 0), (0, 0))), builders.Box("word2", ((0, 0), (0, 0))), builders.Box("word3", ((0, 0), (0, 0))), ], ((0, 0), (0, 0))) ] ) raw.init.assert_called_once_with(lang=None) raw.get_available_languages.assert_called_once_with(self.handle) raw.set_page_seg_mode.assert_called_once_with( self.handle, self.builder.tesseract_layout) raw.set_debug_file.assert_called_once_with(self.handle, os.devnull) raw.set_image.assert_called_once_with(self.handle, self.image) self.assertFalse(raw.set_is_numeric.called) raw.recognize.assert_called_once_with(self.handle) raw.get_iterator.assert_called_once_with(self.handle) raw.result_iterator_get_page_iterator.assert_called_once_with( self.iterator ) self.assertEqual(raw.page_iterator_is_at_beginning_of.call_count, 3) raw.page_iterator_is_at_beginning_of.assert_called_with( self.iterator, raw.PageIteratorLevel.TEXTLINE) # called first at beginning and three times for each word self.assertEqual(raw.page_iterator_bounding_box.call_count, 4) self.assertListEqual( raw.page_iterator_bounding_box.call_args_list, [ call(self.iterator, raw.PageIteratorLevel.TEXTLINE), call(self.iterator, raw.PageIteratorLevel.WORD), call(self.iterator, raw.PageIteratorLevel.WORD), call(self.iterator, raw.PageIteratorLevel.WORD), ] ) 
self.assertEqual(raw.page_iterator_is_at_final_element.call_count, 3) raw.page_iterator_is_at_final_element.assert_called_with( self.iterator, raw.PageIteratorLevel.TEXTLINE, raw.PageIteratorLevel.WORD ) self.assertEqual(raw.result_iterator_get_utf8_text.call_count, 3) raw.result_iterator_get_utf8_text.assert_called_with( self.iterator, raw.PageIteratorLevel.WORD) self.assertEqual(raw.result_iterator_get_confidence.call_count, 3) raw.result_iterator_get_confidence.assert_called_with( self.iterator, raw.PageIteratorLevel.WORD) self.assertEqual(raw.page_iterator_next.call_count, 3) raw.page_iterator_next.assert_called_with( self.iterator, raw.PageIteratorLevel.WORD) raw.cleanup.assert_called_once_with(self.handle) @patch("pyocr.libtesseract.tesseract_raw") def test_line_error(self, raw): raw.init.return_value = self.handle raw.get_iterator.return_value = None raw.result_iterator_get_page_iterator.return_value = self.iterator raw.get_available_languages.return_value = ["eng", "fra", "jpn", "osd"] raw.page_iterator_next.side_effect = (True, True, False) raw.page_iterator_bounding_box.return_value = (True, (0, 0, 0, 0)) raw.result_iterator_get_utf8_text.side_effect = ("word1", "word2", "word3") raw.page_iterator_is_at_beginning_of.side_effect = (True, False, False) raw.page_iterator_is_at_final_element.side_effect = (False, False, True) with self.assertRaises(TesseractError) as te: libtesseract.image_to_string(self.image, builder=self.builder) self.assertEqual(te.exception.status, "no script") self.assertEqual(te.exception.message, "no script detected") raw.init.assert_called_once_with(lang=None) raw.get_available_languages.assert_called_once_with(self.handle) raw.set_page_seg_mode.assert_called_once_with( self.handle, self.builder.tesseract_layout) raw.set_debug_file.assert_called_once_with(self.handle, os.devnull) raw.set_image.assert_called_once_with(self.handle, self.image) self.assertFalse(raw.set_is_numeric.called) 
raw.recognize.assert_called_once_with(self.handle) raw.get_iterator.assert_called_once_with(self.handle) class TestLibTesseractDigitsLineBox(BaseTest): @patch("pyocr.tesseract.get_version") def setUp(self, get_version): super().setUp() get_version.return_value = (4, 0, 0) self.image = Image.new(mode="RGB", size=(1, 1)) self.builder = builders.DigitLineBoxBuilder() self.handle = randint(0, 2**32-1) self.iterator = randint(0, 2**32-1) @patch("pyocr.libtesseract.tesseract_raw") def test_line(self, raw): raw.init.return_value = self.handle raw.get_iterator.return_value = self.iterator raw.result_iterator_get_page_iterator.return_value = self.iterator raw.get_available_languages.return_value = ["eng", "fra", "jpn", "osd"] raw.page_iterator_next.side_effect = (True, True, False) raw.page_iterator_bounding_box.return_value = (True, (0, 0, 0, 0)) raw.result_iterator_get_utf8_text.side_effect = ("1", "2", "42") raw.page_iterator_is_at_beginning_of.side_effect = (True, False, False) raw.page_iterator_is_at_final_element.side_effect = (False, False, True) self.assertListEqual( libtesseract.image_to_string(self.image, builder=self.builder), [ builders.LineBox([ builders.Box("1", ((0, 0), (0, 0))), builders.Box("2", ((0, 0), (0, 0))), builders.Box("42", ((0, 0), (0, 0))), ], ((0, 0), (0, 0))) ] ) raw.init.assert_called_once_with(lang=None) raw.get_available_languages.assert_called_once_with(self.handle) raw.set_page_seg_mode.assert_called_once_with( self.handle, self.builder.tesseract_layout) raw.set_debug_file.assert_called_once_with(self.handle, os.devnull) raw.set_image.assert_called_once_with(self.handle, self.image) raw.set_is_numeric.assert_called_once_with(self.handle, True) raw.recognize.assert_called_once_with(self.handle) raw.get_iterator.assert_called_once_with(self.handle) raw.result_iterator_get_page_iterator.assert_called_once_with( self.iterator ) self.assertEqual(raw.page_iterator_is_at_beginning_of.call_count, 3) 
raw.page_iterator_is_at_beginning_of.assert_called_with( self.iterator, raw.PageIteratorLevel.TEXTLINE) # called first at beginning and three times for each word self.assertEqual(raw.page_iterator_bounding_box.call_count, 4) self.assertListEqual( raw.page_iterator_bounding_box.call_args_list, [ call(self.iterator, raw.PageIteratorLevel.TEXTLINE), call(self.iterator, raw.PageIteratorLevel.WORD), call(self.iterator, raw.PageIteratorLevel.WORD), call(self.iterator, raw.PageIteratorLevel.WORD), ] ) self.assertEqual(raw.page_iterator_is_at_final_element.call_count, 3) raw.page_iterator_is_at_final_element.assert_called_with( self.iterator, raw.PageIteratorLevel.TEXTLINE, raw.PageIteratorLevel.WORD ) self.assertEqual(raw.result_iterator_get_utf8_text.call_count, 3) raw.result_iterator_get_utf8_text.assert_called_with( self.iterator, raw.PageIteratorLevel.WORD) self.assertEqual(raw.result_iterator_get_confidence.call_count, 3) raw.result_iterator_get_confidence.assert_called_with( self.iterator, raw.PageIteratorLevel.WORD) self.assertEqual(raw.page_iterator_next.call_count, 3) raw.page_iterator_next.assert_called_with( self.iterator, raw.PageIteratorLevel.WORD) raw.cleanup.assert_called_once_with(self.handle) @patch("pyocr.libtesseract.tesseract_raw") def test_line_error(self, raw): raw.init.return_value = self.handle raw.get_iterator.return_value = None raw.result_iterator_get_page_iterator.return_value = self.iterator raw.get_available_languages.return_value = ["eng", "fra", "jpn", "osd"] raw.page_iterator_next.side_effect = (True, True, False) raw.page_iterator_bounding_box.return_value = (True, (0, 0, 0, 0)) raw.result_iterator_get_utf8_text.side_effect = ("1", "2", "42") raw.page_iterator_is_at_beginning_of.side_effect = (True, False, False) raw.page_iterator_is_at_final_element.side_effect = (False, False, True) with self.assertRaises(TesseractError) as te: libtesseract.image_to_string(self.image, builder=self.builder) self.assertEqual(te.exception.status, "no 
script") self.assertEqual(te.exception.message, "no script detected") raw.init.assert_called_once_with(lang=None) raw.get_available_languages.assert_called_once_with(self.handle) raw.set_page_seg_mode.assert_called_once_with( self.handle, self.builder.tesseract_layout) raw.set_debug_file.assert_called_once_with(self.handle, os.devnull) raw.set_image.assert_called_once_with(self.handle, self.image) raw.set_is_numeric.assert_called_once_with(self.handle, True) raw.recognize.assert_called_once_with(self.handle) raw.get_iterator.assert_called_once_with(self.handle) class TestLibTesseractPDF(BaseTest): def setUp(self): super().setUp() self.image = Image.new(mode="RGB", size=(1, 1)) self.handle = 1234567 @patch("pyocr.libtesseract.tesseract_raw") def test_pdf(self, raw): renderer = 2345671 raw.init.return_value = self.handle raw.init_pdf_renderer.return_value = renderer libtesseract.image_to_pdf(self.image, "output") raw.init.assert_called_once_with(lang=None) raw.set_image.assert_called_once_with(self.handle, self.image) raw.set_page_seg_mode.assert_called_once_with( self.handle, raw.PageSegMode.AUTO_OSD ) raw.recognize.assert_called_once_with(self.handle) raw.init_pdf_renderer.assert_called_once_with( self.handle, "output", False ) raw.begin_document.assert_called_once_with(renderer, "") raw.add_renderer_image.assert_called_once_with(self.handle, renderer) raw.end_document.assert_called_once_with(renderer) self.assertListEqual( raw.cleanup.call_args_list, [call(self.handle), call(renderer)] ) @patch("pyocr.libtesseract.tesseract_raw") def test_multipage_pdf(self, raw): renderer = 2345671 raw.init.return_value = self.handle raw.init_pdf_renderer.return_value = renderer libtesseract.LibtesseractPdfBuilder() \ .set_output_file("output")\ .add_image(self.image)\ .add_image(self.image)\ .build() raw.init.assert_called_once_with(lang=None) raw.set_image.assert_called_with(self.handle, self.image) raw.set_image.assert_called_with(self.handle, self.image) 
class TestTesseract(BaseTest):
    """
    These tests make sure the requirements for the tests are met.

    They exercise availability and version detection, language listing,
    orientation detection and run_tesseract() of ``pyocr.tesseract``.
    Process creation (``subprocess.Popen``) or the helpers built on top of
    it are patched in the tests that need them.
    """

    # Canned output of an orientation detection run that succeeded.
    _OSD_OUTPUT = (
        b"Page number: 0\n"
        b"Orientation in degrees: 90\n"
        b"Rotate: 270\n"
        b"Orientation confidence: 9.30\n"
        b"Script: Latin\n"
        b"Script confidence: 8.06\n"
    )
    # Same layout, but the orientation fields are not numbers.
    _OSD_BAD_OUTPUT = (
        b"Page number: 0\n"
        b"Orientation in degrees: ABC\n"
        b"Rotate: 270\n"
        b"Orientation confidence: AB.CD\n"
        b"Script: Latin\n"
        b"Script confidence: 8.06\n"
    )
    # Output used by the "cannot initialize" tests.
    _OSD_INIT_ERROR = b"Could not initialize tesseract\n"

    def setUp(self):
        super().setUp()
        # Stand-in for the object returned by subprocess.Popen().
        self.stdout = MagicMock()
        self.image = Image.new(mode="RGB", size=(1, 1))
        self.message = (
            b"tesseract 4.0.0\n leptonica-1.76.0\n"
            b" libgif 5.1.4 : libjpeg 6b (libjpeg-turbo 1.5.2)"
            b" : libpng 1.6.34 "
            b": libtiff 4.0.9 : zlib 1.2.11 : libwebp 0.6.1"
            b" : libopenjp2 2.3.0\n"
            b" Found AVX2\n Found AVX\n Found SSE\n"
        )
        self.stdout.stdout.read.return_value = self.message
        self.stdout.wait.return_value = 0

    # -- helpers ----------------------------------------------------------

    def _use_version_line(self, popen, version_line):
        """
        Drop the cached version and make the mocked process print
        `version_line` in place of the default "tesseract 4.0.0".
        """
        tesseract.g_version = None  # drop cached version
        self.stdout.stdout.read.return_value = self.message.replace(
            b"tesseract 4.0.0", version_line
        )
        popen.return_value = self.stdout

    def _assert_version(self, popen, version_line, expected):
        """Check that `version_line` is parsed as the tuple `expected`."""
        self._use_version_line(popen, version_line)
        self.assertSequenceEqual(tesseract.get_version(), expected)

    def _assert_version_unparsable(self, popen, version_line, expected_msg):
        """Check that `version_line` makes get_version() raise."""
        self._use_version_line(popen, version_line)
        with self.assertRaises(tesseract.TesseractError) as te:
            tesseract.get_version()
        self.assertEqual(te.exception.status, 0)
        self.assertIn(expected_msg, te.exception.message)

    def _assert_list_langs_call(self, popen):
        """Check the single Popen call made to list the languages."""
        popen.assert_called_once_with(
            ["tesseract", "--list-langs"],
            startupinfo=None, creationflags=0,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT
        )

    def _detect_orientation(self, popen, temp_dir, output, image=None,
                            raises=None, **kwargs):
        """
        Call tesseract.detect_orientation() with a mocked process.

        Arguments:
            popen: mock replacing subprocess.Popen
            temp_dir: mock replacing tempfile.TemporaryDirectory
            output: bytes the mocked process writes on its output
            image: image to analyze (defaults to self.image)
            raises: exception class expected, or None if the call must
                succeed
            kwargs: forwarded untouched to detect_orientation()

        Returns:
            (outcome, tmpdir): `outcome` is the value returned by
            detect_orientation(), or the assertRaises() context when
            `raises` is given. `tmpdir` is the path of the directory
            handed to the code under test.
        """
        if image is None:
            image = self.image
        self.stdout.stdout.read.return_value = output
        popen.return_value = self.stdout
        # tempfile.TemporaryDirectory is mocked: create a real directory
        # here and make the mocked context manager return its path.
        with TemporaryDirectory(prefix="tess_") as tmpdir:
            enter = MagicMock()
            enter.__enter__.return_value = tmpdir
            temp_dir.return_value = enter
            if raises is None:
                outcome = tesseract.detect_orientation(image, **kwargs)
            else:
                with self.assertRaises(raises) as outcome:
                    tesseract.detect_orientation(image, **kwargs)
        return outcome, tmpdir

    def _assert_orientation_call(self, popen, tmpdir, extra_args):
        """Check the single Popen call made to detect the orientation."""
        popen.assert_called_once_with(
            ["tesseract", "input.bmp", "stdout"] + extra_args,
            stdin=subprocess.PIPE,
            shell=False,
            startupinfo=None,
            creationflags=0,
            cwd=tmpdir,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT
        )

    def _check_orientation(self, popen, temp_dir, extra_args, image=None,
                           **kwargs):
        """Successful detection: check the result and the command line."""
        result, tmpdir = self._detect_orientation(
            popen, temp_dir, self._OSD_OUTPUT, image=image, **kwargs
        )
        self.assertEqual(result["angle"], 90)
        self.assertEqual(result["confidence"], 9.30)
        self._assert_orientation_call(popen, tmpdir, extra_args)

    def _check_orientation_error(self, popen, temp_dir, output, extra_args,
                                 expected_msg):
        """Failed detection: check the command line and the error."""
        te, tmpdir = self._detect_orientation(
            popen, temp_dir, output, raises=tesseract.TesseractError
        )
        self._assert_orientation_call(popen, tmpdir, extra_args)
        self.assertEqual(te.exception.status, -1)
        self.assertIn(expected_msg, te.exception.message)

    # -- availability and version -----------------------------------------

    @patch("shutil.which")
    def test_available(self, which):
        which.return_value = True
        self.assertTrue(tesseract.is_available())
        which.assert_called_once_with("tesseract")

    @patch("subprocess.Popen")
    def test_version_error(self, popen):
        tesseract.g_version = None  # drop cached version
        self.stdout.wait.return_value = 2
        popen.return_value = self.stdout
        with self.assertRaises(tesseract.TesseractError) as te:
            tesseract.get_version()
        self.assertEqual(te.exception.status, 2)
        self.assertEqual(te.exception.message, self.message.decode())

    @patch("subprocess.Popen")
    def test_version_tesseract4(self, popen):
        tesseract.g_version = None  # drop cached version
        popen.return_value = self.stdout
        self.assertSequenceEqual(tesseract.get_version(), (4, 0, 0))
        # stderr must be explicitly ignored when calling 'tesseract -v'.
        # See https://gitlab.gnome.org/World/OpenPaperwork/pyocr/-/issues/118
        popen.assert_called_once()
        self.assertNotIn('stderr', popen.call_args[1])

    @patch("subprocess.Popen")
    def test_version_tesseract4dev(self, popen):
        self._assert_version(popen, b"tesseract 4.00.00dev2", (4, 0, 0))

    @patch("subprocess.Popen")
    def test_version_tesseract4alpha(self, popen):
        self._assert_version(popen, b"tesseract 4.00.00alpha", (4, 0, 0))

    @patch("subprocess.Popen")
    def test_version_tesseract3(self, popen):
        self._assert_version(popen, b"tesseract 3.05", (3, 5, 0))

    @patch("subprocess.Popen")
    def test_version_tesseract3_no_minor(self, popen):
        self._assert_version(popen, b"tesseract 3.0", (3, 0, 0))

    @patch("subprocess.Popen")
    def test_version_windows(self, popen):
        self._assert_version(
            popen, b"tesseract v4.0.0.20181030", (4, 0, 0)
        )

    @patch("subprocess.Popen")
    def test_version_cache(self, popen):
        """
        Make sure Tesseract is not called every time we need the version.
        We need the version *often* in the code, and calling Tesseract
        every time wouldn't be wise.
        """
        tesseract.g_version = None  # drop cached version
        self.stdout.stdout.read.return_value = self.message
        popen.return_value = self.stdout
        self.assertSequenceEqual(tesseract.get_version(), (4, 0, 0))
        # If the executable were started again, this output could not be
        # parsed as version 4.0.0.
        self.stdout.stdout.read.return_value = "garbage"
        popen.return_value = self.stdout
        self.assertSequenceEqual(tesseract.get_version(), (4, 0, 0))
        # The second answer must come from the cache, not a new process.
        popen.assert_called_once()

    @patch("subprocess.Popen")
    def test_version_error_splitting(self, popen):
        self._assert_version_unparsable(
            popen, b"tesseract 3",
            "Unable to parse Tesseract version (spliting failed): "
        )

    @patch("subprocess.Popen")
    def test_version_error_nan(self, popen):
        self._assert_version_unparsable(
            popen, b"tesseract A.B.C",
            "Unable to parse Tesseract version (not a number): "
        )

    # -- languages and capabilities ---------------------------------------

    @patch("subprocess.Popen")
    def test_langs(self, popen):
        self.stdout.stdout.read.return_value = (
            b"List of available languages (4):\n"
            b"eng\n"
            b"fra\n"
            b"jpn\n"
            b"osd\n"
        )
        popen.return_value = self.stdout
        langs = tesseract.get_available_languages()
        for lang in ("eng", "fra", "jpn", "osd"):
            self.assertIn(lang, langs)
        self._assert_list_langs_call(popen)

    @patch("subprocess.Popen")
    def test_langs_error(self, popen):
        self.stdout.stdout.read.return_value = b"No languages\n"
        self.stdout.wait.return_value = 1
        popen.return_value = self.stdout
        with self.assertRaises(tesseract.TesseractError) as te:
            tesseract.get_available_languages()
        self.assertEqual(te.exception.status, 1)
        self.assertEqual("unable to get languages", te.exception.message)
        self._assert_list_langs_call(popen)

    @patch("pyocr.tesseract.get_available_languages")
    @patch("pyocr.tesseract.get_version")
    def test_can_detect_orientation_tesseract4(self, get_version,
                                               get_available_languages):
        get_version.return_value = (4, 0, 0)
        get_available_languages.return_value = ['eng', 'fra', 'jpn', 'osd']
        self.assertTrue(tesseract.can_detect_orientation())
        # Same version, but the 'osd' language is not installed.
        get_available_languages.return_value = ['eng', 'fra', 'jpn']
        self.assertFalse(tesseract.can_detect_orientation())

    @patch("pyocr.tesseract.get_available_languages")
    @patch("pyocr.tesseract.get_version")
    def test_can_detect_orientation_tesseract3(
        self, get_version, get_available_languages
    ):
        get_available_languages.return_value = ['eng', 'fra', 'jpn', 'osd']
        get_version.return_value = (3, 3, 0)
        self.assertTrue(tesseract.can_detect_orientation())

    @patch("pyocr.tesseract.get_available_languages")
    @patch("pyocr.tesseract.get_version")
    def test_cannot_detect_orientation_tesseract3(
        self, get_version, get_available_languages
    ):
        get_available_languages.return_value = ['eng', 'fra', 'jpn', 'osd']
        get_version.return_value = (3, 2, 1)
        self.assertFalse(tesseract.can_detect_orientation())

    def test_name(self):
        self.assertEqual(tesseract.get_name(), "Tesseract (sh)")

    @patch("pyocr.tesseract.get_version")
    def test_psm_parameter(self, get_version):
        get_version.return_value = (3, 5, 0)
        self.assertEqual(tesseract.psm_parameter(), "-psm")
        get_version.return_value = (4, 0, 0)
        self.assertEqual(tesseract.psm_parameter(), "--psm")

    def test_available_builders(self):
        self.assertListEqual(
            tesseract.get_available_builders(),
            [
                builders.LineBoxBuilder,
                builders.TextBuilder,
                builders.WordBoxBuilder,
                tesseract.CharBoxBuilder,
                builders.DigitBuilder,
                builders.DigitLineBoxBuilder,
            ]
        )

    # -- run_tesseract ----------------------------------------------------

    @patch("pyocr.tesseract.get_version")
    @patch("subprocess.Popen")
    def test_run_tesseract(self, popen, get_version):
        message = (
            b"Tesseract Open Source OCR Engine v4.0.0 with Leptonica\n"
        )
        self.stdout.stdout.read.return_value = message
        popen.return_value = self.stdout

        # First run: mandatory arguments only.
        with TemporaryDirectory() as tmpdir:
            self.image.save(os.path.join(tmpdir, "input.bmp"))
            status, error = tesseract.run_tesseract(
                "input.bmp",
                "output",
                cwd=tmpdir,
            )
            self.assertEqual(status, 0)
            self.assertEqual(error, message)
            popen.assert_called_once_with(
                ["tesseract", "input.bmp", "output"],
                cwd=tmpdir,
                startupinfo=None,
                creationflags=0,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT
            )

        # Second run: language plus the flags and configs of a builder.
        # The version is set before the builder is instantiated.
        get_version.return_value = (4, 0, 0)
        builder = builders.TextBuilder()
        with TemporaryDirectory() as tmpdir:
            self.image.save(os.path.join(tmpdir, "input2.bmp"))
            status, error = tesseract.run_tesseract(
                "input2.bmp",
                "output2",
                cwd=tmpdir,
                lang="fra",
                flags=builder.tesseract_flags,
                configs=builder.tesseract_configs,
            )
            self.assertEqual(status, 0)
            self.assertEqual(error, message)
            popen.assert_called_with(
                ["tesseract", "input2.bmp", "output2",
                 "-l", "fra", "--psm", "3"],
                cwd=tmpdir,
                startupinfo=None,
                creationflags=0,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT
            )
        self.assertEqual(popen.call_count, 2)

    # -- detect_orientation, Tesseract 4 ----------------------------------

    @patch("pyocr.tesseract.get_version")
    @patch("tempfile.TemporaryDirectory")
    @patch("subprocess.Popen")
    def test_detect_orientation_tesseract4(self, popen, temp_dir,
                                           get_version):
        get_version.return_value = (4, 0, 0)
        self._check_orientation(popen, temp_dir, ["--psm", "0"])

    @patch("pyocr.tesseract.get_version")
    @patch("tempfile.TemporaryDirectory")
    @patch("subprocess.Popen")
    def test_detect_orientation_tesseract4_non_rgb_image(self, popen,
                                                         temp_dir,
                                                         get_version):
        """This tests that detect_orientation works with non RGB mode
        images and that image is converted in function."""
        image = self.image.convert("L")
        get_version.return_value = (4, 0, 0)
        self._check_orientation(
            popen, temp_dir, ["--psm", "0"], image=image
        )

    @patch("pyocr.tesseract.get_version")
    @patch("tempfile.TemporaryDirectory")
    @patch("subprocess.Popen")
    def test_detect_orientation_tesseract4_with_lang(self, popen, temp_dir,
                                                     get_version):
        get_version.return_value = (4, 0, 0)
        # "fra" is requested, but the expected command line uses "osd".
        self._check_orientation(
            popen, temp_dir, ["--psm", "0", "-l", "osd"], lang="fra"
        )

    @patch("pyocr.tesseract.get_version")
    @patch("tempfile.TemporaryDirectory")
    @patch("subprocess.Popen")
    def test_detect_orientation_tesseract4_error(self, popen, temp_dir,
                                                 get_version):
        get_version.return_value = (4, 0, 0)
        self._check_orientation_error(
            popen, temp_dir, self._OSD_INIT_ERROR, ["--psm", "0"],
            "Error initializing tesseract"
        )

    @patch("pyocr.tesseract.get_version")
    @patch("tempfile.TemporaryDirectory")
    @patch("subprocess.Popen")
    def test_detect_orientation_tesseract4_bad_output(self, popen, temp_dir,
                                                      get_version):
        get_version.return_value = (4, 0, 0)
        self._check_orientation_error(
            popen, temp_dir, self._OSD_BAD_OUTPUT, ["--psm", "0"],
            "No script found in image"
        )

    # -- detect_orientation, Tesseract 3 ----------------------------------

    @patch("pyocr.tesseract.get_version")
    @patch("tempfile.TemporaryDirectory")
    @patch("subprocess.Popen")
    def test_detect_orientation_tesseract3(self, popen, temp_dir,
                                           get_version):
        get_version.return_value = (3, 5, 0)
        self._check_orientation(popen, temp_dir, ["-psm", "0"])

    @patch("pyocr.tesseract.get_version")
    @patch("tempfile.TemporaryDirectory")
    @patch("subprocess.Popen")
    def test_detect_orientation_tesseract3_with_lang(self, popen, temp_dir,
                                                     get_version):
        get_version.return_value = (3, 5, 0)
        self._check_orientation(
            popen, temp_dir, ["-psm", "0", "-l", "fra"], lang="fra"
        )

    @patch("pyocr.tesseract.get_version")
    @patch("tempfile.TemporaryDirectory")
    @patch("subprocess.Popen")
    def test_detect_orientation_tesseract3_error(self, popen, temp_dir,
                                                 get_version):
        get_version.return_value = (3, 5, 0)
        self._check_orientation_error(
            popen, temp_dir, self._OSD_INIT_ERROR, ["-psm", "0"],
            "Error initializing tesseract"
        )

    @patch("pyocr.tesseract.get_version")
    @patch("tempfile.TemporaryDirectory")
    @patch("subprocess.Popen")
    def test_detect_orientation_tesseract3_bad_output(self, popen, temp_dir,
                                                      get_version):
        get_version.return_value = (3, 5, 0)
        self._check_orientation_error(
            popen, temp_dir, self._OSD_BAD_OUTPUT, ["-psm", "0"],
            "No script found in image"
        )
@patch("pyocr.tesseract.get_version")
@patch("tempfile.TemporaryDirectory")
@patch("codecs.open")
@patch("pyocr.tesseract.run_tesseract")
def test_image_to_string_defaults_to_text_buidler(self, run_tesseract,
                                                  copen, temp_dir,
                                                  get_version):
    """
    image_to_string() called without a builder must run Tesseract with
    the same flags and configs as the TextBuilder created in setUp().
    """
    get_version.return_value = (4, 0, 0)
    run_tesseract.return_value = (0, "")
    copen.return_value = StringIO(self._get_file_content("text"))

    with TemporaryDirectory(prefix="tess_") as workdir:
        # tempfile.TemporaryDirectory is mocked: make entering its
        # context manager return the path of this real directory.
        temp_dir.return_value.__enter__.return_value = workdir
        text = tesseract.image_to_string(self.image)

    self.assertEqual(text, self._get_file_content("text").strip())
    expected_options = {
        "cwd": workdir,
        "lang": None,
        "flags": self.builder.tesseract_flags,
        "configs": self.builder.tesseract_configs,
    }
    run_tesseract.assert_called_once_with(
        "input.bmp", "output", **expected_options
    )
    # Nominal case: run_tesseract() reports success and the patched
    # codecs.open() serves the "text" fixture; the result must be that text,
    # stripped, and run_tesseract() must be called once with the builder's
    # flags/configs and lang=None.
    @patch("tempfile.TemporaryDirectory")
    @patch("codecs.open")
    @patch("pyocr.tesseract.run_tesseract")
    def test_text(self, run_tesseract, copen, temp_dir):
        run_tesseract.return_value = (0, "")
        copen.return_value = StringIO(self._get_file_content("text"))
        with TemporaryDirectory(prefix="tess_") as tmpdir:
            # Make the patched tempfile.TemporaryDirectory() hand back the
            # real tmpdir, so the cwd given to run_tesseract can be asserted.
            enter = MagicMock()
            enter.__enter__.return_value = tmpdir
            temp_dir.return_value = enter
            result = tesseract.image_to_string(self.image,
                                               builder=self.builder)
            self.assertEqual(result, self._get_file_content("text").strip())
            run_tesseract.assert_called_once_with(
                "input.bmp",
                "output",
                cwd=tmpdir,
                lang=None,
                flags=self.builder.tesseract_flags,
                configs=self.builder.tesseract_configs,
            )

    @patch("tempfile.TemporaryDirectory")
    @patch("codecs.open")
    @patch("pyocr.tesseract.run_tesseract")
    def test_text_non_rgb_image(self, run_tesseract, copen, temp_dir):
        """This tests that image_to_string works with non RGB mode images and
        that image is converted in function."""
        # Mode "L" (greyscale) copy of the RGB test image.
        image = self.image.convert("L")
        run_tesseract.return_value = (0, "")
        copen.return_value = StringIO(self._get_file_content("text"))
        with TemporaryDirectory(prefix="tess_") as tmpdir:
            enter = MagicMock()
            enter.__enter__.return_value = tmpdir
            temp_dir.return_value = enter
            result = tesseract.image_to_string(image,
                                               builder=self.builder)
            self.assertEqual(result, self._get_file_content("text").strip())
            run_tesseract.assert_called_once_with(
                "input.bmp",
                "output",
                cwd=tmpdir,
                lang=None,
                flags=self.builder.tesseract_flags,
                configs=self.builder.tesseract_configs,
            )

    # A non-zero status from run_tesseract() must surface as a
    # TesseractError carrying that status (1) and message ("Error").
    @patch("tempfile.TemporaryDirectory")
    @patch("codecs.open")
    @patch("pyocr.tesseract.run_tesseract")
    def test_text_error(self, run_tesseract, copen, temp_dir):
        run_tesseract.return_value = (1, "Error")
        copen.return_value = StringIO(self._get_file_content("text"))
        with TemporaryDirectory(prefix="tess_") as tmpdir:
            enter = MagicMock()
            enter.__enter__.return_value = tmpdir
            temp_dir.return_value = enter
            with self.assertRaises(tesseract.TesseractError) as te:
                tesseract.image_to_string(self.image, builder=self.builder)
            self.assertEqual(te.exception.status, 1)
            self.assertEqual(te.exception.message, "Error")
            run_tesseract.assert_called_once_with(
                "input.bmp",
                "output",
                cwd=tmpdir,
                lang=None,
                flags=self.builder.tesseract_flags,
                configs=self.builder.tesseract_configs,
            )

    # An arbitrary exception raised while opening the output file must
    # propagate out of image_to_string().
    @patch("tempfile.TemporaryDirectory")
    @patch("codecs.open")
    @patch("pyocr.tesseract.run_tesseract")
    def test_text_error_file(self, run_tesseract, copen, temp_dir):
        run_tesseract.return_value = (0, "")
        copen.side_effect = Exception("Unknown error")
        with TemporaryDirectory(prefix="tess_") as tmpdir:
            enter = MagicMock()
            enter.__enter__.return_value = tmpdir
            temp_dir.return_value = enter
            with self.assertRaises(Exception):
                tesseract.image_to_string(self.image, builder=self.builder)
            run_tesseract.assert_called_once_with(
                "input.bmp",
                "output",
                cwd=tmpdir,
                lang=None,
                flags=self.builder.tesseract_flags,
                configs=self.builder.tesseract_configs,
            )

    # A PermissionError (EPERM) from codecs.open() must propagate as a
    # PermissionError, i.e. it is not converted into a TesseractError.
    @patch("tempfile.TemporaryDirectory")
    @patch("codecs.open")
    @patch("pyocr.tesseract.run_tesseract")
    def test_text_cannot_open_file(self, run_tesseract, copen, temp_dir):
        run_tesseract.return_value = (0, "")
        copen.side_effect = PermissionError(errno.EPERM, "Error opening file")
        with TemporaryDirectory(prefix="tess_") as tmpdir:
            enter = MagicMock()
            enter.__enter__.return_value = tmpdir
            temp_dir.return_value = enter
            with self.assertRaises(PermissionError):
                tesseract.image_to_string(self.image, builder=self.builder)
            run_tesseract.assert_called_once_with(
                "input.bmp",
                "output",
                cwd=tmpdir,
                lang=None,
                flags=self.builder.tesseract_flags,
                configs=self.builder.tesseract_configs,
            )

    # run_tesseract() succeeds but the output file cannot be found
    # (codecs.open raises FileNotFoundError/ENOENT): a TesseractError with
    # status -1 and an "Unable to find output file" message is expected.
    @patch("tempfile.TemporaryDirectory")
    @patch("codecs.open")
    @patch("pyocr.tesseract.run_tesseract")
    def test_text_no_output(self, run_tesseract, copen, temp_dir):
        run_tesseract.return_value = (0, "No file output")
        copen.side_effect = FileNotFoundError(
            errno.ENOENT, "[Errno 2] No such file or directory: 'output'"
        )
        with TemporaryDirectory(prefix="tess_") as tmpdir:
            enter = MagicMock()
            enter.__enter__.return_value = tmpdir
            temp_dir.return_value = enter
            with self.assertRaises(tesseract.TesseractError) as te:
                tesseract.image_to_string(self.image, builder=self.builder)
            self.assertEqual(te.exception.status, -1)
            self.assertIn("Unable to find output file (tested",
                          te.exception.message)
            run_tesseract.assert_called_once_with(
                "input.bmp",
                "output",
                cwd=tmpdir,
                lang=None,
                flags=self.builder.tesseract_flags,
                configs=self.builder.tesseract_configs,
            )


class TestTesseractCharBox(BaseTest):
    """
    These tests make sure that Tesseract box handling works fine.

    image_to_string() is driven with a tesseract.CharBoxBuilder while
    run_tesseract and codecs.open are mocked.
    """

    # Builds a 1x1 RGB image and a CharBoxBuilder shared by all tests.
    # NOTE(review): the get_version patch only lasts for the duration of
    # setUp -- confirm that is the intent.
    @patch("pyocr.tesseract.get_version")
    def setUp(self, get_version):
        super().setUp()
        get_version.return_value = (4, 0, 0)
        self.image = Image.new(mode="RGB", size=(1, 1))
        self.builder = tesseract.CharBoxBuilder()

    # Nominal case: every item parsed from the "boxes" fixture must be a
    # builders.Box.
    @patch("tempfile.TemporaryDirectory")
    @patch("codecs.open")
    @patch("pyocr.tesseract.run_tesseract")
    def test_char(self, run_tesseract, copen, temp_dir):
        run_tesseract.return_value = (0, "")
        copen.return_value = StringIO(self._get_file_content("boxes"))
        with TemporaryDirectory(prefix="tess_") as tmpdir:
            # Route the patched TemporaryDirectory() to the real tmpdir.
            enter = MagicMock()
            enter.__enter__.return_value = tmpdir
            temp_dir.return_value = enter
            result = tesseract.image_to_string(self.image,
                                               builder=self.builder)
            for box in result:
                self.assertIsInstance(box, builders.Box)
            run_tesseract.assert_called_once_with(
                "input.bmp",
                "output",
                cwd=tmpdir,
                lang=None,
                flags=self.builder.tesseract_flags,
                configs=self.builder.tesseract_configs,
            )

    # A non-zero status from run_tesseract() must surface as a
    # TesseractError with the same status and message.
    @patch("tempfile.TemporaryDirectory")
    @patch("codecs.open")
    @patch("pyocr.tesseract.run_tesseract")
    def test_char_error(self, run_tesseract, copen, temp_dir):
        run_tesseract.return_value = (1, "Error")
        copen.return_value = StringIO(self._get_file_content("boxes"))
        with TemporaryDirectory(prefix="tess_") as tmpdir:
            enter = MagicMock()
            enter.__enter__.return_value = tmpdir
            temp_dir.return_value = enter
            with self.assertRaises(tesseract.TesseractError) as te:
                tesseract.image_to_string(self.image, builder=self.builder)
            self.assertEqual(te.exception.status, 1)
            self.assertEqual(te.exception.message, "Error")
            run_tesseract.assert_called_once_with(
                "input.bmp",
                "output",
                cwd=tmpdir,
                lang=None,
                flags=self.builder.tesseract_flags,
                configs=self.builder.tesseract_configs,
            )

    # Missing output file (FileNotFoundError from codecs.open) must become
    # a TesseractError with status -1.
    @patch("tempfile.TemporaryDirectory")
    @patch("codecs.open")
    @patch("pyocr.tesseract.run_tesseract")
    def test_char_no_output(self, run_tesseract, copen, temp_dir):
        run_tesseract.return_value = (0, "No file output")
        copen.side_effect = FileNotFoundError(
            errno.ENOENT, "[Errno 2] No such file or directory: 'output'"
        )
        with TemporaryDirectory(prefix="tess_") as tmpdir:
            enter = MagicMock()
            enter.__enter__.return_value = tmpdir
            temp_dir.return_value = enter
            with self.assertRaises(tesseract.TesseractError) as te:
                tesseract.image_to_string(self.image, builder=self.builder)
            self.assertEqual(te.exception.status, -1)
            self.assertIn("Unable to find output file (tested",
                          te.exception.message)
            run_tesseract.assert_called_once_with(
                "input.bmp",
                "output",
                cwd=tmpdir,
                lang=None,
                flags=self.builder.tesseract_flags,
                configs=self.builder.tesseract_configs,
            )


class TestCharBoxBuilder(BaseTest):
    """
    Direct tests of tesseract.CharBoxBuilder (defaults, read_file,
    write_file, str()), without going through image_to_string().
    """

    # Pins the attribute values a freshly constructed builder exposes.
    def test_init(self):
        builder = tesseract.CharBoxBuilder()
        self.assertListEqual(builder.file_extensions, ["box"])
        self.assertListEqual(builder.tesseract_flags, [])
        self.assertListEqual(
            builder.tesseract_configs, ["batch.nochop", "makebox"]
        )
        self.assertListEqual(builder.cuneiform_args, [])
        self.assertEqual(builder.tesseract_layout, 1)

    # Every entry read from the "boxes" fixture must be a builders.Box.
    # _get_file_handle comes from the base class (not shown here).
    def test_read_file(self):
        builder = tesseract.CharBoxBuilder()
        boxes = builder.read_file(self._get_file_handle("boxes"))
        for box in boxes:
            self.assertIsInstance(box, builders.Box)

    # Reading an empty stream must give an empty list.
    def test_read_empty_file(self):
        builder = tesseract.CharBoxBuilder()
        output = StringIO()
        self.assertListEqual(builder.read_file(output), [])

    # With the "boxes_empty_lines" fixture, no returned box may have empty
    # content.
    def test_read_file_empty_lines(self):
        builder = tesseract.CharBoxBuilder()
        boxes = builder.read_file(self._get_file_handle("boxes_empty_lines"))
        for box in boxes:
            self.assertIsInstance(box, builders.Box)
            self.assertNotEqual(box.content, "")

    # With the "boxes_short_lines" fixture, no returned box may have "#" as
    # content. Presumably "#" marks the too-short lines in that fixture --
    # verify against the fixture file.
    def test_read_file_short_lines(self):
        builder = tesseract.CharBoxBuilder()
        boxes = builder.read_file(self._get_file_handle("boxes_short_lines"))
        for box in boxes:
            self.assertIsInstance(box, builders.Box)
            self.assertNotEqual(box.content, "#")

    # Writes five boxes (including a non-ASCII "\xe9" one) and checks that
    # each box's content and its four coordinates, space-separated, appear
    # in the written text.
    def test_write_file(self):
        builder = tesseract.CharBoxBuilder()
        output = StringIO()
        boxes = [
            builders.Box("a", ((10, 11), (12, 13)), 95),
            builders.Box("b", ((11, 12), (13, 14))),
            builders.Box("c", ((12, 13), (14, 15))),
            builders.Box("d", ((13, 14), (15, 16)), 87),
            builders.Box("\xe9", ((14, 15), (16, 17)), 88),
        ]
        builder.write_file(output, boxes)
        # Rewind, then rebind `output` to the text that was written.
        output.seek(0)
        output = output.read()
        for box in boxes:
            self.assertIn(box.content, output)
            self.assertIn("{} {} {} {}".format(
                box.position[0][0],
                box.position[0][1],
                box.position[1][0],
                box.position[1][1],
            ), output)

    # str() of the builder is its human-readable name.
    def test_str_method(self):
        self.assertEqual(str(tesseract.CharBoxBuilder()), "Character boxes")


class TestTesseractDigits(BaseTest):
    """
    image_to_string() with a builders.DigitBuilder, run_tesseract and
    codecs.open being mocked.
    """

    # Builds a DigitBuilder and a 1x1 RGB image shared by the tests.
    # NOTE(review): the get_version patch only lasts for the duration of
    # setUp -- confirm that is the intent.
    @patch("pyocr.tesseract.get_version")
    def setUp(self, get_version):
        super().setUp()
        get_version.return_value = (4, 0, 0)
        self.builder = builders.DigitBuilder()
        self.image = Image.new(mode="RGB", size=(1, 1))

    # Every character of the result obtained from the "digits" fixture must
    # be convertible with int().
    @patch("tempfile.TemporaryDirectory")
    @patch("codecs.open")
    @patch("pyocr.tesseract.run_tesseract")
    def test_digits(self, run_tesseract, copen, temp_dir):
        run_tesseract.return_value = (0, "")
        copen.return_value = StringIO(self._get_file_content("digits"))
        with TemporaryDirectory(prefix="tess_") as tmpdir:
            # Route the patched TemporaryDirectory() to the real tmpdir.
            enter = MagicMock()
            enter.__enter__.return_value = tmpdir
            temp_dir.return_value = enter
            # An empty output.txt is created in tmpdir before the call;
            # presumably so an output file exists on disk -- TODO confirm.
            with open(os.path.join(tmpdir, "output.txt"), "w") as fh:
                fh.write("")
            result = tesseract.image_to_string(self.image,
                                               builder=self.builder)
            for digit in result:
                # int() raises ValueError on a non-digit, failing the test.
                self.assertIsInstance(int(digit), int)
            run_tesseract.assert_called_once_with(
                "input.bmp",
                "output",
                cwd=tmpdir,
                lang=None,
                flags=self.builder.tesseract_flags,
                configs=self.builder.tesseract_configs,
            )


class TestTesseractWordBox(BaseTest):
    """
    image_to_string() with a builders.WordBoxBuilder, run_tesseract and
    codecs.open being mocked.
    """

    # Builds a 1x1 RGB image and a WordBoxBuilder shared by all tests.
    # NOTE(review): the get_version patch only lasts for the duration of
    # setUp -- confirm that is the intent.
    @patch("pyocr.tesseract.get_version")
    def setUp(self, get_version):
        super().setUp()
        get_version.return_value = (4, 0, 0)
        self.image = Image.new(mode="RGB", size=(1, 1))
        self.builder = builders.WordBoxBuilder()

    # Nominal case: every item parsed from the "words" fixture must be a
    # builders.Box.
    @patch("tempfile.TemporaryDirectory")
    @patch("codecs.open")
    @patch("pyocr.tesseract.run_tesseract")
    def test_word(self, run_tesseract, copen, temp_dir):
        run_tesseract.return_value = (0, "")
        copen.return_value = StringIO(self._get_file_content("words"))
        with TemporaryDirectory(prefix="tess_") as tmpdir:
            enter = MagicMock()
            enter.__enter__.return_value = tmpdir
            temp_dir.return_value = enter
            # An empty output.hocr is created in tmpdir before the call;
            # presumably so an output file exists on disk -- TODO confirm.
            with open(os.path.join(tmpdir, "output.hocr"), "w") as fh:
                fh.write("")
            result = tesseract.image_to_string(self.image,
                                               builder=self.builder)
            for box in result:
                self.assertIsInstance(box, builders.Box)
            run_tesseract.assert_called_once_with(
                "input.bmp",
                "output",
                cwd=tmpdir,
                lang=None,
                flags=self.builder.tesseract_flags,
                configs=self.builder.tesseract_configs,
            )

    # A non-zero status from run_tesseract() must surface as a
    # TesseractError with the same status and message.
    @patch("tempfile.TemporaryDirectory")
    @patch("codecs.open")
    @patch("pyocr.tesseract.run_tesseract")
    def test_word_error(self, run_tesseract, copen, temp_dir):
        run_tesseract.return_value = (1, "Error")
        copen.return_value = StringIO(self._get_file_content("words"))
        with TemporaryDirectory(prefix="tess_") as tmpdir:
            enter = MagicMock()
            enter.__enter__.return_value = tmpdir
            temp_dir.return_value = enter
            with self.assertRaises(tesseract.TesseractError) as te:
                tesseract.image_to_string(self.image, builder=self.builder)
            self.assertEqual(te.exception.status, 1)
            self.assertEqual(te.exception.message, "Error")
            run_tesseract.assert_called_once_with(
                "input.bmp",
                "output",
                cwd=tmpdir,
                lang=None,
                flags=self.builder.tesseract_flags,
                configs=self.builder.tesseract_configs,
            )

    # Missing output file (FileNotFoundError from codecs.open) must become
    # a TesseractError with status -1.
    @patch("tempfile.TemporaryDirectory")
    @patch("codecs.open")
    @patch("pyocr.tesseract.run_tesseract")
    def test_word_no_output(self, run_tesseract, copen, temp_dir):
        run_tesseract.return_value = (0, "No file output")
        copen.side_effect = FileNotFoundError(
            errno.ENOENT, "[Errno 2] No such file or directory: 'output'"
        )
        with TemporaryDirectory(prefix="tess_") as tmpdir:
            enter = MagicMock()
            enter.__enter__.return_value = tmpdir
            temp_dir.return_value = enter
            with self.assertRaises(tesseract.TesseractError) as te:
                tesseract.image_to_string(self.image, builder=self.builder)
            self.assertEqual(te.exception.status, -1)
            self.assertIn("Unable to find output file (tested",
                          te.exception.message)
            run_tesseract.assert_called_once_with(
                "input.bmp",
                "output",
                cwd=tmpdir,
                lang=None,
                flags=self.builder.tesseract_flags,
                configs=self.builder.tesseract_configs,
            )


class TestTesseractLineBox(BaseTest):
    """
    image_to_string() with a builders.LineBoxBuilder, run_tesseract and
    codecs.open being mocked.
    """

    # Builds a 1x1 RGB image and a LineBoxBuilder shared by all tests.
    # NOTE(review): the get_version patch only lasts for the duration of
    # setUp -- confirm that is the intent.
    @patch("pyocr.tesseract.get_version")
    def setUp(self, get_version):
        super().setUp()
        get_version.return_value = (4, 0, 0)
        self.image = Image.new(mode="RGB", size=(1, 1))
        self.builder = builders.LineBoxBuilder()

    # Nominal case: every item parsed from the "tesseract.lines" fixture
    # must be a builders.LineBox.
    @patch("tempfile.TemporaryDirectory")
    @patch("codecs.open")
    @patch("pyocr.tesseract.run_tesseract")
    def test_line(self, run_tesseract, copen, temp_dir):
        run_tesseract.return_value = (0, "")
        copen.return_value = StringIO(
            self._get_file_content("tesseract.lines")
        )
        with TemporaryDirectory(prefix="tess_") as tmpdir:
            enter = MagicMock()
            enter.__enter__.return_value = tmpdir
            temp_dir.return_value = enter
            # An empty output.hocr is created in tmpdir before the call;
            # presumably so an output file exists on disk -- TODO confirm.
            with open(os.path.join(tmpdir, "output.hocr"), "w") as fh:
                fh.write("")
            result = tesseract.image_to_string(self.image,
                                               builder=self.builder)
            for line in result:
                self.assertIsInstance(line, builders.LineBox)
            run_tesseract.assert_called_once_with(
                "input.bmp",
                "output",
                cwd=tmpdir,
                lang=None,
                flags=self.builder.tesseract_flags,
                configs=self.builder.tesseract_configs,
            )

    # A non-zero status from run_tesseract() must surface as a
    # TesseractError with the same status and message.
    @patch("tempfile.TemporaryDirectory")
    @patch("codecs.open")
    @patch("pyocr.tesseract.run_tesseract")
    def test_line_error(self, run_tesseract, copen, temp_dir):
        run_tesseract.return_value = (1, "Error")
        copen.return_value = StringIO(
            self._get_file_content("tesseract.lines")
        )
        with TemporaryDirectory(prefix="tess_") as tmpdir:
            enter = MagicMock()
            enter.__enter__.return_value = tmpdir
            temp_dir.return_value = enter
            with self.assertRaises(tesseract.TesseractError) as te:
                tesseract.image_to_string(self.image, builder=self.builder)
            self.assertEqual(te.exception.status, 1)
            self.assertEqual(te.exception.message, "Error")
            run_tesseract.assert_called_once_with(
                "input.bmp",
                "output",
                cwd=tmpdir,
                lang=None,
                flags=self.builder.tesseract_flags,
                configs=self.builder.tesseract_configs,
            )

    # Missing output file (FileNotFoundError from codecs.open) must become
    # a TesseractError with status -1.
    @patch("tempfile.TemporaryDirectory")
    @patch("codecs.open")
    @patch("pyocr.tesseract.run_tesseract")
    def test_line_no_output(self, run_tesseract, copen, temp_dir):
        run_tesseract.return_value = (0, "No file output")
        copen.side_effect = FileNotFoundError(
            errno.ENOENT, "[Errno 2] No such file or directory: 'output'"
        )
        with TemporaryDirectory(prefix="tess_") as tmpdir:
            enter = MagicMock()
            enter.__enter__.return_value = tmpdir
            temp_dir.return_value = enter
            with self.assertRaises(tesseract.TesseractError) as te:
                tesseract.image_to_string(self.image, builder=self.builder)
            self.assertEqual(te.exception.status, -1)
            self.assertIn("Unable to find output file (tested",
                          te.exception.message)
            run_tesseract.assert_called_once_with(
                "input.bmp",
                "output",
                cwd=tmpdir,
                lang=None,
                flags=self.builder.tesseract_flags,
                configs=self.builder.tesseract_configs,
            )


class TestTesseractDigitsLineBox(BaseTest):
    """
    image_to_string() with a builders.DigitLineBoxBuilder, run_tesseract and
    codecs.open being mocked.
    """

    # Builds a 1x1 RGB image and a DigitLineBoxBuilder shared by all tests.
    # NOTE(review): the get_version patch only lasts for the duration of
    # setUp -- confirm that is the intent.
    @patch("pyocr.tesseract.get_version")
    def setUp(self, get_version):
        super().setUp()
        get_version.return_value = (4, 0, 0)
        self.image = Image.new(mode="RGB", size=(1, 1))
        self.builder = builders.DigitLineBoxBuilder()

    # Nominal case: every item parsed from the "digits.lines" fixture must
    # be a builders.LineBox.
    @patch("tempfile.TemporaryDirectory")
    @patch("codecs.open")
    @patch("pyocr.tesseract.run_tesseract")
    def test_line(self, run_tesseract, copen, temp_dir):
        run_tesseract.return_value = (0, "")
        copen.return_value = StringIO(self._get_file_content("digits.lines"))
        with TemporaryDirectory(prefix="tess_") as tmpdir:
            enter = MagicMock()
            enter.__enter__.return_value = tmpdir
            temp_dir.return_value = enter
            # An empty output.hocr is created in tmpdir before the call;
            # presumably so an output file exists on disk -- TODO confirm.
            with open(os.path.join(tmpdir, "output.hocr"), "w") as fh:
                fh.write("")
            result = tesseract.image_to_string(self.image,
                                               builder=self.builder)
            for line in result:
                self.assertIsInstance(line, builders.LineBox)
            run_tesseract.assert_called_once_with(
                "input.bmp",
                "output",
                cwd=tmpdir,
                lang=None,
                flags=self.builder.tesseract_flags,
                configs=self.builder.tesseract_configs,
            )

    # A non-zero status from run_tesseract() must surface as a
    # TesseractError with the same status and message.
    @patch("tempfile.TemporaryDirectory")
    @patch("codecs.open")
    @patch("pyocr.tesseract.run_tesseract")
    def test_line_error(self, run_tesseract, copen, temp_dir):
        run_tesseract.return_value = (1, "Error")
        copen.return_value = StringIO(self._get_file_content("digits.lines"))
        with TemporaryDirectory(prefix="tess_") as tmpdir:
            enter = MagicMock()
            enter.__enter__.return_value = tmpdir
            temp_dir.return_value = enter
            with self.assertRaises(tesseract.TesseractError) as te:
                tesseract.image_to_string(self.image, builder=self.builder)
            self.assertEqual(te.exception.status, 1)
            self.assertEqual(te.exception.message, "Error")
            run_tesseract.assert_called_once_with(
                "input.bmp",
                "output",
                cwd=tmpdir,
                lang=None,
                flags=self.builder.tesseract_flags,
                configs=self.builder.tesseract_configs,
            )

    # Missing output file (FileNotFoundError from codecs.open) must become
    # a TesseractError with status -1.
    @patch("tempfile.TemporaryDirectory")
    @patch("codecs.open")
    @patch("pyocr.tesseract.run_tesseract")
    def test_line_no_output(self, run_tesseract, copen, temp_dir):
        run_tesseract.return_value = (0, "No file output")
        copen.side_effect = FileNotFoundError(
            errno.ENOENT, "[Errno 2] No such file or directory: 'output'"
        )
        with TemporaryDirectory(prefix="tess_") as tmpdir:
            enter = MagicMock()
            enter.__enter__.return_value = tmpdir
            temp_dir.return_value = enter
            with self.assertRaises(tesseract.TesseractError) as te:
                tesseract.image_to_string(self.image, builder=self.builder)
            self.assertEqual(te.exception.status, -1)
            self.assertIn("Unable to find output file (tested",
                          te.exception.message)
            run_tesseract.assert_called_once_with(
                "input.bmp",
                "output",
                cwd=tmpdir,
                lang=None,
                flags=self.builder.tesseract_flags,
                configs=self.builder.tesseract_configs,
            )
pyocr-0.8.5/tests/test_utils.py000066400000000000000000000042361450160556200166040ustar00rootroot00000000000000import unittest from unittest.mock import patch import pyocr from pyocr.util import ( digits_only, ) class TestPyOCR(unittest.TestCase): @patch("pyocr.libtesseract.tesseract_raw.g_libtesseract") @patch("pyocr.libtesseract.tesseract_raw.is_available") @patch("shutil.which") def test_available_tools_tesseract4(self, which, is_available, libtess): which.return_value = True is_available.return_value = True libtess.TessVersion.return_value = b"4.0.0" self.assertListEqual( pyocr.get_available_tools(), [ pyocr.tesseract, pyocr.libtesseract, pyocr.cuneiform, ] ) @patch("pyocr.libtesseract.tesseract_raw.g_libtesseract") @patch("pyocr.libtesseract.tesseract_raw.is_available") @patch("shutil.which") def test_available_tools_tesseract3(self, which, is_available, libtess): which.return_value = True is_available.return_value = True libtess.TessVersion.return_value = b"3.5.0" self.assertListEqual( pyocr.get_available_tools(), [ pyocr.tesseract, pyocr.libtesseract, pyocr.cuneiform, ] ) @patch("pyocr.libtesseract.tesseract_raw.g_libtesseract") @patch("pyocr.libtesseract.tesseract_raw.is_available") @patch("shutil.which") def test_available_tools_tesseract3_0(self, which, is_available, libtess): which.return_value = True is_available.return_value = True libtess.TessVersion.return_value = b"3.0.0" self.assertListEqual( pyocr.get_available_tools(), [ pyocr.tesseract, pyocr.cuneiform, ] ) def test_digits_only(self): self.assertEqual(digits_only("azer"), 0) self.assertEqual(digits_only("10.0.1"), 10) self.assertEqual(digits_only("42azer"), 42) self.assertEqual(digits_only("qsdf42azer"), 42) self.assertEqual(digits_only("v42"), 42) self.assertEqual(digits_only("v42x35"), 42) self.assertEqual(digits_only("v42x35qsdf"), 42)