Mon, 28 May 2012 19:43:22 +0200
ocrodjvu (0.7.10) unstable; urgency=low
* Improve error handling.
* ocrodjvu:
+ Attempt to fix encoding issues and eliminate unwanted control characters
in files produced by Tesseract and Cuneiform.
https://bugs.debian.org/671764
Thanks to Thomas Koch for the bug report.
* hocr2djvused:
+ Add the --fix-utf8 option.
* djvu2hocr:
+ Translate DjVu “region” to <div> (instead of <span>,
which was causing XHTML validity errors).
* Tests: fix compatibility with PIL ≥ 1.2.
* Include example scans2djvu+hocr script.
* Fix merging results of two Tesseract runs.
Thanks to Janusz S. Bień for the bug report.
* Use RFC 3339 date format in the manual page. Don't call external programs
to build it.
-- Jakub Wilk Sat, 12 May 2012 00:37:50 +0200
ocrodjvu (0.7.9) unstable; urgency=low
* Improve error handling.
* Fix compatibility with Tesseract > 3.01.
-- Jakub Wilk Sat, 10 Mar 2012 23:36:03 +0100
ocrodjvu (0.7.8) unstable; urgency=low
* Improve test suite.
-- Jakub Wilk Sun, 22 Jan 2012 00:04:16 +0100
ocrodjvu (0.7.7) unstable; urgency=low
* Raise proper import error if html5lib is not installed.
Thanks to Kyrill Detinov for the bug report.
-- Jakub Wilk Sun, 11 Dec 2011 23:08:05 +0100
ocrodjvu (0.7.6) unstable; urgency=low
* Improve error handling.
* ocrodjvu:
+ Fix a regression in gocr, ocrad and tesseract engines, which made them
unusable.
-- Jakub Wilk Thu, 27 Oct 2011 18:06:38 +0200
ocrodjvu (0.7.5) unstable; urgency=low
* Check Python version in setup.py.
* Accept slightly malformed hOCR documents (with a text zone not completely
within the page area).
https://bugs.debian.org/575484#35
* Fix compatibility with Tesseract > 3.00.
Thanks to Janusz S. Bień for the bug report.
* ocrodjvu, hocr2djvused:
+ Add the --html5 option.
-- Jakub Wilk Sat, 27 Aug 2011 01:25:33 +0200
ocrodjvu (0.7.4) unstable; urgency=low
* Use a better method to detect Debian-based systems.
* hocr2djvused:
+ Ignore comments and
'''
def _wait_for_worker(worker):
stderr = worker.stderr.readlines()
try:
worker.wait()
except Exception:
for line in stderr:
sys.stderr.write('tesseract: {0}'.format(line))
raise
if len(stderr) == 1:
[line] = stderr
if line.startswith(('Tesseract Open Source OCR Engine', 'Page')):
# Annoyingly, Tesseract prints its own name on standard error even
# if nothing went wrong. Filter out such an unhelpful message.
return
for line in stderr:
sys.stderr.write('tesseract: {0}'.format(line))
# Tokens that are already well-formed markup and must be kept verbatim:
# tags (including <!DOCTYPE ...> and closing tags), named and numeric
# character references, and runs of ordinary text.
# NOTE(review): the previous pattern additionally carried an empty alternative
# and a bare "(?<= // )" lookbehind. Both are zero-width; the empty one always
# matched, so the entity and text alternatives below were never reached. They
# look like remnants of a lost alternative -- confirm against upstream whether
# anything else (e.g. comments) was meant to be recognized here.
_fix_html_regex = re.compile(
    r'''
    ( <[!/]?[a-z]+(?:\s+[^<>]*)?>
    | &[a-z]+;
    | &[#][0-9]+;
    | &[#]x[0-9a-f]+;
    | [^<>&]+
    )
    ''', re.IGNORECASE | re.VERBOSE | re.MULTILINE
)

def _escape_markup(text):
    # Same replacements as cgi.escape(text) without quote handling; spelled
    # out because cgi.escape() no longer exists in Python >= 3.8.
    # "&" must be replaced first so the other entities are not re-escaped.
    return text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')

def fix_html(s):
    '''
    Work around buggy hOCR output:
    https://code.google.com/p/tesseract-ocr/issues/detail?id=376

    The string is split into well-formed tokens (tags, character references,
    plain text), which are kept as they are, and whatever is left between
    them (stray "<", ">" or "&"), which is escaped.

    s: the markup to repair
    Returns the repaired string.
    '''
    # With a capturing group, split() alternates between the leftovers
    # (even indices) and the recognized tokens (odd indices).
    return ''.join(
        chunk if n & 1 else _escape_markup(chunk)
        for n, chunk in enumerate(_fix_html_regex.split(s))
    )
class ExtractSettings(object):
    '''
    Container for the "rotation" and "page_size" settings.

    Any further keyword arguments are accepted but not stored.
    '''

    def __init__(self, rotation=0, page_size=None, **kwargs):
        # kwargs is intentionally left untouched: only the two named
        # settings become attributes.
        self.rotation, self.page_size = rotation, page_size
# OCR engine driver that runs the external Tesseract program as a subprocess.
class Engine(common.Engine):
# Engine identifier; __init__ passes it to errors.EngineNotFound.
name = 'tesseract'
# NOTE(review): presumably the format in which page images are handed to
# Tesseract -- not used in the visible part of the class; confirm.
image_format = image_io.TIFF
# NOTE(review): presumably tells the caller that the output needs UTF-8
# clean-up -- not used in the visible part of the class; confirm.
needs_utf8_fix = True
# Program to run; used as the first element of every command line built here.
executable = utils.property('tesseract')
# Additional command-line arguments appended to the Tesseract invocations.
# NOTE(review): presumably parsed from a string with shlex.split -- confirm
# against utils.property.
extra_args = utils.property([], shlex.split)
# None means "auto-detect": __init__ then sets it to whether the language
# data extension is 'traineddata'.
use_hocr = utils.property(None, int)
# NOTE(review): presumably toggles the module-level fix_html() workaround;
# not referenced in the visible part of the class -- confirm.
fix_html = utils.property(0, int)
def __init__(self, *args, **kwargs):
    '''
    Set up the engine: locate the language data, decide whether hOCR output
    is used, and collect the list of available languages.

    Positional arguments are accepted but not forwarded; keyword arguments
    are passed on to common.Engine.__init__().

    Raises errors.EngineNotFound if get_filesystem_info() raises
    errors.UnknownLanguageList.
    '''
    common.Engine.__init__(self, **kwargs)
    try:
        location = self.get_filesystem_info()
    except errors.UnknownLanguageList:
        raise errors.EngineNotFound(self.name)
    self._directory, self._extension = location
    if self.use_hocr is None:
        # No explicit choice was made: use hOCR exactly when the language
        # data files carry the "traineddata" extension.
        self.use_hocr = (self._extension == 'traineddata')
    if not self.use_hocr:
        self._hocr = None
    else:
        # Import hocr late,
        # so that lxml is imported only when needed.
        from .. import hocr
        self._hocr = hocr
    # Filled in by _get_languages(), which the list() call below drives.
    self._user_to_tesseract = None
    self._languages = list(self._get_languages())
def get_filesystem_info(self):
    '''
    Find out where Tesseract keeps its language data.

    Tesseract is started with a language called "nonexistent" and its
    standard error output is searched with _error_pattern.

    Returns a (directory, extension) pair taken from the "dir" and "ext"
    groups of the match.

    Raises errors.UnknownLanguageList if the program cannot be started
    (OSError), if the output does not match, or if the reported directory
    is not an existing directory.
    '''
    try:
        probe = ipc.Subprocess(
            [self.executable, '', '', '-l', 'nonexistent'],
            stdout=ipc.PIPE,
            stderr=ipc.PIPE,
        )
    except OSError:
        raise errors.UnknownLanguageList
    try:
        found = _error_pattern.search(probe.stderr.read())
        if found is None:
            raise errors.UnknownLanguageList
        directory, extension = found.group('dir', 'ext')
        if not os.path.isdir(directory):
            raise errors.UnknownLanguageList
    finally:
        # Always reap the subprocess, whether or not parsing succeeded.
        try:
            probe.wait()
        except ipc.CalledProcessError:
            # A failure is the expected outcome of this probe.
            pass
        else:
            # This should never happen. Recognizing non-existent image
            # should always fail. But apparently there are Subversion
            # snapshots of Tesseract in the wild that do it wrongly. Rather
            # than failing hard, issue a warning:
            warnings.warn('unexpected exit code from Tesseract', category=RuntimeWarning, stacklevel=2)
    return directory, extension
def list_languages(self):
return iter(self._languages)
def _get_languages(self):
    '''
    Generate ISO 639 codes for the language files in the data directory.

    Every file in self._directory whose name ends with self._extension is
    considered; its base name (without the extension) is taken as the
    Tesseract language code. As a side effect, self._user_to_tesseract is
    reset and filled with an ISO code -> Tesseract code mapping.
    '''
    self._user_to_tesseract = {}
    wildcard = '*.{ext}'.format(ext=self._extension)
    for path in glob.iglob(os.path.join(self._directory, wildcard)):
        tesseract_code, _ = os.path.splitext(os.path.basename(path))
        if tesseract_code != 'osd':
            # 'osd' is never offered as a language (presumably it is
            # Tesseract's orientation/script detection data).
            try:
                isocode = self.user_to_iso639(tesseract_code)
            except errors.InvalidLanguageId:
                # File name that is not a recognizable language code: skip.
                pass
            else:
                self._user_to_tesseract[isocode] = tesseract_code
                yield isocode
def user_to_iso639(self, language):
    '''
    Normalize a single language identifier.

    The identifier must match _language_pattern. Its first group is
    converted with iso639.b_to_t(); if the second group is present, it is
    appended after a hyphen.

    Raises errors.InvalidLanguageId if the identifier does not match.
    '''
    parsed = _language_pattern.match(language)
    if parsed is None:
        raise errors.InvalidLanguageId(language)
    base, suffix = parsed.group(1, 2)
    isocode = iso639.b_to_t(base)
    if suffix is None:
        return isocode
    return isocode + '-' + suffix
def user_to_tesseract(self, language):
    '''
    Translate a language specification into Tesseract's own codes.

    language: one or more identifiers joined with "+"
    Returns the corresponding Tesseract codes, again joined with "+".

    Each identifier is normalized with user_to_iso639() and then looked up
    in self._user_to_tesseract. Raises errors.MissingLanguagePack if the
    lookup fails.
    '''
    tesseract_codes = []
    for part in language.split('+'):
        isocode = self.user_to_iso639(part)
        try:
            tesseract_codes.append(self._user_to_tesseract[isocode])
        except LookupError:
            raise errors.MissingLanguagePack(isocode)
    return '+'.join(tesseract_codes)
def check_language(self, language):
    '''
    Validate a language specification.

    Nothing is returned: the call either succeeds silently or propagates
    whatever user_to_tesseract() raises for the given specification.
    '''
    # Only the validation matters here; the translated value is discarded.
    self.user_to_tesseract(language)
    return None
@classmethod
def get_default_language(cls):
return os.getenv('tesslanguage') or 'eng'
def recognize_plain_text(self, image, language, details=None, uax29=None):
    '''
    Run Tesseract on an image and return the recognized plain text.

    image: object whose "name" attribute is the path handed to Tesseract
    language: language specification, translated with user_to_tesseract()
    details, uax29: accepted but not used by this method

    Returns a common.Output with format='txt', holding the contents of the
    text file Tesseract wrote into a temporary directory.
    '''
    tesseract_language = self.user_to_tesseract(language)
    with temporary.directory() as workdir:
        # Tesseract appends ".txt" to the output base name by itself.
        output_base = os.path.join(workdir, 'tmp')
        commandline = [self.executable, image.name, output_base, '-l', tesseract_language] + self.extra_args
        worker = ipc.Subprocess(commandline, stdout=ipc.PIPE, stderr=ipc.PIPE)
        _wait_for_worker(worker)
        with open(os.path.join(workdir, 'tmp.txt'), 'rt') as text_file:
            return common.Output(text_file.read(), format='txt')
def recognize_hocr(self, image, language, details=text_zones.TEXT_DETAILS_WORD, uax29=None):
language = self.user_to_tesseract(language)
character_details = details < text_zones.TEXT_DETAILS_WORD or (uax29 and details <= text_zones.TEXT_DETAILS_WORD)
with temporary.directory() as output_dir:
tessconf_path = os.path.join(output_dir, 'tessconf')
with open(tessconf_path, 'wt') as tessconf:
# Tesseract 3.00 doesn't come with any config file to enable hOCR
# output. Let's create our own one.
print('tessedit_create_hocr T', file=tessconf)
early_extra_args = []
late_extra_args = []
n_early = 0
for arg in self.extra_args:
if n_early > 0:
early_extra_args += [arg]
n_early -= 1
elif arg == '-psm':
early_extra_args += [arg]
n_early = 1
else:
late_extra_args += [arg]
commandline = (
[self.executable, image.name, os.path.join(output_dir, 'tmp')] +
early_extra_args +
['-l', language, tessconf_path] +
late_extra_args
)
worker = ipc.Subprocess(
commandline,
stdout=ipc.PIPE,
stderr=ipc.PIPE,
)
_wait_for_worker(worker)
hocr_path = os.path.join(output_dir, 'tmp.hocr')
if not os.path.exists(hocr_path):
hocr_path = hocr_path[:-4] + 'html'
with open(os.path.join(output_dir, hocr_path), 'r') as hocr_file:
contents = hocr_file.read()
if character_details:
worker = ipc.Subprocess(
[self.executable, image.name, os.path.join(output_dir, 'tmp'), '-l', language, 'makebox'] + self.extra_args,
stderr=ipc.PIPE,
)
_wait_for_worker(worker)
with open(os.path.join(output_dir, 'tmp.box'), 'r') as box_file:
contents = contents.replace(
'