--- python-pypdf-1.13.orig/debian/control +++ python-pypdf-1.13/debian/control @@ -0,0 +1,27 @@ +Source: python-pypdf +Section: python +Priority: optional +Maintainer: Debian Python Modules Team +Uploaders: Luciano Bello +Build-Depends: debhelper (>= 5.0.38), python-all, dpatch +Build-Depends-Indep: python-support (>= 0.6.4) +Standards-Version: 3.9.1 +Homepage: http://pybrary.net/pyPdf/ +XS-Python-Version: all + +Package: python-pypdf +Architecture: all +Depends: ${python:Depends}, ${misc:Depends} +XB-Python-Version: all +Description: PDF toolkit implemented solely in Python + A PDF toolkit implemented solely in Python. It is capable of: + * extracting document information (title, author, ...), + * splitting documents page by page, + * merging documents page by page, + * cropping pages, + * merging multiple pages into a single page, + * encrypting and decrypting PDF files. + By being Pure-Python, it should run on any Python platform without any + dependencies on external libraries. It can also work entirely on StringIO + objects rather than file streams, allowing for PDF manipulation in memory. It + is therefore a useful tool for websites that manage or manipulate PDFs. --- python-pypdf-1.13.orig/debian/copyright +++ python-pypdf-1.13/debian/copyright @@ -0,0 +1,45 @@ +This package was debianized by Luciano Bello on +Mon, 28 Apr 2008 18:50:19 -0300. + +It was downloaded from http://pybrary.net/pyPdf/ + +Upstream Author: + + Mathieu Fenniak + +Copyright: + + Copyright (c) 2006-2008, Mathieu Fenniak + Some contributions copyright (c) 2007, Ashish Kulkarni + +License: + + Copyright (c) The Regents of the University of California. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + 2. 
Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + 3. Neither the name of the University nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + SUCH DAMAGE. + +The Debian packaging is (C) 2008, Luciano Bello and +is licensed under the GPL, see `/usr/share/common-licenses/GPL'. --- python-pypdf-1.13.orig/debian/README.source +++ python-pypdf-1.13/debian/README.source @@ -0,0 +1,38 @@ +This package uses dpatch to manage all modifications to the upstream +source. Changes are stored in the source package as diffs in +debian/patches and applied during the build. + +To get the fully patched source after unpacking the source package, cd +to the root level of the source package and run: + + debian/rules patch + +Removing a patch is as simple as removing its entry from the +debian/patches/00list file, and please also remove the patch file +itself. 
+ +Creating a new patch is done with "dpatch-edit-patch patch XX_patchname" +where you should replace XX with a new number and patchname with a +descriptive shortname of the patch. You can then simply edit all the +files your patch wants to edit, and then simply "exit 0" from the shell +to actually create the patch file. + +To tweak an already existing patch, call "dpatch-edit-patch XX_patchname" +and replace XX_patchname with the actual filename from debian/patches +you want to use. + +To clean up afterwards again, "debian/rules unpatch" will do the +work for you - or you can of course choose to call +"fakeroot debian/rules clean" all together. + + +--- + +this documentation is part of dpatch package, and may be used by +packages using dpatch to comply with policy on README.source. This +documentation is meant to be useful to users who are not proficient in +dpatch in doing work with dpatch-based packages. Please send any +improvements to the BTS of dpatch package. + +original text by Gerfried Fuchs, edited by Junichi Uekawa +10 Aug 2008. --- python-pypdf-1.13.orig/debian/watch +++ python-pypdf-1.13/debian/watch @@ -0,0 +1,2 @@ +version=3 +http://pybrary.net/pyPdf/pyPdf-(.*)\.tar\.gz --- python-pypdf-1.13.orig/debian/compat +++ python-pypdf-1.13/debian/compat @@ -0,0 +1 @@ +5 --- python-pypdf-1.13.orig/debian/rules +++ python-pypdf-1.13/debian/rules @@ -0,0 +1,51 @@ +#!/usr/bin/make -f +# Uncomment this to turn on verbose mode. +#export DH_VERBOSE=1 +include /usr/share/dpatch/dpatch.make + +PYVERS=$(shell pyversions -r) + +build: patch-stamp + +clean: unpatch + dh_testdir + + # Add here commands to clean up after the build process. 
+ for python in $(PYVERS); do \ + $$python setup.py clean; \ + done + + rm -f build-stamp + rm -f pyPdf/*.pyc pyPdf/*.pyo + + dh_clean + +install: build + dh_testdir + dh_testroot + dh_installdirs + + for python in $(PYVERS); do \ + $$python setup.py install --root=debian/python-pypdf; \ + done + +# Build architecture-dependent files here. +binary-arch: +# We have nothing to do by default. + +# Build architecture-independent files here. +binary-indep: build install + dh_testdir + dh_testroot + dh_installchangelogs + dh_installdocs README + dh_pysupport + dh_compress + dh_fixperms + dh_installdeb + dh_gencontrol + dh_md5sums + dh_builddeb + +binary: binary-indep binary-arch +.PHONY: build clean binary-indep binary-arch binary install --- python-pypdf-1.13.orig/debian/changelog +++ python-pypdf-1.13/debian/changelog @@ -0,0 +1,39 @@ +python-pypdf (1.13-1) unstable; urgency=low + + * New upstream release (Closes: #615961). + - DeprecationWarning in the sets fixed (Closes: #593574) + * New Standards-Version. + * Typo in the README file fixed (Closes: #567312) + * Better extra spaces handling (Closes: #563443) + + -- Luciano Bello Fri, 13 May 2011 10:06:14 -0300 + +python-pypdf (1.12-3) unstable; urgency=low + + * Change to XB-Python-Version: all - pypdf handles all versions. + * Add README.Source file. + * Bump Standards-Version to 3.8.4. + + -- Debian Python Modules Team Mon, 10 May 2010 23:18:19 +0200 + +python-pypdf (1.12-2) unstable; urgency=low + + * Upstream name fixed + * dpatch added + * Fixed the backspace escape-sequence '\b' (Closes: #511252) + + -- Luciano Bello Sat, 10 Jan 2009 12:19:47 -0200 + +python-pypdf (1.12-1) unstable; urgency=low + + * New upstream release. + * Standards-Version updated. + * debian/watch added by Sandro Tosi . 
+ + -- Luciano Bello Thu, 04 Sep 2008 18:33:38 -0300 + +python-pypdf (1.10-1) unstable; urgency=low + + * Initial release (Closes: #467217) + + -- Luciano Bello Mon, 28 Apr 2008 18:20:06 -0300 --- python-pypdf-1.13.orig/debian/source/format +++ python-pypdf-1.13/debian/source/format @@ -0,0 +1 @@ +1.0 --- python-pypdf-1.13.orig/debian/patches/02_README_typo.dpatch +++ python-pypdf-1.13/debian/patches/02_README_typo.dpatch @@ -0,0 +1,18 @@ +#! /bin/sh /usr/share/dpatch/dpatch-run +## 02_README_typo.dpatch by Luciano Bello +## +## DP: #567312: example in README has syntax error + +@DPATCH@ +diff -urNad '--exclude=CVS' '--exclude=.svn' '--exclude=.git' '--exclude=.arch' '--exclude=.hg' '--exclude=_darcs' '--exclude=.bzr' trunk~/README trunk/README +--- trunk~/README 2010-12-04 19:49:56.000000000 -0300 ++++ trunk/README 2011-05-13 10:02:01.000000000 -0300 +@@ -29,7 +29,7 @@ + output.addPage(page5) + + # print how many pages input1 has: +- print "document1.pdf has %s pages." % input1.getNumPages()) ++ print "document1.pdf has %s pages." % (input1.getNumPages()) + + # finally, write "output" to document-output.pdf + outputStream = file("document-output.pdf", "wb") --- python-pypdf-1.13.orig/debian/patches/00list +++ python-pypdf-1.13/debian/patches/00list @@ -0,0 +1,2 @@ +02_README_typo.dpatch +02_lax_whitespace.dpatch --- python-pypdf-1.13.orig/debian/patches/02_lax_whitespace.dpatch +++ python-pypdf-1.13/debian/patches/02_lax_whitespace.dpatch @@ -0,0 +1,1946 @@ +#! /bin/sh /usr/share/dpatch/dpatch-run +## 02_lax_whitespace.dpatch by John V. Belmonte . 
+## +## DP: #563443: parsing not robust to whitespace + +@DPATCH@ +diff -urNad '--exclude=CVS' '--exclude=.svn' '--exclude=.git' '--exclude=.arch' '--exclude=.hg' '--exclude=_darcs' '--exclude=.bzr' trunk~/pyPdf/generic.py trunk/pyPdf/generic.py +--- trunk~/pyPdf/generic.py 2010-12-04 19:49:56.000000000 -0300 ++++ trunk/pyPdf/generic.py 2011-05-13 10:43:20.000000000 -0300 +@@ -35,7 +35,7 @@ + __author_email__ = "biziqe@mathieu.fenniak.net" + + import re +-from utils import readNonWhitespace, RC4_encrypt ++from utils import readNonWhitespace, readUntilWhitespace, RC4_encrypt + import filters + import utils + import decimal +@@ -81,7 +81,7 @@ + return NumberObject.readFromStream(stream) + peek = stream.read(20) + stream.seek(-len(peek), 1) # reset to start +- if re.match(r"(\d+)\s(\d+)\sR[^a-zA-Z]", peek) != None: ++ if re.match(r"(\d+)\s+(\d+)\sR[^a-zA-Z]", peek) != None: + return IndirectObject.readFromStream(stream, pdf) + else: + return NumberObject.readFromStream(stream) +@@ -183,19 +183,10 @@ + stream.write("%s %s R" % (self.idnum, self.generation)) + + def readFromStream(stream, pdf): +- idnum = "" +- while True: +- tok = stream.read(1) +- if tok.isspace(): +- break +- idnum += tok +- generation = "" +- while True: +- tok = stream.read(1) +- if tok.isspace(): +- break +- generation += tok +- r = stream.read(1) ++ idnum = readUntilWhitespace(stream) ++ readNonWhitespace(stream); stream.seek(-1, 1) ++ generation = readUntilWhitespace(stream) ++ r = readNonWhitespace(stream) + if r != "R": + raise utils.PdfReadError("error reading indirect object reference") + return IndirectObject(int(idnum), int(generation), pdf) +diff -urNad '--exclude=CVS' '--exclude=.svn' '--exclude=.git' '--exclude=.arch' '--exclude=.hg' '--exclude=_darcs' '--exclude=.bzr' trunk~/pyPdf/pdf.py trunk/pyPdf/pdf.py +--- trunk~/pyPdf/pdf.py 2010-12-04 19:49:56.000000000 -0300 ++++ trunk/pyPdf/pdf.py 2011-05-13 10:43:20.000000000 -0300 +@@ -686,10 +686,13 @@ + # tables that are off by whitespace 
bytes. + readNonWhitespace(stream); stream.seek(-1, 1) + idnum = readUntilWhitespace(stream) ++ readNonWhitespace(stream); stream.seek(-1, 1) + generation = readUntilWhitespace(stream) +- obj = stream.read(3) +- readNonWhitespace(stream) +- stream.seek(-1, 1) ++ readNonWhitespace(stream); stream.seek(-1, 1) ++ obj_token = stream.read(3) ++ if obj_token != 'obj': ++ raise utils.PdfReadError("Error reading object header") ++ readNonWhitespace(stream); stream.seek(-1, 1) + return int(idnum), int(generation) + + def cacheIndirectObject(self, generation, idnum, obj): +diff -urNad '--exclude=CVS' '--exclude=.svn' '--exclude=.git' '--exclude=.arch' '--exclude=.hg' '--exclude=_darcs' '--exclude=.bzr' trunk~/pyPdf/pdf.py.orig trunk/pyPdf/pdf.py.orig +--- trunk~/pyPdf/pdf.py.orig 1969-12-31 21:00:00.000000000 -0300 ++++ trunk/pyPdf/pdf.py.orig 2010-12-04 19:49:56.000000000 -0300 +@@ -0,0 +1,1871 @@ ++# -*- coding: utf-8 -*- ++# ++# vim: sw=4:expandtab:foldmethod=marker ++# ++# Copyright (c) 2006, Mathieu Fenniak ++# Copyright (c) 2007, Ashish Kulkarni ++# ++# All rights reserved. ++# ++# Redistribution and use in source and binary forms, with or without ++# modification, are permitted provided that the following conditions are ++# met: ++# ++# * Redistributions of source code must retain the above copyright notice, ++# this list of conditions and the following disclaimer. ++# * Redistributions in binary form must reproduce the above copyright notice, ++# this list of conditions and the following disclaimer in the documentation ++# and/or other materials provided with the distribution. ++# * The name of the author may not be used to endorse or promote products ++# derived from this software without specific prior written permission. 
++# ++# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE ++# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS ++# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN ++# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ++# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE ++# POSSIBILITY OF SUCH DAMAGE. ++ ++ ++""" ++A pure-Python PDF library with very minimal capabilities. It was designed to ++be able to split and merge PDF files by page, and that's about all it can do. ++It may be a solid base for future PDF file work in Python. ++""" ++__author__ = "Mathieu Fenniak" ++__author_email__ = "biziqe@mathieu.fenniak.net" ++ ++import math ++import struct ++from sys import version_info ++try: ++ from cStringIO import StringIO ++except ImportError: ++ from StringIO import StringIO ++ ++import filters ++import utils ++import warnings ++from generic import * ++from utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirtualList ++ ++if version_info < ( 2, 4 ): ++ from sets import ImmutableSet as frozenset ++ ++if version_info < ( 2, 5 ): ++ from md5 import md5 ++else: ++ from hashlib import md5 ++ ++## ++# This class supports writing PDF files out, given pages produced by another ++# class (typically {@link #PdfFileReader PdfFileReader}). ++class PdfFileWriter(object): ++ def __init__(self): ++ self._header = "%PDF-1.3" ++ self._objects = [] # array of indirect objects ++ ++ # The root of our page tree node. 
++ pages = DictionaryObject() ++ pages.update({ ++ NameObject("/Type"): NameObject("/Pages"), ++ NameObject("/Count"): NumberObject(0), ++ NameObject("/Kids"): ArrayObject(), ++ }) ++ self._pages = self._addObject(pages) ++ ++ # info object ++ info = DictionaryObject() ++ info.update({ ++ NameObject("/Producer"): createStringObject(u"Python PDF Library - http://pybrary.net/pyPdf/") ++ }) ++ self._info = self._addObject(info) ++ ++ # root object ++ root = DictionaryObject() ++ root.update({ ++ NameObject("/Type"): NameObject("/Catalog"), ++ NameObject("/Pages"): self._pages, ++ }) ++ self._root = self._addObject(root) ++ ++ def _addObject(self, obj): ++ self._objects.append(obj) ++ return IndirectObject(len(self._objects), 0, self) ++ ++ def getObject(self, ido): ++ if ido.pdf != self: ++ raise ValueError("pdf must be self") ++ return self._objects[ido.idnum - 1] ++ ++ ## ++ # Common method for inserting or adding a page to this PDF file. ++ # ++ # @param page The page to add to the document. This argument should be ++ # an instance of {@link #PageObject PageObject}. ++ # @param action The function which will insert the page in the dictionnary. ++ # Takes: page list, page to add. ++ def _addPage(self, page, action): ++ assert page["/Type"] == "/Page" ++ page[NameObject("/Parent")] = self._pages ++ page = self._addObject(page) ++ pages = self.getObject(self._pages) ++ action(pages["/Kids"], page) ++ pages[NameObject("/Count")] = NumberObject(pages["/Count"] + 1) ++ ++ ## ++ # Adds a page to this PDF file. The page is usually acquired from a ++ # {@link #PdfFileReader PdfFileReader} instance. ++ #

++ # Stability: Added in v1.0, will exist for all v1.x releases. ++ # ++ # @param page The page to add to the document. This argument should be ++ # an instance of {@link #PageObject PageObject}. ++ def addPage(self, page): ++ self._addPage(page, list.append) ++ ++ ## ++ # Insert a page in this PDF file. The page is usually acquired from a ++ # {@link #PdfFileReader PdfFileReader} instance. ++ # ++ # @param page The page to add to the document. This argument should be ++ # an instance of {@link #PageObject PageObject}. ++ # @param index Position at which the page will be inserted. ++ def insertPage(self, page, index=0): ++ self._addPage(page, lambda l, p: l.insert(index, p)) ++ ++ ## ++ # Retrieves a page by number from this PDF file. ++ # @return Returns a {@link #PageObject PageObject} instance. ++ def getPage(self, pageNumber): ++ pages = self.getObject(self._pages) ++ # XXX: crude hack ++ return pages["/Kids"][pageNumber].getObject() ++ ++ ## ++ # Return the number of pages. ++ # @return The number of pages. ++ def getNumPages(self): ++ pages = self.getObject(self._pages) ++ return int(pages[NameObject("/Count")]) ++ ++ ## ++ # Append a blank page to this PDF file and returns it. If no page size ++ # is specified, use the size of the last page; throw ++ # PageSizeNotDefinedError if it doesn't exist. ++ # @param width The width of the new page expressed in default user ++ # space units. ++ # @param height The height of the new page expressed in default user ++ # space units. ++ def addBlankPage(self, width=None, height=None): ++ page = PageObject.createBlankPage(self, width, height) ++ self.addPage(page) ++ return page ++ ++ ## ++ # Insert a blank page to this PDF file and returns it. If no page size ++ # is specified, use the size of the page in the given index; throw ++ # PageSizeNotDefinedError if it doesn't exist. ++ # @param width The width of the new page expressed in default user ++ # space units. 
++ # @param height The height of the new page expressed in default user ++ # space units. ++ # @param index Position to add the page. ++ def insertBlankPage(self, width=None, height=None, index=0): ++ if width is None or height is None and \ ++ (self.getNumPages() - 1) >= index: ++ oldpage = self.getPage(index) ++ width = oldpage.mediaBox.getWidth() ++ height = oldpage.mediaBox.getHeight() ++ page = PageObject.createBlankPage(self, width, height) ++ self.insertPage(page, index) ++ return page ++ ++ ## ++ # Encrypt this PDF file with the PDF Standard encryption handler. ++ # @param user_pwd The "user password", which allows for opening and reading ++ # the PDF file with the restrictions provided. ++ # @param owner_pwd The "owner password", which allows for opening the PDF ++ # files without any restrictions. By default, the owner password is the ++ # same as the user password. ++ # @param use_128bit Boolean argument as to whether to use 128bit ++ # encryption. When false, 40bit encryption will be used. By default, this ++ # flag is on. 
++ def encrypt(self, user_pwd, owner_pwd = None, use_128bit = True): ++ import time, random ++ if owner_pwd == None: ++ owner_pwd = user_pwd ++ if use_128bit: ++ V = 2 ++ rev = 3 ++ keylen = 128 / 8 ++ else: ++ V = 1 ++ rev = 2 ++ keylen = 40 / 8 ++ # permit everything: ++ P = -1 ++ O = ByteStringObject(_alg33(owner_pwd, user_pwd, rev, keylen)) ++ ID_1 = md5(repr(time.time())).digest() ++ ID_2 = md5(repr(random.random())).digest() ++ self._ID = ArrayObject((ByteStringObject(ID_1), ByteStringObject(ID_2))) ++ if rev == 2: ++ U, key = _alg34(user_pwd, O, P, ID_1) ++ else: ++ assert rev == 3 ++ U, key = _alg35(user_pwd, rev, keylen, O, P, ID_1, False) ++ encrypt = DictionaryObject() ++ encrypt[NameObject("/Filter")] = NameObject("/Standard") ++ encrypt[NameObject("/V")] = NumberObject(V) ++ if V == 2: ++ encrypt[NameObject("/Length")] = NumberObject(keylen * 8) ++ encrypt[NameObject("/R")] = NumberObject(rev) ++ encrypt[NameObject("/O")] = ByteStringObject(O) ++ encrypt[NameObject("/U")] = ByteStringObject(U) ++ encrypt[NameObject("/P")] = NumberObject(P) ++ self._encrypt = self._addObject(encrypt) ++ self._encrypt_key = key ++ ++ ## ++ # Writes the collection of pages added to this object out as a PDF file. ++ #

++ # Stability: Added in v1.0, will exist for all v1.x releases. ++ # @param stream An object to write the file to. The object must support ++ # the write method, and the tell method, similar to a file object. ++ def write(self, stream): ++ import struct ++ ++ externalReferenceMap = {} ++ ++ # PDF objects sometimes have circular references to their /Page objects ++ # inside their object tree (for example, annotations). Those will be ++ # indirect references to objects that we've recreated in this PDF. To ++ # address this problem, PageObject's store their original object ++ # reference number, and we add it to the external reference map before ++ # we sweep for indirect references. This forces self-page-referencing ++ # trees to reference the correct new object location, rather than ++ # copying in a new copy of the page object. ++ for objIndex in xrange(len(self._objects)): ++ obj = self._objects[objIndex] ++ if isinstance(obj, PageObject) and obj.indirectRef != None: ++ data = obj.indirectRef ++ if not externalReferenceMap.has_key(data.pdf): ++ externalReferenceMap[data.pdf] = {} ++ if not externalReferenceMap[data.pdf].has_key(data.generation): ++ externalReferenceMap[data.pdf][data.generation] = {} ++ externalReferenceMap[data.pdf][data.generation][data.idnum] = IndirectObject(objIndex + 1, 0, self) ++ ++ self.stack = [] ++ self._sweepIndirectReferences(externalReferenceMap, self._root) ++ del self.stack ++ ++ # Begin writing: ++ object_positions = [] ++ stream.write(self._header + "\n") ++ for i in range(len(self._objects)): ++ idnum = (i + 1) ++ obj = self._objects[i] ++ object_positions.append(stream.tell()) ++ stream.write(str(idnum) + " 0 obj\n") ++ key = None ++ if hasattr(self, "_encrypt") and idnum != self._encrypt.idnum: ++ pack1 = struct.pack(" ++# Stability: Added in v1.0, will exist for all v1.x releases. ++# ++# @param stream An object that supports the standard read and seek methods ++# similar to a file object. 
++class PdfFileReader(object): ++ def __init__(self, stream): ++ self.flattenedPages = None ++ self.resolvedObjects = {} ++ self.read(stream) ++ self.stream = stream ++ self._override_encryption = False ++ ++ ## ++ # Retrieves the PDF file's document information dictionary, if it exists. ++ # Note that some PDF files use metadata streams instead of docinfo ++ # dictionaries, and these metadata streams will not be accessed by this ++ # function. ++ #

++ # Stability: Added in v1.6, will exist for all future v1.x releases. ++ # @return Returns a {@link #DocumentInformation DocumentInformation} ++ # instance, or None if none exists. ++ def getDocumentInfo(self): ++ if not self.trailer.has_key("/Info"): ++ return None ++ obj = self.trailer['/Info'] ++ retval = DocumentInformation() ++ retval.update(obj) ++ return retval ++ ++ ## ++ # Read-only property that accesses the {@link ++ # #PdfFileReader.getDocumentInfo getDocumentInfo} function. ++ #

++ # Stability: Added in v1.7, will exist for all future v1.x releases. ++ documentInfo = property(lambda self: self.getDocumentInfo(), None, None) ++ ++ ## ++ # Retrieves XMP (Extensible Metadata Platform) data from the PDF document ++ # root. ++ #

++ # Stability: Added in v1.12, will exist for all future v1.x releases. ++ # @return Returns a {@link #generic.XmpInformation XmlInformation} ++ # instance that can be used to access XMP metadata from the document. ++ # Can also return None if no metadata was found on the document root. ++ def getXmpMetadata(self): ++ try: ++ self._override_encryption = True ++ return self.trailer["/Root"].getXmpMetadata() ++ finally: ++ self._override_encryption = False ++ ++ ## ++ # Read-only property that accesses the {@link #PdfFileReader.getXmpData ++ # getXmpData} function. ++ #

++ # Stability: Added in v1.12, will exist for all future v1.x releases. ++ xmpMetadata = property(lambda self: self.getXmpMetadata(), None, None) ++ ++ ## ++ # Calculates the number of pages in this PDF file. ++ #

++ # Stability: Added in v1.0, will exist for all v1.x releases. ++ # @return Returns an integer. ++ def getNumPages(self): ++ if self.flattenedPages == None: ++ self._flatten() ++ return len(self.flattenedPages) ++ ++ ## ++ # Read-only property that accesses the {@link #PdfFileReader.getNumPages ++ # getNumPages} function. ++ #

++ # Stability: Added in v1.7, will exist for all future v1.x releases. ++ numPages = property(lambda self: self.getNumPages(), None, None) ++ ++ ## ++ # Retrieves a page by number from this PDF file. ++ #

++ # Stability: Added in v1.0, will exist for all v1.x releases. ++ # @return Returns a {@link #PageObject PageObject} instance. ++ def getPage(self, pageNumber): ++ ## ensure that we're not trying to access an encrypted PDF ++ #assert not self.trailer.has_key("/Encrypt") ++ if self.flattenedPages == None: ++ self._flatten() ++ return self.flattenedPages[pageNumber] ++ ++ ## ++ # Read-only property that accesses the ++ # {@link #PdfFileReader.getNamedDestinations ++ # getNamedDestinations} function. ++ #

++ # Stability: Added in v1.10, will exist for all future v1.x releases. ++ namedDestinations = property(lambda self: ++ self.getNamedDestinations(), None, None) ++ ++ ## ++ # Retrieves the named destinations present in the document. ++ #

++ # Stability: Added in v1.10, will exist for all future v1.x releases. ++ # @return Returns a dict which maps names to {@link #Destination ++ # destinations}. ++ def getNamedDestinations(self, tree=None, retval=None): ++ if retval == None: ++ retval = {} ++ catalog = self.trailer["/Root"] ++ ++ # get the name tree ++ if catalog.has_key("/Dests"): ++ tree = catalog["/Dests"] ++ elif catalog.has_key("/Names"): ++ names = catalog['/Names'] ++ if names.has_key("/Dests"): ++ tree = names['/Dests'] ++ ++ if tree == None: ++ return retval ++ ++ if tree.has_key("/Kids"): ++ # recurse down the tree ++ for kid in tree["/Kids"]: ++ self.getNamedDestinations(kid.getObject(), retval) ++ ++ if tree.has_key("/Names"): ++ names = tree["/Names"] ++ for i in range(0, len(names), 2): ++ key = names[i].getObject() ++ val = names[i+1].getObject() ++ if isinstance(val, DictionaryObject) and val.has_key('/D'): ++ val = val['/D'] ++ dest = self._buildDestination(key, val) ++ if dest != None: ++ retval[key] = dest ++ ++ return retval ++ ++ ## ++ # Read-only property that accesses the {@link #PdfFileReader.getOutlines ++ # getOutlines} function. ++ #

++ # Stability: Added in v1.10, will exist for all future v1.x releases. ++ outlines = property(lambda self: self.getOutlines(), None, None) ++ ++ ## ++ # Retrieves the document outline present in the document. ++ #

++ # Stability: Added in v1.10, will exist for all future v1.x releases. ++ # @return Returns a nested list of {@link #Destination destinations}. ++ def getOutlines(self, node=None, outlines=None): ++ if outlines == None: ++ outlines = [] ++ catalog = self.trailer["/Root"] ++ ++ # get the outline dictionary and named destinations ++ if catalog.has_key("/Outlines"): ++ lines = catalog["/Outlines"] ++ if lines.has_key("/First"): ++ node = lines["/First"] ++ self._namedDests = self.getNamedDestinations() ++ ++ if node == None: ++ return outlines ++ ++ # see if there are any more outlines ++ while 1: ++ outline = self._buildOutline(node) ++ if outline: ++ outlines.append(outline) ++ ++ # check for sub-outlines ++ if node.has_key("/First"): ++ subOutlines = [] ++ self.getOutlines(node["/First"], subOutlines) ++ if subOutlines: ++ outlines.append(subOutlines) ++ ++ if not node.has_key("/Next"): ++ break ++ node = node["/Next"] ++ ++ return outlines ++ ++ def _buildDestination(self, title, array): ++ page, typ = array[0:2] ++ array = array[2:] ++ return Destination(title, page, typ, *array) ++ ++ def _buildOutline(self, node): ++ dest, title, outline = None, None, None ++ ++ if node.has_key("/A") and node.has_key("/Title"): ++ # Action, section 8.5 (only type GoTo supported) ++ title = node["/Title"] ++ action = node["/A"] ++ if action["/S"] == "/GoTo": ++ dest = action["/D"] ++ elif node.has_key("/Dest") and node.has_key("/Title"): ++ # Destination, section 8.2.1 ++ title = node["/Title"] ++ dest = node["/Dest"] ++ ++ # if destination found, then create outline ++ if dest: ++ if isinstance(dest, ArrayObject): ++ outline = self._buildDestination(title, dest) ++ elif isinstance(dest, unicode) and self._namedDests.has_key(dest): ++ outline = self._namedDests[dest] ++ outline[NameObject("/Title")] = title ++ else: ++ raise utils.PdfReadError("Unexpected destination %r" % dest) ++ return outline ++ ++ ## ++ # Read-only property that emulates a list based upon the {@link ++ # 
#PdfFileReader.getNumPages getNumPages} and {@link #PdfFileReader.getPage ++ # getPage} functions. ++ #

++ # Stability: Added in v1.7, and will exist for all future v1.x releases. ++ pages = property(lambda self: ConvertFunctionsToVirtualList(self.getNumPages, self.getPage), ++ None, None) ++ ++ def _flatten(self, pages=None, inherit=None, indirectRef=None): ++ inheritablePageAttributes = ( ++ NameObject("/Resources"), NameObject("/MediaBox"), ++ NameObject("/CropBox"), NameObject("/Rotate") ++ ) ++ if inherit == None: ++ inherit = dict() ++ if pages == None: ++ self.flattenedPages = [] ++ catalog = self.trailer["/Root"].getObject() ++ pages = catalog["/Pages"].getObject() ++ t = pages["/Type"] ++ if t == "/Pages": ++ for attr in inheritablePageAttributes: ++ if pages.has_key(attr): ++ inherit[attr] = pages[attr] ++ for page in pages["/Kids"]: ++ addt = {} ++ if isinstance(page, IndirectObject): ++ addt["indirectRef"] = page ++ self._flatten(page.getObject(), inherit, **addt) ++ elif t == "/Page": ++ for attr,value in inherit.items(): ++ # if the page has it's own value, it does not inherit the ++ # parent's value: ++ if not pages.has_key(attr): ++ pages[attr] = value ++ pageObj = PageObject(self, indirectRef) ++ pageObj.update(pages) ++ self.flattenedPages.append(pageObj) ++ ++ def getObject(self, indirectReference): ++ retval = self.resolvedObjects.get(indirectReference.generation, {}).get(indirectReference.idnum, None) ++ if retval != None: ++ return retval ++ if indirectReference.generation == 0 and \ ++ self.xref_objStm.has_key(indirectReference.idnum): ++ # indirect reference to object in object stream ++ # read the entire object stream into memory ++ stmnum,idx = self.xref_objStm[indirectReference.idnum] ++ objStm = IndirectObject(stmnum, 0, self).getObject() ++ assert objStm['/Type'] == '/ObjStm' ++ assert idx < objStm['/N'] ++ streamData = StringIO(objStm.getData()) ++ for i in range(objStm['/N']): ++ objnum = NumberObject.readFromStream(streamData) ++ readNonWhitespace(streamData) ++ streamData.seek(-1, 1) ++ offset = NumberObject.readFromStream(streamData) 
++ readNonWhitespace(streamData) ++ streamData.seek(-1, 1) ++ t = streamData.tell() ++ streamData.seek(objStm['/First']+offset, 0) ++ obj = readObject(streamData, self) ++ self.resolvedObjects[0][objnum] = obj ++ streamData.seek(t, 0) ++ return self.resolvedObjects[0][indirectReference.idnum] ++ start = self.xref[indirectReference.generation][indirectReference.idnum] ++ self.stream.seek(start, 0) ++ idnum, generation = self.readObjectHeader(self.stream) ++ assert idnum == indirectReference.idnum ++ assert generation == indirectReference.generation ++ retval = readObject(self.stream, self) ++ ++ # override encryption is used for the /Encrypt dictionary ++ if not self._override_encryption and self.isEncrypted: ++ # if we don't have the encryption key: ++ if not hasattr(self, '_decryption_key'): ++ raise Exception, "file has not been decrypted" ++ # otherwise, decrypt here... ++ import struct ++ pack1 = struct.pack("= len(array): ++ break ++ ++ def readNextEndLine(self, stream): ++ line = "" ++ while True: ++ x = stream.read(1) ++ stream.seek(-2, 1) ++ if x == '\n' or x == '\r': ++ while x == '\n' or x == '\r': ++ x = stream.read(1) ++ stream.seek(-2, 1) ++ stream.seek(1, 1) ++ break ++ else: ++ line = x + line ++ return line ++ ++ ## ++ # When using an encrypted / secured PDF file with the PDF Standard ++ # encryption handler, this function will allow the file to be decrypted. ++ # It checks the given password against the document's user password and ++ # owner password, and then stores the resulting decryption key if either ++ # password is correct. ++ #

++ # It does not matter which password was matched. Both passwords provide ++ # the correct decryption key that will allow the document to be used with ++ # this library. ++ #

++ # Stability: Added in v1.8, will exist for all future v1.x releases. ++ # ++ # @return 0 if the password failed, 1 if the password matched the user ++ # password, and 2 if the password matched the owner password. ++ # ++ # @exception NotImplementedError Document uses an unsupported encryption ++ # method. ++ def decrypt(self, password): ++ self._override_encryption = True ++ try: ++ return self._decrypt(password) ++ finally: ++ self._override_encryption = False ++ ++ def _decrypt(self, password): ++ encrypt = self.trailer['/Encrypt'].getObject() ++ if encrypt['/Filter'] != '/Standard': ++ raise NotImplementedError, "only Standard PDF encryption handler is available" ++ if not (encrypt['/V'] in (1, 2)): ++ raise NotImplementedError, "only algorithm code 1 and 2 are supported" ++ user_password, key = self._authenticateUserPassword(password) ++ if user_password: ++ self._decryption_key = key ++ return 1 ++ else: ++ rev = encrypt['/R'].getObject() ++ if rev == 2: ++ keylen = 5 ++ else: ++ keylen = encrypt['/Length'].getObject() / 8 ++ key = _alg33_1(password, rev, keylen) ++ real_O = encrypt["/O"].getObject() ++ if rev == 2: ++ userpass = utils.RC4_encrypt(key, real_O) ++ else: ++ val = real_O ++ for i in range(19, -1, -1): ++ new_key = '' ++ for l in range(len(key)): ++ new_key += chr(ord(key[l]) ^ i) ++ val = utils.RC4_encrypt(new_key, val) ++ userpass = val ++ owner_password, key = self._authenticateUserPassword(userpass) ++ if owner_password: ++ self._decryption_key = key ++ return 2 ++ return 0 ++ ++ def _authenticateUserPassword(self, password): ++ encrypt = self.trailer['/Encrypt'].getObject() ++ rev = encrypt['/R'].getObject() ++ owner_entry = encrypt['/O'].getObject().original_bytes ++ p_entry = encrypt['/P'].getObject() ++ id_entry = self.trailer['/ID'].getObject() ++ id1_entry = id_entry[0].getObject() ++ if rev == 2: ++ U, key = _alg34(password, owner_entry, p_entry, id1_entry) ++ elif rev >= 3: ++ U, key = _alg35(password, rev, ++ 
encrypt["/Length"].getObject() / 8, owner_entry, ++ p_entry, id1_entry, ++ encrypt.get("/EncryptMetadata", BooleanObject(False)).getObject()) ++ real_U = encrypt['/U'].getObject().original_bytes ++ return U == real_U, key ++ ++ def getIsEncrypted(self): ++ return self.trailer.has_key("/Encrypt") ++ ++ ## ++ # Read-only boolean property showing whether this PDF file is encrypted. ++ # Note that this property, if true, will remain true even after the {@link ++ # #PdfFileReader.decrypt decrypt} function is called. ++ isEncrypted = property(lambda self: self.getIsEncrypted(), None, None) ++ ++ ++def getRectangle(self, name, defaults): ++ retval = self.get(name) ++ if isinstance(retval, RectangleObject): ++ return retval ++ if retval == None: ++ for d in defaults: ++ retval = self.get(d) ++ if retval != None: ++ break ++ if isinstance(retval, IndirectObject): ++ retval = self.pdf.getObject(retval) ++ retval = RectangleObject(retval) ++ setRectangle(self, name, retval) ++ return retval ++ ++def setRectangle(self, name, value): ++ if not isinstance(name, NameObject): ++ name = NameObject(name) ++ self[name] = value ++ ++def deleteRectangle(self, name): ++ del self[name] ++ ++def createRectangleAccessor(name, fallback): ++ return \ ++ property( ++ lambda self: getRectangle(self, name, fallback), ++ lambda self, value: setRectangle(self, name, value), ++ lambda self: deleteRectangle(self, name) ++ ) ++ ++## ++# This class represents a single page within a PDF file. Typically this object ++# will be created by accessing the {@link #PdfFileReader.getPage getPage} ++# function of the {@link #PdfFileReader PdfFileReader} class, but it is ++# also possible to create an empty page with the createBlankPage static ++# method. ++# @param pdf PDF file the page belongs to (optional, defaults to None). 
class PageObject(DictionaryObject):
    def __init__(self, pdf=None, indirectRef=None):
        DictionaryObject.__init__(self)
        self.pdf = pdf
        # Stores the original indirect reference to this object in its source PDF
        self.indirectRef = indirectRef

    ##
    # Returns a new blank page.
    # If width or height is None, try to get the page size from the
    # last page of pdf. If pdf is None or contains no page, a
    # PageSizeNotDefinedError is raised.
    # @param pdf    PDF file the page belongs to
    # @param width  The width of the new page expressed in default user
    #               space units.
    # @param height The height of the new page expressed in default user
    #               space units.
    def createBlankPage(pdf=None, width=None, height=None):
        page = PageObject(pdf)

        # Creates a new page (cf PDF Reference  7.7.3.3)
        page[NameObject('/Type')] = NameObject('/Page')
        page[NameObject('/Parent')] = NullObject()
        page[NameObject('/Resources')] = DictionaryObject()
        if width is None or height is None:
            if pdf is None or pdf.getNumPages() <= 0:
                raise utils.PageSizeNotDefinedError()
            lastpage = pdf.getPage(pdf.getNumPages() - 1)
            width = lastpage.mediaBox.getWidth()
            height = lastpage.mediaBox.getHeight()
        page[NameObject('/MediaBox')] = RectangleObject([0, 0, width, height])

        return page
    createBlankPage = staticmethod(createBlankPage)

    ##
    # Rotates a page clockwise by increments of 90 degrees.
    #
    # Stability: Added in v1.1, will exist for all future v1.x releases.
    # @param angle Angle to rotate the page.  Must be an increment of 90 deg.
    # @return This page, so calls can be chained.
    def rotateClockwise(self, angle):
        assert angle % 90 == 0
        self._rotate(angle)
        return self

    ##
    # Rotates a page counter-clockwise by increments of 90 degrees.
    #
    # Stability: Added in v1.1, will exist for all future v1.x releases.
    # @param angle Angle to rotate the page.  Must be an increment of 90 deg.
    # @return This page, so calls can be chained.
    def rotateCounterClockwise(self, angle):
        assert angle % 90 == 0
        self._rotate(-angle)
        return self

    # Adds angle to the page's /Rotate entry (a missing entry counts as 0).
    def _rotate(self, angle):
        currentAngle = self.get("/Rotate", 0)
        self[NameObject("/Rotate")] = NumberObject(currentAngle + angle)

    # Merges one resource category (e.g. "/Font") of two resource
    # dictionaries.
    # @return A tuple (merged, renames). Entries of res2 whose key already
    # exists in res1 with a different value are stored under the key with
    # "renamed" appended; renames maps each such old key to its new name.
    def _mergeResources(res1, res2, resource):
        newRes = DictionaryObject()
        newRes.update(res1.get(resource, DictionaryObject()).getObject())
        page2Res = res2.get(resource, DictionaryObject()).getObject()
        renameRes = {}
        for key in page2Res.keys():
            if key in newRes and newRes[key] != page2Res[key]:
                newname = NameObject(key + "renamed")
                renameRes[key] = newname
                newRes[newname] = page2Res[key]
            elif key not in newRes:
                # raw_get keeps the stored value as-is rather than the
                # value returned by normal item access
                newRes[key] = page2Res.raw_get(key)
        return newRes, renameRes
    _mergeResources = staticmethod(_mergeResources)

    # Applies the rename mapping to every NameObject operand of a content
    # stream. With an empty mapping the stream is returned untouched.
    def _contentStreamRename(stream, rename, pdf):
        if not rename:
            return stream
        stream = ContentStream(stream, pdf)
        for operands, operator in stream.operations:
            if operator == "INLINE IMAGE":
                # the "operands" of an inline image are the dict built by
                # ContentStream._readInlineImage, not an operand list, so
                # integer indexing would raise KeyError
                continue
            for i, op in enumerate(operands):
                if isinstance(op, NameObject):
                    operands[i] = rename.get(op, op)
        return stream
    _contentStreamRename = staticmethod(_contentStreamRename)

    def _pushPopGS(contents, pdf):
        # adds a graphics state "push" and "pop" to the beginning and end
        # of a content stream.  This isolates it from changes such as
        # transformation matrices.
        stream = ContentStream(contents, pdf)
        stream.operations.insert(0, [[], "q"])
        stream.operations.append([[], "Q"])
        return stream
    _pushPopGS = staticmethod(_pushPopGS)

    def _addTransformationMatrix(contents, pdf, ctm):
        # adds transformation matrix at the beginning of the given
        # contents stream.
        a, b, c, d, e, f = ctm
        contents = ContentStream(contents, pdf)
        matrix = [FloatObject(a), FloatObject(b), FloatObject(c),
                  FloatObject(d), FloatObject(e), FloatObject(f)]
        contents.operations.insert(0, [matrix, " cm"])
        return contents
    _addTransformationMatrix = staticmethod(_addTransformationMatrix)

    ##
    # Returns the /Contents object, or None if it doesn't exist.
    # /Contents is optional, as described in PDF Reference  7.7.3.3
    def getContents(self):
        if "/Contents" in self:
            return self["/Contents"].getObject()
        return None

    ##
    # Merges the content streams of two pages into one.  Resource references
    # (i.e. fonts) are maintained from both pages.  The mediabox/cropbox/etc
    # of this page are not altered.  The parameter page's content stream will
    # be added to the end of this page's content stream, meaning that it will
    # be drawn after, or "on top" of this page.
    #
    # Stability: Added in v1.4, will exist for all future 1.x releases.
    # @param page2 An instance of {@link #PageObject PageObject} to be merged
    #              into this one.
    def mergePage(self, page2):
        self._mergePage(page2)

    ##
    # Actually merges the content streams of two pages into one. Resource
    # references (i.e. fonts) are maintained from both pages. The
    # mediabox/cropbox/etc of this page are not altered. The parameter page's
    # content stream will be added to the end of this page's content stream,
    # meaning that it will be drawn after, or "on top" of this page.
    #
    # @param page2 An instance of {@link #PageObject PageObject} to be merged
    #              into this one.
    # @param page2transformation A function which applies a transformation to
    #                            the content stream of page2. Takes: page2
    #                            contents stream. Must return: new contents
    #                            stream. If omitted, the content stream will
    #                            not be modified.
    def _mergePage(self, page2, page2transformation=None):
        # First we work on merging the resource dictionaries.  This allows us
        # to find out what symbols in the content streams we might need to
        # rename.

        newResources = DictionaryObject()
        rename = {}
        originalResources = self["/Resources"].getObject()
        page2Resources = page2["/Resources"].getObject()

        for res in "/ExtGState", "/Font", "/XObject", "/ColorSpace", "/Pattern", "/Shading", "/Properties":
            new, newrename = PageObject._mergeResources(originalResources, page2Resources, res)
            if new:
                newResources[NameObject(res)] = new
                rename.update(newrename)

        # Combine /ProcSet sets.
        newResources[NameObject("/ProcSet")] = ArrayObject(
            frozenset(originalResources.get("/ProcSet", ArrayObject()).getObject()).union(
                frozenset(page2Resources.get("/ProcSet", ArrayObject()).getObject())
            )
        )

        newContentArray = ArrayObject()

        originalContent = self.getContents()
        if originalContent is not None:
            newContentArray.append(PageObject._pushPopGS(
                originalContent, self.pdf))

        page2Content = page2.getContents()
        if page2Content is not None:
            if page2transformation is not None:
                page2Content = page2transformation(page2Content)
            page2Content = PageObject._contentStreamRename(
                page2Content, rename, self.pdf)
            page2Content = PageObject._pushPopGS(page2Content, self.pdf)
            newContentArray.append(page2Content)

        self[NameObject('/Contents')] = ContentStream(newContentArray, self.pdf)
        self[NameObject('/Resources')] = newResources

    ##
    # This is similar to mergePage, but a transformation matrix is
    # applied to the merged stream.
    #
    # @param page2 An instance of {@link #PageObject PageObject} to be merged.
    # @param ctm   A 6 elements tuple containing the operands of the
    #              transformation matrix
    def mergeTransformedPage(self, page2, ctm):
        self._mergePage(page2, lambda page2Content:
            PageObject._addTransformationMatrix(page2Content, page2.pdf, ctm))

    ##
    # This is similar to mergePage, but the stream to be merged is scaled
    # by applying a transformation matrix.
    #
    # @param page2 An instance of {@link #PageObject PageObject} to be merged.
    # @param factor The scaling factor
    def mergeScaledPage(self, page2, factor):
        # CTM to scale : [ sx 0 0 sy 0 0 ]
        return self.mergeTransformedPage(page2, [factor, 0,
                                                 0,      factor,
                                                 0,      0])

    ##
    # This is similar to mergePage, but the stream to be merged is rotated
    # by applying a transformation matrix.
    #
    # @param page2 An instance of {@link #PageObject PageObject} to be merged.
    # @param rotation The angle of the rotation, in degrees
    def mergeRotatedPage(self, page2, rotation):
        rotation = math.radians(rotation)
        return self.mergeTransformedPage(page2,
            [math.cos(rotation),  math.sin(rotation),
             -math.sin(rotation), math.cos(rotation),
             0,                   0])

    ##
    # This is similar to mergePage, but the stream to be merged is translated
    # by applying a transformation matrix.
    #
    # @param page2 An instance of {@link #PageObject PageObject} to be merged.
    # @param tx    The translation on X axis
    # @param ty    The translation on Y axis
    def mergeTranslatedPage(self, page2, tx, ty):
        return self.mergeTransformedPage(page2, [1,  0,
                                                 0,  1,
                                                 tx, ty])

    ##
    # This is similar to mergePage, but the stream to be merged is rotated
    # and scaled by applying a transformation matrix.
    #
    # @param page2 An instance of {@link #PageObject PageObject} to be merged.
    # @param rotation The angle of the rotation, in degrees
    # @param scale The scaling factor
    def mergeRotatedScaledPage(self, page2, rotation, scale):
        rotation = math.radians(rotation)
        rotating = [[math.cos(rotation), math.sin(rotation), 0],
                    [-math.sin(rotation), math.cos(rotation), 0],
                    [0,                  0,                  1]]
        scaling = [[scale, 0,     0],
                   [0,     scale, 0],
                   [0,     0,     1]]
        ctm = utils.matrixMultiply(rotating, scaling)

        return self.mergeTransformedPage(page2,
                                         [ctm[0][0], ctm[0][1],
                                          ctm[1][0], ctm[1][1],
                                          ctm[2][0], ctm[2][1]])

    ##
    # This is similar to mergePage, but the stream to be merged is translated
    # and scaled by applying a transformation matrix.
    #
    # @param page2 An instance of {@link #PageObject PageObject} to be merged.
    # @param scale The scaling factor
    # @param tx    The translation on X axis
    # @param ty    The translation on Y axis
    def mergeScaledTranslatedPage(self, page2, scale, tx, ty):
        translation = [[1,  0,  0],
                       [0,  1,  0],
                       [tx, ty, 1]]
        scaling = [[scale, 0,     0],
                   [0,     scale, 0],
                   [0,     0,     1]]
        ctm = utils.matrixMultiply(scaling, translation)

        return self.mergeTransformedPage(page2, [ctm[0][0], ctm[0][1],
                                                 ctm[1][0], ctm[1][1],
                                                 ctm[2][0], ctm[2][1]])

    ##
    # This is similar to mergePage, but the stream to be merged is translated,
    # rotated and scaled by applying a transformation matrix.
    #
    # @param page2 An instance of {@link #PageObject PageObject} to be merged.
    # @param rotation The angle of the rotation, in degrees
    # @param scale The scaling factor
    # @param tx    The translation on X axis
    # @param ty    The translation on Y axis
    def mergeRotatedScaledTranslatedPage(self, page2, rotation, scale, tx, ty):
        translation = [[1,  0,  0],
                       [0,  1,  0],
                       [tx, ty, 1]]
        rotation = math.radians(rotation)
        rotating = [[math.cos(rotation), math.sin(rotation), 0],
                    [-math.sin(rotation), math.cos(rotation), 0],
                    [0,                  0,                  1]]
        scaling = [[scale, 0,     0],
                   [0,     scale, 0],
                   [0,     0,     1]]
        ctm = utils.matrixMultiply(rotating, scaling)
        ctm = utils.matrixMultiply(ctm, translation)

        return self.mergeTransformedPage(page2, [ctm[0][0], ctm[0][1],
                                                 ctm[1][0], ctm[1][1],
                                                 ctm[2][0], ctm[2][1]])

    ##
    # Applies a transformation matrix to the page.
    # Pages without a /Contents entry are left unchanged.
    #
    # @param ctm   A 6 elements tuple containing the operands of the
    #              transformation matrix
    def addTransformation(self, ctm):
        originalContent = self.getContents()
        if originalContent is not None:
            newContent = PageObject._addTransformationMatrix(
                originalContent, self.pdf, ctm)
            newContent = PageObject._pushPopGS(newContent, self.pdf)
            self[NameObject('/Contents')] = newContent

    ##
    # Scales a page by the given factors by applying a transformation
    # matrix to its content and updating the page size.
    #
    # @param sx The scaling factor on horizontal axis
    # @param sy The scaling factor on vertical axis
    def scale(self, sx, sy):
        self.addTransformation([sx, 0,
                                0,  sy,
                                0,  0])
        self.mediaBox = RectangleObject([
            float(self.mediaBox.getLowerLeft_x()) * sx,
            float(self.mediaBox.getLowerLeft_y()) * sy,
            float(self.mediaBox.getUpperRight_x()) * sx,
            float(self.mediaBox.getUpperRight_y()) * sy])

    ##
    # Scales a page by the given factor by applying a transformation
    # matrix to its content and updating the page size.
    #
    # @param factor The scaling factor
    def scaleBy(self, factor):
        self.scale(factor, factor)

    ##
    # Scales a page to the specified dimensions by applying a
    # transformation matrix to its content and updating the page size.
    #
    # @param width The new width
    # @param height The new height
    def scaleTo(self, width, height):
        box = self.mediaBox
        # The vertical extent must be measured between the two y
        # coordinates (the lower-left x was used here before).  The extents
        # are converted to float, as scale() does with the same coordinates,
        # so that integer-valued coordinates cannot truncate the factor.
        sx = width / float(box.getUpperRight_x() - box.getLowerLeft_x())
        sy = height / float(box.getUpperRight_y() - box.getLowerLeft_y())
        self.scale(sx, sy)

    ##
    # Compresses the size of this page by joining all content streams and
    # applying a FlateDecode filter.
    #
    # Stability: Added in v1.6, will exist for all future v1.x releases.
    # However, it is possible that this function will perform no action if
    # content stream compression becomes "automatic" for some reason.
    def compressContentStreams(self):
        content = self.getContents()
        if content is not None:
            if not isinstance(content, ContentStream):
                content = ContentStream(content, self.pdf)
            self[NameObject("/Contents")] = content.flateEncode()

    ##
    # Locate all text drawing commands, in the order they are provided in the
    # content stream, and extract the text.  This works well for some PDF
    # files, but poorly for others, depending on the generator used.  This will
    # be refined in the future.  Do not rely on the order of text coming out of
    # this function, as it will change if this function is made more
    # sophisticated.
    #
    # Stability: Added in v1.7, will exist for all future v1.x releases.  May
    # be overhauled to provide more ordered text in the future.
    # @return a unicode string object (empty when the page has no /Contents)
    def extractText(self):
        # /Contents is optional; a page without it simply has no text
        content = self.getContents()
        if content is None:
            return u""
        if not isinstance(content, ContentStream):
            content = ContentStream(content, self.pdf)
        # Note: we check all strings are TextStringObjects.  ByteStringObjects
        # are strings where the byte->string encoding was unknown, so adding
        # them to the text here would be gibberish.
        pieces = []
        for operands, operator in content.operations:
            if operator == "Tj":
                if isinstance(operands[0], TextStringObject):
                    pieces.append(operands[0])
            elif operator == "T*":
                pieces.append("\n")
            elif operator == "'":
                # the line break is emitted even when the string is skipped
                pieces.append("\n")
                if isinstance(operands[0], TextStringObject):
                    pieces.append(operands[0])
            elif operator == '"':
                # the string is the third operand; here the line break is
                # only emitted together with the string
                if isinstance(operands[2], TextStringObject):
                    pieces.append("\n")
                    pieces.append(operands[2])
            elif operator == "TJ":
                for item in operands[0]:
                    if isinstance(item, TextStringObject):
                        pieces.append(item)
        return u"".join(pieces)

    ##
    # A rectangle (RectangleObject), expressed in default user space units,
    # defining the boundaries of the physical medium on which the page is
    # intended to be displayed or printed.
    #
    # Stability: Added in v1.4, will exist for all future v1.x releases.
    mediaBox = createRectangleAccessor("/MediaBox", ())

    ##
    # A rectangle (RectangleObject), expressed in default user space units,
    # defining the visible region of default user space.  When the page is
    # displayed or printed, its contents are to be clipped (cropped) to this
    # rectangle and then imposed on the output medium in some
    # implementation-defined manner.  Default value: same as MediaBox.
    #
    # Stability: Added in v1.4, will exist for all future v1.x releases.
    cropBox = createRectangleAccessor("/CropBox", ("/MediaBox",))

    ##
    # A rectangle (RectangleObject), expressed in default user space units,
    # defining the region to which the contents of the page should be clipped
    # when output in a production environment.
    #
    # Stability: Added in v1.4, will exist for all future v1.x releases.
    bleedBox = createRectangleAccessor("/BleedBox", ("/CropBox", "/MediaBox"))

    ##
    # A rectangle (RectangleObject), expressed in default user space units,
    # defining the intended dimensions of the finished page after trimming.
    #
    # Stability: Added in v1.4, will exist for all future v1.x releases.
    trimBox = createRectangleAccessor("/TrimBox", ("/CropBox", "/MediaBox"))

    ##
    # A rectangle (RectangleObject), expressed in default user space units,
    # defining the extent of the page's meaningful content as intended by the
    # page's creator.
    #
    # Stability: Added in v1.4, will exist for all future v1.x releases.
    artBox = createRectangleAccessor("/ArtBox", ("/CropBox", "/MediaBox"))


# A content stream parsed into a list of operations.  Each entry of
# self.operations is an (operands, operator) pair; an inline image is stored
# as ({"settings": ..., "data": ...}, "INLINE IMAGE").
class ContentStream(DecodedStreamObject):
    def __init__(self, stream, pdf):
        self.pdf = pdf
        self.operations = []
        # stream may be a StreamObject or an ArrayObject containing
        # multiple StreamObjects to be cat'd together.
        stream = stream.getObject()
        if isinstance(stream, ArrayObject):
            data = "".join([s.getObject().getData() for s in stream])
        else:
            data = stream.getData()
        self.__parseContentStream(StringIO(data))

    # Tokenizes the stream from its start and appends the operations found
    # to self.operations.
    def __parseContentStream(self, stream):
        stream.seek(0, 0)
        operands = []
        while True:
            peek = readNonWhitespace(stream)
            if peek == '':
                break
            stream.seek(-1, 1)
            if peek.isalpha() or peek == "'" or peek == '"':
                operator = ""
                while True:
                    tok = stream.read(1)
                    if tok.isspace() or tok in NameObject.delimiterCharacters:
                        stream.seek(-1, 1)
                        break
                    elif tok == '':
                        break
                    operator += tok
                if operator == "BI":
                    # begin inline image - a completely different parsing
                    # mechanism is required
                    assert operands == []
                    ii = self._readInlineImage(stream)
                    self.operations.append((ii, "INLINE IMAGE"))
                else:
                    self.operations.append((operands, operator))
                    operands = []
            elif peek == '%':
                # If we encounter a comment in the content stream, we have to
                # handle it here.  Typically, readObject will handle
                # encountering a comment -- but readObject assumes that
                # following the comment must be the object we're trying to
                # read.  In this case, it could be an operator instead.
                # An empty read means end of stream; without that check a
                # comment on the last line would never terminate the loop.
                while peek not in ('\r', '\n', ''):
                    peek = stream.read(1)
            else:
                operands.append(readObject(stream, None))

    # Reads an inline image, starting just after the "BI" operator.
    # @return A dict with the image dictionary under "settings" and the raw
    # bytes between "ID" and "EI" under "data".
    # @exception utils.PdfReadError The stream ends before the image does.
    def _readInlineImage(self, stream):
        # first read the dictionary of settings.
        settings = DictionaryObject()
        while True:
            tok = readNonWhitespace(stream)
            if tok == '':
                raise utils.PdfReadError("Unexpected end of stream in inline image")
            stream.seek(-1, 1)
            if tok == "I":
                # "ID" - begin of image data
                break
            key = readObject(stream, self.pdf)
            # skip the whitespace between key and value
            readNonWhitespace(stream)
            stream.seek(-1, 1)
            value = readObject(stream, self.pdf)
            settings[key] = value
        # left at beginning of ID
        tmp = stream.read(3)
        assert tmp[:2] == "ID"
        chunks = []
        while True:
            tok = stream.read(1)
            if tok == '':
                raise utils.PdfReadError("Unexpected end of stream in inline image")
            if tok == "E":
                following = stream.read(1)
                if following == "I":
                    break
                if following == '':
                    raise utils.PdfReadError("Unexpected end of stream in inline image")
                # a lone "E" is image data; re-read the byte after it
                stream.seek(-1, 1)
            chunks.append(tok)
        # leave the stream positioned on the next non-whitespace byte; only
        # step back when a byte was actually consumed
        if readNonWhitespace(stream) != '':
            stream.seek(-1, 1)
        return {"settings": settings, "data": "".join(chunks)}

    # Serializes self.operations back into content stream text.
    def _getData(self):
        newdata = StringIO()
        for operands, operator in self.operations:
            if operator == "INLINE IMAGE":
                newdata.write("BI")
                dicttext = StringIO()
                operands["settings"].writeToStream(dicttext, None)
                # drop the first and last two characters of the serialized
                # dictionary (presumably its "<<" / ">>" delimiters)
                newdata.write(dicttext.getvalue()[2:-2])
                newdata.write("ID ")
                newdata.write(operands["data"])
                newdata.write("EI")
            else:
                for op in operands:
                    op.writeToStream(newdata, None)
                    newdata.write(" ")
                newdata.write(operator)
            newdata.write("\n")
        return newdata.getvalue()

    # Parses value and appends its operations to self.operations.
    def _setData(self, value):
        self.__parseContentStream(StringIO(value))

    _data = property(_getData, _setData)


##
# A class representing the basic document metadata provided in a PDF File.
#

# As of pyPdf v1.10, all text properties of the document metadata have two
# properties, eg. author and author_raw.  The non-raw property will always
# return a TextStringObject, making it ideal for a case where the metadata is
# being displayed.  The raw property can sometimes return a ByteStringObject,
# if pyPdf was unable to decode the string's text encoding; this requires
# additional safety in the caller and therefore is not as commonly accessed.
class DocumentInformation(DictionaryObject):
    def __init__(self):
        DictionaryObject.__init__(self)

    # @return The value stored under key when it is a TextStringObject,
    # otherwise None (missing entry or any other value type).
    def getText(self, key):
        retval = self.get(key, None)
        if isinstance(retval, TextStringObject):
            return retval
        return None

    ##
    # Read-only property accessing the document's title.  Added in v1.6, will
    # exist for all future v1.x releases.  Modified in v1.10 to always return a
    # unicode string (TextStringObject).
    # @return A unicode string, or None if the title is not provided.
    title = property(lambda self: self.getText("/Title"))
    title_raw = property(lambda self: self.get("/Title"))

    ##
    # Read-only property accessing the document's author.  Added in v1.6, will
    # exist for all future v1.x releases.  Modified in v1.10 to always return a
    # unicode string (TextStringObject).
    # @return A unicode string, or None if the author is not provided.
    author = property(lambda self: self.getText("/Author"))
    author_raw = property(lambda self: self.get("/Author"))

    ##
    # Read-only property accessing the subject of the document.  Added in v1.6,
    # will exist for all future v1.x releases.  Modified in v1.10 to always
    # return a unicode string (TextStringObject).
    # @return A unicode string, or None if the subject is not provided.
    subject = property(lambda self: self.getText("/Subject"))
    subject_raw = property(lambda self: self.get("/Subject"))

    ##
    # Read-only property accessing the document's creator.  If the document was
    # converted to PDF from another format, the name of the application (for
    # example, OpenOffice) that created the original document from which it was
    # converted.  Added in v1.6, will exist for all future v1.x releases.
    # Modified in v1.10 to always return a unicode string (TextStringObject).
    # @return A unicode string, or None if the creator is not provided.
    creator = property(lambda self: self.getText("/Creator"))
    creator_raw = property(lambda self: self.get("/Creator"))

    ##
    # Read-only property accessing the document's producer.  If the document
    # was converted to PDF from another format, the name of the application
    # (for example, OSX Quartz) that converted it to PDF.  Added in v1.6, will
    # exist for all future v1.x releases.  Modified in v1.10 to always return a
    # unicode string (TextStringObject).
    # @return A unicode string, or None if the producer is not provided.
    producer = property(lambda self: self.getText("/Producer"))
    producer_raw = property(lambda self: self.get("/Producer"))


##
# A class representing a destination within a PDF file.
# See section 8.2.1 of the PDF 1.6 reference.
# Stability: Added in v1.10, will exist for all v1.x releases.
class Destination(DictionaryObject):
    # @param title Title stored under /Title.
    # @param page  Page stored under /Page.
    # @param typ   Destination type name, e.g. "/XYZ" or "/Fit".
    # @param args  Type-specific coordinates; their number must match typ,
    # otherwise the tuple unpacking below fails.
    # @exception utils.PdfReadError typ is not a known destination type.
    def __init__(self, title, page, typ, *args):
        DictionaryObject.__init__(self)
        self[NameObject("/Title")] = title
        self[NameObject("/Page")] = page
        self[NameObject("/Type")] = typ

        # from table 8.2 of the PDF 1.6 reference.
        # The "B" (bounding box) variants take the same arguments as their
        # plain counterparts.  They were previously listed without the
        # leading slash and therefore never matched a real type name.
        if typ == "/XYZ":
            (self[NameObject("/Left")], self[NameObject("/Top")],
                self[NameObject("/Zoom")]) = args
        elif typ == "/FitR":
            (self[NameObject("/Left")], self[NameObject("/Bottom")],
                self[NameObject("/Right")], self[NameObject("/Top")]) = args
        elif typ in ["/FitH", "/FitBH"]:
            self[NameObject("/Top")], = args
        elif typ in ["/FitV", "/FitBV"]:
            self[NameObject("/Left")], = args
        elif typ in ["/Fit", "/FitB"]:
            pass
        else:
            raise utils.PdfReadError("Unknown Destination Type: %r" % typ)

    ##
    # Read-only property accessing the destination title.
    # @return A string.
    title = property(lambda self: self.get("/Title"))

    ##
    # Read-only property accessing the destination page.
    # @return An integer.
    page = property(lambda self: self.get("/Page"))

    ##
    # Read-only property accessing the destination type.
    # @return A string.
    typ = property(lambda self: self.get("/Type"))

    ##
    # Read-only property accessing the zoom factor.
    # @return A number, or None if not available.
    zoom = property(lambda self: self.get("/Zoom", None))

    ##
    # Read-only property accessing the left horizontal coordinate.
    # @return A number, or None if not available.
    left = property(lambda self: self.get("/Left", None))

    ##
    # Read-only property accessing the right horizontal coordinate.
    # @return A number, or None if not available.
    right = property(lambda self: self.get("/Right", None))

    ##
    # Read-only property accessing the top vertical coordinate.
    # @return A number, or None if not available.
    top = property(lambda self: self.get("/Top", None))

    ##
    # Read-only property accessing the bottom vertical coordinate.
    # @return A number, or None if not available.
++ bottom = property(lambda self: self.get("/Bottom", None)) ++ ++def convertToInt(d, size): ++ if size > 8: ++ raise utils.PdfReadError("invalid size in convertToInt") ++ d = "\x00\x00\x00\x00\x00\x00\x00\x00" + d ++ d = d[-8:] ++ return struct.unpack(">q", d)[0] ++ ++# ref: pdf1.8 spec section 3.5.2 algorithm 3.2 ++_encryption_padding = '\x28\xbf\x4e\x5e\x4e\x75\x8a\x41\x64\x00\x4e\x56' + \ ++ '\xff\xfa\x01\x08\x2e\x2e\x00\xb6\xd0\x68\x3e\x80\x2f\x0c' + \ ++ '\xa9\xfe\x64\x53\x69\x7a' ++ ++# Implementation of algorithm 3.2 of the PDF standard security handler, ++# section 3.5.2 of the PDF 1.6 reference. ++def _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt=True): ++ # 1. Pad or truncate the password string to exactly 32 bytes. If the ++ # password string is more than 32 bytes long, use only its first 32 bytes; ++ # if it is less than 32 bytes long, pad it by appending the required number ++ # of additional bytes from the beginning of the padding string ++ # (_encryption_padding). ++ password = (password + _encryption_padding)[:32] ++ # 2. Initialize the MD5 hash function and pass the result of step 1 as ++ # input to this function. ++ import struct ++ m = md5(password) ++ # 3. Pass the value of the encryption dictionary's /O entry to the MD5 hash ++ # function. ++ m.update(owner_entry) ++ # 4. Treat the value of the /P entry as an unsigned 4-byte integer and pass ++ # these bytes to the MD5 hash function, low-order byte first. ++ p_entry = struct.pack('= 3 and not metadata_encrypt: ++ m.update("\xff\xff\xff\xff") ++ # 7. Finish the hash. ++ md5_hash = m.digest() ++ # 8. (Revision 3 or greater) Do the following 50 times: Take the output ++ # from the previous MD5 hash and pass the first n bytes of the output as ++ # input into a new MD5 hash, where n is the number of bytes of the ++ # encryption key as defined by the value of the encryption dictionary's ++ # /Length entry. 
++ if rev >= 3: ++ for i in range(50): ++ md5_hash = md5(md5_hash[:keylen]).digest() ++ # 9. Set the encryption key to the first n bytes of the output from the ++ # final MD5 hash, where n is always 5 for revision 2 but, for revision 3 or ++ # greater, depends on the value of the encryption dictionary's /Length ++ # entry. ++ return md5_hash[:keylen] ++ ++# Implementation of algorithm 3.3 of the PDF standard security handler, ++# section 3.5.2 of the PDF 1.6 reference. ++def _alg33(owner_pwd, user_pwd, rev, keylen): ++ # steps 1 - 4 ++ key = _alg33_1(owner_pwd, rev, keylen) ++ # 5. Pad or truncate the user password string as described in step 1 of ++ # algorithm 3.2. ++ user_pwd = (user_pwd + _encryption_padding)[:32] ++ # 6. Encrypt the result of step 5, using an RC4 encryption function with ++ # the encryption key obtained in step 4. ++ val = utils.RC4_encrypt(key, user_pwd) ++ # 7. (Revision 3 or greater) Do the following 19 times: Take the output ++ # from the previous invocation of the RC4 function and pass it as input to ++ # a new invocation of the function; use an encryption key generated by ++ # taking each byte of the encryption key obtained in step 4 and performing ++ # an XOR operation between that byte and the single-byte value of the ++ # iteration counter (from 1 to 19). ++ if rev >= 3: ++ for i in range(1, 20): ++ new_key = '' ++ for l in range(len(key)): ++ new_key += chr(ord(key[l]) ^ i) ++ val = utils.RC4_encrypt(new_key, val) ++ # 8. Store the output from the final invocation of the RC4 as the value of ++ # the /O entry in the encryption dictionary. ++ return val ++ ++# Steps 1-4 of algorithm 3.3 ++def _alg33_1(password, rev, keylen): ++ # 1. Pad or truncate the owner password string as described in step 1 of ++ # algorithm 3.2. If there is no owner password, use the user password ++ # instead. ++ password = (password + _encryption_padding)[:32] ++ # 2. 
Initialize the MD5 hash function and pass the result of step 1 as ++ # input to this function. ++ m = md5(password) ++ # 3. (Revision 3 or greater) Do the following 50 times: Take the output ++ # from the previous MD5 hash and pass it as input into a new MD5 hash. ++ md5_hash = m.digest() ++ if rev >= 3: ++ for i in range(50): ++ md5_hash = md5(md5_hash).digest() ++ # 4. Create an RC4 encryption key using the first n bytes of the output ++ # from the final MD5 hash, where n is always 5 for revision 2 but, for ++ # revision 3 or greater, depends on the value of the encryption ++ # dictionary's /Length entry. ++ key = md5_hash[:keylen] ++ return key ++ ++# Implementation of algorithm 3.4 of the PDF standard security handler, ++# section 3.5.2 of the PDF 1.6 reference. ++def _alg34(password, owner_entry, p_entry, id1_entry): ++ # 1. Create an encryption key based on the user password string, as ++ # described in algorithm 3.2. ++ key = _alg32(password, 2, 5, owner_entry, p_entry, id1_entry) ++ # 2. Encrypt the 32-byte padding string shown in step 1 of algorithm 3.2, ++ # using an RC4 encryption function with the encryption key from the ++ # preceding step. ++ U = utils.RC4_encrypt(key, _encryption_padding) ++ # 3. Store the result of step 2 as the value of the /U entry in the ++ # encryption dictionary. ++ return U, key ++ ++# Implementation of algorithm 3.4 of the PDF standard security handler, ++# section 3.5.2 of the PDF 1.6 reference. ++def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt): ++ # 1. Create an encryption key based on the user password string, as ++ # described in Algorithm 3.2. ++ key = _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry) ++ # 2. Initialize the MD5 hash function and pass the 32-byte padding string ++ # shown in step 1 of Algorithm 3.2 as input to this function. ++ m = md5() ++ m.update(_encryption_padding) ++ # 3. 
Pass the first element of the file's file identifier array (the value ++ # of the ID entry in the document's trailer dictionary; see Table 3.13 on ++ # page 73) to the hash function and finish the hash. (See implementation ++ # note 25 in Appendix H.) ++ m.update(id1_entry) ++ md5_hash = m.digest() ++ # 4. Encrypt the 16-byte result of the hash, using an RC4 encryption ++ # function with the encryption key from step 1. ++ val = utils.RC4_encrypt(key, md5_hash) ++ # 5. Do the following 19 times: Take the output from the previous ++ # invocation of the RC4 function and pass it as input to a new invocation ++ # of the function; use an encryption key generated by taking each byte of ++ # the original encryption key (obtained in step 2) and performing an XOR ++ # operation between that byte and the single-byte value of the iteration ++ # counter (from 1 to 19). ++ for i in range(1, 20): ++ new_key = '' ++ for l in range(len(key)): ++ new_key += chr(ord(key[l]) ^ i) ++ val = utils.RC4_encrypt(new_key, val) ++ # 6. Append 16 bytes of arbitrary padding to the output from the final ++ # invocation of the RC4 function and store the 32-byte result as the value ++ # of the U entry in the encryption dictionary. ++ # (implementator note: I don't know what "arbitrary padding" is supposed to ++ # mean, so I have used null bytes. 
This seems to match a few other ++ # people's implementations) ++ return val + ('\x00' * 16), key ++ ++#if __name__ == "__main__": ++# output = PdfFileWriter() ++# ++# input1 = PdfFileReader(file("test\\5000-s1-05e.pdf", "rb")) ++# page1 = input1.getPage(0) ++# ++# input2 = PdfFileReader(file("test\\PDFReference16.pdf", "rb")) ++# page2 = input2.getPage(0) ++# page3 = input2.getPage(1) ++# page1.mergePage(page2) ++# page1.mergePage(page3) ++# ++# input3 = PdfFileReader(file("test\\cc-cc.pdf", "rb")) ++# page1.mergePage(input3.getPage(0)) ++# ++# page1.compressContentStreams() ++# ++# output.addPage(page1) ++# output.write(file("test\\merge-test.pdf", "wb")) ++ ++ --- python-pypdf-1.13.orig/debian/patches/01_backspace_parsed.dpatch +++ python-pypdf-1.13/debian/patches/01_backspace_parsed.dpatch @@ -0,0 +1,18 @@ +#! /bin/sh /usr/share/dpatch/dpatch-run +## 01_backspace_parsed.dpatch by Ralf Schlatterbeck +## +## DP: #511252: escape-sequence '\b' in PDF strings is incorrectly parsed as 'b' instead of Backspace. + +@DPATCH@ +diff -urNad trunk~/pyPdf/generic.py trunk/pyPdf/generic.py +--- trunk~/pyPdf/generic.py 2008-08-11 22:04:17.000000000 -0300 ++++ trunk/pyPdf/generic.py 2009-01-09 11:24:13.000000000 -0200 +@@ -299,7 +299,7 @@ + elif tok == "t": + tok = "\t" + elif tok == "b": +- tok == "\b" ++ tok = "\b" + elif tok == "f": + tok = "\f" + elif tok == "(":