pax_global_header00006660000000000000000000000064136654471460014532gustar00rootroot0000000000000052 comment=cf771f66a372da8380cb6c9fe5d9d40133dcc563 gofpdi-1.0.13/000077500000000000000000000000001366544714600130645ustar00rootroot00000000000000gofpdi-1.0.13/LICENSE000066400000000000000000000021771366544714600141000ustar00rootroot00000000000000The MIT License (MIT) Copyright (c) 2019-2020 David Barnes Copyright (c) 2017 Setasign - Jan Slabon, https://www.setasign.com Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. gofpdi-1.0.13/README.md000066400000000000000000000154401366544714600143470ustar00rootroot00000000000000# gofpdi [![MIT licensed](https://img.shields.io/badge/license-MIT-blue.svg)](https://raw.githubusercontent.com/phpdave11/gofpdi/master/LICENSE) [![Report](https://goreportcard.com/badge/github.com/phpdave11/gofpdi)](https://goreportcard.com/report/github.com/phpdave11/gofpdi) [![GoDoc](https://img.shields.io/badge/godoc-gofpdi-blue.svg)](https://godoc.org/github.com/phpdave11/gofpdi) ## Go Free PDF Document Importer gofpdi allows you to import an existing PDF into a new PDF. The following PDF generation libraries are supported: - [gopdf](https://github.com/signintech/gopdf) - [gofpdf](https://github.com/phpdave11/gofpdf) ## Acknowledgments This package’s code is derived from the [fpdi](https://github.com/Setasign/FPDI/tree/1.6.x-legacy) library created by [Jan Slabon](https://github.com/JanSlabon). [mrtsbt](https://github.com/mrtsbt) added support for reading a PDF from an `io.ReadSeeker` stream and also added support for using gofpdi concurrently. [Asher Tuggle](https://github.com/awesomeunleashed) added support for reading PDFs that have split xref tables. ## Examples ### gopdf example ```go package main import ( "github.com/signintech/gopdf" "io" "net/http" "os" ) func main() { var err error // Download a Font fontUrl := "https://github.com/google/fonts/raw/master/ofl/daysone/DaysOne-Regular.ttf" if err = DownloadFile("example-font.ttf", fontUrl); err != nil { panic(err) } // Download a PDF fileUrl := "https://tcpdf.org/files/examples/example_012.pdf" if err = DownloadFile("example-pdf.pdf", fileUrl); err != nil { panic(err) } pdf := gopdf.GoPdf{} pdf.Start(gopdf.Config{PageSize: gopdf.Rect{W: 595.28, H: 841.89}}) //595.28, 841.89 = A4 pdf.AddPage() err = pdf.AddTTFFont("daysone", "example-font.ttf") if err != nil { panic(err) } err = pdf.SetFont("daysone", "", 20) if err != nil { panic(err) } // Color the page pdf.SetLineWidth(0.1) pdf.SetFillColor(124, 252, 0) //setup fill color pdf.RectFromUpperLeftWithStyle(50, 100, 400, 600, "FD") pdf.SetFillColor(0, 0, 0) pdf.SetX(50) pdf.SetY(50) pdf.Cell(nil, "Import existing PDF into GoPDF Document") // Import page 1 tpl1 := pdf.ImportPage("example-pdf.pdf", 1, "/MediaBox") // Draw pdf onto page pdf.UseImportedTemplate(tpl1, 50, 100, 400, 0) pdf.WritePdf("example.pdf") } // DownloadFile will download a url to a local file. It's efficient because it will // write as it downloads and not load the whole file into memory. func DownloadFile(filepath string, url string) error { // Get the data resp, err := http.Get(url) if err != nil { return err } defer resp.Body.Close() // Create the file out, err := os.Create(filepath) if err != nil { return err } defer out.Close() // Write the body to file _, err = io.Copy(out, resp.Body) return err } ``` Generated PDF: [example.pdf](https://github.com/signintech/gopdf/files/3144466/example.pdf) Screenshot of PDF: ![example](https://user-images.githubusercontent.com/9421180/57180557-4c1dbd80-6e4f-11e9-8f47-9d40217805be.jpg) ### gofpdf example #1 - import PDF from file ```go package main import ( "github.com/phpdave11/gofpdf" "github.com/phpdave11/gofpdf/contrib/gofpdi" "io" "net/http" "os" ) func main() { var err error pdf := gofpdf.New("P", "mm", "A4", "") // Download a PDF fileUrl := "https://tcpdf.org/files/examples/example_026.pdf" if err = DownloadFile("example-pdf.pdf", fileUrl); err != nil { panic(err) } // Import example-pdf.pdf with gofpdi free pdf document importer tpl1 := gofpdi.ImportPage(pdf, "example-pdf.pdf", 1, "/MediaBox") pdf.AddPage() pdf.SetFillColor(200, 700, 220) pdf.Rect(20, 50, 150, 215, "F") // Draw imported template onto page gofpdi.UseImportedTemplate(pdf, tpl1, 20, 50, 150, 0) pdf.SetFont("Helvetica", "", 20) pdf.Cell(0, 0, "Import existing PDF into gofpdf document with gofpdi") err = pdf.OutputFileAndClose("example.pdf") if err != nil { panic(err) } } // DownloadFile will download a url to a local file. It's efficient because it will // write as it downloads and not load the whole file into memory. func DownloadFile(filepath string, url string) error { // Get the data resp, err := http.Get(url) if err != nil { return err } defer resp.Body.Close() // Create the file out, err := os.Create(filepath) if err != nil { return err } defer out.Close() // Write the body to file _, err = io.Copy(out, resp.Body) return err } ``` Generated PDF: [example.pdf](https://github.com/phpdave11/gofpdf/files/3178770/example.pdf) Screenshot of PDF: ![example](https://user-images.githubusercontent.com/9421180/57713804-ca8d1300-7638-11e9-9f8e-e3f803374803.jpg) ### gofpdf example #2 - import PDF from stream ```go package main import ( "bytes" "github.com/phpdave11/gofpdf" "github.com/phpdave11/gofpdf/contrib/gofpdi" "io" "io/ioutil" "net/http" ) func main() { var err error pdf := gofpdf.New("P", "mm", "A4", "") // Download a PDF into memory res, err := http.Get("https://tcpdf.org/files/examples/example_038.pdf") if err != nil { panic(err) } pdfBytes, err := ioutil.ReadAll(res.Body) res.Body.Close() if err != nil { panic(err) } // convert []byte to io.ReadSeeker rs := io.ReadSeeker(bytes.NewReader(pdfBytes)) // Import in-memory PDF stream with gofpdi free pdf document importer tpl1 := gofpdi.ImportPageFromStream(pdf, &rs, 1, "/TrimBox") pdf.AddPage() pdf.SetFillColor(200, 700, 220) pdf.Rect(20, 50, 150, 215, "F") // Draw imported template onto page gofpdi.UseImportedTemplate(pdf, tpl1, 20, 50, 150, 0) pdf.SetFont("Helvetica", "", 20) pdf.Cell(0, 0, "Import PDF stream into gofpdf document with gofpdi") err = pdf.OutputFileAndClose("example.pdf") if err != nil { panic(err) } } ``` Generated PDF: [example.pdf](https://github.com/phpdave11/gofpdi/files/3483219/example.pdf) Screenshot of PDF: ![example.jpg](https://user-images.githubusercontent.com/9421180/62728726-18b87500-b9e2-11e9-885c-7c68b7ac6222.jpg) gofpdi-1.0.13/const.go000066400000000000000000000003731366544714600145440ustar00rootroot00000000000000package gofpdi const ( PDF_TYPE_NULL = iota PDF_TYPE_NUMERIC PDF_TYPE_TOKEN PDF_TYPE_HEX PDF_TYPE_STRING PDF_TYPE_DICTIONARY PDF_TYPE_ARRAY PDF_TYPE_OBJDEC PDF_TYPE_OBJREF PDF_TYPE_OBJECT PDF_TYPE_STREAM PDF_TYPE_BOOLEAN PDF_TYPE_REAL ) gofpdi-1.0.13/go.mod000066400000000000000000000001221366544714600141650ustar00rootroot00000000000000module github.com/phpdave11/gofpdi go 1.12 require github.com/pkg/errors v0.8.1 gofpdi-1.0.13/go.sum000066400000000000000000000002411366544714600142140ustar00rootroot00000000000000github.com/pkg/errors v0.8.1 h1:iURUrRGxPUNPdy5/HRSm+Yj6okJ6UtLINN0Q9M4+h3I= github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= gofpdi-1.0.13/helper.go000066400000000000000000000052351366544714600146770ustar00rootroot00000000000000package gofpdi import ( "strings" ) // Determine if a value is numeric // Courtesy of https://github.com/syyongx/php2go/blob/master/php.go func is_numeric(val interface{}) bool { switch val.(type) { case int, int8, int16, int32, int64, uint, uint8, uint16, uint32, uint64: case float32, float64, complex64, complex128: return true case string: str := val.(string) if str == "" { return false } // Trim any whitespace str = strings.TrimSpace(str) //fmt.Println(str) if str[0] == '-' || str[0] == '+' { if len(str) == 1 { return false } str = str[1:] } // hex if len(str) > 2 && str[0] == '0' && (str[1] == 'x' || str[1] == 'X') { for _, h := range str[2:] { if !((h >= '0' && h <= '9') || (h >= 'a' && h <= 'f') || (h >= 'A' && h <= 'F')) { return false } } return true } // 0-9,Point,Scientific p, s, l := 0, 0, len(str) for i, v := range str { if v == '.' { // Point if p > 0 || s > 0 || i+1 == l { return false } p = i } else if v == 'e' || v == 'E' { // Scientific if i == 0 || s > 0 || i+1 == l { return false } s = i } else if v < '0' || v > '9' { return false } } return true } return false } func in_array(needle interface{}, hystack interface{}) bool { switch key := needle.(type) { case string: for _, item := range hystack.([]string) { if key == item { return true } } case int: for _, item := range hystack.([]int) { if key == item { return true } } case int64: for _, item := range hystack.([]int64) { if key == item { return true } } default: return false } return false } // Taken from png library // intSize is either 32 or 64. const intSize = 32 << (^uint(0) >> 63) func abs(x int) int { // m := -1 if x < 0. m := 0 otherwise. m := x >> (intSize - 1) // In two's complement representation, the negative number // of any number (except the smallest one) can be computed // by flipping all the bits and add 1. This is faster than // code with a branch. // See Hacker's Delight, section 2-4. return (x ^ m) - m } // filterPaeth applies the Paeth filter to the cdat slice. // cdat is the current row's data, pdat is the previous row's data. func filterPaeth(cdat, pdat []byte, bytesPerPixel int) { var a, b, c, pa, pb, pc int for i := 0; i < bytesPerPixel; i++ { a, c = 0, 0 for j := i; j < len(cdat); j += bytesPerPixel { b = int(pdat[j]) pa = b - c pb = a - c pc = abs(pa + pb) pa = abs(pa) pb = abs(pb) if pa <= pb && pa <= pc { // No-op. } else if pb <= pc { a = b } else { a = c } a += int(cdat[j]) a &= 0xff cdat[j] = uint8(a) c = b } } } gofpdi-1.0.13/importer.go000066400000000000000000000140271366544714600152600ustar00rootroot00000000000000package gofpdi import ( "fmt" "io" ) // The Importer class to be used by a pdf generation library type Importer struct { sourceFile string readers map[string]*PdfReader writers map[string]*PdfWriter tplMap map[int]*TplInfo tplN int writer *PdfWriter importedPages map[string]int } type TplInfo struct { SourceFile string Writer *PdfWriter TemplateId int } func (this *Importer) GetReader() *PdfReader { return this.GetReaderForFile(this.sourceFile) } func (this *Importer) GetWriter() *PdfWriter { return this.GetWriterForFile(this.sourceFile) } func (this *Importer) GetReaderForFile(file string) *PdfReader { if _, ok := this.readers[file]; ok { return this.readers[file] } return nil } func (this *Importer) GetWriterForFile(file string) *PdfWriter { if _, ok := this.writers[file]; ok { return this.writers[file] } return nil } func NewImporter() *Importer { importer := &Importer{} importer.init() return importer } func (this *Importer) init() { this.readers = make(map[string]*PdfReader, 0) this.writers = make(map[string]*PdfWriter, 0) this.tplMap = make(map[int]*TplInfo, 0) this.writer, _ = NewPdfWriter("") this.importedPages = make(map[string]int, 0) } func (this *Importer) SetSourceFile(f string) { this.sourceFile = f // If reader hasn't been instantiated, do that now if _, ok := this.readers[this.sourceFile]; !ok { reader, err := NewPdfReader(this.sourceFile) if err != nil { panic(err) } this.readers[this.sourceFile] = reader } // If writer hasn't been instantiated, do that now if _, ok := this.writers[this.sourceFile]; !ok { writer, err := NewPdfWriter("") if err != nil { panic(err) } // Make the next writer start template numbers at this.tplN writer.SetTplIdOffset(this.tplN) this.writers[this.sourceFile] = writer } } func (this *Importer) SetSourceStream(rs *io.ReadSeeker) { this.sourceFile = fmt.Sprintf("%v", rs) if _, ok := this.readers[this.sourceFile]; !ok { reader, err := NewPdfReaderFromStream(*rs) if err != nil { panic(err) } this.readers[this.sourceFile] = reader } // If writer hasn't been instantiated, do that now if _, ok := this.writers[this.sourceFile]; !ok { writer, err := NewPdfWriter("") if err != nil { panic(err) } // Make the next writer start template numbers at this.tplN writer.SetTplIdOffset(this.tplN) this.writers[this.sourceFile] = writer } } func (this *Importer) GetNumPages() int { result, err := this.GetReader().getNumPages() if err != nil { panic(err) } return result } func (this *Importer) GetPageSizes() map[int]map[string]map[string]float64 { result, err := this.GetReader().getAllPageBoxes(1.0) if err != nil { panic(err) } return result } func (this *Importer) ImportPage(pageno int, box string) int { // If page has already been imported, return existing tplN pageNameNumber := fmt.Sprintf("%s-%04d", this.sourceFile, pageno) if _, ok := this.importedPages[pageNameNumber]; ok { return this.importedPages[pageNameNumber] } res, err := this.GetWriter().ImportPage(this.GetReader(), pageno, box) if err != nil { panic(err) } // Get current template id tplN := this.tplN // Set tpl info this.tplMap[tplN] = &TplInfo{SourceFile: this.sourceFile, TemplateId: res, Writer: this.GetWriter()} // Increment template id this.tplN++ // Cache imported page tplN this.importedPages[pageNameNumber] = tplN return tplN } func (this *Importer) SetNextObjectID(objId int) { this.GetWriter().SetNextObjectID(objId) } // Put form xobjects and get back a map of template names (e.g. /GOFPDITPL1) and their object ids (int) func (this *Importer) PutFormXobjects() map[string]int { res := make(map[string]int, 0) tplNamesIds, err := this.GetWriter().PutFormXobjects(this.GetReader()) if err != nil { panic(err) } for tplName, pdfObjId := range tplNamesIds { res[tplName] = pdfObjId.id } return res } // Put form xobjects and get back a map of template names (e.g. /GOFPDITPL1) and their object ids (sha1 hash) func (this *Importer) PutFormXobjectsUnordered() map[string]string { this.GetWriter().SetUseHash(true) res := make(map[string]string, 0) tplNamesIds, err := this.GetWriter().PutFormXobjects(this.GetReader()) if err != nil { panic(err) } for tplName, pdfObjId := range tplNamesIds { res[tplName] = pdfObjId.hash } return res } // Get object ids (int) and their contents (string) func (this *Importer) GetImportedObjects() map[int]string { res := make(map[int]string, 0) pdfObjIdBytes := this.GetWriter().GetImportedObjects() for pdfObjId, bytes := range pdfObjIdBytes { res[pdfObjId.id] = string(bytes) } return res } // Get object ids (sha1 hash) and their contents ([]byte) // The contents may have references to other object hashes which will need to be replaced by the pdf generator library // The positions of the hashes (sha1 - 40 characters) can be obtained by calling GetImportedObjHashPos() func (this *Importer) GetImportedObjectsUnordered() map[string][]byte { res := make(map[string][]byte, 0) pdfObjIdBytes := this.GetWriter().GetImportedObjects() for pdfObjId, bytes := range pdfObjIdBytes { res[pdfObjId.hash] = bytes } return res } // Get the positions of the hashes (sha1 - 40 characters) within each object, to be replaced with // actual objects ids by the pdf generator library func (this *Importer) GetImportedObjHashPos() map[string]map[int]string { res := make(map[string]map[int]string, 0) pdfObjIdPosHash := this.GetWriter().GetImportedObjHashPos() for pdfObjId, posHashMap := range pdfObjIdPosHash { res[pdfObjId.hash] = posHashMap } return res } // For a given template id (returned from ImportPage), get the template name (e.g. /GOFPDITPL1) and // the 4 float64 values necessary to draw the template a x,y for a given width and height. func (this *Importer) UseTemplate(tplid int, _x float64, _y float64, _w float64, _h float64) (string, float64, float64, float64, float64) { // Look up template id in importer tpl map tplInfo := this.tplMap[tplid] return tplInfo.Writer.UseTemplate(tplInfo.TemplateId, _x, _y, _w, _h) } gofpdi-1.0.13/reader.go000066400000000000000000001220701366544714600146570ustar00rootroot00000000000000package gofpdi import ( "bufio" "bytes" "compress/zlib" "encoding/binary" "fmt" "github.com/pkg/errors" "io" "io/ioutil" "math" "os" "strconv" ) type PdfReader struct { availableBoxes []string stack []string trailer *PdfValue catalog *PdfValue pages []*PdfValue xrefPos int xref map[int]map[int]int xrefStream map[int][2]int f io.ReadSeeker nBytes int64 sourceFile string curPage int alreadyRead bool pageCount int } func NewPdfReaderFromStream(rs io.ReadSeeker) (*PdfReader, error) { length, err := rs.Seek(0, 2) if err != nil { return nil, errors.Wrapf(err, "Failed to determine stream length") } parser := &PdfReader{f: rs, nBytes: length} if err := parser.init(); err != nil { return nil, errors.Wrap(err, "Failed to initialize parser") } if err := parser.read(); err != nil { return nil, errors.Wrap(err, "Failed to read pdf from stream") } return parser, nil } func NewPdfReader(filename string) (*PdfReader, error) { var err error f, err := os.Open(filename) if err != nil { return nil, errors.Wrap(err, "Failed to open file") } info, err := f.Stat() if err != nil { return nil, errors.Wrap(err, "Failed to obtain file information") } parser := &PdfReader{f: f, sourceFile: filename, nBytes: info.Size()} if err = parser.init(); err != nil { return nil, errors.Wrap(err, "Failed to initialize parser") } if err = parser.read(); err != nil { return nil, errors.Wrap(err, "Failed to read pdf") } return parser, nil } func (this *PdfReader) init() error { this.availableBoxes = []string{"/MediaBox", "/CropBox", "/BleedBox", "/TrimBox", "/ArtBox"} this.xref = make(map[int]map[int]int, 0) this.xrefStream = make(map[int][2]int, 0) err := this.read() if err != nil { return errors.Wrap(err, "Failed to read pdf") } return nil } type PdfValue struct { Type int String string Token string Int int Real float64 Bool bool Dictionary map[string]*PdfValue Array []*PdfValue Id int NewId int Gen int Value *PdfValue Stream *PdfValue Bytes []byte } // Jump over comments func (this *PdfReader) skipComments(r *bufio.Reader) error { var err error var b byte for { b, err = r.ReadByte() if err != nil { return errors.Wrap(err, "Failed to ReadByte while skipping comments") } if b == '\n' || b == '\r' { if b == '\r' { // Peek and see if next char is \n b2, err := r.ReadByte() if err != nil { return errors.Wrap(err, "Failed to read byte") } if b2 != '\n' { r.UnreadByte() } } break } } return nil } // Advance reader so that whitespace is ignored func (this *PdfReader) skipWhitespace(r *bufio.Reader) error { var err error var b byte for { b, err = r.ReadByte() if err != nil { if err == io.EOF { break } return errors.Wrap(err, "Failed to read byte") } if b == ' ' || b == '\n' || b == '\r' || b == '\t' { continue } else { r.UnreadByte() break } } return nil } // Read a token func (this *PdfReader) readToken(r *bufio.Reader) (string, error) { var err error // If there is a token available on the stack, pop it out and return it. if len(this.stack) > 0 { var popped string popped, this.stack = this.stack[len(this.stack)-1], this.stack[:len(this.stack)-1] return popped, nil } err = this.skipWhitespace(r) if err != nil { return "", errors.Wrap(err, "Failed to skip whitespace") } b, err := r.ReadByte() if err != nil { if err == io.EOF { return "", nil } return "", errors.Wrap(err, "Failed to read byte") } switch b { case '[', ']', '(', ')': // This is either an array or literal string delimeter, return it. return string(b), nil case '<', '>': // This could either be a hex string or a dictionary delimiter. // Determine the appropriate case and return the token. nb, err := r.ReadByte() if err != nil { return "", errors.Wrap(err, "Failed to read byte") } if nb == b { return string(b) + string(nb), nil } else { r.UnreadByte() return string(b), nil } case '%': err = this.skipComments(r) if err != nil { return "", errors.Wrap(err, "Failed to skip comments") } return this.readToken(r) default: // FIXME this may not be performant to create new strings for each byte // Is it probably better to create a buffer and then convert to a string at the end. str := string(b) loop: for { b, err := r.ReadByte() if err != nil { return "", errors.Wrap(err, "Failed to read byte") } switch b { case ' ', '%', '[', ']', '<', '>', '(', ')', '\r', '\n', '\t', '/': r.UnreadByte() break loop default: str += string(b) } } return str, nil } return "", nil } // Read a value based on a token func (this *PdfReader) readValue(r *bufio.Reader, t string) (*PdfValue, error) { var err error var b byte result := &PdfValue{} result.Type = -1 result.Token = t result.Dictionary = make(map[string]*PdfValue, 0) result.Array = make([]*PdfValue, 0) switch t { case "<": // This is a hex string // Read bytes until '>' is found var s string for { b, err = r.ReadByte() if err != nil { return nil, errors.Wrap(err, "Failed to read byte") } if b != '>' { s += string(b) } else { break } } result.Type = PDF_TYPE_HEX result.String = s case "<<": // This is a dictionary // Recurse into this function until we reach the end of the dictionary. for { key, err := this.readToken(r) if err != nil { return nil, errors.Wrap(err, "Failed to read token") } if key == "" { return nil, errors.New("Token is empty") } if key == ">>" { break } // read next token newKey, err := this.readToken(r) if err != nil { return nil, errors.Wrap(err, "Failed to read token") } value, err := this.readValue(r, newKey) if err != nil { return nil, errors.Wrap(err, "Failed to read value for token: "+newKey) } if value.Type == -1 { return result, nil } // Catch missing value if value.Type == PDF_TYPE_TOKEN && value.String == ">>" { result.Type = PDF_TYPE_NULL result.Dictionary[key] = value break } // Set value in dictionary result.Dictionary[key] = value } result.Type = PDF_TYPE_DICTIONARY return result, nil case "[": // This is an array tmpResult := make([]*PdfValue, 0) // Recurse into this function until we reach the end of the array for { key, err := this.readToken(r) if err != nil { return nil, errors.Wrap(err, "Failed to read token") } if key == "" { return nil, errors.New("Token is empty") } if key == "]" { break } value, err := this.readValue(r, key) if err != nil { return nil, errors.Wrap(err, "Failed to read value for token: "+key) } if value.Type == -1 { return result, nil } tmpResult = append(tmpResult, value) } result.Type = PDF_TYPE_ARRAY result.Array = tmpResult case "(": // This is a string openBrackets := 1 // Create new buffer var buf bytes.Buffer // Read bytes until brackets are balanced for openBrackets > 0 { b, err := r.ReadByte() if err != nil { return nil, errors.Wrap(err, "Failed to read byte") } switch b { case '(': openBrackets++ case ')': openBrackets-- case '\\': nb, err := r.ReadByte() if err != nil { return nil, errors.Wrap(err, "Failed to read byte") } buf.WriteByte(b) buf.WriteByte(nb) continue } if openBrackets > 0 { buf.WriteByte(b) } } result.Type = PDF_TYPE_STRING result.String = buf.String() case "stream": return nil, errors.New("Stream not implemented") default: result.Type = PDF_TYPE_TOKEN result.Token = t if is_numeric(t) { // A numeric token. Make sure that it is not part of something else t2, err := this.readToken(r) if err != nil { return nil, errors.Wrap(err, "Failed to read token") } if t2 != "" { if is_numeric(t2) { // Two numeric tokens in a row. // In this case, we're probably in front of either an object reference // or an object specification. // Determine the case and return the data. t3, err := this.readToken(r) if err != nil { return nil, errors.Wrap(err, "Failed to read token") } if t3 != "" { switch t3 { case "obj": result.Type = PDF_TYPE_OBJDEC result.Id, _ = strconv.Atoi(t) result.Gen, _ = strconv.Atoi(t2) return result, nil case "R": result.Type = PDF_TYPE_OBJREF result.Id, _ = strconv.Atoi(t) result.Gen, _ = strconv.Atoi(t2) return result, nil } // If we get to this point, that numeric value up there was just a numeric value. // Push the extra tokens back into the stack and return the value. this.stack = append(this.stack, t3) } } this.stack = append(this.stack, t2) } if n, err := strconv.Atoi(t); err == nil { result.Type = PDF_TYPE_NUMERIC result.Int = n result.Real = float64(n) // Also assign Real value here to fix page box parsing bugs } else { result.Type = PDF_TYPE_REAL result.Real, _ = strconv.ParseFloat(t, 64) } } else if t == "true" || t == "false" { result.Type = PDF_TYPE_BOOLEAN result.Bool = t == "true" } else if t == "null" { result.Type = PDF_TYPE_NULL } else { result.Type = PDF_TYPE_TOKEN result.Token = t } } return result, nil } // Resolve a compressed object (PDF 1.5) func (this *PdfReader) resolveCompressedObject(objSpec *PdfValue) (*PdfValue, error) { var err error // Make sure object reference exists in xrefStream if _, ok := this.xrefStream[objSpec.Id]; !ok { return nil, errors.New(fmt.Sprintf("Could not find object ID %d in xref stream or xref table.", objSpec.Id)) } // Get object id and index objectId := this.xrefStream[objSpec.Id][0] objectIndex := this.xrefStream[objSpec.Id][1] // Read compressed object compressedObjSpec := &PdfValue{Type: PDF_TYPE_OBJREF, Id: objectId, Gen: 0} // Resolve compressed object compressedObj, err := this.resolveObject(compressedObjSpec) if err != nil { return nil, errors.Wrap(err, "Failed to resolve compressed object") } // Verify object type is /ObjStm if _, ok := compressedObj.Value.Dictionary["/Type"]; ok { if compressedObj.Value.Dictionary["/Type"].Token != "/ObjStm" { return nil, errors.New("Expected compressed object type to be /ObjStm") } } else { return nil, errors.New("Could not determine compressed object type.") } // Get number of sub-objects in compressed object n := compressedObj.Value.Dictionary["/N"].Int if n <= 0 { return nil, errors.New("No sub objects in compressed object") } // Get offset of first object first := compressedObj.Value.Dictionary["/First"].Int // Get length //length := compressedObj.Value.Dictionary["/Length"].Int // Check for filter filter := "" if _, ok := compressedObj.Value.Dictionary["/Filter"]; ok { filter = compressedObj.Value.Dictionary["/Filter"].Token if filter != "/FlateDecode" { return nil, errors.New("Unsupported filter - expected /FlateDecode, got: " + filter) } } if filter == "/FlateDecode" { // Decompress if filter is /FlateDecode // Uncompress zlib compressed data var out bytes.Buffer zlibReader, _ := zlib.NewReader(bytes.NewBuffer(compressedObj.Stream.Bytes)) defer zlibReader.Close() io.Copy(&out, zlibReader) // Set stream to uncompressed data compressedObj.Stream.Bytes = out.Bytes() } // Get io.Reader for bytes r := bufio.NewReader(bytes.NewBuffer(compressedObj.Stream.Bytes)) subObjId := 0 subObjPos := 0 // Read sub-object indeces and their positions within the (un)compressed object for i := 0; i < n; i++ { var token string var _objidx int var _objpos int // Read first token (object index) token, err = this.readToken(r) if err != nil { return nil, errors.Wrap(err, "Failed to read token") } // Convert line (string) into int _objidx, err = strconv.Atoi(token) if err != nil { return nil, errors.Wrap(err, "Failed to convert token into integer: "+token) } // Read first token (object index) token, err = this.readToken(r) if err != nil { return nil, errors.Wrap(err, "Failed to read token") } // Convert line (string) into int _objpos, err = strconv.Atoi(token) if err != nil { return nil, errors.Wrap(err, "Failed to convert token into integer: "+token) } if i == objectIndex { subObjId = _objidx subObjPos = _objpos } } // Now create an io.ReadSeeker rs := io.ReadSeeker(bytes.NewReader(compressedObj.Stream.Bytes)) // Determine where to seek to (sub-object position + /First) seekTo := int64(subObjPos + first) // Fast forward to the object rs.Seek(seekTo, 0) // Create a new io.Reader r = bufio.NewReader(rs) // Read token token, err := this.readToken(r) if err != nil { return nil, errors.Wrap(err, "Failed to read token") } // Read object obj, err := this.readValue(r, token) if err != nil { return nil, errors.Wrap(err, "Failed to read value for token: "+token) } result := &PdfValue{} result.Id = subObjId result.Gen = 0 result.Type = PDF_TYPE_OBJECT result.Value = obj return result, nil } func (this *PdfReader) resolveObject(objSpec *PdfValue) (*PdfValue, error) { var err error var old_pos int64 // Create new bufio.Reader r := bufio.NewReader(this.f) if objSpec.Type == PDF_TYPE_OBJREF { // This is a reference, resolve it. offset := this.xref[objSpec.Id][objSpec.Gen] if _, ok := this.xref[objSpec.Id]; !ok { // This may be a compressed object return this.resolveCompressedObject(objSpec) } // Save current file position // This is needed if you want to resolve reference while you're reading another object. // (e.g.: if you need to determine the length of a stream) old_pos, err = this.f.Seek(0, os.SEEK_CUR) if err != nil { return nil, errors.Wrap(err, "Failed to get current position of file") } // Reposition the file pointer and load the object header _, err = this.f.Seek(int64(offset), 0) if err != nil { return nil, errors.Wrap(err, "Failed to set position of file") } token, err := this.readToken(r) if err != nil { return nil, errors.Wrap(err, "Failed to read token") } obj, err := this.readValue(r, token) if err != nil { return nil, errors.Wrap(err, "Failed to read value for token: "+token) } if obj.Type != PDF_TYPE_OBJDEC { return nil, errors.New(fmt.Sprintf("Expected type to be PDF_TYPE_OBJDEC, got: %d", obj.Type)) } if obj.Id != objSpec.Id { return nil, errors.New(fmt.Sprintf("Object ID (%d) does not match ObjSpec ID (%d)", obj.Id, objSpec.Id)) } if obj.Gen != objSpec.Gen { return nil, errors.New("Object Gen does not match ObjSpec Gen") } // Read next token token, err = this.readToken(r) if err != nil { return nil, errors.Wrap(err, "Failed to read token") } // Read actual object value value, err := this.readValue(r, token) if err != nil { return nil, errors.Wrap(err, "Failed to read value for token: "+token) } // Read next token token, err = this.readToken(r) if err != nil { return nil, errors.Wrap(err, "Failed to read token") } result := &PdfValue{} result.Id = obj.Id result.Gen = obj.Gen result.Type = PDF_TYPE_OBJECT result.Value = value if token == "stream" { result.Type = PDF_TYPE_STREAM err = this.skipWhitespace(r) if err != nil { return nil, errors.Wrap(err, "Failed to skip whitespace") } // Get stream length dictionary lengthDict := value.Dictionary["/Length"] // Get number of bytes of stream length := lengthDict.Int // If lengthDict is an object reference, resolve the object and set length if lengthDict.Type == PDF_TYPE_OBJREF { lengthDict, err = this.resolveObject(lengthDict) if err != nil { return nil, errors.Wrap(err, "Failed to resolve length object of stream") } // Set length to resolved object value length = lengthDict.Value.Int } // Read length bytes bytes := make([]byte, length) // Cannot use reader.Read() because that may not read all the bytes _, err := io.ReadFull(r, bytes) if err != nil { return nil, errors.Wrap(err, "Failed to read bytes from buffer") } token, err = this.readToken(r) if err != nil { return nil, errors.Wrap(err, "Failed to read token") } if token != "endstream" { return nil, errors.New("Expected next token to be: endstream, got: " + token) } token, err = this.readToken(r) if err != nil { return nil, errors.Wrap(err, "Failed to read token") } streamObj := &PdfValue{} streamObj.Type = PDF_TYPE_STREAM streamObj.Bytes = bytes result.Stream = streamObj } if token != "endobj" { return nil, errors.New("Expected next token to be: endobj, got: " + token) } // Reposition the file pointer to previous position _, err = this.f.Seek(old_pos, 0) if err != nil { return nil, errors.Wrap(err, "Failed to set position of file") } return result, nil } else { return objSpec, nil } return &PdfValue{}, nil } // Find the xref offset (should be at the end of the PDF) func (this *PdfReader) findXref() error { var result int var err error var toRead int64 toRead = 1500 // If PDF is smaller than 1500 bytes, be sure to only read the number of bytes that are in the file fileSize := this.nBytes if fileSize < toRead { toRead = fileSize } // 0 means relative to the origin of the file, // 1 means relative to the current offset, // and 2 means relative to the end. whence := 2 // Perform seek operation _, err = this.f.Seek(-toRead, whence) if err != nil { return errors.Wrap(err, "Failed to set position of file") } // Create new bufio.Reader r := bufio.NewReader(this.f) for { // Read all tokens until "startxref" is found token, err := this.readToken(r) if err != nil { return errors.Wrap(err, "Failed to read token") } if token == "startxref" { token, err = this.readToken(r) // Probably EOF before finding startxref if err != nil { return errors.Wrap(err, "Failed to find startxref token") } // Convert line (string) into int result, err = strconv.Atoi(token) if err != nil { return errors.Wrap(err, "Failed to convert xref position into integer: "+token) } // Successfully read the xref position this.xrefPos = result break } } // Rewind file pointer whence = 0 _, err = this.f.Seek(0, whence) if err != nil { return errors.Wrap(err, "Failed to set position of file") } this.xrefPos = result return nil } // Read and parse the xref table func (this *PdfReader) readXref() error { var err error // Create new bufio.Reader r := bufio.NewReader(this.f) // Set file pointer to xref start _, err = this.f.Seek(int64(this.xrefPos), 0) if err != nil { return errors.Wrap(err, "Failed to set position of file") } // Xref should start with 'xref' t, err := this.readToken(r) if err != nil { return errors.Wrap(err, "Failed to read token") } if t != "xref" { // Maybe this is an XRef stream ... v, err := this.readValue(r, t) if err != nil { return errors.Wrap(err, "Failed to read XRef stream") } if v.Type == PDF_TYPE_OBJDEC { // Read next token t, err = this.readToken(r) if err != nil { return errors.Wrap(err, "Failed to read token") } // Read actual object value v, err := this.readValue(r, t) if err != nil { return errors.Wrap(err, "Failed to read value for token: "+t) } // If /Type is set, check to see if it is XRef if _, ok := v.Dictionary["/Type"]; ok { if v.Dictionary["/Type"].Token == "/XRef" { // Continue reading xref stream data now that it is confirmed that it is an xref stream // Check for /DecodeParms paethDecode := false if _, ok := v.Dictionary["/DecodeParms"]; ok { columns := 0 predictor := 0 if _, ok2 := v.Dictionary["/DecodeParms"].Dictionary["/Columns"]; ok2 { columns = v.Dictionary["/DecodeParms"].Dictionary["/Columns"].Int } if _, ok2 := v.Dictionary["/DecodeParms"].Dictionary["/Predictor"]; ok2 { predictor = v.Dictionary["/DecodeParms"].Dictionary["/Predictor"].Int } if columns > 4 || predictor > 12 { return errors.New("Unsupported /DecodeParms - only tested with /Columns <= 4 and /Predictor <= 12") } paethDecode = true } /* // Check to make sure field size is [1 2 1] - not yet tested with other field sizes if v.Dictionary["/W"].Array[0].Int != 1 || v.Dictionary["/W"].Array[1].Int > 4 || v.Dictionary["/W"].Array[2].Int != 1 { return errors.New(fmt.Sprintf("Unsupported field sizes in cross-reference stream dictionary: /W [%d %d %d]", v.Dictionary["/W"].Array[0].Int, v.Dictionary["/W"].Array[1].Int, v.Dictionary["/W"].Array[2].Int)) } */ index := make([]int, 2) // If /Index is not set, this is an error if _, ok := v.Dictionary["/Index"]; ok { if len(v.Dictionary["/Index"].Array) < 2 { return errors.Wrap(err, "Index array does not contain 2 elements") } index[0] = v.Dictionary["/Index"].Array[0].Int index[1] = v.Dictionary["/Index"].Array[1].Int } else { index[0] = 0 } prevXref := 0 // Check for previous xref stream if _, ok := v.Dictionary["/Prev"]; ok { prevXref = v.Dictionary["/Prev"].Int } // Set root object if _, ok := v.Dictionary["/Root"]; ok { // Just set the whole dictionary with /Root key to keep compatibiltiy with existing code this.trailer = v } else { // Don't return an error here. The trailer could be in another XRef stream. //return errors.New("Did not set root object") } startObject := index[0] err = this.skipWhitespace(r) if err != nil { return errors.Wrap(err, "Failed to skip whitespace") } // Get stream length dictionary lengthDict := v.Dictionary["/Length"] // Get number of bytes of stream length := lengthDict.Int // If lengthDict is an object reference, resolve the object and set length if lengthDict.Type == PDF_TYPE_OBJREF { lengthDict, err = this.resolveObject(lengthDict) if err != nil { return errors.Wrap(err, "Failed to resolve length object of stream") } // Set length to resolved object value length = lengthDict.Value.Int } t, err = this.readToken(r) if err != nil { return errors.Wrap(err, "Failed to read token") } if t != "stream" { return errors.New("Expected next token to be: stream, got: " + t) } err = this.skipWhitespace(r) if err != nil { return errors.Wrap(err, "Failed to skip whitespace") } // Read length bytes data := make([]byte, length) // Cannot use reader.Read() because that may not read all the bytes _, err := io.ReadFull(r, data) if err != nil { return errors.Wrap(err, "Failed to read bytes from buffer") } // Look for endstream token t, err = this.readToken(r) if err != nil { return errors.Wrap(err, "Failed to read token") } if t != "endstream" { return errors.New("Expected next token to be: endstream, got: " + t) } // Look for endobj token t, err = this.readToken(r) if err != nil { return errors.Wrap(err, "Failed to read token") } if t != "endobj" { return errors.New("Expected next token to be: endobj, got: " + t) } // Now decode zlib data b := bytes.NewReader(data) z, err := zlib.NewReader(b) if err != nil { return errors.Wrap(err, "zlib.NewReader error") } defer z.Close() p, err := ioutil.ReadAll(z) if err != nil { return errors.Wrap(err, "ioutil.ReadAll error") } objPos := 0 objGen := 0 i := startObject // Decode result with paeth algorithm var result []byte b = bytes.NewReader(p) firstFieldSize := v.Dictionary["/W"].Array[0].Int middleFieldSize := v.Dictionary["/W"].Array[1].Int lastFieldSize := v.Dictionary["/W"].Array[2].Int fieldSize := firstFieldSize + middleFieldSize + lastFieldSize if paethDecode { fieldSize++ } prevRow := make([]byte, fieldSize) for { result = make([]byte, fieldSize) _, err := io.ReadFull(b, result) if err != nil { if err == io.EOF { break } else { return errors.Wrap(err, "io.ReadFull error") } } if paethDecode { filterPaeth(result, prevRow, fieldSize) copy(prevRow, result) } objectData := make([]byte, fieldSize) if paethDecode { copy(objectData, result[1:fieldSize]) } else { copy(objectData, result[0:fieldSize]) } if objectData[0] == 1 { // Regular objects b := make([]byte, 4) copy(b[4-middleFieldSize:], objectData[1:1+middleFieldSize]) objPos = int(binary.BigEndian.Uint32(b)) objGen = int(objectData[firstFieldSize+middleFieldSize]) // Append map[int]int this.xref[i] = make(map[int]int, 1) // Set object id, generation, and position this.xref[i][objGen] = objPos } else if objectData[0] == 2 { // Compressed objects b := make([]byte, 4) copy(b[4-middleFieldSize:], objectData[1:1+middleFieldSize]) objId := int(binary.BigEndian.Uint32(b)) objIdx := int(objectData[firstFieldSize+middleFieldSize]) // object id (i) is located in StmObj (objId) at index (objIdx) this.xrefStream[i] = [2]int{objId, objIdx} } i++ } // Check for previous xref stream if prevXref > 0 { // Set xrefPos to /Prev xref this.xrefPos = prevXref // Read preivous xref xrefErr := this.readXref() if xrefErr != nil { return errors.Wrap(xrefErr, "Failed to read prev xref") } } } } return nil } return errors.New("Expected xref to start with 'xref'. Got: " + t) } for { // Next value will be the starting object id (usually 0, but not always) or the trailer t, err = this.readToken(r) if err != nil { return errors.Wrap(err, "Failed to read token") } // Check for trailer if t == "trailer" { break } // Convert token to int startObject, err := strconv.Atoi(t) if err != nil { return errors.Wrap(err, "Failed to convert start object to integer: "+t) } // Determine how many objects there are t, err = this.readToken(r) if err != nil { return errors.Wrap(err, "Failed to read token") } // Convert token to int numObject, err := strconv.Atoi(t) if err != nil { return errors.Wrap(err, "Failed to convert num object to integer: "+t) } // For all objects in xref, read object position, object generation, and status (free or new) for i := startObject; i < startObject+numObject; i++ { t, err = this.readToken(r) if err != nil { return errors.Wrap(err, "Failed to read token") } // Get object position as int objPos, err := strconv.Atoi(t) if err != nil { return errors.Wrap(err, "Failed to convert object position to integer: "+t) } t, err = this.readToken(r) if err != nil { return errors.Wrap(err, "Failed to read token") } // Get object generation as int objGen, err := strconv.Atoi(t) if err != nil { return errors.Wrap(err, "Failed to convert object generation to integer: "+t) } // Get object status (free or new) objStatus, err := this.readToken(r) if err != nil { return errors.Wrap(err, "Failed to read token") } if objStatus != "f" && objStatus != "n" { return errors.New("Expected objStatus to be 'n' or 'f', got: " + objStatus) } // Append map[int]int this.xref[i] = make(map[int]int, 1) // Set object id, generation, and position this.xref[i][objGen] = objPos } } // Read trailer dictionary t, err = this.readToken(r) if err != nil { return errors.Wrap(err, "Failed to read token") } trailer, err := this.readValue(r, t) if err != nil { return errors.Wrap(err, "Failed to read value for token: "+t) } // If /Root is set, then set trailer object so that /Root can be read later if _, ok := trailer.Dictionary["/Root"]; ok { this.trailer = trailer } // If a /Prev xref trailer is specified, parse that if tr, ok := trailer.Dictionary["/Prev"]; ok { // Resolve parent xref table this.xrefPos = tr.Int return this.readXref() } return nil } // Read root (catalog object) func (this *PdfReader) readRoot() error { var err error rootObjSpec := this.trailer.Dictionary["/Root"] // Read root (catalog) this.catalog, err = this.resolveObject(rootObjSpec) if err != nil { return errors.Wrap(err, "Failed to resolve root object") } return nil } // Read kids (pages inside a page tree) func (this *PdfReader) readKids(kids *PdfValue, r int) error { // Loop through pages and add to result for i := 0; i < len(kids.Array); i++ { page, err := this.resolveObject(kids.Array[i]) if err != nil { return errors.Wrap(err, "Failed to resolve page/pages object") } objType := page.Value.Dictionary["/Type"].Token if objType == "/Page" { // Set page and increment curPage this.pages[this.curPage] = page this.curPage++ } else if objType == "/Pages" { // Resolve kids subKids, err := this.resolveObject(page.Value.Dictionary["/Kids"]) if err != nil { return errors.Wrap(err, "Failed to resolve kids") } // Recurse into page tree err = this.readKids(subKids, r+1) if err != nil { return errors.Wrap(err, "Failed to read kids") } } else { return errors.Wrap(err, fmt.Sprintf("Unknown object type '%s'. Expected: /Pages or /Page", objType)) } } return nil } // Read all pages in PDF func (this *PdfReader) readPages() error { var err error // resolve_pages_dict pagesDict, err := this.resolveObject(this.catalog.Value.Dictionary["/Pages"]) if err != nil { return errors.Wrap(err, "Failed to resolve pages object") } // This will normally return itself kids, err := this.resolveObject(pagesDict.Value.Dictionary["/Kids"]) if err != nil { return errors.Wrap(err, "Failed to resolve kids object") } // Get number of pages pageCount, err := this.resolveObject(pagesDict.Value.Dictionary["/Count"]) if err != nil { return errors.Wrap(err, "Failed to get page count") } this.pageCount = pageCount.Int // Allocate pages this.pages = make([]*PdfValue, pageCount.Int) // Read kids err = this.readKids(kids, 0) if err != nil { return errors.Wrap(err, "Failed to read kids") } return nil } // Get references to page resources for a given page number func (this *PdfReader) getPageResources(pageno int) (*PdfValue, error) { var err error // Check to make sure page exists in pages slice if len(this.pages) < pageno { return nil, errors.New(fmt.Sprintf("Page %d does not exist!!", pageno)) } // Resolve page object page, err := this.resolveObject(this.pages[pageno-1]) if err != nil { return nil, errors.Wrap(err, "Failed to resolve page object") } // Check to see if /Resources exists in Dictionary if _, ok := page.Value.Dictionary["/Resources"]; ok { // Resolve /Resources object res, err := this.resolveObject(page.Value.Dictionary["/Resources"]) if err != nil { return nil, errors.Wrap(err, "Failed to resolve resources object") } // If type is PDF_TYPE_OBJECT, return its Value if res.Type == PDF_TYPE_OBJECT { return res.Value, nil } // Otherwise, returned the resolved object return res, nil } else { // If /Resources does not exist, check to see if /Parent exists and return that if _, ok := page.Value.Dictionary["/Parent"]; ok { // Resolve parent object res, err := this.resolveObject(page.Value.Dictionary["/Parent"]) if err != nil { return nil, errors.Wrap(err, "Failed to resolve parent object") } // If /Parent object type is PDF_TYPE_OBJECT, return its Value if res.Type == PDF_TYPE_OBJECT { return res.Value, nil } // Otherwise, return the resolved parent object return res, nil } } // Return an empty PdfValue if we got here // TODO: Improve error handling return &PdfValue{}, nil } // Get page content and return a slice of PdfValue objects func (this *PdfReader) getPageContent(objSpec *PdfValue) ([]*PdfValue, error) { var err error var content *PdfValue // Allocate slice contents := make([]*PdfValue, 0) if objSpec.Type == PDF_TYPE_OBJREF { // If objSpec is an object reference, resolve the object and append it to contents content, err = this.resolveObject(objSpec) if err != nil { return nil, errors.Wrap(err, "Failed to resolve object") } contents = append(contents, content) } else if objSpec.Type == PDF_TYPE_ARRAY { // If objSpec is an array, loop through the array and recursively get page content and append to contents for i := 0; i < len(objSpec.Array); i++ { tmpContents, err := this.getPageContent(objSpec.Array[i]) if err != nil { return nil, errors.Wrap(err, "Failed to get page content") } for j := 0; j < len(tmpContents); j++ { contents = append(contents, tmpContents[j]) } } } return contents, nil } // Get content (i.e. PDF drawing instructions) func (this *PdfReader) getContent(pageno int) (string, error) { var err error var contents []*PdfValue // Check to make sure page exists in pages slice if len(this.pages) < pageno { return "", errors.New(fmt.Sprintf("Page %d does not exist.", pageno)) } // Get page page := this.pages[pageno-1] // FIXME: This could be slow, converting []byte to string and appending many times buffer := "" // Check to make sure /Contents exists in page dictionary if _, ok := page.Value.Dictionary["/Contents"]; ok { // Get an array of page content contents, err = this.getPageContent(page.Value.Dictionary["/Contents"]) if err != nil { return "", errors.Wrap(err, "Failed to get page content") } for i := 0; i < len(contents); i++ { // Decode content if one or more /Filter is specified. // Most common filter is FlateDecode which can be uncompressed with zlib tmpBuffer, err := this.rebuildContentStream(contents[i]) if err != nil { return "", errors.Wrap(err, "Failed to rebuild content stream") } // FIXME: This is probably slow buffer += string(tmpBuffer) } } return buffer, nil } // Rebuild content stream // This will decode content if one or more /Filter (such as FlateDecode) is specified. // If there are multiple filters, they will be decoded in the order in which they were specified. func (this *PdfReader) rebuildContentStream(content *PdfValue) ([]byte, error) { var err error var tmpFilter *PdfValue // Allocate slice of PdfValue filters := make([]*PdfValue, 0) // If content has a /Filter, append it to filters slice if _, ok := content.Value.Dictionary["/Filter"]; ok { filter := content.Value.Dictionary["/Filter"] // If filter type is a reference, resolve it if filter.Type == PDF_TYPE_OBJREF { tmpFilter, err = this.resolveObject(filter) if err != nil { return nil, errors.Wrap(err, "Failed to resolve object") } filter = tmpFilter.Value } if filter.Type == PDF_TYPE_TOKEN { // If filter type is a token (e.g. FlateDecode), appent it to filters slice filters = append(filters, filter) } else if filter.Type == PDF_TYPE_ARRAY { // If filter type is an array, then there are multiple filters. Set filters variable to array value. filters = filter.Array } } // Set stream variable to content bytes stream := content.Stream.Bytes // Loop through filters and apply each filter to stream for i := 0; i < len(filters); i++ { switch filters[i].Token { case "/FlateDecode": // Uncompress zlib compressed data var out bytes.Buffer zlibReader, _ := zlib.NewReader(bytes.NewBuffer(stream)) defer zlibReader.Close() io.Copy(&out, zlibReader) // Set stream to uncompressed data stream = out.Bytes() default: return nil, errors.New("Unspported filter: " + filters[i].Token) } } return stream, nil } func (this *PdfReader) getNumPages() (int, error) { if this.pageCount == 0 { return 0, errors.New("Page count is 0") } return this.pageCount, nil } func (this *PdfReader) getAllPageBoxes(k float64) (map[int]map[string]map[string]float64, error) { var err error // Allocate result with the number of available boxes result := make(map[int]map[string]map[string]float64, len(this.pages)) for i := 1; i <= len(this.pages); i++ { result[i], err = this.getPageBoxes(i, k) if result[i] == nil { return nil, errors.Wrap(err, "Unable to get page box") } } return result, nil } // Get all page box data func (this *PdfReader) getPageBoxes(pageno int, k float64) (map[string]map[string]float64, error) { var err error // Allocate result with the number of available boxes result := make(map[string]map[string]float64, len(this.availableBoxes)) // Check to make sure page exists in pages slice if len(this.pages) < pageno { return nil, errors.New(fmt.Sprintf("Page %d does not exist?", pageno)) } // Resolve page object page, err := this.resolveObject(this.pages[pageno-1]) if err != nil { return nil, errors.New("Failed to resolve page object") } // Loop through available boxes and add to result for i := 0; i < len(this.availableBoxes); i++ { box, err := this.getPageBox(page, this.availableBoxes[i], k) if err != nil { return nil, errors.New("Failed to get page box") } result[this.availableBoxes[i]] = box } return result, nil } // Get a specific page box value (e.g. MediaBox) and return its values func (this *PdfReader) getPageBox(page *PdfValue, box_index string, k float64) (map[string]float64, error) { var err error var tmpBox *PdfValue // Allocate 8 fields in result result := make(map[string]float64, 8) // Check to make sure box_index (e.g. MediaBox) exists in page dictionary if _, ok := page.Value.Dictionary[box_index]; ok { box := page.Value.Dictionary[box_index] // If the box type is a reference, resolve it if box.Type == PDF_TYPE_OBJREF { tmpBox, err = this.resolveObject(box) if err != nil { return nil, errors.New("Failed to resolve object") } box = tmpBox.Value } if box.Type == PDF_TYPE_ARRAY { // If the box type is an array, calculate scaled value based on k result["x"] = box.Array[0].Real / k result["y"] = box.Array[1].Real / k result["w"] = math.Abs(box.Array[0].Real-box.Array[2].Real) / k result["h"] = math.Abs(box.Array[1].Real-box.Array[3].Real) / k result["llx"] = math.Min(box.Array[0].Real, box.Array[2].Real) result["lly"] = math.Min(box.Array[1].Real, box.Array[3].Real) result["urx"] = math.Max(box.Array[0].Real, box.Array[2].Real) result["ury"] = math.Max(box.Array[1].Real, box.Array[3].Real) } else { // TODO: Improve error handling return nil, errors.New("Could not get page box") } } else if _, ok := page.Value.Dictionary["/Parent"]; ok { parentObj, err := this.resolveObject(page.Value.Dictionary["/Parent"]) if err != nil { return nil, errors.Wrap(err, "Could not resolve parent object") } // If the page box is inherited from /Parent, recursively return page box of parent return this.getPageBox(parentObj, box_index, k) } return result, nil } // Get page rotation for a page number func (this *PdfReader) getPageRotation(pageno int) (*PdfValue, error) { // Check to make sure page exists in pages slice if len(this.pages) < pageno { return nil, errors.New(fmt.Sprintf("Page %d does not exist!!!!", pageno)) } return this._getPageRotation(this.pages[pageno-1]) } // Get page rotation for a page object spec func (this *PdfReader) _getPageRotation(page *PdfValue) (*PdfValue, error) { var err error // Resolve page object page, err = this.resolveObject(page) if err != nil { return nil, errors.New("Failed to resolve page object") } // Check to make sure /Rotate exists in page dictionary if _, ok := page.Value.Dictionary["/Rotate"]; ok { res, err := this.resolveObject(page.Value.Dictionary["/Rotate"]) if err != nil { return nil, errors.New("Failed to resolve rotate object") } // If the type is PDF_TYPE_OBJECT, return its value if res.Type == PDF_TYPE_OBJECT { return res.Value, nil } // Otherwise, return the object return res, nil } else { // Check to see if parent has a rotation if _, ok := page.Value.Dictionary["/Parent"]; ok { // Recursively return /Parent page rotation res, err := this._getPageRotation(page.Value.Dictionary["/Parent"]) if err != nil { return nil, errors.Wrap(err, "Failed to get page rotation for parent") } // If the type is PDF_TYPE_OBJECT, return its value if res.Type == PDF_TYPE_OBJECT { return res.Value, nil } // Otherwise, return the object return res, nil } } return &PdfValue{Int: 0}, nil } func (this *PdfReader) read() error { // Only run once if !this.alreadyRead { var err error // Find xref position err = this.findXref() if err != nil { return errors.Wrap(err, "Failed to find xref position") } // Parse xref table err = this.readXref() if err != nil { return errors.Wrap(err, "Failed to read xref table") } // Read catalog err = this.readRoot() if err != nil { return errors.Wrap(err, "Failed to read root") } // Read pages err = this.readPages() if err != nil { return errors.Wrap(err, "Failed to to read pages") } // Now that this has been read, do not read again this.alreadyRead = true } return nil } gofpdi-1.0.13/writer.go000066400000000000000000000307611366544714600147360ustar00rootroot00000000000000package gofpdi import ( "bufio" "bytes" "compress/zlib" "crypto/sha1" "encoding/hex" "fmt" "github.com/pkg/errors" "math" "os" ) type PdfWriter struct { f *os.File w *bufio.Writer r *PdfReader k float64 tpls []*PdfTemplate m int n int offsets map[int]int offset int result map[int]string // Keep track of which objects have already been written obj_stack map[int]*PdfValue don_obj_stack map[int]*PdfValue written_objs map[*PdfObjectId][]byte written_obj_pos map[*PdfObjectId]map[int]string current_obj *PdfObject current_obj_id int tpl_id_offset int use_hash bool } type PdfObjectId struct { id int hash string } type PdfObject struct { id *PdfObjectId buffer *bytes.Buffer } func (this *PdfWriter) SetTplIdOffset(n int) { this.tpl_id_offset = n } func (this *PdfWriter) Init() { this.k = 1 this.obj_stack = make(map[int]*PdfValue, 0) this.don_obj_stack = make(map[int]*PdfValue, 0) this.tpls = make([]*PdfTemplate, 0) this.written_objs = make(map[*PdfObjectId][]byte, 0) this.written_obj_pos = make(map[*PdfObjectId]map[int]string, 0) this.current_obj = new(PdfObject) } func (this *PdfWriter) SetUseHash(b bool) { this.use_hash = b } func (this *PdfWriter) SetNextObjectID(id int) { this.n = id - 1 } func NewPdfWriter(filename string) (*PdfWriter, error) { writer := &PdfWriter{} writer.Init() if filename != "" { var err error f, err := os.Create(filename) if err != nil { return nil, errors.Wrap(err, "Unable to create filename: "+filename) } writer.f = f writer.w = bufio.NewWriter(f) } return writer, nil } // Done with parsing. Now, create templates. type PdfTemplate struct { Id int Reader *PdfReader Resources *PdfValue Buffer string Box map[string]float64 Boxes map[string]map[string]float64 X float64 Y float64 W float64 H float64 Rotation int N int } func (this *PdfWriter) GetImportedObjects() map[*PdfObjectId][]byte { return this.written_objs } // For each object (uniquely identified by a sha1 hash), return the positions // of each hash within the object, to be replaced with pdf object ids (integers) func (this *PdfWriter) GetImportedObjHashPos() map[*PdfObjectId]map[int]string { return this.written_obj_pos } func (this *PdfWriter) ClearImportedObjects() { this.written_objs = make(map[*PdfObjectId][]byte, 0) } // Create a PdfTemplate object from a page number (e.g. 1) and a boxName (e.g. MediaBox) func (this *PdfWriter) ImportPage(reader *PdfReader, pageno int, boxName string) (int, error) { var err error // Set default scale to 1 this.k = 1 // Get all page boxes pageBoxes, err := reader.getPageBoxes(1, this.k) if err != nil { return -1, errors.Wrap(err, "Failed to get page boxes") } // If requested box name does not exist for this page, use an alternate box if _, ok := pageBoxes[boxName]; !ok { if boxName == "/BleedBox" || boxName == "/TrimBox" || boxName == "ArtBox" { boxName = "/CropBox" } else if boxName == "/CropBox" { boxName = "/MediaBox" } } // If the requested box name or an alternate box name cannot be found, trigger an error // TODO: Improve error handling if _, ok := pageBoxes[boxName]; !ok { return -1, errors.New("Box not found: " + boxName) } pageResources, err := reader.getPageResources(pageno) if err != nil { return -1, errors.Wrap(err, "Failed to get page resources") } content, err := reader.getContent(pageno) if err != nil { return -1, errors.Wrap(err, "Failed to get content") } // Set template values tpl := &PdfTemplate{} tpl.Reader = reader tpl.Resources = pageResources tpl.Buffer = content tpl.Box = pageBoxes[boxName] tpl.Boxes = pageBoxes tpl.X = 0 tpl.Y = 0 tpl.W = tpl.Box["w"] tpl.H = tpl.Box["h"] // Set template rotation rotation, err := reader.getPageRotation(pageno) if err != nil { return -1, errors.Wrap(err, "Failed to get page rotation") } angle := rotation.Int % 360 // Normalize angle if angle != 0 { steps := angle / 90 w := tpl.W h := tpl.H if steps%2 == 0 { tpl.W = w tpl.H = h } else { tpl.W = h tpl.H = w } if angle < 0 { angle += 360 } tpl.Rotation = angle * -1 } this.tpls = append(this.tpls, tpl) // Return last template id return len(this.tpls) - 1, nil } // Create a new object and keep track of the offset for the xref table func (this *PdfWriter) newObj(objId int, onlyNewObj bool) { if objId < 0 { this.n++ objId = this.n } if !onlyNewObj { // set current object id integer this.current_obj_id = objId // Create new PdfObject and PdfObjectId this.current_obj = new(PdfObject) this.current_obj.buffer = new(bytes.Buffer) this.current_obj.id = new(PdfObjectId) this.current_obj.id.id = objId this.current_obj.id.hash = this.shaOfInt(objId) this.written_obj_pos[this.current_obj.id] = make(map[int]string, 0) } } func (this *PdfWriter) endObj() { this.out("endobj") this.written_objs[this.current_obj.id] = this.current_obj.buffer.Bytes() this.current_obj_id = -1 } func (this *PdfWriter) shaOfInt(i int) string { hasher := sha1.New() hasher.Write([]byte(fmt.Sprintf("%s-%s", i, this.r.sourceFile))) sha := hex.EncodeToString(hasher.Sum(nil)) return sha } func (this *PdfWriter) outObjRef(objId int) { sha := this.shaOfInt(objId) // Keep track of object hash and position - to be replaced with actual object id (integer) this.written_obj_pos[this.current_obj.id][this.current_obj.buffer.Len()] = sha if this.use_hash { this.current_obj.buffer.WriteString(sha) } else { this.current_obj.buffer.WriteString(fmt.Sprintf("%d", objId)) } this.current_obj.buffer.WriteString(" 0 R ") } // Output PDF data with a newline func (this *PdfWriter) out(s string) { this.current_obj.buffer.WriteString(s) this.current_obj.buffer.WriteString("\n") } // Output PDF data func (this *PdfWriter) straightOut(s string) { this.current_obj.buffer.WriteString(s) } // Output a PdfValue func (this *PdfWriter) writeValue(value *PdfValue) { switch value.Type { case PDF_TYPE_TOKEN: this.straightOut(value.Token + " ") break case PDF_TYPE_NUMERIC: this.straightOut(fmt.Sprintf("%d", value.Int) + " ") break case PDF_TYPE_REAL: this.straightOut(fmt.Sprintf("%F", value.Real) + " ") break case PDF_TYPE_ARRAY: this.straightOut("[") for i := 0; i < len(value.Array); i++ { this.writeValue(value.Array[i]) } this.out("]") break case PDF_TYPE_DICTIONARY: this.straightOut("<<") for k, v := range value.Dictionary { this.straightOut(k + " ") this.writeValue(v) } this.straightOut(">>") break case PDF_TYPE_OBJREF: // An indirect object reference. Fill the object stack if needed. // Check to see if object already exists on the don_obj_stack. if _, ok := this.don_obj_stack[value.Id]; !ok { this.newObj(-1, true) this.obj_stack[value.Id] = &PdfValue{Type: PDF_TYPE_OBJREF, Gen: value.Gen, Id: value.Id, NewId: this.n} this.don_obj_stack[value.Id] = &PdfValue{Type: PDF_TYPE_OBJREF, Gen: value.Gen, Id: value.Id, NewId: this.n} } // Get object ID from don_obj_stack objId := this.don_obj_stack[value.Id].NewId this.outObjRef(objId) //this.out(fmt.Sprintf("%d 0 R", objId)) break case PDF_TYPE_STRING: // A string this.straightOut("(" + value.String + ")") break case PDF_TYPE_STREAM: // A stream. First, output the stream dictionary, then the stream data itself. this.writeValue(value.Value) this.out("stream") this.out(string(value.Stream.Bytes)) this.out("endstream") break case PDF_TYPE_HEX: this.straightOut("<" + value.String + ">") break case PDF_TYPE_BOOLEAN: if value.Bool { this.straightOut("true") } else { this.straightOut("false") } break case PDF_TYPE_NULL: // The null object this.straightOut("null ") break } } // Output Form XObjects (1 for each template) // returns a map of template names (e.g. /GOFPDITPL1) to PdfObjectId func (this *PdfWriter) PutFormXobjects(reader *PdfReader) (map[string]*PdfObjectId, error) { // Set current reader this.r = reader var err error var result = make(map[string]*PdfObjectId, 0) compress := true filter := "" if compress { filter = "/Filter /FlateDecode " } for i := 0; i < len(this.tpls); i++ { tpl := this.tpls[i] if tpl == nil { return nil, errors.New("Template is nil") } var p string if compress { var b bytes.Buffer w := zlib.NewWriter(&b) w.Write([]byte(tpl.Buffer)) w.Close() p = b.String() } else { p = tpl.Buffer } // Create new PDF object this.newObj(-1, false) cN := this.n // remember current "n" tpl.N = this.n // Return xobject form name and object position pdfObjId := new(PdfObjectId) pdfObjId.id = cN pdfObjId.hash = this.shaOfInt(cN) result[fmt.Sprintf("/GOFPDITPL%d", i+this.tpl_id_offset)] = pdfObjId this.out("<<" + filter + "/Type /XObject") this.out("/Subtype /Form") this.out("/FormType 1") this.out(fmt.Sprintf("/BBox [%.2F %.2F %.2F %.2F]", tpl.Box["llx"]*this.k, tpl.Box["lly"]*this.k, (tpl.Box["urx"]+tpl.X)*this.k, (tpl.Box["ury"]-tpl.Y)*this.k)) var c, s, tx, ty float64 c = 1 // Handle rotated pages if tpl.Box != nil { tx = -tpl.Box["llx"] ty = -tpl.Box["lly"] if tpl.Rotation != 0 { angle := float64(tpl.Rotation) * math.Pi / 180.0 c = math.Cos(float64(angle)) s = math.Sin(float64(angle)) switch tpl.Rotation { case -90: tx = -tpl.Box["lly"] ty = tpl.Box["urx"] break case -180: tx = tpl.Box["urx"] ty = tpl.Box["ury"] break case -270: tx = tpl.Box["ury"] ty = -tpl.Box["llx"] } } } else { tx = -tpl.Box["x"] * 2 ty = tpl.Box["y"] * 2 } tx *= this.k ty *= this.k if c != 1 || s != 0 || tx != 0 || ty != 0 { this.out(fmt.Sprintf("/Matrix [%.5F %.5F %.5F %.5F %.5F %.5F]", c, s, -s, c, tx, ty)) } // Now write resources this.out("/Resources ") if tpl.Resources != nil { this.writeValue(tpl.Resources) // "n" will be changed } else { return nil, errors.New("Template resources are empty") } nN := this.n // remember new "n" this.n = cN // reset to current "n" this.out("/Length " + fmt.Sprintf("%d", len(p)) + " >>") this.out("stream") this.out(p) this.out("endstream") this.endObj() this.n = nN // reset to new "n" // Put imported objects, starting with the ones from the XObject's Resources, // then from dependencies of those resources). err = this.putImportedObjects(reader) if err != nil { return nil, errors.Wrap(err, "Failed to put imported objects") } } return result, nil } func (this *PdfWriter) putImportedObjects(reader *PdfReader) error { var err error var nObj *PdfValue // obj_stack will have new items added to it in the inner loop, so do another loop to check for extras // TODO make the order of this the same every time for { atLeastOne := false // FIXME: How to determine number of objects before this loop? for i := 0; i < 9999; i++ { k := i v := this.obj_stack[i] if v == nil { continue } atLeastOne = true nObj, err = reader.resolveObject(v) if err != nil { return errors.Wrap(err, "Unable to resolve object") } // New object with "NewId" field this.newObj(v.NewId, false) if nObj.Type == PDF_TYPE_STREAM { this.writeValue(nObj) } else { this.writeValue(nObj.Value) } this.endObj() // Remove from stack this.obj_stack[k] = nil } if !atLeastOne { break } } return nil } // Get the calculated size of a template // If one size is given, this method calculates the other one func (this *PdfWriter) getTemplateSize(tplid int, _w float64, _h float64) map[string]float64 { result := make(map[string]float64, 2) tpl := this.tpls[tplid] w := tpl.W h := tpl.H if _w == 0 && _h == 0 { _w = w _h = h } if _w == 0 { _w = _h * w / h } if _h == 0 { _h = _w * h / w } result["w"] = _w result["h"] = _h return result } func (this *PdfWriter) UseTemplate(tplid int, _x float64, _y float64, _w float64, _h float64) (string, float64, float64, float64, float64) { tpl := this.tpls[tplid] w := tpl.W h := tpl.H _x += tpl.X _y += tpl.Y wh := this.getTemplateSize(0, _w, _h) _w = wh["w"] _h = wh["h"] tData := make(map[string]float64, 9) tData["x"] = 0.0 tData["y"] = 0.0 tData["w"] = _w tData["h"] = _h tData["scaleX"] = (_w / w) tData["scaleY"] = (_h / h) tData["tx"] = _x tData["ty"] = (0 - _y - _h) tData["lty"] = (0 - _y - _h) - (0-h)*(_h/h) return fmt.Sprintf("/GOFPDITPL%d", tplid+this.tpl_id_offset), tData["scaleX"], tData["scaleY"], tData["tx"] * this.k, tData["ty"] * this.k }