pstotext-1.9/0040775000076400007640000000000010000725401011420 5ustar rjlrjlpstotext-1.9/bundle.c0100664000076400007640000000102607777477655013103 0ustar rjlrjl/* Copyright (C) 1995, Digital Equipment Corporation. */ /* All rights reserved. */ /* See the file pstotext.txt for a full description. */ /* Last modified on Fri Jan 09 21:17:00 AEST 2004 by rjl */ /* modified on Thu Aug 1 11:32:09 PDT 1996 by mcjones */ /* rjl: Fix compiler warnings */ #include #include "bundle.h" void putbundle(BUNDLE b, FILE *f) { const char **ppLine = b; for (ppLine = b; *ppLine!=NULL; ppLine++) { fputs(*ppLine, f); } } pstotext-1.9/bundle.h0100664000076400007640000000106307777477527013107 0ustar rjlrjl/* Copyright (C) 1995, Digital Equipment Corporation. */ /* All rights reserved. */ /* See the file pstotext.txt for a full description. */ /* Last modified on Fri Jan 09 21:15:00 AEST 2004 by rjl */ /* modified on Fri Oct 11 15:35:24 PDT 1996 by mcjones */ /* rjl: Fix compiler warnings */ typedef const char *BUNDLE[]; extern void putbundle(BUNDLE b, FILE *f); /* Write bundle "b" to file "f". "b" should have been constructed from "b.ps" by the ".ps.h" rule in the pstotext Makefile. 
*/ pstotext-1.9/descrip.mms0100664000076400007640000000265406536201216013607 0ustar rjlrjl# # VMS MMK build file for PSTOTEXT # # Hunter Goatley, 27-MAY-1998 13:22 # .IFDEF __ALPHA__ CFLAGS = $(CFLAGS)/PREFIX=ALL/L_DOUBLE=64 .ELSE .IFDEF __DECC__ CFLAGS = $(CFLAGS)/PREFIX=ALL .ELSE OPTFILE = ,VAXCRTL.OPT OPTIONS = $(OPTFILE)/OPTIONS .ENDIF .ENDIF .IFDEF __DEBUG__ CDBG = /DEBUG/NOOPTIMIZE LDBG = /DEBUG .ELSE LDBG = /NOTRACEBACK .ENDIF .SUFFIXES .PS .PS.H: @ mkbundle := $sys$disk:[]mkbundle.exe - mkbundle $(MMS$SOURCE) $(MMS$TARGET) .C.OBJ: $(CC)$(CFLAGS)$(CDBG) $(MMS$SOURCE) PSTOTEXT : PSTOTEXT.EXE !PStoTEXT built MKBUNDLE.EXE : MKBUNDLE.OBJ $(OPTFILE) $(LINK)$(LINKFLAGS)$(LDBG) $(MMS$SOURCE)$(OPTIONS) MKBUNDLE.OBJ : MKBUNDLE.C @ open/write tmp strip-cr.edt @ cr[0,7] = 13 @ eof[0,7] = 26 @ write tmp "s/''cr'//:e" @ write tmp "s/''eof'//:e" @ write tmp "exit" @ close tmp @ define/user sys$output _NLA0: @ edit/edt/command=strip-cr.edt mkbundle.c @ deletee/nolog strip-cr.edt; $(CC)$(CFLAGS)$(CDBG) $(MMS$SOURCE) PSTOTEXT_OBJS = MAIN.OBJ, BUNDLE.OBJ, PTOTDLL.OBJ PSTOTEXT.EXE : $(PSTOTEXT_OBJS) $(OPTFILE) $(LINK)$(LINKFLAGS)$(LDBG) $(PSTOTEXT_OBJS) $(OPTIONS) MAIN.OBJ : MAIN.C, BUNDLE.H, OCR.H, ROT270.H, ROT90.H, PTOTDLL.H, VMS.H PTOTDLL.OBJ : PTOTDLL.C, PTOTDLL.H BUNDLE.OBJ : BUNDLE.C, BUNDLE.H OCR.H : OCR.PS, MKBUNDLE.EXE ROT90.H : ROT90.PS, MKBUNDLE.EXE ROT270.H : ROT270.PS, MKBUNDLE.EXE VAXCRTL.OPT : @ open/write tmp $(MMS$TARGET) @ write tmp "sys$share:vaxcrtl.exe/share" @ close tmp pstotext-1.9/main.c0100664000076400007640000002235107777477702012553 0ustar rjlrjl/* Copyright (C) 1995-1998, Digital Equipment Corporation. */ /* All rights reserved. */ /* See the file pstotext.txt for a full description. 
*/ /* Last modified on Fri Jan 09 21:17:00 AEST 2004 by rjl */ /* modified on Sat Jun 02 15:04:00 AEST 2001 by rjl */ /* modified on Fri Oct 16 16:27:54 PDT 1998 by mcjones */ /* modified on Thu Nov 16 13:33:13 PST 1995 by deutsch */ /* * Modifications by rjl: * Use mkstemp not tempnam. * ANSI C Function prototypes. * Fixed const warnings. * Applied debian pstotext-1.8g-6 patches. * * Modified on 27-MAY-1998 13:08 by Hunter Goatley * Ported to VMS. Various minor changes to allow it to work on * both VAX and Alpha (VAX C and DEC C). VMS pipes don't work * right, so the GS output is dumped to a temporary file that's * read, instead of reading from pipes (which is, of course, how * VMS implements pipes anyway). Also added -output option. */ #ifdef VMS #include "vms.h" #else #include #include #include #endif #include #include #include #include #include "bundle.h" #include "ocr.h" #include "rot270.h" #include "rot90.h" #include "ptotdll.h" #define BOOLEAN int #define FALSE 0 #define TRUE 1 #define LINELEN 2000 /* longest allowable line from gs ocr.ps output */ extern BUNDLE ocr, rot270, rot90; static BOOLEAN cork = FALSE; static BOOLEAN debug = FALSE; static const char *gs_cmd = "gs"; static const char *outfile = ""; static char *cmd; /* = argv[0] */ static enum { portrait, landscape, landscapeOther} orientation = portrait; static BOOLEAN bboxes = FALSE; static int explicitFiles = 0; /* count of explicit file arguments */ static void usage(void) { fprintf(stderr, "pstotext 1.9 of 2003-01-09\n"); fprintf(stderr, "Copyright (C) 1995-1998, Digital Equipment Corporation.\n"); fprintf(stderr, "Modified by Ghostgum Software Pty Ltd.\n"); fprintf(stderr, "Comments to {mcjones,birrell}@pa.dec.com\n\n"); #ifdef VMS fprintf(stderr, "VMS Comments to goathunter@madgoat.com\n\n"); #endif fprintf(stderr, "Usage: %s [option|file]...\n", cmd); fprintf(stderr, "Options:\n"); fprintf(stderr, " -cork assume Cork encoding for dvips output\n"); fprintf(stderr, " -landscape rotate 270 
degrees\n"); #ifdef VMS fprintf(stderr, " -landscapeother rotate 90 degrees\n"); #else fprintf(stderr, " -landscapeOther rotate 90 degrees\n"); #endif fprintf(stderr, " -portrait don't rotate (default)\n"); fprintf(stderr, " -bboxes output one word per line with bounding box\n"); fprintf(stderr, " -debug show Ghostscript output and error messages\n"); fprintf(stderr, " -gs \"command\" Ghostscript command\n"); fprintf(stderr, " - read from stdin (default if no files specified)\n"); fprintf(stderr, " -output file output results to \"file\" (default is stdout)\n"); } static char *make_temp(BUNDLE b) { /* Return pathname of temporary file containing bundle "b". Caller should unlink file (and, technically, free pathname). */ FILE *f; char *path = NULL; #ifdef VMS path = tempnam("SYS$SCRATCH:", ".ps2t"); #else const char *pattern = "/tmp/ps2tXXXXXX"; char *templ = (char*)malloc(strlen(pattern)+1); int fd; strcpy(templ, pattern); fd = mkstemp(templ); if (fd == -1) { fprintf(stderr, "mkstemp() failed"); exit(1); } close(fd); path = (char*)malloc(strlen(templ)+1); strcpy(path, templ); #endif f = fopen(path, "w"); if (f==NULL) {perror(cmd); exit(1);} putbundle(b, f); fclose(f); return path; } static char *ocr_path = NULL, *rotate_path = NULL; static FILE *gs = NULL; static void *instance; /* pstotext state */ #ifdef VMS static char *cmdfile = NULL, *gsoutfile = NULL; #endif static int cleanup(void) { int gsstatus, status = 0; pstotextExit(instance); if (gs!=NULL) { #ifdef VMS gsstatus = fclose(gs); #else gsstatus = pclose(gs); #endif if (WIFEXITED(gsstatus)) { if (WEXITSTATUS(gsstatus)!=0) status = 3; else if (WIFSIGNALED(gsstatus)) status = 4; } } if ((rotate_path!=NULL) && (strcmp(rotate_path, "")!=0)) { unlink(rotate_path); free(rotate_path); rotate_path = NULL; } if (ocr_path!=NULL) { unlink(ocr_path); free(ocr_path); ocr_path = NULL; } #ifdef VMS if (cmdfile!=NULL) unlink(cmdfile); if (gsoutfile!=NULL) unlink(gsoutfile); #endif return status; } static void handler(int 
x) { int status = cleanup(); if (status!=0) exit(status); #ifdef VMS exit(1); #else exit(2); #endif } static int do_it(char *path) { /* If "path" is NULL, then "stdin" should be processed. */ char *gs_cmdline; char *input; int status; char norotate[] = ""; FILE *fileout; #ifdef VMS FILE *cfile; #endif fileout = stdout; if (strlen(outfile) != 0) { #ifdef VMS fileout = fopen(outfile, "w", "rfm=var","rat=cr"); #else fileout = fopen(outfile, "w"); #endif /* VMS */ if (fileout == NULL) {perror(cmd); exit(1);} } signal(SIGINT, handler); signal(SIGHUP, handler); ocr_path = make_temp(ocr); switch (orientation) { case portrait: rotate_path = norotate; break; case landscape: rotate_path = make_temp(rot270); break; case landscapeOther: rotate_path = make_temp(rot90); break; } if ((ocr_path == NULL) || (rotate_path == NULL)) { fprintf(stderr,"No memory available\n"); cleanup(); exit(1); } if (path==NULL) { input = (char*)malloc(2); if (input == NULL) { fprintf(stderr,"No memory available\n"); cleanup(); exit(1); } strcpy(input, "-"); } else { input = (char*)malloc(strlen(path) + 6); if (input == NULL) { fprintf(stderr,"No memory available\n"); cleanup(); exit(1); } strcpy(input, "-- '"); strcat(input, path); strcat(input, "'"); } gs_cmdline = (char*)malloc(strlen(gs_cmd)+strlen(rotate_path)+ strlen(ocr_path) + strlen(input) + 128); if (gs_cmdline == NULL) { fprintf(stderr, "No memory available\n"); cleanup(); exit(1); } sprintf( gs_cmdline, #ifdef VMS "%s -r72 \"-dNODISPLAY\" \"-dFIXEDMEDIA\" \"-dDELAYBIND\" \"-dWRITESYSTEMDICT\" %s \"-dNOPAUSE\" %s %s %s", #else "%s -r72 -dNODISPLAY -dFIXEDMEDIA -dDELAYBIND -dWRITESYSTEMDICT %s -dNOPAUSE %s %s %s", #endif gs_cmd, (debug ? 
"" : "-q"), rotate_path, ocr_path, input ); if (debug) fprintf(stderr, "%s\n", gs_cmdline); #ifdef VMS cmdfile = tempnam("SYS$SCRATCH:","PS2TGS"); gsoutfile = tempnam("SYS$SCRATCH:","GSRES"); if ((cfile = fopen(cmdfile,"w")) == NULL) {perror(cmd);exit(1);} fprintf (cfile, "$ define/user sys$output %s\n", gsoutfile); fprintf (cfile, "$ %s\n", gs_cmdline); fprintf (cfile, "$ deletee/nolog %s;*\n", cmdfile); fputs ("$ exit\n", cfile); fclose (cfile); sprintf(gs_cmdline, "@%s.", cmdfile); system(gs_cmdline); if ((gs = fopen(gsoutfile, "r")) == NULL) { fprintf(stderr, "Error opening output file %s from GS command\n", gsoutfile); perror(cmd); exit(1); } #else gs = popen(gs_cmdline, "r"); if (gs==0) {perror(cmd); exit(1);} #endif status = pstotextInit(&instance); if (status!=0) { fprintf(stderr, "%s: internal error %d\n", cmd, status); exit(5); } if (cork) { status = pstotextSetCork(instance, TRUE); if (status!=0) { fprintf(stderr, "%s: internal error %d\n", cmd, status); exit(5); } } while (TRUE) { char line[LINELEN]; const char *pre, *word, *post; int llx, lly, urx, ury; if (fgets(line, LINELEN, gs)==NULL) break; if (debug) fputs(line, stderr); status = pstotextFilter( instance, line, &pre, &word, &post, &llx, &lly, &urx, &ury); if (status!=0) { fprintf(stderr, "%s: internal error %d\n", cmd, status); exit(5); } if (word!=NULL) { if (!bboxes) { fputs(pre, fileout); fputs(word, fileout); fputs(post, fileout); if ( debug ) fputc('\n', stderr); } else { if (pre) { if (*pre == ' ') pre++; fputs(pre, fileout); } fprintf(fileout, "%6d\t%6d\t%6d\t%6d\t%s\n", llx, lly, urx, ury, word); if (post) fputs(post, fileout); } } } if (fileout != stdout) fclose(fileout); status = cleanup(); if (status!=0) exit(status); return status; } int main(int argc, char *argv[]) { int i; char *arg; cmd = argv[0]; for (i = 1; i=argc) {usage(); exit(1);} gs_cmd = argv[i]; } else if (strcasecmp(arg, "-output")==0) { i++; if (i>=argc) {usage(); exit(1);} outfile = argv[i]; } else if (strcmp(arg, 
"-")==0) do_it(NULL); else if (arg[0] == '-') {usage(); exit(1);} else /* file */ { explicitFiles++; do_it(arg); } } if (explicitFiles==0) do_it(NULL); exit(0); } pstotext-1.9/Makefile0100664000076400007640000000203007777500163013077 0ustar rjlrjl# Copyright (C) 1995, Digital Equipment Corporation. # All rights reserved. # See the file pstotext.txt for a full description. # Last modified on Fri Jan 09 21:20:00 AEST 2004 by rjl (fixed dependencies) # Last modified on Wed Oct 28 08:45:54 PST 1998 by mcjones # pstotext now requires an ANSI-compatible C compiler, such as gcc. # If you absolutely must use a pre-ANSI compiler, you can try # commenting out the includes of ptotdll.h in ptotdll.c and main.c. # PMcJ 6 Sep 96 CC=gcc #CC=cc -std BUNDLE = ocr.h rot270.h rot90.h all: pstotext main.o: main.c ptotdll.h bundle.h ocr.h rot270.h rot90.h $(CC) -c $*.c ptotdll.o: ptotdll.c ptotdll.h $(CC) -c $*.c pstotext: bundle.o main.o ptotdll.o $(CC) -o pstotext main.o bundle.o ptotdll.o -lm .SUFFIXES: .ps .c.o: $(CC) -c $*.c # "Bundle" an Ascii file. 
.ps.h: echo "const char *$*[] = {" > $*.h sed -e 's/"/\\"/g' -e 's/\(.*\)/ "\1\\n",/' $*.ps >> $*.h echo " 0" >> $*.h echo "};" >> $*.h tidy: rm -f ,* .,* .emacs_[0-9]* core *~ clean: rm -f pstotext *.o core $(BUNDLE) pstotext-1.9/mkbundle.c0100664000076400007640000000231707777500431013412 0ustar rjlrjl/* mkbundle.c */ /* Created by Russell Lang, 1996-10-11 */ /* Updated by Russell Lang, 2004-01-09 to fix compiler warnings */ #include #include int usage(void) { fprintf(stderr, "Usage: mkbundle psfile hdrfile\n"); return 1; } int main(int argc, char *argv[]) { FILE *psfile, *hdrfile; char inbuf[256], outbuf[256]; char *s, *d; if (argc!=3) return usage(); if ( (psfile = fopen(argv[1], "r")) == (FILE *)NULL ) return usage(); if ( (hdrfile = fopen(argv[2], "w")) == (FILE *)NULL ) { fclose(psfile); return usage(); } strcpy(inbuf, argv[1]); strtok(inbuf, "."); fputs("const char *", hdrfile); fputs(inbuf, hdrfile); fputs("[] = {\n", hdrfile); while ( fgets(inbuf, sizeof(inbuf)-1, psfile) ) { d = outbuf; for (s=inbuf; *s; s++) { if (*s == '"') { *d++ = '\\'; *d++ = '0'; *d++ = '4'; *d++ = '2'; } else { if (*s != '\n') *d++ = *s; else { *d++ = '\\'; *d++ = '0'; *d++ = '1'; *d++ = '2'; } } } *d = '\0'; fputs(" \042", hdrfile); fputs(outbuf, hdrfile); fputs("\042,\n", hdrfile); } fputs(" 0\n};\n\n", hdrfile); fclose(psfile); fclose(hdrfile); return 0; } pstotext-1.9/mkrch.c0100664000076400007640000000247506610564716012723 0ustar rjlrjl/* mkrch.c */ /* Make resource script header */ /* Created by Russell Lang, 1996-10-11 */ #include #include int usage(void) { fprintf(stderr, "Usage: mkbundle psfile hdrfile resource_id\n"); return 1; } int main(int argc, char *argv[]) { FILE *psfile, *hdrfile; char inbuf[256], outbuf[256]; char *s, *d; if (argc!=4) return usage(); if ( (psfile = fopen(argv[1], "r")) == (FILE *)NULL ) return usage(); if ( (hdrfile = fopen(argv[2], "w")) == (FILE *)NULL ) { fclose(psfile); return usage(); } strcpy(inbuf, argv[1]); strtok(inbuf, "."); fputs("\n", 
hdrfile); #ifdef __EMX__ fputs("RCDATA ", hdrfile); fputs(argv[3], hdrfile); fputs("\nBEGIN\n", hdrfile); #else fputs(argv[3], hdrfile); fputs(" RCDATA\nBEGIN\n", hdrfile); #endif while ( fgets(inbuf, sizeof(inbuf)-1, psfile) ) { d = outbuf; for (s=inbuf; *s; s++) { if (*s == '"') { *d++ = '\\'; *d++ = '0'; *d++ = '4'; *d++ = '2'; } else { if (*s != '\n') *d++ = *s; else { *d++ = '\\'; *d++ = '0'; *d++ = '1'; *d++ = '2'; } } } *d = '\0'; fputs(" \042", hdrfile); fputs(outbuf, hdrfile); fputs("\042\n", hdrfile); } fputs(" \042\\0\\0\042\nEND\n\n", hdrfile); fclose(psfile); fclose(hdrfile); return 0; } pstotext-1.9/ocr.ps0100664000076400007640000006476407777500206012612 0ustar rjlrjl% Copyright (C) 1995, Digital Equipment Corporation. % All rights reserved. % See the file pstotext.txt for a full description. % ocr.ps, part of BuildLectern % % This is a PostScript library to send the characters rendered by a % PostScript job back on stdout. The output is intended to allow % reconstruction of the document's words and an approximation of the words' % bounding rectangles. % Last modified on Fri Jan 9 21:20:00 AEST 2004 by rjl % modified on Sat Feb 5 21:00:00 AEST 2000 by rjl % modified on Fri Oct 2 17:13:53 PDT 1998 by mcjones % modified on Thu Jan 25 15:24:37 PST 1996 by deutsch % modified on Wed May 3 15:41:30 PDT 1995 by birrell % Modified by rjl % For Ghostscript 6 or later. % Change ashow, widthshow and awidthshow to call show for each character. % Add xshow, yshow, xyshow. % Restrictions: % % This library redefines some names that are originally defined as % operators, and of course the new definitions are procedures. Some % jobs might be sensitive to this distinction. In this case, you'd % need to make "redef" create a new operator. For example, "odef". % % This library reports characters rendered by show, ashow, widthshow, % awidthshow, kshow, xshow, yshow and xyshow. % The characters are reported even if they would % be invisible in final hardcopy. 
This could happen, for example, if the % characters get clipped, or if they get overprinted, or if they are the % same color as their background, or if the font's glyph make no marks. % % PostScript doesn't specify a standard character code for the characters % being rendered; rather, the font maps small integers into glyphs. It's % not possible in general to determine a translation back into a standard % character code. This library approximates this by giving mappings from % the job's characters to indexes into a table of known glyph names. This % handles most PostScript jobs that we've encountered, but it's not an % absolute solution to the problem. % Output format: % % The "reporting coordinate system" is the device coordinate system. The % application interpreting this library's output needs to understand the % device coordinate system. % % Positions are reported in the reporting coordinate system, but with (x,y) % values multiplied by 100 and rounded to integers. Note that % positions might involve negative integers. % % The directives, whitespace and numbers are themselves encoded in ASCII. % The strings in the "S" directives are in currentfont's encoding: they % should be treated as 8-bit binary data. % % QI 6-integers % Specifies the inverse of the currentmatrix for % the current output device, after rotXXX.ps and % before the document is processed. % % QM m blx bly trx try 256-pairs % Introduces a new character metrics table used by % some font, where "m" is an integer that identifies % the metrics table in subsequent "F" directives. "m" % is small. (blx,bly) is the bottom left corner of the % font's bounding box, and (trx,try) is its top right. % The pairs are the stringwidth of each character in the % font's encoding. All of these are reported in the % font's character coordinate system, multiplied by 100 % and rounded to integers. 
The metrics % table will be referenced in a subsequent "F" % directive, which includes information mapping the % font's metrics to the initial user coordinate system. % Note that values of "m" might get reused, after a % "restore". % % QE e n n-integers Introduces a new encoding, where "e" is an integer % that identifies the encoding in subsequent "F" % directives. "e" is small. "e" is followed by % an integer "n" and then by a sequence of % exactly n integers. The i'th integer % specifies the glyph for the i'th entry of the % font's encoding vector. The value of the % integer is usually an index in the array % "StandardGlyphs", defined below. Note that % the first 256 entries of that array equal the % corresponding entries of ISOLatin1Encoding. % The value 9999 indicates that this entry in % the font's encoding specifies a glyph not % named in StandardGlyphs. Note that values of % "e" might get reused, after a "restore". % % QF n x y x' y' e m Introduces a new font, where "n" is an integer % that identifies the font in subsequent "S" % directives. "n" is small. (x,y) is the % position corresponding to (1000,0) in the font's % character coordinate system, and (x',y') is the % position corresponding to (0,1000), both as they would % be if the character were drawn with its origin at % the origin of the reporting coordinate system. "e" is % an integer specifying a previously defined % encoding vector. "m" is an integer specifying a % previously defined metrics table. Note that values % of "n" might get reused, after a "restore". % % QS n x y l s x' y' Reports rendering of string "s" in font "n". % (x,y) is the position corresponding to the origin of % the first character. "l" is length of the string, % followed immediately by a single space then the string. % (x',y') is the position that would correspond to the % origin of a subsequent character. The % string reported by this directive is never empty. 
% The string also never contains a "space": strings % that would have contained a "space" are split up % into multiple directives, with the "space" % omitted. Here "space" means the first character % in the font's encoding that maps to the glyph % named "/space", if there is such a character. % % QC copypage was invoked % % QZ erasepage was invoked % % QP showpage was invoked % % globals and subroutines % %/setglobal where % { pop currentglobal /setglobal load true setglobal } % { { } } %ifelse revision 353 ge { NOBIND /DELAYBIND where { pop DELAYBIND or } if { systemdict begin /bind /.bind load /.forcedef where { pop .forcedef } { def } ifelse end } if } if % put our private stuff in a local dictionary, % but place a reference to it in systemdict systemdict begin /pstotextLocalDict 30 dict /.forcedef where { pop .forcedef } { def } ifelse end pstotextLocalDict begin % following stuff is private /redef { systemdict begin 1 index exch .makeoperator def end } bind def /privateDict currentdict def % for lastFontNum and lastEncoding /fonts 200 dict def % maps font to integer "n" /fontsUnit 200 dict def % (1000,0) and (0,1000) transformed, per font /encodings 200 dict def % maps encoding array to integer "e" /encodingSpace 200 dict def % space char for each encoding /metrics 200 dict def % maps font UniqueID to integer "m" /lastFontNum 0 def % last integer used for a font /lastEncoding 0 def % last integer used for an encoding /lastMetrics 0 def % last integer used for a metrics table /tempString 20 string def % scratch for printing integers /reportMatrix matrix identmatrix def % maps device coords to reporting coords /inUse false def % prevents recursive invokcation of "report" /TimesRomanGlyphs [ % ISOLatin1Encoding ... 
/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /space /exclam /quotedbl /numbersign /dollar /percent /ampersand /quoteright /parenleft /parenright /asterisk /plus /comma /minus /period /slash /zero /one % 50 /two /three /four /five /six /seven /eight /nine /colon /semicolon /less /equal /greater /question /at /A /B /C /D /E /F /G /H /I /J /K /L /M /N /O /P /Q /R /S /T /U /V /W /X /Y /Z /bracketleft /backslash /bracketright /asciicircum /underscore /quoteleft /a /b /c % 100 /d /e /f /g /h /i /j /k /l /m /n /o /p /q /r /s /t /u /v /w /x /y /z /braceleft /bar /braceright /asciitilde /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /dotlessi /grave /acute /circumflex /tilde /macron % 150 /breve /dotaccent /dieresis /.notdef /ring /cedilla /.notdef /hungarumlaut /ogonek /caron /space /exclamdown /cent /sterling /currency /yen /brokenbar /section /dieresis /copyright /ordfeminine /guillemotleft /logicalnot /hyphen /registered /macron /degree /plusminus /twosuperior /threesuperior /acute /mu /paragraph /periodcentered /cedilla /onesuperior /ordmasculine /guillemotright /onequarter /onehalf /threequarters /questiondown /Agrave /Aacute /Acircumflex /Atilde /Adieresis /Aring /AE /Ccedilla % 200 /Egrave /Eacute /Ecircumflex /Edieresis /Igrave /Iacute /Icircumflex /Idieresis /Eth /Ntilde /Ograve /Oacute /Ocircumflex /Otilde /Odieresis /multiply /Oslash /Ugrave /Uacute /Ucircumflex /Udieresis /Yacute /Thorn /germandbls /agrave /aacute /acircumflex /atilde /adieresis /aring /ae /ccedilla /egrave /eacute /ecircumflex /edieresis /igrave /iacute /icircumflex /idieresis /eth /ntilde /ograve /oacute /ocircumflex /otilde /odieresis /divide 
/oslash /ugrave % 250 /uacute /ucircumflex /udieresis /yacute /thorn /ydieresis % Other glyphs from /Times-Roman ... % 256 /quotedblright /Scaron /dagger /guilsinglleft /Zcaron /daggerdbl /Lslash /ellipsis /guilsinglright /oe /fi /bullet /perthousand /quotedblbase /endash /emdash /trademark /florin /lslash /scaron /Ydieresis /fl /fraction /quotedblleft /quotesinglbase /quotesingle /zcaron /OE % 284 ] def /dvipsGlyphs [ % Self-named glyphs for dvitops type 3 fonts ... % 284 0 1 255 { 1 string dup 0 4 -1 roll put cvn } for % 540 ] def /ttypeGlyphs1 [ % Glyph names used by MS TrueType encodings ... % 540 /G00 /G01 /G02 /G03 /G04 /G05 /G06 /G07 /G08 /G09 /G0a /G0b /G0c /G0d /G0e /G0f /G10 /G11 /G12 /G13 /G14 /G15 /G16 /G17 /G18 /G19 /G1a /G1b /G1c /G1d /G1e /G1f /G20 /G21 /G22 /G23 /G24 /G25 /G26 /G27 /G28 /G29 /G2a /G2b /G2c /G2d /G2e /G2f /G30 /G31 /G32 /G33 /G34 /G35 /G36 /G37 /G38 /G39 /G3a /G3b % 600 /G3c /G3d /G3e /G3f /G40 /G41 /G42 /G43 /G44 /G45 /G46 /G47 /G48 /G49 /G4a /G4b /G4c /G4d /G4e /G4f /G50 /G51 /G52 /G53 /G54 /G55 /G56 /G57 /G58 /G59 /G5a /G5b /G5c /G5d /G5e /G5f /G60 /G61 /G62 /G63 /G64 /G65 /G66 /G67 /G68 /G69 /G6a /G6b /G6c /G6d /G6e /G6f /G70 /G71 /G72 /G73 /G74 /G75 /G76 /G77 /G78 /G79 /G7a /G7b /G7c /G7d /G7e /G7f /G80 /G81 /G82 /G83 /G84 /G85 /G86 /G87 /G88 /G89 /G8a /G8b /G8c /G8d /G8e /G8f /G90 /G91 /G92 /G93 /G94 /G95 /G96 /G97 /G98 /G99 /G9a /G9b /G9c /G9d /G9e /G9f /Ga0 /Ga1 /Ga2 /Ga3 /Ga4 /Ga5 /Ga6 /Ga7 /Ga8 /Ga9 /Gaa /Gab /Gac /Gad /Gae /Gaf /Gb0 /Gb1 /Gb2 /Gb3 /Gb4 /Gb5 /Gb6 /Gb7 /Gb8 /Gb9 /Gba /Gbb /Gbc /Gbd /Gbe /Gbf /Gc0 /Gc1 /Gc2 /Gc3 /Gc4 /Gc5 /Gc6 /Gc7 /Gc8 /Gc9 /Gca /Gcb /Gcc /Gcd /Gce /Gcf /Gd0 /Gd1 % 750 /Gd2 /Gd3 /Gd4 /Gd5 /Gd6 /Gd7 /Gd8 /Gd9 /Gda /Gdb /Gdc /Gdd /Gde /Gdf /Ge0 /Ge1 /Ge2 /Ge3 /Ge4 /Ge5 /Ge6 /Ge7 /Ge8 /Ge9 /Gea /Geb /Gec /Ged /Gee /Gef /Gf0 /Gf1 /Gf2 /Gf3 /Gf4 /Gf5 /Gf6 /Gf7 /Gf8 /Gf9 /Gfa /Gfb /Gfc /Gfd /Gfe /Gff % 796 ] def /ttypeGlyphs2 [ % 796 /G00 /G01 /G02 /G03 /G04 /G05 /G06 /G07 /G08 /G09 /G0A /G0B /G0C 
/G0D /G0E /G0F /G10 /G11 /G12 /G13 /G14 /G15 /G16 /G17 /G18 /G19 /G1A /G1B /G1C /G1D /G1E /G1F /G20 /G21 /G22 /G23 /G24 /G25 /G26 /G27 /G28 /G29 /G2A /G2B /G2C /G2D /G2E /G2F /G30 /G31 /G32 /G33 /G34 /G35 /G36 /G37 /G38 /G39 /G3A /G3B /G3C /G3D /G3E /G3F /G40 /G41 /G42 /G43 /G44 /G45 /G46 /G47 /G48 /G49 /G4A /G4B /G4C /G4D /G4E /G4F /G50 /G51 /G52 /G53 /G54 /G55 /G56 /G57 /G58 /G59 /G5A /G5B /G5C /G5D /G5E /G5F /G60 /G61 /G62 /G63 /G64 /G65 /G66 /G67 % 900 /G68 /G69 /G6A /G6B /G6C /G6D /G6E /G6F /G70 /G71 /G72 /G73 /G74 /G75 /G76 /G77 /G78 /G79 /G7A /G7B /G7C /G7D /G7E /G7F /G80 /G81 /G82 /G83 /G84 /G85 /G86 /G87 /G88 /G89 /G8A /G8B /G8C /G8D /G8E /G8F /G90 /G91 /G92 /G93 /G94 /G95 /G96 /G97 /G98 /G99 /G9A /G9B /G9C /G9D /G9E /G9F /GA0 /GA1 /GA2 /GA3 /GA4 /GA5 /GA6 /GA7 /GA8 /GA9 /GAA /GAB /GAC /GAD /GAE /GAF /GB0 /GB1 /GB2 /GB3 /GB4 /GB5 /GB6 /GB7 /GB8 /GB9 /GBA /GBB /GBC /GBD /GBE /GBF /GC0 /GC1 /GC2 /GC3 /GC4 /GC5 /GC6 /GC7 /GC8 /GC9 /GCA /GCB /GCC /GCD /GCE /GCF /GD0 /GD1 /GD2 /GD3 /GD4 /GD5 /GD6 /GD7 /GD8 /GD9 /GDA /GDB /GDC /GDD /GDE /GDF /GE0 /GE1 /GE2 /GE3 /GE4 /GE5 /GE6 /GE7 /GE8 /GE9 /GEA /GEB /GEC /GED /GEE /GEF /GF0 /GF1 /GF2 /GF3 /GF4 /GF5 /GF6 /GF7 /GF8 /GF9 /GFA /GFB /GFC /GFD % 1050 /GFE /GFF % 1052 ] def /oldDviGlyphs [ % More self-named glyphs for old dvitops type 3 fonts ... % 1052 0 1 127 { 10 3 string cvrs cvn } for % 1180 ] def /StandardGlyphs //TimesRomanGlyphs length 896 add array def //StandardGlyphs 0 //TimesRomanGlyphs putinterval //StandardGlyphs //TimesRomanGlyphs length //dvipsGlyphs putinterval //StandardGlyphs //TimesRomanGlyphs length 256 add //ttypeGlyphs1 putinterval //StandardGlyphs //TimesRomanGlyphs length 512 add //ttypeGlyphs2 putinterval //StandardGlyphs //TimesRomanGlyphs length 768 add //oldDviGlyphs putinterval /standardMap StandardGlyphs length dict % Maps names to indices in StandardGlyphs. 
0 StandardGlyphs { 2 index exch 2 copy known { pop pop } { 2 index put } ifelse 1 add } forall pop def /printInt { % stack: n % Prints an integer followed by a space on stdout //tempString cvs print ( ) print } bind def /showxy { %stack: x y % prints a pair of integers on stdout, converting to 1/100th's exch 100 mul round cvi //printInt exec 100 mul round cvi //printInt exec } bind def /characterToReporting { % stack: x y -> x' y' % Transforms a vector in currentfont's character coordinate % system to the reporting coordinate system currentfont /FontMatrix get dtransform % to current user coordinates dtransform % to device coordinates //reportMatrix idtransform % to reporting coordinates } def /printCharacterOrigin { % Prints the position in the reporting coordinate system at which a % character origin would be painted using currentfont 0 0 currentfont /FontMatrix get transform currentpoint exch 4 -1 roll add 3 1 roll add transform //reportMatrix itransform //showxy exec } bind def /printMap { % Print map from indices in currentfont/Encoding to StandardEncoding indices 16 currentfont /Encoding get dup length //printInt exec { exch dup 16 eq { () = pop 1 } { 1 add } ifelse exch //standardMap exch 2 copy known { get //printInt exec } { pop pop (9999 ) print } ifelse } forall pop () = } bind def /metricsString (X) def /printMetrics { % Print bounding box and character metrics for currentfont % Sadly, dvitops produces illegal type 3 fonts with no /.notdef entry. The % use of "stopped" deals with that and any other silliness. 
currentfont /FontBBox get aload pop 4 2 roll //showxy exec //showxy exec currentfont /FontMatrix get currentfont /FontType get 3 eq { //privateDict /pm.dictCount countdictstack put count //privateDict exch /pm.count exch put //privateDict /pm.save save put 0 1 255 { dup 8 mod 0 eq { () = } if //metricsString exch 0 exch put { //metricsString //stringwidth exec 2 index idtransform % to font's character coordinates } stopped { count //privateDict /pm.count get sub { pop } repeat countdictstack //privateDict /pm.dictCount get sub { end } repeat //privateDict /pm.save get restore //privateDict /pm.save save put 0 0 } if //showxy exec } for //privateDict /pm.save get restore % necessary to balance the "save" }{ currentfont /CharStrings get currentfont /Encoding get 0 1 255 { dup 8 mod 0 eq { () = } if 2 copy get 3 index exch known { //metricsString exch 0 exch put //metricsString //stringwidth exec 4 index idtransform % to font's character coordinates } { pop 0 0 } ifelse //showxy exec } for pop pop % encoding, charstrings } ifelse pop % fontmatrix } bind def % % The main work: reportMetrics, reportEncoding, reportFont and report % /reportMetrics { % stack: -> m % privateDict is open % Print new "m" directive for currentfont (QM ) print lastMetrics 1 add /lastMetrics 1 index def dup //printInt exec //printMetrics exec () = } bind def /reportEncoding { % stack: -> e % privateDict is open % Print new "e" directive for currentfont (QE ) print lastEncoding 1 add /lastEncoding 1 index def dup //printInt exec //printMap exec } bind def /spaceString (X) def % for space char from current encoding /reportFont { % stack: any -> unchanged % Report currentfont to the user and record its number % //privateDict is open % Sets //privateDict/spaceString to font's encoding's space string, if any, % and sets //privateDict/hasSpace to indicate whether there's a space char. 
currentfont /UniqueID known not { //reportMetrics exec } { //metrics currentfont /UniqueID get 2 copy known { get } { //reportMetrics exec dup 4 1 roll put % define in /metrics } ifelse } ifelse % stack: m //encodings currentfont /Encoding get 2 copy known { get } { //reportEncoding exec % leaves e on stack dup 4 1 roll put % define in /encodings % Find space character number for the encoding. % Biased towards number 32, so avoid problems with weird encodings //encodingSpace 1 index % dict and key for putting the result currentfont /Encoding get dup length 32 le { false } { dup 32 get dup /space eq exch /G20 eq or } ifelse { pop 32 } { % It's not character number 32: search from 0 0 exch { dup /space eq exch dup /G20 eq exch /suppress eq or or { exit } if 1 add } forall } ifelse put % put in /encodingSpace } ifelse % stack: m e //encodingSpace 1 index get % font's space character (256 if none) /hasSpace 1 index 256 lt def hasSpace { //spaceString exch 0 exch put } { pop } ifelse (QF ) print lastFontNum 1 add /lastFontNum 1 index def //fonts currentfont 2 index put % record fontNum in /fonts dup //printInt exec % print fontNum % stack: m e f 1000 0 //characterToReporting exec 2 copy //showxy exec 0 1000 //characterToReporting exec 2 copy //showxy exec 4 array astore % stack: m e f array //fontsUnit 3 1 roll put % stack: m e //printInt exec % print encoding number //printInt exec % print metrics number () = } bind def /reportFontCreation { % stack: font % Report a newly created font. Called now so that if later uses are % inside a save/restore we don't forget it. % NOTE: this is currently not used, since it actually slows things down //privateDict begin inUse not { /inUse true def dup currentfont exch setfont //reportFont exec setfont /inUse false def } if end } bind def /reportSubString { % stack: args string -> args % Report the rendering of a string, assumed to be a single word. % privateDict is open. 
/n is font number, /p is call-back dup length 0 eq { p } { (QS ) print n //printInt exec //printCharacterOrigin exec dup length //printInt exec dup print ( ) print /p load end % close privateDict during the call-back exec % render the string; leaves args on stack //privateDict begin //printCharacterOrigin exec () = } ifelse } bind def /report { % stack: args string proc -> args % Report the rendering of a string. % Calls proc for each word and space. The call-back should expect "args" % on the stack followed by a string, and should leave "args" on the stack. //privateDict begin inUse { end exec } { /inUse true def //fonts currentfont 2 copy known not { //reportFont exec } if get % stack: probable font-number % check if points transform as before ... //fontsUnit 1 index get 1000 0 //characterToReporting exec 0 1000 //characterToReporting exec 4 index 3 get ne 4 1 roll 4 index 2 get ne 4 1 roll 4 index 1 get ne 4 1 roll 4 index 0 get ne 5 -1 roll pop or or or { % if transformed points differ //reportFont exec pop //fonts currentfont get } if % stack: args string proc n /n exch def /p exch def % stack: args string hasSpace { { % begin loop //spaceString search { exch pop exch /s exch def //reportSubString exec //spaceString p s } { //reportSubString exec exit } ifelse } loop } { //reportSubString exec } ifelse /inUse false def end } ifelse } bind def /dontReport { % stack: proc % Call "proc" with //privateDict/inUse set to true //privateDict /inUse get { exec } { //privateDict /inUse true put exec //privateDict /inUse false put } ifelse } bind def /kshow.temp (X) def % scratch space for kshow % Output from groff 1.08 sometimes uses ashow to show letters from % two separate words, with the offset being used to add the space. % Implement 'awidthshow' and friends as calls to 'show' to catch this. 
% pstotext_awidthshow: emulate awidthshow one character at a time.
% Each character code of "string" is copied into the one-character scratch
% string //kshow.temp and rendered with 'show'.  After every character the
% current point is moved by (ax, ay); if the character code equals "char"
% it is moved by (cx, cy) as well.  All seven working operands are popped
% before returning.
/pstotext_awidthshow { % cx cy char ax ay string --
  //kshow.temp
  0 1 3 index length 1 sub              % for i = 0 .. length(string)-1
  {                                     % stack: cx cy char ax ay string temp i
    2 index exch get                    % c = string[i]
    1 index 0 2 index put               % temp[0] = c
    1 index show                        % render the single character
    4 index 4 index rmoveto             % ax ay rmoveto
    5 index eq { 6 index 6 index rmoveto } if  % c == char: cx cy rmoveto
  } for
  pop pop pop pop pop pop pop           % cx cy char ax ay string temp
} def

% pstotext_xshow: emulate xshow one character at a time.
% For each index i the current point is saved, string[i] is rendered via
% //kshow.temp and 'show', the saved point is restored, and the point is
% then advanced horizontally by numarray[i].
/pstotext_xshow { % string numarray --
  //kshow.temp
  0 1 4 index length 1 sub              % for i = 0 .. length(string)-1
  {                                     % stack: string numarray temp i
    currentpoint                        % remember where this character starts
    2 index 6 index exch get            % c = string[i]
    4 index 0 2 index put pop           % temp[0] = c; drop c
    3 index show
    moveto                              % back to the remembered start point
    % set char width from array
    2 index exch get 0 rmoveto
  } for
  pop pop pop                           % string numarray temp
} def

% pstotext_yshow: as pstotext_xshow, but numarray[i] is applied as a
% vertical displacement.
/pstotext_yshow { % string numarray --
  //kshow.temp
  0 1 4 index length 1 sub              % for i = 0 .. length(string)-1
  {                                     % stack: string numarray temp i
    currentpoint
    2 index 6 index exch get            % c = string[i]
    4 index 0 2 index put pop           % temp[0] = c; drop c
    3 index show
    moveto
    % set char height from array
    2 index exch get 0 exch rmoveto
  } for
  pop pop pop                           % string numarray temp
} def

% pstotext_xyshow: as pstotext_xshow, but the displacement for character i
% is the pair (numarray[2i], numarray[2i+1]).
/pstotext_xyshow { % string numarray --
  //kshow.temp
  0 1 4 index length 1 sub              % for i = 0 .. length(string)-1
  {                                     % stack: string numarray temp i
    currentpoint
    2 index 6 index exch get            % c = string[i]
    4 index 0 2 index put pop           % temp[0] = c; drop c
    3 index show
    moveto
    % set char width and height from array
    dup add                             % 2i
    2 index 1 index get                 % dx = numarray[2i]
    exch 3 index exch 1 add get         % dy = numarray[2i+1]
    rmoveto
  } for
  pop pop pop                           % string numarray temp
} def

% Output inverse of initial currentmatrix, for possible use by postprocessor.
% Each of the matrix entries is multiplied by 100, rounded to an integer and
% written with printInt on a single "QI" line.
(QI ) print
matrix currentmatrix matrix invertmatrix
{ 100 mul round cvi //printInt exec } forall
() =

userdict begin % subsequent definitions are publicly visible

% Objects placed in systemdict must be in global memory,
% and must not reference local objects.
% When setglobal exists, the previous allocation mode is left on the operand
% stack; the clean-up code at the end of the file hands it back to setglobal.
/setglobal where
  { pop currentglobal true setglobal }
  { }
ifelse

%
% Redefine the character rendering operations to call "report"
%

/show { { show } //systemdict /pstotextLocalDict get /report get exec } bind redef

%/ashow { {3 copy ashow pop}
%  //systemdict /pstotextLocalDict get /report get
%  exec pop pop } bind redef

% Implement as call to show to catch PostScript that uses
% these to show letters from separate words.
% ashow: ax ay string -> cx=0 cy=0 char=-1 ax ay string.
% A char of -1 is used so that the (cx, cy) adjustment never applies.
/ashow {
  0 0 -1 6 3 roll
  //systemdict /pstotextLocalDict get /pstotext_awidthshow get exec
} bind redef

% widthshow: cx cy char string -> cx cy char ax=0 ay=0 string.
/widthshow {
  0 0 3 -1 roll
  //systemdict /pstotextLocalDict get /pstotext_awidthshow get exec
} bind redef

/awidthshow {
  //systemdict /pstotextLocalDict get /pstotext_awidthshow get exec
} bind redef

/xshow {
  //systemdict /pstotextLocalDict get /pstotext_xshow get exec
} bind redef

/yshow {
  //systemdict /pstotextLocalDict get /pstotext_yshow get exec
} bind redef

/xyshow {
  //systemdict /pstotextLocalDict get /pstotext_xyshow get exec
} bind redef

% kshow: show "string" one character at a time, calling "proc" between each
% pair of consecutive characters with their codes (prev next) on the stack.
% "proc" is saved as kshow.proc and the characters pass through the
% one-character scratch string kshow.temp, both in pstotextLocalDict.
% PostScript names are case-sensitive, so every lookup must spell the
% dictionary name exactly as /pstotextLocalDict.
/kshow { % stack: proc string
    exch //systemdict /pstotextLocalDict get exch /kshow.proc exch put
    false exch % stack: false string
    { % stack: false next | prev true next
      //systemdict /pstotextLocalDict get /kshow.temp get 0 2 index put
      % not the first character: run proc with (prev next); otherwise drop next
      exch { //systemdict /pstotextLocalDict get /kshow.proc get exec } { pop } ifelse
      //systemdict /pstotextLocalDict get /kshow.temp get //show exec
      //systemdict /pstotextLocalDict get /kshow.temp get 0 get true % stack: this true
    } forall
    % stack: false | last true
    { pop } if
} bind redef

%
% Redefine non-rendering operations so that they don't report
%

/stringwidth { {stringwidth} //systemdict /pstotextLocalDict get /dontReport get exec } bind redef
/charpath { {charpath} //systemdict /pstotextLocalDict get /dontReport get exec } bind redef

%
% Intercept and report the page operations
%

/copypage { (QC) = flush copypage } bind redef
/erasepage { (QZ) = flush erasepage } bind redef
/showpage { (QP) = flush showpage } bind redef

%
% Intercept font creation so as to record the font inside less save/restore's
% NOTE: disabled, because it actually slows things down
%

% /definefont { definefont //reportFontCreation exec } bind redef
% /makefont { makefont //reportFontCreation exec } bind redef
% /scalefont { scalefont //reportFontCreation exec } bind redef

%
% Clean-up
%

% restore local/global state
/setglobal where
  { pop setglobal }
  { }
ifelse

end % close nested userdict begin
end % close private
dictionary % Bind the operators we just defined, and all the others if we didn't % do it before. Also reenable 'bind' for future files. revision 353 ge { .bindoperators NOBIND currentdict systemdict ne and { systemdict begin .bindoperators end } if /DELAYBIND where { pop DELAYBIND { .bindnow } if } if } if systemdict readonly pop % Restore the current local/global VM mode. % exec % % Testing % false { 100 dict begin (Times 12, two strings; second one sloping up with ashow:)= /Times-Roman findfont 12 scalefont dup /t12 exch def setfont 72 300 moveto (Hello world) show 72 280 moveto 10 1 (Hello world once more) ashow (Times 10 two strings:)= /Times-Roman findfont 10 scalefont setfont 72 260 moveto (Third) show 72 240 moveto (Fourth) show (Symbol 12, one string:)= /Symbol findfont 12 scalefont setfont 72 220 moveto (symbol string) show (Helvetica 12, two strings:)= /Helvetica findfont 12 scalefont setfont 72 200 moveto (Fifth) show 72 180 moveto (Sixth) show (Times 12 again, two strings; second one with kshow:)= t12 setfont 72 160 moveto (Seventh) show end 72 140 moveto gsave /dx 1.0 def { pop pop dx 1 add /dx 1 index def 0 rmoveto } (Accelerated letter spacing) kshow grestore (Times 12 scaled by 2:)= 72 100 moveto gsave 2 2 scale (Ninth) show grestore count 0 ne { (Left on stack:)= pstack } if flush } if pstotext-1.9/pstotext.10100664000076400007640000001324307047001434013406 0ustar rjlrjl.\" This file generated automatically by mtex2man(1) .nh .TH "pstotext" "1" .SH "NAME" pstotext \- extract ASCII text from a PostScript or PDF file .SH "SYNTAX" \fBpstotext [option|pathname]...\fR .PP where option includes: .PP .PD 0 .RS 0 .TP 6 \-cork .TP 6 \-landscape .TP 6 \-landscapeOther .TP 6 \-portrait .TP 6 \- .TP 6 \-output file .TP 6 \-gs command .TP 6 \-debug .TP 6 \-bboxes .RE .PD .PP .SH "DESCRIPTION" \fBpstotext\fR reads one or more PostScript or PDF files, and writes to standard output a representation of the plain text that would be displayed if the PostScript file 
were printed. As is described in the DETAILS section below, this representation is only an approximation. Nevertheless, it is often useful for information retrieval (e.g., running grep(1) or building a full\-text index) or to recover the text from a PostScript file whose source you have lost. .PP \fBpstotext\fR calls Ghostscript, and requires Aladdin Ghostscript version 3.51 or newer. Ghostscript must be invokable on the current search path as gs. Alternatively, you can use the \-gs option to specify the command (pathname and options) to run Ghostscript. For example, on Windows you might use \-gs "c:\\gs\\gswin32c.exe \-Ic:\\gs;c:\\gs\\fonts". .PP \fBpstotext\fR reads and processes its command line from left to right, ignoring the case of options. When it encounters a pathname, it opens the file and expects to find a PostScript job or PDF document to process. The option \- means to read and process a PostScript job from standard input. If no \- or pathname arguments are encountered, \fBpstotext\fR reads a PostScript job from standard input. (PDF documents require random access, hence cannot be read from standard input.) You can use the \-output option to specify an output file (remember to invoke it \fIbefore\fR the input file); otherwise \fBpstotext\fR writes to standard output. .PP The option \-cork is only relevant for PostScript files produced by dvips from TeX or LaTeX documents; it tells \fBpstotext\fR to use the Cork encoding (known as T1 in LaTeX) rather than the old TeX text encoding (known as OT1 in LaTeX). Unfortunately files produced by dvips don't distinguish which font encodings were used. .PP The options \-landscape and \-landscapeOther should be used for documents that must be rotated 90 degrees clockwise or counterclockwise, respectively, in order to be readable. .PP The options \-debug and \-bboxes are mostly of use for the maintainers of \fBpstotext\fR. \-debug shows Ghostscript output and error messages. 
\-bboxes outputs one word per line with bounding box information. .SH "DETAILS" \fBpstotext\fR does its work by telling Ghostscript to load a PostScript library that causes it to write to its standard output information about each string rendered by a PostScript job or PDF document. This information includes the characters of the string, and enough additional information to approximate the string's bounding rectangle. \fBpstotext\fR post\-processes this information and outputs a sequence of words delimited by space, newline, and formfeed. .PP \fBpstotext\fR outputs words in the same sequence as they are rendered by the document. This usually, but not always, follows the order that a human would read the words on a page. Within this sequence, words are separated by either space or newline depending on whether or not they fall on the same line. Each page is terminated with a formfeed. If you use the incorrect option from the set {\-portrait, \-landscape, \-landscapeOther}, \fBpstotext\fR is likely to substitute newline for space. .PP A PostScript job or PDF document often renders one word as several strings in order to get correct spacing between particular pairs of characters. \fBpstotext\fR does its best to assemble these strings back into words, using a simple heuristic: strings separated by a distance of less than 0.3 times the minimum of the average character widths in the two strings are considered to be part of the same word. Note that this typically causes leading and trailing punctuation characters to be included with a word. .PP The PostScript language provides a flexible encoding scheme by which character codes in strings select specific characters (symbols), so a PostScript job is free to use any character code. On the other hand, \fBpstotext\fR always translates to the ISO 8859\-1 (Latin\-1) character code, which is an extension to ASCII covering most of the Western European languages. 
When a character isn't present in ISO 8859\-1, \fBpstotext\fR uses a sequence of characters, e.g., "\-\-\-" for em dash or "A\\226" for Abreve. \fBpstotext\fR can be fooled by a font whose Encoding vector doesn't follow Adobe's conventions, but it contains heuristics allowing it to handle a wide variety of misbehaving fonts. .PP (\fBpstotext\fR no longer translates hyphen (\\255) to minus (\\055).) .SH "AUTHOR" Andrew Birrell (PostScript libraries), Paul McJones (application), Russell Lang (Windows and OS/2 adaptation), and Hunter Goatley (VMS adaptation). .SH "SEE ALSO" \fBpstotext\fR incorporates technology originally developed for the Virtual Paper project at SRC; see http://www.research.digital.com/SRC/virtualpaper/. .PP As mentioned above, \fBpstotext\fR invokes Ghostscript. See gs(1) or http://www.cs.wisc.edu/~ghost/. .SH "COPYRIGHT" .PP Copyright 1995\-8 Digital Equipment Corporation. .br Distributed only by permission. .br See file pstotext.txt for details. .br .BR .PP .EX Last modified on Sat Feb 5 21:00:00 AEST 2000 by rjl modified on Fri Jun 5 14:02:37 PDT 1998 by mcjones modified on Wed Jun 7 17:47:56 PDT 1995 by birrell .EE .PP This file was generated automatically by mtex software; see the mtex home page at http://www.research.digital.com/SRC/mtex/. pstotext-1.9/pstotext.hlp0100664000076400007640000001247606536170270014047 0ustar rjlrjl1 PSTOTEXT PSTOTEXT - extract ASCII text from a PostScript or PDF file Syntax: pstotext [option|pathname]... where option includes: -cork -landscape -landscapeother -portrait - -gs command -debug -bboxes -output file 2 Description PSTOTEXT reads one or more PostScript or PDF files, and writes to standard output a representation of the plain text that would be displayed if the PostScript file were printed. As is described in the DETAILS section, this representation is only an approximation. 
Nevertheless, it is often useful for information retrieval (e.g., running grep(1) or building a full-text index) or to recover the text from a PostScript file whose source you have lost. PSTOTEXT calls Ghostscript, and requires Aladdin Ghostscript version 3.51 or newer. Ghostscript must be invokable on the current search path as gs. Alternatively, you can use the "-gs" option to specify the command (pathname and options) to run Ghostscript. PSTOTEXT reads and processes its command line from left to right, ignoring the case of options. When it encounters a pathname, it opens the file and expects to find a PostScript job or PDF document to process. The option "-" means to read and process a PostScript job from standard input. If no - or pathname arguments are encountered, PSTOTEXT reads a PostScript job from standard input. (PDF documents require random access, hence cannot be read from standard input.) The option -cork is only relevant for PostScript files produced by dvips from TeX or LaTeX documents; it tells PSTOTEXT to use the Cork encoding (known as T1 in LaTeX) rather than the old TeX text encoding (known as OT1 in LaTeX). Unfortunately files produced by dvips don't distinguish which font encodings were used. The options -landscape and -landscapeother should be used for documents that must be rotated 90 degrees clockwise or counterclockwise, respectively, in order to be readable. The options -debug and -bboxes are mostly of use for the maintainers of PSTOTEXT. -debug shows Ghostscript output and error messages. -bboxes outputs one word per line with bounding box information. 2 Details PSTOTEXT does its work by telling Ghostscript to load a PostScript library that causes it to write to its standard output information about each string rendered by a PostScript job or PDF document. This information includes the characters of the string, and enough additional information to approximate the string's bounding rectangle. 
PSTOTEXT post-processes this information and outputs a sequence of words delimited by space, newline, and formfeed. PSTOTEXT outputs words in the same sequence as they are rendered by the document. This usually, but not always, follows the order that a human would read the words on a page. Within this sequence, words are separated by either space or newline depending on whether or not they fall on the same line. Each page is terminated with a formfeed. If you use the incorrect option from the set {-portrait, -landscape, -landscapeother}, PSTOTEXT is likely to substitute newline for space. A PostScript job or PDF document often renders one word as several strings in order to get correct spacing between particular pairs of characters. PSTOTEXT does its best to assemble these strings back into words, using a simple heuristic: strings separated by a distance of less than 0.3 times the minimum of the average character widths in the two strings are considered to be part of the same word. Note that this typically causes leading and trailing punctuation characters to be included with a word. The PostScript language provides a flexible encoding scheme by which character codes in strings select specific characters (symbols), so a PostScript job is free to use any character code. On the other hand, PSTOTEXT always translates to the ISO 8859-1 (Latin-1) character code, which is an extension to ASCII covering most of the Western European languages. When a character isn't present in ISO 8859-1, PSTOTEXT uses a sequence of characters, e.g., "---" for em dash or "A\\226" for Abreve. PSTOTEXT can be fooled by a font whose Encoding vector doesn't follow Adobe's conventions, but it contains heuristics allowing it to handle a wide variety of misbehaving fonts. (PSTOTEXT no longer translates hyphen (\\255) to minus (\\055).) 
2 Options -cork assume Cork encoding for dvips output -landscape rotate 270 degrees -landscapeother rotate 90 degrees -portrait don't rotate (default) -bboxes output one word per line with bounding box -debug show Ghostscript output and error messages -gs "command" Ghostscript command -output file output results to "file" (default is stdout) - read from stdin (default if no files specified) 2 Authors Andrew Birrell (PostScript libraries), Paul McJones (application), and Russell Lang (Windows and OS/2 adaptation). VMS port by Hunter Goatley. 2 See_Also PSTOTEXT incorporates technology originally developed for the Virtual Paper project at SRC; see http://www.research.digital.com/SRC/virtualpaper/ As mentioned above, PSTOTEXT invokes Ghostscript. See gs(1) or http://www.cs.wisc.edu/~ghost/. 2 Copyright Copyright 1995 Digital Equipment Corporation. Distributed only by permission. See file pstotext.txt for details. pstotext-1.9/pstotext.txt0100664000076400007640000001730307777500245014104 0ustar rjlrjl=================================================================== pstotext.txt 9 Jan 2004 =================================================================== pstotext 1.9 - PostScript text extractor. Requires Ghostscript. The files pstotxt1.dll (Win16), pstotxt2.dll (OS/2) pstotxt3.dll (Win32), and pstotext.zip (sources) constitute the pstotext package, which was written by Paul McJones and Andrew Birrell of Digital Equipment Corporation's Systems Research Center. These files are copyright by Digital Equipment Corporation. You may use them subject to the attached END USER LICENSE AGREEMENT. 
The source files are available in the GSview source distribution, or directly from the authors: http://www.research.digital.com/SRC/virtualpaper/pstotext.html You can build DLL and command-line versions of pstotext for various platforms from the sources: PLATFORM BUILD FILE Unix Makefile Win16 pstotxt1.mak OS/2 pstotxt2.mak Win32 pstotxt3.mak (Borland C++ 4.5) Win32 pstotxtv.mak (Microsoft Visual Studio) VMS descrip.mms pstotext was developed as a part of the Virtual Paper system, which is a high-quality viewer for computer-generated or scanned documents. See: http://www.research.digital.com/SRC/virtualpaper/ We appreciate Russell Lang's willingness to incorporate pstotext in GSview, and his help in redesigning the API to the DLL module and in debugging the final version. We thank Hunter Goatley for the VMS port. You can contact the authors at mcjones@pa.dec.com and birrell@pa.dec.com. For Windows and OS/2, contact gsview@ghostgum.com.au; for VMS, contact Hunter Goatley. Version 1.9 is a modification of 1.8h to recognise excessive space in "ashow" as being a word separator, as found in output from groff 1.08. Also fixed widthshow, awidthshow and added xshow, yshow and xyshow. Uses ANSI prototypes. These are now required, not optional. Fixes a number of compiler warnings. Note that neither of the original authors still works at DEC/Compaq/HP. 2004-01-09 by Russell Lang at Ghostgum Software Pty Ltd. Version 1.8h is a modification of the original 1.8 to allow operation with Aladdin Ghostscript 6.0, to add -output to Windows and OS/2 executables, and to insert line breaks and form feeds in pstotext output. Modifications made 2000-07-15 by Russell Lang at Ghostgum Software Pty Ltd. main.c changed to use mkstemp not tempnam for Unix, 2000-06-02 by Russell Lang at Ghostgum Software Pty Ltd. ========================== END USER LICENSE AGREEMENT ========================== GRANT.
Subject to the provisions contained herein, Digital Equipment Corporation ("Digital") hereby grants you a non-exclusive license to use its accompanying proprietary software product and associated documentation ("Software") free of charge pursuant to the terms and conditions of this Agreement. You are not entitled to support or telephone assistance in connection with your use of the Software. SOFTWARE AND DOCUMENTATION. Digital shall furnish the Software to you electronically or on media in source code form. This license does not grant you any right to any enhancement or update to the Software and Documentation. USE RESTRICTIONS. You may use, copy, modify, and distribute the Software in source code or object code form, subject to the following conditions: (1) If the Software is modified, any Software containing modifications must prominently state in the modified product or documentation (i) that it has been modified, (ii) the identity of the person or entity that made the modifications, and (iii) the date the modifications were made. (2) Each copy of the Software made by you shall be subject to the terms of this Agreement and shall contain all of Digital's notices regarding copyrights, trademarks and other proprietary rights as contained in the Software originally provided to you. (3) The Software may not be transferred to any third party unless such third party receives a copy of this Agreement and agrees to be bound by all of its terms and conditions. TITLE. Title, ownership rights, and intellectual property rights in and to the Software shall remain in Digital and/or its suppliers. The Software is protected by the copyright laws of the United States and international copyright treaties. CONTENT. Title, ownership rights, and intellectual property rights in and to the content accessed through the Software is the property of the applicable content owner and may be protected by applicable copyright or other law. This License gives you no rights to such content. 
DISCLAIMER OF WARRANTY. Since the Software is provided free of charge, the Software is provided on an "AS IS" basis, without warranty of any kind, including without limitation the warranties of merchantability, fitness for a particular purpose and non-infringement. The entire risk as to the quality and performance of the Software is borne by you. Should the Software prove defective, you, and not Digital assume the entire cost of any service and repair. This disclaimer of warranty constitutes an essential part of the agreement. LIMITATION OF LIABILITY. UNDER NO CIRCUMSTANCES AND UNDER NO LEGAL THEORY, TORT, CONTRACT, OR OTHERWISE, SHALL DIGITAL OR ITS SUPPLIERS RESELLERS, OR LICENSEES BE LIABLE TO YOU OR ANY OTHER PERSON FOR ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES OF ANY CHARACTER INCLUDING, WITHOUT LIMITATION, DAMAGES FOR LOSS OF GOODWILL, STOPPAGE, COMPUTER FAILURE OR MALFUNCTION, OR ANY AND ALL OTHER COMMERCIAL DAMAGES OR LOSSES, EVEN IF DIGITAL SHALL HAVE BEEN INFORMED OF THE POSSIBILITY OF SUCH DAMAGES, OR FOR ANY CLAIM BY ANY OTHER PARTY. EXPORT CONTROLS. You may not download or otherwise export or reexport the Software or any underlying information or technology except in full compliance with all United States and other applicable laws and regulations. By downloading or using the Software, you are agreeing to the foregoing. TERMINATION. This Agreement shall automatically terminate upon failure by you to comply with its terms, in which case you shall immediately discontinue the use of the Software and shall within ten (10) days return to Digital or destroy all copies of the Software. You may also terminate this Agreement at any time by destroying the Software and all copies thereof. MISCELLANEOUS. This Agreement represents the complete and exclusive statement of the agreements concerning this license between the parties. It may be amended only by a writing executed by both parties. 
If any provision of this Agreement is held to be unenforceable for any reason, such provision shall be reformed only to the extent necessary to make it enforceable, and such decision shall not affect the enforceability (i) of such provision under other circumstances or (ii) of the remaining provisions hereof under all circumstances. Headings shall not be considered in interpreting this Agreement. This Agreement shall be governed by and construed under the laws of the Commonwealth of Massachusetts, except as governed by Federal law. This Agreement will not be governed by the United Nations Convention of Contracts for the International Sale of Goods, the application of which is hereby expressly excluded. U.S. Government Restricted Rights. Use, duplication or disclosure by the Government is subject to restrictions set forth in subparagraphs (a) through (d) of the Commercial Computer-Restricted Rights clause at FAR 52 227-19 when applicable, or in subparagraph (c) (1) (ii) of the Rights in Technical Data and Computer Software clause at DFARS 252.227-7013, and in similar clauses in the NASA FAR Supplement. Contractor/manufacturer is Digital Equipment Corporation. 
pstotext-1.9/pstotxt1.def0100664000076400007640000000031006610564742013722 0ustar rjlrjlLIBRARY PSTOTXT1 DESCRIPTION 'pstotext - Copyright (C) 1995, Digital Equipment Corporation' DATA SINGLE SHARED EXPORTS pstotextInit @1 pstotextFilter @2 pstotextExit @3 pstotextSetCork @4 pstotext-1.9/pstotxt1.mak0100664000076400007640000000222106610565076013740 0ustar rjlrjl# pstotxt1.mak # Makefile for pstotxt1.dll, for use with Win16 GSview # makefile created by # Russell Lang, 1996-07-29 # updated 1996-10-11 # For debugging, use bcc -v COMPBASE = c:\bc45 COMPDIR = $(COMPBASE)\bin INCDIR = $(COMPBASE)\include LIBDIR = $(COMPBASE)\lib #DEBUGLINK = -v MODEL=l CCAUX=$(COMPDIR)\bcc CC=$(COMPDIR)\bcc $(DEBUGLINK) -m$(MODEL) -DNEED_PROTO all: pstotxt1.dll .c.obj: $(CC) -c $*.c ocr.h: ocr.ps mkrch.exe mkrch $*.ps $*.h 1 rot270.h: rot270.ps mkrch.exe mkrch $*.ps $*.h 2 rot90.h: rot90.ps mkrch.exe mkrch $*.ps $*.h 3 mkrch.exe: mkrch.c $(CCAUX) $*.c pstotxt1.obj: ptotdll.c ptotdll.h $(CC) -c -WD -opstotxt1.obj ptotdll.c pstotxt1.rc: ocr.h rot270.h rot90.h copy ocr.h+rot270.h+rot90.h pstotxt1.rc pstotxt1.res: pstotxt1.rc $(COMPDIR)\brcc -i$(INCDIR) -r pstotxt1.rc pstotxt1.dll: pstotxt1.obj pstotxt1.res $(CC) -WD -epstotxt1.dll pstotxt1.obj $(COMPDIR)\rlink pstotxt1.res pstotxt1.dll prezip: pstotxt1.dll copy pstotxt1.dll ..\pstotxt1.dll copy pstotext.txt ..\pstotext.txt clean: -del pstotxt1.dll -del pstotxt1.res -del pstotxt1.rc -del *.obj -del ocr.h -del rot270.h -del rot90.h -del mkrch.exe pstotext-1.9/pstotxt2.def0100664000076400007640000000031106610564754013727 0ustar rjlrjlLIBRARY PSTOTXT2 DESCRIPTION 'pstotext - Copyright (C) 1995, Digital Equipment Corporation' DATA MULTIPLE NONSHARED EXPORTS pstotextInit @1 pstotextFilter @2 pstotextExit @3 pstotextSetCork @4 pstotext-1.9/pstotxt2.mak0100664000076400007640000000243607047206512013741 0ustar rjlrjl# pstotxt2.mak # Makefile for pstotxt2.dll, for use with OS/2 GSview # makefile created by # Russell Lang, 1996-07-29 # updated 
1996-10-11 DRIVE= COMP=gcc COMPBASE=$(DRIVE)\emx EMXPATH=$(DRIVE)/emx FLAGS=-Zdll -Zso -Zsys -Zomf -D__DLL__ COMPDIR=$(COMPBASE)\bin INCDIR=$(EMXPATH)/include LIBDIR=$(EMXPATH)/lib all: pstotxt2.dll pstotxt2.exe .c.obj: $(COMP) $(FLAGS) -c $*.c ocr.h: ocr.ps mkrch.exe mkrch $*.ps $*.h 1 rot270.h: rot270.ps mkrch.exe mkrch $*.ps $*.h 2 rot90.h: rot90.ps mkrch.exe mkrch $*.ps $*.h 3 mkrch.exe: mkrch.c $(COMP) -o $*.exe $*.c pstotxt2.obj: ptotdll.c ptotdll.h $(COMP) $(FLAGS) -c -o pstotxt2.obj ptotdll.c pstotxt2.rc: ocr.h rot270.h rot90.h copy ocr.h+rot270.h+rot90.h pstotxt2.rc pstotxt2.res: pstotxt2.rc rc -i $(COMPBASE)\include -r $*.rc pstotxt2.dll: pstotxt2.obj pstotxt2.def pstotxt2.res $(COMP) $(FLAGS) -o $*.dll $*.obj $*.def rc $*.res $*.dll pstotxt2.exe: pstotxtd.c $(COMP) -o pstotxtd.exe pstotxtd.c -del pstotxt2.exe rename pstotxtd.exe pstotxt2.exe prezip: all copy pstotxt2.dll ..\pstotxt2.dll copy pstotxt2.exe ..\pstotxt2.exe copy pstotext.txt ..\pstotext.txt clean: -del pstotxtd.exe -del pstotxt2.exe -del pstotxt2.dll -del pstotxt2.res -del pstotxt2.rc -del *.obj -del ocr.h -del rot270.h -del rot90.h -del mkrch.exe pstotext-1.9/pstotxt3.def0100664000076400007640000000033206610565126013725 0ustar rjlrjlLIBRARY PSTOTXT3 INITINSTANCE DESCRIPTION 'pstotext - Copyright (C) 1995, Digital Equipment Corporation' DATA MULTIPLE NONSHARED EXPORTS pstotextInit @1 pstotextFilter @2 pstotextExit @3 pstotextSetCork @4 pstotext-1.9/pstotxt3.mak0100664000076400007640000000250406610565000013731 0ustar rjlrjl# pstotxt3.mak # Makefile for pstotxt3.dll, for use with Win32 GSview # makefile created by # Russell Lang, 1996-07-29 # updated 1996-10-11 # For debugging, use bcc -v COMPBASE = c:\bc45 COMPDIR = $(COMPBASE)\bin INCDIR = $(COMPBASE)\include LIBDIR = $(COMPBASE)\lib #DEBUGLINK = -v CCAUX=$(COMPDIR)\bcc CC=$(COMPDIR)\bcc32 $(DEBUGLINK) -DNEED_PROTO all: pstotxt3.dll pstotxt3.exe .c.obj: $(CC) -c $*.c ocr.h: ocr.ps mkrch.exe mkrch $*.ps $*.h 1 rot270.h: rot270.ps mkrch.exe 
mkrch $*.ps $*.h 2 rot90.h: rot90.ps mkrch.exe mkrch $*.ps $*.h 3 mkrch.exe: mkrch.c $(CCAUX) $*.c pstotxt3.obj: ptotdll.c ptotdll.h $(CC) -c -WD -opstotxt3.obj ptotdll.c pstotxt3.rc: ocr.h rot270.h rot90.h copy ocr.h+rot270.h+rot90.h pstotxt3.rc pstotxt3.res: pstotxt3.rc $(COMPDIR)\brcc32 -i$(INCDIR) -r pstotxt3.rc pstotxt3.dll: pstotxt3.obj pstotxt3.res $(CC) -WD -epstotxt3.dll pstotxt3.obj $(COMPDIR)\brc32 pstotxt3.res pstotxt3.dll pstotxt3.exe: pstotxtd.c $(CC) -WC -epstotxtd.exe pstotxtd.c -del pstotxt3.exe rename pstotxtd.exe pstotxt3.exe prezip: all copy pstotxt3.dll ..\pstotxt3.dll copy pstotxt3.exe ..\pstotxt3.exe copy pstotext.txt ..\pstotext.txt clean: -del pstotxtd.exe -del pstotxt3.exe -del pstotxt3.dll -del pstotxt3.res -del pstotxt3.rc -del *.obj -del ocr.h -del rot270.h -del rot90.h -del mkrch.exe pstotext-1.9/pstotxtd.c0100664000076400007640000003635707777500227013520 0ustar rjlrjl/* pstotxtd.c */ /* OS/2 and Win32 Command line interface to pstotxt[23].dll */ /* 8086 MS-DOS command line EXE. */ /* Russell Lang */ /* derived from main.c */ /* Copyright (C) 1995, Digital Equipment Corporation. */ /* All rights reserved. */ /* See the file pstotext.txt for a full description. */ /* Last modified on Fri Jan 9 21:11:00 AEST 2004 by rjl */ /* modified on Sat Mar 11 09:16:00 AEST 2000 by rjl */ /* modified on Fri Oct 16 16:30:54 PDT 1998 by mcjones */ /* modified on Thu Nov 16 13:33:13 PST 1995 by deutsch */ /* Modifications by rjl * Fixed compiler warnings. 
*/ #ifndef MSDOS #ifdef _Windows #include #include #ifndef __BORLANDC__ #define mktemp(t) _mktemp(t) #endif #else #define INCL_DOS #include #endif #endif #include #include #include #include #ifdef MSDOS #include #include #include "bundle.h" #include "ocr.h" #include "rot270.h" #include "rot90.h" typedef int HMODULE; #endif #include "ptotdll.h" #define BOOLEAN int #define FALSE 0 #define TRUE 1 #define LINELEN 2000 /* longest allowable line from gs ocr.ps output */ /* resource IDs for pstotxt3.dll */ #define OCR_PROLOG 1 #define ROT270_PROLOG 2 #define ROT90_PROLOG 3 static int cleanup(void); static void do_it(char *path); #define strcasecmp stricmp #define MAXPATHLEN 256 static int debug = FALSE; static int cork = FALSE; #ifdef MSDOS static char *gscommand = "gs386.exe"; #else #ifdef _Windows static char *gscommand = "gswin32c.exe"; #else static char *gscommand = "gsos2.exe"; #endif #endif static char *outfile = ""; static char *cmd; /* = argv[0] */ static enum { portrait, landscape, landscapeOther} orientation = portrait; static int bboxes = FALSE; static int explicitFiles = 0; /* count of explicit file arguments */ void usage(void) { fprintf(stderr, "pstotext 1.8i of 2003-01-08\n"); fprintf(stderr, "Copyright (C) 1995-1998, Digital Equipment Corporation.\n"); fprintf(stderr, "Modified by Ghostgum Software Pty Ltd for Ghostscript 6.0.\n"); fprintf(stderr, "Comments to {mcjones,birrell}@pa.dec.com.\n\n"); fprintf(stderr, "Usage: %s [option|file]...\n", cmd); fprintf(stderr, "Options:\n"); fprintf(stderr, " -cork Assume Cork encoding for dvips output\n"); fprintf(stderr, " -landscape rotate 270 degrees\n"); fprintf(stderr, " -landscapeOther rotate 90 degrees\n"); fprintf(stderr, " -portrait don't rotate (default)\n"); fprintf(stderr, " -bboxes output one word per line with bounding box\n"); fprintf(stderr, " -debug show Ghostscript output and error messages\n"); fprintf(stderr, " -gs \042command\042 Ghostscript command\n"); fprintf(stderr, " -output file output 
results to \042file\042 (default is stdout)\n"); fprintf(stderr, " - read from stdin (default if no files specified)\n"); } #ifndef _Windows #define WINAPI /* nothing for OS/2 or MSDOS */ #endif typedef int (WINAPI *PFN_pstotextInit)(void **instance); typedef int (WINAPI *PFN_pstotextFilter)(void *instance, char *instr, char **pre, char **word, char **post, int *llx, int *lly, int *urx, int *ury); typedef int (WINAPI *PFN_pstotextExit)(void *instance); typedef int (WINAPI *PFN_pstotextSetCork)(void *instance, int value); HMODULE pstotextModule; void *pstotextInstance; PFN_pstotextInit dllfn_pstotextInit; PFN_pstotextFilter dllfn_pstotextFilter; PFN_pstotextExit dllfn_pstotextExit; PFN_pstotextSetCork dllfn_pstotextSetCork; #ifdef _Windows int load_pstotext(void) { char dllname[256]; char *p; /* get path to EXE */ GetModuleFileName(0, dllname, sizeof(dllname)); if ((p = strrchr(dllname,'\\')) != (char *)NULL) p++; else p = dllname; *p = '\0'; #ifdef __WIN32__ #ifdef DECALPHA strcat(dllname, "pstotxta.dll"); #else strcat(dllname, "pstotxt3.dll"); #endif #else strcat(dllname, "pstotxt1.dll"); #endif if (debug) { fputs(dllname, stdout); fputc('\n', stdout); } /* load pstotext DLL */ pstotextModule = LoadLibrary(dllname); if (pstotextModule < (HINSTANCE)HINSTANCE_ERROR) { fprintf(stderr, "Can't load %s\n", dllname); return 1; } dllfn_pstotextInit = (PFN_pstotextInit) GetProcAddress(pstotextModule, "pstotextInit"); if (dllfn_pstotextInit == (PFN_pstotextInit)NULL) { fprintf(stderr, "Can't find pstotextInit() in %s\n", dllname); FreeLibrary(pstotextModule); return 1; } dllfn_pstotextFilter = (PFN_pstotextFilter) GetProcAddress(pstotextModule, "pstotextFilter"); if (dllfn_pstotextFilter == (PFN_pstotextFilter)NULL) { fprintf(stderr, "Can't find pstotextFilter() in %s\n", dllname); FreeLibrary(pstotextModule); return 1; } dllfn_pstotextExit = (PFN_pstotextExit) GetProcAddress(pstotextModule, "pstotextExit"); if (dllfn_pstotextExit == (PFN_pstotextExit)NULL) { 
fprintf(stderr, "Can't find pstotextExit() in %s\n", dllname); FreeLibrary(pstotextModule); return 1; } dllfn_pstotextSetCork = (PFN_pstotextSetCork) GetProcAddress(pstotextModule, "pstotextSetCork"); if (dllfn_pstotextSetCork == (PFN_pstotextSetCork)NULL) { fprintf(stderr, "Can't find pstotextSetCork() in %s\n", dllname); FreeLibrary(pstotextModule); return 1; } dllfn_pstotextInit(&pstotextInstance); return 0; } int unload_pstotext(void) { if (pstotextInstance) dllfn_pstotextExit(pstotextInstance); pstotextInstance = NULL; FreeLibrary(pstotextModule); pstotextModule = NULL; return 0; } void send_prolog(FILE *f, int resource) { HGLOBAL hglobal; LPSTR prolog; hglobal = LoadResource(pstotextModule, FindResource(pstotextModule, (LPSTR)resource, RT_RCDATA)); if ( (prolog = (LPSTR)LockResource(hglobal)) != (LPSTR)NULL) { fputs(prolog, f); FreeResource(hglobal); } } #else /* !_Windows */ #ifdef MSDOS int load_pstotext(void) { dllfn_pstotextInit = pstotextInit; dllfn_pstotextFilter = pstotextFilter; dllfn_pstotextExit = pstotextExit; dllfn_pstotextSetCork = pstotextSetCork; dllfn_pstotextInit(&pstotextInstance); return 0; } int unload_pstotext(void) { return 0; } void send_prolog(FILE *f, int resource) { switch (resource) { case OCR_PROLOG: putbundle(ocr, f); break; case ROT270_PROLOG: putbundle(rot270, f); break; case ROT90_PROLOG: putbundle(rot90, f); break; } } #else /* !_Windows && !MSDOS */ /* OS/2 */ int load_pstotext(void) { char dllname[256]; char buf[256]; char *p; APIRET rc; PTIB pptib; PPIB pppib; if ( (rc = DosGetInfoBlocks(&pptib, &pppib)) != 0 ) { fprintf(stderr,"Couldn't get pid, rc = \n", rc); return rc; } /* get path to EXE */ if ( (rc = DosQueryModuleName(pppib->pib_hmte, sizeof(dllname), dllname)) != 0 ) { fprintf(stderr,"Couldn't get module name, rc = %d\n", rc); return rc; } if ((p = strrchr(dllname,'\\')) != (PCHAR)NULL) { p++; *p = '\0'; } strcat(dllname, "pstotxt2.dll"); if (debug) { fputs(dllname, stdout); fputc('\n', stdout); } /* load pstotext 
DLL */ if (DosLoadModule(buf, sizeof(buf), dllname, &pstotextModule)) { fprintf(stderr, "Can't load %s\n", dllname); return 1; } if ((rc = DosQueryProcAddr(pstotextModule, 0, "pstotextInit", (PFN *)(&dllfn_pstotextInit))) !=0) { fprintf(stderr, "Can't find pstotextInit() in %s\n", dllname); DosFreeModule(pstotextModule); pstotextModule = (HMODULE)NULL; return 1; } if ((rc = DosQueryProcAddr(pstotextModule, 0, "pstotextFilter", (PFN *)(&dllfn_pstotextFilter))) !=0) { fprintf(stderr, "Can't find pstotextFilter() in %s\n", dllname); DosFreeModule(pstotextModule); pstotextModule = (HMODULE)NULL; return 1; } if ((rc = DosQueryProcAddr(pstotextModule, 0, "pstotextExit", (PFN *)(&dllfn_pstotextExit))) !=0) { fprintf(stderr, "Can't find pstotextExit() in %s\n", dllname); DosFreeModule(pstotextModule); pstotextModule = (HMODULE)NULL; return 1; } if ((rc = DosQueryProcAddr(pstotextModule, 0, "pstotextSetCork", (PFN *)(&dllfn_pstotextSetCork))) !=0) { fprintf(stderr, "Can't find pstotextSetCork() in %s\n", dllname); DosFreeModule(pstotextModule); pstotextModule = (HMODULE)NULL; return 1; } dllfn_pstotextInit(&pstotextInstance); return 0; } int unload_pstotext(void) { if (pstotextInstance) dllfn_pstotextExit(pstotextInstance); pstotextInstance = NULL; if (pstotextModule) DosFreeModule(pstotextModule); pstotextModule = (HMODULE)NULL; return 0; } int send_prolog(FILE *f, int resource) { char *prolog, *p; APIRET rc; int code = -1; rc = DosGetResource(pstotextModule, RT_RCDATA, resource, (PPVOID)&prolog); if (!rc && (prolog != (char *)NULL) ) { code = 0; p = prolog; while (*p) { if (!code) fputs(p, f); p += strlen(p)+1; } DosFreeResource(prolog); } else { fprintf(stderr, "Failed to load pstotext resource %d\n", resource); } return code; } #endif /* OS/2 (!MSDOS) */ #endif /* (!_Windows) */ /* create an empty temporary file and return its name */ static char *scratch_file(void) { FILE *f; char *temp; char *path = malloc(256); if (path == NULL) return NULL; if ( (temp = 
getenv("TEMP")) != NULL ) strcpy(path, temp); else if ( (temp = getenv("TMP")) != NULL ) strcpy(path, temp); else strcpy(path, "c:\\"); /* Prevent X's in path from being converted by mktemp. */ for ( temp = path; *temp; temp++ ) { *temp = (char)tolower(*temp); if (*temp == '/') *temp = '\\'; } if ( strlen(path) && (path[strlen(path)-1] != '\\') ) strcat(path, "\\"); strcat(path, "ptXXXXXX"); mktemp(path); f = fopen(path, "w"); if (f==NULL) {perror(cmd); exit(1);} fclose(f); return path; } static char *make_temp(int resource) { /* Return pathname of temporary file containing prolog from resources. Caller should unlink file (and, technically, free pathname). */ FILE *f; char *path = scratch_file(); if (path == NULL) {perror(cmd); cleanup(); exit(1);} f = fopen(path, "w"); if (f==NULL) {perror(cmd); cleanup(); exit(1);} send_prolog(f, resource); fclose(f); return path; } static char *ocr_path = NULL, *rotate_path = NULL; static FILE *gs = NULL; char *gstemp = NULL; static void *instance; /* pstotext state */ static int cleanup(void) { int status = 0; unload_pstotext(); if (gs!=NULL) { #if defined(_Windows) || defined(MSDOS) fclose(gs); #else pclose(gs); #endif } if (gstemp!=NULL && !debug) unlink(gstemp); if (rotate_path!=NULL && strcmp(rotate_path, "")!=0 && !debug) unlink(rotate_path); if (ocr_path!=NULL && !debug) unlink(ocr_path); return status; } static void handler(int code) { int status = code; /* suppress unreference 'code' warning */ status = cleanup(); if (status!=0) exit(status); exit(2); } static void do_it(char *path) { /* If "path" is NULL, then "stdin" should be processed. 
*/ char gs_cmd[2*MAXPATHLEN]; char input[MAXPATHLEN]; int status; FILE *fileout; #ifdef MSDOS char *gsargtemp; FILE *gsargfile; #endif fileout = stdout; if (strlen(outfile) != 0) { fileout = fopen(outfile, "w"); if (fileout == NULL) {perror(cmd); exit(1);} } signal(SIGINT, handler); signal(SIGTERM, handler); status = load_pstotext(); if (status!=0) { fprintf(stderr, "%s: internal error %d\n", cmd, status); exit(5); } if (cork) { status = dllfn_pstotextSetCork(pstotextInstance, TRUE); if (status!=0) { fprintf(stderr, "%s: internal error %d\n", cmd, status); exit(5); } } ocr_path = make_temp(OCR_PROLOG); switch (orientation) { case portrait: rotate_path = ""; break; case landscape: rotate_path = make_temp(ROT270_PROLOG); break; case landscapeOther: rotate_path = make_temp(ROT90_PROLOG); break; } if (path==NULL) strcpy(input, "-"); else {strcpy(input, "-- "); strcat(input, path);} #if defined(_Windows) || defined(MSDOS) /* don't support pipes, so write gs output to a temporary file */ if ( (gstemp = scratch_file()) == NULL) { cleanup(); exit(1); } #endif #ifdef MSDOS /* MSDOS has command line length problems */ if ( (gsargtemp = scratch_file()) == NULL) { cleanup(); exit(1); } if ( (gsargfile = fopen(gsargtemp, "w")) == (FILE *)NULL) { cleanup(); exit(1); } fprintf(gsargfile, "-r72 -dNODISPLAY -dFIXEDMEDIA -dDELAYBIND -dWRITESYSTEMDICT %s -dNOPAUSE\n", (debug ? "" : "-q")); fputs(rotate_path, gsargfile); fputs("\n", gsargfile); fputs(ocr_path, gsargfile); fputs("\n", gsargfile); fputs(input, gsargfile); fputs("\n", gsargfile); fclose(gsargfile); sprintf(gs_cmd, "%s @%s %s %s", gscommand, gsargtemp, #if defined(_Windows) || defined(MSDOS) "> ", gstemp #else "", "" #endif ); #else /* !MSDOS */ sprintf(gs_cmd, "%s -r72 -dNODISPLAY -dFIXEDMEDIA -dDELAYBIND -dWRITESYSTEMDICT %s -dNOPAUSE %s %s %s %s %s", gscommand, (debug ? 
"" : "-q"), ocr_path, rotate_path, input, #if defined(_Windows) || defined(MSDOS) "> ", gstemp #else "", "" #endif ); #endif if (debug) { fputs(gs_cmd, stdout); fputc('\n', stdout); } #if defined(_Windows) || defined(MSDOS) if (system(gs_cmd)) { fprintf(stderr,"\nCan't run (errno=%d):\n %s\n", errno, gs_cmd); cleanup(); exit(1); } gs = fopen(gstemp, "r"); #else gs = popen(gs_cmd, "r"); #endif #ifdef MSDOS if (!debug) unlink(gsargtemp); free(gsargtemp); #endif if( gs==NULL ) {perror(cmd); cleanup(); exit(1);} while (gs != NULL) { /* while TRUE */ char line[LINELEN]; char *pre, *word, *post; int llx, lly, urx, ury; if (fgets(line, LINELEN, gs)==NULL) break; if (debug) fputs(line, stdout); status = dllfn_pstotextFilter( pstotextInstance, line, &pre, &word, &post, &llx, &lly, &urx, &ury); if (status!=0) { fprintf(stderr, "%s: internal error %d\n", cmd, status); cleanup(); exit(5); } if (word!=NULL) if (!bboxes) { fputs(pre, fileout); fputs(word, fileout); fputs(post, fileout); if (debug) fputc('\n', stdout); } else { if (pre) { if (*pre == ' ') pre++; fputs(pre, fileout); } fprintf(fileout, "%6d\t%6d\t%6d\t%6d\t%s\n", llx, lly, urx, ury, word); if (post) fputs(post, fileout); } } if (fileout != stdout) fclose(fileout); status = cleanup(); if (status!=0) exit(status); } int main(int argc, char *argv[]) { int i; char *arg; cmd = argv[0]; for (i = 1; i=argc) {usage(); exit(1);} outfile = argv[i]; } else if (strcmp(arg, "-")==0) do_it(NULL); else if (arg[0] == '-') {usage(); exit(1);} else /* file */ { explicitFiles++; do_it(arg); } } if (explicitFiles==0) do_it(NULL); return 0; } pstotext-1.9/pstotxtm.mak0100664000076400007640000000210706610565012014025 0ustar rjlrjl# pstotxtm.mak # Makefile for pstotxtm.exe, for 8086 MS-DOS # makefile created by # Russell Lang, 1996-10-11 # For debugging, use bcc -v COMPBASE = c:\bc45 COMPDIR = $(COMPBASE)\bin INCDIR = $(COMPBASE)\include LIBDIR = $(COMPBASE)\lib #DEBUGLINK = -v MODEL=l CCAUX=$(COMPDIR)\bcc CC=$(COMPDIR)\bcc $(DEBUGLINK) 
-m$(MODEL) -DNEED_PROTO -DMSDOS all: pstotxtm.exe .c.obj: $(CC) -c $*.c ocr.h: ocr.ps mkbundle.exe mkbundle $*.ps $*.h rot270.h: rot270.ps mkbundle.exe mkbundle $*.ps $*.h rot90.h: rot90.ps mkbundle.exe mkbundle $*.ps $*.h mkbundle.exe: mkbundle.c $(CCAUX) $*.c ptotdll.obj: ptotdll.c ptotdll.h $(CC) -c $*.c bundle.obj: bundle.c $(CC) -c $*.c pstotxtm.obj: pstotxtd.c ocr.h rot270.h rot90.h $(CC) -c -opstotxtm.obj pstotxtd.c pstotxtm.exe: pstotxtm.obj ptotdll.obj bundle.obj $(CC) -epstotxtm.exe pstotxtm.obj ptotdll.obj bundle.obj rename pstotxtm.exe pstotxtm.exe prezip: pstotxtm.exe copy pstotxtm.exe ..\pstotxtm.exe copy pstotext.txt ..\pstotext.txt clean: -del pstotxtm.exe -del *.obj -del ocr.h -del rot270.h -del rot90.h -del mkbundle.exe pstotext-1.9/pstotxtv.mak0100664000076400007640000000474207777477027014074 0ustar rjlrjl# pstotxtv.mak # Makefile for pstotxt[3a].dll, for use with GSview, # Microsoft Visual C++ and Win32 Intel x86 or DEC Alpha. # Has not been tested with DEC Alpha # makefile created by # Russell Lang, 1998-10-09 # Updated for MSVC++ 7, 2003-03-09 # Updated for MSVC++ 7.1 2004-01-09 # Edit VCVER and DEVBASE as required !ifndef VCVER VCVER=71 !endif !ifndef DEVBASE !if $(VCVER) <= 5 DEVBASE=C:\Program Files\devstudio !endif !if $(VCVER) == 6 DEVBASE=C:\Program Files\Microsoft Visual Studio !endif !if $(VCVER) == 7 DEVBASE=C:\Program Files\Microsoft Visual Studio .NET !endif !if $(VCVER) == 71 DEVBASE=C:\Program Files\Microsoft Visual Studio .NET 2003 !endif !endif # Debugging DEBUG=1 !if $(DEBUG) DEBUGLINK=/DEBUG CDEBUG=/Zi !endif # For Intel x386 use pstotxt3 DEST=pstotxt3 DEFS=/D__WIN32__ CFLAGS=$(DEFS) /W4 # For Alpha, uncomment the following two lines #DEST=pstotxta #DEFS=/D__WIN32__ /DDECALPHA #CFLAGS=$(DEFS) /W4 !if $(VCVER) <= 5 COMPBASE = $(DEVBASE)\vc RCOMP="$(DEVBASE)\sharedide\bin\rc" -D_MSC_VER $(DEFS) !endif !if $(VCVER) == 6 COMPBASE = $(DEVBASE)\vc98 RCOMP="$(DEVBASE)\common\msdev98\bin\rc" -D_MSC_VER $(DEFS) !endif !if (($(VCVER) == 
7) || ($(VCVER) == 71)) COMPBASE = $(DEVBASE)\vc7 RCOMP="$(DEVBASE)\Vc7\bin\rc" -D_MSC_VER $(DEFS) !endif COMPDIR = $(COMPBASE)\bin INCDIR = $(COMPBASE)\include LIBDIR = $(COMPBASE)\lib CC="$(COMPDIR)\cl" -DNEED_PROTO $(CFLAGS) $(CDEBUG) "-I$(INCDIR)" CCAUX=$(CC) all: $(DEST).dll $(DEST).exe .c.obj: $(CC) -c $*.c ocr.h: ocr.ps mkrch.exe mkrch $*.ps $*.h 1 rot270.h: rot270.ps mkrch.exe mkrch $*.ps $*.h 2 rot90.h: rot90.ps mkrch.exe mkrch $*.ps $*.h 3 mkrch.exe: mkrch.c $(CCAUX) $*.c $(DEST).obj: ptotdll.c ptotdll.h $(CC) /c /D_Windows /D__DLL__ /Fo$(DEST).obj ptotdll.c $(DEST).rc: ocr.h rot270.h rot90.h copy ocr.h+rot270.h+rot90.h $(DEST).rc $(DEST).res: pstotxt3.rc $(RCOMP) "-i$(INCDIR)" -r $(DEST).rc $(DEST).dll: $(DEST).obj $(DEST).res "$(COMPDIR)\link" $(DEBUGLINK) /DLL /DEF:pstotxt3.def /OUT:$(DEST).dll $(DEST).obj $(DEST).res $(DEST).exe: pstotxtd.c $(CC) /D_Windows /Fe$(DEST).exe pstotxtd.c /link $(DEBUGLINK) prezip: all copy $(DEST).dll ..\$(DEST).dll copy $(DEST).exe ..\$(DEST).exe copy pstotext.txt ..\pstotext.txt clean: -del pstotxtd.exe -del $(DEST).exe -del $(DEST).dll -del $(DEST).res -del $(DEST).rc -del $(DEST).exp -del $(DEST).ilk -del $(DEST).lib -del $(DEST).pdb -del *.obj -del ocr.h -del rot270.h -del rot90.h -del mkrch.exe -del mkrch.ilk -del mkrch.pdb -del vc*.pdb pstotext-1.9/ptotdll.c0100664000076400007640000007236707777500077013315 0ustar rjlrjl/* Copyright (C) 1995-1998, Digital Equipment Corporation. */ /* All rights reserved. */ /* See the file pstotext.txt for a full description. 
*/ /* Last modified on Fri Jan 09 21:19:00 AEST 2004 by rjl */ /* modified on Fri Jan 09 08:21:00 AEST 2004 by rjl */ /* modified on Wed Oct 28 08:42:15 PST 1998 by mcjones */ /* modified on Sun Jul 28 00:00:00 UTC 1996 by rjl */ /* Modifications by rjl * Fixed compiler warnings */ /* This module is based on OCR_PS.m3, a module of the Virtual Paper project at the DEC Systems Research Center: http://www.research.digital.com/SRC/virtualpaper/ */ #include #include #include #include "ptotdll.h" #ifndef NULL #define NULL 0 #endif #define BOOLEAN int #define FALSE 0 #define TRUE 1 #define MIN(a,b) ((a)<=(b)?(a):(b)) #define MAX(a,b) ((b)<=(a)?(a):(b)) /* Character encoding. Each element of the QE directive produced by ocr.ps is either an index in the StandardGlyph array, or is "NonstandardGlyph" (indicating the corresponding entry in the font's encoding specifies some nonstandard glyph). */ typedef unsigned GlyphIndex; #define NonstandardGlyph 9999 #define UnknownChar '#' /* substitute for nonstandard glyph */ /* The first 256 entries in StandardGlyphs correspond to ISOLatin1; the next 28 entries correspond to characters not in ISOLatin1, but defined in the standard /Times-Roman font. 
*/ #define LastISOLatin1 255 #define FIRSTSpecialGlyphs (LastISOLatin1+1) #define LASTSpecialGlyphs (LastISOLatin1+28) static const char *SpecialGlyphs[] = { "''", /* quotedblright */ "S\237", /* Scaron */ "+", /* dagger */ "<", /* guilsinglleft */ "Z\237", /* Zcaron */ "#", /* daggerdbl */ "L/", /* Lslash */ "...", /* ellipsis */ ">", /* guilsinglright */ "oe", /* oe */ "fi", /* fi */ ".", /* bullet */ "o/oo", /* perthousand */ "''", /* quotedblbase */ "--", /* endash */ "---", /* emdash */ "^TM", /* trademark */ "f", /* florin */ "l/", /* lslash */ "s\237", /* scaron */ "Y\250", /* Ydieresis */ "fl", /* fl */ "/", /* fraction */ "``", /* quotedblleft */ "'", /* quotesinglbase */ "'", /* quotesingle */ "z\237", /* zcaron */ "OE" /* OE */ }; /* The next 256 entries correspond to the self-named glyphs used in Type 3 fonts from dvips: "\000", ..., "\377": */ #define FirstDvips (LASTSpecialGlyphs+1) #define LastDvips (FirstDvips+256-1) /* The next 512 entries correspond to glyph names used in Microsoft TrueType fonts: "G00", ..., "Gff" and "G00", ..., "GFF", which in both cases correspond to ISOLatin1 with some extensions. 
*/ #define FirstTT1 (LastDvips+1) #define LastTT1 (FirstTT1+256-1) #define FirstTT2 (LastTT1+1) #define LastTT2 (FirstTT2+256-1) #define FirstOldDvips (LastTT2+1) #define LastOldDvips (FirstOldDvips+128-1) /* note only 128 */ #define FIRSTTTSpecialGlyphs (FirstTT1+130) #define LASTTTSpecialGlyphs (FirstTT1+159) static const char *TTSpecialGlyphs[] = { "'", /* quotesinglbase */ "f", /* florin */ "''", /* quotdblbase */ "...", /* ellipsis */ "+", /* dagger */ "#", /* daggerdbl */ "\223", /* circumflex */ "o/oo", /* perthousand */ "S\237", /* Scaron */ "<", /* guilsinglleft */ "OE", /* OE */ "#", /* */ "#", /* */ "#", /* */ "#", /* */ "`", /* ISOLatin1: quoteleft */ "'", /* ISOLatin1: quoteright */ "``", /* quotedblleft */ "''", /* quotedblright */ ".", /* bullet */ "--", /* endash */ "---", /* emdash */ "~", /* ISOLatin1: tilde */ "^TM", /* trademark */ "s\237", /* scaron */ ">", /* guilsinglright */ "oe", /* oe */ "#", /* */ "#", /* */ "Y\250" /* Ydieresis" */ }; #define FIRSTDvipsGlyphs FirstDvips #define LASTDvipsGlyphs (FirstDvips+127) static const char *DvipsGlyphs[] = { /* 00x */ "\\Gamma", "\\Delta", "\\Theta", "\\Lambda", "\\Xi", "\\Pi", "\\Sigma", "\\Upsilon", /* 01x */ "\\Phi", "\\Psi", "\\Omega", "ff", "fi", "fl", "ffi", "ffl", /* 02x */ "i", /* \imath */ "j", /* \jmath */ "`", "'", "\237", /* caron */ "\226", /* breve */ "\257", /* macron */ "\232", /* ring */ /* 03x */ "\270", /* cedilla */ "\337", /* germandbls */ "ae", "oe", "\370", /* oslash */ "AE", "OE", "\330", /* Oslash */ /* 04x */ "/" /* bar for Polish suppressed-L ??? */, "!", "''", "#", "$", "%", "&", "'", /* 05x */ "(", ")", "*", "+", ",", "\255" /* hyphen */, ".", "/", /* 06x */ "0", "1", "2", "3", "4", "5", "6", "7", /* 07x */ "8", "9", ":", ";", "!" /* exclamdown */, "=", "?" 
/* questiondown */, "?", /* 010x */ "@", "A", "B", "C", "D", "E", "F", "G", /* 011x */ "H", "I", "J", "K", "L", "M", "N", "O", /* 012x */ "P", "Q", "R", "S", "T", "U", "V", "W", /* 013x */ "X", "Y", "Z", "[", "``", "]", "\223" /* circumflex */, "\227" /* dotaccent */, /* 014x */ "`", "a", "b", "c", "d", "e", "f", "g", /* 015x */ "h", "i", "j", "k", "l", "m", "n", "o", /* 016x */ "p", "q", "r", "s", "t", "u", "v", "w", /* 017x */ "x", "y", "z", "--", /* en dash */ "---", /* em dash */ "\235", /* hungarumlaut */ "~", "\250" /* dieresis */ }; #define FIRSTCorkSpecialGlyphs FirstDvips #define LASTCorkSpecialGlyphs (FirstDvips+0277) static const char *CorkSpecialGlyphs[] = { /* 000 - accents for lowercase letters */ "`", "'", "^", "~", "\230", /* umlaut/dieresis */ "\235", /* hungarumlaut */ "\232", /* ring */ "\237", /* hacek/caron */ "\226", /* breve */ "\257", /* macron */ "\227", /* dot above/dotaccent */ "\270", /* cedilla */ "\236", /* ogonek */ /* 015 - miscellaneous */ "'", /* single base quote/quotesinglbase */ "<", /* single opening guillemet/guilsinglleft */ ">", /* single closing guillemet/guilsinglright */ "``", /* english opening quotes/quotedblleft */ "''", /* english closing quotes/quotedblright */ ",,", /* base quotes/quotedblbase */ "<<", /* opening guillemets/guillemotleft */ ">>", /* closing guillemets/guillemotright */ "--", /* en dash/endash */ "---", /* em dash/emdash */ "", /* compound work mark (invisible)/ */ "o", /* perthousandzero (used in conjunction with %) */ "\220", /* dotless i/dotlessi */ "j", /* dotless j */ "ff", /* ligature ff */ "fi", /* ligature fi */ "fl", /* ligature fl */ "ffi", /* ligature ffi */ "ffl", /* ligature ffl */ "_", /* visible space */ /* 041 - ASCII */ "!", "\"", "#", "$", "%", "&", "'", "(", ")", "*", "+", ",", "-", ".", "/", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", ":", ";", "<", "=", ">", "?", "@", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", 
"V", "W", "X", "Y", "Z", "[", "\\","]", "^", "_", "`", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "{", "|", "}", "~", "\255", /* hyphenchar (hanging) */ /* 200 - letters for eastern European languages from latin-2 */ "A\226", /* Abreve */ "A\236", /* Aogonek */ "C\264", /* Cacute */ "C\237", /* Chacek */ "D\237", /* Dhacek */ "E\237", /* Ehacek */ "E\236", /* Eogonek */ "G\226", /* Gbreve */ "L\264", /* Lacute */ "L\237", /* Lhacek */ "L/", /* Lslash/Lstroke */ "N\264", /* Nacute */ "N\237", /* Nhacek */ "\\NG", /* Eng */ "O\235", /* Ohungarumlaut */ "R\264", /* Racute */ "R\237", /* Rhacek */ "S\264", /* Sacute */ "S\237", /* Shacek */ "S\270", /* Scedilla */ "T\237", /* Thacek */ "T\270", /* Tcedilla */ "U\235", /* Uhungarumlaut */ "U\232", /* Uring */ "Y\250", /* Ydieresis */ "Z\264", /* Zacute */ "Z\237", /* Zhacek */ "Z\227", /* Zdot */ "IJ", /* IJ */ "I\227", /* Idot */ "\\dj", /* dbar */ "\247", /* section */ "a\226", /* abreve */ "a\236", /* aogonek */ "c\222", /* cacute */ "c\237", /* chacek */ "d\237", /* dhacek */ "e\237", /* ehacek */ "e\236", /* eogonek */ "g\226", /* gbreve */ "l\222", /* lacute */ "l\237", /* lhacek */ "l/", /* lslash */ "n\222", /* nacute */ "n\237", /* nhacek */ "\\ng", /* eng */ "o\235", /* ohungarumlaut */ "r\222", /* racute */ "r\237", /* rhacek */ "s\222", /* sacute */ "s\237", /* shacek */ "s\270", /* scedilla */ "t\237", /* thacek */ "t\270", /* tcedilla */ "u\235", /* uhungarumlaut */ "u\232", /* uring */ "y\230", /* ydieresis */ "z\222", /* zacute */ "z\237", /* zhacek */ "z\227", /* zdot */ "ij", /* ij */ "\241", /* exclamdown */ "\277", /* questiondown */ "\243" /* sterling */ /* 0300-0377 is same as ISO 8859/1 except: 0337 is Ess-zed and 0377 is ess-zed/germandbls */ }; /* There are gaps in the set of printable ISOLatin1 characters: */ /*CONST ISOLatin1Gaps = SET OF [0..255] { 8_0..8_37, 8_177..8_217, 8_231, 8_234}; */ typedef struct { 
double blx, bly, toprx, topry; /* font matrix in character coordinates */ struct {double x, y;} chr[256]; /* widths in character coordinates */ } MetricsRec; typedef MetricsRec *Metrics; typedef Metrics MetricsTable[]; typedef GlyphIndex EncodingVector[256]; typedef EncodingVector *Encoding; typedef Encoding EncodingTable[]; typedef struct { double x, y; /* (1000,0) in font's character coordinate system */ double xp, yp; /* (0,1000) in font's character coordinate system */ int e; /* index in "encoding" */ int m; /* index in "metrics" */ double bx, by, tx, ty; /* height of font bbox in reporting coordinates */ } FontRec; typedef FontRec *Font; typedef Font FontTable[]; /* Instance "T". */ #define state_normal 0 #define state_metrics 1 #define state_encoding 2 typedef struct { double itransform[6]; /* transform from device to default coordinates */ int metricsSize; MetricsTable *metrics; int encodingSize; EncodingTable *encoding; BOOLEAN dvipsIsCork; /* assume Cork rather than "OT1" for dvips output */ int fontSize; FontTable *font; /* Data for current word prefix: */ char buf[1000]; int lbuf; /* elements 0 through "lbuf-1" of "buf" are in use */ int f; /* font number */ double x0, y0, x1, y1; /* initial and final currentpoint */ BOOLEAN nonEmptyPage; long blx, bly, toprx, topry; /* bounding box of last word output */ char word[1000]; /* last word output */ int state; /* state-specific components: */ /* state_encoding: */ int encoding_e, encoding_n, encoding_i; /* state_metrics: */ int metrics_m, metrics_i; } T; static int ReadChar(char **instr); static void UnreadChar(char **instr); static int ReadInt(char **instr); static long ReadLong(char **instr); static int ParseInverseTransform(T *t, char *instr); static int ParseEncoding(T *t, char *instr); static int ParseEncodingMore(T *t, char *instr); static void ReadPair(double *x, double *y, char **instr); static int ParseFont(T *t, char *instr); static int ParseMetrics(T *t, char *instr); static int ParseMetricsMore(T 
*t, char *instr); static void Itransform(T *t, double *x1, double *y1, double x0, double y0); static void Output(T *t, const char **pre, const char **word, int *llx, int *lly, int *urx, int *ury); static BOOLEAN SameDirection(double x0, double y0, double x1, double y1); static int ParseString( T *t, char *instr, const char **pre, const char **word, const char **post, int *llx, int *lly, int *urx, int *ury); int DLLEXPORT pstotextInit(void **instance) { T *t; int i; t = (T *)malloc(sizeof(T)); if (t == NULL) return PSTOTEXT_INIT_MALLOC; t->state = state_normal; /* Initialize t->itransform to the identity transform. */ t->itransform[0] = 1.0; t->itransform[1] = 0.0; t->itransform[2] = 0.0; t->itransform[3] = 1.0; t->itransform[4] = 0.0; t->itransform[5] = 0.0; t->metricsSize = t->encodingSize = t->fontSize = 100; t->metrics = (MetricsTable *)malloc(t->metricsSize * sizeof(Metrics)); if (t->metrics == NULL) { free(t); return PSTOTEXT_INIT_MALLOC; } for(i=0; imetricsSize; i++)(*t->metrics)[i] = NULL; t->encoding = (EncodingTable *)malloc(t->encodingSize * sizeof(Encoding)); if (t->encoding == NULL) { free(t); return PSTOTEXT_INIT_MALLOC; } for(i=0;iencodingSize;i++)(*t->encoding)[i] = NULL; t->dvipsIsCork = FALSE; t->font = (FontTable *)malloc(t->fontSize * sizeof(Font)); if (t->font == NULL) { free(t); return PSTOTEXT_INIT_MALLOC; } for(i=0;ifontSize;i++)(*t->font)[i] = NULL; t->lbuf = 0; t->nonEmptyPage = FALSE; t->blx = t->bly = t->toprx = t->topry = 0; *instance = t; return 0; } int DLLEXPORT pstotextSetCork(void *instance, int value) { T *t = (T *)instance; t->dvipsIsCork = value; return 0; } int DLLEXPORT pstotextExit(void *instance) { T *t = (T *)instance; free(t->metrics); free(t->encoding); free(t->font); free(t); return 0; } static int ReadChar(char **instr) { int c = **(unsigned char**)instr; (*instr)++; return c; } static void UnreadChar(char **instr) { (*instr)--; } static int ReadInt(char **instr) { int i = 0; int sign = 1; int c; while ((c = 
/* Parse a decimal long from "*instr" and advance "*instr" past it.

   Leading blanks are skipped, then an optional '-' and a run of digits are
   consumed.  On return "*instr" points at the first character that is not
   part of the number (a '-' with no digits after it is still consumed).
   Returns 0 when no digits are present.  No overflow checking is done. */
static long ReadLong(char **instr)
{
  unsigned char *cursor = (unsigned char *)*instr;
  long magnitude = 0;
  int negative = 0;

  while (*cursor == ' ')
    cursor++;

  if (*cursor == '-') {
    negative = 1;
    cursor++;
  }

  for (; *cursor >= '0' && *cursor <= '9'; cursor++)
    magnitude = magnitude * 10 + (*cursor - '0');

  *instr = (char *)cursor;
  return negative ? -magnitude : magnitude;
}
*/ tooSparse = 0; for (i = 'a'; i<='z'; i++) tooSparse = (*enc)[i] == NonstandardGlyph; if (tooSparse) for (i = 0; i<256; i++) if ((*enc)[i] == NonstandardGlyph) (*enc)[i] = i; } return 0; } #define GuessAscend 0.9 #define GuessDescend -0.3 static void ReadPair(double /*out*/ *x, /*out*/ double *y, char **instr) { *x = ReadLong(instr) / 100.0; *y = ReadLong(instr) / 100.0; } static int ParseFont(T *t, char *instr) { /* Parse QF directive. */ int n = ReadInt(&instr), i; Metrics mt; Font f; double xmax, bly, topry; if (n<0) return PSTOTEXT_FILTER_BADFONTNUMBER; /* Grow "t->font" if necessary. */ if (t->fontSize<=n) { int oldSize = t->fontSize; t->fontSize = 2*n; t->font = (FontTable *)realloc( (char *)t->font, t->fontSize * sizeof(Font) ); for(i=oldSize;ifontSize;i++)(*t->font)[i] = NULL; } /* If this is the first font numbered "n", allocate "FontRec". */ if ((*t->font)[n] == NULL) (*t->font)[n] = (Font)malloc(sizeof(FontRec)); f = (*t->font)[n]; ReadPair(&f->x, &f->y, &instr); ReadPair(&f->xp, &f->yp, &instr); f->e = ReadInt(&instr); if ((*t->encoding)[f->e] == NULL) return PSTOTEXT_FILTER_BADENCODINGNUMBER; f->m = ReadInt(&instr); mt = (*t->metrics)[f->m]; if (mt == NULL) return PSTOTEXT_FILTER_BADMETRICNUMBER; /* Transform height of font bounding box to reporting coordinates: */ f->bx = f->xp * mt->bly / 1000.0; f->by = f->yp * mt->bly / 1000.0; f->tx = f->xp * mt->topry / 1000.0; f->ty = f->yp * mt->topry / 1000.0; /* In some fonts produced by dvips, the FontBBox is incorrectly defined as [0 0 1 1]. We check for this, and apply the same heuristic used for an undefined FontBBox in "ParseMetrics". 
*/ if (f->by-f->ty < 1.1) { xmax = 0.0; for (i = 0; i<256; i++) if (mt->chr[i].x > xmax) xmax = mt->chr[i].x; bly = GuessDescend * xmax; topry = GuessAscend * xmax; f->bx = f->xp * bly / 1000.0; f->by = f->yp * bly / 1000.0; f->tx = f->xp * topry / 1000.0; f->ty = f->yp * topry / 1000.0; } return 0; } static int ParseMetrics(T *t, char *instr) { /* Parse first line of QM directive. */ int m = ReadInt(&instr), i; Metrics mt; if (m<0) return PSTOTEXT_FILTER_BADMETRICNUMBER; /* Grow "t->metrics" if necessary. */ if (t->metricsSize<=m) { int oldSize = t->metricsSize; t->metricsSize = 2*m; t->metrics = (MetricsTable *)realloc( (char *)t->metrics, t->metricsSize * sizeof(Metrics) ); for (i=oldSize;imetricsSize;i++)(*t->metrics)[i] = NULL; } /* If this is the first metrics numbered "m", allocate "MetricsRec". */ if ((*t->metrics)[m] == NULL) (*t->metrics)[m] = (Metrics)malloc(sizeof(MetricsRec)); mt = (*t->metrics)[m]; ReadPair(&mt->blx, &mt->bly, &instr); ReadPair(&mt->toprx, &mt->topry, &instr); t->state = state_metrics; t->metrics_m = m; t->metrics_i = 0; return 0; } static int ParseMetricsMore(T *t, char *instr) { /* Parse subsequent line of QM directive. */ int i; Metrics mt = (*t->metrics)[t->metrics_m]; for (i = t->metrics_i; imetrics_i+8; i++) ReadPair(&mt->chr[i].x, &mt->chr[i].y, &instr); t->metrics_i += 8; if (t->metrics_i < 256) /* skip */ ; else { /* End of directive. */ t->state = state_normal; /* If "FontBBox" was not specified, take a guess. */ if (mt->blx == 0.0 && mt->bly == 0.0 && mt->toprx == 0.0 && mt->topry == 0.0) { for (i = 0; i<256; i++) if (mt->chr[i].x > mt->toprx) mt->toprx = mt->chr[i].x; mt->bly = GuessDescend * mt->toprx; mt->topry = GuessAscend * mt->toprx; } } return 0; } static void Itransform(T *t, double *x1, double *y1, double x0, double y0) { /* Set (*x1, *y1) to (t->itransform) * (x0, y0). 
*/ *x1 = t->itransform[0]*x0 + t->itransform[2]*y0 + t->itransform[4]; *y1 = t->itransform[1]*x0 + t->itransform[3]*y0 + t->itransform[5]; } static void Output(T *t, const char **pre, const char **word, int *llx, int *lly, int *urx, int *ury) { /* Output the next word. */ double x0, y0, x1, y1, x2, y2, x3, y3; long blx, bly, toprx, topry, mid; Font f; f = (*t->font)[t->f]; /* Compute the corners of the parallelogram with width "(t->x0,t->y0)" to "(t->x1,t->y1)" and height "(f.bx,f.by)" to "(f.tx,f.ty)". Then compute the bottom left corner and the top right corner of the bounding box (rectangle with sides parallel to the coordinate system) of this rectangle. */ x0 = t->x0 + f->bx; y0 = t->y0 + f->by; x1 = t->x1 + f->bx; y1 = t->y1 + f->by; x2 = t->x0 + f->tx; y2 = t->y0 + f->ty; x3 = t->x1 + f->tx; y3 = t->y1 + f->ty; blx = (long)ceil(MIN(MIN(MIN(x0, x1), x2), x3)); bly = (long)ceil(MAX(MAX(MAX(y0, y1), y2), y3)); /* *** should this be floor? PMcJ 981002 */ toprx = (long)floor(MAX(MAX(MAX(x0, x1), x2), x3)); topry = (long)floor(MIN(MIN(MIN(y0, y1), y2), y3)); /* *** should this be ceil? PMcJ 981002 */ if (blx!=toprx && bly!=topry) { /* Output word separator if this isn't first word on page. */ if (t->nonEmptyPage) { mid = (topry+bly) / 2; if (blxblx <= blx && t->topry <= mid && mid <= t->bly) *pre = " "; /* same line */ else *pre = "\n"; /* different line */ } else *pre = ""; /* Output elements "0" through "t->lbuf-1" of "t->buf". 
*/ t->buf[t->lbuf] = '\0'; strncpy(t->word, t->buf, t->lbuf+1); *word = t->word; t->nonEmptyPage = TRUE; t->blx = blx; t->bly = bly; t->toprx = toprx; t->topry = topry; /* transform device units to default PostScript units */ Itransform( t, &x1, &y1, (double)blx, (double)bly); blx = (long)floor(x1); bly = (long)floor(y1); Itransform( t, &x1, &y1, (double)toprx, (double)topry); toprx = (long)ceil(x1); topry = (long)ceil(y1); if (blx < toprx) { *llx = blx; *urx = toprx; } else { *llx = toprx; *urx = blx; } if (bly < topry) { *lly = bly; *ury = topry; } else { *lly = topry; *ury = bly; } } /*if (blx!=toprx && bly!=topry) { */ t->lbuf = 0; } static BOOLEAN SameDirection(double x0, double y0, double x1, double y1) { return (y0 == 0.0 && y1 == 0.0 && x0*x1 > 0.0) || (x0 == 0.0 && x1 == 0.0 && y0*y1 > 0.0) || (x0 * y1 == x1 * y0); } static int ParseString(T *t, char *instr, const char **pre, const char **word, const char **post, int *llx, int *lly, int *urx, int *ury) { /* Parse QS directive. */ #define spaceTol 0.3 /* fraction of average character width to signal word break */ char buf[1000]; int n, ch, i, j, in, l; Font f; Encoding enc; GlyphIndex glyph; double x0, y0, x1, y1, xsp, ysp, dx, dy, maxx, maxy; #define SetBuf() \ { \ strncpy(t->buf, buf, l); \ t->lbuf = l; \ t->f = n; \ t->x0 = x0; t->y0 = y0; t->x1 = x1; t->y1 = y1; \ } n = ReadInt(&instr); /* index in "t->font" */ f = (*t->font)[n]; if (f == NULL) return PSTOTEXT_FILTER_BADFONTNUMBER; enc = (*t->encoding)[f->e]; if (enc==NULL) return PSTOTEXT_FILTER_BADENCODINGNUMBER; ReadPair(&x0, &y0, &instr); /* initial currentpoint */ j = ReadInt(&instr); /* length of string */ ch = ReadChar(&instr); if (ch != ' ') return PSTOTEXT_FILTER_BADQS; l = 0; for (i = 0; i<=j-1; i++) { in = ReadChar(&instr); /* if (in=='\0') return PSTOTEXT_FILTER_BADQS; */ /* TeX uses '\0' */ glyph = (*enc)[in]; /* If "glyph==0", then "in" mapped to the glyph ".notdef". 
This is usually a mistake, but we check for several known cases: */ if (glyph == 0) { /* If any element of the current encoding is in the range used by Microsoft TrueType, assume this character is, too. */ int k; BOOLEAN tt = FALSE; for(k = 0; !tt && k < sizeof(*enc)/sizeof((*enc)[0]); k++) { if (FirstTT1 <= (*enc)[k] && (*enc)[k] <= LastTT2) tt = TRUE; } if (tt) glyph = FirstTT1 + (int)in; /* There are too many other exceptions to actually trap this: else if (in == '\r') ; // Adobe Illustrator does this... else if (in == '\t') ; // MacDraw Pro does this... else if (in == '\032') ; // MS Word on Mac does this... else return PSTOTEXT_FILTER_BADGLYPHINDEX; */ } if (glyph == 0) /* skip */; else if (glyph <= LastISOLatin1) { buf[l] = (char)glyph; /* *** if (glyph IN ISOLatin1Gaps) buf[l] = UnknownChar; */ l++; } else if (glyph <= LASTSpecialGlyphs) { const char *str = SpecialGlyphs[glyph-FIRSTSpecialGlyphs]; int lstr = strlen(str); strncpy(&buf[l], str, lstr); l += lstr; } else if (glyph <= LastDvips) { const char *str; int lstr; char tempstr[2]; if (t->dvipsIsCork) { if (glyph <= LASTCorkSpecialGlyphs) str = CorkSpecialGlyphs[glyph-FIRSTCorkSpecialGlyphs]; else if (glyph == FIRSTCorkSpecialGlyphs+0337) str = "SS"; else if (glyph == FIRSTCorkSpecialGlyphs+0377) str = "\337"; else { tempstr[0] = (char)(glyph-FIRSTCorkSpecialGlyphs); tempstr[1] = '\0'; str = &tempstr[0]; } } else if (glyph <= LASTDvipsGlyphs) /* Assume old text layout (OT1?). 
*/ str = DvipsGlyphs[glyph-FIRSTDvipsGlyphs]; else { tempstr[0] = UnknownChar; tempstr[1] = '\0'; str = &tempstr[0]; } lstr = strlen(str); strncpy(&buf[l], str, lstr); l += lstr; } else if (glyph <= LastTT2) { if (FirstTT2 <= glyph) glyph -= FirstTT2-FirstTT1; if (glyph < FirstTT1+32) { buf[l] = UnknownChar; l++; } else if (glyph < FIRSTTTSpecialGlyphs || LASTTTSpecialGlyphs < glyph) { buf[l] = (char)(glyph - FirstTT1); l++; } else { const char *str = TTSpecialGlyphs[glyph-FIRSTTTSpecialGlyphs]; int lstr = strlen(str); strncpy(&buf[l], str, lstr); l += lstr; } } else if (glyph <= LastOldDvips) { const char *str = DvipsGlyphs[glyph-FirstOldDvips]; int lstr = strlen(str); strncpy(&buf[l], str, lstr); l += lstr; } else if (glyph == NonstandardGlyph) { /* not in StandardGlyphs */ buf[l] = UnknownChar; l++; } else return PSTOTEXT_FILTER_BADGLYPHINDEX; /* We no longer substitute minus for hyphen. */ /* if (buf[l-1] == '\255') buf[l-1] = '-'; */ } ReadPair(&x1, &y1, &instr); /* final currentpoint */ if (l != 0) { /* "l==0" e.g., when Adobe Illustrator outputs "\r" */ if (t->lbuf == 0) {SetBuf();} else { /* If the distance between this string and the previous one is less than "spaceTol" times the minimum of the average character widths in the two strings, and the two strings are in the same direction, then append this string to the previous one. Otherwise, output the previous string and then save the current one. Sometimes this string overlaps the previous string, e.g., when TeX is overprinting an accent over another character. So we make a special case for this (but only handle the left-to-right orientation). */ /* Set "(xsp,ysp)" to the reporting space coordinates of the minimum of the average width of the characters in this string and the previous one. 
*/ xsp = MIN((t->x1-t->x0) / t->lbuf, (x1-x0) / l); ysp = MIN((t->y1-t->y0) / t->lbuf, (y1-y0) / l); dx = x0 - t->x1; dy = y0 - t->y1; maxx = spaceTol * xsp; maxy = spaceTol * ysp; if ((dx*dx + dy*dy < maxx*maxx + maxy*maxy) || ((t->y1 == y0 && t->x0 <= t->x1 && t->x0 <= x0 && x0 <= t->x1) && SameDirection(t->x1-t->x0, t->y1-t->y0, x1-x0, y1-y0))) { if (t->lbuf+l >= sizeof(t->buf)) { Output(t, pre, word, llx, lly, urx, ury); *post = ""; SetBuf(); } else { strncpy(&t->buf[t->lbuf], buf, l); t->lbuf += l; t->x1 = x1; t->y1 = y1; /* *** Merge font bounding boxes? */ } } else { Output(t, pre, word, llx, lly, urx, ury); *post = ""; SetBuf(); } } } return 0; } int DLLEXPORT pstotextFilter(void *instance, char *instr, const char **pre, const char **word, const char **post, int *llx, int *lly, int *urx, int *ury) { T *t = (T *)instance; int c; *word = NULL; switch (t->state) { case state_normal: do {c = ReadChar(&instr); if (c=='\0') return 0;} while (c!='Q'); c = ReadChar(&instr); switch (c) { case 'I': return ParseInverseTransform(t, instr); case 'M': return ParseMetrics(t, instr); case 'E': return ParseEncoding(t, instr); case 'F': return ParseFont(t, instr); case 'S': return ParseString( t, instr, pre, word, post, llx, lly, urx, ury); case 'C': case 'P': /* copypage, showpage */ /* If any QS directives have been encountered on this page, t->buf will be nonempty now. */ if (t->lbuf > 0) { Output(t, pre, word, llx, lly, urx, ury); *post = "\n\f\n"; } else { *pre = ""; *word = ""; *llx = 0; *lly = 0; *urx = 0; *ury = 0; *post = "\f\n"; } t->nonEmptyPage = FALSE; t->blx = t->bly = t->toprx = t->topry = 0; break; case 'Z': /* erasepage */ /* skip */ break; case '\0': return 0; /* default: skip */ } break; case state_metrics: return ParseMetricsMore(t, instr); case state_encoding: return ParseEncodingMore(t, instr); } return 0; } pstotext-1.9/ptotdll.h0100664000076400007640000000532407777500117013302 0ustar rjlrjl/* Copyright (C) 1996, Digital Equipment Corporation. 
*/ /* All rights reserved. */ /* See the file pstotext.txt for a full description. */ /* Last modified on Fri Jan 09 21:17:00 AEST 2004 by rjl */ /* modified on Sun Oct 13 08:46:00 PDT 1996 by mcjones */ /* modified on Mon Jul 29 14:29:00 UTC 1996 by rjl */ /* Interface to ptotdll.c, which is based on OCR_PS.m3, a module of the Virtual Paper project at the DEC Systems Research Center: http://www.research.digital.com/SRC/virtualpaper/ */ /* Modifications by rjl * Fixed compiler warnings */ #ifndef _PTOTDLL_H #define _PTOTDLL_H #endif #ifdef __DLL__ #ifdef _Windows #include #ifdef __BORLANDC__ #define DLLEXPORT CALLBACK _export /* Win32 Borland C++ */ #else #define DLLEXPORT __declspec(dllexport) CALLBACK /* Win32 MS VC++ */ #endif #else #include #define DLLEXPORT /* OS/2 */ #endif #else #define DLLEXPORT /* all others */ #endif /* DLL exported functions */ int DLLEXPORT pstotextInit(void **instance); /* Allocate and initialize the instance data, and set *instance to the address of that data. Return zero if successful, or PSTOTEXT_INIT_MALLOC if not enough storage could be allocated. */ int DLLEXPORT pstotextSetCork(void *instance, int value); /* Assume the Cork encoding for dvips-produced fonts if value==1, or the old text layout if value==0. (Default is value==0.) */ int DLLEXPORT pstotextFilter( /* input parameters: */ void *instance, char *instr, /* output parameters: */ /* rjl: "char **" made const in 1.9 */ const char **pre, const char **word, const char **post, int *llx, int *lly, int *urx, int *ury ); /* Process *instr, a null-terminated line of GS ocr.ps output, and set the output parameters as follows if ( no output available this call ) *word = 0; else { *pre = "" or " " or "\n"; *word = null-terminated word (not equal to 0); (*llx, *lly, *urx, *ury) = bounding box for *word; *post = "" or "\f\n" or "\n\f\n"; } The strings assigned to *pre, *word, and *post are only valid until the next call of pstotextFilter or pstotextExit. 
pstotextFilter normally returns 0, but a nonzero result implies an error: */ int DLLEXPORT pstotextExit(void *instance); /* Free the storage pointed to by instance. */ /* return codes from pstotextInit and pstotextFilter */ #define PSTOTEXT_INIT_MALLOC 100 #define PSTOTEXT_FILTER_BADENCODINGNUMBER 200 #define PSTOTEXT_FILTER_TOOMANYGLYPHINDEXES 201 #define PSTOTEXT_FILTER_BADFONTNUMBER 202 #define PSTOTEXT_FILTER_BADMETRICNUMBER 203 #define PSTOTEXT_FILTER_BADQS 204 #define PSTOTEXT_FILTER_BADGLYPHINDEX 205 pstotext-1.9/rot270.ps0100664000076400007640000000534406235323356013046 0ustar rjlrjl% Copyright (C) 1995, Digital Equipment Corporation. % All rights reserved. % See the file pstotext.txt for a full description. % % Lectern: % PostScript to be prepended to a job to rotate its images by //angle, % adjusting the page shape appropriately. This works only for % devices (such as ppmraw) that allow arbitrary imaging areas. % Last modified on Thu Aug 1 11:36:13 PDT 1996 by mcjones % modified on Tue Mar 14 14:54:44 PST 1995 by birrell 4 dict begin /myDict currentdict def /angle 270 def /languagelevel where { pop languagelevel 2 ge } { FALSE } ifelse { % Use setpagedevice for level 2 implementations /lastW 0 def /lastH 0 def << /BeginPage { //myDict begin pop % page number currentpagedevice /PageSize get aload pop % stack: W H % (H = ) print dup = % (W = ) print 1 index = flush % (lastH = ) print lastH = % (lastW = ) print lastW = flush lastH ne exch lastW ne or { % /PageSize has changed: rotate it << /PageSize [ currentpagedevice /PageSize get aload pop % stack: W H exch % stack: newW newH /lastW 2 index def /lastH 1 index def ] >> setpagedevice % calls BeginPage recursively % (H := ) print lastH = % (W := ) print lastW = flush } { % Rotate ctm. Don't do this if we're also adjusting PageSize, % because that causes another BeginPage call, which does this. 
currentpagedevice /PageSize get aload pop % stack: width height
        2 div exch 2 div exch % stack: center
        2 copy translate //angle rotate neg exch neg translate
      } ifelse
      end % //myDict
    }
  >> setpagedevice
} { % Use ghostscript specific code for earlier systems
  % flip current frame buffer
  currentdevice getdeviceprops >> /HWSize get
  aload pop exch 2 array astore
  mark exch /HWSize exch currentdevice putdeviceprops pop
  % fix showpage to add a rotation and translation
  /adbFixup {
    currentdevice getdeviceprops >> dup
    /HWSize get aload pop 3 -1 roll
    /HWResolution get aload pop % stack: xsize ysize xres yres
    exch 4 -1 roll exch div 72 mul % stack: ysize yres xpts
    3 1 roll div 72 mul % stack: xpts ypts
    2 div exch 2 div exch 2 copy translate
    //angle rotate neg exch neg translate
  } bind odef
  /adbShowpage /showpage load def
  userdict begin /showpage { adbShowpage adbFixup } bind odef end
  adbFixup
  % define new ".setpagesize" that flips its arguments
  statusdict begin
  /adbSet /.setpagesize load def
  /.setpagesize { exch adbSet adbFixup } bind def
  end
} ifelse
end % //myDict
pstotext-1.9/rot90.ps0100664000076400007640000000534306235323356012765 0ustar rjlrjl% Copyright (C) 1995, Digital Equipment Corporation.
% All rights reserved.
% See the file pstotext.txt for a full description.
%
% Lectern:
% PostScript to be prepended to a job to rotate its images by //angle,
% adjusting the page shape appropriately. This works only for
% devices (such as ppmraw) that allow arbitrary imaging areas.

% Last modified on Thu Aug 1 11:36:25 PDT 1996 by mcjones
% modified on Tue Mar 14 14:55:13 PST 1995 by birrell

4 dict begin
/myDict currentdict def
/angle 90 def
% "false" is the PostScript boolean; the upper-case spelling is not a
% defined name and would raise an error when languagelevel is absent.
/languagelevel where { pop languagelevel 2 ge } { false } ifelse
{ % Use setpagedevice for level 2 implementations
  /lastW 0 def
  /lastH 0 def
  <<
    /BeginPage {
      //myDict begin
      pop % page number
      currentpagedevice /PageSize get aload pop % stack: W H
      % (H = ) print dup =
      % (W = ) print 1 index = flush
      % (lastH = ) print lastH =
      % (lastW = ) print lastW = flush
      lastH ne exch lastW ne or {
        % /PageSize has changed: rotate it
        <<
          /PageSize [
            currentpagedevice /PageSize get aload pop % stack: W H
            exch % stack: newW newH
            /lastW 2 index def
            /lastH 1 index def
          ]
        >> setpagedevice % calls BeginPage recursively
        % (H := ) print lastH =
        % (W := ) print lastW = flush
      } {
        % Rotate ctm. Don't do this if we're also adjusting PageSize,
        % because that causes another BeginPage call, which does this.
        currentpagedevice /PageSize get aload pop % stack: width height
        2 div exch 2 div exch % stack: center
        2 copy translate //angle rotate neg exch neg translate
      } ifelse
      end % //myDict
    }
  >> setpagedevice
} { % Use ghostscript specific code for earlier systems
  % flip current frame buffer
  currentdevice getdeviceprops >> /HWSize get
  aload pop exch 2 array astore
  mark exch /HWSize exch currentdevice putdeviceprops pop
  % fix showpage to add a rotation and translation
  /adbFixup {
    currentdevice getdeviceprops >> dup
    /HWSize get aload pop 3 -1 roll
    /HWResolution get aload pop % stack: xsize ysize xres yres
    exch 4 -1 roll exch div 72 mul % stack: ysize yres xpts
    3 1 roll div 72 mul % stack: xpts ypts
    2 div exch 2 div exch 2 copy translate
    //angle rotate neg exch neg translate
  } bind odef
  /adbShowpage /showpage load def
  userdict begin /showpage { adbShowpage adbFixup } bind odef end
  adbFixup
  % define new ".setpagesize" that flips its arguments
  statusdict begin
  /adbSet /.setpagesize load def
  /.setpagesize { exch adbSet adbFixup } bind def
  end
} ifelse
end % //myDict pstotext-1.9/vms.h0100664000076400007640000000170606536174070012422 0ustar rjlrjl/* Written on 27-MAY-1998 13:08 by Hunter Goatley. */ #ifdef VMS #define MAXPATHLEN 255 #include #define unlink(x) delete(x) #ifndef WIFEXITED /* VAX C doesn't know these */ #define WIFEXITED(s) (((s)&0x7F)==0) #define WIFSIGNALED(s) (((unsigned)(((s)&0x7F)-1))<0x7E) #define WEXITSTATUS(s) (((s)>>8)&0xFF) #endif #if !defined(__DECC) || (__VMS_VER < 70000000) #include #include #include char *tempnam(char *dir, char *prefix) { int pid, len; char *buf; pid = getpid(); len = strlen(dir) + strlen(prefix) + 8; buf = malloc(len); sprintf(buf, "%s%s%X", dir, prefix, pid); return(buf); } #include int strcasecmp(register char *s1, register char *s2) { register int c1, c2; while (*s1 && *s2){ c1 = tolower(*s1); c2 = tolower(*s2); if (c1 != c2) return (c1-c2); s1++; s2++; } return (int) (*s1 - *s2); } #endif #endif /* VMS */