## (tar archive header residue, kept verbatim as comments so the file parses)
## lookup-1.08b.orig/0040755000014400001440000000000006400712125013500 5ustar nakaharastaff
## lookup-1.08b.orig/server.info/0040755000014400001440000000000006076514105015747 5ustar nakaharastaff
## lookup-1.08b.orig/server.info/test.pl0100644000014400001440000000235606027344150017263 0ustar nakaharastaff
#!/usr/local/bin/perl
##
## Silly little tester for the SERVER_CONFIG version of lookup.
## First run the server, such as 'lookup_server -port 9000 file2Bsearched',
## then run this test program, such as 'perl test.pl -port 9000'.
##
##
## If 'localhost' doesn't work on your machine, try putting in the hostname
## directly. I've found 'localhost' and perl don't get along on some Linux
## distributions.
##
##
$SERVER = 'localhost';
$PORT = 9827; ## same default as in lookup.h -- can change with '-port'n

##
## network.pl is available at http://www.wg.omron.co.jp/~jfriedl/perl/
##
## Perl 5.26 and later no longer search the current directory for
## require'd files, so also look in the directory holding this script,
## which is where the bundled copy of network.pl lives.  It is appended
## (not prepended) so an installed copy still takes priority, as before.
##
use FindBin;
push(@INC, $FindBin::Bin);
require "network.pl";

##
## use '-port ####' to use other than the default port.
##
if (@ARGV && $ARGV[0] eq '-port') {
    shift;
    $PORT = shift;
    die "-port requires a numeric port\n"
        unless defined($PORT) && $PORT =~ /^\d+$/;
}

$error = &network::connect_to(*SERVER, $SERVER, $PORT);
die "$error\n" if defined $error;

##
## Make sure nothing is buffered.
##
select SERVER; $| = 1;
select STDOUT; $| = 1;

##
## Tell the server not to drop the connection after the first reply.
## (Commands are sent WITHOUT a trailing newline.)
##
print SERVER "--continuous--";
$i = <SERVER>; ## expect the "--ok--\n" response
die "server closed the connection before acknowledging --continuous--\n"
    unless defined $i;

PATTERN:
while (1) {
    print "Search Pattern: ";
    $i = <STDIN>;
    last PATTERN unless defined $i;   ## EOF on stdin: stop instead of spinning
    chomp($i);                        ## the server wants no newline on the pattern
    print SERVER $i;

    $done = 0;
    while (<SERVER>) {
        print;
        if ($_ eq "--done--\n") {     ## server tells us we're done.
            $done = 1;
            last;
        }
    }
    ## If the reply ended without "--done--" the server has gone away;
    ## there is nobody left to talk to.
    last PATTERN unless $done;
}

close(SERVER);
exit 0;

## (tar archive header residue, kept verbatim as a comment)
## lookup-1.08b.orig/server.info/network.pl0100644000014400001440000001242506022754553020002 0ustar nakaharastaff
##
## Jeffrey Friedl (jfriedl@omron.co.jp)
## Copyri.... ah hell, just take it.
##
## July 1994
##
package network;
$version = "950311.5";

## version 950311.5 -- turned off warnings when requiring 'socket.ph';
## version 941028.4 -- some changes to quiet perl5 warnings.
## version 940826.3 -- added check for "socket.ph", and alternate use of
##                     socket STREAM value for SunOS5.x
##
## BLURB:
## A few simple and easy-to-use routines to make internet connections.
## Similar to "chat2.pl" (but actually commented, and a bit more portable).
## Should work even on SunOS5.x.
##
##>
##
## connect_to() -- make an internet connection to a server.
##
## Two uses:
##   $error = &network'connect_to(*FILEHANDLE, $fromsockaddr, $tosockaddr)
##   $error = &network'connect_to(*FILEHANDLE, $hostname, $portnum)
##
## Makes the given connection and returns an error string, or undef if
## no error.
##
## In the first form, FROMSOCKADDR and TOSOCKADDR are 16-byte packed socket
## addresses of the form returned by &network'get_addr and &network'my_addr.
##
## In the second form the local end is bound to the address of the local
## host (port 0); if the local host name cannot be resolved, the wildcard
## address is used instead.
##
## If bind or connect fails, the half-set-up socket is closed before the
## error string is returned.
##
##<
sub connect_to
{
    local(*FD) = shift;             ## alias the caller's filehandle glob
    my ($arg1, $arg2) = @_;
    my ($from, $to);

    if (defined($arg1) && defined($arg2)
        && length($arg1) == 16 && length($arg2) == 16)
    {
        ## two packed socket addresses: use them just as they are
        ($from, $to) = ($arg1, $arg2);
    } elsif (defined($arg1)) {
        ## hostname and port number
        $to = &get_addr($arg1, $arg2);
        return qq/unknown address "$arg1"/ unless defined $to;
        $from = &my_addr();
        ## Could not resolve our own host name: bind to "any address,
        ## any port" rather than handing bind() an undefined value.
        $from = pack('S n a4 x8', 2, 0, "\0\0\0\0") unless defined $from;
    } else {
        return "unknown arguments to network'connect_to";
    }

    return "connect_to failed (socket: $!)" unless &my_inet_socket(*FD);

    ## Build the message before close(), which may overwrite $!.
    my $error;
    if (!bind(FD, $from)) {
        $error = "connect_to failed (bind: $!)";
    } elsif (!connect(FD, $to)) {
        $error = "connect_to failed (connect: $!)";
    }
    if (defined $error) {
        close(FD);                  ## don't leak the socket on failure
        return $error;
    }

    ## make the new connection unbuffered
    my $old = select(FD); $| = 1; select($old);
    return undef;
}

##>
##
## listen_at() - used by a server to indicate that it will accept requests
## at the port number given.
##
## Used as
##   $error = &network'listen_at(*LISTEN, $portnumber);
## (returns undef upon success)
##
## You can then do something like
##   $addr = accept(REMOTE, LISTEN);
##   print "contact from ", &network'addr_to_ascii($addr), ".\n";
##   while (<REMOTE>) {
##     .... process request....
##   }
##   close(REMOTE);
##
##<
sub listen_at
{
    local(*FD) = shift;             ## alias the caller's filehandle glob
    my ($port) = @_;

    ## family 2 (AF_INET), the given port, address 0.0.0.0 ("any")
    my $empty = pack('S n a4 x8', 2, $port, "\0\0\0\0");

    return "listen_for failed (socket: $!)" unless &my_inet_socket(*FD);

    ## Build the message before close(), which may overwrite $!.
    my $error;
    if (!bind(FD, $empty)) {
        $error = "listen_for failed (bind: $!)";
    } elsif (!listen(FD, 5)) {
        $error = "listen_for failed (listen: $!)";
    }
    if (defined $error) {
        close(FD);                  ## don't leak the socket on failure
        return $error;
    }

    my $old = select(FD); $| = 1; select($old);
    return undef;
}

##>
##
## Given an internal packed internet address (as returned by accept()
## or &get_addr), return a printable ``1.2.3.4'' version.
## Returns the string "bad arg" if the argument is not 16 bytes long.
##
##<
sub addr_to_ascii
{
    my ($addr) = @_;
    return "bad arg" if !defined($addr) || length($addr) != 16;

    ## third field of the packed sockaddr is the 4-byte IP address
    my $ip = (unpack('S n a4 x8', $addr))[2];
    return join('.', unpack("CCCC", $ip));
}

##
##
## Given a host and a port name, returns the packed socket address
## (or undef if the host cannot be resolved).  Results are cached.
## HOST may be a dotted quad, which is packed directly without any lookup.
## Mostly for internal use.
##
##
sub get_addr
{
    my ($host, $port) = @_;
    return undef unless defined $host;
    return $addr{$host,$port} if defined $addr{$host,$port};

    my $addr;
    if ($host =~ m/^\d+\.\d+\.\d+\.\d+$/) {
        $addr = pack("C4", split(/\./, $host));
    } else {
        $addr = (gethostbyname($host))[4];

        ## Fall back on nslookup.  The host name is interpolated into a
        ## shell command, so only do this for names made purely of
        ## hostname characters (and not starting with '-' or '.').
        if (!defined($addr) && $host =~ m/^\w[\w.\-]*$/) {
            my @lookup = `nslookup $host 2>&1`;
            if (@lookup > 2) {
                ## skip the two lines describing the name server itself
                my $lookup = join('', @lookup[2 .. $#lookup]);
                ## /m: the Address line is never the first line of the text
                if ($lookup =~ m/^Address:\s*(\d+\.\d+\.\d+\.\d+)/m) {
                    $addr = pack("C4", split(/\./, $1));
                }
            }
        }

        if (!defined $addr) {
            ## warn "$host: SOL, dude\n";
            return undef;
        }
    }
    return $addr{$host,$port} = pack('S n a4 x8', 2, $port, $addr);
}

##
## my_addr()
## Returns the packed socket address of the local host (port 0),
## or undef if the local host name cannot be resolved.
## Mostly for internal use.
##
##
sub my_addr
{
    return $addr{'me'} if defined $addr{'me'};
    if (!defined $_myhostname_) {
        $_myhostname_ = `hostname`;
        ## chomp, not chop: only strip a newline, never a real character
        chomp($_myhostname_);
    }
    return $addr{'me'} = &get_addr($_myhostname_, 0);
}

##
## my_inet_socket(*FD);
##
## Local routine to do socket(PF_INET, SOCK_STREAM, AF_NS).
## Takes care of figuring out the proper values for the args. Hopefully.
##
## Returns the same value as 'socket'.
##
sub my_inet_socket
{
    local(*FD) = @_;                ## alias the caller's filehandle glob
    my $socket;

    ## The constants are worked out once and cached in package globals.
    if (!defined $socket_values_queried) {
        ## try to load some "socket.ph"
        if (!defined &main::_SYS_SOCKET_H_) {
            ## require() dies when the file is missing (it does not return
            ## false), so each candidate needs its own eval for the second
            ## one to ever be tried.
            eval 'package main; local($^W) = 0; require("sys/socket.ph");';
            eval 'package main; local($^W) = 0; require("socket.ph");' if $@;
        }

        ## we'll use "the regular defaults" for PF_INET and AF_NS if unknown
        ## NOTE(review): AF_NS is passed as socket()'s protocol argument;
        ## its default of 6 presumably works because 6 is also the TCP
        ## protocol number -- confirm before changing.
        $PF_INET = defined(&main::PF_INET) ? &main::PF_INET() : 2;
        $AF_NS   = defined(&main::AF_NS)   ? &main::AF_NS()   : 6;
        $SOCK_STREAM = &main::SOCK_STREAM() if defined(&main::SOCK_STREAM);
        $socket_values_queried = 1;
    }

    if (defined $SOCK_STREAM) {
        $socket = socket(FD, $PF_INET, $SOCK_STREAM, $AF_NS);
    } else {
        ##
        ## We'll try the "regular default" of 1. If that returns a
        ## "not supported" error, we'll try 2, which SunOS5.x uses.
        ##
        $socket = socket(FD, $PF_INET, 1, $AF_NS);
        if ($socket) {
            $SOCK_STREAM = 1; ## got it.
        } elsif ($! =~ m/not supported/i) {
            ## we'll just assume from now on that it's 2.
            $socket = socket(FD, $PF_INET, $SOCK_STREAM = 2, $AF_NS);
        }
    }
    return $socket;
}

## This here just to quiet -w warnings.
sub dummy {
    1 || $network::version || &dummy;
}

1;
__END__
lookup-1.08b.orig/server.info/README0100644000014400001440000000433306076513243016631 0ustar nakaharastaffLookup has been hacked to act in server mode, where it listens for
connections on a socket. Compile with -DSERVER_CONFIG.
This allows programs to access it for searching.

It is not very well done. There can only be one client at a time
(i.e. while working with one client, it doesn't listen for others).
Also, the server shouldn't be a separately-compiled program, but just
an option. Some day. As it is, you probably want to rename the 'lookup'
binary to something like 'lookup.srv' for the server.

When starting the server, -verbose means that the server should be a bit
verbose about what commands it is getting, while '-port ####' allows you
to override the port number defined in config.h.

No RC file is read when running as a server.

The client sends commands to the server.
As with regular 'lookup', most command lines are search patterns (although you may append ",1" etc. for accessing multiple files). As with regular 'lookup', non-search commands are required to be a prefixed by a space (or whatever the command-prefix character is). *************** ** Note that the lines sent by the client should NOT have a newline (unless ** you want the newline to be part of the pattern). *************** There are some special commands that the client can send: "-- -exit- --" Causes the server to exit. I.e. print SERVER "-- -exit- --"; (note lack of newline) "--continuous--" If the first command by a given client, the server will not drop the connection after the first command has been answered. This allows multiple question/answer sessions during one connection. In this case, the server signals it is done with one answer by sending "--done--\n" and then waiting for another command. The server will respond with "--ok--\n" when the --continuous-- command has been recognized. "--bye--" For a --continuous-- connection, closes the connection for the current client; server continues to wait for another client. ------------------------------------------------ There is a small perl test program "test.pl" just so you can get it working. It requires my network.pl library, a (perhaps old) copy of which is included here. The latest version is always at http://www.wg.omron.co.jp/~jfriedl/perl/ jfriedl@omron.co.jp lookup-1.08b.orig/lib/0040755000014400001440000000000006400712074014251 5ustar nakaharastafflookup-1.08b.orig/lib/fuzzkana.h0100644000014400001440000000271106076503511016254 0ustar nakaharastaff#ifndef __FUZZKANA_H__ /* file wrapper */ #define __FUZZKANA_H__ /* * Jeffrey Friedl * Omron Corporation オムロン(株) * Nagaokakyoshi, Japan 〒617長岡京下海印寺 * * jfriedl@nff.ncl.omron.co.jp * * This work is placed under the terms of the GNU General Purpose License * (the "GNU Copyleft"). 
* * Oct 1993 * * "Fuzzify" a kana string such that using it as a regular expression * will result in a search which ignores vowel length and/or small TSUs. * * For example, the string「ときょ」would be turned into * 「と[ぅうおぉー]*きょ[ぅうおぉー]*」 * if ignoring vowel length, such that「とぅきょ」and「ときょう」 * among others) would match in addition to the original「ときょ」. * */ extern unsigned fuzzkana( const unsigned char *in, /* string to consider */ unsigned char *out, /* where to write new string */ unsigned out_size, /* size of output buffer */ unsigned flags); /* flags are from the list below */ #define FUZZ_LONG_VOWELS 0x01U #define FUZZ_SMALL_TSU 0x02U #define FUZZ_VOICED 0x04U #define FUZZ_REPEATER 0x08U /* not really kana, but easy to add here */ #define FUZZ_ALL (FUZZ_LONG_VOWELS | FUZZ_SMALL_TSU | FUZZ_VOICED) /* * Returns the number of bytes written to *out if all went well, 0 otherwise * (buffer overflow, null input parameters, empty input string). * If OUT is null, simply returns the number of bytes that would be required. */ #endif /* file wrapper */ lookup-1.08b.orig/lib/kanaid.c0100644000014400001440000000727106076511551015656 0ustar nakaharastaff/* * Jeffrey Friedl * Omron Corporation オムロン(株) * Nagaokakyoshi, Japan 〒617長岡京下海印寺 * * jfriedl@nff.ncl.omron.co.jp * * This work is placed under the terms of the GNU General Purpose License * (the "GNU Copyleft"). * * * Oct 1993 * * See "kanaid.h" for info. 
*/ #include "config.h" #include "kanaid.h" unsigned long _KID[] = { /* ァ ぁ */ KID_A|KID_SMALL|KID_VOWEL, /* ア あ */ KID_A |KID_VOWEL, /* ィ ぃ */ KID_I|KID_SMALL|KID_VOWEL, /* イ い */ KID_I |KID_VOWEL, /* ゥ ぅ */ KID_U|KID_SMALL|KID_VOWEL, /* ウ う */ KID_U |KID_VOWEL, /* ェ ぇ */ KID_E|KID_SMALL|KID_VOWEL, /* エ え */ KID_E |KID_VOWEL| KID_DUAL, /* ォ ぉ */ KID_O|KID_SMALL|KID_VOWEL, /* オ お */ KID_O |KID_VOWEL| KID_DUAL, /* カ か */ KID_A |KID_K, /* ガ が */ KID_A |KID_G, /* キ き */ KID_I |KID_K, /* ギ ぎ */ KID_I |KID_G, /* ク く */ KID_U |KID_K, /* グ ぐ */ KID_U |KID_G, /* ケ け */ KID_E |KID_K, /* ゲ げ */ KID_E |KID_G, /* コ こ */ KID_O |KID_K, /* ゴ ご */ KID_O |KID_G, /* サ さ */ KID_A |KID_S, /* ザ ざ */ KID_A |KID_Z, /* シ し */ KID_I |KID_S, /* ジ じ */ KID_I |KID_Z| KID_DUAL, /* ス す */ KID_U |KID_S, /* ズ ず */ KID_U |KID_Z| KID_DUAL, /* セ せ */ KID_E |KID_S, /* ゼ ぜ */ KID_E |KID_Z, /* ソ そ */ KID_O |KID_S, /* ゾ ぞ */ KID_O |KID_Z, /* タ た */ KID_A |KID_T, /* ダ だ */ KID_A |KID_D, /* チ ち */ KID_I |KID_T, /* ヂ ぢ */ KID_I |KID_D| KID_DUAL, /* ッ っ */ KID_U|KID_SMALL|KID_T, /* ツ つ */ KID_U |KID_T, /* ヅ づ */ KID_U |KID_D| KID_DUAL, /* テ て */ KID_E |KID_T, /* デ で */ KID_E |KID_D, /* ト と */ KID_O |KID_T, /* ド ど */ KID_O |KID_D, /* ナ な */ KID_A |KID_N, /* ニ に */ KID_I |KID_N, /* ヌ ぬ */ KID_U |KID_N, /* ネ ね */ KID_E |KID_N, /* ノ の */ KID_O |KID_N, /* ハ は */ KID_A |KID_H, /* バ ば */ KID_A |KID_B, /* パ ぱ */ KID_A |KID_P, /* ヒ ひ */ KID_I |KID_H, /* ビ び */ KID_I |KID_B, /* ピ ぴ */ KID_I |KID_P, /* フ ふ */ KID_U |KID_H, /* ブ ぶ */ KID_U |KID_B, /* プ ぷ */ KID_U |KID_P, /* ヘ へ */ KID_E |KID_H, /* ベ べ */ KID_E |KID_B, /* ペ ぺ */ KID_E |KID_P, /* ホ ほ */ KID_O |KID_H, /* ボ ぼ */ KID_O |KID_B, /* ポ ぽ */ KID_O |KID_P, /* マ ま */ KID_A |KID_M, /* ミ み */ KID_I |KID_M, /* ム む */ KID_U |KID_M, /* メ め */ KID_E |KID_M, /* モ も */ KID_O |KID_M, /* ャ ゃ */ KID_A|KID_SMALL|KID_Y, /* ヤ や */ KID_A |KID_Y, /* ュ ゅ */ KID_U|KID_SMALL|KID_Y, /* ユ ゆ */ KID_U |KID_Y, /* ョ ょ */ KID_O|KID_SMALL|KID_Y, /* ヨ よ */ KID_O |KID_Y, /* ラ ら */ KID_A 
|KID_R, /* リ り */ KID_I |KID_R, /* ル る */ KID_U |KID_R, /* レ れ */ KID_E |KID_R, /* ロ ろ */ KID_O |KID_R, /* ヮ ゎ */ KID_A|KID_SMALL|KID_W, /* ワ わ */ KID_A |KID_W, /* ヰ ゐ */ KID_ARCHAIC, /* ヱ ゑ */ KID_ARCHAIC| KID_DUAL, /* ヲ を */ KID_o| KID_DUAL, /* ン ん */ KID_n, /* ヴ */ KID_U |KID_KATAONLY, /* ヵ */ KID_A|KID_SMALL|KID_KATAONLY|KID_K, /* ヶ */ KID_E|KID_SMALL|KID_KATAONLY|KID_K, }; lookup-1.08b.orig/lib/kanaid.h0100644000014400001440000001067306076511565015670 0ustar nakaharastaff#ifndef __KANAID_H__ /* file wrapper */ #define __KANAID_H__ /* * Jeffrey Friedl * Omron Corporation オムロン(株) * Nagaokakyoshi, Japan 〒617長岡京下海印寺 * * jfriedl@nff.ncl.omron.co.jp * * This work is placed under the terms of the GNU General Purpose License * (the "GNU Copyleft"). * * Oct 1993 * * Routines (a'la ctype) to give info about EUC kana characters. * Given the two-byte character as the two bytes HI and LO: * IS_KATAKANA(HI, LO) -- true if character is katakana * IS_HIRAGANA(HI, LO) -- true if character is hiragana * IS_KANA(HI, LO) -- true if either of the above. * IS_DASH(HI, LO) -- true if the character「ー」(which isn't kana) * * KANA_ID(HI, LO) -- VALID ONLY IF IS_KANA(HI,LO) IS TRUE, * returns the KID_* codes defined below which * describe the character. * * Warning: these are macros so arguments shouldn't have side effects. 
*/ #define kanaid_version 100 /* 1.00 */ /* * Flags returned by KANA_ID(Highbyte, Lowbyte) have the following bits set */ #define KID_A 0x00000001 /* あ、か、が、さ、。。。*/ #define KID_I 0x00000002 /* い、き、ぎ、し、。。。*/ #define KID_U 0x00000004 /* う、く、ぐ、す、。。。*/ #define KID_E 0x00000008 /* え、け、げ、せ、。。。*/ #define KID_O 0x00000010 /* お、こ、ご、そ、。。。*/ /* mask to nab the volwel sound */ #define KID_VSOUND (KID_A|KID_I|KID_U|KID_E|KID_O) #define KID_K 0x00000020 /* か、き、く、け、こ */ #define KID_G 0x00000040 /* が、ぎ、ぐ、げ、ご */ #define KID_S 0x00000080 /* さ、し、す、せ、そ */ #define KID_Z 0x00000100 /* ざ、じ、ず、ぜ、ぞ */ #define KID_T 0x00000200 /* た、ち、つ、て、と */ #define KID_D 0x00000400 /* だ、ぢ、づ、で、ど */ #define KID_N 0x00000800 /* な、に、ぬ、ね、の */ #define KID_H 0x00001000 /* は、ひ、ふ、へ、ほ */ #define KID_B 0x00002000 /* ば、び、ぶ、べ、ぼ */ #define KID_P 0x00004000 /* ぱ、ぴ、ぷ、ぺ、ぽ */ #define KID_M 0x00008000 /* ま、み、む、め、も */ #define KID_Y 0x00010000 /* や、ゆ、よ 。。 。*/ #define KID_R 0x00020000 /* ら、り、る、れ、ろ */ #define KID_W 0x00040000 /* わ。。。 */ #define KID_n 0x00100000 /* ん */ /* mask to nab the consonant sound */ #define KID_CSOUND (KID_K|KID_G|KID_S|KID_Z|KID_T|KID_D|KID_N| \ KID_H|KID_B|KID_P|KID_M|KID_Y|KID_R|KID_W|KID_n) #define KID_o 0x00080000 /* を */ #define KID_SMALL 0x00200000 /* If small, as in ぁ,っ,ょ, etc. */ #define KID_VOWEL 0x00400000 /* If a raw vowel [あいうえお]. 
*/ #define KID_KATAONLY 0x00800000 /* If char found only in katakana */ #define KID_ARCHAIC 0x01000000 /* If archaic */ #define KID_DUAL 0x02000000 /* For [ ずづ, じぢ, えゑ, おを ] */ /***********************************************************************/ /***********************************************************************/ #define KID_HIRA_HI 0244 /* high byte for hiragana EUC */ #define KID_KATA_HI 0245 /* high byte for katakana EUC */ /* true if the High/Low pair is the EUC 「ー」 */ #define IS_DASH(HighByte, LowByte) ((HighByte) == 0241 && (LowByte)== 0274) #define _KID_START 161 /* the Low byte associated with _KID[0] */ #define _KID_END 246 /* the Low byte associated with end of _KID[] */ /* True if the bytes represent a katakana character (except dash) */ #define IS_KATAKANA(HighByte, LowByte) ((HighByte) == KID_KATA_HI && \ (LowByte) >= _KID_START && \ (LowByte) <= _KID_END) /* True if the bytes represent a hiragana character */ #define IS_HIRAGANA(HighByte,LowByte) ((HighByte) == KID_HIRA_HI && \ (LowByte) >= _KID_START && \ (LowByte) <= _KID_END) /* True if either katakana or hiragana */ #define IS_KANA(H,L) (IS_KATAKANA(H,L) || IS_HIRAGANA(H,L)) /* * Returns the Kana ID for the given character. * * --> only valid if IS_KANA() or IS_HIRAGANA or IS_KATAKANA is true! <-- * */ #define KANA_ID(HighByte, LowByte) (_KID[(LowByte)-_KID_START]) extern unsigned long _KID[]; /* in kanaid.c */ #endif /* file wrapper */ lookup-1.08b.orig/lib/jregex.c0100644000014400001440000037271306176240202015712 0ustar nakaharastaff/* * Jeffrey Friedl * Omron Corporation オムロン(株) * Nagaokakyoshi, Japan 〒617長岡京下海印寺 * * jfriedl@nff.ncl.omron.co.jp * * This work is placed under the terms of the GNU General Purpose License * (the "GNU Copyleft"). * * * Jeffrey's REGEX routines with Japanese EUC support. * October 1993. * * This file is huge, but don't let it intimidate you. Most of it is comments. * * See "jregex.h" for overall usage info. 
* * Note: the terms "ASCII" is synonymous with "JIS/ROMAN" as far as * this file is concerned. * * * * * * * * * * * * * * * * * * * * * * * * * * * * * The general idea is that the function * regcomp() -- ``regular expression compile'' * accepts a regular expression (and some flags to control how the expression * it to be interpreted), and it fills in a STRUCT REGEXBUF (see jregex.h) * with an internal representation (a "compiled" form) of said regex. * * The user then gives this STRUCT REGEXBUF along with a string to check * the pattern against to * regexec() -- ``regular expression execute'' * which returns true if the string matches the pattern, false otherwise. * As byproducts, regexec may fill in some global information concerning * which text was matched by which sets of parenthesis, etc. * * * * * * * * * * * * * * * * * * * * * * * * * * * * * A STRUCT REGEXBUF is the compiled (internal form of the) regex as * returned to the user. * * BUF is the array of bytes representing the original regex. This is called * the "compiled form" or the "regex program" (since it is "executed"). * See the enum TYPE below for a description of how these bytes are * interpreted. BUFEND points to the first byte after the array. * * MIN_LENGTH_MATCH indicates the length of the shortest string which * could possibly match (and if 0, all strings match). * * ANCHOR_TO_START_OF_LINE is true if the pattern only matches at the * beginning of a line, i.e. the pattern begins with a '^'. In this case, * the command for that match in the compiled regex is omitted, as this * flag specifies it. * * FOLD_ACASE is true if regcomp() was told to fold ASCII case (i.e. ignore * differences in case when comparing letters, such that 'a' would match 'A'). * * FOLD_KCASE is true if regcomp() was told to fold kana case (i.e. ignore * "kananess", such that「あ」would match 「ア」). 
* * MUSTHAVE is created if REGCOMP_CALC_MUSTHAVE is given in the flags to * regcomp, and is a list of characters that are required for all successful * matches. The list isn't 100% guaranteed to be complete (i.e. there are * cases where a required character might not appear, but all characters * int the list are certainly required). This can be used by the user to * pre-screen lines, omitting lines not having all characters. Note that * when case folding is done, only the lower-case version of a character * appears in the list. * * MAX_PAREN_LEVEL is the number of sets of parentheses found in the * pattern.... it would be 3 for the patter「(a(b?))+c(de)?」. * * PAREN_INFO_REQUIRED indicates the number of the highest set which is * referenced from within the pattern... it would be zero for the * MAX_PAREN_LEVEL example, and 1 for something like「(['"])(\w+)\1」. * * MAX_LPC_L and MAX_LPC_C are used to in regexec to decide how much stack * space (for the execution of the regex) might be needed. See the discussion * elsewhere about these values. */ #include "config.h" #include "assert.h" #include #ifdef DEBUG_MALLOC # include #endif #include "jregex.h" #include "output.h" #include "euc.h" /* TO BE DONE: if not looking for longest, can invert push/check for anychar. (allow things that PUSH the entire line to do things backwards) optimize by having a firstchar support shift-JIS as well as EUC can sort EUC characters in classes for quicker lookup... even binary for long enough ones. combine adjacent EXACT/ONCE nodes into EXACT(N)/ONCE node. find out what standards say about these things and try to conform. Add {n,m} construct. Partition so that can be compiled in more minimal configurations if one doesn't need much of the support. Test EUC Code Set #3 support. */ /* NO_REGEXEC_MATCH_POINTS remove support for regexec_match_start and regexec_match_end. A (very very) small optimization if not needed. 
DONT_WORRY_ABOUT_KATAKANA_DASH_BEING_PART_OF_A_WORD The word_boundary stuff has to pay special attention to the kana dash character「ー」since it's not on the same EUC page as the kana. To make sure something like「アート」is considered one word, we have to do some special processing. Defining this omits that. NO_PAREN_INFO Define to remove support for regexec_paren_info, et. al. */ /*********************************************************************** ***********************************************************************/ /* * Defining FAST_REGEXEC will remove the ability to do regexec debugging. * This will save on doing the "are we debugging" tests. * * Defining NDEBUG will remove other debugging support as well. It will * still leave showregex() there for the user. * * One can define NO_SHOWREGEX independently to omit this routine if you * know you'll not use it. I should split this out into a separate file. */ #ifndef NDEBUG # define DEBUGSTUFF(stuff) stuff #else # define DEBUGSTUFF(stuff) /* nothing */ # ifndef FAST_REGEXEC # define FAST_REGEXEC # endif #endif #ifdef NO_REGEX_STATS # define STATS(stuff) /* nothing */ void regex_reset_stats(void) { /* nothing */ } #else # define STATS(stuff) stuff struct regex_stats regex_stats; void regex_reset_stats(void) { bzero((void *)®ex_stats, sizeof(regex_stats)); } #endif #ifdef FAST_REGEXEC # define FASTDEBUGSTUFF(stuff) /* nothing */ #else # define FASTDEBUGSTUFF(stuff) stuff #endif #if !defined(__GNUC__) # if !defined(__volatile__) # define __volatile__ /*nothing; for use with volatile functions */ # endif # if !defined(__inline__) # define __inline__ /*nothing; for use with volatile functions */ # endif #endif #define sizeof_array(array) (sizeof(array)/sizeof(array[0])) /******************* ******************* Some general routines. 
*******************/ /* * In case we give an invalid character to EUC_CHAR_LENGTH and it * returns zero, we sometimes want to just treat it as a one-byte * (unknown) character, so we'll use this macro. */ #define CHAR_LENGTH(C) (EUC_CHAR_LENGTH(C) ? EUC_CHAR_LENGTH(C) : 1) /* * Return where if the given one-byte character is found in the string * (possibly of mixed-size characters) from range_start to string_end, * return zero otherwise. */ static __inline__ const unsigned char * onebyte_char_in_string(unsigned char byte0, const unsigned char *string_start, const unsigned char *string_end) { while (string_start < string_end) { unsigned len = EUC_CHAR_LENGTH(string_start[0]); if (len == 0) { assert(0); len = 1; } if (1 == len && byte0 == string_start[0]) return string_start; string_start += len; } return 0; } /* * Return where if the given two-byte character is found in the string * (possibly of mixed-size characters) from range_start to string_end, * return zero otherwise. */ static __inline__ const unsigned char * twobyte_char_in_string(unsigned char byte0, unsigned char byte1, const unsigned char *string_start, const unsigned char *string_end) { if (string_start == string_end) return 0; string_end -= 1; /* no need to start checking at the last byte */ while (string_start < string_end) { unsigned len = EUC_CHAR_LENGTH(string_start[0]); if (len == 0) { assert(0); len = 1; } /* the order we check the bytes is designed to fail ASAP for the random non-matching pair */ if (2 == len && byte1 == string_start[1] && byte0 == string_start[0]) return string_start; string_start += len; } return 0; } /* * Return where if the given three-byte character is found in the string * (possibly of mixed-size characters) from range_start to string_end, * return zero otherwise. 
*/ static __inline__ const unsigned char * threebyte_char_in_string(unsigned char byte0, unsigned char byte1, unsigned char byte2, const unsigned char *string_start, const unsigned char *string_end) { if (string_start == string_end) return 0; string_end -= 2; /* no need to start checking at the last two bytes */ while (string_start < string_end) { unsigned len = EUC_CHAR_LENGTH(string_start[0]); if (len == 0) { assert(0); len = 1; } /* the order we check the bytes is designed to fail ASAP for the random non-matching pair */ if (3 == len && byte2 == string_start[2] && byte1 == string_start[1] && byte0 == string_start[0]) return string_start; string_start += len; } return 0; } /* * A regex buffer is an array of bytes (unsigned chars). These bytes * can be partitioned into sets of bytes representing "commands", such * as "match `A' exactly", etc. The first byte in any set indicates * what type of command it is, which thereby indicates the number and * interpretation of any following bytes in the set. * * The type byte (first byte in a set) has two "fields". The upper six * bits indicate the basic type... match an exact ascii character, * match the end of a word, etc. The lower two bits indicate how many * times the thing is to be done (once, at least once, zero or one * times, or zero or more times). These 'count' bits only make sense * for some of the major types... doesn't make sense for "end of * word", so are ignored in such cases. */ #define COUNT_BITS 2 /* Bits used to represent count. */ #define ZERO_OK 0x1 /* This bit means this, while... */ #define MORE_THAN_1_OK 0x2 /* ... that bit means that. */ /* * The following are how the "count" two-bit value is interpreted. 
*/ #define ONCE ( 0 ) #define ZERO_OR_ONE (ZERO_OK ) #define ONE_OR_MORE ( MORE_THAN_1_OK) #define ANY_NUMBER (ZERO_OK | MORE_THAN_1_OK) #define NO_COUNT 0 /* just for readability when it doesn't matter */ /* * Macros for creating and accessing a type-byte value from a major * type number and a count. Should be pretty self-explanatory. */ #define make_type_byte_with_count(type, count) (((type)<> COUNT_BITS) /* * Possible "command" types (upper 6 bits of first byte of each set). */ enum TYPE { UNUSED = 0, /* just to mark a do-nothing node.... not used in the final pattern */ EXACT1, /* Match the exact byte which follows in the compiled pattern. * Appearence in the final compiled pattern: * byte n: EXACT1 * byte n+1: byte to match */ EXACT2, /* Match the exact two bytes which follow in the compiled pattern. * * Appearence in the final compiled pattern: * byte n: EXACT2 * byte n+1: first byte to match * byte n+2: second byte to match */ EITHER_EXACT_2, /* Match the exact two bytes which follow in the compiled pattern, * or the two bytes that follow that. * Appearence in the final compiled pattern: * byte n: EITHER_EXACT_2 * byte n+1: first byte of first posibility * byte n+2: second byte of first posibility * byte n+3: first byte of second posibility * byte n+4: second byte of second posibility */ EXACT3, /* Match the exact three bytes which follow in the compiled pattern. * * Appearence in the final compiled pattern: * byte n: EXACT2 * byte n+1: first byte to match * byte n+2: second byte to match * byte n+3: third byte to match */ EXACT_ICASE_ASCII, /* Like EXACT1 except letter case is to be ignored. The byte to * match is guaranteed to be a lower case letter. * * Appearence in the final compiled pattern: * byte n: EXACT1 * byte n+1: lower case one-byte character to match. */ EXACT_ICASE_KANA, /* Like EXACT2 ecxept hiragana vs. katakana is ignored. The two-byte * EUC character to be matched is guaranteed to be a hiragana. 
* * Appearence in the final compiled pattern: * byte n: EXACT2 * byte n+1: first byte of EUC hiragana * byte n+2: second byte of EUC hiragana */ ANY_CHAR, /* Any character (ascii or EUC) except newline will match. * No following bytes. Appearence in the final compiled pattern: * byte n: ANY_CHAR */ ANY_ASCII, /* Any ASCII character except newline will match. * No following bytes. Appearence in the final compiled pattern: * byte n: ANY_ASCII */ ANY_MULTIBYTE, /* Any multibyte EUC character will match. * No following bytes. Appearence in the final compiled pattern: * byte n: ANY_MULTIBYTE_EUC */ ANY_KATAKANA, /* Any multibyte EUC katakana. * No following bytes. Appearence in the final compiled pattern: * byte n: ANY_KATAKANA */ ANY_NONKATAKANA, /* Anything not multibyte EUC katakana or ASCII newline will match. * No following bytes. Appearence in the final compiled pattern: * byte n: ANY_NONKATAKANA */ ANY_HIRAGANA, /* Any multibyte EUC hiragana * No following bytes. Appearence in the final compiled pattern: * byte n: ANY_HIRAGANA */ ANY_NONHIRAGANA, /* Anything not multibyte EUC hiragana or ASCII newline will match * No following bytes. Appearence in the final compiled pattern: * byte n: ANY_NONHIRAGANA */ ANY_KANJI, /* Any multibyte character in kuten rows 16-84. * No following bytes. Appearence in the final compiled pattern: * byte n: ANY_KANJI */ ANY_NONKANJI, /* Any character not in kuten rows 16-84 (except ASCII newline) * No following bytes. Appearence in the final compiled pattern: * byte n: ANY_NONKANJI */ CLASS, /* * Appearence in the final compiled pattern: * byte n: CLASS * byte n+1: One-byte boolean value ``inverted'' * byte n+2 * byte n+3: Unsigned two-byte value ``size2'' * byte n+4 * byte n+5: Unsigned two-byte value ``size3'' * next 128 bytes: Array[0..127] of boolean values * next size2 bytes: List of two-byte EUC characters * next size3 bytes: List of three-byte EUC characters * * Represnets a class (such as ``...[0-9a-f]...''). 
* * For ASCII, C is in the class if array[C] is true. * EUC characters are in the class if they're found in the lists that * follow 'array'. * * When 'inverted' is false, characters in the class match. * When 'inverted' is true, characters NOT in the class match. */ count_ok_limit, /* This is just an enum marker... the 'count' bits *are* */ /* interpreted for type bytes with major types in the */ /* list above this member; ignored for those below. */ REGEX_MATCH, /* If this is reached, the regex matches. No following bytes. * Appearence in the final compiled pattern: * byte n: REGEX_MATCH */ #ifndef NO_REGEXEC_MATCH_POINTS REGEX_MATCH_NOTE_WORD, /* Exactly like REGEX_MATCH except that * regexec_match_at_start_of_word * and * regexec_match_at_end_of_word * will be set appropriatly. * * byte n: REGEX_MATCH_NOTE_WORD */ #endif WORD_BOUNDARY, /* Matches the boundary between words. No following bytes. * Appearence in the final compiled pattern: * byte n: WORD_BOUNDARY */ START_OF_LINE, /* Matches the start of a line. No following bytes. * Appearence in the final compiled pattern: * byte n: START_OF_LINE */ END_OF_LINE, /* Matches the end of a line. No following bytes. * Appearence in the final compiled pattern: * byte n: END_OF_LINE */ JUMP, /* Appearence in the final compiled pattern: * byte n: JUMP * byte n+1, * byte n+2: signed two-byte value ``jump offset'' * * When this command executes, the "current location" in the compiled * pattern buffer moves from 'n' to 'n + offset' and execution * continues from there. */ PUSH, /* Appearence in the final compiled pattern: * byte n: PUSH * byte n+1, * byte n+2: signed two-byte value ``push offset'' * * A PUSH indicates an alternative way to lead to a successful match. 
* * When reached, a PUSH indicates that although the subsequent commands * in the compiled pattern may be used to nibble away at the line being * matched, a proper match may *also* be acheived by continuing with * the matching procedure at the pattern location n+offset. * * For example, the pattern 'ab?c' might result in a compiled * pattern looking something like: * #1: EXACT1 `a' * #2: PUSH (refer to command #4) * #3: EXACT1 `b' * #4: EXACT1 `c' * * Consider trying to match the string "ac" (which will match). * The 'a' will match with command #1; execution will then go * to command #2 with the string-to-be-matched being the "c" left * from the "ac" after the 'a' was nibbled off. * * The PUSH will, in effect, say: ``we'll try to continue with what * follows, but if we end up failing sometime down the line, we can * always try to continue to match the string as we have it at the * moment ("c"), starting at command #4.''. This is done by pushing a * #4/"a" combo onto a stack. * * The execution will then continue to #3, failing to match 'b' * against "c". But rather than failing the whole string, it is * noticed that there is a "try me" on the stack, and that state * is popped off. Execution continues at #4 with "c". * * The EXACT1 command suceeds. The end of the pattern is reached * so the whole regex succeeds. */ PUSH_JUMP, /* Just an optimization. * Like a JUMP, except does an implicit PUSH of the following command. * * The following two are the same: * * PUSH (refer to `mark') * JUMP somewhere * mark: * * -and- * * PUSH_JUMP somewhere * mark: * * Appearence in the final compiled pattern: * byte n: PUSH * byte n+1, * byte n+2: signed two-byte value ``jump offset'' */ #ifndef NO_PAREN_INFO SAVE_OPEN_PAREN, SAVE_CLOSE_PAREN, /* * If we've been asked to save paren info for this pattern, this will * indicate that we need to note that we're entering or exiting a set * of parens. Followed by one byte indicating the level of the parens. 
* * Appearance in the final compiled pattern: * byte n: SAVE_{OPEN,CLOSE}_PAREN * byte n+1: unsigned one-byte value ``paren level'' */ SAVE_CLOSE_PAREN_PUSH_JUMP, /* Combo of SAVE_CLOSE_PAREN and PUSH_JUMP. * * Appearence in the final compiled pattern: * byte n: SAVE_CLOSE_PAREN_PUSH_JUMP * byte n+1: unsigned one-byte value ``save count'' * byte n+2: unsigned one-byte value ``paren level'' * byte n+3 * byte n+4: signed two-byte value ``jump offset'' */ PUSH_SAVE_OPEN_PAREN, /* * Simply a PUSH followed by a SAVE_OPEN_PAREN. First two * bytes after is the PUSH value, third byte after is the * paren level. * * Appearance in the final compiled pattern: * byte n: * byte n+1: */ MATCH_PREV_PAREN, /* * Next byte is paren number to match exactly. * * Appearance in the final compiled pattern: * byte n: * byte n+1: */ #endif OPEN_PAREN, /* NOT USED in the compiled pattern, but is used in the * intermediate stage. See that description * Appearence in the final compiled pattern: * byte n: * byte n+1: */ ALT /* NOT USED in the compiled pattern, but is used in the * intermediate stage. See that description * Appearence in the final compiled pattern: * byte n: * byte n+1: */ }; /* * At various times we want to read and write short values that will * be unaligned. We have to emulate it ourselves if our processor * can't handle it. Define UNALIGNED_SHORT_ACCESS_OK if you know it's OK. 
*/ #ifdef UNALIGNED_SHORT_ACCESS_OK # define write_short_val(ptr, val) (*(short *)(ptr) = (val)) # define read_short_val(ptr) (*(short *)(ptr)) #else # define write_short_val(ptr, val) \ macro_start { \ ((unsigned char *)(ptr))[0] = ((short)(val)) >> 8; \ ((unsigned char *)(ptr))[1] = ((short)(val)) & 0xff; \ } macro_end # define read_short_val(ptr) \ (short)((((const unsigned char *)(ptr))[0] << 8 | \ ((const unsigned char *)(ptr))[1])) #endif /* UNALIGNED_SHORT_ACCESS_OK */ /* * The following #defines are to make reading some of the code a bit * easer (there are lots of little magic numbers floating around), * and are not meant to be changeable. All are byte counts. */ #define SIZEOF_ASCII 1 /* one byte */ #define SIZEOF_EUC 2 /* EUC is two bytes */ #define TYPE_BYTE_SIZE 1 /* The type-byte takes up one byte. */ #define SHORT_VAL_SIZE 2 /* A short val takes up two bytes. */ #define ASCII_SPEC_SIZE SIZEOF_ASCII /* An ASCII spec in a regex buffer. */ #define EUC_SPEC_SIZE SIZEOF_EUC /* An EUC spec in a regex buffer. 
*/ #define DETERMINISTIC_CLASS_SIZE (1+1+2+2+128) #define SIZEOF_PUSH_COMMAND (TYPE_BYTE_SIZE + /*offset*/SHORT_VAL_SIZE) #define SIZEOF_JUMP_COMMAND (TYPE_BYTE_SIZE + /*offset*/SHORT_VAL_SIZE) #ifndef NO_PAREN_INFO # define PAREN_SPEC_SIZE 1 /* paren level val held in one byte */ # define PAREN_COUNT_SIZE 1 /* paren level count held in one byte */ # define SIZEOF_PUSH_JUMP_COMMAND (TYPE_BYTE_SIZE + /*offset*/SHORT_VAL_SIZE) # define SIZEOF_SAVE_OPEN_PAREN_COMMAND (TYPE_BYTE_SIZE + PAREN_SPEC_SIZE) # define SIZEOF_SAVE_CLOSE_PAREN_COMMAND (TYPE_BYTE_SIZE + PAREN_SPEC_SIZE) # define SIZEOF_SAVE_CLOSE_PAREN_PUSH_JUMP ( \ TYPE_BYTE_SIZE + \ /* count byte */ PAREN_COUNT_SIZE + \ /* start byte */ PAREN_SPEC_SIZE + \ /* offset */ SHORT_VAL_SIZE ) # define SIZEOF_PUSH_SAVE_OPEN_PAREN_COMMAND ( \ TYPE_BYTE_SIZE + \ /* offset */ SHORT_VAL_SIZE + \ /* paren val */ PAREN_SPEC_SIZE) #endif /* * When a '(...)' pattern is compiled, there is a need for extra codes to * facilitate the saving of paren info (if compiled in and used) and/or * when the paren has '*', '+', or '?' tacked on. */ /* * In the case of raw parens (w/o '*', '+', or '?'), we need no extra * space if we're not saving paren info. If we are, we'd need to wrap * the parenthesized regex with SAVE_OPEN_PAREN and SAVE_CLOSE_PAREN. */ #ifndef NO_PAREN_INFO # define EXTRA_FOR_RAW_PAREN (SIZEOF_SAVE_OPEN_PAREN_COMMAND + \ SIZEOF_SAVE_CLOSE_PAREN_COMMAND) #else # define EXTRA_FOR_RAW_PAREN 0 #endif /* * For the case of '(stuff)?' 
the pattern would look like * * PUSH (refer to "mark:") * * mark: * * if not saving paren info, and if so: * * PUSH (refer to "mark:") * SAVE_OPEN_PAREN * * SAVE_CLOSE_PAREN * mark: * * which is optimized to * * PUSH_SAVE_OPEN_PAREN (refer to "mark:") * * SAVE_CLOSE_PAREN * mark: */ #ifndef NO_PAREN_INFO # define EXTRA_FOR_PAREN_QUESTION (SIZEOF_PUSH_SAVE_OPEN_PAREN_COMMAND +\ SIZEOF_SAVE_CLOSE_PAREN_COMMAND) #else # define EXTRA_FOR_PAREN_QUESTION SIZEOF_JUMP_COMMAND #endif /* * For the case of '(stuff)+' the pattern will look like * * mark: * * PUSH (refer to "mark2:") * JUMP to "mark:" * mark2: * * which is optimized to * mark: * * PUSH_JUMP to "mark:" * * If we're saving paren info, this becomes * mark: * SAVE_OPEN_PAREN # * * SAVE_CLOSE_PAREN # * * PUSH_JUMP to "mark:" * * which is optimized to: * mark: * SAVE_OPEN_PAREN # * * SAVE_CLOSE_PAREN_PUSH_JUMP to "mark:" * * */ #ifndef NO_PAREN_INFO # define EXTRA_FOR_PAREN_PLUS (SIZEOF_SAVE_CLOSE_PAREN_PUSH_JUMP + \ SIZEOF_SAVE_OPEN_PAREN_COMMAND) #else # define EXTRA_FOR_PAREN_PLUS (SIZEOF_PUSH_JUMP_COMMAND) #endif /* * For the case of '(stuff)*' the pattern will look like * * mark: * PUSH (refer to "mark2:") * * JUMP to "mark:" * mark2: * * if not saving parens, and if so: * * PUSH (refer to "mark2:") * mark: * SAVE_OPEN_PAREN * * SAVE_CLOSE_PAREN * PUSH (refer to mark2:) with paren info * JUMP to "mark:" * mark2: * * which can be optimized down to * * PUSH (refer to "mark2:") * mark: * SAVE_OPEN_PAREN * * SAVE_CLOSE_PAREN_PUSH_JUMP (push refers to mark2, jump refers to mark) * mark2: * */ #ifndef NO_PAREN_INFO # define EXTRA_FOR_PAREN_STAR (SIZEOF_PUSH_COMMAND + \ SIZEOF_SAVE_OPEN_PAREN_COMMAND + \ SIZEOF_SAVE_CLOSE_PAREN_PUSH_JUMP) #else # define EXTRA_FOR_PAREN_STAR (SIZEOF_JUMP_COMMAND + SIZEOF_PUSH_COMMAND) #endif /* * Something like 'a|b|c' gets compiled into something like * * PUSH (refer to "alt-1:") * * JUMP to "donewithalt:" * alt-1: * PUSH (refer to "alt-2:") * * JUMP to "donewithalt:" * alt-2: * * 
donewithalt: * * So each but the last alternative needs an extra PUSH and JUMP: */ #define EXTRA_FOR_EACH_BRACE (SIZEOF_JUMP_COMMAND + SIZEOF_PUSH_COMMAND) /* * During a compile (regcomp), we need some memory at times that we'll * want to free when the compile is done. TEMP_MEMORY points to a linked * list of memory to be freed. */ static char **temp_memory = 0; /* * If the user sets this to a function, it will be called upon a memory * error. It shouldn't return. */ void (*regex_memory_error)(void) = 0; static void *xmalloc(unsigned size) { void *mem = (void*)malloc(size); if (mem == 0) { if (regex_memory_error) (*regex_memory_error)(); die("[regex package out of memory]\n"); } return mem; } /* * Allocate temporary memory that will be freed when * regex_free_temp_memory is later called. */ static void * regex_malloc(unsigned size) { char **mem = xmalloc(size + /*enough for hidden link*/ sizeof(mem)); *(char ***)mem = temp_memory; /* Link the chain to the new memory */ temp_memory = mem; /* and the new memory to the chain. */ return &mem[1]; /* Return the requested non-hidden memory */ } /* * Free whatever memory has been allocated via regex_malloc. */ static void regex_free_temp_memory(void) { while (temp_memory) { char *tmp = (char *)temp_memory; temp_memory = (char **)*temp_memory; free(tmp); } } /* * regcomp_flags: set at the start of regcomp() to the user-defined flags * so that all functions called from within regcomp() can have access as * well. */ static unsigned regcomp_flags; /* * part_of_word: and array (subscripted by an 'unsigned char' value) to * indicate if the character (or any EUC whose high byte is the subscript) * should be considered part of a word. */ unsigned char jregex_part_of_word[256]; /* * When doing a case-insensitive match, case_translation[c] * (when c is an unsigned char) will return the same 'c' except for * when it's an upper case letter, in which case it will return the * lower-case counterpart. 
*/ static unsigned char case_translation[256]; #define KATA_DASH_HI_BYTE 0241 /* byte#1 byte for EUC dash */ #define KATA_DASH_LO_BYTE 0274 /* byte#2 byte for EUC dash */ #define ROMAN_HI_BYTE 0243 /* high byte for alphabet EUC */ #define EUC_KATAKANA 0245 #define EUC_HIRAGANA 0244 #define LEAST_KANJI_HI_BYTE 0260 #define GREATEST_KANJI_HI_BYTE 0364 #define NOT_JREGEX_PART_OF_WORD 0 #define JREGEX_PART_OF_WORD 1 #define PART_OF_JAPANESE_WORD 2 /* * Initialize the jregex_part_of_word and case_translation arrays. */ static __inline__ void regex_init(void) { static int init_done = 0; int i; if (init_done) return; /* this not really needed, but for good form... clear out array */ for (i = 0; i < sizeof_array(jregex_part_of_word); i++) jregex_part_of_word[i] = NOT_JREGEX_PART_OF_WORD; for (i = 'a'; i <= 'z'; i++) /* Lowercase letters are parts of words */ jregex_part_of_word[i] = JREGEX_PART_OF_WORD; for (i = 'A'; i <= 'Z'; i++) /* Uppercase letters are parts of words */ jregex_part_of_word[i] = JREGEX_PART_OF_WORD; for (i = '0'; i <= '9'; i++) /* Numbers are parts of words */ jregex_part_of_word[i] = JREGEX_PART_OF_WORD; jregex_part_of_word['_'] = JREGEX_PART_OF_WORD; /* underscore is part of word */ jregex_part_of_word[ROMAN_HI_BYTE] = JREGEX_PART_OF_WORD; /* Roman letters are. */ /* kana and kanji are parts of their own kind of words... 
*/ jregex_part_of_word[EUC_HIRAGANA] = PART_OF_JAPANESE_WORD; jregex_part_of_word[EUC_KATAKANA] = PART_OF_JAPANESE_WORD; jregex_part_of_word[KATA_DASH_HI_BYTE] = PART_OF_JAPANESE_WORD; for (i = LEAST_KANJI_HI_BYTE; i <= GREATEST_KANJI_HI_BYTE; i++) jregex_part_of_word[i] = PART_OF_JAPANESE_WORD; /* * Set up case_translation[] to reflect * c == case_translation[isupper(c) : tolower(c) : c] */ for (i = 0; i < sizeof_array(case_translation); i++) case_translation[i] = i; for (i = 'A'; i <= 'Z'; i++) case_translation[i] = 'a' + (i - 'A'); case_translation[EUC_KATAKANA] = EUC_HIRAGANA; init_done = 1; /* mark so we won't do again */ } struct dynamic_string { unsigned char *buf; unsigned short buf_used; unsigned short buf_length; }; #define STRING_BUFFER_INCREMENT 32 static __inline__ void add_char_to_string(const unsigned char *char_ptr, struct dynamic_string *dstr) { unsigned char_len = EUC_CHAR_LENGTH(char_ptr[0]); if (dstr->buf_used + char_len > dstr->buf_length) { unsigned char *old = dstr->buf; dstr->buf_length += STRING_BUFFER_INCREMENT; dstr->buf = regex_malloc(dstr->buf_length); bcopy(old, dstr->buf, dstr->buf_used); } dstr->buf[dstr->buf_used++] = char_ptr[0]; if (char_len > 1) { dstr->buf[dstr->buf_used++] = char_ptr[1]; if (char_len > 2) dstr->buf[dstr->buf_used++] = char_ptr[2]; } } /* * Before writing the compiled regex buffer, we read the pattern and * create an intermediate representation, which is a tree of * the following structures. */ struct intermediate { enum TYPE type; /* type of this node */ unsigned char count; /* ONCE, ZERO_OR_ONE, etc... */ /* What must match after this note matches. For example, in the pattern 'abc', there will be a node EXACT1("a") whose NEXT will point to an EXACT1("b") node whose NEXT will ..... */ struct intermediate *next; /* ALT and OPEN_PAREN nodes are parent nodes. The children and their siblings (via NEXT) all have pointers back to the PARENT. 
*/ const struct intermediate *parent; /* The children of an ALT (various alternatives, as from 'a|b|c') have siblings (other alternatives) which use this pointer. */ struct intermediate *nextalt; /* Maximum number of bytes this node and all subsequent childeren and siblings will need in the final compiled regex. */ unsigned regex_bytes_needed; /* The minimum number of bytes that actually must be matched before a complete match can be acheived. For example, it would be 3,2, and 1 for the 'a', 'b', and 'c' nodes of the pattern "abc". This can be used for a regexec optimization... if there aren't that many bytes left in the string to match, we know it's a failure */ unsigned min_match_len; /* false for things that don't have to match any characters in a string, such as anything with '*' or '?' tacked onto the end. */ int must_match; /* Depending upon what the TYPE is, one of the following might be valid */ union { /* For ALT types. Points to the first node in a list (where the list is then traversed via NEXTALT) of alternatives. */ struct intermediate *alt; /* * PAREN points to the first node of a list (then traversed via * NEXT) of what's enclosed in one level of parenthesis, the level * number being in LEVEL. */ struct { struct intermediate *paren; #ifndef NO_PAREN_INFO unsigned char level; unsigned char max_internal_level; char real; /* false for /(?....)/ uses */ #endif } paren_info; /* The character(s) to match for the EXACT* types */ unsigned char exact[4]; /* For classes, a bit more info... */ struct class { char inverted; /* true if an inverted [^....] class */ unsigned char ascii[128]; /* ascii[c] true if c in class */ struct dynamic_string euc2; struct dynamic_string euc3; } *class; #ifndef NO_PAREN_INFO unsigned prev_paren_to_match; #endif } u; }; #ifndef NO_PAREN_INFO /* * Level of next set of parens in pattern as we read. 
*/
static unsigned paren_level;

/*
 * Set by nibble_list() each time the ')' of a real (capturing) group is
 * consumed, to that group's level plus one.
 */
static unsigned paren_levels_finished;

/*
 * Largest zero-based paren number named by a backslash-digit reference
 * seen so far (maintained by nibble_from_pattern()).
 */
static signed int highest_prev_to_match;

/*
 * Whenever, during compiled regex execution, we enter a new set of parens, we
 * have to save some info on an internal stack. With a pattern such as「a(.)c」
 * we'll only ever have one "instance" of a possible value for the set of
 * parens. However, with a pattern such as「a(.)*c」we'll have to keep multiple
 * instances. If the string to match is "ac1c2x", what will finally be matched
 * is "ac1c" and \1 would become "1". However, we don't find that out until
 * we've gotten to the end of the string and realized that there wasn't another
 * `c' to be had. At that point, the stack of "instances" would have been that
 * \1 might have ended up being "x", "2", "c", "1", or "c".
 *
 * The upshot of this is that with a pattern such as this, you might need as
 * much as one stack element per character in the checked string. But with a
 * pattern such as「a((((.))))*c」we might need up to four stack elements per
 * character.
 *
 * Since it's faster to allocate the max possible space at the beginning of
 * the execution rather than continually waste time during the execution by
 * trying to dynamically keep the stack large enough, we want a method of
 * knowing the maximum number of levels possible per character (Levels Per
 * Character -- lpc). This floating point value is represented in the compiled
 * regex as two integers L (levels) and C (characters) and is re-computed
 * as L/C.
 *
 * If 'len' is the length of a line to check against the regex,
 *       roundup(len / C) * L
 * is the maximum number of levels possibly required at any point during
 * subsequent execution of the compiled regex pattern.
 */
static double max_lpc;                  /* the ratio L/C described above */
static unsigned max_lpc_l, max_lpc_c;   /* its L and C components        */
#endif

/*
 * Which of REGEX_MATCH or REGEX_MATCH_NOTE_WORD we'll use for the
 * regex buffer we'll be creating.
*/ static enum TYPE this_regex_match; /* * Will be set to an appropriate error code within the routines that make * up the compile code, to be returned by regcomp once control gets back * up there. */ unsigned int regcomp_error = 0; const unsigned char *regcomp_eptr; const unsigned char *regcomp_last_pattern; const unsigned char *regcomp_error_report(void); /* forward */ #define ERROR_(VAL, LOC, RETVAL) \ macro_start { \ regcomp_error = (VAL); \ regcomp_eptr = (LOC); \ return RETVAL; \ } macro_end #define ERROR(VAL, LOC) ERROR_((VAL), (LOC), 0) /* * Allocate and return a new new intermediate node structure. * The node is zeroed out before being returned. */ static struct intermediate * new_comp_struct(void) { struct intermediate *ptr = regex_malloc(sizeof(struct intermediate)); bzero(ptr, sizeof(struct intermediate)); return ptr; } static int next_quoted_character(const unsigned char **ptr_p, unsigned char *out) { int len; #define isodigit(c) ((c) >= '0' && (c) <= '7') #define odigit_val(c) ((c) - '0') #define p (*ptr_p) if (*p == '\\') { p++; /* skip the backslash */ /* accept a two- or three-digit octal value */ if (isodigit(p[0]) && isodigit(p[1])) { unsigned num = odigit_val(p[0]) * 8 + odigit_val(p[1]); if (isodigit(p[2])) { num = num * 8 + odigit_val(p[2]); p++; } p += 2; if (num > 0xff) ERROR(REGCOMP_INVALID_OCTAL_VALUE, p); out[0] = num; return 1; } switch (*p) { case 't': p++; out[0] = '\t'; return 1; /* tab */ case 'n': p++; out[0] = '\n'; return 1; /* newline */ case 'r': p++; out[0] = '\r'; return 1; /* carriage return */ case 'f': p++; out[0] = '\f'; return 1; /* form feed */ } } switch(len = EUC_CHAR_LENGTH(p[0])) { default: ERROR(REGCOMP_CORRUPTED_TEXT, p); case 3: out[2] = p[2]; /*fallthrough to next case */ case 2: out[1] = p[1]; /*fallthrough to next case */ case 1: out[0] = p[0]; } p += len; return len; #undef p } /* * Function to parse a class [....] spec from a pattern. 
* Given an intermediate node (in which to fill in the 'struct class' info) * and the pointer to the "[...]" in the pattern. Returns the pointer to * the character just after the end of the "[...]". */ __inline__ static const unsigned char * nab_class(struct intermediate *inter, const unsigned char *p) { unsigned char current[3]; int char1_count = 0; int char2_count = 0; int char3_count = 0; int lastascii = -1; int ascii = 0; /* doesn't need initialized except to quiet gcc warnings */ const unsigned char *pat; struct class *class = regex_malloc(sizeof(struct class)); unsigned i; /* general use */ inter->type = CLASS; inter->must_match = 1; bzero(inter->u.class = class, sizeof(struct class)); /* If first char is '^', it's inverted. */ if (*p == '^') { class->inverted = 1; p++; /* skip '^' */ } pat = p; /* run through the class specification in the pattern */ do { /* Look for special things that we don't want to count as * regular characters */ if (p[0] == '\\') { switch (p[1]) { case 'd': /* \d means "digit" */ for (i = '0'; i <= '9'; i++) class->ascii[i] = 1; char1_count += 10; p += 2; lastascii = -1; continue; case 'w': /* \w means ASCII word */ for (i = 0; i < 127; i++) if (jregex_part_of_word[i]) { char1_count++; class->ascii[i] = 1; } p += 2; lastascii = -1; continue; case 's': /* whitespace */ class->ascii['\t'] = class->ascii['\n'] = class->ascii['\r'] = class->ascii['\f'] = class->ascii[' ' ] = 1; char1_count += 5; lastascii = -1; p += 2; continue; } } if (lastascii != -1 && p[0] == '-') { unsigned start, end; p++; /* skip range indicator '-' */ if (*p == 0) ERROR(REGCOMP_CORRUPTED_TEXT, p); if (next_quoted_character(&p, current) != 1) ERROR(REGCOMP_EUC_IN_CLASS_RANGE, p); if (current[0] >= 0x80) ERROR(REGCOMP_CORRUPTED_TEXT, p); start = (unsigned char)lastascii < current[0] ? lastascii : current[0]; end = (unsigned char)lastascii > current[0] ? 
lastascii : current[0]; for (i = start; i <= end; i++) { char1_count++; class->ascii[i] = 1; if (regcomp_flags & REGCOMP_IGNORE_ALPHA_CASE) { if (islower(i)) class->ascii[toupper(i)] = 1; else if (isupper(i)) class->ascii[tolower(i)] = 1; } } lastascii = -1; continue; } if (*p == 0) ERROR(REGCOMP_UNCLOSED_CLASS, p); switch(next_quoted_character(&p, current)) { default: ERROR(REGCOMP_CORRUPTED_TEXT, p); case 1: if (current[0] & 0x80) ERROR(REGCOMP_CORRUPTED_TEXT, p); char1_count++; class->ascii[ascii = lastascii = current[0]] = 1; if (regcomp_flags & REGCOMP_IGNORE_ALPHA_CASE) { if (isupper(current[0])) class->ascii[ascii = tolower(current[0])] = 1; else if (islower(current[0])) class->ascii[toupper(current[0])] = 1; } continue; case 2: /* deal with kana folding */ if ((regcomp_flags & REGCOMP_IGNORE_KANA_CASE) && current[0] == EUC_KATAKANA) { current[0] = EUC_HIRAGANA; } if (!twobyte_char_in_string(current[0], current[1], class->euc2.buf, class->euc2.buf + class->euc2.buf_used)) { char2_count++; add_char_to_string(current, &class->euc2); } lastascii = -1; continue; case 3: if (!threebyte_char_in_string(current[0], current[1], current[3], class->euc2.buf, class->euc2.buf + class->euc2.buf_used)) { char3_count++; add_char_to_string(current, &class->euc3); } lastascii = -1; continue; } } while (*p && *p != ']'); /* did we run off the end of the pattern specifier? 
*/ if (*p == 0) ERROR(REGCOMP_UNCLOSED_CLASS, pat); p++; /* skip past the close bracket */ #if 1 if (!(char2_count * 2 == class->euc2.buf_used) || !(char3_count * 2 == class->euc3.buf_used)) { outputf("char2_count=%d, class->euc2.buf_used = %d\n", char2_count, class->euc2.buf_used); outputf("char3_count=%d, class->euc3.buf_used = %d\n", char3_count, class->euc3.buf_used); } #endif assert(char2_count * 2 == class->euc2.buf_used); assert(char3_count * 2 == class->euc3.buf_used); if (char1_count + char2_count + char3_count == 0) ERROR(REGCOMP_EMPTY_CLASS, pat); if (class->inverted || char1_count) inter->min_match_len = 1; else if (char2_count) inter->min_match_len = 2; else inter->min_match_len = 3; inter->regex_bytes_needed = DETERMINISTIC_CLASS_SIZE + char2_count * 2 + char3_count * 3; /* optim */ if (!class->inverted && char1_count == 0 && char3_count == 0 && char2_count == 2) { /* * If ignoring case and one or both is kana, we can't do this. */ if (!(regcomp_flags & REGCOMP_IGNORE_KANA_CASE) || ((class->euc2.buf[0] != EUC_HIRAGANA) && (class->euc2.buf[0] != EUC_KATAKANA) && (class->euc2.buf[2] != EUC_HIRAGANA) && (class->euc2.buf[2] != EUC_KATAKANA))) { inter->type = EITHER_EXACT_2; /* make an EITHER_EXACT_2 */ inter->u.exact[0] = class->euc2.buf[0]; inter->u.exact[1] = class->euc2.buf[1]; inter->u.exact[2] = class->euc2.buf[2]; inter->u.exact[3] = class->euc2.buf[3]; inter->regex_bytes_needed = TYPE_BYTE_SIZE + 2 * EUC_SPEC_SIZE; } } else if (!class->inverted && char1_count + char3_count + char2_count == 1) { /* well, just make this an EXACT */ if (char1_count == 1) { if ((regcomp_flags & REGCOMP_IGNORE_ALPHA_CASE) && isalpha(ascii)) inter->type = EXACT_ICASE_ASCII; else inter->type = EXACT1; inter->u.exact[0] = ascii; inter->regex_bytes_needed = TYPE_BYTE_SIZE + ASCII_SPEC_SIZE; } else if (char2_count == 1) { if ((regcomp_flags & REGCOMP_IGNORE_KANA_CASE) && (class->euc2.buf[0] == EUC_HIRAGANA)) inter->type = EXACT_ICASE_KANA; else inter->type = EXACT2; 
inter->u.exact[0] = class->euc2.buf[0]; inter->u.exact[1] = class->euc2.buf[1]; inter->regex_bytes_needed = TYPE_BYTE_SIZE + EUC_SPEC_SIZE; } else if (char3_count == 1) { inter->type = EXACT3; inter->u.exact[0] = class->euc3.buf[0]; inter->u.exact[1] = class->euc3.buf[1]; inter->u.exact[2] = class->euc3.buf[2]; inter->regex_bytes_needed = TYPE_BYTE_SIZE + 3; } else { assert(0); } } /* end optim */ return p; } static __inline__ int set_mustmatch(struct intermediate *i) { kibishii_assert(i != 0); if (i->next == 0) return i->must_match; else return i->must_match |= set_mustmatch(i->next); } #define PAT (*pp) /* forward */ static struct intermediate *nibble_from_pattern(const unsigned char **pp); static struct intermediate * nibble_list(const unsigned char **pp, const struct intermediate *parent) { const unsigned char *start_PAT = PAT; struct intermediate *listhead = 0; /* head of current list. */ struct intermediate *lastlist; /* last element in the list [x] */ struct intermediate *alt = 0; /* encompasing ALT note, if exists */ struct intermediate *lastalt; /* last alternative in list [x] */ /* * [x] -- "may be used uninitialized" Ok. */ struct intermediate *to_be_returned = 0; /* what we'll eventually return */ #define update_stats_for_current_alternative() \ macro_start { \ set_mustmatch(listhead); \ if (alt->min_match_len > listhead->min_match_len) \ alt->min_match_len = listhead->min_match_len; \ alt->regex_bytes_needed += \ listhead->regex_bytes_needed + EXTRA_FOR_EACH_BRACE; \ alt->must_match &= listhead->must_match; \ } macro_end /* * Nibble away at the pattern, adding to our alternative until * we hit the next alternative, or run out of pattern space, * or leave a level of parenthesis. 
*/
    for (;;)
    {
        struct intermediate *new;

        /* if have a '|' now, it's because of a "||", which is illegal */
        if (PAT[0] == '|')
            ERROR(REGCOMP_BAD_BRACE, PAT);

        if (PAT[0] == ')')
        {
            /*
             * We'll be leaving, so update status info in the ALT node
             * from the currently-being-built alternative (whose stats
             * are in LISTHEAD)
             */
            if (alt && listhead)
                update_stats_for_current_alternative();

            /* a ')' with no enclosing group has nothing to close */
            if (parent == 0)
                ERROR(REGCOMP_UNMATCHED_CLOSE_PAREN, PAT);
          #ifndef NO_PAREN_INFO
            /* remember how many capturing groups are now complete */
            if (parent->u.paren_info.real)
                paren_levels_finished = parent->u.paren_info.level +1;
          #endif
            if (to_be_returned == 0) {
                /* make a dummy empty node */
                to_be_returned = new_comp_struct();
                to_be_returned->type = UNUSED;
            }
            PAT++; /* skip past closed paren */
            return to_be_returned;
        }

      #if 1
        /*
         * A '$' that is the very last thing in the pattern, met while an
         * alternation is open: at top level the alternation is closed
         * here, so the '$' node is chained after the ALT node instead of
         * becoming part of the last alternative.
         */
        if (PAT[0] == '$' && PAT[1] == '\0' && alt)
        {
            /*
             * We'll be leaving, so update status info in the ALT node
             * from the currently-being-built alternative (whose stats
             * are in LISTHEAD)
             */
            if (listhead)
                update_stats_for_current_alternative();
            /*
             * NOTE(review): inside a group this returns without having
             * seen the closing ')' -- confirm the caller still reports
             * the unmatched open paren.
             */
            if (parent != 0)
                return to_be_returned;
            else
            {
                lastlist = alt;
                listhead = to_be_returned;
                alt = 0;
            }
        }
      #endif

        /* get next bit of the pattern */
        if (new = nibble_from_pattern(pp), new == 0)
            return 0;

        new->parent = alt ? alt : parent;

        /*
         * The end-of-pattern node and a literal newline also close an
         * open alternation: they get chained after the ALT node.
         */
        if (alt && (new->type == this_regex_match ||
                    (new->type == EXACT1 && new->u.exact[0] == '\n')))
        {
            if (listhead)
                update_stats_for_current_alternative();
            listhead = to_be_returned;
            lastlist = alt;
            alt = 0;
        }

        if (listhead == 0)
        {
            /* first node of a new list (or of a new alternative) */
            listhead = lastlist = new;
            if (alt) {
                lastalt->nextalt = listhead;
                lastalt = new;
            }
        }
        else
        {
            if (regcomp_flags & REGCOMP_FUZZY_KANA_REPETITION)
            {
                /*
                 * Allow 々 to match where appropriate.
                 * When set, 時々 and 時時 will match each-other.
                 *
                 * If the new and the last node are both EXACT2 and the
                 * last one was a regular kanji (high byte 0xb0 or greater)
                 * and the new node is either the same character or 々,
                 * turn the new one into a char-or-々 node.
                 *
                 * NOTE(review): only bytes [0] and [1] of the "々" literal
                 * are used, so this presumably relies on the source file
                 * being stored in a two-byte (EUC) encoding -- confirm.
                 */
                if (lastlist->type == EXACT2 && new->type == EXACT2 &&
                    lastlist->u.exact[0] >= 0xb0 &&
                    ((new->u.exact[0] == lastlist->u.exact[0] &&
                      new->u.exact[1] == lastlist->u.exact[1]) ||
                     (new->u.exact[0] == ((unsigned char *)"々")[0] &&
                      new->u.exact[1] == ((unsigned char *)"々")[1])))
                {
                    new->type = EITHER_EXACT_2;
                    new->u.exact[0] = lastlist->u.exact[0];
                    new->u.exact[1] = lastlist->u.exact[1];
                    new->u.exact[2] = "々"[0];
                    new->u.exact[3] = "々"[1];
                    new->regex_bytes_needed = TYPE_BYTE_SIZE + 2*EUC_SPEC_SIZE;
                }
            }

            /* append to the current list; the list head carries the totals */
            lastlist->next = new;
            lastlist = new;
            listhead->regex_bytes_needed += new->regex_bytes_needed;
            listhead->min_match_len += new->min_match_len;
            listhead->must_match |= new->must_match;
        }

        if (new->type == this_regex_match)
        {
            /* hit the end of the pattern while still inside a group */
            if (parent)
                ERROR(REGCOMP_UNMATCHED_OPEN_PAREN, start_PAT - 1);
            set_mustmatch(to_be_returned);
            return to_be_returned;
        }

        if (to_be_returned == 0)
            to_be_returned = listhead;

        if (PAT[0] == '|')
        {
            /*
             * Starting another alternative.
             */
            if (alt != 0)
            {
                /* for non-first alternatives */
                update_stats_for_current_alternative();
            }
            else
            {
                /*
                 * Hitting this '|' has told us that the stuff we'd just
                 * nibbled up was all a first-alternative, with the 2nd
                 * (and possibly more) to follow.
                 *
                 * We have to make a new ALT node, and make everything to
                 * the left (in the pattern) that we just read the first
                 * child of that ALT node, with the next alternative(s)
                 * that child's siblings.
                 *
                 * We have to install the ALT node in place of what was
                 * removed to become the first child.
                 */
                struct intermediate *ptr;

                /* make new structure or alt node */
                alt = new_comp_struct();
                alt->type = ALT;
                alt->parent = parent; /* set its parent */

                /*
                 * Now, rather than returning a whole list, we'll return
                 * just the ALT node, whose children will be the various
                 * lists....
                 */
                if (to_be_returned == listhead)
                    to_be_returned = alt;

                /* note the first alternative */
                set_mustmatch(listhead);
                lastalt = alt->u.alt = listhead;

                /* alt node now becomes parent for all elements in that list */
                for (ptr = listhead; ptr; ptr = ptr->next)
                    ptr->parent = alt;

                /* the ALT node starts out with the first alternative's stats */
                alt->min_match_len = listhead->min_match_len;
                alt->regex_bytes_needed = listhead->regex_bytes_needed;
                alt->must_match = listhead->must_match;
            }
            listhead = 0;   /* the next node read starts a fresh alternative */
            PAT++; /* skip past '|' */
        }
    }
}

/*
 * Nibble one node worth of the (pointer to a pointer to the) pattern.
 * If the pattern is sitting on something enclosed in parens, the entire
 * thing is nibbled recursively and the one OPEN_PAREN node (with its children
 * being the "entire thing") is returned.
 */
static struct intermediate *
nibble_from_pattern(const unsigned char **pp)
{
    unsigned char this[3];
    struct intermediate *new = new_comp_struct();

    /*
     * We don't worry about freeing up the node memory if we abort early...
     * regex_free_temp_memory() will take care of it all for us.
     */

    /* look for special things in the pattern */
    switch (PAT[0])
    {
      case '\0':
        /* end of pattern: emit the "regex matched" node */
        new->type = this_regex_match;
        new->regex_bytes_needed = TYPE_BYTE_SIZE;
        new->min_match_len = 0;
        return new;

      case '|': /* should never run into this here */
      case ')': /* should never run into this here */
        assert(0);
        break;

      case '.':
        new->type = ANY_CHAR;
        new->regex_bytes_needed = TYPE_BYTE_SIZE;
        new->min_match_len = 1;
        ++PAT; /* skip past '.' */
        goto allow_starplus; /* can have a +, *, or ?
appended */ case '<': case '>': ++PAT; /* skip past '<' */ word_boundary: new->type = WORD_BOUNDARY; new->must_match = 1; new->regex_bytes_needed = TYPE_BYTE_SIZE; return new; case '$': new->type = END_OF_LINE; new->must_match = 1; new->regex_bytes_needed = TYPE_BYTE_SIZE; ++PAT; /* skip past '$' */ return new; case '^': new->type = START_OF_LINE; new->must_match = 1; new->regex_bytes_needed = TYPE_BYTE_SIZE; ++PAT; /* skip past '^' */ return new; case '*': case '+': case '?': /* these should never appear here */ ERROR(REGCOMP_MISUSED_COUNT_SPEC, PAT); /* notreached */ case '[': if (PAT = nab_class(new, PAT+1), PAT == 0) return 0; goto allow_starplus; /* can have a +, *, or ? appended */ case '(': ++PAT; /* skip past open paren */ if (PAT[0] != '?' || PAT[1] != ':') /* perl5's grouping-only parens */ new->u.paren_info.real = 1; else { PAT += 2; new->u.paren_info.real = 0; } new->type = OPEN_PAREN; #ifndef NO_PAREN_INFO if (new->u.paren_info.real) new->u.paren_info.level = paren_level++; #endif /* nibble entire contents of group */ if ((new->u.paren_info.paren = nibble_list(pp,new)) == 0) return 0; /* first child node will have list totals */ new->must_match = new->u.paren_info.paren->must_match; new->min_match_len = new->u.paren_info.paren->min_match_len; new->regex_bytes_needed = new->u.paren_info.paren->regex_bytes_needed; #ifndef NO_PAREN_INFO if (new->u.paren_info.real) { new->u.paren_info.max_internal_level = paren_level - 1; /* * If will be writing the two commands to for noting the * position of a paren start and end, allot the space * for them. */ if (regcomp_flags & REGCOMP_SAVE_MATCHED_PAREN_INFO) new->regex_bytes_needed += TYPE_BYTE_SIZE * 2 + 1 * 2; } #endif goto allow_starplus; /* can have a +, *, or ? 
appended */ case '\\': /* backslash + non-zero digit + non-octal-digit == match paren */ if (PAT[1] != '0' && isdigit(PAT[1]) && !isodigit(PAT[2])) { /* match paren #%d exactly */ #ifndef NO_PAREN_INFO if ((regcomp_flags & REGCOMP_SAVE_MATCHED_PAREN_INFO) == 0) #endif ERROR(REGCOMP_NEED_SAVE_PAREN_INFO, PAT); #ifndef NO_PAREN_INFO new->type = MATCH_PREV_PAREN; new->u.prev_paren_to_match = PAT[1] - '0' - 1; if (new->u.prev_paren_to_match > paren_levels_finished) ERROR(REGCOMP_PAREN_LEVEL_INVALID, PAT); if ((signed int)new->u.prev_paren_to_match>highest_prev_to_match) highest_prev_to_match = new->u.prev_paren_to_match; new->regex_bytes_needed = TYPE_BYTE_SIZE + PAREN_SPEC_SIZE; new->min_match_len = 0; /* actually could figure this out */ new->must_match = 1; PAT += 2; /* skip past \digit */ goto allow_starplus; /* Can't have a +, *, or ? appended, but * we'll get an error if there is one */ #endif } switch(PAT[1]) { case 'b': PAT += 2; goto word_boundary; case 'a': /* ASCII (except newline) character */ case 'H': /* anything not hiragana */ case 'K': /* anything not katakana */ case 'C': /* anything not kanji */ switch(PAT[1]) { case 'a': new->type = ANY_ASCII; break; case 'H': new->type = ANY_NONHIRAGANA; break; case 'K': new->type = ANY_NONKATAKANA; break; case 'C': new->type = ANY_NONKANJI; break; } new->regex_bytes_needed = TYPE_BYTE_SIZE; new->min_match_len = 1; PAT += 2; /* skip past '\a', etc. */ goto allow_starplus; /* can have a +, *, or ? appended */ case 'A': /* multibyte character */ case 'h': /* any (multibyte) hiragana */ case 'k': /* any (multibyte) katakana */ case 'c': /* any (multibyte) kanji */ switch (PAT[1]) { case 'A': new->type = ANY_MULTIBYTE; break; case 'h': new->type = ANY_HIRAGANA; break; case 'k': new->type = ANY_KATAKANA; break; case 'c': new->type = ANY_KANJI; break; } new->regex_bytes_needed = TYPE_BYTE_SIZE; new->min_match_len = 2; PAT += 2; /* skip past '\A' */ goto allow_starplus; /* can have a +, *, or ? 
appended */ case 'd': /* digit */ case 'w': /* ascii word */ case 's': /* whitespace */ case 'D': /* not digit */ case 'W': /* not ascii word */ case 'S': /* not whitespace */ { /* we construct a character class specifier which is the same and use that... */ int inverted = isupper(PAT[1]); unsigned char class[4]; class[0] = '\\'; class[1] = inverted ? tolower(PAT[1]) : PAT[1]; class[2] = ']'; class[3] = '\0'; if (nab_class(new, class) == 0) return 0; new->u.class->inverted = inverted; new->min_match_len = 1; PAT += 2; /* skip past \d, etc. */ goto allow_starplus; /* can have a +, *, or ? appended */ } } } /* was nothing special.... nab the next character as-is */ switch(next_quoted_character(&PAT, this)) { default: ERROR(REGCOMP_CORRUPTED_TEXT, PAT); case 1: { unsigned char c = this[0]; /* if ignoring alpha-case, and this is a letter... */ if ((regcomp_flags & REGCOMP_IGNORE_ALPHA_CASE) && isascii(c) && isalpha(c)) { new->type = EXACT_ICASE_ASCII; if (isupper(c)) c = tolower(c); } else { new->type = EXACT1; } new->u.exact[0] = c; } new->min_match_len = 1; new->regex_bytes_needed = TYPE_BYTE_SIZE + 1; break; case 2: { unsigned char hi = this[0]; /* if ignoring kana case, and this is a kana.... */ if ((regcomp_flags & REGCOMP_IGNORE_KANA_CASE) && (hi == EUC_HIRAGANA || hi == EUC_KATAKANA)) { new->type = EXACT_ICASE_KANA; hi = EUC_HIRAGANA; } else { new->type = EXACT2; } new->u.exact[0] = hi; new->u.exact[1] = this[1]; } new->min_match_len = 2; new->regex_bytes_needed = TYPE_BYTE_SIZE + 2; break; case 3: new->type = EXACT3; new->u.exact[0] = this[0]; new->u.exact[1] = this[1]; new->u.exact[2] = this[2]; new->min_match_len = 3; new->regex_bytes_needed = TYPE_BYTE_SIZE + 3; break; } goto allow_starplus; /* can have a +, *, or ? appended */ /***************************************************/ allow_starplus: /* Whatever just preceeded is allowed to have a +, *, or ? after it. If it's there, act appropriately. 
If it's a simple node, it'll just need to have its COUNT updated, but if it's an OPEN PAREN type, we'll have to note the extra space we'll need for that in the pattern. Also, for * and ?, since even a null string will match something marked with one of these, we can note that the must_match and min_match_len are false and zero, respectively. */ switch (PAT[0]) { case '+': new->count = ONE_OR_MORE; ++PAT; /* skip the '+' */ if (new->type == OPEN_PAREN) { if (new->min_match_len == 0) ERROR(REGCOMP_INFINITE_PLUS, PAT); new->regex_bytes_needed += EXTRA_FOR_PAREN_PLUS; } break; case '*': new->count = ANY_NUMBER; ++PAT; /* skip the '*' */ if (new->type == OPEN_PAREN) { if (new->min_match_len == 0) ERROR(REGCOMP_INFINITE_STAR, PAT); new->regex_bytes_needed += EXTRA_FOR_PAREN_STAR; } new->min_match_len = 0; new->must_match = 0; break; case '?': new->count = ZERO_OR_ONE; ++PAT; /* skip the '?' */ new->min_match_len = 0; new->must_match = 0; if (new->type == OPEN_PAREN) new->regex_bytes_needed += EXTRA_FOR_PAREN_QUESTION; break; default: new->count = ONCE; if (new->type == OPEN_PAREN) new->regex_bytes_needed += EXTRA_FOR_RAW_PAREN; break; } /* note that we must match anthing with a non-zero minimum match length */ if (new->min_match_len != 0) new->must_match = 1; return new; } /* forward */ static void write_regex_buffer(const struct intermediate *, unsigned char **, int, unsigned char **); /* I should probably just make this a global.... */ #define outbuf (*bb) /* * [this a separate function just for visual convenience... * only called from one place] * * Given an intermediate node of type CLASS, write out the compiled * form to the given buffer. */ static __inline__ void write_regex_buffer_for_class(const struct intermediate *inter, unsigned char **bb) { const struct class *class = inter->u.class; unsigned i; /* * Figure out what type of CLASS_* node this really is and dump * that type byte. 
*/ *outbuf++ = make_type_byte_with_count(CLASS, inter->count); *outbuf++ = class->inverted; write_short_val(outbuf, class->euc2.buf_used); outbuf += SHORT_VAL_SIZE; write_short_val(outbuf, class->euc3.buf_used); outbuf += SHORT_VAL_SIZE; for (i = 0; i < sizeof(class->ascii); i++) *outbuf++ = class->ascii[i]; for (i = 0; i < class->euc2.buf_used; i++) *outbuf++ = class->euc2.buf[i]; for (i = 0; i < class->euc3.buf_used; i++) *outbuf++ = class->euc3.buf[i]; } static __inline__ unsigned char * consolidate_musthave(unsigned char *range1_start, const unsigned char *range1_end, const unsigned char *range2_start, const unsigned char *range2_end) { unsigned char *dest = range1_start; #if 0 outputf("consolidate [%.*s] [%.*s]\n", range1_end - range1_start, range1_start, range2_end - range2_start, range2_start); #endif while (range1_start < range1_end) { switch(EUC_CHAR_LENGTH(range1_start[0])) { default: assert(0); range1_start += 1; break; case 1: if (onebyte_char_in_string(range1_start[0], range2_start, range2_end)) *dest++ = range1_start[0]; range1_start++; break; case 2: if (twobyte_char_in_string(range1_start[0],range1_start[1], range2_start, range2_end)) { *dest++ = range1_start[0]; *dest++ = range1_start[1]; } range1_start += 2; break; case 3: if (threebyte_char_in_string( range1_start[0], range1_start[1], range1_start[2], range2_start, range2_end)) { *dest++ = range1_start[0]; *dest++ = range1_start[1]; *dest++ = range1_start[2]; } range1_start += 3; break; } } #if 0 outputf("yields [%.*s]\n", dest - range1_start, range1_start); #endif return dest; } /* * Given an intermediate node which is one of the alternatives for * a set of alternatives, output the alternative along with the appropriate * PUSHs and JUMPs to facilitate the selection process. All but the last * alternative will have a preceding PUSH to say ``if I fail, try the * next alternative'' and a following JUMP to indicate ``if I succeed, * jump past all the other alternatives that we don't need anymore''. 
*
 * If noomit ("no omit") is true, we are not allowed to optimize the
 * compiled pattern by replacing the ``jump past other alternatives''
 * with a REGEX_MATCH (because the jump would just jump to a REGEX_MATCH).
 * The value passed in is the must_not_omit flag computed by
 * write_regex_buffer(): it is true when that routine was itself told not
 * to omit, when REGCOMP_JUST_MATCH is not set, or when the parent ALT is
 * followed by something that must match.
 *
 * If musthave is nonzero, it's a pointer to a pointer to a buffer where
 * we can stuff characters that we know must be part of the string
 * in order to match. In the case of alternatives, a character must be
 * required in all alternatives to be required in the whole string.
 * We'll accomplish this by noting where the first alternative's required
 * characters are saved (musthave_base), then getting the next alternative's
 * required characters (starting at musthave_altbase), then erasing from
 * the first set's those characters not in the second set.
 *
 * The routine recurses once per remaining alternative (via nextalt), so
 * the intersection is applied pairwise from the last alternative back
 * toward the first.
 */
static void
write_regex_buffer_for_alt(const struct intermediate *inter,
                           unsigned char **bb,
                           int noomit,
                           unsigned char **musthave)
{
    unsigned char *push, *jump = 0;
    /* where this alternative's must-have characters will begin */
    unsigned char *musthave_base = musthave ? *musthave : 0;
    unsigned char *musthave_altbase;

    /* if last alternative, just write the code */
    if (inter->nextalt == 0) {
        write_regex_buffer(inter, &outbuf, noomit, musthave);
        return;
    }

    /* note location and allot space for a PUSH */
    push = outbuf;
    outbuf += TYPE_BYTE_SIZE + SHORT_VAL_SIZE;

    /* write the alternative */
    write_regex_buffer(inter, &outbuf, noomit, musthave);
    /* the later alternatives' must-have characters begin here */
    musthave_altbase = musthave ? *musthave : 0;

    if (!noomit) {
        /* since we're done, put a REGEX_MATCH */
        *outbuf++ = make_type_byte(this_regex_match);
    } else {
        /* save space for a JUMP (to jump over later alternatives) */
        jump = outbuf;
        outbuf += TYPE_BYTE_SIZE + SHORT_VAL_SIZE;
    }

    /* fill in previous PUSH to note this location */
    /* (the value stored is the distance from the PUSH's own type byte) */
    *push = make_type_byte(PUSH);
    write_short_val(&push[1], outbuf - push);

    /* write the rest of the alternatives */
    write_regex_buffer_for_alt(inter->nextalt, &outbuf, noomit, musthave);

    if (noomit) {
        /* have previous JUMP come here... after all alternatives */
        /* (again a distance, measured from the JUMP's own type byte) */
        *jump = make_type_byte(JUMP);
        write_short_val(&jump[1], outbuf - jump);
    }

    /*
     * If keeping must-have character information, update as appropriate
     * to a set of alternatives.
     *
     * Remove from the range [musthave_base - musthave_altbase] those
     * chars that are not in [musthave_altbase - *musthave], then
     * put *musthave to point at the end of what we've kept.
     */
    if (musthave_base)
    {
      #ifndef NDEBUG
        if (regcomp_flags & REGCOMP_DEBUG) {
            const unsigned char *p1;
            output("intersection of [");
            for (p1 = musthave_base; p1 < musthave_altbase; p1++)
                if (*p1)
                    outchar(*p1);
            output("] and [");
            for (p1 = musthave_altbase; p1 < *musthave; p1++)
                if (*p1)
                    outchar(*p1);
            output("] yields ");
        }
      #endif

        *musthave = consolidate_musthave(musthave_base, musthave_altbase,
                                         musthave_altbase, *musthave);

      #ifndef NDEBUG
        if (regcomp_flags & REGCOMP_DEBUG) {
            unsigned char *p1;
            outchar('[');
            for (p1 = musthave_base; p1 < *musthave; p1++)
                if (*p1)
                    outchar(*p1);
            output("]\n");
        }
      #endif
    }
}

/*
 * Main compiled-pattern writing routine.
 * If noomit ("no omit") is true, our parent is followed by something that
 * must match, so we can't optimize by omitting "useless" commands that
 * trail the "useful" ones. These "useless" ones are ones which will match
 * anything (i.e. even the null string) and can never change the truthfulness
 * of a match or fail... i.e. 'abc' vs 'abc(xyz)?'. In the latter, the
 * final "(xyz)?" is completely useless.
*/ static void write_regex_buffer(const struct intermediate *inter, unsigned char **bb, int noomit, unsigned char **musthave) { assert(inter != 0); while (inter) { /* * We can't do any omitting if any of: * - our parent told us not to (via noomit). * - the flag isn't set to allow us to. * - there is a "next" sibling and it must match */ int must_not_omit = noomit || !(regcomp_flags & REGCOMP_JUST_MATCH) || (inter->next && inter->next->must_match); /* * But if we *can* omit, and this doesn't need to match, we * can just output a REGEX_MATCH. */ if (!must_not_omit && !inter->must_match) { *outbuf++ = make_type_byte(this_regex_match); return; } switch(inter->type) { default: outputf("<>\n", inter->type, __LINE__); break; case UNUSED: break; case REGEX_MATCH: case REGEX_MATCH_NOTE_WORD: case WORD_BOUNDARY: case END_OF_LINE: case START_OF_LINE: /* pretty simple stuff... */ *outbuf++ = make_type_byte(inter->type); break; case ANY_CHAR: case ANY_ASCII: case ANY_MULTIBYTE: case ANY_KATAKANA: case ANY_NONKATAKANA: case ANY_HIRAGANA: case ANY_NONHIRAGANA: case ANY_KANJI: case ANY_NONKANJI: *outbuf++ = make_type_byte_with_count(inter->type, inter->count); break; case OPEN_PAREN: { /* must treat each COUNT differently */ #ifdef NO_PAREN_INFO #define PAREN_INFO(stuff) { /* nothing */ } #else int real = inter->u.paren_info.real && (regcomp_flags & REGCOMP_SAVE_MATCHED_PAREN_INFO); #define PAREN_INFO(stuff) if (real) { stuff; } #endif switch (inter->count) { case ONCE: PAREN_INFO( *outbuf++ = make_type_byte(SAVE_OPEN_PAREN); *outbuf++ = inter->u.paren_info.level; ) write_regex_buffer(inter->u.paren_info.paren, &outbuf, must_not_omit, musthave); PAREN_INFO( *outbuf++ = make_type_byte(SAVE_CLOSE_PAREN); *outbuf++ = inter->u.paren_info.level; ) break; case ANY_NUMBER: { /* * Any number means that zero is also ok, so first do a * push so that it can backtrack to skipping this part * of the regex entirely. Then put the regex, and a * pushjump back to the regex to try again. 
*/ unsigned char *push_loc = outbuf; unsigned char *mark1_loc; unsigned char *jump_loc; *outbuf++ = make_type_byte(PUSH); outbuf += SHORT_VAL_SIZE; mark1_loc = outbuf; PAREN_INFO( mark1_loc = outbuf; *outbuf++ = make_type_byte(SAVE_OPEN_PAREN); *outbuf++ = inter->u.paren_info.level; ) /* write the group regex */ write_regex_buffer(inter->u.paren_info.paren, &outbuf, must_not_omit, 0); #ifndef NO_PAREN_INFO if (real) { unsigned l = inter->u.paren_info.max_internal_level - inter->u.paren_info.level + 1; unsigned c = inter->u.paren_info.paren->min_match_len; double lpc = l / (double)c; jump_loc = outbuf; *outbuf++ = make_type_byte(SAVE_CLOSE_PAREN_PUSH_JUMP); *outbuf++ = l; *outbuf++ = inter->u.paren_info.level; if (lpc > max_lpc) { max_lpc = lpc; max_lpc_l = l; max_lpc_c = c; } } else #endif { jump_loc = outbuf; *outbuf++ = make_type_byte(PUSH_JUMP); } write_short_val(outbuf, mark1_loc - jump_loc); outbuf += SHORT_VAL_SIZE; /* fill in the offset of the PUSH */ write_short_val(&push_loc[TYPE_BYTE_SIZE], outbuf - push_loc); break; } case ZERO_OR_ONE: { /* * Since zero is OK, start off with a push past * the regex, then the regex. */ unsigned char *marker = outbuf; #ifndef NO_PAREN_INFO if (real) { *outbuf = make_type_byte(PUSH_SAVE_OPEN_PAREN); outbuf += TYPE_BYTE_SIZE + SHORT_VAL_SIZE; *outbuf++ = inter->u.paren_info.level; } else #endif { /* write the PUSH and remember where it is */ *outbuf = make_type_byte(PUSH); outbuf += TYPE_BYTE_SIZE + SHORT_VAL_SIZE; } /* write the group regex */ write_regex_buffer(inter->u.paren_info.paren, &outbuf, must_not_omit, 0); PAREN_INFO( *outbuf++ = make_type_byte(SAVE_CLOSE_PAREN); *outbuf++ = inter->u.paren_info.level; ) /* fill in the offset of the PUSH */ write_short_val(&marker[TYPE_BYTE_SIZE], outbuf - marker); break; } case ONE_OR_MORE: { /* * After doing the regex once, do a push to indicate * that the once has been achieved, then jump to try * again. 
This is optimized as a PUSH_JUMP */ unsigned char *marker = outbuf; unsigned char *base; PAREN_INFO( *outbuf++ = make_type_byte(SAVE_OPEN_PAREN); *outbuf++ = inter->u.paren_info.level; ) /* write the group regex */ write_regex_buffer(inter->u.paren_info.paren, &outbuf, must_not_omit, musthave); base = outbuf; #ifndef NO_PAREN_INFO if (real) { unsigned l = inter->u.paren_info.max_internal_level - inter->u.paren_info.level + 1; unsigned c = inter->u.paren_info.paren->min_match_len; double lpc = l/(double)c; *outbuf++ = make_type_byte(SAVE_CLOSE_PAREN_PUSH_JUMP); *outbuf++ = l; *outbuf++ = inter->u.paren_info.level; if (lpc > max_lpc) { max_lpc = lpc; max_lpc_l = l; max_lpc_c = c; } } else #endif { *outbuf++ = make_type_byte(PUSH_JUMP); } write_short_val(outbuf, marker - base); outbuf += SHORT_VAL_SIZE; break; } } break; } case CLASS: /* just call our helper routine */ write_regex_buffer_for_class(inter, &outbuf); break; case EXACT1: case EXACT_ICASE_ASCII: *outbuf++ = make_type_byte_with_count(inter->type, inter->count); *outbuf++ = inter->u.exact[0]; if (musthave && !(inter->count & ZERO_OK)) *(*musthave)++ = inter->u.exact[0]; break; case EXACT2: case EXACT_ICASE_KANA: *outbuf++ = make_type_byte_with_count(inter->type, inter->count); *outbuf++ = inter->u.exact[0]; *outbuf++ = inter->u.exact[1]; if (musthave && !(inter->count & ZERO_OK)) { *(*musthave)++ = inter->u.exact[0]; *(*musthave)++ = inter->u.exact[1]; } break; case EXACT3: *outbuf++ = make_type_byte_with_count(inter->type, inter->count); *outbuf++ = inter->u.exact[0]; *outbuf++ = inter->u.exact[1]; *outbuf++ = inter->u.exact[2]; if (musthave && !(inter->count & ZERO_OK)) { *(*musthave)++ = inter->u.exact[0]; *(*musthave)++ = inter->u.exact[1]; *(*musthave)++ = inter->u.exact[2]; } break; case EITHER_EXACT_2: *outbuf++ = make_type_byte_with_count(inter->type, inter->count); *outbuf++ = inter->u.exact[0]; *outbuf++ = inter->u.exact[1]; *outbuf++ = inter->u.exact[2]; *outbuf++ = inter->u.exact[3]; break; case 
ALT: { /* * Deal with a bunch (2+) of alternatives. * The 'final' bit is just an optimization. All alternatives * but the last end with a jump to whatever follows all the * alternatives. If we determine that there's nothing * important after this, we'll just skip the jump and put * a REGEX_MATCH there. */ write_regex_buffer_for_alt(inter->u.alt, &outbuf, must_not_omit, musthave); } break; case MATCH_PREV_PAREN: *outbuf++ = make_type_byte(MATCH_PREV_PAREN); *outbuf++ = inter->u.prev_paren_to_match; break; } inter = inter->next; } } #ifndef NDEBUG /* * For debugging. */ static void show_intermediate_pattern(const struct intermediate *inter, unsigned level) { if (inter == 0) { unsigned i; for (i = 0; i < level; i++) outchar('|'); output(" <>\n"); return; } while (inter) { const char *countmemo; unsigned i; for (i = 0; i < level; i++) outchar('0'+i); outputf("[%c r=%02d m=%02d] ", inter->must_match ? '!': ' ', inter->regex_bytes_needed, inter->min_match_len); switch(inter->count) { case ONCE: countmemo = "" ; break; case ANY_NUMBER: countmemo = " (*)"; break; case ZERO_OR_ONE: countmemo = " (?)"; break; case ONE_OR_MORE: countmemo = " (+)"; break; default: countmemo = "????????????"; break; } switch(inter->type) { default: outputf("<>\n", inter->type, __LINE__); break; case UNUSED: output("\n"); break; case REGEX_MATCH: output("regex match\n"); break; case REGEX_MATCH_NOTE_WORD: output("regex match (note word)\n"); break; case ANY_CHAR: outputf("any char%s\n", countmemo);break; case ANY_ASCII: outputf("any ASCII%s\n", countmemo);break; case ANY_MULTIBYTE: outputf("any multibyte%s\n", countmemo);break; case ANY_KATAKANA: outputf("any katakana%s\n", countmemo);break; case ANY_NONKATAKANA:outputf("any non-katakana%s\n",countmemo);break; case ANY_HIRAGANA: outputf("any hiragana%s\n", countmemo);break; case ANY_NONHIRAGANA:outputf("any non-hiragana%s\n",countmemo);break; case ANY_KANJI: outputf("any kanji%s\n", countmemo);break; case ANY_NONKANJI: outputf("any 
non-kanji%s\n", countmemo);break; case WORD_BOUNDARY: output("word boundary\n"); break; case END_OF_LINE: output("end of line\n"); break; case START_OF_LINE: output("start of line\n"); break; case OPEN_PAREN: #ifndef NO_PAREN_INFO outputf("group [level %u - %u]%s\n", inter->u.paren_info.level, inter->u.paren_info.max_internal_level, countmemo); #else outputf("group %s\n", countmemo); #endif show_intermediate_pattern(inter->u.paren_info.paren, level+1); break; #ifndef NO_PAREN_INFO case MATCH_PREV_PAREN: outputf("match previous paren group %d\n", inter->u.prev_paren_to_match); break; #endif case CLASS: { if (inter->u.class->inverted) output("inverted "); outputf("class%s「", countmemo); for (i = 0; i < 128; i++) { if (inter->u.class->ascii[i]) if (isprint(i)) outchar(i); else outputf("\\%03o", i); } for (i = 0; i < inter->u.class->euc2.buf_used; i++) outchar(inter->u.class->euc2.buf[i]); for (i = 0; i < inter->u.class->euc3.buf_used; i++) outchar(inter->u.class->euc3.buf[i]); output("」\n"); } break; case EXACT1: outputf("exact1「%c」%s\n", inter->u.exact[0], countmemo); break; case EXACT_ICASE_ASCII: outputf("ignore-case ascii「%c」%s\n",inter->u.exact[0],countmemo); break; case EXACT2: outputf("exact2「%c%c」%s\n", inter->u.exact[0], inter->u.exact[1], countmemo); break; case EITHER_EXACT_2: outputf("exact2「%c%c」*or*「%c%c」%s\n", inter->u.exact[0], inter->u.exact[1], inter->u.exact[2], inter->u.exact[3], countmemo); break; case EXACT_ICASE_KANA: outputf("kana (ignore case)「%c%c」%s\n", inter->u.exact[0], inter->u.exact[1], countmemo); break; case EXACT3: outputf("exact3「%c%c%c」%s\n", inter->u.exact[0], inter->u.exact[1], inter->u.exact[2], countmemo); break; case ALT: { struct intermediate *ptr = inter->u.alt; output("start of alternatives\n"); for (;;) { show_intermediate_pattern(ptr, level + 1); for (i = 0; i < level; i++) outchar('|'); if (ptr->nextalt == 0) break; output("---------------------\n"); ptr = ptr->nextalt; } output("end of alternatives\n"); break; } } inter 
= inter->next; } } #endif /* * Wow, this is it. * Given a null-terminated pattern, a REGEX_T to fill, and some flags, * return one of the REGCOMP_* return values (i.e. REGCOMP_SUCCESS) */ int regcomp(regex_t *r, const unsigned char *pattern, unsigned flags) { const struct intermediate *compiled; const unsigned char *orig_pattern = pattern; int retval; regcomp_last_pattern = pattern; if (pattern == 0 || r == 0) return regcomp_error = REGCOMP_INVALID_DATA; bzero(r, sizeof(*r)); regex_init(); /* Make sure this has been done. */ regcomp_flags = flags; /* So everyone else can know, too. */ regcomp_error = 0; /* No false alarms, please. */ #ifndef NO_PAREN_INFO paren_level = paren_levels_finished = 0; highest_prev_to_match = -1; max_lpc = max_lpc_c = max_lpc_l = 0; #endif this_regex_match = (flags & REGCOMP_WANT_WORD_MATCH_INFO) ? REGEX_MATCH_NOTE_WORD : REGEX_MATCH; DEBUGSTUFF(if (flags & REGCOMP_DEBUG) outputf("FLAGS %x PATTERN %s\n", flags, pattern);) if (pattern[0] == '\0') return regcomp_error = REGCOMP_EMPTY_PATTERN; compiled = nibble_list(&pattern, 0); /* process that baby */ if (regcomp_error) retval = regcomp_error; else if (compiled == 0) retval = REGCOMP_INTERNAL_ERROR; else { /* * It compiled well into the intermediate form. Now output to * the final compiled form. */ unsigned char *buffer = xmalloc(compiled->regex_bytes_needed); unsigned char *musthave, **mh_ptr = 0; /* * Simple optimization: if the regex (or every top-level alternative) * begins with START_OF_LINE, then set r->anchor_to_start_of_line. */ if (compiled->type == START_OF_LINE) r->anchor_to_start_of_line = 1; else if (compiled->type == ALT) { const struct intermediate *ptr = compiled->u.alt; r->anchor_to_start_of_line = 1; while (ptr) { if (ptr->type != START_OF_LINE) { r->anchor_to_start_of_line = 0; break; } ptr = ptr->nextalt; } } DEBUGSTUFF(if (flags & REGCOMP_DEBUG) show_intermediate_pattern(compiled, 1);) r->fold_acase = (flags & REGCOMP_IGNORE_ALPHA_CASE) ? 
1 : 0; r->fold_kcase = (flags & REGCOMP_IGNORE_KANA_CASE) ? 1 : 0; if ((flags & REGCOMP_CALC_MUSTHAVE) == 0) r->musthave = 0; else { r->musthave = xmalloc((unsigned)strlen((void*)orig_pattern)+1); musthave = r->musthave; mh_ptr = &musthave; } r->buf = buffer; write_regex_buffer(compiled, &buffer, /*noomit*/0, mh_ptr); r->bufend = buffer; r->min_length_match = compiled->min_match_len; #ifndef NO_PAREN_INFO r->max_paren_level = paren_level; r->max_lpc_l = max_lpc_l; r->max_lpc_c = max_lpc_c; r->paren_info_required = highest_prev_to_match + 1; assert(r->paren_info_required == 0 || (flags & REGCOMP_SAVE_MATCHED_PAREN_INFO)); #endif /* squish out repeates and empties from the musthave list */ if (r->musthave && r->musthave != musthave) { unsigned char *start = r->musthave; unsigned char *end = musthave; /* changed in write_regex_buffer */ unsigned char *ptr = start + EUC_CHAR_LENGTH(start[0]); unsigned char *dest = ptr; while (ptr < end) switch(EUC_CHAR_LENGTH(ptr[0])) { default: warn("[internal error %s:%d; width=%d, first byte=\\%03o]\n", __FILE__, __LINE__, EUC_CHAR_LENGTH(ptr[0]), ptr[0]); ptr += EUC_CHAR_LENGTH(ptr[0]) ? 
EUC_CHAR_LENGTH(ptr[0]) : 1; break; case 1: if (!onebyte_char_in_string(ptr[0], start, dest)) *dest++ = ptr[0]; ptr += 1; break; case 2: if (!twobyte_char_in_string(ptr[0], ptr[1], start, dest)) { *dest++ = ptr[0]; *dest++ = ptr[1]; } ptr += 2; break; case 3: if (!threebyte_char_in_string(ptr[0], ptr[1], ptr[2], start, dest)) { *dest++ = ptr[0]; *dest++ = ptr[1]; *dest++ = ptr[2]; } ptr += 3; break; } musthave = dest; } if (r->musthave) *musthave = 0; /* cap off end of list */ retval = REGCOMP_SUCCESS; } DEBUGSTUFF(if (flags & REGCOMP_DEBUG) output((const char *)regcomp_error_report());) regex_free_temp_memory(); /* make sure to free our temp'ly-used memory */ return retval; } const char *regcomp_errstr[] = { "success", "internal error", "invalid data", "empty pattern", "unmatched [", "unmatched open paren", "unmatched close paren", "misused +,*, or ?", "object of + could be empty", "object of * could be empty", "empty class", "misused |", "nonexistent paren'ed expression", "need SAVE_PAREN_INFO with this pattern", "Japanese character in class range", "invalid octal value", "corrupted text" }; const unsigned char *regcomp_error_report(void) { static unsigned char *report = 0; int pat_len = regcomp_last_pattern ? 
strlen((void*)regcomp_last_pattern) : 0; const char *str; if (report) { free((void*)report); report = 0; } switch(regcomp_error) { default: str = "regcomp「%s」returns error code %d.\n"; report = xmalloc(strlen((void*)str) + pat_len + 10 + 1); sprintf(report, str, regcomp_last_pattern, regcomp_error); return report; case REGCOMP_SUCCESS: str = "regcomp「%s」returns success\n"; report = xmalloc(strlen((void*)str) + pat_len + 1); sprintf(report, str, regcomp_last_pattern); return report; case REGCOMP_INVALID_DATA: str = "regcomp (pat=0x%00000008x) barfs on bad data\n"; report = xmalloc(strlen((void*)str) + 1); sprintf(report, str, regcomp_last_pattern); return report; case REGCOMP_EMPTY_PATTERN: return (const unsigned char *)"regcomp barfs on empty pattern\n"; case REGCOMP_UNMATCHED_OPEN_PAREN: case REGCOMP_UNMATCHED_CLOSE_PAREN: case REGCOMP_MISUSED_COUNT_SPEC: case REGCOMP_UNCLOSED_CLASS: case REGCOMP_EUC_IN_CLASS_RANGE: case REGCOMP_INTERNAL_ERROR: case REGCOMP_INFINITE_PLUS: case REGCOMP_INFINITE_STAR: case REGCOMP_PAREN_LEVEL_INVALID: case REGCOMP_NEED_SAVE_PAREN_INFO: case REGCOMP_CORRUPTED_TEXT: case REGCOMP_BAD_BRACE: case REGCOMP_INVALID_OCTAL_VALUE: case REGCOMP_EMPTY_CLASS: str = "regcomp error: %s\n" " pattern「%s」\n"; report = xmalloc(strlen((void*)str) + strlen((void*)regcomp_errstr[regcomp_error]) + pat_len + 20 + pat_len + 1); sprintf(report,str,regcomp_errstr[regcomp_error],regcomp_last_pattern); if (regcomp_eptr - regcomp_last_pattern <= pat_len) { int i = regcomp_eptr - regcomp_last_pattern; strcat(report, " before --"); while (i-- > 0) strcat(report, "-"); strcat(report, "^\n"); } return report; } } /* * Free any (non-temporary) memory associated with the given REGEX_T. * The REGEX_T itself is not freed, however, as it wasn't allocated here. * Safe to re-free a free'd regex. 
*/ void regfree(regex_t *r) { if (r) { if (r->buf) free(r->buf); if (r->musthave) free(r->musthave); r->buf = r->musthave = 0; } } const unsigned char *regmusthave(const regex_t *r) { return r->musthave; } #ifndef NO_SHOWREGEX /* * Another debug routine... this time prints a real compiled pattern. */ static void showbuf(const unsigned char *origb, const unsigned char *b, const unsigned char *b_end, const char *margin) { while (b < b_end) { enum TYPE type = get_type_from_type_byte(*b); unsigned char count = get_count_from_type_byte(*b); if (margin) output(margin); outputf(" %3ld ", b - origb); b += TYPE_BYTE_SIZE; switch (type) { default: outputf("unknown code [%x|%x]", type, count); break; case REGEX_MATCH_NOTE_WORD: output("match (note word)"); break; case REGEX_MATCH: output("match"); break; case EXACT_ICASE_ASCII: outputf("exact ascii (ignore case) [%c] ", *b); b += ASCII_SPEC_SIZE; break; case EXACT1: outputf("exact1「%c」", *b); b += ASCII_SPEC_SIZE; break; case ANY_CHAR: output("ANY CHAR "); break; case ANY_ASCII: output("ANY ASCII"); break; case ANY_MULTIBYTE: output("ANY MULTIBYTE"); break; case ANY_KATAKANA: output("ANY KATAKANA"); break; case ANY_NONKATAKANA: output("ANY NON-KATAKANA"); break; case ANY_HIRAGANA: output("ANY HIRAGANA"); break; case ANY_NONHIRAGANA: output("ANY NON-HIRAGANA"); break; case ANY_KANJI: output("ANY KANJI"); break; case ANY_NONKANJI: output("ANY NON-KANJI"); break; case EXACT_ICASE_KANA: outputf("exact kana (ignore case) 「%c%c」", b[0], b[1]); b += EUC_SPEC_SIZE; break; case EXACT2: outputf("exact2 「%c%c」", b[0], b[1]); b += EUC_SPEC_SIZE; break; case EITHER_EXACT_2: outputf("exact2 「%c%c」*or*「%c%c」", b[0], b[1], b[2], b[3]); b += 2 * EUC_SPEC_SIZE; break; case EXACT3: outputf("exact3 「%c%c%c」", b[0], b[1], b[3]); b += 3; break; case CLASS: { int i; int inverted = b[0]; unsigned short euc2_len = read_short_val(b+1); unsigned short euc3_len = read_short_val(b+3); outputf("%sclass「", inverted ? 
"inverted " : ""); b += 5; /* skip above stuff */ for (i = 0; i < 128; i++) { if (b[i]) { if (isprint(i)) outchar(i); else outputf("\\%03o", i); } } b += 128; while (euc2_len-- != 0) outchar(*b++); while (euc3_len-- != 0) outchar(*b++); output("」"); } break; case WORD_BOUNDARY: output("word boundary"); break; case START_OF_LINE: output("start of line"); break; case END_OF_LINE: output("end of line"); break; case PUSH: outputf("push %ld", (b - TYPE_BYTE_SIZE) - origb + read_short_val(b)); b += SHORT_VAL_SIZE; break; case JUMP: outputf("jump to %ld", (b - TYPE_BYTE_SIZE) - origb + read_short_val(b)); b += SHORT_VAL_SIZE; break; case PUSH_JUMP: outputf("pushjump to %ld", (b - TYPE_BYTE_SIZE) - origb + read_short_val(b)); b += SHORT_VAL_SIZE; break; #ifndef NO_PAREN_INFO case SAVE_CLOSE_PAREN_PUSH_JUMP: outputf("save close paren %d, count of %d, pushjump to %ld", b[1], b[0], (b - TYPE_BYTE_SIZE) - origb + read_short_val(b+2)); b += PAREN_COUNT_SIZE + PAREN_SPEC_SIZE + SHORT_VAL_SIZE; break; case PUSH_SAVE_OPEN_PAREN: outputf("push to %ld, save open paren %d", (b - TYPE_BYTE_SIZE) - origb + read_short_val(b), b[SHORT_VAL_SIZE]); b += SHORT_VAL_SIZE + PAREN_SPEC_SIZE; break; case SAVE_OPEN_PAREN: outputf("save open paren %d", b[0]); b += PAREN_SPEC_SIZE; break; case SAVE_CLOSE_PAREN: outputf("save close paren %d", b[0]); b += PAREN_SPEC_SIZE; break; case MATCH_PREV_PAREN: outputf("match paren group %d", b[0]); b += PAREN_SPEC_SIZE; break; #endif } if (type < count_ok_limit) switch (count) { default: outputf("<>", b[-1]); break; case ONCE: output(" "); break; case ZERO_OR_ONE: output(" ?"); break; case ONE_OR_MORE: output(" +"); break; case ANY_NUMBER: output(" *"); break; } outchar('\n'); } } void showregex(const regex_t *r) { outputf("Minimum length match: %d\n", r->min_length_match); if (r->anchor_to_start_of_line) output("START OF LINE ONLY\n"); if (r->musthave) outputf("A line must have [%s]\n", r->musthave); #ifndef NO_PAREN_INFO outputf("Max paren level %d, max lpc 
is %d/%d.\n", r->max_paren_level, r->max_lpc_l, r->max_lpc_c); if (r->paren_info_required) outputf("requires paren info for %d parens\n", r->paren_info_required); #endif showbuf(r->buf, r->buf, r->bufend, ""); } #endif /* compile routines above */ /****************************************************************************/ /****************************************************************************/ /****************************************************************************/ /* execute routines below */ #ifndef FAST_REGEXEC unsigned int special_debug; #endif static unsigned int regexec_flags = 0; /* set the regexec_flags, returning old value. */ unsigned int regexec_setflags(unsigned flags) { unsigned int old = regexec_flags; regexec_flags = flags; return old; } #ifndef NO_REGEXEC_MATCH_POINTS const unsigned char *regexec_match_start; const unsigned char *regexec_match_end; int regexec_match_at_start_of_word, regexec_match_at_end_of_word; #endif #ifndef NO_PAREN_INFO unsigned regexec_paren_info_used = 0; #ifndef NO_DEFAULT_PAREN_INFO #ifndef DEFAULT_PAREN_INFO_SIZE # define DEFAULT_PAREN_INFO_SIZE 10 #endif matched_paren_t default_regexec_paren_info[DEFAULT_PAREN_INFO_SIZE]; matched_paren_t *regexec_paren_info = &default_regexec_paren_info[0]; unsigned regexec_paren_info_size = DEFAULT_PAREN_INFO_SIZE; #else matched_paren_t *regexec_paren_info = 0; unsigned regexec_paren_info_size = 0; #endif #ifndef FAST_REGEXEC static void report_new_pareninfo(const char *message, unsigned num) { if (message == 0) message = ""; if (num >= regexec_paren_info_size) outputf("paren_info[%d] %s OUT OF RANGE (%d)\n", num, message, regexec_paren_info_size); else if (regexec_paren_info[num].match_start == 0 || regexec_paren_info[num].match_end == 0) outputf("paren_info[%d] %s \n", num, message); else outputf("paren_info[%d] %s, now [%.*s]\n", num, message, (int)(regexec_paren_info[num].match_end - regexec_paren_info[num].match_start), regexec_paren_info[num].match_start); } #endif 
/* FAST_REGEXEC */ #endif /* NO_PAREN_INFO */ /* XXX - need to deal here with three-byte EUC */ #ifdef DONT_WORRY_ABOUT_KATAKANA_DASH_BEING_PART_OF_A_WORD #define word_boundary(line) (jregex_part_of_word[(line)[-1]] != \ jregex_part_of_word[(line)[0]]) #else static __inline__ int word_boundary(const unsigned char *line) { unsigned l = line[0]; unsigned c = line[-1]; if (c & 0x80) { /* special case for katakana and 'ー' (both part of word) */ unsigned char c2 = c; c = line[-2]; if (c == KATA_DASH_HI_BYTE && l == EUC_KATAKANA && c2 == KATA_DASH_LO_BYTE) return 0; if (l == KATA_DASH_HI_BYTE && c == KATA_DASH_HI_BYTE && c2 == KATA_DASH_LO_BYTE) return 0; } return jregex_part_of_word[c] != jregex_part_of_word[l]; } #endif /* * Execute the given compiled REGEX_T on the given LINE (of the given LENGTH) * and return true if it matches, false otherwise. */ unsigned int regexec(const regex_t *r, const unsigned char *line, unsigned len) { const unsigned char *b = r->buf; /* compiled pattern */ const unsigned char *orig_line = line; /* just to save */ const unsigned char *end_of_line = &line[len]; /* note end of line */ #define bol(line) ((line) == orig_line) #define eol(line) ((line) >= end_of_line) #ifndef NO_LLM_SUPPORT /* llm is true if doing a longest leftmost match */ const unsigned char llm = (regexec_flags & REGEXEC_LLM) ? 1 : 0; const unsigned char *longest_match_so_far = 0; /* for when llm is true */ #endif #define is_word_boundary(line) \ ( \ (orig_line != end_of_line) && \ ( \ (bol(line) && jregex_part_of_word[(line)[0]]) || \ (eol(line) && jregex_part_of_word[end_of_line[-1]]) || \ word_boundary(line) \ ) \ ) /* * Since we know the length of the shortest string possible, we can * figure out the latest place in our string where we can possibly * start a match from. However, if we can only match from the start * of a line, the LATEST_START will then be the start of the line. * Whichever is the case, note it. 
*/ const unsigned char *latest_start = r->anchor_to_start_of_line ? line : end_of_line - r->min_length_match; /* * The LINE_HEAD will be from where we try to apply the regex, and what, * if failing, we bump up to try again from a new position (until we run * out of string [i.e. try to start beyond latest_start] and finally * fail completely). */ const unsigned char *line_head = line; static unsigned maxstates = 0; static struct statestruct { const unsigned char *b; const unsigned char *line; #ifndef NO_PAREN_INFO signed short highest_paren_seen; unsigned char pushed_paren_count; unsigned char pushed_paren_start; #endif } *statestack_start = 0, *statestack_end; struct statestruct *state; /* we'll need at most one more than one state per char in the input line */ /* -- Mmm, maybe need two per char -- actually, probably dependent on max paren depth */ unsigned int maxstates_wanted = 2 * len + 10; /* extra for good measure */ #ifndef NO_PAREN_INFO signed short highest_paren_seen = -1; static unsigned parenstate_size = 0; static const unsigned char **parenstate_base = 0; #ifndef NDEBUG static const unsigned char **parenstate_end; #endif const unsigned char **parenstate; /* "used uninitialized" OK here. 
*/ const unsigned char **max_parenstate; if (r->max_lpc_l) { /* make sure we have pleanty of paren-state stuff */ unsigned max_parenstates_needed = 2 /* 2 for good measure */ + 2 * ((len + r->max_lpc_c - 1)/r->max_lpc_c * r->max_lpc_l); if (max_parenstates_needed > parenstate_size) { if (parenstate_base) free(parenstate_base); parenstate_size = max_parenstates_needed; parenstate_base = xmalloc(sizeof(unsigned char *)*parenstate_size); #ifndef NDEBUG parenstate_end = &parenstate_base[parenstate_size]; #endif } parenstate = parenstate_base; } max_parenstate = parenstate_base; #define push_paren_state(NUM) \ macro_start { \ unsigned int _num_ = (NUM); \ STATS(regex_stats.parens_pushed++); \ FASTDEBUGSTUFF(if (regexec_flags & REGEXEC_DEBUG) { \ outputf("push_paren_state #%ld (%d) %lx\n", \ parenstate - parenstate_base, _num_, \ (unsigned long)regexec_paren_info[_num_].match_start); \ report_new_pareninfo("push", _num_); \ }) \ assert(r->max_lpc_l); \ assert(parenstate < parenstate_end); \ parenstate[0] = regexec_paren_info[_num_].match_start; \ parenstate[1] = regexec_paren_info[_num_].match_end; \ parenstate += 2; \ if (parenstate > max_parenstate) \ max_parenstate = parenstate; \ regexec_paren_info[_num_].match_end = 0; \ } macro_end #define pop_paren_state(NUM) \ macro_start { \ unsigned int _num_ = (NUM); \ STATS(regex_stats.parens_popped++); \ parenstate -= 2; \ assert(r->max_lpc_l); \ assert(parenstate >= parenstate_base); \ regexec_paren_info[_num_].match_start = parenstate[0]; \ regexec_paren_info[_num_].match_end = parenstate[1]; \ FASTDEBUGSTUFF(if (regexec_flags & REGEXEC_DEBUG) { \ outputf("pop_paren_state #%ld (%d) %lx\n", parenstate - \ parenstate_base, _num_, (unsigned long)parenstate[0]); \ report_new_pareninfo("pop", _num_); \ }) \ } macro_end #endif if (maxstates_wanted > maxstates) { /* need more memory. 
First get rid of any we had before */ if (statestack_start) free(statestack_start); maxstates = maxstates_wanted; statestack_start = xmalloc(sizeof(struct statestruct) * maxstates); #ifndef NDEBUG statestack_end = &statestack_start[maxstates]; #endif } state = &statestack_start[0]; #define state_count() (state - statestack_start) #define raw_push_state(B, L) \ macro_start { \ STATS(regex_stats.states_pushed++; \ if (state_count() > regex_stats.max_state_depth) \ regex_stats.max_state_depth = state_count()); \ FASTDEBUGSTUFF( \ assert(state < statestack_end); \ if (regexec_flags & REGEXEC_DEBUG) \ outputf("state #%ld now <%ld,%ld>\n", \ state_count(), (B) - r->buf, (L) - orig_line);) \ state->b = (B); \ state->line = (L); \ state++; \ } macro_end #define raw_pop_state(B, L) \ macro_start { \ --state; \ (B) = state->b; \ (L) = state->line; \ STATS(regex_stats.states_popped++); \ } macro_end #ifndef NO_PAREN_INFO #define push_state_with_paren_info(B, L, COUNT, START) \ macro_start { \ unsigned int _count = (COUNT); /* get arg */ \ unsigned int _start = (START); /* get arg */ \ state->highest_paren_seen = highest_paren_seen; \ if (_start >= regexec_paren_info_size) \ state->pushed_paren_count = 0; /* nothing to save */ \ else \ { \ unsigned _end_ = _start + _count; \ if (_end_ > regexec_paren_info_size) \ _count -= _end_ - regexec_paren_info_size; \ \ state->pushed_paren_start = _start; \ state->pushed_paren_count = _count; \ do { \ push_paren_state(start++); \ } while (--_count); \ } \ raw_push_state(B, L); \ } macro_end #define push_state(B, L) \ macro_start { \ state->highest_paren_seen = highest_paren_seen; \ state->pushed_paren_count = 0; \ raw_push_state(B, L); \ } macro_end #define pop_state(B, L) \ macro_start { \ raw_pop_state(B, L); \ highest_paren_seen = state->highest_paren_seen; \ \ if (state->pushed_paren_count) \ { \ int _count = state->pushed_paren_count; \ unsigned int _end = state->pushed_paren_start + _count - 1; \ if (_end >= 
regexec_paren_info_size) { \ _count -= _end - regexec_paren_info_size; \ _end = regexec_paren_info_size; \ } \ if (_count > 0) { \ pop_paren_state(_end); \ while (--_count > 0) \ pop_paren_state(--_end); \ } \ } \ } macro_end #else /* NO_PAREN_INFO */ #define pop_state(B, L) raw_pop_state(B, L); #define push_state(B, L) raw_push_state(B, L); #endif /* * We make ample use of the preprocessor here, since there is much * regularity in what goes on while checking the regex to the string. * Much code is repeated to avoid having to make decisions at regex * match time.... should speed things up quite a bit. */ /* * The four macros do_ONCE, do_ONE_OR_MORE, do_ANY_NUMBER, and * do_ZERO_OR_ONE represent the basic way the matching is done for * each of a simple match, `+', `*', and `?'. * * The arguments to these macros are, for any one kind of test, * * EXTRA_REGEX_BYTES: the number of bytes (besides the type-byte) * that this instance of the regex command set uses. * * TEST the test that indicates if the regex matches * at this point in the string. * * STRING_INCR How much to bump along the string when a match * is successful. */ #ifndef NO_REGEX_STATS # define do_TEST(TEST) (regex_stats.tests++, (TEST)) # define do_TEST_noEOL(TEST) (regex_stats.tests++, (!eol(line)&&(TEST))) #else # define do_TEST(TEST) (TEST) # define do_TEST_noEOL(TEST) (!eol(line) && (TEST)) #endif #define do_ONCE(EXTRA_REGEX_BYTES, TEST, STRING_INCR) \ /* if the test fails, we have no match */ \ if (!do_TEST_noEOL(TEST)) \ goto nomatch; \ \ /* Otherwise, bump along the regex buffer pointer */ \ /* the string to reflect that the match has succeeded */ \ b_if_match = b + TYPE_BYTE_SIZE + (EXTRA_REGEX_BYTES); \ line += (STRING_INCR); \ goto match; #define do_ONE_OR_MORE(EXTRA_REGEX_BYTES, TEST, STRING_INCR) \ /* if the test fails, we have no match */ \ if (!do_TEST_noEOL(TEST)) \ goto nomatch; \ \ /* at this point we know we'll match. 
*/ \ b_if_match = b + TYPE_BYTE_SIZE + (EXTRA_REGEX_BYTES); \ \ /* Since it's "or more", continue matching and bumping along */ \ /* the string so long as the test matches. We'll push the state */ \ /* after all but the first match, since all but the first are */ \ /* optional and we may have to backtrack and retry w/o having */ \ /* taken the match */ \ while (line += STRING_INCR, do_TEST_noEOL(TEST)) \ push_state(b_if_match, line); \ STATS( \ regex_stats.states_pushed++; \ regex_stats.states_popped++; \ ) \ goto match; #define do_ANY_NUMBER(EXTRA_REGEX_BYTES, TEST, STRING_INCR) \ STATS( \ regex_stats.states_pushed++; \ regex_stats.states_popped++; \ ) \ \ /* since it's any number, zero is fine to, so we *know* we match */ \ b_if_match = b + TYPE_BYTE_SIZE + (EXTRA_REGEX_BYTES); \ \ /* nibble along string, pushing state, so long as we match */ \ if (do_TEST_noEOL(TEST)) do { \ push_state(b_if_match, line); \ } while (line += (STRING_INCR), do_TEST_noEOL(TEST)); \ /* if we failed because we were at the end of the line, there */ \ /* was an effective push/test/fail/pop that we optimized out, */ \ /* so we account for them here */ \ goto match; #define do_ZERO_OR_ONE(EXTRA_REGEX_BYTES, TEST, STRING_INCR) \ /* since zero is fine, we *know* we match */ \ b_if_match = b + TYPE_BYTE_SIZE + (EXTRA_REGEX_BYTES); \ /* but if the test is OK, push state and nibble */ \ if (do_TEST_noEOL(TEST)) { \ push_state(b_if_match, line); \ line += (STRING_INCR); \ } \ STATS( else { \ regex_stats.states_pushed++; \ regex_stats.states_popped++; \ }) \ goto match; /* * The following just applies the four above to the case statement * below. A, B, and C are EXTRA_REGEX_BYTES, TEST, and STRING_INC, * but I couldn't make it fit prettily (-: * * For fun, check out what the output of the preprocessor looks like. 
*/ #define case_major(MAJOR, PRE, A, B, C) \ case make_type_byte_with_count(MAJOR, ONCE ): \ { PRE; do_ONCE(A,B,C) } \ case make_type_byte_with_count(MAJOR, ZERO_OR_ONE): \ { PRE; do_ZERO_OR_ONE(A,B,C) } \ case make_type_byte_with_count(MAJOR, ONE_OR_MORE): \ { PRE; do_ONE_OR_MORE(A,B,C) } \ case make_type_byte_with_count(MAJOR, ANY_NUMBER ): \ { PRE; do_ANY_NUMBER (A,B,C) } #ifndef NO_REGEXEC_MATCH_POINTS regexec_match_start = line; #endif #ifndef NO_PAREN_INFO if (regexec_paren_info == 0) regexec_paren_info_size = 0; /* no info, so no size */ else { /* clear out what we might potentially set */ int i = regexec_paren_info_size < r->max_paren_level ? regexec_paren_info_size : r->max_paren_level; while (i-- > 0) { regexec_paren_info[i].match_end = 0; FASTDEBUGSTUFF(if (regexec_flags & REGEXEC_DEBUG) report_new_pareninfo("init", i)); } } if (r->paren_info_required > regexec_paren_info_size) return 0; /* we need the paren state */ #endif STATS( if (!r->anchor_to_start_of_line) { latest_start = end_of_line; } ) for (;;) { const unsigned char *b_if_match; STATS(regex_stats.cycles++); FASTDEBUGSTUFF(if (regexec_flags & REGEXEC_DEBUG) { outputf("%ld -----------------------------: [%.*s]\n", state_count(), (int)(end_of_line - line), line); showbuf(r->buf, b, r->bufend, ">>> "); }) switch(b[0]) { #ifndef NO_REGEXEC_MATCH_POINTS case make_type_byte(REGEX_MATCH_NOTE_WORD): #ifndef NO_LLM_SUPPORT if (llm && (line <= longest_match_so_far)) goto nomatch; #endif regexec_match_at_start_of_word = is_word_boundary(regexec_match_start); regexec_match_at_end_of_word = is_word_boundary(line); /**** FALLTHROUGH ****/ #endif case make_type_byte(REGEX_MATCH): #ifndef NO_LLM_SUPPORT if (llm && (line <= longest_match_so_far)) { FASTDEBUGSTUFF( if (regexec_flags & REGEXEC_DEBUG) outputf("** pattern matches, but is too short **\n"); ) goto nomatch; } longest_match_so_far = line; #endif FASTDEBUGSTUFF(if (regexec_flags & REGEXEC_DEBUG) outputf("*** pattern matches [llm=%d]***\n", llm);) 
#ifndef NO_REGEXEC_MATCH_POINTS regexec_match_end = line; #endif #ifndef NO_PAREN_INFO regexec_paren_info_used = highest_paren_seen + 1; FASTDEBUGSTUFF(if (regexec_flags & REGEXEC_DEBUG) { int i; outputf("regexec_paren_info_used is %d\n", regexec_paren_info_used); for (i = 0; i < regexec_paren_info_used; i++) report_new_pareninfo("return", i); }) #endif #ifndef NO_LLM_SUPPORT if (llm) goto nomatch; /* look for more */ #endif return 1; /* match! */ default: outputf("<>\n", b - r->buf, b[0]); return 0; case_major(EXACT1, /* Major type. */ , /* Prep space. */ 1, /* Extra bytes in regex buffer. */ b[1] == line[0], /* The test itself. */ 1) /* Bytes to move in string if match. */ case_major(EXACT_ICASE_ASCII, /* Major type. */ , /* Prep space. */ ASCII_SPEC_SIZE, /* Extra bytes in regex buffer. */ b[1] == case_translation[line[0]], /* Test itself. */ SIZEOF_ASCII) /* Bytes to move in string if match. */ case_major(EXACT2, /* Major type. */ , /* Prep space. */ EUC_SPEC_SIZE, /* Extra bytes in regex buffer. */ b[2] == line[1] && b[1] == line[0], /* Test. */ SIZEOF_EUC) /* Bytes to move in string if match. */ case_major(EITHER_EXACT_2, /* Major type. */ , /* Prep space. */ EUC_SPEC_SIZE * 2 , /* Extra bytes in regex buffer. */ (b[2] == line[1] && b[1] == line[0]) || (b[4] == line[1] && b[3] == line[0]), /* Test. */ SIZEOF_EUC) /* Bytes to move in string if match. */ case_major(EXACT_ICASE_KANA, /* Major type. */ , /* Prep space. */ EUC_SPEC_SIZE, /* Extra bytes in regex buffer. */ b[2] == line[1] && (line[0] == EUC_HIRAGANA || /* Test. */ line[0] == EUC_KATAKANA), /* Test. */ SIZEOF_EUC) /* Bytes to move in string if match. */ case_major(EXACT3, /* Major type. */ , /* Prep space. */ 3, /* Extra bytes in regex buffer. */ b[1] == line[0] && /* Test. */ b[3] == line[2] && /* Test. */ b[2] == line[1], /* Test. */ 3) /* Bytes to move in string if match. */ case_major(ANY_CHAR, /* Major type. */ , /* Prep space. */ 0, /* Extra bytes in regex buffer. */ line[0] != '\n', /* Test. 
*/ CHAR_LENGTH(line[0])) /* Bytes to move if match. */ case_major(ANY_ASCII, /* Major type. */ , /* Prep space. */ 0, /* Extra bytes in regex buffer. */ (line[0] & 0x80) == 0 && line[0] != '\n', /* Test. */ SIZEOF_ASCII) /* Bytes to move in string if match. */ case_major(ANY_MULTIBYTE, /* Major type. */ unsigned Len, /* prep space */ 0, /* Extra bytes in regex buffer. */ (Len = CHAR_LENGTH(line[0]), Len > 1), /* Test. */ Len) /* Bytes to move if match. */ case_major(ANY_KATAKANA, /* Major type. */ , /* Prep space. */ 0, /* Extra bytes in regex buffer. */ (line[0] == EUC_KATAKANA || /* Test. */ (line[0] == KATA_DASH_HI_BYTE && /* Test. */ line[1] == KATA_DASH_LO_BYTE)), /* Test. */ SIZEOF_EUC) /* Bytes to move in string if match. */ case_major(ANY_NONKATAKANA, /* Major type. */ , /* Prep space. */ 0, /* Extra bytes in regex buffer. */ !(line[0] == EUC_KATAKANA || (line[0] == KATA_DASH_HI_BYTE && /* Test. */ line[1] == KATA_DASH_LO_BYTE)) && /* Test. */ line[0] != '\n', /* Test. */ CHAR_LENGTH(line[0])) /* Bytes to move if match. */ case_major(ANY_HIRAGANA, /* Major type. */ , /* Prep space. */ 0, /* Extra bytes in regex buffer. */ line[0] == EUC_HIRAGANA, /* Test. */ SIZEOF_EUC) /* Bytes to move in string if match. */ case_major(ANY_NONHIRAGANA, /* Major type. */ , /* Prep space. */ 0, /* Extra bytes in regex buffer. */ line[0] != EUC_HIRAGANA && line[0] != '\n', /* Test. */ CHAR_LENGTH(line[0])) /* Bytes to move if match. */ case_major(ANY_KANJI , /* Major type. */ , /* Prep space. */ 0, /* Extra bytes in regex buffer. */ line[0] >= LEAST_KANJI_HI_BYTE && line[0] <= GREATEST_KANJI_HI_BYTE, /* Test. */ SIZEOF_EUC) /* Bytes to move in string if match. */ case_major(ANY_NONKANJI, /* Major type. */ , /* Prep space. */ 0, /* Extra bytes in regex buffer. */ ((line[0] < LEAST_KANJI_HI_BYTE && line[0] != '\n') || line[0] > GREATEST_KANJI_HI_BYTE), /* Test. */ CHAR_LENGTH(line[0])) /* Bytes to move if match. */ case_major(CLASS, /* Major type. 
*/ unsigned short euc2_length = read_short_val(b+2); unsigned short euc3_length = read_short_val(b+4); unsigned char c; unsigned Len; int inverted = b[1]; int foldkana = r->fold_kcase; , /* pattern space size for this command, less type byte */ DETERMINISTIC_CLASS_SIZE - 1 + euc2_length + euc3_length, ((Len = EUC_CHAR_LENGTH(c = line[0])), ( ((Len == 1) && (b+6)[c]) || ((Len == 2) && euc2_length && twobyte_char_in_string ((foldkana ? case_translation[c] : c), line[1], b+6+128, b+6+128+euc2_length)) || ((Len == 3) && euc3_length && threebyte_char_in_string (c, line[1], line[2], b+6+128+euc2_length, b+6+128+euc2_length+euc3_length)) ) == !inverted), CHAR_LENGTH(line[0])) case make_type_byte(WORD_BOUNDARY): { if (!do_TEST(is_word_boundary(line))) goto nomatch; b_if_match = b + TYPE_BYTE_SIZE; goto match; } case make_type_byte(START_OF_LINE): if (!do_TEST(bol(line))) goto nomatch; b_if_match = b + TYPE_BYTE_SIZE; goto match; case make_type_byte(END_OF_LINE): if (do_TEST(!eol(line) && line[0] != '\n')) goto nomatch; b_if_match = b + TYPE_BYTE_SIZE; goto match; case make_type_byte(PUSH): push_state(b + read_short_val(b+TYPE_BYTE_SIZE), line); b += SIZEOF_PUSH_COMMAND; continue; case make_type_byte(JUMP): b += read_short_val(b+TYPE_BYTE_SIZE); continue; case make_type_byte(PUSH_JUMP): push_state(b + TYPE_BYTE_SIZE + SHORT_VAL_SIZE, line); b += read_short_val(b+TYPE_BYTE_SIZE); continue; #ifndef NO_PAREN_INFO case make_type_byte(MATCH_PREV_PAREN): { unsigned level = b[TYPE_BYTE_SIZE]; unsigned Len; assert(level < regexec_paren_info_size); if (level > highest_paren_seen || regexec_paren_info[level].match_start == 0 || regexec_paren_info[level].match_end == 0) goto nomatch; Len = regexec_paren_info[level].match_end - regexec_paren_info[level].match_start; if (Len != 0) { const unsigned char *A = line; const unsigned char *A_end = A + Len; const unsigned char *B = regexec_paren_info[level].match_start; if (A_end> end_of_line) goto nomatch; FASTDEBUGSTUFF(if (regexec_flags & 
REGEXEC_DEBUG) outputf("content len is %d, strings are [%.*s] [%.*s]\n", Len, (int)Len, A, (int)Len, B);); do { if (isascii(*A)) { if (r->fold_acase == 0 || !isalpha(*A)) { if (*A != *B) goto nomatch; } else { if (!isascii(*B) || !isalpha(*B)) goto nomatch; if ((islower(*B) ? *B : tolower(*B)) != (islower(*A) ? *A : tolower(*A))) goto nomatch; } A++; B++; } else { if (r->fold_kcase == 0 || (A[0] != EUC_HIRAGANA && A[0] != EUC_KATAKANA)) { if (A[0] != B[0] || A[1] != B[1]) goto nomatch; } else { if ((B[0] != EUC_HIRAGANA && B[0] != EUC_KATAKANA) || A[1] != B[1]) goto nomatch; } A += 2; B += 2; } } while (A < A_end); line += Len; } b_if_match = b + TYPE_BYTE_SIZE + PAREN_SPEC_SIZE; goto match; } case make_type_byte(SAVE_OPEN_PAREN): { unsigned level = b[TYPE_BYTE_SIZE]; STATS(regex_stats.parens_entered++); if (level < regexec_paren_info_size) { regexec_paren_info[level].match_start = line; regexec_paren_info[level].match_end = 0; if ((signed int)level > highest_paren_seen) highest_paren_seen = level; FASTDEBUGSTUFF(if (regexec_flags & REGEXEC_DEBUG) { outputf("highest_paren_seen now %d\n", highest_paren_seen); report_new_pareninfo("open", level);}); } b_if_match = b + TYPE_BYTE_SIZE + 1; goto match; } case make_type_byte(PUSH_SAVE_OPEN_PAREN): { unsigned level = b[TYPE_BYTE_SIZE+SHORT_VAL_SIZE]; push_state(b + read_short_val(b+TYPE_BYTE_SIZE), line); STATS(regex_stats.parens_entered++); if (level < regexec_paren_info_size) { regexec_paren_info[level].match_start = line; regexec_paren_info[level].match_end = 0; if ((signed int)level > highest_paren_seen) highest_paren_seen = level; FASTDEBUGSTUFF(if (regexec_flags & REGEXEC_DEBUG) { outputf("highest_paren_seen now %d\n", highest_paren_seen); report_new_pareninfo("open2", level);}) } b_if_match = b + SIZEOF_PUSH_SAVE_OPEN_PAREN_COMMAND; goto match; } case make_type_byte(SAVE_CLOSE_PAREN_PUSH_JUMP): { unsigned count = b[TYPE_BYTE_SIZE]; unsigned start = b[TYPE_BYTE_SIZE+PAREN_COUNT_SIZE]; 
STATS(regex_stats.parens_saved++); if (start < regexec_paren_info_size) { regexec_paren_info[start].match_end = line; if ((unsigned int)start > highest_paren_seen) highest_paren_seen = start; FASTDEBUGSTUFF(if (regexec_flags & REGEXEC_DEBUG) { outputf("highest_paren_seen now %d\n", highest_paren_seen); report_new_pareninfo("set1", start);}) } push_state_with_paren_info(b+TYPE_BYTE_SIZE+2+SHORT_VAL_SIZE, line, count, start); b += read_short_val(b+TYPE_BYTE_SIZE + PAREN_COUNT_SIZE + PAREN_SPEC_SIZE); continue; } case make_type_byte(SAVE_CLOSE_PAREN): { unsigned level = b[TYPE_BYTE_SIZE]; STATS(regex_stats.parens_saved++); if (level < regexec_paren_info_size) { regexec_paren_info[level].match_end = line; if ((unsigned int)level > highest_paren_seen) highest_paren_seen = level; FASTDEBUGSTUFF(if (regexec_flags & REGEXEC_DEBUG) { outputf("highest_paren_seen now %d\n", highest_paren_seen); report_new_pareninfo("set2", level);}) } b_if_match = b + TYPE_BYTE_SIZE + PAREN_SPEC_SIZE; goto match; } #endif /* NO_PAREN_INFO */ } assert(0); match: STATS(regex_stats.matches++); FASTDEBUGSTUFF(if (regexec_flags & REGEXEC_DEBUG) output("\n");) b = b_if_match; continue; nomatch: STATS(regex_stats.failures++); /* * If we failed, but have some states pushed onto the stack, * pop one off and continue from there. 
*/ if (state_count() > 0) { #ifndef NO_PAREN_INFO signed int old_highest_paren_seen = highest_paren_seen; #endif pop_state(b, line); #ifndef NO_PAREN_INFO if (old_highest_paren_seen != highest_paren_seen) { while (old_highest_paren_seen > highest_paren_seen) { FASTDEBUGSTUFF(if (regexec_flags & REGEXEC_DEBUG) report_new_pareninfo("flushing", old_highest_paren_seen)); regexec_paren_info[old_highest_paren_seen--].match_start=0; } } #endif FASTDEBUGSTUFF( if (special_debug) { outputf("popping state #%ld (total %ld): [%.*s|%s] regex %d.\n", state_count(), #ifdef NO_REGEX_STATS -1, #else regex_stats.states_popped, #endif line - orig_line, orig_line, line, b - r->buf); } else if (regexec_flags & REGEXEC_DEBUG) outputf("++abort, popping state #%ld (total %ld popped)++: <%ld, %ld, p%d>\n", state_count(), #ifdef NO_REGEX_STATS -1, #else regex_stats.states_popped, #endif b - r->buf, line - orig_line, #ifndef NO_PAREN_INFO highest_paren_seen #else -1 #endif );) } else { #ifndef NO_LLM_SUPPORT if (llm && (longest_match_so_far != 0)) return 1; /* we had a match last time */ #endif /* bump past the currently first character */ line_head += CHAR_LENGTH(line_head[0]); /* are we too late? */ if (line_head > latest_start) { FASTDEBUGSTUFF(if (regexec_flags & REGEXEC_DEBUG) output("*** regex failed ***\n");) return 0; } STATS(regex_stats.states_pushed++); /* effectively*/ STATS(regex_stats.states_popped++); /* effectively*/ line = line_head; /* Start from the head of the line... */ b = r->buf; /* ... 
and head of the regex */ FASTDEBUGSTUFF(if (regexec_flags & REGEXEC_DEBUG) outputf("++abort, moving along line: line=%lx, latest=%lx, end=%lx++\n", (long)line, (long)latest_start, (long)end_of_line);) #ifndef NO_PAREN_INFO highest_paren_seen = -1; #endif #ifndef NO_REGEXEC_MATCH_POINTS regexec_match_start = line; #endif } continue; } /* notreached */ } lookup-1.08b.orig/lib/jregex.h0100644000014400001440000002534406076503272015722 0ustar nakaharastaff#ifndef __JREGEX_H__ /* file wrapper */ #define __JREGEX_H__ #define jregex_version 106 /* 1.06 */ /* * Jeffrey Friedl * Omron Corporation オムロン(株) * Nagaokakyoshi, Japan 〒617長岡京下海印寺 * * jfriedl@nff.ncl.omron.co.jp * * This work is placed under the terms of the GNU General Purpose License * (the "GNU Copyleft"). * * * Jeffrey's REGEX routines. * October 1993 * * To apply a regular expression to a string, do something like: * * * int match(const unsigned char *pattern, const unsigned char *string) * { * regex_t compiled; * * if (regcomp(&compiled, pattern, 0) != 0) * return 0; <-- bad pattern * else * { * int doesmatch = regexec(&compiled, string, strlen(string)); * regfree(&compiled); * return doesmatch; * } * } * * These routines are optimized so that the actual matching routine * (regexec) is very fast, at the expense of the compile routine (regcomp). * * Patterns and strings consist of any mixture of 7-bit ASCII and 16 bit * EUC Japanese. * * A pattern consists of exact text to be matched, as well as the special * characters: * * ( ) For grouping. * < > For beginning/end of word (actually, same as \b) * ^ $ For beginning/end of line. * * Zero or more of the previous thing. * ? Zero or one of the previous thing. * + One or more of the previous thing * . Any character (ASCII or EUC) except newline. * \D Any character (ASCII OR EUC) not \d (see below) * \W Any character (ASCII OR EUC) not \w (see below) * \S Any character (ASCII OR EUC) not \s (see below) * \a Any ASCII character not \n * \A Any multibyte character. 
* \k Any katakana character, including ー. * \K Any non-katakana character except \n. * \h Any hiragana character. * \H Any non-hiragana character except \n. * \c Any jis208 kanji (kuten rows 16-84). * \C Anything not \c except \n. * \b Boundary between words. * * [ ] Indicate character classes. Within classes, none of the * above are special. If the first character after the open * bracket is '^', any character *not* specified in the * class will match. * * \n Newline. * \t Tab. * \r Carriage Return. * \f Form-feed. * \d A digit [0-9] * \w An ASCII word element [0-9a-zA-Z_] * \s Whitespace [\t \n\r\f] * \## \### Two or three digit octal specifier is that octal number. * Creating octal values above 127 will result in undefined * behavior. Probably unfun undefined behavior. #ifndef NO_PAREN_INFO * \# To match the parenthesized expression starting with * the #th paren. For example "(a)\1" will match "aa". #endif * * \ A backslash before a character (other than the backslash * combos indicated above) causes that character to be taken * raw... i.e. removes the specialness from '+', etc. * * The highest precedence is with '*', '?', and '+'. Then parens. * Then concatenation (sequences of characters, classes, etc). * Then '|'. * * Note that when case is folded, \k, \K, \h, and \H do NOT fold. */ /* struct used to hold a compiled regular expression */ typedef struct regexbuf regex_t; /* * Given a null-terminated pattern, a REGEX_T to fill, and some flags, * return one of the REGCOMP_* return values (e.g. REGCOMP_SUCCESS) */ extern int regcomp(regex_t *r, const unsigned char *pattern, unsigned flags); /* status returned by regcomp() */ #define REGCOMP_SUCCESS 0 /* yeah! */ #define REGCOMP_INTERNAL_ERROR 1 /* Mmmmm. 
*/ #define REGCOMP_INVALID_DATA 2 /* null pointer passed to */ #define REGCOMP_EMPTY_PATTERN 3 /* empty pattern */ #define REGCOMP_UNCLOSED_CLASS 4 /* forgot that ']' */ #define REGCOMP_UNMATCHED_OPEN_PAREN 5 /* more ( than ) */ #define REGCOMP_UNMATCHED_CLOSE_PAREN 6 /* less ( than ) */ #define REGCOMP_MISUSED_COUNT_SPEC 7 /* ill-used +,*, or ? */ #define REGCOMP_INFINITE_PLUS 8 /* something like "(x?)+" */ #define REGCOMP_INFINITE_STAR 9 /* something like "(x?)*" */ #define REGCOMP_EMPTY_CLASS 10 /* an empty clas [] */ #define REGCOMP_BAD_BRACE 11 /* misuse of | */ #ifndef NO_PAREN_INFO # define REGCOMP_PAREN_LEVEL_INVALID 12 /* \# when paren # not defined */ # define REGCOMP_NEED_SAVE_PAREN_INFO 13 /* has internal paren references*/ #endif #define REGCOMP_EUC_IN_CLASS_RANGE 14 /* EUC can't be part of range */ #define REGCOMP_INVALID_OCTAL_VALUE 15 /* something like \678 */ #define REGCOMP_CORRUPTED_TEXT 16 /* bad character codes in text */ /* flags accepted by regcomp() and regexec_setflags() */ /* turns on debugging */ #define REGCOMP_DEBUG 0x0001 #define REGEXEC_DEBUG 0x0002 #define REGEX_DEBUG (REGCOMP_DEBUG|REGEXEC_DEBUG) /* causes alphabetic case to not matter in comparing */ #define REGCOMP_IGNORE_ALPHA_CASE 0x0004 /* causes kana case (hiragana vs katakana) to not matter in comparing */ #define REGCOMP_IGNORE_KANA_CASE 0x0008 /* both of the above */ #define REGCOMP_IGNORE_CASE (REGCOMP_IGNORE_ALPHA_CASE|\ REGCOMP_IGNORE_KANA_CASE) /* * Used when you want a binary "does this line match or not" answer. * It will allow various shortcuts to be taken, such as ignoring * the final ``z*'' in the example pattern ``xyz*'' (as every line * that could match the pattern ``xyz*'' will also be matched by * the more simple ``xy''). * Note that this will tend to have strange but generally predictable * results upon regexec_match_start, regexec_match_end, and paren * info. 
*/ #define REGCOMP_JUST_MATCH 0x0010 /* * Tells regcomp to calculate the list of characters that any * matching line must have, and make it available via regmusthave(). */ #define REGCOMP_CALC_MUSTHAVE 0x0020 #ifndef NO_PAREN_INFO /* * If set tells regcomp to compile so that regexec will save info about * what text was matched by what parens. This is saved to * regexec_paren_info (but, of course, won't be if that variable is null). */ #define REGCOMP_SAVE_MATCHED_PAREN_INFO 0x0040 #endif /* * If set will cause regexec to set * regexec_match_at_start_of_word * and * regexec_match_at_end_of_word * upon exit. */ #define REGCOMP_WANT_WORD_MATCH_INFO 0x0080 /* * If set, will cause 々 to match "appropriately" either way. * When set, 時々 and 時時 will match each-other. */ #define REGCOMP_FUZZY_KANA_REPETITION 0x0100 /* * Has the regex run in longest-leftmost (for overall regex) mode. */ #define REGEXEC_LLM 0x0200 /****************************************************/ /* * Execute the given compiled REGEX_T on the given LINE (of the given LENGTH) * and return true if it matches, false otherwise. */ extern unsigned int regexec(const regex_t *r, const unsigned char *line, unsigned len); #ifndef NO_REGEXEC_MATCH_POINTS /* * When regexec returns true (a match), the following will point to the * start and end of the text that was actually matched. * Undefined otherwise. */ extern const unsigned char *regexec_match_start; extern const unsigned char *regexec_match_end; #endif /* * If requested via REGCOMP_WANT_WORD_MATCH_INFO, indicates if the match * started and/or ended at a word boundary. */ extern int regexec_match_at_start_of_word; extern int regexec_match_at_end_of_word; #ifndef NO_PAREN_INFO typedef struct { const unsigned char *match_start; const unsigned char *match_end; } matched_paren_t; /* * User fills in the following two if they want the matched paren info saved. 
*/ extern matched_paren_t *regexec_paren_info; extern unsigned regexec_paren_info_size; /* entries in regexec_paren_info[] */ /* While regexec will fill in this one telling how many are now valid. */ extern unsigned regexec_paren_info_used; /* entries in above actually used */ #endif /* * Used to set flags (such as REGEXEC_DEBUG) for regexec(). * The previous value is returned. If debugging has not been compiled in, * the value ~0 is returned. */ extern unsigned int regexec_setflags(unsigned int flags); /* * If the user sets this to a function, it will be called upon a memory * error. It shouldn't return. */ extern void (*regex_memory_error)(void); /* * Free any (non-temporary) memory associated with the given REGEX_T. * The REGEX_T itself is not freed, however, as it wasn't allocated here. * Safe to re-free a free'd regex. */ extern void regfree(regex_t *r); /* * Return a list of characters that will be part of every matching * line, if known. Regcomp must have been called with REGCOMP_CALC_MUSTHAVE, * and will return NULL if not. If there are no known required characters * (such as for "a|b"), an empty string ("") will be returned. */ extern const unsigned char *regmusthave(const regex_t *r); /* * When there's an error in regcomp, REGCOMP_ERROR is the error * number, and REGCOMP_EPTR (if non-null) points into the pattern * string near where the error was realized. */ extern unsigned int regcomp_error; extern const unsigned char *regcomp_eptr; extern const unsigned char *regcomp_last_pattern; extern const unsigned char *regcomp_error_report(void); #ifndef NO_SHOWREGEX /* for debugging */ extern void showregex(const regex_t *r); #endif /* * User doesn't need to be concerned with what's inside here, but we * have to make it available to the outside... 
*/ struct regexbuf { unsigned char *buf; unsigned char *bufend; unsigned int min_length_match; unsigned char *musthave; #ifndef NO_PAREN_INFO unsigned char max_paren_level; unsigned char max_lpc_l; unsigned char max_lpc_c; unsigned char paren_info_required; #define reg_max_paren_level_used(BUF) ((BUF)->max_paren_level) #define reg_max_paren_level_required(BUF) ((BUF)->paren_info_required) #endif unsigned anchor_to_start_of_line:1, fold_acase:1, fold_kcase:1; }; #ifndef NO_REGEX_STATS extern struct regex_stats { int states_pushed, states_popped, max_state_depth, parens_pushed, parens_popped, parens_entered, parens_saved, cycles, matches, failures, tests; } regex_stats; #endif /* NO_REGEX_STATS */ extern void regex_reset_stats(void); #endif /* file wrapper */ lookup-1.08b.orig/lib/output.h0100644000014400001440000000711106076511757015775 0ustar nakaharastaff#ifndef __OUTPUT_H__ /* file wrapper */ #define __OUTPUT_H__ /* * Jeffrey Friedl * Omron Corporation オムロン(株) * Nagaokakyoshi, Japan 〒617長岡京下海印寺 * * jfriedl@nff.ncl.omron.co.jp * * This work is placed under the terms of the GNU General Purpose License * (the "GNU Copyleft"). * * General output (a'la printf, etc.) routines that can do conversion * to various Japanese encoding methods. * * Probably won't really work if USE_LOCAL_OUTPUT isn't set. 
*/ #if !defined(__GNUC__) # if !defined(__volatile__) # define __volatile__ /*nothing; for use with volatile functions */ # endif # if !defined(__inline__) # define __inline__ /*nothing; for use with volatile functions */ # endif #endif #ifndef USE_LOCAL_OUTPUT #include #define flush_output() fflush(stdout) #define outputf printf #define output(string) fputs(string, stdout) #define outchar putchar #define select_output_style(foo) (0) #define show_output_style() /* nothing */ #define OUTPUT_PAGER 0 #define set_extra_output_file(fd) (-1) #else /* else USE_LOCAL_OUTPUT */ #if !PROVIDE_PAGER #define OUTPUT_PAGER 0 #else #define OUTPUT_PAGER 1 void output_pager_reset_more(void); int output_pager_transparent(int); unsigned output_pager_lines(unsigned); unsigned output_pager_columns(unsigned); int output_pager_status(int want); #endif extern void flush_output(void); extern int outputf(const char *fmt, ...); extern int output(const char *str); extern unsigned (*_output_char_function)(unsigned char); extern unsigned long select_output_style(unsigned long mods); extern void show_output_style(void); extern int set_extra_output_file(int fd); #define INQUIRE_ONLY 0x00000000 /* just returns old code */ #define EUC_OUTPUT 0x00000001 #define SJIS_OUTPUT 0x00000002 #define JIS_OUTPUT 0x00000004 #define _BASIC_OUTPUT_TYPE (JIS_OUTPUT|SJIS_OUTPUT|EUC_OUTPUT) #define JIS_1978_OUTPUT JIS_OUTPUT #define JIS_1983_OUTPUT JIS_OUTPUT|0x00000010 #define JIS_1990_OUTPUT JIS_OUTPUT|0x00000020 #define _JIS_KANJI_STYLE (JIS_1978_OUTPUT|JIS_1978_OUTPUT|JIS_1990_OUTPUT) #define JIS_ROMAN 0x00000100 #define JIS_ASCII 0x00000200 #define _JIS_ENGLISH_STYLE (JIS_ROMAN|JIS_ASCII) #define ELIDE_NONDISPLAYABLE 0x00001000 #define OUTPUT_NONDISPLAYABLE 0x00002000 #define SHOW_NONDISPLAYABLE_CODES 0x00004000 #define MARK_NONDISPLAYABLE 0x00008000 #define _NONDISPLAYABLE (ELIDE_NONDISPLAYABLE|OUTPUT_NONDISPLAYABLE|\ SHOW_NONDISPLAYABLE_CODES|MARK_NONDISPLAYABLE) #define PASS_HW_KATANANA 0x00010000 #define 
ELIDE_HW_KATAKANA 0x00020000 #define FOLD_HW_KATAKANA_TO_FULL 0x00040000 #define _KATAKANA (PASS_HW_KATANANA|ELIDE_HW_KATAKANA|\ FOLD_HW_KATAKANA_TO_FULL) #define SUPPORT_0212_1990 0x00100000 #define NO_0212_1990 0x00200000 #define _0212_1990 (SUPPORT_0212_1990|NO_0212_1990) #define outchar(c) (*_output_char_function)((unsigned char)(c)) extern void output_pager_reset_more(void); #endif /* end USE_LOCAL_OUTPUT */ extern __volatile__ void die(const char *fmt, ...); extern void warn(const char *fmt, ...); extern void output_buffer(const unsigned char *start, const unsigned char *end); #define OUTPUT_FILE_ERROR -1 #define JUST_CHECKING_OUTPUT_FILE -2 #define NO_OUTPUT_FILE -3 #define output_fd_valid(fd) ((fd) >= 0) #if !OUTPUT_PAGER # define output_pager_reset_more() /* nothing */ # define output_pager_transparent(val) (0) # define output_pager_lines(val) (0) # define output_pager_columns(val) (0) # define output_pager_status(val) (0) #endif #endif /* file wrapper */ lookup-1.08b.orig/lib/output.c0100644000014400001440000006154706173631472016000 0ustar nakaharastaff/* * Jeffrey Friedl * Omron Corporation オムロン(株) * Nagaokakyoshi, Japan 〒617長岡京下海印寺 * * jfriedl@nff.ncl.omron.co.jp * * This work is placed under the terms of the GNU General Purpose License * (the "GNU Copyleft"). */ #include "config.h" #include "assert.h" #include #include #include "input.h" #include "output.h" #ifdef USE_LOCAL_OUTPUT /* * Output routines. 
*/ #define OUTBUF_SIZE 500 #define EXTRA_BUFFER 12 static unsigned char outbuf[OUTBUF_SIZE + EXTRA_BUFFER]; static unsigned char *nextout = &outbuf[0]; #define bufend (&outbuf[OUTBUF_SIZE]) #define THREE_BYTE_HI 143 #define HALF_WIDTH_KATA_HI 142 unsigned const char *jis_start_208 = (unsigned const char *)"\33$@"; unsigned const char *jis_start_212 = (unsigned const char *)"\33$(D"; unsigned const char *jis_start_ASCII = (unsigned const char *)"\33(B"; unsigned const char *jis_start_kana = (unsigned const char *)"\33(I"; int flush_on_newline = 1; static int output_fd = 1 /* stdout */; static unsigned long output_style = JIS_OUTPUT | /* default output is JIS */ JIS_ASCII | /* use ASCII, not (JIS-ROMAN) when appropriate */ SHOW_NONDISPLAYABLE_CODES | NO_0212_1990 | PASS_HW_KATANANA; static int extra_fd = NO_OUTPUT_FILE; static void nondisplayable3(unsigned char a, unsigned char b, unsigned char c) { switch(output_style & _NONDISPLAYABLE) { default: soft_assert(0); break; case ELIDE_NONDISPLAYABLE: break; case OUTPUT_NONDISPLAYABLE: *nextout++ = a; *nextout++ = b; *nextout++ = c; break; case SHOW_NONDISPLAYABLE_CODES: outputf("\\%03\\%03\\%03", a, b, c); break; case MARK_NONDISPLAYABLE: output("★"); break; } } static void flush_raw_output(void) { unsigned len = nextout - outbuf; if (len != 0) { kibishii_assert(output_fd_valid(output_fd)||output_fd==NO_OUTPUT_FILE); if (output_fd_valid(output_fd)) if (write(output_fd, outbuf, len) < 0) { output_fd = OUTPUT_FILE_ERROR; kibishii_assert(0); } kibishii_assert(output_fd_valid(extra_fd) || extra_fd==NO_OUTPUT_FILE); if (output_fd_valid(extra_fd)) if (write(extra_fd, outbuf, len) < 0) { extra_fd = OUTPUT_FILE_ERROR; /* quietly shut it down */ kibishii_assert(0); } nextout = outbuf; } } #ifndef PROVIDE_PAGER # define PROVIDE_PAGER 1 #endif #if PROVIDE_PAGER static int column, line; extern unsigned (*_real_output_char_function)(unsigned char); static unsigned screen_height = 20; static unsigned screen_width = 80; #define 
SPECIAL_PRINT 0x01 #define REGULAR_PRINT 0x02 #define NO_STOP 0x04 #define TRANSPARENT 0x08 /* for watching out for ANSI escape sequences */ #define SEEN_ESCAPE 0x10 #define WAITING_FOR_M 0x20 #define isNORMAL (!(pager_status & (SEEN_ESCAPE|WAITING_FOR_M))) #define hasSEEN_ESCAPE (pager_status & SEEN_ESCAPE) #define isWAITING_FOR_M (pager_status & WAITING_FOR_M) #define setNORMAL (pager_status &= ~(SEEN_ESCAPE|WAITING_FOR_M)) #define setSEEN_ESCAPE ((pager_status &= ~WAITING_FOR_M),\ (pager_status |= SEEN_ESCAPE)) #define setWAITING_FOR_M ((pager_status |= WAITING_FOR_M), \ (pager_status &= ~SEEN_ESCAPE)) static unsigned pager_status = REGULAR_PRINT; #define NOTE "--MORE [space=more, return=one more line, (c)ontinue, (q)uit]--" #define SHORT_NOTE "--MORE [space,return,c,q]--" #define VERYSHORT_NOTE "-MORE-" static __inline__ void more(void) { const char *note; unsigned len; int i; if (screen_width > sizeof(NOTE)) note = NOTE; else if (screen_width > sizeof(SHORT_NOTE)) note = SHORT_NOTE; else note = VERYSHORT_NOTE; len = strlen(note); for (i = 0; i < len; i++) _real_output_char_function(note[i]); flush_pending_input(); flush_raw_output(); for (;;) { extern volatile unsigned apply_regex_abort;/*kludge!*/ switch (next_cooked_input_byte()) { default: continue; case '\r': case '\n': line = screen_height - 1; break; case ' ' : line = 0; break; case 'c' : line = 0; pager_status |= NO_STOP; break; case 0: case 'q' : line = 0; pager_status &= ~REGULAR_PRINT; apply_regex_abort = 1; break; } break; } /* erase note */ for (i = 0; i < len; i++) { _real_output_char_function('\b'); _real_output_char_function(' '); _real_output_char_function('\b'); } } static void pager_output_char_function(unsigned char c) { unsigned column_increment; if (c != 0 && !(pager_status & (SPECIAL_PRINT|REGULAR_PRINT))) return; if (pager_status & NO_STOP) { (*_real_output_char_function)(c); return; } if (((line == screen_height) && (column > 0 || c >= '\33')) || line > screen_height) { int logging_on = 
output_fd_valid(extra_fd); int old_extra_fd; /* "may be used uninitialized" warning OK here */ if (logging_on) { flush_raw_output(); old_extra_fd = extra_fd; extra_fd = NO_OUTPUT_FILE; } more(); if (logging_on) { flush_raw_output(); extra_fd = old_extra_fd; } /* if they 'q', then we'll not continue */ if (!(pager_status & (SPECIAL_PRINT|REGULAR_PRINT))) return; } if ((c == '\n' || c == '\r') && column == screen_width) { /* don't want to output a newline at this point, since we know we're already there */ column = 0; } else column_increment = (*_real_output_char_function)(c); while (column >= screen_width) { line++; column -= screen_width; } if (!(pager_status & TRANSPARENT)) { static mcount = 0; if (c == '\33') setSEEN_ESCAPE; else if (c == '[' && hasSEEN_ESCAPE) { setWAITING_FOR_M; mcount = 0; } else if (isWAITING_FOR_M && c != '\n' && c != '\r') { if (mcount++ > /* some reasonably large number */20 || c == 'm') setNORMAL; } else switch(c) { default: column += column_increment; break; case '\n': line++; /* fallthrough */ case '\r': setNORMAL; column = 0; break; case '\b': if (column > 0) column--; break; } } } void output_pager_reset_more(void) { line = 0; pager_status &= ~NO_STOP; pager_status |= REGULAR_PRINT; } int output_pager_transparent(int t) { int was = (pager_status & TRANSPARENT) ? 1 : 0; if (t) pager_status |= TRANSPARENT; else pager_status &= ~TRANSPARENT; return was; } unsigned output_pager_lines(unsigned new) { unsigned old = screen_height; if (new) screen_height = new; return old; } unsigned output_pager_columns(unsigned new) { unsigned old = screen_width; if (new) screen_width = new; return old; } #endif /* PROVIDE_PAGER */ /************************************************************************/ /************************************************************************/ /* * Just pass it through. 
*/ static unsigned output_euc_as_simple_euc(unsigned char c) { unsigned retval = 0; if (c != 0) { #if PROVIDE_PAGER if (c != /*CodeSet3*/143 && c >= ' ') retval = 1; #endif *nextout++ = c; if ((c == '\n' && flush_on_newline) || nextout >= bufend) flush_raw_output(); } return retval; } /* * */ static unsigned output_euc_as_jis(unsigned char c) { static unsigned char hi = 0; static enum { ASCII, KATAKANA, _X_ , JIS208, JIS212 } mode = ASCII; unsigned retval = 0; if ((c & 0x80) == 0) { if (mode != ASCII) { unsigned const char *ptr = jis_start_ASCII; while (*ptr) *nextout++ = *ptr++; mode = ASCII; } if (c == 0) return 0; *nextout++ = c; #if PROVIDE_PAGER if (c >= ' ') retval = 1; #endif if ((c == '\n' && flush_on_newline) || nextout >= bufend) flush_raw_output(); return retval; } /* we have a multibyte character */ if (hi == 0) { /* the first byte of a 2- or 3-byte character */ hi = c; return 0; } if (hi == THREE_BYTE_HI) { static unsigned char mid = 0; if (mid == 0) { /* the 2nd byte of a three-byte character */ mid = c; return 0; } /* We now have a full three-byte JIS X 0212-1990 character. 
*/ if (output_style & NO_0212_1990) { unsigned char a = hi, b = mid; hi = mid = 0; nondisplayable3(a, b, c); } else { if (mode != JIS212) { unsigned const char *ptr = jis_start_212; while (*ptr) *nextout++ = *ptr++; mode = JIS212; } *nextout++ = mid & 0x7f; *nextout++ = c & 0x7f; hi = mid = 0; #if PROVIDE_PAGER retval = 2; #endif } } else if (hi == HALF_WIDTH_KATA_HI) { /* Have a half-width katakana */ if (mode != KATAKANA) { unsigned const char *ptr = jis_start_kana; while (*ptr) *nextout++ = *ptr++; mode = KATAKANA; } /* could put a HW -> FW kana converter here */ *nextout++ = c & 0x7f; hi = 0; #if PROVIDE_PAGER retval = 1; #endif } else { /* have "regular" JIS X 0208 two-byte kanji */ if (mode != JIS208) { unsigned const char *ptr = jis_start_208; while (*ptr) *nextout++ = *ptr++; mode = JIS208; } *nextout++ = hi & 0x7f; *nextout++ = c & 0x7f; hi = 0; #if PROVIDE_PAGER retval = 2; #endif } if (nextout >= bufend) flush_raw_output(); return retval; } static unsigned output_euc_as_sjis(unsigned char c) { static unsigned char hi = 0; unsigned retval = 0; if ((c & 0x80) == 0) { if (c == 0) return 0; *nextout++ = c; #if PROVIDE_PAGER if (c >= ' ') retval = 1; #endif if ((c == '\n' && flush_on_newline) || nextout >= bufend) flush_raw_output(); return retval; } /* we have a multibyte character */ if (hi == 0) { /* the first byte of a 2- or 3-byte character */ hi = c; return 0; } if (hi == THREE_BYTE_HI) { static unsigned char mid = 0; if (mid == 0) { /* the 2nd byte of a three-byte character */ mid = c; return 0; } /* * We now have a full three-byte JIS X 0212-1990 character, which * is undisplayable in SJIS */ { unsigned char a = hi, b = mid; hi = mid = 0; nondisplayable3(a, b, c); } } else if (hi == HALF_WIDTH_KATA_HI) { /* Have a half-width katakana */ /* could put a HW -> FW kana converter here */ *nextout++ = c; hi = 0; #if PROVIDE_PAGER retval = 1; #endif } else { /* have "regular" JIS X 0208 two-byte kanji */ hi &= 0x7f; c &= 0x7f; *nextout++ = ((hi + 1) >> 1) + 
(hi < 95 ? 112 : 176); *nextout++ = c + ((hi & 1) ? (c > 95 ? 32 : 31) : 126); hi = 0; #if PROVIDE_PAGER retval = 2; #endif } if (nextout >= bufend) flush_raw_output(); return retval; } /***************************************************************/ unsigned (*_output_char_function)(unsigned char) = output_euc_as_jis; int output(const char *string) { unsigned char *ptr = (unsigned char *)string; while (*ptr) outchar(*ptr++); return 0; } void output_buffer(const unsigned char *start, const unsigned char *end) { while (start < end) outchar(*start++); } void flush_output(void) { outchar(0); /* flush to ASCII mode, if it matters */ flush_raw_output(); } #ifdef USE_VSNPRINTF int outputf(const char *fmt, ...) { va_list ap; unsigned char buffer[500]; unsigned char *start = buffer, *end; unsigned count; va_start(ap, fmt); count = vsnprintf((void*)buffer, sizeof(buffer), (void *)fmt, ap); end = start + count; va_end(ap); while (start < end) outchar(*start++); return count; } void warn(const char *fmt, ...) { va_list ap; unsigned char buffer[500]; unsigned char *start = buffer, *end; unsigned count; unsigned fd = output_fd; flush_output(); va_start(ap, fmt); count = vsnprintf((void*)buffer, sizeof(buffer), (void*)fmt, ap); end = start + count; va_end(ap); { #if PROVIDE_PAGER unsigned old_status = pager_status; pager_status |= SPECIAL_PRINT; #endif output_fd = 2; while (start < end) outchar(*start++); flush_output(); output_fd = fd; #if PROVIDE_PAGER pager_status = old_status; #endif } } __volatile__ void die(const char *fmt, ...) 
{ va_list ap; unsigned char buffer[500]; unsigned char *start = buffer, *end; unsigned count; flush_output(); va_start(ap, fmt); count = vsnprintf((void*)buffer, sizeof(buffer), (void *)fmt, ap); end = start + count; va_end(ap); #if PROVIDE_PAGER pager_status |= SPECIAL_PRINT | NO_STOP; #endif output_fd = 2; while (start < end) outchar(*start++); flush_output(); exit(1); } #else /* USE_VSNPRINTF */ /* * vfcnprintf -- Vector Function Print Formatted. * * Given a format string, a vector of arguments, and an output-this-byte * function, format and print a'la printf. * In the stdio world, it's about the same as (untested!) * * printf(const char *fmt, ...) * { * va_list ap; * va_start(ap, fmt); * #undef putchar * vfcnprintf(putchar, fmt, ap) * } * * The number of bytes sent to FUNCTION is returned. If FUNCTION is null, * the number of bytes that *would be* sent is returned. * * It's simple in that for non-%s formats, it just does an internal * sprintf of whatever. For %s, it does it itself so that it doesn't have * to worry about buffer limits. * * Jeffrey Friedl (jfriedl@omron.co.jp) * Feb 1994 */ static int vfcnprintf(unsigned (*function)(unsigned char), const char *fmt,va_list ap) { unsigned char C; /* current character from the format string */ unsigned bytes_out = 0; /* number of bytes output */ const unsigned char *format = (const unsigned char *)fmt; #define NEXT_ARG(type) (va_arg(ap, type)) /* Send the given BYTE to the output function (if it exists) */ #define out(BYTE) \ macro_start { \ unsigned char c = (BYTE); /* BYTE gets evaluated exactly once */ \ if (function) \ (void)(*function)(c); \ bytes_out++; \ } macro_end /* * Run though the format string, either dumping each character, or * interpreting a %-format. 
*/ while (C = *format++, C != '\0') if (C != '%') { out(C); } else { const unsigned char *fstart = format; unsigned argsneeded = 0; int islong = 0; int arg1, arg2; /* "may be used uninitialized" warning OK here */ if (*format == '-') /* allow an optional leading '-' */ format++; if (*format == '*') { /* allow an optional field size */ format++; argsneeded++; } else while (isascii(*format) && isdigit(*format)) format++; if (*format == '.') /* allow a minimum width */ { format++; if (*format == '*') { format++; argsneeded++; } else while (isascii(*format) && isdigit(*format)) format++; } if (*format == 'l') { /* allow an optional 'l' */ islong = 1; format++; } /* * If there were '*' arg markers, nab them from the argument list. */ if (argsneeded) { arg1 = NEXT_ARG(int); if (argsneeded == 2) arg2 = NEXT_ARG(int); } #define do_type(TYPE) \ macro_start { \ unsigned char f[128]; \ unsigned char b[128]; \ unsigned char *ptr = f; \ TYPE value = NEXT_ARG(TYPE); \ \ for (*ptr++ = '%'; fstart < format; fstart++) \ *ptr++ = *fstart; \ *ptr = 0; \ switch(argsneeded) \ { \ case 0: sprintf(b, f, value); break; \ case 1: sprintf(b, f, arg1, value); break; \ case 2: sprintf(b, f, arg1, arg2, value); break; \ } \ for (ptr = b; *ptr; ptr++) \ out(*ptr); \ } macro_end switch(*format++) { case '%': out('%'); break; case 'c': out(NEXT_ARG(unsigned int)); break; default: /* unknown */ out('?'); while (fstart < format) out(*fstart++); out('?'); break; /* 'int' sized things */ case 'd': case 'x': case 'o': case 'u': if (!islong) { do_type(int); break; } /* else FALLTHROUGH */ /* 'long int' sized things */ case 'D': case 'X': case 'O': case 'U': do_type(long int); break; case 'F': case 'f': case 'E': case 'e': case 'G': case 'g': do_type(double); break; /* string */ case 's': case 'n': /* errno string */ case 'N': /* errno string */ { extern char *sys_errlist[]; extern int errno, sys_nerr; char temp_error_buff[20]; const char *str; if (format[-1] == 's') str = NEXT_ARG(const char *); else { 
int errval; if (format[-1] == 'N') errval = NEXT_ARG(int); else errval = errno; if (errval >= 0 && errval < sys_nerr) str = sys_errlist[errval]; else { sprintf(temp_error_buff, "#%d", errval); str = temp_error_buff; } } if (format[-2] == '%') { /* easy case... just dump the string */ while (*str) out(*str++); } else { /* shit, have to go and interpret the format myself */ int leftadjust = 0; int minfieldwidth = 0; const char *strmax = &str[/*BIG*/0x7fff]; const char *ptr; if (*fstart == '-') leftadjust = *fstart++; if (*fstart == '*') { fstart++; if (minfieldwidth = arg1, minfieldwidth < 0) { leftadjust = !leftadjust; minfieldwidth = -minfieldwidth; } } else { while (isascii(*fstart) && isdigit(*fstart)) minfieldwidth = minfieldwidth*10 + (*fstart++-'0'); } if (*fstart == '.') { fstart++; if (*fstart == '*') { strmax = &str[argsneeded == 1 ? arg1 : arg2]; fstart++; } else { int num = 0; while (isascii(*fstart) && isdigit(*fstart)) num = num * 10 + (*fstart++ - '0'); strmax = &str[num]; } } kibishii_assert(*fstart == 's' || *fstart == 'N'); if (!leftadjust) { int len; for (ptr= str; *ptr && ptr < strmax; ptr++) ; len = ptr - str; while (len < minfieldwidth) { out(' '); minfieldwidth--; } } ptr = str; while (*ptr && ptr < strmax) out(*ptr++); minfieldwidth -= (ptr - str); while (minfieldwidth-- > 0) out(' '); } break; } } } return bytes_out; } int outputf(const char *fmt, ...) { va_list ap; va_start(ap, fmt); return vfcnprintf(*_output_char_function, fmt, ap); } void warn(const char *fmt, ...) { unsigned fd = output_fd; va_list ap; #if PROVIDE_PAGER unsigned old_status = pager_status; pager_status |= SPECIAL_PRINT; #endif va_start(ap, fmt); flush_output(); output_fd = 2; (void)vfcnprintf(*_output_char_function, fmt, ap); flush_output(); output_fd = fd; #if PROVIDE_PAGER pager_status = old_status; #endif } __volatile__ void die(const char *fmt, ...) 
{ va_list ap; #if PROVIDE_PAGER pager_status |= SPECIAL_PRINT; #endif va_start(ap, fmt); flush_output(); output_fd = 2; (void)vfcnprintf(*_output_char_function, fmt, ap); flush_output(); exit(1); } #endif /* USE_VSNPRINTF */ #if PROVIDE_PAGER static unsigned (*_real_output_char_function)(unsigned char) = 0; /* * output_pager_status(0) turns off and reports previous state. * output_pager_status(1) turns on and reports previous state. * output_pager_status(-1) reports current state. */ int output_pager_status(int want) { int was = _real_output_char_function != 0; if (want >= 0 && was != want) { if (want) { _real_output_char_function = _output_char_function; _output_char_function = (unsigned (*)(unsigned char)) pager_output_char_function; line = column = 0; } else { _output_char_function = _real_output_char_function; _real_output_char_function = 0; } } return was; } #endif unsigned long select_output_style(unsigned long mods) { unsigned (**function_pointer)(unsigned char) = #if PROVIDE_PAGER _real_output_char_function ? 
&_real_output_char_function : #endif &_output_char_function; unsigned long old_output_style = output_style; if (mods == INQUIRE_ONLY) return output_style; #define update(RANGE) \ if (mods & (RANGE)) \ output_style = (output_style & ~(RANGE)) | (mods & (RANGE)); update(_BASIC_OUTPUT_TYPE); update(_JIS_KANJI_STYLE); update(_JIS_ENGLISH_STYLE); update(_NONDISPLAYABLE); update(_0212_1990); update(_KATAKANA); switch(output_style & _BASIC_OUTPUT_TYPE) { default: soft_assert(0); break; case SJIS_OUTPUT: *function_pointer = output_euc_as_sjis; break; case EUC_OUTPUT: *function_pointer = output_euc_as_simple_euc; break; case JIS_OUTPUT: *function_pointer = output_euc_as_jis; switch(output_style & _JIS_KANJI_STYLE) { default: kibishii_assert(0); break; case JIS_1978_OUTPUT: jis_start_208 = (unsigned const char *)"\33$@"; break; case JIS_1983_OUTPUT: jis_start_208 = (unsigned const char *)"\33$B"; break; case JIS_1990_OUTPUT: jis_start_208 = (unsigned const char *)"\33$2\33$B"; break; } switch(output_style & _JIS_ENGLISH_STYLE) { default: kibishii_assert(0); break; case JIS_ROMAN: jis_start_ASCII = (unsigned const char *)"\33(J"; break; case JIS_ASCII: jis_start_ASCII = (unsigned const char *)"\33(B"; break; } break; } if ((output_style & _KATAKANA) != PASS_HW_KATANANA) warn("[half-width katakana support not implemented]\n"); return old_output_style; } void show_output_style(void) { int is_sjis = 0; int has_212_support = 0; switch(output_style & _BASIC_OUTPUT_TYPE) { default: soft_assert(0); break; case SJIS_OUTPUT: output("Shift JIS"); is_sjis = 1; break; case EUC_OUTPUT: output("EUC"); break; case JIS_OUTPUT: switch(output_style & _JIS_KANJI_STYLE) { default: soft_assert(0); break; case JIS_1978_OUTPUT: output("JIS (1978"); break; case JIS_1983_OUTPUT: output("JIS (1983"); break; case JIS_1990_OUTPUT: output("JIS (1990"); break; } switch(output_style & _JIS_ENGLISH_STYLE) { default: soft_assert(0); break; case JIS_ROMAN: output(", roman)"); break; case JIS_ASCII: output(", 
ASCII)"); break; } break; } switch (output_style & _0212_1990) { default: soft_assert(0); break; case SUPPORT_0212_1990: outputf(" (X212"); has_212_support = 1; break; case NO_0212_1990: outputf(" (!X212"); break; } switch (output_style & _KATAKANA) { default: soft_assert(0); break; case PASS_HW_KATANANA: output(", hwk"); break; case ELIDE_HW_KATAKANA: output(", !hwk"); break; case FOLD_HW_KATAKANA_TO_FULL: output(", hwk converted"); break; } if (is_sjis || !has_212_support) { output(", unsupported "); switch(output_style & _NONDISPLAYABLE) { default: soft_assert(0); break; case ELIDE_NONDISPLAYABLE: output("stripped"); break; case OUTPUT_NONDISPLAYABLE: output("passed"); break; case SHOW_NONDISPLAYABLE_CODES: output("as codes"); break; case MARK_NONDISPLAYABLE: output("marked"); } } outchar(')'); } int set_extra_output_file(int fd) { int old = extra_fd; if (fd != JUST_CHECKING_OUTPUT_FILE) extra_fd = fd; return old; } int set_normal_output_file(int fd) { int old = output_fd; if (fd != JUST_CHECKING_OUTPUT_FILE) output_fd = fd; return old; } #undef outchar void outchar(unsigned char c) { (*_output_char_function)(c); } #else /* not USE_LOCAL_OUTPUT */ volatile void die(const char *fmt, ...) { va_list ap; unsigned char buffer[500]; va_start(ap, fmt); vsnprintf(buffer, sizeof(buffer), (unsigned char *)fmt, ap); va_end(ap); fputs(buffer, stderr); exit(1); } void warn(const char *fmt, ...) { va_list ap; unsigned char buffer[500]; va_start(ap, fmt); vsnprintf(buffer, sizeof(buffer), (unsigned char *)fmt, ap); va_end(ap); fputs(buffer, stderr); } void output_buffer(const unsigned char *start, const unsigned char *end) { while (start < end) putchar(*start++); } #endif /* not USE_LOCAL_OUTPUT */ lookup-1.08b.orig/lib/xmalloc.c0100644000014400001440000000046405531125316016056 0ustar nakaharastaff#include "xmalloc.h" #undef xmalloc /* * like malloc(), but dies if memory not available. 
*/ void *xmalloc(unsigned len) { void *ptr; if (ptr = (void *)malloc(len), ptr == 0) { #define MSG "\n" write(2, MSG, sizeof(MSG)-1); exit(3); } return ptr; } lookup-1.08b.orig/lib/xmalloc.h0100644000014400001440000000040205543750256016065 0ustar nakaharastaff#ifndef __XMALLOC_H__ /* file wrapper */ #define __XMALLOC_H__ #ifdef DEBUG_MALLOC # include # define xmalloc(foo) malloc(foo) #else extern void *xmalloc(unsigned len); /* Simple "get memory or die" malloc. */ #endif #endif /* file wrapper */ lookup-1.08b.orig/lib/packed_list.h0100644000014400001440000001027006173116616016710 0ustar nakaharastaff#ifndef __PACKED_LIST_H__ /* file wrapper */ #define __PACKED_LIST_H__ /* * Jeffrey Friedl * Omron Corporation オムロン(株) * Nagaokakyoshi, Japan 〒617長岡京下海印寺 * * jfriedl@nff.ncl.omron.co.jp * * This work is placed under the terms of the GNU General Purpose License * (the "GNU Copyleft"). */ #define packed_list_version 200 /* 2.00 */ /* * Jeffrey Friedl, Omron Corp. (jfriedl@nff.ncl.omron.co.jp) * October 1993 * * Packed lists are sequences of unsigned values. Each value, which may * be as large as an unsigned long, is held in as few bytes as possible. * * Each sequential set of value-indicating bytes is zero or more bytes * with the high bit set (to indicate that it's not the last byte in the * set) followed by one byte with the high bit clear. Each byte in the * set adds seven bits to the value. Bytes are given least-significant * first. * * These are given as inlined functions (or local static ones if you don't * have gcc), as they're small. 
*/ #include "MemItem.h" #include "output.h" #define DATA_BITS_IN_BYTE 7 #define NON_LAST_BYTE_MARKER (1< NON_LAST_BYTE_DATA_MASK) { *(*ptr)++ = (value & NON_LAST_BYTE_DATA_MASK) | NON_LAST_BYTE_MARKER; value >>= DATA_BITS_IN_BYTE; } *(*ptr)++ = value; return *ptr - optr; } /* * VALUE = read_packed_value(PTR) * * The VALUE is read from the stream of bytes at (*PTR) and returned, with * the byproduct that (*PTR) is incremented by the number of bytes actually * used when getting VALUE. */ static __inline__ unsigned long read_packed_value(const unsigned char **pptr) { unsigned char c; unsigned long value; unsigned shift; if (c = *(*pptr)++, (c & NON_LAST_BYTE_MARKER) == 0) return c; value = c & NON_LAST_BYTE_DATA_MASK; shift = DATA_BITS_IN_BYTE; while (c = *(*pptr)++, c & NON_LAST_BYTE_MARKER) { value |= (c & NON_LAST_BYTE_DATA_MASK) << shift; shift += DATA_BITS_IN_BYTE; } value |= c << shift; return value; } #if 0 /* * Like access_index except that (*PTR) is not actually moved. */ static __inline__ unsigned long peek_at_packed_value(const unsigned char **pptr) { const unsigned char *ptr = *pptr; return read_packed_value(&ptr); } #endif /* * VALUE = read_packed_value(PTR) * * The VALUE is read from the stream of bytes at (*PTR) and returned, with * the byproduct that (*PTR) is incremented by the number of bytes actually * used when getting VALUE. */ static __inline__ unsigned long mem_read_packed_value(MemItem *mem) { unsigned char c; unsigned long value; unsigned shift; if (c = GetMemByte(mem), (c & NON_LAST_BYTE_MARKER) == 0) return c; value = c & NON_LAST_BYTE_DATA_MASK; shift = DATA_BITS_IN_BYTE; while (c = GetMemByte(mem), c & NON_LAST_BYTE_MARKER) { value |= (c & NON_LAST_BYTE_DATA_MASK) << shift; shift += DATA_BITS_IN_BYTE; } value |= c << shift; return value; } #if 0 /* * Like access_index except that (*PTR) is not actually moved. 
*/ static __inline__ unsigned long mem_peek_at_packed_value(MemItem *mem) { MemLoc loc = mem->loc; unsigned long val = mem_read_packed_value(mem); mem->loc = loc; return val; } #endif /* * Returns the number of bytes required to store the given value. */ static __inline__ unsigned bytes_required_for_packed_value(unsigned long value) { unsigned bytes = 1; while (value > NON_LAST_BYTE_DATA_MASK) { bytes++; value >>= DATA_BITS_IN_BYTE; } return bytes; } #undef __inline__ #endif /* file wrapper */ lookup-1.08b.orig/lib/euc.c0100644000014400001440000002273506076503435015207 0ustar nakaharastaff/* * Jeffrey Friedl * Omron Corporation オムロン(株) * Nagaokakyoshi, Japan 〒617長岡京下海印寺 * * jfriedl@nff.ncl.omron.co.jp * * This work is placed under the terms of the GNU General Purpose License * (the "GNU Copyleft"). */ #include "config.h" #include "euc.h" /* * Attempts to define some definitions useful for dealing with * Japanese EUC Packed Format text. */ #define _set(num) ((num)<<_euc_codeset_shift) #define _bpc(num) ((num)<<_euc_bpc_shift) unsigned const char euc_info[256] = { /* 0 x00 */ _set(0)|_bpc(1), /* 1 x01 */ _set(0)|_bpc(1), /* 2 x02 */ _set(0)|_bpc(1), /* 3 x03 */ _set(0)|_bpc(1), /* 4 x04 */ _set(0)|_bpc(1), /* 5 x05 */ _set(0)|_bpc(1), /* 6 x06 */ _set(0)|_bpc(1), /* 7 x07 */ _set(0)|_bpc(1), /* 8 x08 */ _set(0)|_bpc(1), /* 9 x09 */ _set(0)|_bpc(1), /* 10 x0a */ _set(0)|_bpc(1), /* 11 x0b */ _set(0)|_bpc(1), /* 12 x0c */ _set(0)|_bpc(1), /* 13 x0d */ _set(0)|_bpc(1), /* 14 x0e */ _set(0)|_bpc(1), /* 15 x0f */ _set(0)|_bpc(1), /* 16 x10 */ _set(0)|_bpc(1), /* 17 x11 */ _set(0)|_bpc(1), /* 18 x12 */ _set(0)|_bpc(1), /* 19 x13 */ _set(0)|_bpc(1), /* 20 x14 */ _set(0)|_bpc(1), /* 21 x15 */ _set(0)|_bpc(1), /* 22 x16 */ _set(0)|_bpc(1), /* 23 x17 */ _set(0)|_bpc(1), /* 24 x18 */ _set(0)|_bpc(1), /* 25 x19 */ _set(0)|_bpc(1), /* 26 x1a */ _set(0)|_bpc(1), /* 27 x1b */ _set(0)|_bpc(1), /* 28 x1c */ _set(0)|_bpc(1), /* 29 x1d */ _set(0)|_bpc(1), /* 30 x1e */ _set(0)|_bpc(1), 
/* 31 x1f */ _set(0)|_bpc(1), /* 32 ' ' x20 */ _set(0)|_bpc(1), /* 33 '!' x21 */ _set(0)|_bpc(1), /* 34 '"' x22 */ _set(0)|_bpc(1), /* 35 '#' x23 */ _set(0)|_bpc(1), /* 36 '$' x24 */ _set(0)|_bpc(1), /* 37 '%' x25 */ _set(0)|_bpc(1), /* 38 '&' x26 */ _set(0)|_bpc(1), /* 39 ''' x27 */ _set(0)|_bpc(1), /* 40 '(' x28 */ _set(0)|_bpc(1), /* 41 ')' x29 */ _set(0)|_bpc(1), /* 42 '*' x2a */ _set(0)|_bpc(1), /* 43 '+' x2b */ _set(0)|_bpc(1), /* 44 ',' x2c */ _set(0)|_bpc(1), /* 45 '-' x2d */ _set(0)|_bpc(1), /* 46 '.' x2e */ _set(0)|_bpc(1), /* 47 '/' x2f */ _set(0)|_bpc(1), /* 48 '0' x30 */ _set(0)|_bpc(1), /* 49 '1' x31 */ _set(0)|_bpc(1), /* 50 '2' x32 */ _set(0)|_bpc(1), /* 51 '3' x33 */ _set(0)|_bpc(1), /* 52 '4' x34 */ _set(0)|_bpc(1), /* 53 '5' x35 */ _set(0)|_bpc(1), /* 54 '6' x36 */ _set(0)|_bpc(1), /* 55 '7' x37 */ _set(0)|_bpc(1), /* 56 '8' x38 */ _set(0)|_bpc(1), /* 57 '9' x39 */ _set(0)|_bpc(1), /* 58 ':' x3a */ _set(0)|_bpc(1), /* 59 ';' x3b */ _set(0)|_bpc(1), /* 60 '<' x3c */ _set(0)|_bpc(1), /* 61 '=' x3d */ _set(0)|_bpc(1), /* 62 '>' x3e */ _set(0)|_bpc(1), /* 63 '?' 
x3f */ _set(0)|_bpc(1), /* 64 '@' x40 */ _set(0)|_bpc(1), /* 65 'A' x41 */ _set(0)|_bpc(1), /* 66 'B' x42 */ _set(0)|_bpc(1), /* 67 'C' x43 */ _set(0)|_bpc(1), /* 68 'D' x44 */ _set(0)|_bpc(1), /* 69 'E' x45 */ _set(0)|_bpc(1), /* 70 'F' x46 */ _set(0)|_bpc(1), /* 71 'G' x47 */ _set(0)|_bpc(1), /* 72 'H' x48 */ _set(0)|_bpc(1), /* 73 'I' x49 */ _set(0)|_bpc(1), /* 74 'J' x4a */ _set(0)|_bpc(1), /* 75 'K' x4b */ _set(0)|_bpc(1), /* 76 'L' x4c */ _set(0)|_bpc(1), /* 77 'M' x4d */ _set(0)|_bpc(1), /* 78 'N' x4e */ _set(0)|_bpc(1), /* 79 'O' x4f */ _set(0)|_bpc(1), /* 80 'P' x50 */ _set(0)|_bpc(1), /* 81 'Q' x51 */ _set(0)|_bpc(1), /* 82 'R' x52 */ _set(0)|_bpc(1), /* 83 'S' x53 */ _set(0)|_bpc(1), /* 84 'T' x54 */ _set(0)|_bpc(1), /* 85 'U' x55 */ _set(0)|_bpc(1), /* 86 'V' x56 */ _set(0)|_bpc(1), /* 87 'W' x57 */ _set(0)|_bpc(1), /* 88 'X' x58 */ _set(0)|_bpc(1), /* 89 'Y' x59 */ _set(0)|_bpc(1), /* 90 'Z' x5a */ _set(0)|_bpc(1), /* 91 '[' x5b */ _set(0)|_bpc(1), /* 92 '\' x5c */ _set(0)|_bpc(1), /* 93 ']' x5d */ _set(0)|_bpc(1), /* 94 '^' x5e */ _set(0)|_bpc(1), /* 95 '_' x5f */ _set(0)|_bpc(1), /* 96 '`' x60 */ _set(0)|_bpc(1), /* 97 'a' x61 */ _set(0)|_bpc(1), /* 98 'b' x62 */ _set(0)|_bpc(1), /* 99 'c' x63 */ _set(0)|_bpc(1), /* 100 'd' x64 */ _set(0)|_bpc(1), /* 101 'e' x65 */ _set(0)|_bpc(1), /* 102 'f' x66 */ _set(0)|_bpc(1), /* 103 'g' x67 */ _set(0)|_bpc(1), /* 104 'h' x68 */ _set(0)|_bpc(1), /* 105 'i' x69 */ _set(0)|_bpc(1), /* 106 'j' x6a */ _set(0)|_bpc(1), /* 107 'k' x6b */ _set(0)|_bpc(1), /* 108 'l' x6c */ _set(0)|_bpc(1), /* 109 'm' x6d */ _set(0)|_bpc(1), /* 110 'n' x6e */ _set(0)|_bpc(1), /* 111 'o' x6f */ _set(0)|_bpc(1), /* 112 'p' x70 */ _set(0)|_bpc(1), /* 113 'q' x71 */ _set(0)|_bpc(1), /* 114 'r' x72 */ _set(0)|_bpc(1), /* 115 's' x73 */ _set(0)|_bpc(1), /* 116 't' x74 */ _set(0)|_bpc(1), /* 117 'u' x75 */ _set(0)|_bpc(1), /* 118 'v' x76 */ _set(0)|_bpc(1), /* 119 'w' x77 */ _set(0)|_bpc(1), /* 120 'x' x78 */ _set(0)|_bpc(1), /* 121 'y' x79 
*/ _set(0)|_bpc(1), /* 122 'z' x7a */ _set(0)|_bpc(1), /* 123 '{' x7b */ _set(0)|_bpc(1), /* 124 '|' x7c */ _set(0)|_bpc(1), /* 125 '}' x7d */ _set(0)|_bpc(1), /* 126 '~' x7e */ _set(0)|_bpc(1), /* 127 x7f */ _set(0)|_bpc(1), /* 128 x80 */ 0, /* 129 x81 */ 0, /* 130 x82 */ 0, /* 131 x83 */ 0, /* 132 x84 */ 0, /* 133 x85 */ 0, /* 134 x86 */ 0, /* 135 x87 */ 0, /* 136 x88 */ 0, /* 137 x89 */ 0, /* 138 x8a */ 0, /* 139 x8b */ 0, /* 140 x8c */ 0, /* 141 x8d */ 0, /* 142 x8e */ _set(2)|_bpc(2), /* for halfwidth katakana */ /* 143 x8f */ _set(3)|_bpc(3), /* for JIS X 0212 */ /* 144 x90 */ 0, /* 145 x91 */ 0, /* 146 x92 */ 0, /* 147 x93 */ 0, /* 148 x94 */ 0, /* 149 x95 */ 0, /* 150 x96 */ 0, /* 151 x97 */ 0, /* 152 x98 */ 0, /* 153 x99 */ 0, /* 154 x9a */ 0, /* 155 x9b */ 0, /* 156 x9c */ 0, /* 157 x9d */ 0, /* 158 x9e */ 0, /* 159 ROW x9f */ 0, /* 160 --- xa0 */ 0, /* 161 1 xa1 */ _set(1)|_bpc(2), /* punctuation and symbols */ /* 162 2 xa2 */ _set(1)|_bpc(2), /* punctuation and symbols */ /* 163 3 xa3 */ _set(1)|_bpc(2), /* full-width roman alphanumerics */ /* 164 4 xa4 */ _set(1)|_bpc(2), /* hiragana */ /* 165 5 xa5 */ _set(1)|_bpc(2), /* katakana */ /* 166 6 xa6 */ _set(1)|_bpc(2), /* greek letters */ /* 167 7 xa7 */ _set(1)|_bpc(2), /* funky cryllic letters */ /* 168 8 xa8 */ _set(1)|_bpc(2), /* box drawing stuff */ /* 169 9 xa9 */ 0, /* unassigned */ /* 170 10 xaa */ 0, /* unassigned */ /* 171 11 xab */ 0, /* unassigned */ /* 172 12 xac */ 0, /* unassigned */ /* 173 13 xad */ 0, /* unassigned */ /* 174 14 xae */ 0, /* unassigned */ /* 175 15 xaf */ 0, /* unassigned */ /* 176 16 xb0 */ _set(1)|_bpc(2), /* kanji.... 
*/ /* 177 17 xb1 */ _set(1)|_bpc(2), /* 178 18 xb2 */ _set(1)|_bpc(2), /* 179 19 xb3 */ _set(1)|_bpc(2), /* 180 20 xb4 */ _set(1)|_bpc(2), /* 181 21 xb5 */ _set(1)|_bpc(2), /* 182 22 xb6 */ _set(1)|_bpc(2), /* 183 23 xb7 */ _set(1)|_bpc(2), /* 184 24 xb8 */ _set(1)|_bpc(2), /* 185 25 xb9 */ _set(1)|_bpc(2), /* 186 26 xba */ _set(1)|_bpc(2), /* 187 27 xbb */ _set(1)|_bpc(2), /* 188 28 xbc */ _set(1)|_bpc(2), /* 189 29 xbd */ _set(1)|_bpc(2), /* 190 30 xbe */ _set(1)|_bpc(2), /* 191 31 xbf */ _set(1)|_bpc(2), /* 192 32 xc0 */ _set(1)|_bpc(2), /* 193 33 xc1 */ _set(1)|_bpc(2), /* 194 34 xc2 */ _set(1)|_bpc(2), /* 195 35 xc3 */ _set(1)|_bpc(2), /* 196 36 xc4 */ _set(1)|_bpc(2), /* 197 37 xc5 */ _set(1)|_bpc(2), /* 198 38 xc6 */ _set(1)|_bpc(2), /* 199 39 xc7 */ _set(1)|_bpc(2), /* 200 40 xc8 */ _set(1)|_bpc(2), /* 201 41 xc9 */ _set(1)|_bpc(2), /* 202 42 xca */ _set(1)|_bpc(2), /* 203 43 xcb */ _set(1)|_bpc(2), /* 204 44 xcc */ _set(1)|_bpc(2), /* 205 45 xcd */ _set(1)|_bpc(2), /* 206 46 xce */ _set(1)|_bpc(2), /* 207 47 xcf */ _set(1)|_bpc(2), /* 208 48 xd0 */ _set(1)|_bpc(2), /* 209 49 xd1 */ _set(1)|_bpc(2), /* 210 50 xd2 */ _set(1)|_bpc(2), /* 211 51 xd3 */ _set(1)|_bpc(2), /* 212 52 xd4 */ _set(1)|_bpc(2), /* 213 53 xd5 */ _set(1)|_bpc(2), /* 214 54 xd6 */ _set(1)|_bpc(2), /* 215 55 xd7 */ _set(1)|_bpc(2), /* 216 56 xd8 */ _set(1)|_bpc(2), /* 217 57 xd9 */ _set(1)|_bpc(2), /* 218 58 xda */ _set(1)|_bpc(2), /* 219 59 xdb */ _set(1)|_bpc(2), /* 220 60 xdc */ _set(1)|_bpc(2), /* 221 61 xdd */ _set(1)|_bpc(2), /* 222 62 xde */ _set(1)|_bpc(2), /* 223 63 xdf */ _set(1)|_bpc(2), /* 224 64 xe0 */ _set(1)|_bpc(2), /* 225 65 xe1 */ _set(1)|_bpc(2), /* 226 66 xe2 */ _set(1)|_bpc(2), /* 227 67 xe3 */ _set(1)|_bpc(2), /* 228 68 xe4 */ _set(1)|_bpc(2), /* 229 69 xe5 */ _set(1)|_bpc(2), /* 230 70 xe6 */ _set(1)|_bpc(2), /* 231 71 xe7 */ _set(1)|_bpc(2), /* 232 72 xe8 */ _set(1)|_bpc(2), /* 233 73 xe9 */ _set(1)|_bpc(2), /* 234 74 xea */ _set(1)|_bpc(2), /* 235 75 xeb */ 
_set(1)|_bpc(2), /* 236 76 xec */ _set(1)|_bpc(2), /* 237 77 xed */ _set(1)|_bpc(2), /* 238 78 xee */ _set(1)|_bpc(2), /* 239 79 xef */ _set(1)|_bpc(2), /* 240 80 xf0 */ _set(1)|_bpc(2), /* 241 81 xf1 */ _set(1)|_bpc(2), /* 242 82 xf2 */ _set(1)|_bpc(2), /* 243 83 xf3 */ _set(1)|_bpc(2), /* 244 84 xf4 */ _set(1)|_bpc(2), /* 245 85 xf5 */ 0, /* unassigned */ /* 246 86 xf6 */ 0, /* unassigned */ /* 247 87 xf7 */ 0, /* unassigned */ /* 248 88 xf8 */ 0, /* unassigned */ /* 249 89 xf9 */ 0, /* unassigned */ /* 250 90 xfa */ 0, /* unassigned */ /* 251 91 xfb */ 0, /* unassigned */ /* 252 92 xfc */ 0, /* unassigned */ /* 253 93 xfd */ 0, /* unassigned */ /* 254 94 xfe */ 0, /* unassigned */ /* 255 xff */ 0, }; lookup-1.08b.orig/lib/README0100644000014400001440000000305006173633115015131 0ustar nakaharastaffVarious routines thrown together into a library for neatness. User Documentation is usually in the ".h" file. Internals in the ".c". ------------------------------------------------------------------------- assert.h -- local assert routines. config.h -- overall library configuration file. euc.[ch] -- describes how many bytes in various an EUC characters. fuzzkana.[ch] -- Transforms regex patterns to allow a later search using the new pattern to "ignore" longness of vowels and small TSUs. index.[ch] -- Jeffrey's cool small and fast file indexing routines. input.[ch] -- input routines that allow other stuff to be done at idle times. jreadline.[ch] -- Somewhat similar to GNU's readline. Allows EUC input and does auto-conversion of romaji->kana. jregex.[ch] -- Jeffrey's Regular Expression Package. ->Very<- fast. Very cool. Handles Japanese EUC. kanaid.[ch] -- Information about EUC kana characters, a'la ASCII's ctype. loadfile.[ch] -- Load a file and read/write/create indexes. longlinenote.h -- Rather specialized stuff for the load-file stuff. output.[ch] -- output routines. Can convert to other encodings. packed_list.h -- For reading/writing in a packed memory-wise way. 
replace.[ch] -- simple string replacement routines. romaji2kana.[ch] -- Translates romaji to kana. Uses kanaid. std_romaji.c -- standard romaji->kana converter subroutine strsave.[ch] -- Standard strsave() - returns a private copy of a string. termset*.c -- routines to set terminal state xmalloc.[ch] -- Standard xmalloc() - a "memory or death" malloc(). lookup-1.08b.orig/lib/strsave.c0100644000014400001440000000066605554713020016112 0ustar nakaharastaff#ifndef _USING_DGUX /* DGUX has its own */ #include "xmalloc.h" #include "strsave.h" /* return a private copy of the given string, die if out of memory */ #ifndef strcpy extern char *strcpy(char *, const char *); #endif unsigned char *strsave(const unsigned char *str) { unsigned char *ptr; ptr = xmalloc((unsigned)strlen((void *)str)+1); (void)strcpy((void *)ptr, (void *)str); return ptr; } #endif /* _USING_DGUX */ lookup-1.08b.orig/lib/strsave.h0100644000014400001440000000041405554713017016114 0ustar nakaharastaff#ifndef __STRSAVE_H__ /* file wrapper */ #define __STRSAVE_H__ #ifdef _USING_DGUX # include #else /* return a private copy of the given string, die if out of memory */ extern unsigned char *strsave(const unsigned char *); #endif #endif /* file wrapper */ lookup-1.08b.orig/lib/longlinenote.h0100644000014400001440000000416306076511666017135 0ustar nakaharastaff#ifndef __LONGLINENOTE_H__ #define __LONGLINENOTE_H__ /* * Jeffrey Friedl * Omron Corporation オムロン(株) * Nagaokakyoshi, Japan 〒617長岡京下海印寺 * * jfriedl@nff.ncl.omron.co.jp * * This work is placed under the terms of the GNU General Purpose License * (the "GNU Copyleft"). * * October 1993 * * Goes along with my indexing routines. * * Due to complications with the way the text file is kept in memory, * the length of some longer lines needs to be remembered. * This is just a list of structs giving a line->length mapping. 
*/ #define longlinenote_version 100 /* 1.00 */ #include "xmalloc.h" #if !defined(__GNUC__) # if !defined(__volatile__) # define __volatile__ /*nothing; for use with volatile functions */ # endif # if !defined(__inline__) # define __inline__ /*nothing; for use with volatile functions */ # endif #endif struct long_line_note { struct long_line_note *next; /* info about the next line */ const unsigned char *line; /* the line in question */ unsigned len; /* the length of the line */ }; /* * Record that LINE has length LEN. Simple stuff. */ static __inline__ void note_long_line(struct long_line_note **listptr, const unsigned char *line, unsigned len) { struct long_line_note *oldnext; oldnext = (*listptr); (*listptr) = xmalloc(sizeof(struct long_line_note)); (*listptr)->next = oldnext; (*listptr)->line = line; (*listptr)->len = len; } /* * Return the length of LINE, as per our line notes. * It's a fatal error to not have the line in our list. */ static __inline__ unsigned get_long_line_note(struct long_line_note *list, const unsigned char *line) { /* * We assume that the lines are in larger-address-first order, * so if we skip only records for lines with a larger address than * us, either finding our line exactly then, or dying if not. */ while (list && list->line > line) list = list->next; if (list && list->line == line) return list->len; assert(0); return 0; /* to quite "no return from non-void" warnings */ } #undef __inline__ #endif /* file wrapper */ lookup-1.08b.orig/lib/index.c0100644000014400001440000003075206174371630015537 0ustar nakaharastaff/* * Jeffrey Friedl * Omron Corporation オムロン(株) * Nagaokakyoshi, Japan 〒617長岡京下海印寺 * * jfriedl@nff.ncl.omron.co.jp * * This work is placed under the terms of the GNU General Purpose License * (the "GNU Copyleft"). * * October 1993 * * See comment in index.h for general info about this stuff. 
*/ #include "config.h" #include "system.h" #include "assert.h" #include "system.h" #include #include #include #ifndef O_CREATE # if defined(_HAVE_SYS_FCNTL_H_) # include # elif defined(_HAVE_FCNTL_H_) # include # endif #endif #include "output.h" #include "index.h" #include "xmalloc.h" /* for xmalloc () */ #include static unsigned line_count; /* * For each character (which is a HI/LO pair) of each line in all * FILESIZE bytes of TEXT, call enter(HI, LO, start-of-line-value). * PER_LINE will be done once per text line. */ static void SCAN_TEXT(VirtFile *v, void (*enter)(unsigned char hi, unsigned char lo, TextOffset val), unsigned flags, const char *msg) { unsigned lastpercent = 0; /* for reporting the progress */ fileloc pos = 0, end = v->length; while (pos < end) { /* "value" is what will eventually be stored in the per-char list */ TextOffset value = pos; const unsigned char *strptr, *strend; unsigned linelen; /* if (flags & INDEX_REPORT_PROGRESS) */ { /* * Given that X of Y has been done, print what percent has been * done. The temp variable T is used to hold the last value * printed. */ int percent = (100 * pos)/end; if (percent != lastpercent) { outputf("[%s] %s %02d%% \r", v->filename, msg, lastpercent = percent); flush_output(); } } if (strptr = VirtPos2Str(v, pos, &linelen), strptr == NULL) break; line_count++; strend = strptr + linelen; pos += linelen + 1; /* +1 for newline */ /* for each character (single or multibyte) in the line... */ while (strptr < strend) { unsigned char c = *strptr++; if (c == 0245) { /* a multibyte katakana: translate to hiragana */ enter(0244&0x7f, *(strptr++) & 0x7f, value); } else if (c & 0x80) { /* any other multibyte */ enter(c & 0x7f, *(strptr++) & 0x7f, value); } else if (isalnum(c)) { /* a regular (ASCII) alphabetic or numeric */ enter(0, isupper(c) ? tolower(c) : c, value); } } } } /* * Before creating the real index, we'll use a temporary full * (i.e. 
non-sparse) HI/LO array index to compute some things, * such as how we'll sparse-ize the real index. */ struct fullindex { struct fullindex_entry { unsigned count; unsigned count2; TextOffset lastentered; int mem_needed; unsigned char *listptr; } char_info[/*high 7-bit byte*/128][/*low 7-bit byte*/128]; } *fullindex; static void enter1(unsigned char hi, unsigned char lo, TextOffset data) { struct fullindex_entry *p = &fullindex->char_info[hi][lo]; unsigned diff; if (p->count == 0) diff = data; else if (p->lastentered == data) return; /* been there, done that */ else diff = data - p->lastentered; p->mem_needed += bytes_required_for_packed_value(diff); p->lastentered = data; if (p->count != MAX_COUNT) if (p->count++ == (unsigned)~0) die("count overflow at %s line %d.\n", __FILE__, __LINE__); } static void enter2(unsigned char hi, unsigned char lo, TextOffset data) { struct fullindex_entry *p = &fullindex->char_info[hi][lo]; unsigned diff; /* don't enter for lines we're skipping */ if (p->count == SKIPPED_COUNT) return; if (p->count2 == 0) diff = data; else if (p->lastentered == data) return; /* been there, done that */ else { kibishii_assert(DATA > p->lastentered); diff = data - p->lastentered; } kibishii_assert(p->count != 0); kibishii_assert(p->listptr != 0); /* outputf("char %c%c: output %d [%d -> %d]\n", hi, lo, diff, p->lastentered, data); /**/ p->lastentered = data; /* note that we've seen this char and used its memory */ p->count--; p->count2++; p->mem_needed -= write_packed_value(&p->listptr, diff); assert(p->mem_needed >= 0); } /* * create_index(V, PER, LC) * Create an index for the virtual file. * Characters that are on at least PER percent of the lines in * the file are omitted from the index. 
*/ struct index * create_index(VirtFile *v, unsigned percent, unsigned flags) { unsigned hi, lo; /* general usage for accessing index */ struct index index; /* real index header */ struct index *indexp; /* fully allocated real index pointer */ unsigned char *freemem; /* pointer into indexp of unpartitioned mem */ /* allocate and clear memory for fullindex -- freed at end of this fcn */ fullindex = (void*)xmalloc(sizeof(*fullindex)); bzero((void*)fullindex, sizeof(*fullindex)); /* * Do the first scan of the text, noting how many * lines each character is on, and how much memory its index will need. */ line_count = 0; SCAN_TEXT(v, enter1, flags, "index (first pass): "); index.linecount = line_count; /* figure the line limiter */ index.limitcount = index.linecount * percent / 100; if (index.limitcount > MAX_COUNT) index.limitcount = MAX_COUNT; /* * Will run through the fullindex we created and note, for each * character seen (i.e. for each hi/lo combo there) how much memory * in the index we'll need to represent it (if it's not omitted because * it's on too many lines). */ index.indexsize = sizeof(index); /* will at least need index head */ for (hi = 0x00; hi < 0x80; hi++) /* for every possible HI byte.... */ { /* look for the first LO with entries in it */ for (lo = 0; lo < 0x80; lo++) if (fullindex->char_info[hi][lo].count != 0) break; /* no memory needed if there are none */ if (lo >= 0x80) { index.hi[hi].first_lo = index.hi[hi].end_lo = 0; continue; } /* now go through the rest of the LOs, noting the last we've seen */ for (index.hi[hi].first_lo = lo; lo < 0x80; lo++) { unsigned count = fullindex->char_info[hi][lo].count; if (count == 0) continue; /* note that we've seen a LO at least this late in the game */ index.hi[hi].end_lo = lo; if (count < index.limitcount) { if (count != 0) { if (flags & INDEX_REPORT_STATS) { outputf("%d times [%c%c] %d bytes]\n", count, (hi ? (hi|0x80) : ' '), (hi ? 
(lo|0x80) : lo), fullindex->char_info[hi][lo].mem_needed); } index.indexsize += fullindex->char_info[hi][lo].mem_needed; } } else { /* on too many lines... we'll omit this from the index */ if (flags & INDEX_REPORT_SKIPPED) outputf("[%c%c:%d/%d]", (hi ? (hi|0x80) : ' '), (hi ? (lo|0x80) : lo), count, fullindex->char_info[hi][lo].mem_needed); /*index.omittedsize += fullindex->char_info[hi][lo].mem_needed;*/ fullindex->char_info[hi][lo].count = SKIPPED_COUNT; fullindex->char_info[hi][lo].mem_needed = 0; } } index.hi[hi].end_lo++; /* now points just beyond last char*/ /* must also account for the lo_count[] and lo[] arrays */ index.indexsize += (index.hi[hi].end_lo - index.hi[hi].first_lo) * (sizeof(elementcount) + sizeof(IndexOffset)); } if (flags & INDEX_REPORT_SKIPPED) outchar('\n'); indexp = xmalloc(index.indexsize); /* allocate memory for real index */ *indexp = index; /* copy partially filled header */ freemem = (unsigned char *)&indexp[1]; /* point to free data after head */ /* go into the real index to create the list array holders */ for (hi = 0x00; hi < 0x80; hi++) { if (index.hi[hi].end_lo != 0) { unsigned count = index.hi[hi].end_lo - index.hi[hi].first_lo; indexp->hi[hi].shifted_lo = makeIndexOffset(indexp, freemem); freemem += sizeof(IndexOffset) * count; } } /* go into the real index to partition the count array holders */ for (hi = 0x00; hi < 0x80; hi++) { if (index.hi[hi].end_lo != 0) { unsigned count = index.hi[hi].end_lo - index.hi[hi].first_lo; indexp->hi[hi].listcount = makeIndexOffset(indexp, freemem); freemem += sizeof(elementcount) * count; } } /* go into the index to partition the list memories */ for (hi = 0x00; hi < 0x80; hi++) { IndexOffset thisCountPtr; IndexOffset thisListPtr; if (index.hi[hi].end_lo == 0) continue; /* no lists needed here */ thisCountPtr = indexp->hi[hi].listcount; thisListPtr = indexp->hi[hi].shifted_lo; /* for each LO that exists for this HI.... 
*/ for (lo = index.hi[hi].first_lo; lo < index.hi[hi].end_lo; lo++) { /* insert count for this HI/LO */ elementcount count = fullindex->char_info[hi][lo].count; *realptr(indexp, thisCountPtr, elementcount *) = count; /* partition memory for the list if there's a list for this pair */ if (count && count != SKIPPED_COUNT) { *realptr(indexp, thisListPtr, IndexOffset *) = makeIndexOffset(indexp, freemem); fullindex->char_info[hi][lo].listptr = freemem; freemem += fullindex->char_info[hi][lo].mem_needed; } /* bump up count and listptr pointers for next LO */ thisListPtr += sizeof(IndexOffset); thisCountPtr += sizeof(elementcount); /* clear this for the next runthrough */ fullindex->char_info[hi][lo].lastentered = 0; } } /* make sure it came out exactly right */ kibishii_assert(makeIndexOffset(indexp, freemem) == index.indexsize); /* * Run through text a 2nd time, actually creating the real index. * This is virtually identical to the loop at the top of this function. */ SCAN_TEXT(v, enter2, flags, "index (final pass): "); /* if (flags & INDEX_REPORT_PROGRESS) */ output(" \r"); free(fullindex); indexp->magic = INDEX_MAGIC; indexp->version_major = INDEX_VERSION_MAJOR; indexp->version_minor = INDEX_VERSION_MINOR; return indexp; } int write_index_file(const char *filename, const struct index *i) { int fd; int iserror; if (fd = open(filename, O_WRONLY|O_CREAT|O_TRUNC, 0444), fd < 0) return fd; iserror = write(fd, (void*)i, i->indexsize) != i->indexsize; iserror |= close(fd) != 0; return iserror; } /* * If TRY is true, issue no message if file not found. 
*/ struct index *read_index_file(const char *filename, int try, unsigned flags) { struct index *index = 0; long int size = filesize(filename); if (size < 0 && try) return 0; if (size >= 0) { int fd = open(filename, 0); index = xmalloc(size); read(fd, (void *)index, size); close(fd); } if (index == 0) { if (!try) outputf("[open of \"%s\" failed: %n]\n", filename); return 0; } index->FileP = NULL; if (size != index->indexsize) { warn("\n", size, index->indexsize); } else if (index->magic != INDEX_MAGIC) { warn("\n"); } else if (index->version_major != INDEX_VERSION_MAJOR) { warn("\n", index->version_major, INDEX_VERSION_MAJOR); } else if (index->version_minor != INDEX_VERSION_MINOR) { warn("\n", index->version_minor, INDEX_VERSION_MINOR); } return index; } struct index *mem_read_index_file(const char *filename) { FILE *fp; struct index *index; if (fp = fopen(filename, "r"), fp == NULL) return NULL; index = xmalloc(sizeof(struct index)); if (fread(index, sizeof(struct index), 1, fp) != 1) die("bad fread of index header: %n\n"); index->FileP = fp; if (index->magic != INDEX_MAGIC) { warn("\n"); } else if (index->version_major != INDEX_VERSION_MAJOR) { warn("\n", index->version_major, INDEX_VERSION_MAJOR); } else if (index->version_minor != INDEX_VERSION_MINOR) { warn("\n", index->version_minor, INDEX_VERSION_MINOR); } return index; } /* * Return true if the named file seems to be an index file. 
*/ int is_index_file(const char *filename) { int fd; struct index head; struct stat statbuf; int i = 0; if (fd = open(filename, 0), fd < 0) return 0; if (fstat(fd, &statbuf)) i = read(fd, &head, sizeof(head)); close(fd); if (i != sizeof(head)) return 0; if (head.magic != INDEX_MAGIC) return 0; if (head.indexsize != statbuf.st_size) return 0; return 1; } lookup-1.08b.orig/lib/index.h0100644000014400001440000002150306333200730015523 0ustar nakaharastaff#ifndef __INDEX_H__ /* file wrapper */ #define __INDEX_H__ /* * Jeffrey Friedl * Omron Corporation オムロン(株) * Nagaokakyoshi, Japan 〒617長岡京下海印寺 * * jfriedl@nff.ncl.omron.co.jp * * This work is placed under the terms of the GNU General Purpose License * (the "GNU Copyleft"). */ #ifndef __PACKED_LIST_H__ # ifndef __SYSTEM_H__ # include "system.h" # endif # if defined(_HAVE_SYS_STDTYPES_H_) # include # else # include # endif #include "packed_list.h" #endif #include "virtfile.h" #define INDEX_MAGIC 0x6a647800 /* "jdx\0" */ #define INDEX_VERSION_MAJOR 1 #define INDEX_VERSION_MINOR 2 /* * Jeffrey Friedl, Omron Corporation. * jfriedl@nff.ncl.omron.co.jp * October 1993 * * Conceptually, following the index for a character will yield the lines * in the text file which contain that character. * * One more level closer to reality (how it's actually implemented), this * is returned as an array of pointers (to each line) and a count indicating * how many elements in the array (the array is not null-terminated). * * One more level closer, the array elements aren't really pointers, but * offsets from the start of the file (or from the start of the memory * into which the file's been loaded). * * One more level closer, each element doesn't actually hold the offset into * the file, but the _difference_ from the previous offset (the first * element actually holding the real offset into the file, as the "previous" * offset of the first is zero). 
* * One more level closer, it's really not an array of values (differences) * but the appropriate number of sequential sets of value-indicating bytes. * These are implemented via "packed_list.h". */ /* * IndexOffset - holds an offset from the beginning of the index * (file or allocated memory) to some point in the index. * These are found in the index itself. */ typedef unsigned long IndexOffset; /* * Also found only in the index itself, TextOffset holds an offset from * the start of the text (file or memory) to some point into the text * (i.e. starts of lines). */ typedef unsigned long TextOffset; /* * The following two macros convert from in-memory text or index pointers * into the appropriate xxxxOffset types. */ #define makeIndexOffset(IndexStartPointer, PointerSomewhereIntoIndexMemory) \ ((IndexOffset)((const unsigned char *)(PointerSomewhereIntoIndexMemory) - \ (const unsigned char *)(IndexStartPointer))) #define makeTextOffset(FileStartPointer, PointerSomewhereIntoFileMemory) \ ((TextOffset)((const unsigned char *)(PointerSomewhereIntoFileMemory) - \ (const unsigned char *)(FileStartPointer))) /* * The opposite, converts an OFFSET from the beginning of BASE (which is * a pointer) to a pointer of the given TYPE. */ #define realptr(base, offset, type) \ ((type)((const unsigned char *)(base)+(offset))) /* * EUC Japanese are double-byte characters, each with the high bit set. * When we find one of these, we look at it as a HI byte and a LO byte, * with the high bits cleared (thereby setting their possible range from * [0x80 - 0xff] to [0x00 - 0x7f], which is a bit more convenient to * work with). For "regular" characters, we use HI=0, LO="regular byte". * * Basically, we would like to be able to do something like * ListOfLinesForOneCharacter = index[HI][LO] * but that would require a larger array than we want, since well over half * of the possible codes aren't used. * * So I compromise a bit... 
I keep the [HI] part (all 128 slots), but each * of those 128 slots is, rather than 128 slots for [LO], some variable * number of slots along with info indicating how many are there. * * If, for example, the lowest LO for [HI=123] is 10 and the highest LO * used is 20, I would keep 11 slots with first_lo of 10 and end_lo of 20. * * The ListOfLinesForOneCharacter is logically an array of pointers, * so my index structure might look something like * * struct * { * int first_lo, end_lo; * unsigned char *lists_of_lines[]; <--- indexed by (LO-first_lo) * unsigned list_counts[]; <--- indexed by (LO-first_lo) * } hi[128]; * * But rather than use "char *foo[]", I use IndexOffset so that it can make * sense in core or on disk. */ struct index { unsigned magic; /* INDEX_MAGIC */ unsigned short version_major; /* INDEX_VERSION_MAJOR */ unsigned short version_minor; /* INDEX_VERSION_MINOR */ unsigned indexsize; /* size of complete index, including this header */ unsigned linecount; /* FYI, number of lines in the file indexed */ unsigned limitcount; /* FYI, if char on this many lines, not in index */ FILE *FileP; #define IsMemIndex(I) ((I)->FileP != NULL) time_t st__mtime; /* st_mtime of file indexed */ /* the real index */ struct { unsigned char first_lo; unsigned char end_lo; IndexOffset listcount; IndexOffset shifted_lo; } hi[128]; /* the rest of the index follows... 
(indexsize-sizeof(index)) bytes */ }; #ifndef USE_SHORT_INDEX_COUNTS #define USE_SHORT_INDEX_COUNTS 1 #endif #if USE_SHORT_INDEX_COUNTS typedef unsigned short elementcount; #else typedef unsigned elementcount; #endif #define SKIPPED_COUNT ((elementcount)~0) #define MAX_COUNT (SKIPPED_COUNT - 1) #if !defined(__GNUC__) # if !defined(__volatile__) # define __volatile__ /*nothing; for use with volatile functions */ # endif # if !defined(__inline__) # define __inline__ /*nothing; for use with volatile functions */ # endif #endif /* * More or less does the virtual * dest = index[hi].list_of_counts[lo] * returning true if it could be gotten, false if there was no such * info entered (i.e. if HI and/or LO were bad). */ static __inline__ int get_index_count(const struct index *i, unsigned char hi, unsigned char lo, elementcount *dest) { /* make sure that the LO falls into the range of lo's for the HI */ if (lo < i->hi[hi].first_lo || lo >= i->hi[hi].end_lo) return 0; /* no such character in the index */ *dest = realptr(i, i->hi[hi].listcount, elementcount *) [lo - i->hi[hi].first_lo]; return 1; } /* * More or less does the virtual * dest = index[hi].list_of_counts[lo] * returning true if it could be gotten, false if there was no such * info entered (i.e. if HI and/or LO were bad). 
*/ static __inline__ int mem_get_index_count(const struct index *i, unsigned char hi, unsigned char lo, elementcount *dest) { long loc; /* make sure that the LO falls into the range of lo's for the HI */ if (lo < i->hi[hi].first_lo || lo >= i->hi[hi].end_lo) return 0; /* no such character in the index */ loc = (long)&realptr(0, i->hi[hi].listcount, elementcount *) [lo - i->hi[hi].first_lo]; if (fseek(i->FileP, loc, SEEK_SET) != 0) die("bad fseek to %ld (fp=%x) at %s line %d, returned %ld: %n\n", (long)loc, i->FileP, __FILE__, __LINE__); fread(dest, sizeof(elementcount), 1, i->FileP); return 1; } static __inline__ int get_index_list(const struct index *i, unsigned char hi, unsigned char lo, const unsigned char **dest) { /* make sure that the LO falls into the range of lo's for the HI */ if (lo < i->hi[hi].first_lo || lo >= i->hi[hi].end_lo) return 0; /* no such character in the index */ else { IndexOffset listoffset = realptr(i, i->hi[hi].shifted_lo, IndexOffset *) [lo - i->hi[hi].first_lo]; *dest = realptr(i, listoffset, unsigned char *); return 1; } } static __inline__ int mem_get_index_list(const struct index *i, unsigned char hi, unsigned char lo, IndexOffset *dest) { /* make sure that the LO falls into the range of lo's for the HI */ if (lo < i->hi[hi].first_lo || lo >= i->hi[hi].end_lo) return 0; /* no such character in the index */ else { long loc = (long) &realptr(0, i->hi[hi].shifted_lo, IndexOffset *) [lo - i->hi[hi].first_lo]; if (fseek(i->FileP, loc, SEEK_SET) != 0) die("bad fseek to %ld (fp=%x) at %s line %d, returned %ld: %n\n", loc, i->FileP, __FILE__, __LINE__); if (fread(dest, sizeof(*dest), 1, i->FileP) !=1) die("bad read from fp=%x at %s line %d: %n\n", i->FileP, __FILE__, __LINE__); return 1; } } /* other things defined in index.c */ extern struct index * create_index(VirtFile *v, unsigned percent, unsigned flags); /* These flags must be distinct from those in loadfile.h */ #define INDEX_REPORT_PROGRESS 0x00000001 #define INDEX_REPORT_SKIPPED 
0x00000002 #define INDEX_REPORT_STATS 0x00000004 extern int is_index_file(const char *filename); struct index *read_index_file(const char *filename, int try, unsigned flags); struct index *mem_read_index_file(const char *filename); int write_index_file(const char *filename, const struct index *i); #undef __inline__ #endif /* file wrapper */ lookup-1.08b.orig/lib/loadfile.c0100644000014400001440000000715306333200746016202 0ustar nakaharastaff/* * Jeffrey Friedl * Omron Corporation オムロン(株) * Nagaokakyoshi, Japan 〒617長岡京下海印寺 * * jfriedl@nff.ncl.omron.co.jp * * This work is placed under the terms of the GNU General Purpose License * (the "GNU Copyleft"). * * October 1993 * * Routine to load a file and to load, create, and/or write an accompanying * index. WRT loadfile, a "file" is a rather high-level object that has * an index and other substructures associated with it. * * On a lower level is "readfile" which deals with filesystem files * as singular entities. */ #include "config.h" #include "assert.h" #include #include #include "xmalloc.h" /* for xmalloc () */ #include "strsave.h" /* for strsave() */ #include "system.h" #ifndef strlen # if defined(_HAVE_STRINGS_H_) /* might be defined in system.h */ # include # else # include # define index strchr # define rindex strrchr # endif #endif #include "loadfile.h" #include "output.h" /* * Given the name of a text file, return the name of its associated index * file (or what it would be called were it to exist). The returned string * should eventually be free'd by the user. */ char *indexfile_name(const char *datafile_name) { char *indexname = xmalloc((unsigned)(strlen((void*)datafile_name)+ strlen(LOADFILE_INDEX_EXTENTION)+2)); strcpy(indexname, datafile_name); strcat(indexname, LOADFILE_INDEX_EXTENTION); return indexname; } /* * If LOADFILE_READINDEX is set, the index is read from "FILENAME.jdx". * Otherwise, it is created internally. * * If LOADFILE_WRITEINDEX is set, the index is written to "FILENAME.jdx". 
* * It's stupid to have both LOADFILE_READINDEX and LOADFILE_WRITEINDEX set. */ struct fileinfo * loadfile(const char *filename, unsigned percent, unsigned flags) { struct fileinfo *info = xmalloc(sizeof(struct fileinfo)); long int filesize; struct stat statbuf; /* stat for various uses */ if (stat(filename, &statbuf) < 0) { warn("[can't stat \"%s\": %n]\n", filename); return 0; } if (statbuf.st_size == 0) warn("[warning: file \"%s\" is empty]\n", filename); /* open file */ info->v = OpenVertFile(filename); /* note the short filename */ info->short_filename = (const char *)rindex((void*)info->v->filename, '/'); if (info->short_filename == 0) info->short_filename = info->v->filename; else info->short_filename++; /* skip over "/" */ if (flags & (LOADFILE_READINDEX|LOADFILE_READifPRESENT)) { char *name = indexfile_name(filename); int just_try = flags & LOADFILE_READifPRESENT; if (flags & LOADFILE_NO_MEM_INDEX) info->index = read_index_file(name, just_try, flags); else info->index = mem_read_index_file(name); if (info->index == 0) { if (flags & LOADFILE_READifPRESENT) { free(name); goto build_index; } warn("[error: couldn't read \"%s\": %n]\n", name); free(name); free(info); return 0; } info->indexfile = name; if (info->index->st__mtime != statbuf.st_mtime) { warn("\n", filename); } } else { struct index *i; build_index: i = create_index(info->v, percent, flags); i->st__mtime = statbuf.st_mtime; i->FileP = NULL; /* no file -- it's in memory */ info->index = i; } if (flags & LOADFILE_WRITEINDEX) { char *name = indexfile_name(filename); if (write_index_file(name, info->index) != 0) { warn("[error: couldn't write \"%s\": %n]\n", name); free(name); } else { warn("[wrote index file \"%s\"]\n", name); info->indexfile = name; } } return info; } lookup-1.08b.orig/lib/loadfile.h0100644000014400001440000000376206173456332016217 0ustar nakaharastaff#ifndef __LOADFILE_H__ /* file wrapper */ #define __LOADFILE_H__ /* * Jeffrey Friedl * Omron Corporation オムロン(株) * Nagaokakyoshi, 
Japan 〒617長岡京下海印寺 * * jfriedl@nff.ncl.omron.co.jp * * This work is placed under the terms of the GNU General Purpose License * (the "GNU Copyleft"). * * October 1993 * * Routine to load a file and to load, create, and/or write an accompanying * index. WRT loadfile, a "file" is a rather high-level object that has * an index and other substructures associated with it. * * On a lower level is "readfile" which deals with filesystem files * as singular entities. */ #define loadfile_version 102 /* 1.02 */ #include "longlinenote.h" #include "index.h" #include "virtfile.h" /* * Info about a loaded file. Here, a "file" is rather high-level concept. */ struct fileinfo { VirtFile *v; /* pointer to virtual file */ const char *short_filename; /* short name of file */ const struct index *index; /* index for file */ const char *indexfile; /* name of file index was read from */ }; /* * Load a named text file, and automatically load or calculate an index * for it. Also can write the index to a file (these options controlled * by the FLAGS given below). */ extern struct fileinfo * loadfile(const char *filename, unsigned percent, unsigned flags); /* These flags must be distinct from those in index.h */ #define LOADFILE_WRITEINDEX 0x00010000 #define LOADFILE_READINDEX 0x00020000 #define LOADFILE_NO_MEM_INDEX 0x00040000 #define LOADFILE_READifPRESENT 0x00080000 /* * File extension expected for index read and written by loadfile. */ #define LOADFILE_INDEX_EXTENTION ".jin" /* jeff's index */ /* * Given the name of a text file, return the name of its associated index * file (or what it would be called were it to exist). The returned string * should eventually be free'd by the user. 
*/ extern char *indexfile_name(const char *datafile_name); #endif /* file wrapper */ lookup-1.08b.orig/lib/assert.h0100644000014400001440000000456706076503333015741 0ustar nakaharastaff/* * Jeffrey Friedl * Omron Corporation オムロン(株) * Nagaokakyoshi, Japan 〒617長岡京下海印寺 * * jfriedl@nff.ncl.omron.co.jp * * This work is placed under the terms of the GNU General Purpose License * (the "GNU Copyleft"). */ #ifndef __ASSERT_H__ #define __ASSERT_H__ #ifndef macro_start /* just in case this file is included stand-alone */ # define macro_start do # define macro_end while (0) #endif /* * When NDEBUG (no debug) is on, these asserts become (almost) nothing. */ #ifdef NDEBUG # define assert(expr) macro_start { /* nothing */ } macro_end # define soft_assert(expr) macro_start { /* nothing */ } macro_end # define kibishii_assert(expr) macro_start { /* nothing */ } macro_end #else #ifndef __OUTPUT_H__ #include "output.h" #endif /* * Regular assert -- die if EXPR is not true. */ #define assert(expr) \ macro_start { \ if (!(expr)) \ { \ die("\nassert(" #expr ") failed \"%s\" line %d.\n", \ __FILE__, __LINE__); \ } \ } macro_end /* * just warn (if EXPR is not true.) */ #define soft_assert(expr) \ macro_start { \ if (!(expr)) \ { \ warn("\nassert(" #expr ") failed \"%s\" line %d.\n", \ __FILE__, __LINE__); \ } \ } macro_end /* * Kibishii debug stuff is normally not on unless specifically requested, * such as under special debugging situations, or development times. * When on, it's just like a soft assert. When off, it's nothing. 
*/ #ifndef KIBISHII_DEBUG # define kibishii_assert(expr) macro_start { /* nothing */ } macro_end #else # define kibishii_assert(expr) soft_assert(expr) #endif /* KIBISHII_DEBUG */ #endif /* NDEBUG wrapper */ #endif /* file wrapper */ lookup-1.08b.orig/lib/romaji2kana.c0100644000014400001440000003624606076512110016621 0ustar nakaharastaff/* * Jeffrey Friedl * Omron Corporation オムロン(株) * Nagaokakyoshi, Japan 〒617長岡京下海印寺 * * jfriedl@nff.ncl.omron.co.jp * * This work is placed under the terms of the GNU General Purpose License * (the "GNU Copyleft"). * */ #include "config.h" #include "assert.h" #include #include "romaji2kana.h" #include "kanaid.h" #if !defined(__GNUC__) # if !defined(__volatile__) # define __volatile__ /*nothing; for use with volatile functions */ # endif # if !defined(__inline__) # define __inline__ /*nothing; for use with volatile functions */ # endif #endif #if 0 # define ROMAJI_DEBUG #endif #ifdef TEST # define ROMAJI_DEBUG #endif #ifdef ROMAJI_DEBUG # include "output.h" #endif #define arraysize(array) (sizeof(array)/sizeof(array[0])) #define R2K_NORMAL 0x80 #if (R2K_NORMAL & (R2K_UNCONVERTED_PUNC_OK|R2K_NONASCII_OK|R2K_ALLOW_LONG_O_WITH_H)) # error oops #endif #define POSSIBLE_FLAGS_PER_ROMAJI R2K_ALLOW_LONG_O_WITH_H #define Y(ROMAJI, KANA, FLAG) { {#ROMAJI}, {#KANA}, FLAG } #define X(ROMAJI, KANA) { {#ROMAJI}, {#KANA}, R2K_NORMAL } #define LISTS_ARE_IN_ORDER 1 /* make true only if the romaji entries below are in alphabetical order within each array */ static struct { const unsigned char romaji[1]; const unsigned char kana[2]; const unsigned char flags; } romaji1[ ] = { X( a , あ ), X( e , え ), Y( h , う , R2K_ALLOW_LONG_O_WITH_H), X( i , い ), Y( m , ん , R2K_ALLOW_M_FOR_N), X( n , ん ), X( o , お ), X( u , う ), }; static struct { const unsigned char romaji[2]; const unsigned char kana[4]; const unsigned char flags; } romaji2[ ] = { X( ba , ば ), X( be , べ ), X( bi , び ), X( bo , ぼ ), X( bu , ぶ ), X( ca , か ), X( co , こ ), X( cu , く ), X( da , だ ), X( 
de , で ), X( di , ぢ ), X( do , ど ), /* make */ X( du , づ ), X( fa , ふぁ ), X( fe , ふぇ ), X( fi , ふぃ ), X( fo , ふぉ ), X( fu , ふ ), X( ga , が ), X( ge , げ ), /* sure */ X( gi , ぎ ), X( go , ご ), X( gu , ぐ ), X( ha , は ), X( he , へ ), X( hi , ひ ), X( ho , ほ ), X( hu , ふ ), X( ja , じゃ ), /* to */ X( je , じぇ ), X( ji , じ ), X( jo , じょ ), X( ju , じゅ ), X( ka , か ), X( ke , け ), X( ki , き ), /* keep */ X( ko , こ ), X( ku , く ), X( la , ら ), X( le , れ ), X( li , り ), X( lo , ろ ), X( lu , る ), /* in */ X( ma , ま ), X( me , め ), X( mi , み ), X( mo , も ), X( mu , む ), X( na , な ), X( ne , ね ), /* alphabetical */ X( ni , に ), X( no , の ), X( nu , ぬ ), X( pa , ぱ ), X( pe , ぺ ), X( pi , ぴ ), X( po , ぽ ), X( pu , ぷ ), X( ra , ら ), /* order */ X( re , れ ), X( ri , り ), X( ro , ろ ), X( ru , る ), X( sa , さ ), X( se , せ ), X( si , し ), X( so , そ ), X( su , す ), X( ta , た ), X( te , て ), X( ti , ち ), X( to , と ), X( tu , つ ), X( va , ヴぁ ), X( ve , ヴぇ ), X( vi , ヴぃ ), X( vo , ヴぉ ), X( vu , ヴ ), X( wa , わ ), X( we , ゑ ), X( wi , ゐ ), X( wo , を ), X( xa , ぁ ), X( xe , ぇ ), X( xi , ぃ ), X( xo , ぉ ), X( xu , ぅ ), X( ya , や ), X( yo , よ ), X( yu , ゆ ), X( za , ざ ), X( ze , ぜ ), X( zi , じ ), X( zo , ぞ ), X( zu , ず ), }; static struct { const unsigned char romaji[3]; const unsigned char kana[4]; const unsigned char flags; } romaji3[ ] = { X( bya , びゃ ), X( byo , びょ ), X( byu , びゅ ), X( cha , ちゃ ), X( che , ちぇ ), X( chi , ち ), X( cho , ちょ ), X( chu , ちゅ ), X( dya , ぢゃ ), X( dye , ぢぇ ), X( dyi , でぃ ), X( dyo , ぢょ ), X( dyu , ぢゅ ), X( dzi , ぢ ), /* make */ X( dzu , づ ), X( gya , ぎゃ ), X( gyo , ぎょ ), X( gyu , ぎゅ ), /* sure */ X( hya , ひゃ ), X( hyo , ひょ ), X( hyu , ひゅ ), X( jya , じゃ ), X( jyo , じょ ), /* to */ X( jyu , じゅ ), X( kya , きゃ ), X( kyo , きょ ), X( kyu , きゅ ), X( mya , みゃ ), /* keep */ X( myo , みょ ), X( myu , みゅ ), X( nya , にゃ ), X( nyo , にょ ), X( nyu , にゅ ), /* in */ X( pya , ぴゃ ), X( pyo , ぴょ ), X( pyu , ぴゅ ), X( rya , りゃ ), X( ryo , りょ ), /* alphabetical */ X( ryu , りゅ ), X( sha , しゃ ), 
X( shi , し ), X( sho , しょ ), /* order */ X( shu , しゅ ), X( sya , しゃ ), X( syi , し ), X( syo , しょ ), X( syu , しゅ ), X( tsu , つ ), X( tya , ちゃ ), X( tye , ちぇ ), X( tyi , てぃ ), X( tyo , ちょ ), X( tyu , ちゅ ), X( tzu , づ ), X( xka , ヵ ), X( xke , ヶ ), X( xtu , っ ), X( xwa , ゎ ), X( xya , ゃ ), X( xyo , ょ ), X( xyu , ゅ ), X( zya , じゃ ), X( zye , じぇ ), X( zyo , じょ ), X( zyu , じゅ ), }; static unsigned mode = R2K_MIXED_MODE; unsigned r2k_setmode(unsigned newmode) { unsigned oldmode = mode; mode = newmode; return oldmode; } #ifndef R2K_DEFAULT_FLAGS # define R2K_DEFAULT_FLAGS R2K_UNCONVERTED_PUNC_OK|\ R2K_NONASCII_OK|\ R2K_ALLOW_LONG_O_WITH_H #endif static unsigned flags = R2K_DEFAULT_FLAGS; unsigned r2k_setflag(unsigned new) { unsigned old = flags; flags = new; return old; } static const char *romaji2kana_pass = "\t ", *romaji2kana_omit = "'", *romaji2kana_longvowel = "-^"; const char *r2k_setpass(const char *new) { const char *old = romaji2kana_pass; if (new) romaji2kana_pass = new; return old; } const char *r2k_setomit(const char *new) { const char *old = romaji2kana_omit; if (new) romaji2kana_omit = new; return old; } const char *r2k_setlongvowel(const char *new) { const char *old = romaji2kana_longvowel; if (new) romaji2kana_longvowel = new; return old; } /* * More or less an index(3s) function. */ static __inline__ const char * is_char_in_string(const char *str, const char c) { if (str) while (str[0]) if (str[0] == c) return str; else str++; return 0; } /* * Convert the romaji from R to R_END to kana in buffer K whose * length is K_BUF_LEN. If INFO is non-null, it will be filled * with the number of bytes of K filled, and a flag to note if what * was written to K differs from what was read from R (the normal case * is that, of course, it does). * * If K is zero, nothing is written, and INFO will contain values as if * it was written. * * The number of characters that the thing didn't know what to do * with are returned. 
*/ int romaji2kana(const unsigned char *r, const unsigned char *r_end, unsigned char *k, unsigned k_buf_len, struct romaji2kana_info *info) { const unsigned char *orig_k = k; /* start of output buffer */ unsigned char last_hi = 0, last_lo = 0; /* last converted character */ unsigned badtranscount = 0; /* value to be returned */ unsigned modified = 0; unsigned searchflags = flags | R2K_NORMAL; if (r == 0 || *r == '\0' || (k && k_buf_len < 3)) return R2K_BAD_ARGS; #ifdef ROMAJI_DEBUG outputf("romaji2kana(\"%.*s\"): ", r_end - r, r); #endif /* put the char into the output buffer, aborting on overflow */ #define out(c) \ macro_start { \ unsigned char value = (c); /* ensure C evaluated exactly once */ \ if (orig_k) \ { \ if (k_buf_len == 0) \ return R2K_OVERFLOW; \ k_buf_len--; \ *k = value; \ } /* ... else we're just noting the size that would be output */ \ k++; \ } macro_end /* While there's still romaji left to be converted.... */ while (r < r_end) { unsigned char bite[3]; int bite_size, upper; /* if not an ascii character, just pass through */ if (!isascii(r[0])) { if ((searchflags & R2K_NONASCII_OK) == 0) badtranscount++; out(last_hi = r[0]); /* output high byte */ out(last_lo = r[1]); /* output low byte */ r += 2; continue; } /* if the ASCII is to be passed through, do so */ if (is_char_in_string(romaji2kana_pass, r[0])) { out(*r++); last_hi = 0; continue; } /* if the ASCII is to be omitted, do so */ if (is_char_in_string(romaji2kana_omit, r[0])) { modified = 1; last_hi = 0; r++; continue; } /* * If the character indicates a long vowel and we've just output * a character that has a vowel sound we can continue, output the * appropriate character. */ if (is_char_in_string(romaji2kana_longvowel, r[0])) { /* If the last character was katakana, just output the * dash character「ー」. Otherwise, the appropriate vowel. 
*/ if (last_hi == KID_KATA_HI) { out(((const unsigned char *)"ー")[0]); out(((const unsigned char *)"ー")[1]); r++; /* skip the longness marker */ last_hi = 0; /* so we won't trigger this again */ continue; } if (last_hi == KID_HIRA_HI) { unsigned char low = 0; switch(KANA_ID(last_hi, last_lo) & KID_VSOUND) { case KID_A: low = ((const unsigned char *)"あ")[1];break; case KID_I: low = ((const unsigned char *)"い")[1];break; case KID_U: low = ((const unsigned char *)"う")[1];break; case KID_E: low = ((const unsigned char *)"え")[1];break; case KID_O: low = ((const unsigned char *)"お")[1];break; } if (low) { out(KID_HIRA_HI); out(low); r++; /* skip the longness marker */ last_hi = 0; /* so we won't trigger this again */ continue; } } } /* * If the first two characters are the same, and not a vowel, * we'll make it a small TSU (unless it's "n" in which case there * will be special handling). */ if (&r[1] < r_end && isalpha(r[1]) && isascii(r[1]) && !is_char_in_string("aeiouAEIOU", r[0]) && (isupper(r[0]) ? r[0] : tolower(r[0])) == (isupper(r[1]) ? r[1] : tolower(r[1]))) { if (r[0] == 'n' || r[0] == 'N') { out(last_hi = isupper(r[0]) ? KID_KATA_HI : KID_HIRA_HI); out(last_lo = ((const unsigned char *)"ん")[1]); } else { out(last_hi = isupper(r[0]) ? KID_KATA_HI : KID_HIRA_HI); out(last_lo = ((const unsigned char *)"ッ")[1]); } r++; continue; } /* * We'll try to bite off as large a chunk of romaji that makes * sense. We'll bite until: * hit the biggest reasonable bite, or * run out of romaji, or * run into a non-ASCII char, or * run into a non-alphabetic, or * are paying attention to case and hit a different case. * * We'll convert to lowercase for the checking, but UPPER will * remember if it was upper case or not (if it matters). 
*/ upper = isupper(r[0]); for (bite_size = 0 ; bite_size < sizeof(bite) && &r[bite_size] < r_end && isascii(r[bite_size]) && isalpha(r[bite_size]) && (mode != R2K_MIXED_MODE || upper == isupper(r[bite_size])) ; bite_size++) { bite[bite_size] = isupper(r[bite_size]) ? tolower(r[bite_size]) : r[bite_size]; } #ifdef ROMAJI_DEBUG printf("trying bite [%.*s]\n", bite_size, bite); #endif /* * Used below to check the current bite against an array of * romaji->kana pairs. */ #define check(ARRAY, TEST) \ macro_start { \ int i; \ for (i = 0; i < arraysize(ARRAY); i++) \ { \ if ((TEST) && (ARRAY[i].flags & searchflags)) \ { \ kana = ARRAY[i].kana; /* we have a winner */ \ max_kana_len = sizeof(ARRAY[0].kana); \ r += sizeof(ARRAY[0].romaji); \ goto copy_matched_kana; \ } \ if ((LISTS_ARE_IN_ORDER) && ARRAY[i].romaji[0] > bite[0]) \ break; \ } \ } macro_end /* * Now check the bite against the static database. If we don't find a * match, we'll reduce the size of the bite until we get a match or * find we can't even match a single character. */ switch (bite_size) { const unsigned char *kana; /* Betcha've never seen a variable */ unsigned max_kana_len; /* in a place like this before. */ case 3: check(romaji3, (romaji3[i].romaji[0] == bite[0] && romaji3[i].romaji[1] == bite[1] && romaji3[i].romaji[2] == bite[2])); /* FALLTHROUGH: Mmm, no match... try below */ case 2: check(romaji2, (romaji2[i].romaji[0] == bite[0] && romaji2[i].romaji[1] == bite[1])); /* FALLTHROUGH: Mmm, no match... try below */ case 1: check(romaji1, romaji1[i].romaji[0] == bite[0]); /* FALLTHROUGH: no match */ default: /* ack, couldn't find any match... 
skip the char */ if ((searchflags & R2K_UNCONVERTED_PUNC_OK) == 0 || isalpha(*r)) badtranscount++; out(*r++); last_hi = 0; break; copy_matched_kana: while (max_kana_len && kana[0]) { unsigned hi = kana[0], lo = kana[1]; assert(hi == KID_HIRA_HI || hi == KID_KATA_HI); if (hi == KID_HIRA_HI) { if (mode == R2K_ALL_HIRA_MODE) hi = KID_HIRA_HI; else if (mode == R2K_ALL_KATA_MODE) hi = KID_KATA_HI; else /* mode is mixed */ hi = upper ? KID_KATA_HI : KID_HIRA_HI; } last_hi = hi; last_lo = lo; modified = 1; out(hi); out(lo); max_kana_len -= 2; kana += 2; } break; } } out(0); /* final string-ending null */ if (info != 0) { info->k_buf_used = (k - orig_k); info->modified = modified; } #ifdef ROMAJI_DEBUG outputf("/ bytes used: %d", k - orig_k); if (orig_k) outputf("/ [%.*s]", k - orig_k, orig_k); outchar('\n'); #endif return badtranscount; } #ifdef TEST main(int argc, char *argv[]) { unsigned char kana[100]; int i; i = romaji2kana(argv[1], kana, sizeof(kana), 0); if (i < 0) outputf("[%s] return is %d\n", argv[1], i); else outputf("[%s] kana is [%s] ret %d\n", argv[1], kana, i); } #endif lookup-1.08b.orig/lib/input.c0100644000014400001440000001247606173464731015576 0ustar nakaharastaff/* * Jeffrey Friedl * Omron Corporation オムロン(株) * Nagaokakyoshi, Japan 〒617長岡京下海印寺 * * jfriedl@nff.ncl.omron.co.jp * * This work is placed under the terms of the GNU General Purpose License * (the "GNU Copyleft"). 
*/ #include "config.h" #if USE_LOCAL_INPUT #include "system.h" #if defined(_HAVE_SYS_FCNTL_H_) # include #elif defined(_HAVE_FCNTL_H_) # include #endif #include #include #include "assert.h" #include "output.h" #include "input.h" extern int errno; static unsigned char input_buffer[INPUT_BUF_SIZE]; #define eptr (&input_buffer[INPUT_BUF_SIZE]) static unsigned char *incoming = input_buffer; static unsigned char *outgoing = input_buffer; int preread_input_pending = 0; void (*input_inactivity_function)() = 0; #define have_preread_input() (outgoing < incoming) static volatile int is_reading = 0; static volatile int blocking = 1; static volatile int alarm_triggered = 0; #if defined(O_NDELAY) # define DELAY (O_NDELAY) #elif defined(FNDELAY) # define DELAY (FNDELAY) #else # error "Ack, don't know how to fcntl(delay)" #endif static void set_blocking(int want_blocking) { int flags; kibishii_assert(want_blocking != blocking); flags = fcntl(STDIN, F_GETFL, /*dummy*/0); if (flags < 0) die("[bad fcntl1: %n]\n"); if (want_blocking) flags &= ~DELAY; else flags |= DELAY; if (fcntl(STDIN, F_SETFL, flags) < 0) die("[bad fcntl2: %n]\n"); #ifdef KIBISHII_DEBUG flags = fcntl(STDIN, F_GETFL, /*dummy*/0); if (flags < 0) die("[bad fcntl3: %n]\n"); kibishii_assert(!(flags & DELAY) == want_blocking); #endif blocking = want_blocking; } /* * For outside use, to ensure we're in blocking mode. */ void ensure_blocking_input(void) { if (!blocking) set_blocking(1); } static int alarm_server(int num) { if (is_reading) { kibishii_assert(input_inactivity_function); alarm_triggered = 1; if (blocking) set_blocking(0); } signal(SIGALRM, alarm_server); /* set again for later alarms */ return 0; /* to make someone happy */ } static void read_more_input(void) { int ret; kibishii_assert(incoming >= input_buffer); kibishii_assert(outgoing >= input_buffer); kibishii_assert(incoming < eptr); kibishii_assert(outgoing <= incoming); /* * keep trying to read until we don't die due to an alarm. 
*/ alarm_triggered = 0; for (;;) { is_reading = 1; #define _DOREAD_ ret = read(STDIN, incoming, eptr - incoming) #ifndef EINTR _DOREAD_; #else /* read some input -- if we exit merely due to an interrupted system call, just try again */ while (_DOREAD_, ret < 0 && errno == EINTR) ; #endif #undef _DOREAD_ is_reading = 0; if (ret >= 0 || !alarm_triggered) break; kibishii_assert(input_inactivity_function); /* the alarm was triggered since this read_more_input called */ (*input_inactivity_function)(); /* input_inactivity_function could have deregistered itself */ if (input_inactivity_function == 0) { /* had de-registered itself.... can now go back to blocking */ if (!blocking) set_blocking(1); alarm_triggered = 0; } } if (input_inactivity_function && !alarm_triggered) sleep(0); /* alarm was unused... unset it */ if (ret > 0) incoming += ret; else if (ret < 0 && errno != EWOULDBLOCK) { int prev = output_pager_transparent(1); warn("[read error: %n]\n"); if (prev == 0) output_pager_transparent(0); } else if (ret == 0 && blocking) { *incoming++ = 0; /* Mmmm, must be EOF */ } preread_input_pending = have_preread_input(); } int __input_pending__(void) { if (preread_input_pending) return 1; if (blocking) set_blocking(0); read_more_input(); return preread_input_pending; } unsigned char next_raw_input_byte(void) { unsigned char c; kibishii_assert(incoming >= input_buffer); kibishii_assert(outgoing >= input_buffer); kibishii_assert(incoming < eptr); kibishii_assert(outgoing <= incoming); kibishii_assert(have_preread_input() == preread_input_pending); if (!preread_input_pending) { /* had none there... 
wait for it */ if (!blocking) set_blocking(1); if (input_inactivity_function) { static int alarm_server_registered = 0; if (!alarm_server_registered) { signal(SIGALRM, alarm_server); /* reset */ alarm_server_registered = 1; } alarm(2); } read_more_input(); /* expect to have gotten something*/ kibishii_assert(outgoing < incoming); } kibishii_assert(outgoing < incoming); c = *outgoing++; if (outgoing == incoming) { incoming = outgoing = input_buffer; preread_input_pending = 0; } kibishii_assert(outgoing <= incoming); return c; } unsigned char next_cooked_input_byte(void) { unsigned char c; kibishii_assert(have_preread_input() == preread_input_pending); if (preread_input_pending) c = next_raw_input_byte(); else { /* GRAB THE TTY */ set_tty_state_to_cbreak(); c = next_raw_input_byte(); reset_tty_state(); } return c; } int flush_pending_input(void) { int had = input_pending(); kibishii_assert(have_preread_input() == preread_input_pending); while (input_pending()) { incoming = outgoing = input_buffer; preread_input_pending = 0; } return had; } #endif /* USE_LOCAL_INPUT */ lookup-1.08b.orig/lib/jreadline.h0100644000014400001440000000653006076511210016356 0ustar nakaharastaff#ifndef __JREADLINE_H__ /* filewrapper */ #define __JREADLINE_H__ /* * Jeffrey Friedl * Omron Corporation オムロン(株) * Nagaokakyoshi, Japan 〒617長岡京下海印寺 * * jfriedl@nff.ncl.omron.co.jp * * This work is placed under the terms of the GNU General Purpose License * (the "GNU Copyleft"). */ #define jreadline_version 103 /* 1.03 */ /* * Given a prompt, accept a line of input from the user and return it. * The value returned should be eventually free'd by the user. */ extern unsigned char *readline(const unsigned char *readlineprompt); /* * Add the given line to the history list. The contents of the line are copied * to separately saved memory, so there's no need to do that by the calling * function. 
*/ extern void add_history(const unsigned char *); #ifndef NO_AUTO_ROMAJI_CONVERSION /* * Can be set to true or false to allow automatic romaji conversion. * Particularly useful to be set via jreadline_access (described below). */ extern int jreadline_auto_romaji; /* * The kind of function that should be called to do romaji conversion, * if any (if set, will be called after each character input if * jreadline_auto_romaji is true. */ typedef void (*romaji_converter_t)(unsigned char *buffer, const unsigned char *bufend, unsigned char **cursorloc, const unsigned char **eol, int force); /* * Used to set what function should be called to do the conversion (the * old function address being returned). An appropriate function can * be found in std_romaji.c */ extern romaji_converter_t set_romaji_converter(romaji_converter_t new); #endif /* NO_AUTO_ROMAJI_CONVERSION */ /* * If set, called (more-or-less) after each character read, to allow * an outside agent to modify or view the in-progress line. When * multiple characters are available for input at one time, they're * processed in a block before (*jreadline_access)() is called. */ extern int (*jreadline_access)(unsigned char *line, unsigned char **dot, unsigned char **eol); /* * used to set the prompt >during readline processing<. To be called from * a function that's called from within readline, such as the romaji * converter or the jreadline_access function. */ const unsigned char *jreadline_mod_prompt(const unsigned char *new); /* The prompt used by the current (if in-execution) or last readline(). */ extern const unsigned char *jreadline_last_prompt; /* Actually defined in std_romaji.c... rather a kludge to be here */ extern const char *std_romaji_allowed_nonletters(const char *new); extern void std_romaji_converter(const unsigned char *start_of_line, const unsigned char *bufend, unsigned char **dot_p, const unsigned char **eol_p, int force, int eat_leading_slash); /* * The input buffer is static and of limited size. 
* An input line is not allowed beyond this size. */ #ifndef MAX_INPUT_LINE_LENGTH # define MAX_INPUT_LINE_LENGTH 200 #endif /* * Used to set (or enquire the current) type of encoding used for high-bit * set characters. Returns the previous value, and sets a new value if * selection is JREADLINE_EUC or JREADLINE_SJIS. Note that regular JIS * is always recognized. */ unsigned jreadline_highbit_input(unsigned selection); #define JREADLINE_EUC 1 #define JREADLINE_SJIS 2 #define JREADLINE_INQUIRE 3 #endif /* file wrapper */ lookup-1.08b.orig/lib/euc.h0100644000014400001440000000236206076503447015211 0ustar nakaharastaff#ifndef __EUC_H__ /* file wrapper */ #define __EUC_H__ /* * Jeffrey Friedl * Omron Corporation オムロン(株) * Nagaokakyoshi, Japan 〒617長岡京下海印寺 * * jfriedl@nff.ncl.omron.co.jp * * This work is placed under the terms of the GNU General Purpose License * (the "GNU Copyleft"). ******************************************************************** * * Attempts to define some definitions useful for dealing with * Japanese EUC Packed Format text. * Codeset 0: ASCII (or JIS-Roman). Bytes/char: 1 * Codeset 1: JIS X 0208 Bytes/char: 2 * Codeset 0: Halfwidth katakana. Bytes/char: 2 * Codeset 0: JIS X 0212 Bytes/char: 3 */ /* Returns the codeset number for a character whose first byte is given. 
*/ #define EUC_CODESET(X) ((euc_info[X]>>_euc_codeset_shift)&_euc_codeset_mask) /* Returns the length of a character whose first byte is given */ #define EUC_CHAR_LENGTH(X) ((euc_info[X]>>_euc_bpc_shift)&_euc_bpc_mask) /****************************************************************************/ /* private stuff below */ extern unsigned const char euc_info[256]; #define _euc_bpc_shift 0 #define _euc_bpc_mask 3 #define _euc_codeset_shift 2 #define _euc_codeset_mask 3 #endif /* file wrapper */ lookup-1.08b.orig/lib/romaji2kana.h0100644000014400001440000000552006076512125016623 0ustar nakaharastaff#ifndef __ROMAJI2KANA_H__ /* file wrapper */ #define __ROMAJI2KANA_H__ /* * Jeffrey Friedl * Omron Corporation オムロン(株) * Nagaokakyoshi, Japan 〒617長岡京下海印寺 * * jfriedl@nff.ncl.omron.co.jp * * This work is placed under the terms of the GNU General Purpose License * (the "GNU Copyleft"). */ #define romaji2kana_version 101 /* 1.01 */ /* * Sets the mode of subsequent conversions. Returns the previous value. * Initial default is R2K_MIXED_MODE. */ extern unsigned r2k_setmode(unsigned newmode); #define R2K_MIXED_MODE 0 /* lower case is hira, upper is kata */ #define R2K_ALL_HIRA_MODE 1 /* case doesn't matter; always hira */ #define R2K_ALL_KATA_MODE 2 /* case doesn't matter; always kata */ /* * The following three routines are used to set the PASS, OMIT, and LONGVOWEL * strings. The previous value is returned. If a null is given as the * argument, only the current version is returned (and nothing is changed)> * * The PASS string lists ASCII characters which are to be ignored without * comment (non-ASCII are always ignored). The default is "\t ". * * The OMIT string lists characters which are to be ignored in the input. * The default is "'". This allows conversions of "shinichi" and "shi'nichi" * to work (the single quote in the romaji will force the partition to * SHI NI CHI rather than SHI N I CHI) yet not result in an error. 
* * The LONGVOWEL string lists characters that are to be taken as indicating * a long vowel. Default is "-^". If the output is katakana, a dash is * inserted into the output, while if hiragana the proper vowel. */ extern const char *r2k_setpass(const char *new); extern const char *r2k_setomit(const char *new); extern const char *r2k_setlongvowel(const char *new); extern unsigned r2k_setflag(unsigned new); #define R2K_UNCONVERTED_PUNC_OK 0x01 #define R2K_NONASCII_OK 0x02 #define R2K_ALLOW_LONG_O_WITH_H 0x04 #define R2K_ALLOW_M_FOR_N 0x08 struct romaji2kana_info { unsigned short k_buf_used; unsigned short modified; }; /* * Does a conversion of romaji bytes in the range from R to R_END, * depositing the potentially converted text to K, filling up to * K_BUF_LEN bytes. If return value is negative, indicates an error. * Otherwise, returns the number of bytes not converted (and not * OMITed). * * If INFO is non-null, it is filled in appropriately. K_BUF_USED is * the number of bytes that were written to K. MODIFIED is true if the * output is different from the input. * * If K is null, just reports via INFO what would have happened. */ extern int romaji2kana(const unsigned char *r, const unsigned char *r_end, unsigned char *k, unsigned k_buf_len, struct romaji2kana_info *info); #define R2K_BAD_ARGS -1 #define R2K_OVERFLOW -2 /* overflow of the output buffer */ #endif /* file wrapper */ lookup-1.08b.orig/lib/replace.c0100644000014400001440000001423606076512035016037 0ustar nakaharastaff/* * Jeffrey Friedl * Omron Corporation オムロン(株) * Nagaokakyoshi, Japan 〒617長岡京下海印寺 * * jfriedl@nff.ncl.omron.co.jp * * This work is placed under the terms of the GNU General Purpose License * (the "GNU Copyleft"). 
*/ #include "config.h" #include "assert.h" #include #include "jregex.h" #include "xmalloc.h" #include "replace.h" static unsigned _apply_substitution(const regex_t *compiled, unsigned char **new, unsigned char **new_end_match, const unsigned char *str, unsigned str_len, const unsigned char *startsearch, const unsigned char *replace) { unsigned char *nptr, *new_end; unsigned required_len; const unsigned char *ptr = replace; int i; if (regexec_paren_info_size < reg_max_paren_level_used(compiled)) return APP_SUB_PAREN_INFO_NOT_ENOUGH; if (i = regexec(compiled, startsearch, str_len-(startsearch - str)), i==0) return APP_SUB_DID_NOT_MATCH; #ifndef NO_REGEXEC_MATCH_POINTS required_len = (regexec_match_start - str) + (str + str_len) - regexec_match_end; #else required_len = 0; #endif /* * Calculate how much space we'll need for the new str. * Above we figure how much we'll need for what's not replaced, and * below we'll run through the replacement str looking for * things like "\&", "\1", etc., to be replaced by what matched. */ while (*ptr) { unsigned char c; if (*ptr != '\\') { /* just the character itself */ regular: required_len++; ptr++; continue; } /* a backslash-something... 
something in 'c' below */ c = *++ptr; #ifndef NO_REGEXEC_MATCH_POINTS if (c == '&') { /* add for entire bit that was matched */ required_len += regexec_match_end - regexec_match_start; ptr++; continue; } #endif /* at this point, only \1 through \9 valid */ if (!isascii(c) || !isdigit(c) || c == '0') goto regular; ptr++; c -= '1'; if (c >= regexec_paren_info_used) continue; if (regexec_paren_info[c].match_end == 0 || regexec_paren_info[c].match_start == 0) continue; /* add length of what was matched */ required_len += regexec_paren_info[c].match_end - regexec_paren_info[c].match_start; } /* finalize length; get memory; set pointers */ required_len += 1; /* final null */ nptr = *new = xmalloc(required_len); new_end = nptr + required_len; ptr = replace; #ifndef NO_REGEXEC_MATCH_POINTS /* if needed, copy from BOL to start-of-match over to new string */ if (regexec_match_start != str) { const unsigned char *sptr = str; while (sptr != regexec_match_start) *nptr++ = *sptr++; } #endif /* run through the replacement */ while (*ptr) { unsigned char c; assert(nptr < new_end); if (*ptr != '\\') { /* copy over the raw character */ *nptr++ = *ptr++; continue; } c = *++ptr; #ifndef NO_REGEXEC_MATCH_POINTS /* if \&, replace with matched text */ if (c == '&') { const unsigned char *sptr = regexec_match_start; while (sptr < regexec_match_end) *nptr++ = *sptr++; ptr++; continue; } #endif if (!isascii(c) || !isdigit(c) || c == '0') { *nptr++ = *ptr++; /* oops, not \1 through \9 */ continue; } c -= '1'; ptr++; if (regexec_paren_info[c].match_end != 0 && regexec_paren_info[c].match_start != 0) { const unsigned char *sptr = regexec_paren_info[c].match_start; while (sptr < regexec_paren_info[c].match_end) *nptr++ = *sptr++; } } /* note where in the new string the replacement ends */ *new_end_match = nptr; #ifndef NO_REGEXEC_MATCH_POINTS /* if needed, copy over from end-of-match to EOL */ if (regexec_match_end < str + str_len) { const unsigned char *sptr = regexec_match_end; while (sptr < 
str + str_len)
            *nptr++ = *sptr++;
    }
#endif
    *nptr++ = '\0';
    assert(nptr == new_end);
    return APP_SUB_SUCCESS;
}

/*
 * apply_substitution
 *
 * Replace matches of COMPILED in STR (STR_LEN bytes) with REPLACE.
 * One substitution is always attempted; further ones are attempted while
 * --COUNT is non-zero and the previous replacement did not end at the
 * terminating NUL of the string built so far.
 *
 * *NEW receives the resulting string.  If MATCHCOUNT is non-null,
 * *MATCHCOUNT receives the number of substitutions that succeeded.
 *
 * Returns the status of the first substitution if it fails; otherwise
 * APP_SUB_SUCCESS, even when a later pass fails (the loop just stops).
 */
unsigned apply_substitution(const regex_t *compiled,
                            unsigned char **new,
                            unsigned *matchcount,
                            const unsigned char *str,
                            unsigned str_len,
                            const unsigned char *replace,
                            unsigned count)
{
    unsigned char *old;
    unsigned char *new_end_match;
    unsigned dummy;
    int i;

    /* let the caller pass a null MATCHCOUNT when the count isn't wanted */
    if (matchcount == 0)
        matchcount = &dummy;

    /*
     * First pass works on the caller's string.  The sixth argument is
     * presumably the position to start matching from -- TODO confirm
     * against the head of _apply_substitution.
     */
    i = _apply_substitution(compiled, new, &new_end_match,
                            str, str_len, str, replace);
    *matchcount = 0;
    if (i != APP_SUB_SUCCESS)
        return i;
    (*matchcount)++;

    /* later passes work on the previous result, resuming after the last
       replacement */
    while (--count && *new_end_match != '\0')
    {
        old = *new;
        i = _apply_substitution(compiled, new, &new_end_match,
                                *new, strlen((void*)*new), new_end_match,
                                replace);
        if (i != APP_SUB_SUCCESS)
            break;
        (*matchcount)++;
        free(old);   /* the previous intermediate result is now obsolete */
    }
    return APP_SUB_SUCCESS;
}

/*
 * sub
 *
 * Convenience wrapper: compile PATTERN with FLAGS, apply REPLACE to STR
 * (STRINGLEN bytes) as apply_substitution() does with a count of TIMES,
 * and release the compiled regex.
 *
 * Returns the new string, or 0 if the pattern does not compile or the
 * substitution does not report APP_SUB_SUCCESS.
 */
unsigned char *
sub(const unsigned char *str,
    unsigned stringlen,
    const unsigned char *pattern,
    const unsigned char *replace,
    unsigned flags,
    unsigned times)
{
    regex_t R;
    unsigned char *new = 0;
    int i = regcomp(&R, pattern, flags);

#ifdef REGCOMP_SAVE_MATCHED_PAREN_INFO
    /* the pattern needs paren info the caller didn't ask for: retry with it */
    if (i == REGCOMP_NEED_SAVE_PAREN_INFO &&
        !(flags & REGCOMP_SAVE_MATCHED_PAREN_INFO))
        i = regcomp(&R, pattern, flags|REGCOMP_SAVE_MATCHED_PAREN_INFO);
#endif
    if (i != REGCOMP_SUCCESS)
        return 0;
    if (apply_substitution(&R, &new, 0, str, stringlen, replace, times)
        != APP_SUB_SUCCESS)
    {
        new = 0;
    }
    regfree(&R);
    return new;
}

#ifdef TEST
/* Test driver: pattern, replacement and subject string default when the
   corresponding command-line argument is absent. */
int main(int argc, char *argv[])
{
    const unsigned char *pattern = argc > 1 ? argv[1] : "a";
    const unsigned char *replace = argc > 2 ? argv[2] : ">A<";
    const unsigned char *str = argc > 3 ?
argv[3] : "xabxyzxaaaawz"; unsigned char *new; int i; regexec_paren_info_size = 10; regexec_paren_info = xmalloc(sizeof(*regexec_paren_info) * regexec_paren_info_size); constant_sub(i, pattern, REGCOMP_SAVE_MATCHED_PAREN_INFO, str, strlen((void*)str), replace, 1000, new); if (i != APP_SUB_SUCCESS) { die("apply returns %d\n", i); } outputf("pattern is 「%s」\n", pattern); outputf("replace is 「%s」\n", replace); outputf("string is 「%s」\n", str); outputf("result is 「%s」\n", new); return 0; } #endif lookup-1.08b.orig/lib/jreadline.c0100644000014400001440000006501306076504276016370 0ustar nakaharastaff/* * Jeffrey Friedl * Omron Corporation オムロン(株) * Nagaokakyoshi, Japan 〒617長岡京下海印寺 * * jfriedl@nff.ncl.omron.co.jp * * This work is placed under the terms of the GNU General Purpose License * (the "GNU Copyleft"). */ #include "config.h" #include "assert.h" #include #include #include #include "output.h" #include "jreadline.h" #include "strsave.h" #include "input.h" #include "xmalloc.h" #include "euc.h" #include "system.h" #if defined(_HAVE_STRINGS_H_) /* might be defined in system.h */ # include #else # include # define index strchr # define rindex strrchr #endif #if !defined(__GNUC__) # if !defined(__volatile__) # define __volatile__ /*nothing; for use with volatile functions */ # endif # if !defined(__inline__) # define __inline__ /*nothing; for use with volatile functions */ # endif #endif #define array_elements(array) (sizeof(array)/sizeof(array[0])) /* idea and rough implementation for HANDLE_SIGNALS from GNU's readline */ #if !defined(NO_HANDLE_SIGNALS) && !defined(HANDLE_SIGNALS) # define HANDLE_SIGNALS #endif #ifndef MOVE_MEMORY /* this must be a "safe" memory copy */ #define MOVE_MEMORY(FROM, TO, LENGTH) \ (void)bcopy((const char*)(FROM), (char*)(TO), (int)(LENGTH)) #endif #define bindfunc static /* maybe someday we'll allow outside binding */ /* * The input buffer is where all input and editing are done. * It is accessed by * dot -- buffer pointer. 
This points just beyond the "cursor"
 *                   character.  Inserts are done at this pointer.
 *                   Unless one has used the editing commands to move
 *                   backward, dot will usually be the same as end_of_line.
 *
 *   end_of_line  -- end of the used input buffer. Points just beyond the
 *                   last character on the current input line.
 */
/* one extra byte so the text can always be NUL-terminated in place */
static unsigned char start_of_line[MAX_INPUT_LINE_LENGTH + 1];
static unsigned char *dot = start_of_line; /* dot = "cursor" pointer */
static unsigned char *end_of_line = start_of_line; /* points beyond last char*/

/*
 * Some macros to check the status of the start_of_line with respect to size.
 *
 *  ok_to_insert()
 *      is false if a character can't be added to the buffer.
 *  ok_to_insert2(), ok_to_insert3()
 *      the same test for two-byte and three-byte characters.
 *  ok_to_move_forward
 *      is false if one is already at the end of the line.
 *  ok_to_move_backward
 *      is false if one is already at the beginning of the line.
 */
#define line_length()         (end_of_line - start_of_line)
#define bytes_into_line()     (dot - start_of_line)
#define bytes_from_end()      (end_of_line - dot)
#define eol()                 (dot == end_of_line)
#define bol()                 (dot == start_of_line)
#define ok_to_insert()        (line_length() < MAX_INPUT_LINE_LENGTH)
#define ok_to_insert2()       (line_length() < (MAX_INPUT_LINE_LENGTH-1))
#ifdef JIS0212_SUPPORT
#define ok_to_insert3()       (line_length()+1 < (MAX_INPUT_LINE_LENGTH-2))
#endif
#define ok_to_move_forward()  (!eol())
#define ok_to_move_backward() (!bol())

/*
 * Returns the number of bytes in the character starting at PTR.
 * PTR must lie inside the used part of the input buffer.
 */
static __inline__ int char_size(unsigned char *ptr)
{
    int size;
    kibishii_assert(ptr >= start_of_line && ptr < end_of_line);
    size = EUC_CHAR_LENGTH(*ptr);
    if (size == 0)
    {
        /*
         * This should never happen with normal input, but major problems if
         * it does, so better take care of it here.
         */
        kibishii_assert(size);
        return 1;   /* treat the unknown byte as a one-byte character */
    }
    return size;
}

/*
 * Returns the number of bytes in the character before the one starting at MARK.
*/ static __inline__ int prev_char_size(unsigned char *mark) { unsigned char *ptr; int lastsize = 1; kibishii_assert(mark > start_of_line && mark <= end_of_line); for (ptr = start_of_line; ptr < mark; ptr += (lastsize=char_size(ptr))) ; kibishii_assert(mark == ptr); return lastsize; } /* * The history mechanism is a simple linked list of command texts. */ struct history_struct { struct history_struct *h_prev, /* command previous to this one */ *h_next; /* command that happend after this one */ const unsigned char *h_text; /* text of the command */ }; /* head->prev is the last command, head->prev->prev is the one before that */ static struct history_struct history_head = {&history_head, &history_head, 0 }; /* * Current_history_pointer used by PREV, NEXT, etc for movement, and is * reset at each ENTER to be the head. */ static struct history_struct *current_history_ptr; const unsigned char *jreadline_last_prompt = (void *)"> "; const unsigned char * jreadline_mod_prompt(const unsigned char *new) { const unsigned char *old = jreadline_last_prompt; jreadline_last_prompt = new; return old; } /* * Given the text (of the most recently executed command), add it to the * head of the history list. */ void add_history(const unsigned char *text) { struct history_struct *new; /* allocate and fill the new history struct */ new = (struct history_struct *)xmalloc(sizeof(struct history_struct)); new->h_text = strsave((void*)text); /* link it in */ new->h_prev = history_head.h_prev; new->h_prev->h_next = new; history_head.h_prev = new; history_head.h_prev->h_next = &history_head; } static unsigned jreadline_active; /* true if getting an input line */ static unsigned waiting_for_input; /* true if waiting for a character */ static unsigned jreadline_display_width = 80; #define max(x,y) ((x)>(y)?(x):(y)) #define min(x,y) ((x)<(y)?(x):(y)) /* * Print a copy of the current input buffer to the output. * If newline is true, one is appended. 
* * This is called each time the line is changed, so may be thought * of as updating the printed version of the internal buffer. */ static void update_display(int newline) { static int printed_last_time = 0; int text_len = end_of_line - start_of_line; int prompt_len = strlen((void*)jreadline_last_prompt); int total_len = prompt_len + text_len; int starting_index, ending_index; int len_printed_before_dot; int len_printed_after_dot; int printed_this_time; int dot_index = prompt_len + (dot - start_of_line); int print_only_until_dot = 0; if (total_len < (jreadline_display_width - 2)) { starting_index = 0; ending_index = total_len; } else { unsigned available_width = jreadline_display_width - 1; ending_index = dot_index + (available_width / 2); if (ending_index > total_len) ending_index = total_len; starting_index = ending_index - (available_width - 2); if (starting_index < 0) starting_index = 0; if (starting_index > 0) { /* first, save room for an initial "…" */ starting_index += 2; available_width -= 2; /* * Too much to print, must pick some part of prompt+line to * display. We propose to start printing at STARTING_INDEX, but * must make sure that's not in the middle of a multi-byte * character. If it is, we'll bump it up to the start of the * next character. */ if (starting_index < prompt_len) { /* * Proposed starting place is in the prompt... check from the * beginning of the prompt to the proposed location... */ int i = 0; while (i < starting_index) { if (jreadline_last_prompt[i] & 0x80) { #ifdef JIS0212_SUPPORT if (jreadline_last_prompt[i] == /*CodeSet3*/ 143) i += 3; else #endif i += 2; } else i++; } starting_index = i; } else { /* * Proposed starting place is in the line somewhere... * check down the line to the proposed starting index. 
*/ int i = 0; while (prompt_len + i < starting_index) { unsigned char c = start_of_line[i]; if (c & 0x80) { #ifdef JIS0212_SUPPORT if (c == /*CodeSet3*/ 143) i += 3; else #endif i += 2; } else i ++; } starting_index = prompt_len + i; } } ending_index = starting_index + available_width; if (ending_index > total_len) ending_index = total_len; else ending_index -= 2; /* save room for a final "…" */ } soft_assert(dot_index >= starting_index); soft_assert(dot_index <= ending_index); output_pager_transparent(1); again: outchar('\r'); len_printed_after_dot = 0; if (starting_index == 0) len_printed_before_dot = 0; else { len_printed_before_dot = 2; output("…"); } /* output any prompt */ if (starting_index < prompt_len) { const unsigned char *this_start = jreadline_last_prompt + starting_index, *this_end = jreadline_last_prompt + min(prompt_len, ending_index); while (this_start < this_end) { unsigned char c = *this_start; if (!(c & 0x80)) { outchar(c); this_start++; len_printed_before_dot++; } else { if (this_start + 2 > this_end) break; outchar(c); this_start++; #ifdef JIS0212_SUPPORT if (c == 143) outchar(*(this_start++)); #endif outchar(*(this_start++)); len_printed_before_dot += 2; } } if (this_start > this_end) len_printed_before_dot += (this_start - this_end); } if (ending_index > prompt_len) { const unsigned char *this_start, *this_end = start_of_line + (ending_index - prompt_len); if (starting_index <= prompt_len) this_start = start_of_line; else this_start = start_of_line + (starting_index - prompt_len); while (this_start < this_end) { unsigned char c = *this_start; if (!(c & 0x80)) { outchar(c); if (this_start < dot) len_printed_before_dot++; else len_printed_after_dot++; this_start++; } else { if (this_start + 2 > this_end) break; outchar(c); if (this_start < dot) len_printed_before_dot += 2; else len_printed_after_dot += 2; this_start++; #ifdef JIS0212_SUPPORT if (c == 143) outchar(*(this_start++)); #endif outchar(*(this_start++)); } } } if 
(print_only_until_dot) goto done; if (ending_index != total_len) { output("…"); len_printed_after_dot += 2; } printed_this_time = len_printed_before_dot + len_printed_after_dot; if (printed_this_time < printed_last_time) { int needed = min(printed_last_time, jreadline_display_width) - printed_this_time; len_printed_after_dot += needed; while (needed--) outchar(' '); } printed_last_time = printed_this_time; if (len_printed_after_dot && !newline) { if (len_printed_after_dot > len_printed_before_dot) { ending_index = dot_index; print_only_until_dot = 1; goto again; } else { while (len_printed_after_dot--) outchar('\b'); } } done: if (newline) outchar('\n'); else flush_output(); output_pager_transparent(0); } unsigned set_jreadline_width(unsigned new) { unsigned old = jreadline_display_width; jreadline_display_width = new; if (waiting_for_input) update_display(0); return old; } /* * Return values for the various functions that get invoked directly * by a users typing a character. * * These functions, all in the form * int bind_FUNCTION(unsigned char c) * * Are called with the character that was entered to invoke the command. * For most such commands, the character is ignored. 
*/ #define CONTINUE 1 /* continue accepting input */ #define END_OF_INPUT 2 /* EOF ...*/ #define RETURN_TO_USER 3 /* ENTER */ int (*jreadline_access)(unsigned char *,unsigned char **,unsigned char **) = 0; #ifndef NO_AUTO_ROMAJI_CONVERSION int jreadline_auto_romaji = 0; static romaji_converter_t romaji_converter; romaji_converter_t set_romaji_converter(romaji_converter_t new) { romaji_converter_t old = romaji_converter; romaji_converter = new; return old; } #define romaji_conversion(force) \ macro_start { \ if (romaji_converter) \ (*romaji_converter)(start_of_line, \ &start_of_line[sizeof(start_of_line)], \ &dot, \ (const unsigned char **)&end_of_line, \ (force)); \ } macro_end bindfunc int bind_convert_romaji(unsigned char c) { romaji_conversion(1); return CONTINUE; } #endif /* Ring the bell (as some sign of user error) */ bindfunc int bind_ring_bell(unsigned char c) { outchar('\007'); flush_output(); return CONTINUE; } /* select the next command in the history list, if there is one */ bindfunc int bind_next_history_line(unsigned char c) { if (current_history_ptr->h_next == &history_head) (void)bind_ring_bell(c); /* oops, not one there */ else { int i = 0; current_history_ptr = current_history_ptr->h_next; /* * Copy the text to the input buffer, setting dot * and end_of_line to their proper values. */ dot = start_of_line; while (c = current_history_ptr->h_text[i++], c != 0) *dot++ = c; ; end_of_line = dot; } return CONTINUE; } /* select the previous command in the history list, if there is one */ bindfunc int bind_previous_history_line(unsigned char c) { if (current_history_ptr->h_prev == &history_head) (void)bind_ring_bell(c); /* not one */ else { int i = 0; current_history_ptr = current_history_ptr->h_prev; /* * Copy the text to the input buffer, setting dot * and end_of_line to their proper values. 
*/ dot = start_of_line; while (c = current_history_ptr->h_text[i++], c != 0) *dot++ = c; end_of_line = dot; } return CONTINUE; } /* Redraw the current input line */ bindfunc int bind_redraw(unsigned char c) { update_display(1); return CONTINUE; } /* the ENTER command */ bindfunc int bind_EOL(unsigned char c) { return RETURN_TO_USER; } /* the EOF "command" */ static int at_EOF(void) { if (end_of_line - start_of_line == 0) return END_OF_INPUT; else return RETURN_TO_USER; /* if there's input, use it */ } /* destructive backspace. */ bindfunc int bind_delete_char_backward(unsigned char c) { if (!ok_to_move_backward()) (void)bind_ring_bell(c); /* can't move back */ else { unsigned size = prev_char_size(dot); /* * Delete char to the left of the cursor. * If there's text to the right, it must be shifted * over. */ if (ok_to_move_forward()) MOVE_MEMORY(/*from*/dot, /*to*/dot-size, end_of_line - dot); dot -= size; end_of_line -= size; } return CONTINUE; } /* gobble the current character -- opposit of the above */ bindfunc int bind_delete_char_forward(unsigned char c) { if (!ok_to_move_forward()) return bind_delete_char_backward(c); else { unsigned size = char_size(dot); /* delete the char under the cursor */ MOVE_MEMORY(/*from*/dot + size, /*to*/dot, end_of_line - dot); end_of_line -= size; } return CONTINUE; } /* * Move forward one character, if possible. 
* If not, return an at_EOF */ bindfunc int bind_delete_char_forward_or_EOF(unsigned char c) { if (ok_to_move_forward()) return bind_delete_char_forward(c); else if (ok_to_move_backward()) return bind_delete_char_backward(c); else return at_EOF(); } /* move forward one character, if possible */ bindfunc int bind_forward_character(unsigned char c) { if (!ok_to_move_forward()) (void)bind_ring_bell(c); else dot += char_size(dot); return CONTINUE; } /* move back one character, if possible */ bindfunc int bind_backward_character(unsigned char c) { if (!ok_to_move_backward()) (void)bind_ring_bell(c); else dot -= prev_char_size(dot); return CONTINUE; } /* Insert the given character into the current input buffer, if possible */ bindfunc int bind_self_insert(unsigned char c) { if (!ok_to_insert()) (void)bind_ring_bell(c); else { /* push any characters that are to the right of the cursor */ int length = end_of_line - dot; if (length > 0) MOVE_MEMORY(/*from*/dot, /*to*/dot + 1, length); /* add the character */ *dot++ = c; /* bump up the end pointer, too */ end_of_line++; #ifndef NO_AUTO_ROMAJI_CONVERSION if (jreadline_auto_romaji && isascii(c) && !isspace(c)) romaji_conversion(0); #endif } return CONTINUE; } /* Insert the given characters into the current input buffer, if possible */ bindfunc int bind_self_insert2(unsigned char c1, unsigned char c2) { if (!ok_to_insert2()) (void)bind_ring_bell(c1); else { /* push any characters that are to the right of the cursor */ int length = end_of_line - dot; if (length > 0) MOVE_MEMORY(/*from*/dot, /*to*/dot + 2, length); /* add the characters */ dot[0] = c1; dot[1] = c2; dot += 2; /* bump up the end pointer, too */ end_of_line += 2; } return CONTINUE; } #ifdef JIS0212_SUPPORT /* Insert the given characters into the current input buffer, if possible */ bindfunc int bind_self_insert3(unsigned char c1, unsigned char c2, unsigned char c3) { if (!ok_to_insert3()) (void)bind_ring_bell(c1); else { /* push any characters that are to the right 
of the cursor */ int length = end_of_line - dot; if (length > 0) MOVE_MEMORY(/*from*/dot, /*to*/dot + 3, length); /* add the characters */ dot[0] = c1; dot[1] = c2; dot[2] = c3; dot += 3; /* bump up the end pointer, too */ end_of_line += 3; } return CONTINUE; } #endif /* Tabs become spaces... */ bindfunc int bind_tab(unsigned char c) { return bind_self_insert(' '); } /* move to the end of the line */ bindfunc int bind_end_of_line(unsigned char c) { dot = end_of_line; return CONTINUE; } /* delete to the end of the line */ bindfunc int bind_kill_to_end(unsigned char c) { end_of_line = dot; return CONTINUE; } /* move to the start of the line */ bindfunc int bind_start_of_line(unsigned char c) { dot = start_of_line; return CONTINUE; } /* delete to the start of the line */ bindfunc int bind_kill_to_start(unsigned char c) { int len = end_of_line - dot; if (len == 0) end_of_line = dot = start_of_line; else { MOVE_MEMORY(/*from*/dot, /*to*/start_of_line, len); dot = start_of_line; end_of_line = dot + len; } return CONTINUE; } static enum jis_mode { ascii, roman = ascii, /* we'll treat them the same */ jis78, jis83 = jis78, /* we'll treat them the same */ jis90 = jis78, /* we'll treat them the same */ #ifdef JIS0212_SUPPORT jis_0212, #endif hw_kata } jis_mode; static enum hi_bit_mode { EUC, SJIS } hi_bit_mode = EUC; unsigned jreadline_highbit_input(unsigned selection) { unsigned old = (hi_bit_mode == EUC) ? 
JREADLINE_EUC : JREADLINE_SJIS; if (selection == JREADLINE_EUC) hi_bit_mode = EUC; else if (selection == JREADLINE_SJIS) hi_bit_mode = SJIS; return old; } static int bind_have_escape(unsigned char x) { struct { const unsigned char *string; enum jis_mode mode; int marker; } escape[] = { { (const unsigned char *)"$@", jis78 }, { (const unsigned char *)"$B", jis83 }, { (const unsigned char *)"$&@\33$B", jis90 }, #ifdef JIS0212_SUPPORT { (const unsigned char *)"$(D", jis_0212 }, #endif { (const unsigned char *)"(J", roman }, { (const unsigned char *)"(H", roman }, { (const unsigned char *)"(B", ascii }, { (const unsigned char *)"(I", hw_kata }, }; #define escapes array_elements(escape) int i, j = 0, count; for (i = 0; i < escapes; i++) escape[i].marker = 1; do { unsigned char c = next_raw_input_byte(); for (count = i = 0; i < escapes; i++) if (escape[i].marker) { if (escape[i].string[j] != c) escape[i].marker = 0; else if (escape[i].string[j+1] == '\0') { jis_mode = escape[i].mode; return CONTINUE; } else count++; } j++; } while(count); return bind_ring_bell(0); } static int (*bind_action[])(unsigned char c) = { #ifndef NO_AUTO_ROMAJI_CONVERSION /* 0 : '^@'*/ bind_convert_romaji, #else /* 0 : '^@'*/ bind_ring_bell, #endif /* 1 : '^A'*/ bind_start_of_line, /* 2 : '^B'*/ bind_backward_character, /* 3 : '^C'*/ bind_ring_bell, /* 4 : '^D'*/ bind_delete_char_forward_or_EOF, /* 5 : '^E'*/ bind_end_of_line, /* 6 : '^F'*/ bind_forward_character, /* 7 : '^G'*/ bind_delete_char_forward, /* 8 : '^H'*/ bind_delete_char_backward, /* 9 : '^I'*/ bind_tab, /* 10 : '^J'*/ bind_EOL, /* 11 : '^K'*/ bind_kill_to_end, /* 12 : '^L'*/ bind_redraw, /* 13 : '^M'*/ bind_EOL, /* 14 : '^N'*/ bind_next_history_line, /* 15 : '^O'*/ bind_ring_bell, /* 16 : '^P'*/ bind_previous_history_line, /* 17 : '^Q'*/ bind_ring_bell, /* 18 : '^R'*/ bind_redraw, /* 19 : '^S'*/ bind_ring_bell, /* 20 : '^T'*/ bind_ring_bell, /* 21 : '^U'*/ bind_kill_to_start, /* 22 : '^V'*/ bind_ring_bell, /* 23 : '^W'*/ 
bind_ring_bell, /* 24 : '^X'*/ bind_ring_bell, /* 25 : '^Y'*/ bind_ring_bell, /* 26 : '^Z'*/ bind_ring_bell, /* 27 : ''*/ bind_have_escape, /* 28 : ''*/ bind_ring_bell, /* 29 : ''*/ bind_ring_bell, /* 30 : */ bind_ring_bell, /* 31 : */ bind_ring_bell, }; #ifdef HANDLE_SIGNALS #include #ifndef SIG_TYPE # ifdef __GNUC__ # define SIG_TYPE __typeof__(SIG_DFL) # else typedef void (*SIG_TYPE)(); /* take a guess with it */ # endif #endif static jmp_buf top_level; SIG_TYPE sig_int; /* interrupt */ SIG_TYPE sig_quit; /* quit */ SIG_TYPE sig_ill; /* illegal instruction */ SIG_TYPE sig_fpe; /* floating */ SIG_TYPE sig_bus; /* bus error */ SIG_TYPE sig_segv; /* segmentation error */ #ifdef SIGTSTP SIG_TYPE sig_tstp; /* tty stop */ #endif static void release_signals(void) { (void)signal(SIGINT, sig_int); (void)signal(SIGQUIT, sig_quit); (void)signal(SIGILL, sig_ill); (void)signal(SIGFPE, sig_fpe); (void)signal(SIGBUS, sig_bus); (void)signal(SIGSEGV, sig_segv); #ifdef SIGTSTP (void)signal(SIGTSTP, sig_tstp); #endif } #ifdef luna88k static int signal_handler(int sig, int code, struct sigcontext *context); #else static int signal_handler(int sig, int code); #endif static void grab_signals(void) { sig_int = signal(SIGINT, (void*)signal_handler); sig_quit = signal(SIGQUIT, (void*)signal_handler); sig_ill = signal(SIGILL, (void*)signal_handler); sig_fpe = signal(SIGFPE, (void*)signal_handler); sig_bus = signal(SIGBUS, (void*)signal_handler); sig_segv = signal(SIGSEGV, (void*)signal_handler); #ifdef SIGTSTP sig_tstp = signal(SIGTSTP, (void*)signal_handler); #endif } #ifdef luna88k static int signal_handler(int sig, int code, struct sigcontext *context) #else static int signal_handler(int sig, int code) #endif { reset_tty_state(); release_signals(); kill(getpid(), sig); #if !defined(__svr4__) && !defined(__DGUX__) sigsetmask (0); #endif /* if we make it back... 
*/ #ifdef SIGTSTP if (sig == SIGTSTP) { update_display(0); set_tty_state_to_cbreak(); grab_signals(); return 0; } #endif longjmp(top_level, 1); return 0; /* notreached */ } #endif /* HANDLE_SIGNALS */ /* * Get one line of input. */ unsigned char *readline(const unsigned char *prompt) { int action = CONTINUE; /* "may be clobbered by longjmp" OK here */ /* * reset the input buffer pointers to point to a clear * input area. */ end_of_line = dot = start_of_line; /* want history searches to start from here */ current_history_ptr = &history_head; jreadline_last_prompt = prompt ? prompt : (void*)""; /* print the initial prompt if was given one */ jreadline_active = 1; /* GRAB THE TTY */ set_tty_state_to_cbreak(); #ifdef HANDLE_SIGNALS if (setjmp(top_level)) { jreadline_active = 0; return strsave((const unsigned char *)""); } grab_signals(); #endif jis_mode = ascii; while (action == CONTINUE) { unsigned char c1, c2; /* no need to redisplay if more input ready */ if (!input_pending()) { if (jreadline_access) (*jreadline_access)(start_of_line, &dot, &end_of_line); update_display(0); } c1 = next_raw_input_byte(); #define retry_c1_with(X) \ macro_start { \ c1 = (X); \ goto use_c1; \ } macro_end use_c1: if (c1 & 0x80) { /* SJIS or EUC */ if (hi_bit_mode == EUC) { if (c2 = next_raw_input_byte(), (c2 & 0x80) == 0) retry_c1_with(c2); #ifdef JIS0212_SUPPORT if (c1 == 0x143 /* Code Set 3 */) { unsigned char c3; /* have a JIX 0212 */ if (c3 = next_raw_input_byte(), (c3 & 0x80) == 0) retry_c1_with(c3); action = bind_self_insert3(c1,c2,c3); } else #endif if (c1 == 142 /* Code Set 2 */) { /* should convert to full-width here */ action = bind_self_insert2(c1,c2); } else { action = bind_self_insert2(c1,c2); } } else { /* Shift-JIS */ if (c1 >= 161 && c1 <= 223) { /* should convert to full-width here */ action = bind_self_insert2(142, c1); } else { if (c2 = next_raw_input_byte(), c2 < 64) retry_c1_with(c2); /* convert c1:c2 to EUC */ action = bind_self_insert2( (((c1 - (c1<160 ? 
112:176))<<1) - (c2<159)) | 0x80, (c2 - (c2<159 ? (c2>127?32:31) : 126)) | 0x80); } } } else if (c1 == 127) action = bind_delete_char_backward(c1); else if (c1 < array_elements(bind_action)) { jis_mode = ascii; /* probably should do this */ action = (*bind_action[(int)(unsigned char)c1])(c1); } else if (jis_mode == ascii) action = bind_self_insert(c1); else if (jis_mode == hw_kata) { bind_self_insert2(142, c1 | 0x80); } else { c2 = next_raw_input_byte(); #ifdef JIS0212_SUPPORT if (jis_mode == jis_0212) bind_self_insert3(143, c1|0x80, c2|0x80); else #endif bind_self_insert2(c1|0x80, c2|0x80); } *end_of_line = 0; /* cap off the text -- now a string */ } while (action == CONTINUE); #ifndef NO_AUTO_ROMAJI_CONVERSION /* * To fake-force any final conversion that might need to be * done ('n', 'h', etc.), tack on an 'x' (a rather safe character) * and offer a conversion. Then take back off. */ if (jreadline_auto_romaji) { *end_of_line++ = 'x'; *(dot = end_of_line) = '\0'; romaji_conversion(0); *(dot = --end_of_line) = '\0'; } #endif update_display(1); /* show final line */ reset_tty_state(); #ifdef HANDLE_SIGNALS release_signals(); #endif jreadline_active = 0; return action == END_OF_INPUT ? NULL : strsave(start_of_line); } lookup-1.08b.orig/lib/config.h0100644000014400001440000001652006173464700015677 0ustar nakaharastaff#ifndef __CONFIG_H__ /* file wrapper */ #define __CONFIG_H__ /* * Jeffrey Friedl * Omron Corporation オムロン(株) * Nagaokakyoshi, Japan 〒617長岡京下海印寺 * * jfriedl@nff.ncl.omron.co.jp * * This work is placed under the terms of the GNU General Purpose License * (the "GNU Copyleft"). */ /* Some overall control of how the library is built. The default is that this entire file is either comments or ifdef'ed out. However, you can set or modify things here as you like. After each comment is an example setting (for many things with values, perhaps the default as it exists at the time of this writing... 
for many yes-no things, the opposite of the default) surrounded by #if 0 #endif The example is just that, an example... don't make changes without understanding what you're doing, of course. ----------------------------------------------------------------------------- */ /* apply_regex.c */ /* * APPLY_REGEX_MAX_CHAR_COUNT * Tells up to how many characters to check for simultaneously when * doing a search. For example in a regex search of the pattern * "this is a|(the) pattern", any line must at least have the characters * " aehinpst" somewhere in it. Apply_regex will run down the linked lists * of the APPLY_REGEX_MAX_CHAR_COUNT least common characters required in * a pattern, and for any line in all of the lists, the regex will be * applied. * * For large databases with random data, a value more than one or two can * lead to significant speed increases. Diminishing returns probably hit * quickly, though. */ #if 0 # define APPLY_REGEX_MAX_CHAR_COUNT 10 #endif /* fuzzkana.c */ /* * SMALL_TSU_OK * Tells which consonant sounds are allowed to be preceded by a small tsu. */ #if 0 # define SMALL_TSU_OK (KID_K|KID_S|KID_T|KID_D|KID_P|KID_W|KID_M) #endif /* index.c */ /* * USE_SHORT_INDEX_COUNTS * If true, an 'elementcount' will be 'unsigned short', 'unsigned' otherwise. * The size of a count variable can limit the number of lines that an index * can refer to for any specific character (65535 lines for two-byte shorts, * for example). Characters on "too many" lines will be automatically omitted * from the index. * * This is merely a size-of-index vs. index-ability issue. */ #if 0 # define USE_SHORT_INDEX_COUNTS 1 #endif /* jreadline.c */ /* * NO_HANDLE_SIGNALS * The jreadline package will normally catch signals, clean up the tty * state, then deliver them to the program as if jreadline wasn't there. * Define this to remove jreadline's doing anything with signals. 
*/ #if 0 # define NO_HANDLE_SIGNALS #endif /* * NO_AUTO_ROMAJI_CONVERSION * The jreadline package normally provides automatic romaji->kana conversion * (the "access" program doesn't use it, however). Defining this removes the * automatic conversion. */ #if 0 # define NO_AUTO_ROMAJI_CONVERSION #endif /* * NON_ANSIFIED_IOCTL * If sys/ioctl.h has not yet been ANSIfied, define this. */ #if 0 # define NON_ANSIFIED_IOCTL #endif /* * MAX_INPUT_LINE_LENGTH * Maximum length of an input line. */ #if 0 # define MAX_INPUT_LINE_LENGTH 200 #endif /* jregex.c */ /* * NO_REGEXEC_MATCH_POINTS * Removes support for regexec_match_start and regexec_match_end, * Which removes support for \& in replace as well. * The support is so painless that it's probably silly to define this. */ #if 0 # define NO_REGEXEC_MATCH_POINTS #endif /* * DONT_WORRY_ABOUT_KATAKANA_DASH_BEING_PART_OF_A_WORD * Normally the word-boundary checks will take special care to count * a katakana dash as part of a kanji word. This support is removed * by defining this. Unless you know you'll be doing a *lot* of word- * boundary stuff and that you'll *not* be dealing with a dash, probably * silly to remove this support. */ #if 0 # define DONT_WORRY_ABOUT_KATAKANA_DASH_BEING_PART_OF_A_WORD #endif /* * NO_PAREN_INFO * Defining this removes support for regexec_paren_info, etc., which * is the mechanism by which one can find out what \1, \2, \3, etc. are * in a pattern. This also removes the ability to use \1, \2, \3, etc. * *in* a pattern (such as in the pattern to find double-words: * "\<(\w+) \1\>"). * * Removing this support by defining this symbol reduces the size of * regcomp and regexec, although the speed of regexec will be little * affected when paren info isn't requested. */ #if 0 # define NO_PAREN_INFO #endif /* * NO_DEFAULT_PAREN_INFO * If defined, there will be no default paren_info stuff allocated. * Only matters if NO_PAREN_INFO is not defined, of course. 
*/ #if 0 #ifndef NO_PAREN_INFO #define NO_DEFAULT_PAREN_INFO #endif #endif /* * DEFAULT_PAREN_INFO_SIZE * Sets the size of the default paren_info. Only matters if both NO_PAREN_INFO * and NO_DEFAULT_PAREN_INFO are not defined. */ #if 0 #ifndef NO_PAREN_INFO #ifndef NO_DEFAULT_PAREN_INFO #define DEFAULT_PAREN_INFO_SIZE 10 #endif #endif #endif /* * NO_SHOWREGEX * Don't compile in the showregex() function. * I should just partition this out into another file. */ #if 0 # define NO_SHOWREGEX #endif /* * FAST_REGEXEC * If set, removes debug-ability from regexec so that it doesn't have to * waste time on checking to see if debugging is turned on. For maximum * regexec speed. */ #if 0 # define FAST_REGEXEC #endif /* * UNALIGNED_SHORT_ACCESS_OK * If a short pointer can be to any byte (instead of, say, requiring alignment * to a 2-byte boundary), set this to true for some small speed optimization. */ #if 0 # define UNALIGNED_SHORT_ACCESS_OK #endif /* romaji2kana.c */ /* * R2K_DEFAULT_FLAGS * Sets the default flags for the conversion. */ #if 0 # define R2K_DEFAULT_FLAGS R2K_UNCONVERTED_PUNC_OK|R2K_NONASCII_OK #endif /* std_romaji.c */ /* generic */ /* * MOVE_MEMORY(from, to, length) * Provides a save (i.e. OK to overlap) memory move. */ #define MOVE_MEMORY(FROM, TO, LENGTH) \ (void)bcopy((char*)(FROM), (char*)(TO), (int)(LENGTH)) /* * NDEBUG * If defined, removes assert() and many other debugging provisions. */ #if 0 # define NDEBUG #endif /* * USE_LOCAL_OUTPUT * If defined, uses local output library (rather than stdio) * and allows for switchable Japanese-output encoding. */ #define USE_LOCAL_OUTPUT /* * USE_LOCAL_INPUT * If true, use local input routines. Used to be useful, but no longer. */ #define USE_LOCAL_INPUT 0 /* * PROVIDE_PAGER * * If set to true, provides the means for a local pager (but only if * USE_LOCAL_OUTPUT also true. 
*/ #ifdef SERVER_CONFIG # define PROVIDE_PAGER 0 #else # define PROVIDE_PAGER 1 #endif /* * Macro helpers, to allow full blocks to appear like single statements. * * Used as in: * #define do_it(arg1, arg2) * macro_start { * int ARG1 = (arg1); * int ARG2 = (arg2); * blah blah blah. * } macro_end * * This would allow the use of do_it() even in situations such as * * if (something) * do_it(x,y); * else * blahblahblah(); * */ #ifndef macro_start # define macro_start do # define macro_end while (0) #endif /* some general system-V stuff */ #if defined(__svr4__) || defined(__DGUX__) # define index strchr # define rindex strrchr # define bcopy(FROM, TO, LENGTH) memcpy(TO, FROM, LENGTH) # if !defined(__DGUX__) /* DGUX memset is broken */ # define bzero(ADDR, LENGTH) memset(ADDR, LENGTH, 0) # endif #endif /*************************************************************************/ #endif /* file wrapper */ lookup-1.08b.orig/lib/std_romaji.c0100644000014400001440000001670506076512147016566 0ustar nakaharastaff/* * Jeffrey Friedl * Omron Corporation オムロン(株) * Nagaokakyoshi, Japan 〒617長岡京下海印寺 * * jfriedl@nff.ncl.omron.co.jp * * This work is placed under the terms of the GNU General Purpose License * (the "GNU Copyleft"). */ #include #include "config.h" #include "system.h" #if defined(_HAVE_STRINGS_H_) /* might be defined in system.h */ # include #else # include # define index strchr # define rindex strrchr #endif #include "romaji2kana.h" #include "jreadline.h" #include "strsave.h" #include "xmalloc.h" /* * romaji_converter * * This routine allows on-the-fly romaji->kana conversion when used * with my readline-ish package jreadline. To activate, just call * set_romaji_converter(std_romaji_converter); * once before calling readline(). 
*/ #ifndef MOVE_MEMORY /* this must be a "safe" memory copy */ #define MOVE_MEMORY(FROM, TO, LENGTH) \ (void)bcopy((char*)(FROM), (char*)(TO), (int)(LENGTH)) #endif #define line_length() (end_of_line - start_of_line) #define eol() (dot == end_of_line) #define bol() (dot == start_of_line) static const char *allowed_nonletters_in_romaji = (const char *)"-^'"; const char * std_romaji_allowed_nonletters(const char *new) { const char *old = allowed_nonletters_in_romaji; allowed_nonletters_in_romaji = new; return old; } int std_romaji_toggled_force = 1; /* * Give a chance to do automatic romaji-to-kana conversion. * We have some input text from START_OF_LINE to END_OF_LINE, with * the cursor at DOT. Attempt to convert any romaji just to the left * of DOT. If FORCE is false, this is an automatic invocation, and * we'll work slightly different. If FORCE is true, it's "real-time" * conversion. If the conversion would cause the end of the line to * expand past BUFEND, it is not done. * * An immediate subsequent FORCEs will undo the previous force * if 'std_romaji_toggled_force' is true. */ void std_romaji_converter(const unsigned char *start_of_line, const unsigned char *bufend, unsigned char **dot_p, const unsigned char **eol_p, int force, int eat_leading_slash) { unsigned char *dot = *dot_p; const unsigned char *end_of_line = *eol_p; unsigned char *ptr = &dot[-1]; static struct { unsigned char *start; unsigned char *orig; unsigned char *new; unsigned orig_len; unsigned new_len; } last_forced; if (std_romaji_toggled_force && force && last_forced.start + last_forced.new_len == dot) { if (!strncmp((void*)last_forced.start, (void*)last_forced.new, last_forced.new_len)) { /* swap orig and new and return */ int len_delta = last_forced.orig_len - last_forced.new_len; if (len_delta && dot != end_of_line) { MOVE_MEMORY(/* from */ dot, /* to */ dot + len_delta, /* count */ end_of_line - dot); } /* * Now put the new text. 
*/ MOVE_MEMORY(/* from */ last_forced.orig, /* to */ last_forced.start, /* count */ last_forced.orig_len); /* adjust pointers for any text size change */ *eol_p += len_delta; *dot_p += len_delta; /* swap old & new */ { unsigned char *temp_ptr = last_forced.orig; last_forced.orig = last_forced.new; last_forced.new = temp_ptr; } { unsigned temp_len = last_forced.orig_len; last_forced.orig_len = last_forced.new_len; last_forced.new_len = temp_len; } return; } } /* * If at the beginning of a line, or right after a non-ascii, * obviously nothing to convert. */ if (dot == start_of_line || !isascii(dot[-1])) return; /* * Search to the left of the cursor for a potential place to start * converting.... if a non-ascii is found before a non-letter, * non-allowed_nonletters_in_romaji char, we'll stop and convert. */ while (ptr != start_of_line && isascii(*ptr) && (isalnum(*ptr) || index(allowed_nonletters_in_romaji, *ptr))) { ptr--; } /* in automatic conversions, don't convert if the '/' has been escaped */ if (!force && ptr > start_of_line && ptr[-1] == '\\') return; /* a leading '/' or EUC means auto-convert) */ if (force || !isascii(*ptr) || *ptr == '/') { unsigned char kana[MAX_INPUT_LINE_LENGTH * 2]; unsigned romaji_len, kana_len; signed int len_delta; unsigned char *romaji_end; /* * Because of the special nature of 'n' in romaji, we don't want to * convert it "on the fly" if it's the last thing on the line... it * might well the start of "ne", etc. To enter "んね", the user would * have to type "n'e" or some other thing using one of the romaji2kana * packages OMIT characters. * Also allow for "nyu" etc. (for "にゅ"). * * Also watch out for 'h', since maybe allowing 'h' to be used * as an 'o' extender. Should probably watch out for 'm' as well..... 
*/ if (!force && (dot[-1] == 'n' || dot[-1] == 'N' || dot[-1] == 'h' || dot[-1] == 'H')) romaji_end = dot - 1; else if (!force && (dot[-1] == 'y' || dot[-1] == 'Y') && (&dot[-2] >= start_of_line) && (dot[-2] == 'n' || dot[-2] == 'N' || dot[-2] == 'h' || dot[-2] == 'H')) romaji_end = dot - 2; else romaji_end = dot; if (isascii(*ptr)) { if (*ptr == '/') ptr++; /* skip it */ if (ptr == romaji_end) return; /* nothing left to actually convert */ } else { /* * If we're up against an EUC, we'll give that as well * to the conversion routine. That's so if the first * char after that to be converted is a long vowel * indicator (such as '^' as in "To^kyo^"), the routine * will know which vowel to extend (it being the EUC char * we're including here). */ ptr--; /* include the last EUC char */ if (ptr + 2 == romaji_end) return; /* nothing left to actually convert */ } if (romaji2kana(ptr, romaji_end, kana, sizeof(kana), 0) < 0) return; /* oops */ kana_len = strlen((void*)kana); /* * Remove the leading slash as well if: * there *is* a leading slash. * not forced conversion. * the first kana character is not ascii. 
*/ if (kana_len && eat_leading_slash && !isascii(kana[0]) && ptr > start_of_line && ptr[-1] == '/' && (ptr == start_of_line || ptr[-2] != '\\') ) { ptr--; } romaji_len = romaji_end - ptr; len_delta = kana_len - romaji_len; /* abort if resulting kana would overflow */ if (end_of_line + len_delta >= bufend) return; if (std_romaji_toggled_force && force) { /* save info about what conversion is done */ if (last_forced.orig) free(last_forced.orig); last_forced.orig_len = romaji_len; last_forced.orig = xmalloc(last_forced.orig_len + 1); strcpy((void*)last_forced.orig, (void*)ptr); if (last_forced.new) free(last_forced.new); last_forced.new_len = kana_len; last_forced.new = xmalloc(last_forced.new_len + 1); strcpy((void*)last_forced.new, (void*)kana); last_forced.start = ptr; } /* * If kana is different size than romaji and there's text after * the romaji, move the stuff after appropriately. */ if (len_delta && romaji_end != end_of_line) MOVE_MEMORY(/* from */romaji_end, /* to */romaji_end + len_delta, /* count */end_of_line - romaji_end); /* * Now put the converted stuff. */ MOVE_MEMORY(/*from*/kana, /*to*/ptr, /*len*/kana_len); /* adjust pointers for any text size change */ end_of_line += len_delta; dot += len_delta; } *dot_p = dot; *eol_p = end_of_line; } lookup-1.08b.orig/lib/replace.h0100644000014400001440000000333706076512044016044 0ustar nakaharastaff#ifndef __REPLACE_H__ /* file wrapper */ #define __REPLACE_H__ /* * Jeffrey Friedl * Omron Corporation オムロン(株) * Nagaokakyoshi, Japan 〒617長岡京下海印寺 * * jfriedl@nff.ncl.omron.co.jp * * This work is placed under the terms of the GNU General Purpose License * (the "GNU Copyleft"). 
*/ #define APP_SUB_SUCCESS 0 #define APP_SUB_PAREN_INFO_NOT_ENOUGH 100 #define APP_SUB_DID_NOT_MATCH 101 extern unsigned apply_substitution(const regex_t *compiled, unsigned char **new, unsigned *matchcount, const unsigned char *str, unsigned str_len, const unsigned char *replace, unsigned count); unsigned char * sub(const unsigned char *str, unsigned stringlen, const unsigned char *pattern, const unsigned char *replace, unsigned flags, unsigned times); #define constant_sub(RESULT, PAT, PATFLAGS, STR, LEN, REPL, TIMES, NEW) \ macro_start { \ static int is_compiled = 0; \ static regex_t compiled; \ \ if (is_compiled == 0) { \ int i = regcomp(&compiled, (PAT), (PATFLAGS)); \ assert(i == 0); \ is_compiled = 1; \ } \ (RESULT) = apply_substitution(&compiled, &(NEW), 0, (STR), \ (LEN), (REPL), (TIMES)); \ } macro_end #endif /* file wrapper */ lookup-1.08b.orig/lib/termset.c0100644000014400001440000000743506173470601016112 0ustar nakaharastaff/* * Jeffrey Friedl * Omron Corporation オムロン(株) * Nagaokakyoshi, Japan 〒617長岡京下海印寺 * * jfriedl@nff.ncl.omron.co.jp * * This work is placed under the terms of the GNU General Purpose License * (the "GNU Copyleft"). */ /* * This file might be compiled directly, or included from another file. * It might be compiled by an ANSI or traditional compiler. * * If it's an ANSI compile, we'll only compile if we think that the system * include files we use have been properly ANSIfied. Otherwise, we'll compile * everything. */ #include "config.h" /* * If we've been told in config.h that the the system headers haven't been * ANSIfied, we may as well note that we'll not compile now if we're doing * a STDC compile. */ #ifdef __STDC__ # ifdef NON_ANSIFIED_IOCTL # define NO_COMPILE # endif #endif /* * If we are compiling (so far as we know), include the appropriate system * headers for the ioctl we'll be using. 
*/ #ifndef NO_COMPILE # include "system.h" # ifdef _HAVE_SYS_TERMIO_H_ # include # else # include # endif #endif /* * If we're compiling (so far as we know) and it's a STDC compile, * make a couple of checks to see if we can guess about the ANSIfication * of the header files. This is all very non-portable, but the best * I can think of to automate this stuff.... */ #ifndef NO_COMPILE # ifdef __STDC__ # ifdef _IO # ifdef TIOCEXCL # if _IO('x', 13) == TIOCEXCL # define NO_COMPILE # endif # endif /* TIOCEXCL */ # ifdef TCSBRK # if _IO('x', 5) == TCSBRK # define NO_COMPILE # endif # endif /* TCGETA */ # endif /* _IO */ # endif /* __STDC__ */ #endif #ifndef NO_COMPILE # ifdef __STDC__ # define NO_ARGS void # else # define NO_ARGS /*empty*/ # endif # include # define DIE(note) \ { \ write(fileno(stderr), note, sizeof(note)); \ exit(2); \ } #ifdef __GNUC__ # if __GNUC > 1 # ifndef __STDC__ # warning integer overflow OK for this file. # endif # endif #endif # ifndef _HAVE_SYS_TERMIO_H_ static struct sgttyb original, new; void set_tty_state_to_cbreak(NO_ARGS) { if (original.sg_flags == 0) { if (ioctl(fileno(stdin), TIOCGETP, (char*)&original) < 0) DIE("ioctl TIOCGETP error\n"); new = original; new.sg_flags |= CBREAK; /* turn cbreak on */ new.sg_flags &= ~ECHO; /* turn echo off */ } /* don't worry about "integer overflow" warnings on next line */ if (ioctl(fileno(stdin), TIOCSETP, (char*)&new) < 0) DIE("ioctl TIOCSETP error\n"); } void reset_tty_state(NO_ARGS) { /* don't worry about "integer overflow" warnings on next lines */ if (original.sg_flags) ioctl(fileno(stdin), TIOCSETP, (char*)&original); } # else /* _HAVE_SYS_TERMIO_H_ */ static struct termio original, new; void set_tty_state_to_cbreak(NO_ARGS) { if (original.c_cflag == 0) { if (ioctl(fileno(stdin), TCGETA, &original) < 0) DIE("ioctl TCGETA error\n"); new = original; # ifdef ISTRIP new.c_iflag &= ~ISTRIP; # endif new.c_lflag &= ~(ECHO|ICANON); new.c_cc[VMIN] = 1; /* Input should wait for at least 1 char */ 
new.c_cc[VTIME] = 0; /* no matter how long that takes. */ } if (ioctl(fileno(stdin), TCSETA, &new) < 0) DIE("ioctl TCSETA error\n"); } void reset_tty_state(NO_ARGS) { if (original.c_cflag) { #if USE_LOCAL_INPUT ensure_blocking_input(); #endif ioctl(fileno(stdin), TCSETA, &original); } } # endif /* _HAVE_SYS_TERMIO_H_ */ #endif /* NO_COMPILE */ lookup-1.08b.orig/lib/input.h0100644000014400001440000000251006173630630015560 0ustar nakaharastaff#ifndef __INPUT_H__ /* file wrapper */ #define __INPUT_H__ /* * Jeffrey Friedl * Omron Corporation オムロン(株) * Nagaokakyoshi, Japan 〒617長岡京下海印寺 * * jfriedl@nff.ncl.omron.co.jp * * This work is placed under the terms of the GNU General Purpose License * (the "GNU Copyleft"). */ #if USE_LOCAL_INPUT # define INPUT_BUF_SIZE 100 # define STDIN 0 extern int __input_pending__(void); extern unsigned char next_raw_input_byte(void); extern unsigned char next_cooked_input_byte(void); extern int flush_pending_input(void); extern void (*input_inactivity_function)(); extern void ensure_blocking_input(void); # define input_pending() (preread_input_pending || __input_pending__()) #else /* don't USE_LOCAL_INPUT */ static __inline__ input_pending(void) { return 0; } static __inline__ unsigned char next_raw_input_byte(void) { unsigned char c; read(0, &c, 1); return c; } static __inline__ unsigned char next_cooked_input_byte(void) { unsigned char c; set_tty_state_to_cbreak(); c = next_raw_input_byte(); reset_tty_state(); return c; } static __inline__ int flush_pending_input(void) { return 0; } static __inline__ void ensure_blocking_input(void) { } #endif /* USE_LOCAL_INPUT */ #endif /* file wrapper */ lookup-1.08b.orig/lib/virtfile.h0100644000014400001440000000301706173471035016252 0ustar nakaharastaff#ifndef __VIRTFILE_H__ #define __VIRTFILE_H__ /* * Jeffrey Friedl * Omron Corporation オムロン(株) * Nagaokakyoshi, Japan 〒617長岡京下海印寺 * * jfriedl@nff.ncl.omron.co.jp * * This work is placed under the terms of the GNU General Purpose License * (the "GNU 
Copyleft"). * * July 1996 * * Routine to access a file with virtual-memory-like access. */ typedef long int fileloc; /* index to a file position */ /* * Information on a single virtual file. */ typedef struct { int fd; /* open file descriptor */ const char *filename; /* for reference, filename of opened file */ fileloc length; /* length of opened file */ } VirtFile; /* * Pages used for access to all files. */ typedef struct { VirtFile *owner; /* this page represents part of this file */ const unsigned char *text; /* pointer to this page's text */ fileloc start; /* offset into file where this page starts */ fileloc end; /* offset into file where this page ends */ } Page; /* * Open the file and return its virtual handle. */ VirtFile * OpenVertFile(const char *filename); /* * Given a virtual file and an offset into the file, return a pointer * to the string which starts (or encompasses) the starting position. * If pCount is not NULL, it is filled with the length of the line. */ const unsigned char * VirtPos2Str(VirtFile *v, fileloc start, unsigned *pCount); /* convenient utility */ extern long int filesize(const char *filename); #endif /* File Wrapper */ lookup-1.08b.orig/lib/termset_trad.c0100644000014400001440000000007405534064642017120 0ustar nakaharastaff#include "termset.c" /* yup, just two lines in this file */ lookup-1.08b.orig/lib/MemItem.h0100644000014400001440000000362606173632703015772 0ustar nakaharastaff#ifndef __MEMITEM_H__ /* file wrapper */ #define __MEMITEM_H__ /* * Jeffrey Friedl * Omron Corporation オムロン(株) * Nagaokakyoshi, Japan 〒617長岡京下海印寺 * * jfriedl@nff.ncl.omron.co.jp * * This work is placed under the terms of the GNU General Purpose License * (the "GNU Copyleft"). * * Routines for accessing sequential bytes in a file via a short * read-ahead cache.
*/ #include #include "xmalloc.h" #include "output.h" #define DEFAULT_DATALEN 512 typedef long MemLoc; typedef struct { FILE *fp; /* file this item is found in */ unsigned char *data; /* a pre-loaded block of bytes */ MemLoc start; /* index into the file representing where DATA starts */ MemLoc end; /* index for where DATA ends */ MemLoc loc; /* index (into file!) of ``current file pointer'' */ unsigned datalen; /* size of memory allocated for DATA -- will usually be (end-start) excapt near the end of the file */ } MemItem; /* * Given a memitem and a starting location, pre-read a block. */ static __inline__ void FillMemPage(MemItem *m, MemLoc start) { if (!m->data) { if (!m->datalen) m->datalen = DEFAULT_DATALEN; m->data = xmalloc(m->datalen); } if (fseek(m->fp, m->start = start, SEEK_SET) != 0) die("Bad fseek to %ld (fp=%x) at %s line %d: %n\n", (unsigned)m->start, m->fp, __FILE__, __LINE__); m->end = start + fread(m->data, 1, m->datalen, m->fp); } /* * Get the next byte from the memory-item, reading from disk if need be. */ static __inline__ unsigned char GetMemByte(MemItem *m) { if (!m->data || m->loc < m->start || m->loc >= m->end) FillMemPage(m, m->loc); return m->data[m->loc++ - m->start]; } /* * (re)set the memory item to point to the given file and location. */ static __inline__ void SetMem(MemItem *m, FILE *fp, MemLoc loc) { m->fp = fp; FillMemPage(m, m->loc = loc); } #endif /*__MEMITEM_H__ */ lookup-1.08b.orig/lib/virtfile.c0100644000014400001440000001325706173471013016250 0ustar nakaharastaff/* * Jeffrey Friedl * Omron Corporation オムロン(株) * Nagaokakyoshi, Japan 〒617長岡京下海印寺 * * jfriedl@nff.ncl.omron.co.jp * * This work is placed under the terms of the GNU General Purpose License * (the "GNU Copyleft"). * * July 1996 * * Routine to access a file with virtualmemory-like access. 
*/ #include "config.h" #include #include #include #include #include "output.h" #include "xmalloc.h" #include "strsave.h" #include "virtfile.h" #define PAGE_VALID(p) ((p)->text) /* * Given a filename, return the length of the file. */ long int filesize(const char *filename) { struct stat statbuf; if (stat(filename, &statbuf) < 0) return -1L; else return statbuf.st_size; } /* * PAGE_SIZE -- size of pages loaded. The page size limits the * length of the longest line. */ static unsigned PAGE_SIZE = 0x2000; /* * Number of pages to keep in memory at once, for all files. */ static unsigned PAGE_COUNT = 40; /* * pointer to list of pages.... */ static Page *common = 0; static unsigned default_blast = 0; /* * Init routine -- fills 'common'. Can change PAGE_COUNT beforehand.... */ static void init_common(void) { int i; if (common) return; common = xmalloc(sizeof(Page) * PAGE_COUNT); bzero(common, sizeof(Page) * PAGE_COUNT); } VirtFile * OpenVertFile(const char *filename) { VirtFile *v; int fd = open(filename, 0); if (fd < 0) return NULL; if (!common) init_common(); v = xmalloc(sizeof *v); v->fd = fd; v->filename = strsave(filename); v->length = filesize(filename); return v; } static Page * LoadPage(VirtFile *v, fileloc start) { int i; Page *empty = 0, *p = 0; /* See if a page holding the location is already available */ for (i = 0; i < PAGE_COUNT; i++) { if (!empty && !PAGE_VALID(&common[i])) empty = &common[i]; if (common[i].owner == v && common[i].start < start) { p = &common[i]; break; } } if (!p) { /* if not, either pick an empty one, or blast a currently-used one */ if (empty) p = empty; else { p = &common[default_blast]; default_blast = (default_blast + 1) % PAGE_COUNT; } } /* * If page has no text yet, grab some memory */ if (!PAGE_VALID(p)) p->text = xmalloc(PAGE_SIZE); /* go to appropriate place in file and read data */ if (lseek(v->fd, start, SEEK_SET) < 0) die("bad lseek to %ld of %s: %n\n", (long)start); if (i = read(v->fd, p->text, PAGE_SIZE), i < 0) die("bad 
read of %ld bytes starting at %ld of %s: %n\n", (long)PAGE_SIZE, (long)start, v->filename); /* fill in other page stuff */ p->start = start; p->end = start + i; p->owner = v; return p; } /* * Given a file an a location, return a page that holds the location * somewhere. */ static Page * EnsurePositionInMemory(VirtFile *v, fileloc pos) { Page *p = 0; int i; /* see if page is already there */ for (i = 0; i < PAGE_COUNT; i++) { /* must be valid and owned by me */ if (!PAGE_VALID(&common[i]) || (common[i].owner != v)) continue; /* must have a position in range [start .. end) */ if (pos < common[i].start || pos > common[i].end) continue; /* if we don't have a pointer yet, or if this page has the desired location earlier in the page than the one we already found, use this new page instead */ if (!p || (pos - p->start) > (pos - common[i].start)) p = &common[i]; } /* if it's not already here, get a new page */ if (p) return p; else return LoadPage(v, pos); } /* * Given a page and an index into the file (which the given page MUST * represent), return the length of the line. */ struct ScanInfo { const unsigned char *str_start; /* pointer in memory where line starts */ unsigned short length; /* length of line */ char no_end; /* true if line extends after page */ }; static __inline__ struct ScanInfo * ScanStrInPage(Page *p, fileloc start) { static struct ScanInfo result; /* PTR is string we'll scan, END is end of the page */ const unsigned char *ptr = &p->text[start - p->start]; const unsigned char *end = &p->text[(int)(p->end - p->start)]; result.str_start = ptr; /* scan string looking for newline */ while ((ptr < end) && (*ptr != '\n')) ptr++; result.length = ptr - result.str_start; /* * Past end of page, and... page is full size (i.e. not last page of file) */ result.no_end = (ptr >= end && (p->end - p->start >= PAGE_SIZE)); return &result; } /* * Given a file and a location, return a pointer to memory holding the * string starting at (or spanning) the given location. 
* Fill *pCount with the length of the string if not NULL. */ const unsigned char * VirtPos2Str(VirtFile *v, fileloc start, unsigned *pCount) { struct ScanInfo *info; /* some small sanity check */ if (start < 0 || start > v->length) die("oops: %ld vs. length=%ld.\n", start, v->length); /* Get the page with the string, and get the string */ info = ScanStrInPage(EnsurePositionInMemory(v, start), start); /* * If we hit the end of the page (and the page is a full page, * and not the last one in the file where a newline might not end * things), we need to re-load the page such that the target line * is at the beginning of the page. This will ensure the whole line * is in memory. Of course, if the line is longer than PAGE_LENGTH * then life is tough and we return a chopped line. */ if (info->no_end) info = ScanStrInPage(LoadPage(v, start), start); if (pCount) *pCount = info->length; return info->str_start; } lookup-1.08b.orig/lib/fuzzkana.c0100644000014400001440000002135306076503473016261 0ustar nakaharastaff/* * Jeffrey Friedl * Omron Corporation オムロン(株) * Nagaokakyoshi, Japan 〒617長岡京下海印寺 * * jfriedl@nff.ncl.omron.co.jp * * This work is placed under the terms of the GNU General Purpose License * (the "GNU Copyleft"). * * Oct 1993 * * Feb 1995: Added をゑ fuzziness. * See "fuzzhkana.h" for overall comments. */ #include "config.h" #include "assert.h" #include "output.h" #include "fuzzkana.h" #include "kanaid.h" /* * Given * * IN -- some regex pattern string whose kana is to be fuzzified. * OUT -- where to stick the new pattern string * OUT_SIZE -- size of area pointed to by OUT. * * FLAGS -- * If FUZZ_LONG_VOWELS is set, the pattern will be written such * that longness of vowels doesn't matter. * * If FUZZ_SMALL_TSU is set, the pattern will be written such * that small TSUs won't matter. 
* * The number of bytes written to the output is returned, or zero if the * output buffer was overflowed (or, I suppose, if the input is empty) * * If OUT is zero, nothing is written (obviously)... the number of bytes * that would be required is returned (OUT_SIZE is ignored in this case) * * An example with both flags set * in 「ときょ」 * out 「と[ぅうおぉー]*っ?きょ[ぅうおぉー]*」 * */ #define LEAST_KANJI_HI_BYTE 0260 #define GREATEST_KANJI_HI_BYTE 0364 #define IS_EUC(HighByte) ((HighByte) & 0x80) unsigned fuzzkana(const unsigned char *in, unsigned char *out, unsigned out_size, unsigned flags) { const unsigned char *orig_out = out; const unsigned char *out_end = out + out_size; unsigned char hi, lo; int just_want_size = (out == 0); int do_voiced = (flags & FUZZ_VOICED); int do_vowels = (flags & FUZZ_LONG_VOWELS); int do_tsu = (flags & FUZZ_SMALL_TSU); #ifdef FUZZ_REPEATER int do_repeat = (flags & FUZZ_REPEATER); #endif if (in == 0 || *in == 0) return 0; /* * Shove the given character to the 'out' buffer, * aborting the function if the buffer is overflowed. * However, if just requesting how much buffer is required * (buy supplying no out buffer), just bump up the counter. */ #define SENDOUT(c) \ macro_start { \ unsigned char value = (c); \ if (just_want_size) \ out++; \ else \ { \ if (out >= out_end) \ return 0; \ *out++ = value; \ } \ } macro_end /* to tack on the given string to the output */ #define add(STR) \ macro_start { \ const unsigned char *str = (const unsigned char *)(STR); \ while (*str) \ SENDOUT(*str++); \ } macro_end /* run through the line */ while (hi = *in++, hi != 0) { unsigned next_id; unsigned vsound; unsigned id; /* Just pass through ASCII characters */ if (!IS_EUC(hi)) { SENDOUT(hi); continue; } lo = *in++; /* get the next byte of the EUC char. 
*/ /* if this char is not kana, so just pass through and continue */ if (!IS_KANA(hi,lo)) { SENDOUT(hi); SENDOUT(lo); #ifdef FUZZ_REPEATER_no_no_no /* now done in jregex.c */ #define REPEATER_HI_BYTE 0241 /* high byte of 々 */ #define REPEATER_LO_BYTE 0271 /* low byte of 々 */ if (do_repeat && hi >= LEAST_KANJI_HI_BYTE && hi <= GREATEST_KANJI_HI_BYTE && ((in[0] == hi && in[1] == lo) || (in[0] == REPEATER_HI_BYTE && in[1] == REPEATER_LO_BYTE))) { SENDOUT('['); SENDOUT(REPEATER_HI_BYTE); SENDOUT(REPEATER_LO_BYTE); SENDOUT(hi); SENDOUT(lo); SENDOUT(']'); in += 2; } #endif /* FUZZ_REPEATER */ continue; } id = KANA_ID(hi,lo); /* the the id flags for this kana */ /* if we're doing voiced fuzz, fuzz dual-chracter'd sounds */ if (do_voiced && (id & KID_DUAL)) { switch(id) { default: die("oops, %02x %02x -> id is %x\n", hi, lo, id); break; case KID_E | KID_VOWEL | KID_DUAL: /* エ え */ case KID_ARCHAIC | KID_DUAL: /* ヱ ゑ */ add(IS_HIRAGANA(hi,lo) ? "[えゑ]" : "[エヱ]"); break; case KID_O | KID_VOWEL | KID_DUAL: /* オ お */ case KID_o | KID_DUAL: /* ヲ を */ add(IS_HIRAGANA(hi,lo) ? "[おを]" : "[オヲ]"); break; case KID_I | KID_Z | KID_DUAL: /* ジ じ */ case KID_I | KID_D | KID_DUAL: /* ヂ ぢ */ add(IS_HIRAGANA(hi,lo) ? "[じぢ]" : "[ジヂ]"); break; case KID_U | KID_D | KID_DUAL: /* ヅ づ */ case KID_U | KID_Z | KID_DUAL: /* ズ ず */ add(IS_HIRAGANA(hi,lo) ? "[づず]" : "[ヅズ]"); break; } } else { /* otherwise, just pass through */ SENDOUT(hi); SENDOUT(lo); } vsound = id & KID_VSOUND; #define GET_NEXT_ID(ptr) \ macro_start { \ if (!IS_EUC((ptr)[0])) \ next_id = 0; \ else if (IS_DASH((ptr)[0], (ptr)[1])) \ next_id = vsound | KID_VOWEL; \ else if (!IS_KANA((ptr)[0], (ptr)[1])) \ next_id = 0; \ else \ next_id = KANA_ID((ptr)[0], (ptr)[1]); \ } macro_end GET_NEXT_ID(in); #if 0 /* consider the O and U sound to be the same */ if (vsound & (KID_O|KID_U)) vsound = KID_O|KID_U; #endif if (do_vowels) { /* * If current character has a vowel sound and is not followed * by a small y-consonant sound, allow to be doubled. 
*/ if (vsound && (next_id & (KID_SMALL|KID_Y)) != (KID_SMALL|KID_Y)) { /* * Unless they have case folding off, it won't matter which * of these two we use, but since it would matter if they * had it off, we'll separate them.... */ if (IS_HIRAGANA(hi,lo)) switch (vsound) { default: assert(0); break; case KID_A: add("[ぁあー]*"); break; case KID_I: add("[ぃいー]*"); break; case KID_U: add("[ぅうー]*"); break; case KID_O: add("[ぅうおぉをー]*"); vsound |= KID_U; break; case KID_E: add("[ぇえゑー]*"); break; } else switch (vsound) { default: assert(0); break; case KID_A: add("[ァアー]*"); break; case KID_I: add("[ィイー]*"); break; case KID_U: add("[ゥウー]*"); break; case KID_O: add("[ゥウオォヲー]*"); vsound |= KID_U; break; case KID_E: add("[ェエヱー]*"); break; } /* * If the next char is the simple vowel we've just * allowed to be doubled, skip it. * * If the input string is something like「ううううう」 * This IF will cause each「うう」pair to combine into * a「う[ぅうおぉー]*」pattern. However, if you make the IF * a WHILE, it would collapse them all down into one. * Either method has their benefits. */ while ((next_id & KID_VOWEL) && (next_id & vsound)) { in += 2; /* skip the vowel we've just replaced */ GET_NEXT_ID(in); } } } if (do_tsu) { unsigned next_is_small_tsu = (next_id & (KID_T|KID_U|KID_SMALL)) == (KID_T|KID_U|KID_SMALL); /* * if the next thing is a hard-coded small-TSU, or all of * + currently have a vowel sound, and * + the next character has an "appropriate" consonant sound, * + and the next character isn't small. * then we'll add a possible small tsu. 
*/ #ifndef SMALL_TSU_OK #define SMALL_TSU_OK (KID_K|KID_S|KID_T|KID_D|KID_P|KID_W|KID_M) #endif if (next_is_small_tsu || (vsound && (next_id & SMALL_TSU_OK) && !(next_id & KID_SMALL))) { /* add a possability for a small TSU */ add("っ?"); if (next_is_small_tsu) in += 2; /* skip small tsu that's there */ } } } SENDOUT(0); /* tack on a null */ return out - orig_out; } lookup-1.08b.orig/doc/0040755000014400001440000000000006100574546014257 5ustar nakaharastafflookup-1.08b.orig/doc/man.tmp0100644000014400001440000027772006277306024015567 0ustar nakaharastaff LOOKUP(1) LOOKUP(1) April 22nd, 1994 NNAAMMEE lookup - interactive file search and display SSYYNNOOPPSSIISS llooookkuupp [ args ] [ _f_i_l_e _._._. ] DDEESSCCRRIIPPTTIIOONN _L_o_o_k_u_p allows the quick interactive search of text files. It supports ASCII, JIS-ROMAN, and Japanese EUC Packed formated text, and has an integrated romaji/c_akana converter. TTHHIISS MMAANNUUAALL _L_o_o_k_u_p is flexible for a variety of applications. This manual will, however, focus on the application of searching Jim Breen's _e_d_i_c_t (Japanese-English dictionary) and _k_a_n_j_i_d_i_c (kanji database). Being familiar with the content and format of these files would be helpful. See the INFO section near the end of this manual for information on how to obtain these files and their documentation. OOVVEERRVVIIEEWW OOFF MMAAJJOORR FFEEAATTUURREESS The following just mentions some major features to whet your appetite to actually read the whole manual (-: Romaji-to-Kana Converter _L_o_o_k_u_p can convert romaji to kana for you, even,i`Eon the fly,i'Eas you type. Fuzzy Searching Searches can be a bit,i`Evague,i'Eor,i`Efuzzy,i'E, so that you'll be able to find,i`EoA`i,upb,i'Eeven if you try to search for,i`Eox`Eox-ox,c,i'E(the proper yomikata being,i`Eox`Eox|ox-ox,cox|,i'E). Regular Expressions Uses the powerful and expressive _r_e_g_u_l_a_r _e_x_p_r_e_s_s_i_o_n for searching. 
One can easily specify complex searches that affect “I want lines that look like such-and-such, but not like this-and-that, but that also have this particular characteristic....” Wildcard ``Glob'' Patterns Optionally, can use well-known filename wildcard patterns instead of full-fledged regular expressions. Filters You can have lookup not list certain lines that would otherwise match your search, yet can optionally save them for quick review. For example, you could have all name-only entries from edict filtered from normal output. 1 LOOKUP(1) LOOKUP(1) Automatic Modifications Similarly, you can do a standard search-and-replace on lines just before they print, perhaps to remove information you don't care to see on most searches. For example, if you're generally not interested in kanjidic's info on Chinese readings, you can have them removed from lines before printing. Smart Word-Preference Mode You can have lookup list only entries with whole words that match your search (as opposed to an embedded match, such as finding “the” inside “them”), but if no whole-word matches exist, will go ahead and list any entry that matches the search. Handy Features Other handy features include a dynamically settable and parameterized prompt, automatic highlighting of that part of the line that matches your search, an output pager, readline-like input with horizontal scrolling for long input lines, a “.lookup” startup file, automated programmability, and much more. Read on! REGULAR EXPRESSIONS Lookup makes liberal use of regular expressions (or regex for short) in controlling various aspects of the searches. If you are not familiar with the important concepts of regexes, read the tutorial appendix of this manual before continuing.
JJAAPPAANNEESSEE CCHHAARRAACCTTEERR EENNCCOODDIINNGG MMEETTHHOODDSS Internally, _l_o_o_k_u_p works with Japanese packed-format EUC, and all files loaded must be encoded similarly. If you have files encoded in JIS or Shift-JIS, you must first convert them to EUC before loading (see the INFO section for programs that can do this). Interactive input and output encoding, however, may be selected via the -jis, -sjis, and -euc invocation flags (default is -euc), or by various commands to the program (described later). Make sure to use the encoding appropriate for your system. If you're using kterm under the X Window System, you can use _l_o_o_k_u_p's -jis flag to match kterm's default JIS encoding. Or, you might use kterm's,i`E-km euc,i'Estartup option (or menu selec- tion) to put kterm into EUC mode. Also, I have found kterm's scrollbar (,i`E-sb -sl 500,i'E) to be quite useful. With many,i`EEnglish,i'Efonts in Japan, the character that nor- mally prints as a backslash (half-width version of ,i`A) in The States appears as a yen symbol (the half-width version of ,i"i). How it will appear on your system is a function of what font you use and what output encoding method you choose, which may be different from the font and method that was used to print 2 LOOKUP(1) LOOKUP(1) this manual (both of which may be different from what's printed on your keyboard's appropriate key). Make sure to keep this in mind while reading. SSTTAARRTTUUPP Let's assume that your copy of _e_d_i_c_t is in ~/lib/edict. You can start the program simply with lookup ~/lib/edict You'll note that _l_o_o_k_u_p spends some time building an index before the default,i`Elookup> ,i'Eprompt appears. _L_o_o_k_u_p gains much of its search speed by constructing an index of the file(s) to be searched. Since building the index can be time consuming itself, you can have _l_o_o_k_u_p write the built index to a file that can be quickly loaded the next time you run the program.
Index files will be given a,i`E.jin,i'E(Jef- frey's Index) ending. Let's build the indices for _e_d_i_c_t and _k_a_n_j_i_d_i_c now: lookup -write ~/lib/edict ~/lib/kanjidic This will create the index files ~/lib/edict.jin ~/lib/kanjidic.jin and exit. You can now re-start _l_o_o_k_u_p _, automatically using the pre-com- puted index files as: lookup ~/lib/edict ~/lib/kanjidic You should then be presented with the prompt without having to wait for the index to be constructed (but see the section on Operating System concerns for possible reasons of delay). IINNPPUUTT There are basically two types of input: searches and commands. Commands do such things as tell _l_o_o_k_u_p to load more files or set flags. Searches report lines of a file that match some search specifier (where lines to search for are specified by one or more regular expressions). The input syntax may perhaps at first seem odd, but has been designed to be powerful and concise. A bit of time invested to learn it well will pay off greatly when you need it. BBRRIIEEFF EEXXAAMMPPLLEE Assuming you've started _l_o_o_k_u_p with _e_d_i_c_t and _k_a_n_j_i_d_i_c as noted above, let's try a few searches. In these examples, the ,i`Esearch [edict]> ,i'E 3 LOOKUP(1) LOOKUP(1) is the prompt. Note that the space after the,iAE>,i,Cis part of the prompt. Given the input: search [edict]> tranquil _l_o_o_k_u_p will report all lines with the string,i`Etranquil,i'Ein them. There are currently about a dozen such lines, two of which look like: o^Aox'eox<< [ox"aox1ox'eox<<] /peaceful (an)/tranquil/calm/restful/ o^Aox'eox(R) [ox"aox1ox'eox(R)] /peace/tranquility/ Notice that lines with,i`Etranquil,i'E_a_n_d,i`Etranquility,i'Ematched? This is because,i`Etranquil,i'Ewas embedded in the word,i`Etranquil- ity,i'E. 
You could restrict the search to only the _w_o_r_d,i`Etran- quil,i'Eby prepending the special,i`Estart of word,i'Esym- bol,iAE<,i,Cand appending the special,i`Eend of word,i'Esym- bol,iAE>,i,Cto the regex, as in: search [edict]> <tranquil> This is the regular expression that says,i`Ethe beginning of a word, followed by a,iAEt,i,C,,iAEr,i,C, ...,,iAEl,i,C, which is at the end of a word.,i'EThe current version of _e_d_i_c_t has just three matching entries. Let's try another: search [edict]> fukushima This is a search for the,i`EEnglish,i'Efukushima -- ways to search for kana or kanji will be explored later. Note that among the several lines selected and printed are: _ 'E^uoA,c [ox~Oox ox.oxIb] /Fukus_hima (pn,pl)/ `I'U'A3/4^E,ioA,c [ox-ox1/2ox~Oox ox.oxIb] /Kisofukushima (pl)/ By default, searches are done in a case-insensitive manner --,iAEF,i,Cand,iAEf,i,Care treated the same by _l_o_o_k_u_p, at least so far as the matching goes. This is called _c_a_s_e _f_o_l_d_i_n_g. Let's give a command to turn this option off, so that,iAEf,i,Cand,iAEF,i,Cwon't be considered the same. Here's an odd point about _l_o_o_k_u_p_'_s input syntax: the default setting is that all command lines must begin with a space. The space is the (default) command-introduction character and tells the input parser to expect a command rather than a search regular expression. _I_t _i_s _a _c_o_m_m_o_n _m_i_s_t_a_k_e _a_t _f_i_r_s_t _t_o _f_o_r_g_e_t _t_h_e _l_e_a_d_i_n_g _s_p_a_c_e _w_h_e_n issuing a command. Be careful. Try the command,i`E fold,i'Eto report the current status of case- folding. Notice that as soon as you type the space, the 4 LOOKUP(1) LOOKUP(1) prompt changes to ,i`Elookup command> ,i'E as a reminder that now you're typing a command rather than a search specification. lookup command> fold The reply should be,i`Efile #0's case folding is on,i'E You can actually turn it off with,i`E fold off,i'E. Now try the search for,i`Efukushima,i'Eagain.
Notice that this time the entries with,i`EFukushima,i'Earen't listed? Now try the search string,i`EFukushima,i'Eand see that the entries with,i`Efukushima,i'Earen't listed. Case folding is usually very convenient (it also makes corre- sponding katakana and hiragana match the same), so don't for- get to turn it back on: lookup command> fold on JJAAPPAANNEESSEE IINNPPUUTT _L_o_o_k_u_p has an automatic romaji/c_akana converter. A lead- ing,iAE/,i,Cindicates that romaji is to follow. Try typ- ing,i`E/tokyo,i'Eand you'll see it convert to,i`E/ox`Eox-ox,c,i'Eas you type. When you hit return, _l_o_o_k_u_p will list all lines that have a,i`Eox`Eox-ox,c,i'Esomewhere in them. Well, sort of. Look care- fully at the lines which match. Among them (if you had case folding back on) you'll see: =Y-=Y^e=Y1=Y`E9|,u [=Y-=Y^e=Y1=Y`Eox-ox,cox|] /Christianity/ oA`i,upb [ox`Eox|ox-ox,cox|] /Toukyou (pl)/Tokyo/current capital of Japan/ AE`I9|`A [ox`Eox~Aox-ox,cox|] /convex lens/ The first one has,i`Eox`Eox-ox,c,i'Ein it (as,i`E=Y`Eox-ox,c,i'E, where the katakana,i`E=Y`E,i'Ematches in a case-insensitive manner to the hiragana,i`Eox`E,i'E), but you might consider the others unexpected, since they don't have,i`Eox`Eox-ox,c,i'Ein them. They're close (,i`Eox`Eox|ox-ox,c,i'Eand,i`Eox`Eox~Aox-ox,c,i'E), but not exact. This is the result of _l_o_o_k_u_p's,i`Efuzzification,i'E. Try the command,i`E fuzz,i'E(again, don't forget the command-introduction space). You'll see that fuzzification is turned on. Turn it off with,i`E fuzz off,i'Eand try,i`E/tokyo,i'E(which will convert as you type) again. This time you only get the lines which have,i`Eox`Eox-ox,c,i'Eexactly (well, case folding is still on, so it might match katakana as well). In a fuzzy search, length of vowels is ignored --,i`Eox`E,i'Eis con- sidered the same as,i`Eox`Eox|,i'E, for example. 
Also, the presence or absence of any,i`Eox~A,i'Echaracter is ignored, and the pairs ox, ox^A, ox_o oxoA, ox" ox~n, and ox_a ox`o are considered identical in a fuzzy search. 5 LOOKUP(1) LOOKUP(1) It might be convenient to consider a fuzzy search to be a,i`Epronunciation search,i'E. Special note: fuzzification will not be performed if a regular expres- sion,i`E*,i'E,,i`E+,i'E,or,i`E?,i'Emodifies a non-ASCII character. This is not an issue when input patterns are filename-like wildcard patterns (discussed below). In addition to kana fuzziness, there's one special case for kanji when fuzziness is on. The kanji repeater mark,i`E,i1,i'Ewill be recognized such that,i`E>>pb,i1,i'Eand,i`E>>pb>>pb,i'Ewill match each other. Turn fuzzification back on (,i`Efuzz on,i'E), and search for all _w_h_o_l_e _w_o_r_d_s which sound like,i`Etokyo,i'E. That search would be specified as: search [edict]> /<tokyo> (again, the,i`Etokyo,i'Ewill be converted to,i`Eox`Eox-ox,c,i'Eas you type). My copy of _e_d_i_c_t has the three lines oA`i,upb [ox`Eox|ox-ox,cox|] /Toukyou (pl)/Tokyo/current capital of Japan/ AE~A,u"o [ox`Eox~Aox-ox,c] /special permission/patent/ AE`I9|`A [ox`Eox~Aox-ox,cox|] /convex lens/ This kind of whole-word romaji-to-kana search is so common, there's a special short cut. Instead of typing,i`E/<tokyo>,i'E, you can type,i`E[tokyo],i'E. The leading,iAE[,i,Cmeans,i`Estart romaji,i'E_a_n_d,i`Estart of word,i'E. Were you to type,i`E<tokyo>,i'Einstead (without a leading,iAE/,i,Cor,iAE[,i,Cto indicate romaji-to-kana conversion), you would get all lines with the _E_n_g_l_i_s_h whole-word,i`Etokyo,i'Ein them. That would be a reasonable request as well, but not what we want at the moment. Besides the kana conversion, you can use any cut-and-paste that your windowing system might provide to get Japanese text onto the search line. Cut,i`Eox`Eox-ox,c,i'Efrom somewhere and paste onto the search line.
When hitting enter to run the search, you'll notice that it is done without fuzzification (even if the fuzzification flag was,i`Eon,i'E). That's because there's no leading,iAE/,i,C. Not only does a leading,iAE/,i,Cindicate that you want the romaji-to-kana conversion, but that you want it done fuzzily. So, if you'd like fuzzy cut-and-paste, just type a lead- ing,iAE/,i,Cbefore pasting (or go back and prepend one after past- ing). These examples have all been pretty simple, but you can use all the power that regexes have to offer. As a slightly more complex example, the search,i`E<gr[ea]y>,i'Ewould look for all lines with the words,i`Egrey,i'Eor,i`Egray,i'Ein them. Since 6 LOOKUP(1) LOOKUP(1) the,iAE[,i,Cisn't the first character of the line, it doesn't mean what was mentioned above (start-of-word romaji). In this case, it's just the regular-expression,i`Eclass,i'Eindicator. If you feel more comfortable using filename-like,i`E*.txt,i'Ewild- card patterns, you can use the,i`Ewildcard on,i'Ecommand to have patterns be considered this way. This has been a quick introduction to the basics of _l_o_o_k_u_p. It can be very powerful and much more complex. Below is a detailed description of its various parts and features. RREEAADDLLIINNEE IINNPPUUTT The actual keystrokes are read by a readline-ish package that is pretty standard. In addition to just typing away, the fol- lowing keystrokes are available: ^B / ^F move left/right one character on the line ^A / ^E move to the start/end of the line ^H / ^G delete one character to the left/right of the cursor ^U / ^K delete all characters to the left/right of the cursor ^P / ^N previous/next lines on the history list ^L or ^R redraw the line ^D delete char under the cursor, or EOF if line is empty ^space force romaji conversion (^@ on some systems) If automatic romaji-to-kana conversion is turned on (as it is by default), there are certain situations where the conversion will be done, as we saw above.
Lower-case romaji will be con- verted to hiragana, while upper-case romaji to katakana. This usually won't matter, though, as case folding will treat hira- gana and katakana the same in the searches. In exactly what situations the automatic conversion will be done is intended to be rather intuitive once the basic idea is learned. However, at _a_n_y _t_i_m_e, one can use control-space to convert the ASCII to the left of the cursor to kana. This can be particularly useful when needing to enter kana on a command line (where auto conversion is never done; see below) RROOMMAAJJII FFLLAAVVOORR Most flavors of romaji are recognized. Special or non-obvious items are mentioned below. Lowercase are converted to hira- gana, uppercase to katakana. Long vowels can be entered by repeating the vowel, or with,iAE-,i,Cor,iAE^,i,C. In situations where an,i`En,i'Ecould be vague, as in,i`Ena,i'Ebeing ox^E or ox'oox/c, use a single quote to force ox'o. There- fore,,i"Okenichi,ix/c_aox+-ox"Eox'A while,i"Oken'ichi,ix/c_aox+-ox'ooxoxox'A. 7 LOOKUP(1) LOOKUP(1) The romaji has been richly extended with many non-standard combinations such as ox~Oox,i or ox'AoxS, which are represented in intuitive ways:,i"Ofa,ix/c_aox~Oox,i,,i"Oche,ix/c_aox'AoxS. etc. Various other mappings of interest: wo /c_aox`o we/c_aox~n wi/c_aox`'o VA /c_a=Y^o=Y,i VI/c_a=Y^o=Y-L VU/c_a=Y^o VE/c_a=Y^o=YS VO/c_a=Y^o=Y(C) di /c_aox^A dzi/c_aox^A dya/c_aox^Aox~a dyu/c_aox^Aoxoa dyo/c_aox^Aox,c du /c_aoxoA tzu/c_aoxoA dzu/c_aoxoA (the following kana are all smaller versions of the regular kana) xa /c_aox,i xi/c_aox-L xu/c_aox=Y xe/c_aoxS xo/c_aox(C) xu /c_aox=Y xtu/c_aox~A xwa/c_aox^i xka/c_a=Y~o xke/c_a=Y"o xya/c_aox~a xyu/c_aoxoa xyo/c_aox,c IINNPPUUTT SSYYNNTTAAXX Any input line beginning with a space (or whichever character is set as the command-introduction character) is processed as a command to _l_o_o_k_u_p rather than a search spec. 
_A_u_t_o_m_a_t_i_c kana conversion is never done on these lines (but _f_o_r_c_e_d conversion with control-space may be done at any time). Other lines are taken as search regular expressions, with the following special cases: ? A line consisting of a single question mark will report the current command-introduction character (the default is a space, but can be changed with the,i`Ecmdchar,i'Ecommand). = If a line begins with,iAE=,i,C, the line (without the,iAE=,i,C) is taken as a search regular expression, and no automatic (or internal -- see below) kana conversion is done anywhere on the line (although again, conversion can always be forced with control-space). This can be used to initiate a search where the beginning of the regex is the command- introduction character, or in certain situations where automatic kana conversion is temporarily not desired. / A line beginning with,iAE/,i,Cindicates romaji input for the whole line. If automatic kana conversion is turned on, the conversion will be done in real-time, as the romaji is typed. Otherwise it will be done internally once the line is entered. _R_e_g_a_r_d_l_e_s_s, the presence of the lead- ing,iAE/,i,Cindicates that any kana (either converted or cut- and-pasted in) should be,i`Efuzzified,i'Eif fuzzification is turned on. As an addition to the above, if the line doesn't begin with,iAE=,i,Cor the command-introduction character (and auto- matic conversion is turned on),,iAE/,i,C _a_n_y_w_h_e_r_e on the line initiates automatic conversion for the following word. 8 LOOKUP(1) LOOKUP(1) [ A line beginning with,iAE[,i,Cis taken to be romaji (just as a line beginning with,iAE/,i,C, and the converted romaji is sub- ject to fuzzification (if turned on). However, if,iAE[,i,Cis used rather than,iAE/,i,C, an implied,iAE<,i,C,i`Ebeginning of word,i'Eis prepended to the resulting kana regex. Also, any ending,iAE],i,Con such a line is converted to the,i`Eending of word,i'Especifier,iAE>,i,Cin the resulting regex. 
In addition to the above, lines may have certain prefixes and suffixes to control aspects of the search or command: ! Various flags can be toggled for the duration of a particu- lar search by prepending a,i`E!!,i'Esequence to the input line. Sequences are shown below, along with commands related to each: !F! ,i"A Filtration is toggled for this line (filter) !M! ,i"A Modification is toggled for this line (modify) !w! ,i"A Word-preference mode is toggled for this line (word) !c! ,i"A Case folding is toggled for this line (fold) !f! ,i"A Fuzzification is toggled for this line (fuzz) !W! ,i"A Wildcard-pattern mode is toggled for this line (wildcard) !r! ,i"A Raw. Force fuzzification off for this line !h! ,i"A Highlighting is toggled for this line (highlight) !t! ,i"A Tagging is toggled for this line (tag) !d! ,i"A Displaying is on for this line (display) The letters can be combined, as in,i`E!cf!,i'E. The final,iAE!,i,C can be omitted if the first character after the sequence is not an ASCII letter. If no letters are given (,i`E!!,i'E).,i`E!f!,i'Eis the default. These last two points can be conveniently combined in the common case of,i`E!/romaji,i'Ewhich would be the same as,i`E!f!/romaji,i'E. The special sequence,i`E!?,i'Elists the above, as well as indi- cates which are currently turned on. Note that the letters accepted in a,i`E!!,i'Esequence are many of the indicators shown by the,i`Efiles,i'Ecommand. + A,iAE+,i,Cprepended to anything above will cause the final search regex to be printed. This can be useful to see when and what kind of fuzzification and/or internal kana conver- sion is happening. Consider: search [edict]> +/ox"iox< +ox"iox< +!/ox"iox< [ox`Eox-ox,cox`E] oA`i,upboA^O [ox`Eox|ox-ox,cox|ox`E] /Tokyo Metropolitan area/ cutting and pasting the oA^O from above, and adding a,i`E,1,i'Eto search _k_a_n_j_i_d_i_c: search [edict]> oA^O,1 oA^O 4554 N4769 S11 ..... 
=Y`E =Y"A oxBox"aox3 {metropolis} {capital} FFIILLEENNAAMMEE--LLIIKKEE WWIILLDDCCAARRDD MMAATTCCHHIINNGG When wildcard-pattern mode is selected, patterns are consid- ered as extended "*.txt"-like patterns. This is often more convenient for users not familiar with regular expres- sions. To have this mode selected by default, put default wildcard on into your,i`E.lookup,i'Efile (see,i`ESTARTUP FILE,i'Ebelow). When wildcard mode is on, only ,i`E*,i'E,,i`E?,i'E,,i`E+,i'E,and,i`E.,i'E,are affected. See the entry for the ,i`Ewildcard,i'Ecommand below for details. 10 LOOKUP(1) LOOKUP(1) Other features, such as the multiple-pattern searches (described below) and other regular-expression metacharacters are available. MMUULLTTIIPPLLEE--PPAATTTTEERRNN SSEEAARRCCHHEESS You can put multiple patterns in a single search specifier. For example consider search [edict]> china||japan The first part (,i`Echina,i'E) will select all lines that have,i`Echina,i'Ein them. Then, _f_r_o_m _a_m_o_n_g _t_h_o_s_e _l_i_n_e_s, the second part will select lines that have,i`Ejapan,i'Ein them. The,i`E||,i'Eis not part of any pattern -- it is _l_o_o_k_u_p's,i`Epipe,i'Emechanism. The above example is very different from the single pattern ,i`Echina|japan,i'Ewhich would select any line that had either,i`Echina,i'E_o_r,i`Ejapan,i'E. With,i`Echina||japan,i'E, you get lines that have,i`Echina,i'E_a_n_d _t_h_e_n _a_l_s_o have,i`Ejapan,i'Eas well. Note that it is also different from the regular expres- sion,i`Echina.*japan,i'E(or the wildcard pat- tern,i`Echina*japan,i'E)which would select lines having,i`Echina, then maybe some stuff, then japan,i'E. But consider the case when,i`Ejapan,i'Ecomes on the line before,i`Echina,i'E. Just for your comparison, the multiple-pattern specifier,i`Echina||japan,i'Eis pretty much the same as the single regular expres- sion,i`Echina.*japan|japan.*china,i'E. If you use,i`E|!|,i'Einstead of,i`E||,i'E, it will mean,i`E...and then lines _n_o_t matching...,i'E.
Consider a way to find all lines of _k_a_n_j_i_d_i_c that do have a Halpern number, but don't have a Nelson number: search [edict]> |!| If you then wanted to restrict the listing to those that _a_l_s_o had a,i`Ejinmeiyou,i'Emarking (_k_a_n_j_i_d_i_c's,i`EG9,i'Efield) and had a reading of ox/cox-, you could make it: search [edict]> |!||||| A prepended,iAE+,i,Cwould explain: a match is,i`E,i'E and not,i`E,i'E and,i`E,i'E and,i`E,i'E The,i`E|!|,i'Eand,i`E||,i'Ecan be used to make up to ten separate reg- ular expressions in any one search specification. 11 LOOKUP(1) LOOKUP(1) Again, it is important to stress that,i`E||,i'Edoes not mean,i`Eor,i'E(as it does in a C program, or as,iAE|,i,Cdoes within a regular expression). You might find it convenient to read,i`E||,i'Eas,i`E_a_n_d also,i'E, while reading,i`E|!|,i'Eas,i`Ebut _n_o_t,i'E. It is also important to stress that any whitespace around the,i`E||,i'Eand,i`E|!|,i'Econstruct is _n_o_t ignored, but kept as part of the regex on either side. CCOOMMBBIINNAATTIIOONN SSLLOOTTSS Each file, when loaded, is assigned to a,i`Eslot,i'Evia which sub- sequent references to the file are then made. The slot may then be searched, have filters and flags set, etc. A special kind of slot, called a,i`Ecombination slot,i'E,rather than representing a single file, can represent multiple previ- ously-loaded slots. Searches against a combination slot (or,i`Ecombo slot,i'Efor short) search all those previously-loaded slots associated with it (called,i`Ecomponent slots,i'E). Combo slots are set up with the _c_o_m_b_i_n_e command. A Combo slot has no filter or modify spec, but can have a local prompt and flags just like normal file slots. The flags, however, have special meanings with combo slots. Most combo-slot flags act as a mask against the component-slot flags; when acted upon as a member of the combo, a component- slot's flag will be disabled if the corresponding combo-slot's flag is disabled. 
Exceptions to this are the _a_u_t_o_k_a_n_a, _f_u_z_z, and _t_a_g flags. The _a_u_t_o_k_a_n_a and _f_u_z_z flags governs a combo slot exactly the same as a regular file slot. When a slot is searched as a component of a combination slot, the component slot's _f_u_z_z (and _a_u_t_o_k_a_n_a) flags, or lack thereof, are ignored. The _t_a_g flag is quite different altogether; see the _t_a_g com- mand for complete information. Consider the following output from the _f_i_l_e_s command: "(R)"~"3"~"~"~"~","~"~"3"~"~"~"3"~"~"~"~"~"~"~"~"~"~"~"~"~"~ "- 0"-F wcfh d"/ca I "- 2762k"-/usr/jfriedl/lib/edict "- 1"-FM cf d"/ca I "- 705k"-/usr/jfriedl/lib/kanjidic "- 2"-F cfh@d"/ca "- 1k"-/usr/jfriedl/lib/local.words "-*3"-FM cfhtd"/ca "- combo"-kotoba (#2, #0) "+-"~",u"~"~"~"~"_o"~"~",u"~"~"~",u"~"~"~"~"~"~"~"~"~"~"~"~"~"~ See the discussion of the _f_i_l_e_s command below for basic expla- nation of the output. As can be seen, slot #3 is a _c_o_m_b_i_n_a_t_i_o_n _s_l_o_t with the name,i`Ekotoba,i'Ewith _c_o_m_p_o_n_e_n_t _s_l_o_t_s two and zero. When a search is initiated on this slot, first slot #2,i`Elocal.words,i'Ewill be 12 LOOKUP(1) LOOKUP(1) searched, then slot #0,i`Eedict,i'E. Because the combo slot's _f_i_l_t_e_r flag is _o_n, the component slots' _f_i_l_t_e_r flag will remain on during the search. The combo slot's _w_o_r_d flag is _o_f_f, however, so slot #0's _w_o_r_d flag will be forced off during the search. See the _c_o_m_b_i_n_e command for information about creating combo slots. PPAAGGEERR _L_o_o_k_u_p has a built in pager (a'la _m_o_r_e). Upon filling a screen with text, the string --MORE [space,return,c,q]-- is shown. A space will allow another screen of text; a return will allow one more line. A,iAEc,i,C will allow output text to continue unpaged until the next command. A,iAEq,i,C will flush output of the current command. If supported by the OS, _l_o_o_k_u_p_'_s idea of the screen size is automatically set upon startup and window resize. 
_L_o_o_k_u_p must know the width of the screen in doing both the horizontal input-line scrolling, and for knowing when a long line wraps on the screen. The pager parameters can be set manually with the,i`Epager,i'Ecom- mand. CCOOMMMMAANNDDSS Any line intended to be a command must begin with the command- introduction character (the default is a space, but can be set via the,i`Ecmdchar,i'Ecommand). However, that character is not part of the command itself and won't be shown in the following list of commands. There are a number of commands that work with the _s_e_l_e_c_t_e_d _f_i_l_e or _s_e_l_e_c_t_e_d _s_l_o_t (both meaning the same thing). The selected file is the one indicated by an appended comma+digit, as mentioned above. If no such indication is given, the default _s_e_l_e_c_t_e_d _f_i_l_e is used (usually the first file loaded, but can be changed with the,i`Eselect,i'Ecommand). Some commands accept a _b_o_o_l_e_a_n argument, such as to turn a flag on or off. In all such cases, a,i`E1,i'Eor,i`Eon,i'Emeans to turn the flag on, while a,i`E0,i'Eor,i`Eoff,i'Eis used to turn it off. Some flags are per-file (,i`Efuzz,i'E,,i`Efold,i'E, etc.), and a com- mand to set such a flag normally sets the flag for the selected file only. However, the default value inherited by subsequently loaded files can be set by prepend- ing,i`Edefault,i'Eto the command. This is particularly useful in the startup file before any files are loaded (see the section STARTUP FILE). Items separated by,iAE|,i,Care mutually exclusive possibilities (i.e. a boolean argument is,i`E1|on|0|off,i'E). 13 LOOKUP(1) LOOKUP(1) Items shown in brackets (,iAE[,i,Cand,iAE],i,C) are optional. All commands that accept a boolean argument to set a flag or mode do so optionally -- with no argument the command will report the current status of the mode or flag. Any command that allows an argument in quotes (such as load, etc.) allow the use of single or double quotes. 
The commands: [default] autokana [_b_o_o_l_e_a_n] Automatic romaji /c_a kana conversion for the _s_e_l_e_c_t_e_d _f_i_l_e is turned on or off (default is on). However, if,i`Edefault,i'Eis specified, the value to be inherited as the default by subsequently-loaded files is set (or reported). Can be temporarily disabled by a prepended,iAE=,i,C,as described in the INPUT SYNTAX section. clear|cls Attempts to clear the screen. If you're using a kterm it'll just output the appropriate tty control sequence. Otherwise it'll try to run the,i`Eclear,i'Ecommand. cmdchar ['_o_n_e_-_b_y_t_e_-_c_h_a_r'] The default command-introduction character is a space, but it may be changed via this command. The single quotes sur- rounding the character are required. If no argument is given, the current value is printed. An input line consisting of a single question mark will also print the current value (useful for when you don't know the current value). Woe to the one that sets the command-introduction character to one of the other special input-line characters, such as,iAE+,i,C,,iAE/,i,C, etc. combine ["name"] [ _n_u_m += ] _s_l_o_t_n_u_m ... Creates or adds file slots to a combination slot (see the COMBINATION SLOTS section for general information). Note that,i`Ecombo,i'Emay be used as the command as well. Assuming for this example that slots 0-2 are loaded with the files _c_u_r_l_y, _m_o_e, and _l_a_r_r_y, we can create a combina- tion slot that will reference all three: combo "three stooges" 2, 0, 1 The command will report creating combo slot #3 (three stooges): 2 0 1 14 LOOKUP(1) LOOKUP(1) The _n_a_m_e is optional, and will appear in the _f_i_l_e_s list, and may also be used to specify the slot as an argument to the _s_e_l_e_c_t command. A search via the newly created combo slot would search in the order specified on the _c_o_m_b_o command line: first _l_a_r_r_y, then _c_u_r_l_y, and finally _m_o_e.
If you later load another file (say, _j_e_f_f_r_e_y to slot #4), you can then add it to the previously made combo: combo 3 += 4 (the,i`E+=,i'Ewording comes from the C programming language where it means,i`Eadd on to,i'E). Adding to a combination always adds slots to the end of the list. You can take the opportunity of adding the slot to also change the name, if you like: combo "four stooges" 3 += 4 The reply would be adding to combo slot #3(four stooges): 4 A file slot can be a component of any particular combo slot only once. When reporting the created or added slot num- bers, the number will appear in parentheses if it had already been a member of the list. Furthermore, only _f_i_l_e slots can be component members of _c_o_m_b_o slots. Attempting to combine combo slot _X to combo slot _Y will result in having _X's component file slots (rather than the combo slot itself) added to _Y. command debug [_b_o_o_l_e_a_n] Sets the internal command parser debugging flag on or off (default is off). debug [_b_o_o_l_e_a_n] Sets the internal general-debugging flag on or off (default is off). describe _s_p_e_c_i_f_i_e_r This command will tell you how a character (or each charac- ter in a string) is encoded in the various encoding meth- ods: lookup command> describe ",uox" ,i`E,uox,i'Eas EUC is 0xb5a4 (181 164; 265 \244) as JIS is 0x3524 ( 53 36; 65 \044 "5$") as KUTEN is 2104 ( 0x1504; 25 \004) as S-JIS is 0x8b1f (139 31; 213 \037) 15 LOOKUP(1) LOOKUP(1) The quotes surrounding the character or string to describe are optional. You can also give a regular ASCII character and have the double-width version of the character described.... indicating,i`EA,i'E, for example, would describe,i`E-L'A,i'E. _S_p_e_c_i_f_i_e_r can also be a four-digit kuten value, in which case the character with that kuten will be described. If a four-digit _s_p_e_c_i_f_i_e_r has a hex digit in it, or if it is preceded by,i`E0x,i'E, the value is taken as a JIS code.
You can precede the value by,i`Ejis,i'E,,i`Esjis,i'E,,i`Eeuc,i'E, or,i`Ekuten,i'Eto force interpretation to the requested code. Finally, _s_p_e_c_i_f_i_e_r can be a string of stripped JIS (JIS w/o the kanji-in and kanji-out codes, or with the codes but without the escape characters in them). For exam- ple,i`EF|K\,i'Ewould describe the two characters AE"u and "E"U. encoding [euc|sjis|jis] The same as the -euc, -jis, and -sjis command-line options, sets the encoding method for interactive input and output (or reports the current status). More detail over the out- put encoding can be achieved with the _o_u_t_p_u_t _e_n_c_o_d_i_n_g com- mand. A separate encoding for input can be set with the _i_n_p_u_t _e_n_c_o_d_i_n_g command. files [ - | long ] Lists what files are loaded in what slots, and some status information about them, as with: "-*0"-F wcfh d"/ca I "- 3749k"-/usr/jeff/lib/edict "- 1"-FM cf d"/ca I "- 754k"-/usr/jeff/lib/kanjidic "(R)"~"3"~"~"~"~"~","~"~"3"~"~"~"3"~"~"~"~"~"~"~"~"~"~"~"~"~"~ "- 0"-F wcf h d "/ca I "- 2762k"-/usr/jfriedl/lib/edict "- 1"-FM cf d "/ca I "- 705k"-/usr/jfriedl/lib/kanjidic "- 2"-F cfWh@d "/ca "- 1k"-/usr/jfriedl/lib/local.words "-*3"-FM cf htd "/ca "- combo"-kotoba (#2, #0) "- 4"- cf d "/ca "- 205k"-/usr/dict/words "+-"~",u"~"~"~"~"~"_o"~"~",u"~"~"~",u"~"~"~"~"~"~"~"~"~"~"~"~"~"~ The first section is the slot number, with a,i`E*,i'Ebeside the _d_e_f_a_u_l_t _s_l_o_t (as set by the _s_e_l_e_c_t command). The second section shows per-slot flags and status. Letters are shown if the flag is on, omitted if off. In the list below, related commands are given for each item: F ,i"A if there is a filter {but '#' if disabled}. (filter) M ,i"A if there is a modify spec {but '%' if disabled}. (modify) w ,i"A if word-preference mode is turned on. (word) c ,i"A if case folding is turned on. (fold) f ,i"A if fuzzification is turned on. 
(fuzz) W ,i"A if wildcard-pattern mode is turned on (wildcard) 16 LOOKUP(1) LOOKUP(1) h ,i"A if highlighting is turned on. (highlight) t ,i"A if there is a tag {but @ if disabled} (tag) d ,i"A if found lines should be displayed (display) ",i",i",i",i",i",i",i",i",i",i",i",i",i",i",i",i",i",i",i",i",i",i",i",i",i",i",i",i",i",i",i",i",i a ,i"A if autokana is turned on (autokana) P ,i"A if there is a file-specific local prompt (prompt) I ,i"A if the file is loaded with a precomputed index (load) d ,i"A if the display flag is on (display) Note that the letters in the upper section directly corre- spond to the,i`E!!,i'Esequence characters described in the INPUT SYNTAX section. If there is a digit at the end of the flag section, it indicates that only #/10 of the file is actually loaded into memory (as opposed to the file having been completely loaded). Unloaded files will be loaded while _l_o_o_k_u_p is idle, or when first used. If the slot is a combination slot (as slot #3 is in the example above), that is noted in the third section, and the combination name and component slot numbers are noted in the fourth. Also, for combination slots (which have no _f_i_l_- _t_e_r or _m_o_d_i_f_y specifications, only the flags), _F and/or _M are shown if the corresponding mode is allowed during searches via the combo slot. See the _t_a_g command for info about _t with respect to combination slots. If an argument (either,i`E-,i'Eor,i`Elong,i'Ewill work) is given to the command, a short message about what the flags mean is also printed. filter ["_l_a_b_e_l"] [!] /_r_e_g_e_x/[i] Sets the filter for the _s_e_l_e_c_t_e_d _s_l_o_t (which must contain a file and not a combination). If a filter is set and active for a file, any line matching the given _r_e_g_e_x is filtered from the output (if the,iAE!,i,Cis put before the _r_e_g_e_x, any line _n_o_t matching the regex is filtered). The _l_a_b_e_l _, which isn't required, merely acts as documentation in vari- ous diagnostics. 
As an example, consider that _e_d_i_c_t lines often have,i`E(pn),i'Eon them to indicate that the given English is a place name. Often these place names can be a bother, so it would be nice to elide them from the output unless specifi- cally requested. Consider the example: lookup command> filter "name" /(pn)/ search [edict]> [ox-ox^I] ,u,i,C1/2 [ox-ox^Iox|] /function/faculty/ ,u/c,C1/4 [ox-ox^Iox|] /inductive/ _o`oAE"u [ox-ox^Iox|] /yesterday/ /c~a3 "name" lines filtered/c"a In the example,,iAE/,i,Ccharacters are used to delimit the 17 LOOKUP(1) LOOKUP(1) start and stop of the regex (as is common with many pro- grams). However, any character can be used. A final,iAEi,i,C, if present, indicates that the regex should be applied in a case-insensitive manner. The filter, once set, can be enabled or disabled with the other form of the,i`Efilter,i'Ecommand (described below). It can also be temporarily turned off (or, if disabled, tem- porarily turned on) by the,i`E!F!,i'Eline prefix. Filtered lines can optionally be saved and then displayed if you so desire. See the,i`Esaved list size,i'Eand,i`Eshow,i'Ecommands. Note that if you have saving enabled and only one line would be filtered, it is simply printed at the end (rather than print a one line message about how one line was fil- tered). By the way, a better,i`Ename,i'Efilter for _e_d_i_c_t would be: filter "name" #^[^/]+/[^/]*[^/]*/$# as it would filter all entries that had only one English section, that section being a name. It is also an example of using something other than,iAE/,i,Cto delimit a regex, as it makes things a bit easier to read. filter [_b_o_o_l_e_a_n] Enables or disables the filter for the _s_e_l_e_c_t_e_d _s_l_o_t. If no argument is given, displays the current filter and sta- tus. [default] fold [_b_o_o_l_e_a_n] The _s_e_l_e_c_t_e_d _s_l_o_t's case folding is turned on or off (default is on), or reported if no argument given. 
How- ever, if,i`Edefault,i'Eis specified, the value to be inherited as the default by subsequently-loaded files is set (or reported). Can be temporarily toggled by the,i`E!c!,i'Eline prefix. [default] fuzz [_b_o_o_l_e_a_n] The _s_e_l_e_c_t_e_d _s_l_o_t's fuzzification is turned on or off (default is on), or reported if no argument given. How- ever, if,i`Edefault,i'Eis specified, the value to be inherited as the default by subsequently-loaded files is set (or reported). Can be temporarily toggled by the,i`E!f!,i'Eline prefix. help [_r_e_g_e_x] Without an argument gives a short help list. With an argu- ment, lists only commands whose help string is picked up by 18 LOOKUP(1) LOOKUP(1) the given _r_e_g_e_x. [default] highlight [_b_o_o_l_e_a_n] Sets matched-string highlighting on or off for the _s_e_l_e_c_t_e_d _s_l_o_t (default off), or reports the current status if no argument is given. However, if,i`Edefault,i'Eis specified, the value to be inherited as the default by subsequently-loaded files is set (or reported). If on, shows in bold or reverse video (see below) that part of the line which was matched by the search _r_e_g_e_x. If mul- tiple regexes were given, that part matched by the first regex is shown. Note that a regex might match a portion of a line which is later removed by a _m_o_d_i_f_y parameter. In this case, no high- lighting is done. Can be temporarily toggled by the,i`E!h!,i'Eline prefix. highlight style [_b_o_l_d | _i_n_v_e_r_s_e | _s_t_a_n_d_o_u_t | _<_______>] Sets the style of highlighting for when highlighting is done. _I_n_v_e_r_s_e (inverse video) and _s_t_a_n_d_o_u_t are the same. The default is _b_o_l_d. You can also give an HTML tag, such as,i`E,i'Eand items will be wrapped by .... This would be particularly useful when the output is going to a CGI, as when lookup has been built in a server config- uration. Note that the highlighting is effected by using raw VT100/xterm control sequences.
This isn't particularly very nice if your terminal doesn't understand them. Sorry. if {_e_x_p_r_e_s_s_i_o_n} _c_o_m_m_a_n_d_._._. If the evaluated _e_x_p_r_e_s_s_i_o_n is non-zero, the _c_o_m_m_a_n_d will be executed. Note that {} rather than () surround the _e_x_p_r_e_s_s_i_o_n. _E_x_p_r_e_s_s_i_o_n may be comprised of numbers, operators, paren- thesis, etc. In addition to the normal +, -, *, and /, are: !_x ,i"A yields 0 if _x is non-zero, 1 if _x is zero. _x && _y ,i"A !_x ,i"A,iAEnot,i,CYields 1 if _x is zero, 0 if non-zero. _x & _y ,i"A,iAEand,i,CYields 1 if both _x and _y are non-zero, 0 otherwise. _x | _y ,i"A,iAEor,i,C Yields 1 if _x or _y (or both) is non-zero, 0 otherwise 19 LOOKUP(1) LOOKUP(1) There may also be the special tokens _t_r_u_e and _f_a_l_s_e which are 1 and 0 respectively. There are also _c_h_e_c_k_e_d, _m_a_t_c_h_e_d, _p_r_i_n_t_e_d, _n_o_n_w_o_r_d, and _f_i_l_- _t_e_r_e_d which correspond to the values printed by the _s_t_a_t_s command. An example use might be the following kind of thing in an computer-generated script: !d!expect this line if {!printed} msg Oops! couldn't find "expect this line" input encoding [ euc | sjis ] Used to set (or report) what encoding to use when 8-bit bytes are found in the interactive input (all flavors of JIS are always recognized). Also see the _e_n_c_o_d_i_n_g and _o_u_t_- _p_u_t _e_n_c_o_d_i_n_g commands. limit [_v_a_l_u_e] Sets the number of lines to print during any search before aborting (or reports the current number if no value given). Default is 100. Output limiting is disabled if set to zero. log [ to [+] _f_i_l_e ] Begins logging the program output to _f_i_l_e (the Japanese encoding method being the same as for screen output). If,i`E+,i'Eis given, the log is appended to any text that might have previously been in _f_i_l_e, in which case a leading dashed line is inserted into the file. If no arguments are given, reports the current logging sta- tus. 
log - | off If only,i`E-,i'Eor _o_f_f is given, any currently-opened log file is closed. load [-now|-whenneeded] "_f_i_l_e_n_a_m_e" Loads the named file to the next available slot. If a pre- computed index is found (as,i`E_f_i_l_e_n_a_m_e.jin,i'E)it is loaded as well. Otherwise, an index is generated internally. The file to be loaded (and the index, if loaded) will be loaded during idle times. This allows a startup file to list many files to be loaded, but not have to wait for each 20 LOOKUP(1) LOOKUP(1) of them to load in turn. Using the ,i`E-now,i'Eflag causes the load to happen immediately, while using the ,i`E-when- needed,i'Eoption (can be shortened to ,i`E-wn,i'E)causes the load to happen only when the slot is first accessed. Invoke _l_o_o_k_u_p as % lookup -writeindex _f_i_l_e_n_a_m_e to generate and write an index file, which will then be automatically used in the future. If the file has already been loaded, the file is not re- read, but the previously-read file is shared. The new slot will, however, have its own separate flags, prompt, filter, etc. modify /_r_e_g_e_x/_r_e_p_l_a_c_e/[ig] Sets the _m_o_d_i_f_y parameter for the _s_e_l_e_c_t_e_d _f_i_l_e. If a file has a modify parameter associated with it, each line selected during a search will have that part of the line which matches _r_e_g_e_x (if any) replaced by the _r_e_p_l_a_c_e_m_e_n_t string before being printed. Like the _f_i_l_t_e_r command, the delimiter need not be,iAE/,i,C; any non-space character is fine. If a final,iAEi,i,Cis given, the regex is applied in a case-insensitive manner. If a final,iAEg,i,Cis given, the replacement is done to all matches in the line, not just the first part that might match _r_e_g_e_x. The _r_e_p_l_a_c_e_m_e_n_t may have embedded,i`E1,i'E, etc. in it to refer to parts of the matched text (see the tutorial on regular expressions). The modify parameter, once set, may be enabled or disabled with the other form of the modify command (described below). 
It may also be temporarily toggled via the,i`E!m!,i'Eline prefix. A silly example for the ultra-nationalist might be: modify //Dainippon Teikoku/g So that a line such as AE"u9|"a [ox"Eox'Aox(R)ox'o] /Bank of Japan/ would come out as AE"u9|"a [ox"Eox'Aox(R)ox'o] /Bank of Dainippon Teikoku/ As a real example of the modify command with _k_a_n_j_i_d_i_c, con- sider that it is likely that one is not interested in all the various fields each entry has. The following can be used to remove the info on the U, N, Q, M, E, B, C, and Y fields from the output: modify /( [UNQMECBY]\S+)+//g,1 It's sort of complex, but works. Note that here the 21 LOOKUP(1) LOOKUP(1) _r_e_p_l_a_c_e_m_e_n_t part is empty, meaning to just remove those parts which matched. The result of such a search of AE"u would normally print AE"u 467c U65e5 N2097 B72 B73 S4 G1 H3027 F1 Q6010.0 MP5.0714 ,i`A MN13733 E62 Yri4 P3-3-1 =Y"E=Y'A =Y,=Y"A ox`O -ox'O -ox<< {day} but with the above modify spec, appears more simply as AE"u 467c S4 G1 H3027 F1 P3-3-1 =Y"E=Y'A =Y,=Y"A ox`O -ox'O -ox<< {day} modify [_b_o_o_l_e_a_n] Enables or disables the modify parameter for the _s_e_l_e_c_t_e_d _f_i_l_e, or report the current status if no argument is given. msg _s_t_r_i_n_g The given _s_t_r_i_n_g is printed. Most likely used in a script as the target command of an _i_f command. output encoding [ euc | sjis | jis...] Used to set exactly what kind of encoding should be used for program output (also see the _i_n_p_u_t _e_n_c_o_d_i_n_g command). Used when the _e_n_c_o_d_i_n_g command is not detailed enough for one's needs. If no argument is given, reports the current output encod- ing. Otherwise, arguments can usually be any reasonable dash-separated combination of: euc Selects EUC for the output encoding. sjis Selects Shift-JIS for the output encoding. jis[78|83|90][-ascii|-roman] Selects JIS for the output encoding. If no year (78, 83, or 90) given, 78 is used. 
Can optionally specify that,i`EEnglish,i'Eshould be encoded as regular _A_S_C_I_I (the default when JIS selected) or as _J_I_S_-_R_O_M_A_N. 212 Indicates that JIS X0212-1990 should be supported (ignored for Shift-JIS output). no212 Indicates that JIS X0212-1990 should not be sup- ported (default setting). This places JIS X0212-1990 characters under the domain of _d_i_s_p, _n_o_d_i_s_p, _c_o_d_e, or _m_a_r_k (described below). 22 LOOKUP(1) LOOKUP(1) hwk Indicates that _half _width _kana should be left as-is (default setting). nohwk Indicates that _half _width _kana should be stripped from the output. _(_n_o_t _y_e_t _i_m_p_l_e_m_e_n_t_e_d_)_. foldhwk Indicates that _half _width _kana should be folded to their full-width counterparts. _(_n_o_t _y_e_t _i_m_p_l_e_m_e_n_t_e_d_)_. disp Indicates that _n_o_n_-_d_i_s_p_l_a_y_a_b_l_e characters (such as JIS X0212-1990 while the output encoding method is Shift- JIS) should be passed along anyway (most likely resulting in screen garbage). nodisp Indicates that _n_o_n_-_d_i_s_p_l_a_y_a_b_l_e characters should be quietly stripped from the output. code Indicates that _n_o_n_-_d_i_s_p_l_a_y_a_b_l_e characters should be printed as their octal codes (default setting). mark Indicates that _n_o_n_-_d_i_s_p_l_a_y_a_b_l_e characters should be printed as,i`E,i'u,i'E. Of course, not all options make sense in all combina- tions, or at all times. When the current (or new) output encoding is reported, a complete and exact specifier rep- resenting the output encoding selected is shown. An example might be,i`Ejis78-ascii-no212-hwk-code,i'E. pager [ _b_o_o_l_e_a_n | _s_i_z_e ] Turns on or off an output pager, sets its idea of the screen size, or reports the current status. _S_i_z_e can be a single number indicating the number of lines to be printed between,i`EMORE?,i'Eprompts (usually a few lines less than the total screen height, the default being 20 lines).
It can also be two numbers in the form,i`E#x#,i'Ewhere the first number is the width (in half-width characters; default 80) and the second is the lines-per-page as above. If the pager is on, every page of output will result in a,i`EMORE?,i'Eprompt, at which there are four possible responses. A space will allow one more full page to print. A return will allow one more line. A,iAEc,i,C(for,i`Econ- tinue,i'E) will allow all the rest of the output (for the current command) to proceed without pause, while a,iAEq,i,C(for,i`Equit,i'E) will flush the output for the current 23 LOOKUP(1) LOOKUP(1) command. If supported by the OS, the pager size parameters are set appropriately from the window size upon startup or window resize. The default pager status is,i`Eoff,i'E. [local] prompt "_s_t_r_i_n_g" Sets the prompt string. If,i`Elocal,i'Eis indicated, sets the prompt string for the _s_e_l_e_c_t_e_d _s_l_o_t only. Otherwise, sets the global default prompt string. Prompt strings may have the special %-sequences shown below, with related commands given in parentheses: %N ,i"A the _d_e_f_a_u_l_t _s_l_o_t's file or combo name. %n ,i"A like %N, but any leading path is not shown if a filename. %# ,i"A the _d_e_f_a_u_l_t _s_l_o_t's number. %S ,i"A the,i`Ecommand-introduction,i'Echaracter (cmdchar) %0 ,i"A the running program's name %F='_s_t_r_i_n_g' ,i"A _s_t_r_i_n_g shown if filtering enabled (filter) %M='_s_t_r_i_n_g' ,i"A _s_t_r_i_n_g shown if modification enabled (modify) %w='_s_t_r_i_n_g' ,i"A _s_t_r_i_n_g shown if word mode on (word) %c='_s_t_r_i_n_g' ,i"A _s_t_r_i_n_g shown if case folding on (fold) %f='_s_t_r_i_n_g' ,i"A _s_t_r_i_n_g shown if fuzzification on (fuzz). %W='_s_t_r_i_n_g' ,i"A _s_t_r_i_n_g shown if wildcard-pat. mode on (wildcard). %d='_s_t_r_i_n_g' ,i"A _s_t_r_i_n_g shown if displaying on (display). %C='_s_t_r_i_n_g' ,i"A _s_t_r_i_n_g shown if currently entering a command. %l='_s_t_r_i_n_g' ,i"A _s_t_r_i_n_g shown if logging is on (log).
%L ,i"A the name of the current output log, if any (log) For the tests (%f, etc), you can put,iAE!,i,Cjust after the,iAE%,i,Cto reverse the sense of the test (i.e. %!f="no fuzz"). The reverse of %F is if a filter is installed but disabled (i.e. _s_t_r_i_n_g will never be shown if there is no filter for the default file). The modify %M works compara- bly. Also, you can use an alternative form for the items that take an argument string. Replacing the quotes with paren- theses will treat _s_t_r_i_n_g as a recursive prompt specifier. For example, the specifier %C='command'%!C(%f='fuzzy 'search:) would result in a,i`Ecommand,i'Eprompt if entering a command, while it would result in either a,i`Efuzzy search:,i'Eor a,i`Esearch:,i'Eprompt if not entering a command. The paren- thesized constructs may be nested. Note that the letters of the test constructs are the same as the letters for the,i`E!!,i'Esequences described in INPUT SYNTAX. 24 LOOKUP(1) LOOKUP(1) An example of a nice prompt command might be: prompt "%C(%0 command)%!C(%w'*'%!f'raw '%n)> " With this prompt specification, the prompt would normally appear as,i`E_f_i_l_e_n_a_m_e> ,i'Ebut when fuzzification is turned off as,i`Eraw _f_i_l_e_n_a_m_e> ,i'E. And if word-preference mode is on, the whole thing has a,i`E*,i'Eprepended. However if a command is being entered, the prompt would then become,i`E_n_a_m_e com- mand,i'E, where _n_a_m_e was the program's name (system depen- dent, but most likely,i`Elookup,i'E). The default prompt format string is,i`E%C(%0 com- mand)%!C(search [%n])> ,i'E. regex debug [_b_o_o_l_e_a_n] Sets the internal regex debugging flag (turn on if you want billions of lines of stuff spewed to your screen). saved list size [_v_a_l_u_e] During a search, lines that match might be elided from the output due to filters or word-preference mode. 
This com- mand sets the number of such lines to remember during any one search, such that they may be later displayed (before the next search) by the _s_h_o_w command. The default is 100. select [ _n_u_m | _n_a_m_e | . ] If _n_u_m is given, sets the _d_e_f_a_u_l_t _s_l_o_t to that slot number. If _n_a_m_e is given, sets the _d_e_f_a_u_l_t _s_l_o_t to the first slot found with a file (or combination) loaded with that name. The incantation,i`Eselect .,i'Emerely sets the default slot to itself, which can be useful in script files where you want to indicate that any subsequent flags changes should work with whatever file was the default at the time the script was _s_o_u_r_c_ed. If no argument is given, simply reports the current _d_e_f_a_u_l_t _s_l_o_t (also see the _f_i_l_e_s command). In command files loaded via the _s_o_u_r_c_e command, or as the startup file, commands dealing with per-slot items (flags, local prompt, filters, etc.) work with the file or slot last _s_e_l_e_c_ted. The last such selected slot remains selected once the load is complete. Interactively, the default slot will become the _s_e_l_e_c_t_e_d _s_l_o_t for subsequent searches and commands that aren't aug- mented with an appended,i`E,#,i'E(as described in the INPUT SYNTAX section). show Shows any lines elided from the previous search (either due 25 LOOKUP(1) LOOKUP(1) to a _f_i_l_t_e_r or _w_o_r_d_-_p_r_e_f_e_r_e_n_c_e _m_o_d_e). Will apply any modifications (see the,i`Emodify,i'Ecommand) if modifications are enabled for the file. You can use the,i`E!m!,i'Eline prefix as well with this command (in this case, put the,i`E!m!,i'E_b_e_f_o_r_e the command-indicator charac- ter). The length of the list is controlled by the,i`Esaved list size,i'Ecommand. source "_f_i_l_e_n_a_m_e" Commands are read from _f_i_l_e_n_a_m_e and executed. 
In the file, all lines beginning with,i`E#,i'Eare ignored as comments (note that comments must appear on a line by them- selves, as,i`E#,i'Eis a reasonable character to have within commands). Lines whose first non-blank character is,i`E=,i'E,,i`E!,i'E,or,i`E+,i'Eare considered searches, while all other non-blank lines are considered _l_o_o_k_u_p commands. Therefore, there is no need for lines to begin with the command-introduction character. However, leading whitespace is always OK. For search lines, take care that any trailing whitespace is deleted if undesired, as trailing whitespace (like all non- leading whitespace) is kept as part of the regular expres- sion. Within a command file, commands that modify per-file flags and such always work with the most-recently loaded (or selected) file. Therefore, something along the lines of load "my.word.list" set word on load "my.kanji.list" set word off set local prompt "enter kanji> " would work as might make intuitive sense. Since a script file must have a _l_o_a_d or _s_e_l_e_c_t before any per-slot flag is set, one can use,i`Eselect .,i'Eto facilitate command scripts that are to work with,i`Ethe current slot,i'E. spinner [_v_a_l_u_e] Set the value of the spinner (A silly little feature). If set to a non-zero value, will cause a spinner to spin while a file is being checked, one increment per _v_a_l_u_e lines in 26 LOOKUP(1) LOOKUP(1) the file actually checked against the search specifier. Default is off (i.e. zero). stats Shows information about how many lines of the text file were checked against the last search specifier, and how many lines matched and were printed. tag [_b_o_o_l_e_a_n] ["_s_t_r_i_n_g"] Enable, disable, or set the tag for the _s_e_l_e_c_t_e_d _s_l_o_t. If the slot is not a combination slot, a tag _s_t_r_i_n_g may be set (the quotes are required). If a tag string is set and enabled for a file, the string is prepended to each matching output line printed.
Unlike the _f_i_l_t_e_r and _m_o_d_i_f_y commands which automatically enable the function when a parameter is set, a _t_a_g is not automatically enabled when set. It can be enabled while being set via,i`E'tag,i'Eonor could be enabled subsequently via just,i`Etag on,i'E If the selected slot is a combination slot, only the enable/disable status may be changed (on by default). No tag string may be set. The reason for the special treatment lies in the special nature of how tags work in conjunction with combination files. During a search when the selected slot is a combination slot, each file which is a member of the combination has its per-file flags disabled if their corresponding flag is disabled in the original combination slot. This allows the combination slot's flags to act as a,i`Emask,i'Eto blot out each component file's per-file flags. The tag flag, however, is special in that the component file's tag flag is turned _o_n if the combination slot's tag flag is turned on (and, of course, the component file has a tag string registered). The intended use of this is that one might set a (disabled) tag to a file, yet _d_i_r_e_c_t searches against that file will have no prepended tag. However, if the file is searched as part of a combination slot (and the combination slot's tag flag is on), the tag _w_i_l_l be prepended, allowing one to easily understand from which file an output line comes. verbose [_b_o_o_l_e_a_n] Sets verbose mode on or off, or reports the current status (default on). Many commands reply with a confirmation if verbose mode is turned on. 27 LOOKUP(1) LOOKUP(1) version Reports the current version of the program. [default] wildcard [_b_o_o_l_e_a_n] The _s_e_l_e_c_t_e_d _s_l_o_t's patterns are considerd wildcard pat- terns if turned on, regular expressions if turned off. The current status is reported if no argument given. 
However, if,i`Edefault,i'Eis specified, the pattern-type to be inherited as the default by subsequently-loaded files is set (or reported). Can be temporarily toggled by the,i`E!W!,i'Eline prefix. When wildcard patterns are selected, the changed metachar- acters are:,i`E*,i'Emeans,i`Eany stuff,i'E,,i`E?,i'Emeans,i`Eany one character,i'E,while,i`E+,i'Eand,i`E.,i'Ebecome unspecial. Other regex items such as,i`E|,i'E,,i`E(,i'E,,i`E[,i'E,etc. are unchanged. What,i`E*,i'Eand,i`E?,i'Ewill actually match depends upon the sta- tus of word-mode, as well as on the pattern itself. If word-mode is on, or if the pattern begins with the start- of-word,i`E<,i'Eor,i`E[,i'E,only non-spaces will be matched. Other- wise, any character will be matched. In summary, when wildcard mode is on, the input pattern is affected in the following ways: * is changed to the regular expression .* or \S* ? is changed to the regular expression . or \S + is changed to the regular expression \+ . is changed to the regular expression \. Because filename patterns are often called,i`Efilename globs,i'E,the command,i`Eglob,i'Ecan be used in place of,i`Ewild- card,i'E. [default] word|wordpreference [_b_o_o_l_e_a_n] The selected file's word-preference mode is turned on or off (default is off), or reports the current setting if no argument is specified. However, if,i`Edefault,i'Eis specified, the value to be inherited as the default by subsequently- loaded files is set (or reported). In word-preference mode, entries are searched for _a_s _i_f the search regex had a leading,iAE<,i,Cand a trailing,iAE>,i,C, resulting in a list of entries with a whole-word match of the regex. However, if there are none, but there _a_r_e non- word entries, the non-word entries are shown (the,i`Esaved list,i'Eis used for this -- see that command). This makes it an,i`Eif there are whole words like this, show me, otherwise show me whatever you've got,i'Emode.
If there are both word and non-word entries, the non-word 28 LOOKUP(1) LOOKUP(1) entries are remembered in the saved list (rather than any possible filtered entries being remembered there). One caveat: if a search matches a line in more than one place, and the first is _n_o_t a whole-word, while one of the others _i_s, the line will be considered non-whole word. For example, the search,i"Ojapan,ixwith word-preference mode on will not list an entry such as,i`E/Japanese/language in Japan/,i'E, as the first,i`EJapan,i'Eis part of,i`EJapanese,i'Eand not a whole word. If you really need just whole-word entries, use the,iAE<,i,Cand,iAE>,i,Cyourself. The mode may be temporarily toggled via the,i`E!w!,i'Eline pre- fix. The rules defining what lines are filtered, remembered, discarded, and shown for each permutation of search are rather complex, but the end result is rather intuitive. quit | leave | bye | exit Exits the program. SSTTAARRTTUUPP FFIILLEE If the file,i`E~/.lookup,i'Eis present, commands are read from it during _l_o_o_k_u_p startup. The file is read in the same way as the _s_o_u_r_c_e command reads files (see that entry for more information on file format, etc.). However, if there had been files loaded via command-line argu- ments, commands within the startup file to load files (and their associated commands such as to set per-file flags) are ignored. Similarly, any use of the command-line flags -euc, -jis, or -sjis will disable in the startup file the commands dealing with setting the input and/or output encodings. The special treatment mentioned in the above two paragraphs only applies to commands within the startup file itself, and does not apply to commands in command-files that might be _s_o_u_r_c_ed from within the startup file.
The following is a reasonable example of a startup file: ## turn verbose mode off during startup file processing verbose off prompt "%C([%#]%0)%!C(%w'*'%!f'raw '%n)> " spinner 200 pager on ## The filter for edict will hit for entries that ## have only one English part, and that English part 29 LOOKUP(1) LOOKUP(1) ## having a pl or pn designation. load ~/lib/edict filter "name" #^[^/]+/[^/]*[^/]*/$# highlight on word on ## The filter for kanjidic will hit for entries without a ## frequency-of-use number. The modify spec will remove ## fields with the named initial code (U,N,Q,M,E, and Y) load ~/lib/kanjidic filter "uncommon" !// modify /( [UNQMEY])+//g ## Use the same filter for my local word file, ## but turn off by default. load ~/lib/local.words filter "name" #^[^/]+/[^/]*[^/]*/$# filter off highlight on word on ## Want a tag for my local words, but only when ## accessed via the combo below tag off ",i~O" combine "words" 2 0 select words ## turn verbosity back on for interactive use. verbose on CCOOMMMMAANNDD--LLIINNEE AARRGGUUMMEENNTTSS With the use of a startup file, command-line arguments are rarely needed. In practical use, they are only needed to cre- ate an index file, as in: lookup -write _t_e_x_t_f_i_l_e Any command line arguments that aren't flags are taken to be files which are loaded in turn during startup. In this case, any,i`Eload,i'E,,i`Efilter,i'E, etc. commands in the startup file are ignored. The following flags are supported: -help Reports a short help message and exits. -write Creates index files for the named files and exits. No _s_t_a_r_t_u_p _f_i_l_e is read. -euc Sets the input and output encoding method to EUC (currently the default). Exactly the same as the,i`Eencoding 30 LOOKUP(1) LOOKUP(1) euc,i'Ecommand. -jis Sets the input and output encoding method to JIS. Exactly the same as the,i`Eencoding jis,i'Ecommand. -sjis Sets the input and output encoding method to Shift-JIS. Exactly the same as the,i`Eencoding sjis,i'Ecommand. 
-v -version Prints the version string and exits. -norc Indicates that the startup file should not be read. -rc _f_i_l_e The named file is used as the startup file, rather than the default,i`E~/.lookup,i'E. It is an error for the file not to exist. -percent _n_u_m When an index is built, letters that appear on more than _n_u_m percent (default 50) of the lines are elided from the index. The thought is that if a search will have to check most of the lines in a file anyway, one may as well save the large amount of space in the index file needed to rep- resent that information, and the time/space tradeoff shifts, as the indexing of oft-occurring letters provides a diminishing return. Smaller indexes can be made by using a smaller number. -noindex Indicates that any files loaded via the command line should not be loaded with any precomputed index, but recalculated on the fly. -verbose Has metric tons of stats spewed whenever an index is cre- ated. -port ### For the (undocumented) server configuration only, tells which port to listen on. OOPPEERRAATTIINNGG SSYYSSTTEEMM CCOONNSSIIDDEERRAATTIIOONNSS I/O primitives and behaviors vary with the operating system. On my operating system, I can,i`Eread,i'Ea file by mapping it into memory, which is a pretty much instant procedure regardless of the size of the file. When I later access that memory, the appropriate sections of the file are automatically read into memory by the operating system as needed. 31 LOOKUP(1) LOOKUP(1) This results in _l_o_o_k_u_p starting up and presenting a prompt very quickly, but causes the first few searches that need to check a lot of lines in the file to go more slowly (as lots of the file will need to be read in). However, once the bulk of the file is in, searches will go very fast. The win here is that the rather long file-load times are amortized over the first few (or few dozen, depending upon the situation) searches rather than always faced right at command startup time. 
On the other hand, on an operating system without the mapping ability, _l_o_o_k_u_p would start up very slowly as all the files and indexes are read into memory, but would then search quickly from the beginning, all the file already having been read. To get around the slow startup, particularly when many files are loaded, _l_o_o_k_u_p uses _l_a_z_y _l_o_a_d_i_n_g if it can: a file is not actually read into memory at the time the _l_o_a_d command is given. Rather, it will be read when first actually accessed. Furthermore, files are loaded while _l_o_o_k_u_p is idle, such as when waiting for user input. See the _f_i_l_e_s command for more information. RREEGGUULLAARR EEXXPPRREESSSSIIOONNSS,, AA BBRRIIEEFF TTUUTTOORRIIAALL _R_e_g_u_l_a_r _e_x_p_r_e_s_s_i_o_n_s (,i`Eregex,i'Efor short) are a,i`Ecode,i'Eused to indicate what kind of text you're looking for. They're how one searches for things in the editors,i`Evi,i'E,,i`Este- vie,i'E,,i`Emifes,i'Eetc., or with the grep commands. There are differences among the various regex flavors in use -- I'll describe the flavor used by _l_o_o_k_u_p here. Also, in order to be clear for the common case, I might tell a few lies, but noth- ing too heinous. The regex,i"Oa,ixmeans,i`Eany line with an,iAEa,i,Cin it.,i'E Simple enough. The regex,i"Oab,ixmeans,i`Eany line with an,iAEa,i,Cimmediately fol- lowed by a,iAEb,i,C,i'E. So the line I am feeling flabby would,i`Ematch,i'Ethe regex,i"Oab,ixbecause, indeed, there's an,i`Eab,i'Eon that line. But it wouldn't match the line this line has no a followed _immediately_ by a b because, well, what the lines says is true. In most cases, letters and numbers in a regex just mean that you're looking for those letters and numbers in the order given. However, there are some special characters used within a regex. A simple example would be a period. Rather than indicate that you're looking for a period, it means,i`Eany character,i'E. 
So 32 LOOKUP(1) LOOKUP(1) the silly regex,i"O.,ixwould mean,i`Eany line that has any charac- ter on it.,i'EWell, maybe not so silly... you can use it to find non-blank lines. But more commonly it's used as part of a larger regex. Con- sider the regex,i"Ogray,ix. It wouldn't match the line The sky was grey and cloudy. because of the different spelling (grey vs. gray). But the regex,i"Ogr.y,ixasks for,i`Eany line with a,iAEg,i,C,,iAEr,i,C, some character, and then a,iAEy,i,C,i'E. So this would get,i`Egrey,i'Eand,i`Egray,i'E. A special construct somewhat similar to,iAE.,i,Cwould be the _c_h_a_r_a_c_t_e_r _c_l_a_s_s. A character class starts with a,iAE[,i,Cand ends with a,iAE],i,C, and will match any character given in between. An example might be gr[ea]y which would match lines with a,iAEg,i,C,,iAEr,i,C, an,iAEe,i,C_o_r an,iAEa,i,C, and then a,iAEy,i,C. Inside a character class you can list as many characters as you want to. For example the simple regex,i"Ox[0123456789]y,ixwould match any line with a digit sandwiched between an,iAEx,i,Cand a,iAEy,i,C. The order of the characters within the character class doesn't really matter...,i"O[513467289],ixwould be the same as,i"O[0123456789],ix. But as a short cut, you could put,i"O[0-9],ixinstead of,i"O[0123456789],ix. So the character class,i"O[a-z],ixwould match any lower-case letter, while the character class,i"O[a-zA-Z0-9],ixwould match any letter or digit. The character,iAE-,i,Cis special within a character class, but only if it's not the first thing. Another character that's special in a character class is,iAE^,i,C, if it _i_s the first thing. It,i`Einverts,i'Ethe class so that it will match any char- acter _n_o_t listed. The class,i"O[^a-zA-Z0-9],ixwould match any line with spaces or punctuation on them. There are some special short-hand sequences for some common character classes. The sequence,i"O\d,ixmeans,i`Edigit,i'E, and is the same as,i"O[0-9],ix. 
,i"O\w,ixmeans,i`Eword element,i'Eand is the same as,i"O[0-9a-zA-Z_],ix. ,i"O\s,ixmeans,i`Espace-type thing,i'Eand is the same as,i"O[ \t],ix(,i"O\t,ixmeans tab). You can also use,i"O\D,ix,,i"O\W,ix, and,i"O\S,ixto mean things _n_o_t a digit, word element, or space-type thing. Another special character would be,iAE?,i,C. This means,i`Emaybe one of whatever was just before it, not is fine too,i'E. In the regex ,i"Obikes? for rent,ix, the,i`Ewhatever,i'Ewould be the,iAEs,i,C, 33 LOOKUP(1) LOOKUP(1) so this would match lines with either,i`Ebikes for rent,i'Eor,i`Ebike for rent,i'E. Parentheses are also special, and can group things together. In the regex big (fat harry)? deal the,i`Ewhatever,i'Efor the,iAE?,i,Cwould be,i`Efat harry,i'E. But be careful to pay attention to details... this regex would match I don't see what the big fat harry deal is! but _n_o_t I don't see what the big deal is! That's because if you take away the,i`Ewhatever,i'Eof the,iAE?,i,C, you end up with big deal Notice that there are _t_w_o spaces between the words, and the regex didn't allow for that. The regex to get either line above would be big (fat harry )?deal or big( fat harry)? deal Do you see how they're essentially the same? Similar to,iAE?,i,Cis,iAE*,i,C, which means,i`Eany number, including none, of whatever's right in front,i'E. It more or less means that whatever is tagged with,iAE*,i,Cis allowed, but not required, so something like I (really )*hate peas would match,i`EI hate peas,i'E,,i`EI really hate peas!,i'E,,i`EI really really hate peas,i'E, etc. Similar to both,iAE?,i,Cand,iAE*,i,Cis,iAE+,i,C, which means,i`Eat least one of whatever just in front, but more is fine too,i'E. The regex,i"Omis+pelling,ixwould match,i`Emi_spelling,i'E,,i`Emi_s_- _spelling,i'E,,i`Emi_s_s_spelling,i'E, etc. Actually, it's just the same as,i"Omiss*pelling,ixbut more simple to type. 
The regex,i"Oss*,ixmeans,i`Ean,iAEs,i,C, followed by zero or more,iAEs,i,C,i'E, while,i"Os+,ixmeans,i`Eone or more,iAEs,i,C,i'E. Both really the same. The special character,iAE|,i,Cmeans,i`Eor,i'E. Unlike,iAE+,i,C,,iAE*,i,C, and,iAE?,i,Cwhich act on the thing _i_m_m_e_d_i_a_t_e_l_y before, the,iAE|,i,Cis more,i`Eglobal,i'E. give me (this|that) one Would match lines that had,i`Egive me this one,i'Eor,i`Egive me that one,i'Ein them. You can even combine more than two: give me (this|that|the other) one How about: [Ii]t is a (nice |sunny |bright |clear )*day 34 LOOKUP(1) LOOKUP(1) Here, the,i`Ewhatever,i'Eimmediately before the,iAE*,i,Cis (nice |sunny |bright |clear ) So this regex would match all the following lines: _I_t _i_s _a _d_a_y. I think _i_t _i_s _a _n_i_c_e _d_a_y. _I_t _i_s _a _c_l_e_a_r _s_u_n_n_y _d_a_y today. If _i_t _i_s _a _c_l_e_a_r _s_u_n_n_y _n_i_c_e _s_u_n_n_y _s_u_n_n_y _s_u_n_n_y _b_r_i_g_h_t _d_a_y then.... Notice how the,i"O[Ii]t,ixmatches either,i`EIt,i'Eor,i`Eit,i'E? Note that the above regex would also match fru_i_t _i_s _a _d_a_y because it indeed fulfills all requirements of the regex, even though the,i`Eit,i'Eis really part of the word,i`Efruit,i'E. To answer concerns like this, which are common, are,iAE<,i,Cand,iAE>,i,C, which mean,i`Eword break,i'E. The regex,i"O,ixwould match any line with,i`Eit,i'E_e_n_d_i_n_g _a _w_o_r_d. And, of course,,i"O,ixwould match any line with _t_h_e _w_o_r_d,i`Eit,i'Ein it. Going back to the regex to find grey/gray, that would make more sense, then, as which would match only the _w_o_r_d_s,i`Egrey,i'Eand,i`Egray,i'E. Some- what similar are,iAE^,i,Cand,iAE$,i,C, which mean,i`Ebeginning of line,i'Eand,i`Eend of line,i'E, respectively (but, not in a charac- ter class, of course). So the regex,i"O^fun,ixwould find any line that begins with the letters,i`Efun,i'E, while,i"O^fun>,ixwould find any line that begins with the _w_o_r_d,i`Efun,i'E. ,i"O^fun$,ixwould find any line that was exactly,i`Efun,i'E. 
Finally,,i"O^\s*fun\s*$,ixwould match any line that,i`Efun,i'Eexactly, but perhaps also had leading and/or trail- ing whitespace. That's pretty much it. There are more complex things, some of which I'll mention in the list below, but even with these few simple constructs one can specify very detailed and complex patterns. Let's summarize some of the special things in regular expres- sions: Items that are basic units: _c_h_a_r any non-special character matches itself. \_c_h_a_r special chars, when proceeded by \, become non-special. . Matches any one character (except \n). \n Newline \t Tab. \r Carriage Return. \f Formfeed. \d Digit. Just a short-hand for [0-9]. \w Word element. Just a short-hand for [0-9a-zA-Z_]. \s Whitespace. Just a short-hand for [\t \n\r\f]. 35 LOOKUP(1) LOOKUP(1) \## \### Two or three digit octal number indicating a single byte. [_c_h_a_r_s] Matches a character if it's one of the characters listed. [^_c_h_a_r_s] Matches a character if it's not one of the ones listed. The \_c_h_a_r items above can be used within a character class, but not the items below. \D Anything not \d. \W Anything not \w. \S Anything not \s. \a Any ASCII character. \A Any multibyte character. \k Any (not half-width) katakana character (including ,i1/4). \K Any character not \k (except \n). \h Any hiragana character. \H Any character not \h (except \n). (_r_e_g_e_x) Parens make the _r_e_g_e_x one unit. (?:_r_e_g_e_x) [from perl5] Grouping-only parens -- can't use for \# (below) \c Any JISX0208 kanji (kuten rows 16-84) \C Any character not \c (except \n). \# Match whatever was matched by the #th paren from the left. With,i`E,i`u,i'Eto indicate one,i`Eunit,i'Eas above, the following may be used: ,i`u? A ,i`u allowed, but not required. ,i`u+ At least one ,i`u required, but more ok. ,i`u* Any number of ,i`u ok, but none required. There are also ways to match,i`Esituations,i'E: \b A word boundary. < Same as \b. > Same as \b. ^ Matches the beginning of the line. 
$ Matches the end of the line. Finally, the,i`Eor,i'Eis _r_e_g_1|_r_e_g_2 Match if either _r_e_g_1 or _r_e_g_2 match. Note that,i`E\k,i'Eand the like aren't allowed in character classes, so something such as,i"O[\k\h],ixto try to get all kana won't work. Use ,i"O(\k|\h),ixinstead. BBUUGGSS Needs full support for half-width katakana and JIS X 0212-1990. Non-EUC (JIS & SJIS) items not tested well. Probably won't work on non-UNIX systems. Screen control codes (for clear and highlight commands) are hard-coded for ANSI/VT100/kterm. 36 LOOKUP(1) LOOKUP(1) AAUUTTHHOORR Jeffrey Friedl (jfriedl@nff.ncl.omron.co.jp) IINNFFOO Jim Breen's text files _e_d_i_c_t and _k_a_n_j_i_d_i_c and their documenta- tion can be found in,i`Epub/nihongo,i'Eon ftp.cc.monash.edu.au (130.194.1.106 Information on input and output encoding and codes can be found in Ken Lunde's _U_n_d_e_r_s_t_a_n_d_i_n_g _J_a_p_a_n_e_s_e _I_n_f_o_r_m_a_t_i_o_n _P_r_o_- _c_e_s_s_i_n_g (AE"u"E"U,`i3/4`'o^E'o1/2`e'I'y) published by O'Reilly and Asso- ciates. ISBN 1-56592-043-0. There is also a Japanese edition published by SoftBank. A program to convert files among the various encoding methods is Dr. Ken Lunde's_j_c_o_n_v, which can also be found on ftp.cc.monash.edu.au. _J_c_o_n_v is also useful for converting halfwidth katakana (which _l_o_o_k_u_p doesn't yet support well) to full-width. 37 lookup-1.08b.orig/doc/c_combine.so0100644000014400001440000000336405554652613016547 0ustar nakaharastaff.TP combine ["name"] [ \fInum\fP += ] \fIslotnum\fP ... .br Creates or adds file slots to a combination slot (see the COMBINATION SLOTS section for general information). Note that\c .Q combo may be used as the command as well. 
Assuming for this example that slots 0-2 are loaded with the files .IR curly , .IR moe , and .IR larry , we can create a combination slot that will reference all three: .nf combo "three stooges" 2, 0, 1 .fi The command will report .nf creating combo slot #3 (three stooges): 2 0 1 .fi The .I name is optional, and will appear in the .I files list, and may also be used to specify the slot as an argument to the .I select command. A search via the newly created combo slot would search in the order specified on the .I combo command line: first .IR larry , then .IR curly , and finally .IR moe . If you later load another file (say, .I jeffrey to slot #4), you can then add it to the previously made combo: .nf combo 3 += 4 .fi (the\c .Q "+=" wording comes from the C programming language where it means\c .Q "add on to" "). " Adding to a combination always adds slots to the end of the list. You can take the opportunity of adding the slot to also change the name, if you like: .nf combo "four stooges" 3 += 4 .fi The reply would be .nf adding to combo slot #3(four stooges): 4 .fi A file slot can be a component of any particular combo slot only once. When reporting the created or added slot numbers, the number will appear in parentheses if it had already been a member of the list. Furthermore, only .I file slots can be component members of .I combo slots. Attempting to combine combo slot .I X to combo slot .I Y will result in having .IR X "'s" component file slots (rather than the combo slot itself) added to .IR Y . 
lookup-1.08b.orig/doc/c_describe.so0100644000014400001440000000233706027341744016706 0ustar nakaharastaff.TP describe \fIspecifier\fP This command will tell you how a character (or each character in a string) is encoded in the various encoding methods: .nf lookup command> describe "気" “気”as EUC is 0xb5a4 (181 164; \265 \\244) as JIS is 0x3524 ( 53 36; \065 \\044 "5$") as KUTEN is 2104 ( 0x1504; \025 \\004) as S-JIS is 0x8b1f (139 31; \213 \\037) .fi The quotes surrounding the character or string to describe are optional. You can also give a regular ASCII character and have the double-width version of the character described.... indicating\c .Q A ", " for example, would describe\c .Q A ". " .I Specifier can also be a four-digit kuten value, in which case the character with that kuten will be described. If a four-digit .I specifier has a hex digit in it, or if it is preceded by\c .Q 0x ", " the value is taken as a JIS code. You can precede the value by\c .Q jis ,\c .Q sjis ,\c .Q euc ", " or\c .Q kuten to force interpretation to the requested code. Finally, .I specifier can be a string of stripped JIS (JIS w/o the kanji-in and kanji-out codes, or with the codes but without the escape characters in them). For example\c .Q "F|K\e" would describe the two characters 日 and 本. lookup-1.08b.orig/doc/c_clear.so0100644000014400001440000000026305534330173016204 0ustar nakaharastaff.TP clear|cls .br Attempts to clear the screen. If you're using a kterm it'll just output the appropriate tty control sequence. Otherwise it'll try to run the\c .Q clear command. lookup-1.08b.orig/doc/c_debug.so0100644000014400001440000000014305523140327016177 0ustar nakaharastaff.TP debug [\fIboolean\fP] .br Sets the internal general-debugging flag on or off (default is off). 
lookup-1.08b.orig/doc/c_display.so0100644000014400001440000000121505554652634016574 0ustar nakaharastaff.TP [default] display [\fIboolean\fP] .br The .IR "selected slot" "'s" display flag is turned on or off (default is on), or reported if no argument given. However, if\c .Q default is specified, the value to be inherited as the default by subsequently-loaded files is set (or reported). Can be temporarily toggled by the\c .Q "!d!" line prefix. When a slot's display flag is off, lines that would normally print during a search are not printed (yet the "printed" value in an .I if expression retains the count of lines that would have been printed) Normally used in a script as in: .nf !d!/LookForMe if {!printed} msg oops, it's not found. .fi lookup-1.08b.orig/doc/c_files.so0100644000014400001440000000516305755557575016252 0ustar nakaharastaff.TP files [ - | long ] .br Lists what files are loaded in what slots, and some status information about them, as with: .nf ┃*0┃F wcfh d│a I ┃ 3749k┃/usr/jeff/lib/edict ┃ 1┃FM cf d│a I ┃ 754k┃/usr/jeff/lib/kanjidic ┏━┳━━━━━┯━━┳━━━┳━━━━━━━━━━━━━━ ┃ 0┃F wcf h d │a I ┃ 2762k┃/usr/jfriedl/lib/edict ┃ 1┃FM cf d │a I ┃ 705k┃/usr/jfriedl/lib/kanjidic ┃ 2┃F cfWh@d │a ┃ 1k┃/usr/jfriedl/lib/local.words ┃*3┃FM cf htd │a ┃ combo┃kotoba (#2, #0) ┃ 4┃ cf d │a ┃ 205k┃/usr/dict/words ┗━┻━━━━━┷━━┻━━━┻━━━━━━━━━━━━━━ .fi The first section is the slot number, with a\c .Q * beside the .IR "default slot" (as set by the .I select command). The second section shows per-slot flags and status. Letters are shown if the flag is on, omitted if off. In the list below, related commands are given for each item: .nf F … if there is a filter {but '#' if disabled}. (filter) M … if there is a modify spec {but '%' if disabled}. (modify) w … if word-preference mode is turned on. (word) c … if case folding is turned on. (fold) f … if fuzzification is turned on. (fuzz) W … if wildcard-pattern mode is turned on (wildcard) h … if highlighting is turned on. 
(highlight) t … if there is a tag {but @ if disabled} (tag) d … if found lines should be displayed (display) ───────────────────────────────── a … if autokana is turned on (autokana) P … if there is a file-specific local prompt (prompt) I … if the file is loaded with a precomputed index (load) d … if the display flag is on (display) .fi Note that the letters in the upper section directly correspond to the\c .Q !! sequence characters described in the INPUT SYNTAX section. If there is a digit at the end of the flag section, it indicates that only #/10 of the file is actually loaded into memory (as opposed to the file having been completely loaded). Unloaded files will be loaded while .I lookup is idle, or when first used. If the slot is a combination slot (as slot #3 is in the example above), that is noted in the third section, and the combination name and component slot numbers are noted in the fourth. Also, for combination slots (which have no .I filter or .I modify specifications, only the flags), .IR F and/or .IR M are shown if the corresponding mode is allowed during searches via the combo slot. See the .I tag command for info about .I t with respect to combination slots. If an argument (either\c .Q - or\c .Q long will work) is given to the command, a short message about what the flags mean is also printed. lookup-1.08b.orig/doc/c_filter.so0100644000014400001440000000425605554652655016427 0ustar nakaharastaff.TP filter ["\fIlabel\fP"] [!] /\fIregex\fP/[i] .br Sets the filter for the .I "selected slot" (which must contain a file and not a combination). If a filter is set and active for a file, any line matching the given .I regex is filtered from the output (if the\&‘\&!\&’\&is put before the .IR regex , any line .I not matching the regex is filtered). The .I label , which isn't required, merely acts as documentation in various diagnostics. As an example, consider that .I edict lines often have\c .Q (pn) on them to indicate that the given English is a place name. 
Often these place names can be a bother, so it would be nice to elide them from the output unless specifically requested. Consider the example: .nf lookup command> filter "name" /(pn)/ search [edict]> [きの] 機能 [きのう] /function/faculty/ 帰納 [きのう] /inductive/ 昨日 [きのう] /yesterday/ ≪3 "name" lines filtered≫ .fi In the example,\&‘\&/\&’\&characters are used to delimit the start and stop of the regex (as is common with many programs). However, any character can be used. A final\&‘\&i\&’\&, if present, indicates that the regex should be applied in a case-insensitive manner. The filter, once set, can be enabled or disabled with the other form of the\c .Q filter command (described below). It can also be temporarily turned off (or, if disabled, temporarily turned on) by the\c .Q !F! line prefix. Filtered lines can optionally be saved and then displayed if you so desire. See the\c .Q "saved list size" and\c .Q show commands. Note that if you have saving enabled and only one line would be filtered, it is simply printed at the end (rather than print a one line message about how one line was filtered). By the way, a better\c .Q name filter for .I edict would be: .nf filter "name" #^[^/]+/[^/]*[^/]*/$# .fi as it would filter all entries that had only one English section, that section being a name. It is also an example of using something other than\&‘\&/\&’\&to delimit a regex, as it makes things a bit easier to read. .TP filter [\fIboolean\fP] .br Enables or disables the filter for the . IR "selected slot" . If no argument is given, displays the current filter and status. lookup-1.08b.orig/doc/c_fold.so0100644000014400001440000000052605554652660016056 0ustar nakaharastaff.TP [default] fold [\fIboolean\fP] .br The .IR "selected slot" "'s" case folding is turned on or off (default is on), or reported if no argument given. However, if\c .Q default is specified, the value to be inherited as the default by subsequently-loaded files is set (or reported). 
Can be temporarily toggled by the\c .Q "!c!" line prefix. lookup-1.08b.orig/doc/c_fuzz.so0100644000014400001440000000052705554652676016140 0ustar nakaharastaff.TP [default] fuzz [\fIboolean\fP] .br The .IR "selected slot" "'s" fuzzification is turned on or off (default is on), or reported if no argument given. However, if\c .Q default is specified, the value to be inherited as the default by subsequently-loaded files is set (or reported). Can be temporarily toggled by the\c .Q "!f!" line prefix. lookup-1.08b.orig/doc/c_help.so0100644000014400001440000000024605523140330016037 0ustar nakaharastaff.TP help [\fIregex\fP] .br Without an argument gives a short help list. With an argument, lists only commands whose help string is picked up by the given .IR regex . lookup-1.08b.orig/doc/c_highlight.so0100644000014400001440000000246506027341623017073 0ustar nakaharastaff.TP [default] highlight [\fIboolean\fP] .br Sets matched-string highlighting on or off for the .I "selected slot" (default off), or reports the current status if no argument is given. However, if\c .Q default is specified, the value to be inherited as the default by subsequently-loaded files is set (or reported). If on, shows in bold or reverse video (see below) that part of the line which was matched by the search .IR regex . If multiple regexes were given, that part matched by the first regex is shown. Note that a regex might match a portion of a line which is later removed by a .I modify parameter. In this case, no highlighting is done. Can be temporarily toggled by the\c .Q "!h!" line prefix. .TP highlight style [\fIbold\fP | \fIinverse\fP | \fIstandout\fP | \fI<___>\fP] .br Sets the style of highlighting for when highlighting is done. .I Inverse (inverse video) and .I standout are the same. The default is .IR bold . You can also give an HTML tag, such as\c .Q "<B>" and items will be wrapped by <B>...</B>. 
This would be particularly useful when the output is going to a CGI, as when lookup has been built in a server configuration. Note that the highlighting is effected by using raw VT100/xterm control sequences. This isn't particularly nice if your terminal doesn't understand them. Sorry. lookup-1.08b.orig/doc/regex.so0100644000014400001440000002377306100574170015736 0ustar nakaharastaff.I "Regular expressions" (\&“\&regex\&”\&for short) are a\c .Q code used to indicate what kind of text you're looking for. They're how one searches for things in the editors\c .Q vi ",\c" .Q stevie ",\c" .Q mifes etc., or with the grep commands. There are differences among the various regex flavors in use -- I'll describe the flavor used by .I lookup here. Also, in order to be clear for the common case, I might tell a few lies, but nothing too heinous. The regex\&「\&a\&」\&means\c .Q "any line with an\&‘\&a\&’\&in it." Simple enough. The regex\&「\&ab\&」\&means\&“\&any line with an\&‘\&a\&’\&immediately followed by a\&‘\&b\&’\&\&”\&. So the line .nf I am feeling flabby .fi would\c .Q match the regex\&「\&ab\&」\&because, indeed, there's an\c .Q ab on that line. But it wouldn't match the line .nf this line has no a followed _immediately_ by a b .fi because, well, what the line says is true. In most cases, letters and numbers in a regex just mean that you're looking for those letters and numbers in the order given. However, there are some special characters used within a regex. A simple example would be a period. Rather than indicate that you're looking for a period, it means\c .Q "any character" ". " So the silly regex\&「\&.\&」\&would mean\c .Q "any line that has any character on it." Well, maybe not so silly... you can use it to find non-blank lines. But more commonly it's used as part of a larger regex. Consider the regex\&「\&gray\&」\&. It wouldn't match the line .nf The sky was grey and cloudy. .fi because of the different spelling (grey vs. gray). 
But the regex\&「\&gr.y\&」\&asks for\&“\&any line with a\&‘\&g\&’\&,\&‘\&r\&’\&, some character, and then a\&‘\&y\&’\&”\&. So this would get\c .Q grey and\c .Q gray ". " A special construct somewhat similar to\&‘\&.\&’\&would be the .IR "character class" . A character class starts with a\&‘\&[\&’\&and ends with a\&‘\&]\&’\&, and will match any character given in between. An example might be .nf gr[ea]y .fi which would match lines with a\&‘\&g\&’\&,\&‘\&r\&’\&, an\&‘\&e\&’\&\fIor\fP an\&‘\&a\&’\&, and then a\&‘\&y\&’\&. Inside a character class you can list as many characters as you want to. For example the simple regex\&「\&x[0123456789]y\&」\&would match any line with a digit sandwiched between an\&‘\&x\&’\&and a\&‘\&y\&’\&. The order of the characters within the character class doesn't really matter...\&「\&[513467289]\&」\&would be the same as\&「\&[0123456789]\&」\&. But as a short cut, you could put\&「\&[0\-9]\&」\&instead of\&「\&[0123456789]\&」\&. So the character class\&「\&[a\-z]\&」\&would match any lower-case letter, while the character class\&「\&[a\-zA\-Z0\-9]\&」\&would match any letter or digit. The character\&‘\&\-\&’\&is special within a character class, but only if it's not the first thing. Another character that's special in a character class is\&‘\&^\&’\&, if it .I is the first thing. It\c .Q inverts the class so that it will match any character .I not listed. The class\&「\&[^a\-zA\-Z0\-9]\&」\&would match any line with spaces or punctuation on them. There are some special short-hand sequences for some common character classes. The sequence\&「\&\\d\&」\&means\c .Q digit ", " and is the same as\&「\&[0\-9]\&」\&. \&「\&\\w\&」\&means\c .Q "word element" and is the same as\&「\&[0\-9a\-zA\-Z_]\&」\&. \&「\&\\s\&」\&means\c .Q "space-type thing" and is the same as\&「\&[ \\t]\&」\&(\&「\&\\t\&」\&means tab). You can also use\&「\&\\D\&」\&,\&「\&\\W\&」\&, and\&「\&\\S\&」\&to mean things .I not a digit, word element, or space-type thing. 
Another special character would be\&‘\&?\&’\&. This means\c .Q "maybe one of whatever was just before it, none is fine too" ". " In the regex \&「\&bikes? for rent\&」\&, the\c .Q whatever would be the\&‘\&s\&’\&, so this would match lines with either\c .Q "bikes for rent" or\c .Q "bike for rent" ". " Parentheses are also special, and can group things together. In the regex .nf big (fat harry)? deal .fi the\c .Q whatever for the\&‘\&?\&’\&would be\c .Q "fat harry" ". " But be careful to pay attention to details... this regex would match .nf I don't see what the big fat harry deal is! .fi but .I not .nf I don't see what the big deal is! .fi That's because if you take away the\c .Q whatever of the\&‘\&?\&’\&, you end up with .nf big  deal .fi Notice that there are .I two spaces between the words, and the regex didn't allow for that. The regex to get either line above would be .nf big (fat harry )?deal .fi or .nf big( fat harry)? deal .fi Do you see how they're essentially the same? Similar to\&‘\&?\&’\&is\&‘\&*\&’\&, which means\c .Q "any number, including none, of whatever's right in front" ". " It more or less means that whatever is tagged with\&‘\&*\&’\&is allowed, but not required, so something like .nf I (really )*hate peas .fi would match\c .Q "I hate peas" ",\c" .Q "I really hate peas!" ",\c" .Q "I really really hate peas" ", " etc. Similar to both\&‘\&?\&’\&and\&‘\&*\&’\&is\&‘\&+\&’\&, which means\c .Q "at least one of whatever just in front, but more is fine too" ". " The regex\&「\&mis+pelling\&」\&would match\c .Q "mi\fIs\fPpelling" ",\c" .Q "mi\fIss\fPpelling" ",\c" .Q "mi\fIsss\fPpelling" ", " etc. Actually, it's just the same as\&「\&miss*pelling\&」\&but simpler to type. The regex\&「\&ss*\&」\&means\&“\&an\&‘\&s\&’\&, followed by zero or more\&‘\&s\&’\&\&”\&, while\&「\&s+\&」\&means\c .Q "one or more\&‘\&s\&’\&" ". " Both are really the same. The special character\&‘\&|\&’\&means\c .Q or ". 
" Unlike\&‘\&+\&’\&,\&‘\&*\&’\&, and\&‘\&?\&’\&which act on the thing .I immediately before, the\&‘\&|\&’\&is more\c .Q global ". " .nf give me (this|that) one .fi Would match lines that had\c .Q "give me this one" or\c .Q "give me that one" in them. You can even combine more than two: .nf give me (this|that|the other) one .fi How about: .nf [Ii]t is a (nice |sunny |bright |clear )*day .fi Here, the\c .Q whatever immediately before the\&‘\&*\&’\&is .nf (nice |sunny |bright |clear ) .fi So this regex would match all the following lines: .nf \fIIt is a day\fP. I think \fIit is a nice day\fP. \fIIt is a clear sunny day\fP today. If \fIit is a clear sunny nice sunny sunny sunny bright day\fP then.... .fi Notice how the\&「\&[Ii]t\&」\&matches either\c .Q It or\c .Q it "? " Note that the above regex would also match .nf fru\fIit is a day\fP .fi because it indeed fulfills all requirements of the regex, even though the\c .Q it is really part of the word\c .Q fruit ". " To answer concerns like this, which are common, are\&‘\&<\&’\&and\&‘\&>\&’\&, which mean\c .Q "word break" ". " The regex\&「\&<it\&」\&would match any line with\c .Q it .IR "beginning a word" ", " while\&「\&it>\&」\&would match any line with\c .Q it .IR "ending a word" ". " And, of course,\&「\&<it>\&」\&would match any line with .I "the word\c" .Q it in it. Going back to the regex to find grey/gray, that would make more sense, then, as .nf <gr[ea]y> .fi which would match only the .I words\c .Q grey and\c .Q gray ". " Somewhat similar are\&‘\&^\&’\&and\&‘\&$\&’\&, which mean\c .Q "beginning of line" and\c .Q "end of line" ", " respectively (but, not in a character class, of course). So the regex\&「\&^fun\&」\&would find any line that begins with the letters\c .Q fun ", " while\&「\&^fun>\&」\&would find any line that begins with the .I word\c .Q fun ". " \&「\&^fun$\&」\&would find any line that was exactly\c .Q fun ". " Finally,\&「\&^\\s*fun\\s*$\&」\&would match any line that was\c .Q fun exactly, but perhaps also had leading and/or trailing whitespace. That's pretty much it. 
There are more complex things, some of which I'll mention in the list below, but even with these few simple constructs one can specify very detailed and complex patterns. Let's summarize some of the special things in regular expressions: .nf Items that are basic units: \fIchar\fP any non-special character matches itself. \\\fIchar\fP special chars, when preceded by \\, become non-special. . Matches any one character (except \\n). \\n Newline \\t Tab. \\r Carriage Return. \\f Formfeed. \\d Digit. Just a short-hand for [0\-9]. \\w Word element. Just a short-hand for [0\-9a\-zA\-Z_]. \\s Whitespace. Just a short-hand for [\\t \\n\\r\\f]. \\## \\### Two or three digit octal number indicating a single byte. [\fIchars\fP] Matches a character if it's one of the characters listed. [^\fIchars\fP] Matches a character if it's not one of the ones listed. The \\\fIchar\fP items above can be used within a character class, but not the items below. \\D Anything not \\d. \\W Anything not \\w. \\S Anything not \\s. \\a Any ASCII character. \\A Any multibyte character. \\k Any (not half-width) katakana character (including ー). \\K Any character not \\k (except \\n). \\h Any hiragana character. \\H Any character not \\h (except \\n). (\fIregex\fP) Parens make the \fIregex\fP one unit. (?:\fIregex\fP) [from perl5] Grouping-only parens -- can't use for \\# (below) \\c Any JISX0208 kanji (kuten rows 16-84) \\C Any character not \\c (except \\n). \\# Match whatever was matched by the #th paren from the left. With“☆”to indicate one“unit”as above, the following may be used: ☆? A ☆ allowed, but not required. ☆+ At least one ☆ required, but more ok. ☆* Any number of ☆ ok, but none required. There are also ways to match“situations”: \\b A word boundary. < Same as \\b. > Same as \\b. ^ Matches the beginning of the line. $ Matches the end of the line. Finally, the“or”is \fIreg1\fP|\fIreg2\fP Match if either \fIreg1\fP or \fIreg2\fP matches. 
Note that“\\k”and the like aren't allowed in character classes, so something such as「[\\k\\h]」to try to get all kana won't work. Use 「(\\k|\\h)」instead. .fi lookup-1.08b.orig/doc/lookup.man0100644000014400001440000010770606027342117016270 0ustar nakaharastaff.if \n1 .ll \n1n \" for page width.. . use cmd line arg -r1# to set width to # .de Q \" puts quotes around the argument. End previous line with \c \&“\&\\$1\&”\&\\$2\&\\c .. .TH LOOKUP 1 .nr IN 3n .ce 1 April 22nd, 1994 .SH NAME lookup \- interactive file search and display .SH SYNOPSIS .B lookup [ args ] [ .I file ... ] .br .SH DESCRIPTION .I Lookup allows the quick interactive search of text files. It supports ASCII, JIS-ROMAN, and Japanese EUC Packed formated text, and has an integrated romaji→kana converter. .SH THIS MANUAL .I Lookup is flexible for a variety of applications. This manual will, however, focus on the application of searching Jim Breen's .I edict (Japanese-English dictionary) and .I kanjidic (kanji database). Being familiar with the content and format of these files would be helpful. See the INFO section near the end of this manual for information on how to obtain these files and their documentation. .SH OVERVIEW OF MAJOR FEATURES The following just mentions some major features to whet your appetite to actually read the whole manual (-: .TP Romaji-to-Kana Converter .I Lookup can convert romaji to kana for you, even\c .Q "on the fly" as you type. .TP Fuzzy Searching Searches can be a bit\c .Q vague or\c .Q fuzzy ", " so that you'll be able to find\c .Q 東京 even if you try to search for\c .Q ときょ (the proper yomikata being\c .Q とうきょう "). " .TP Regular Expressions Uses the powerful and expressive .I "regular expression" for searching. 
One can easily specify complex searches that affect\&“\&I want lines that look like such-and-such, but not like this-and-that, but that also have this particular characteristic....\&” .TP Wildcard ``Glob'' Patterns Optionally, can use well-known filename wildcard patterns instead of full-fledged regular expressions. .TP Filters You can have .I lookup not list certain lines that would otherwise match your search, yet can optionally save them for quick review. For example, you could have all name-only entries from .I edict filtered from normal output. .TP Automatic Modifications Similarly, you can do a standard search-and-replace on lines just before they print, perhaps to remove information you don't care to see on most searches. For example, if you're generally not interested in .IR kanjidic "'s" info on Chinese readings, you can have them removed from lines before printing. .TP Smart Word-Preference Mode You can have .I lookup list only entries with .I "whole words" that match your search (as opposed to an .I embedded match, such as finding\c .Q the inside\c .Q them "), " but if no whole-word matches exist, will go ahead and list any entry that matches the search. .TP Handy Features Other handy features include a dynamically settable and parameterized prompt, automatic highlighting of that part of the line that matches your search, an output pager, readline-like input with horizontal scrolling for long input lines, a\c .Q .lookup startup file, automated programability, and much more. Read on! .SH REGULAR EXPRESSIONS .I Lookup makes liberal use of .I "regular expressions" (or .I regex for short) in controlling various aspects of the searches. If you are not familiar with the important concepts of regexes, read the tutorial appendix of this manual before continuing. .SH JAPANESE CHARACTER ENCODING METHODS Internally, .I lookup works with Japanese packed-format EUC, and all files loaded must be encoded similarly. 
If you have files encoded in JIS or Shift-JIS, you must first convert them to EUC before loading (see the INFO section for programs that can do this). Interactive input and output encoding, however, may be selected via the -jis, -sjis, and -euc invocation flags (default is -euc), or by various commands to the program (described later). Make sure to use the encoding appropriate for your system. If you're using kterm under the X Window System, you can use .IR lookup "'s" -jis flag to match kterm's default JIS encoding. Or, you might use kterm's\c .Q "-km euc" startup option (or menu selection) to put kterm into EUC mode. Also, I have found kterm's scrollbar (\c .Q "-sb -sl 500" ") " to be quite useful. With many\c .Q English fonts in Japan, the character that normally prints as a backslash (halfwidth version of \\&) in The States appears as a yen symbol (the half-width version of ¥\&). How it will appear on your system is a function of what font you use and what output encoding method you choose, which may be different from the font and method that was used to print this manual (both of which may be different from what's printed on your keyboard's appropriate key). Make sure to keep this in mind while reading. .SH STARTUP Let's assume that your copy of .I edict is in ~/lib/edict. You can start the program simply with .nf lookup ~/lib/edict .fi You'll note that .I lookup spends some time building an index before the default\c .Q "lookup>\ " prompt appears. .I Lookup gains much of its search speed by constructing an index of the file(s) to be searched. Since building the index can be time consuming itself, you can have .I lookup write the built index to a file that can be quickly loaded the next time you run the program. Index files will be given a\c .Q .jin (Jeffrey's Index) ending.
Let's build the indices for .I edict and .I kanjidic now: .nf lookup -write ~/lib/edict ~/lib/kanjidic .fi This will create the index files .nf ~/lib/edict.jin ~/lib/kanjidic.jin .fi and exit. You can now re-start .I lookup , automatically using the pre-computed index files as: .nf lookup ~/lib/edict ~/lib/kanjidic .fi You should then be presented with the prompt without having to wait for the index to be constructed (but see the section on Operating System concerns for possible reasons of delay). .SH INPUT There are basically two types of input: searches and commands. Commands do such things as tell .I lookup to load more files or set flags. Searches report lines of a file that match some search specifier (where lines to search for are specified by one or more regular expressions). The input syntax may perhaps at first seem odd, but has been designed to be powerful and concise. A bit of time invested to learn it well will pay off greatly when you need it. .SH BRIEF EXAMPLE Assuming you've started .I lookup with .I edict and .I kanjidic as noted above, let's try a few searches. In these examples, the .nf “search [edict]> ” .fi is the prompt. Note that the space after the\&‘\&>\&’\&is part of the prompt. Given the input: .nf search [edict]> tranquil .fi .I lookup will report all lines with the string\c .Q tranquil in them. There are currently about a dozen such lines, two of which look like: .nf 安らか [やすらか] /peaceful (an)/tranquil/calm/restful/ 安らぎ [やすらぎ] /peace/tranquility/ .fi Notice that lines with\c .Q tranquil \fIand\fP\c .Q tranquility matched? This is because\c .Q tranquil was embedded in the word\&“\&tranquility\&”\&. 
You could restrict the search to only the \fIword\fP\c .Q tranquil by prepending the special\c .Q "start of word" symbol\&‘\&<\&’\&and appending the special\c .Q "end of word" symbol\&‘\&>\&’\&to the regex, as in: .nf search [edict]> .fi This is the regular expression that says\&“\&the beginning of a word, followed by a\&‘\&t\&’\&,\&‘\&r\&’\&, ...,\&‘\&l\&’\&, which is at the end of a word.\&”The current version of .I edict has just three matching entries. Let's try another: .nf search [edict]> fukushima .fi This is a search for the\c .Q English fukushima -- ways to search for kana or kanji will be explored later. Note that among the several lines selected and printed are: .nf 副島 [ふくしま] /Fukushima (pn,pl)/ 木曽福島 [きそふくしま] /Kisofukushima (pl)/ .fi By default, searches are done in a case-insensitive manner --\&‘\&F\&’\&and\&‘\&f\&’\&are treated the same by .IR lookup , at least so far as the matching goes. This is called .IR "case folding" . Let's give a command to turn this option off, so that\&‘\&f\&’\&and\&‘\&F\&’\&won't be considered the same. Here's an odd point about .I "lookup's" input syntax: the default setting is that all command lines must begin with a space. The space is the (default) command-introduction character and tells the input parser to expect a command rather than a search regular expression. .I It is a common mistake at first to forget the leading space when issuing a command. Be careful. Try the command\c .Q "\ fold" to report the current status of case-folding. Notice that as soon as you type the space, the prompt changes to .nf “lookup command> ” .fi as a reminder that now you're typing a command rather than a search specification. .nf lookup command> fold .fi The reply should be\c .Q "file #0's case folding is on" .br You can actually turn it off with\c .Q " fold off" ". " Now try the search for\c .Q fukushima again. Notice that this time the entries with\c .Q Fukushima aren't listed? 
Now try the search string\c .Q Fukushima and see that the entries with\c .Q fukushima aren't listed. Case folding is usually very convenient (it also makes corresponding katakana and hiragana match the same), so don't forget to turn it back on: .nf lookup command> fold on .fi .SH JAPANESE INPUT .I Lookup has an automatic romaji→kana converter. A leading\&‘\&/\&’\&indicates that romaji is to follow. Try typing\c .Q /tokyo and you'll see it convert to\c .Q /\&ときょ as you type. When you hit return, .I lookup will list all lines that have a\&“\&ときょ\&”\&somewhere in them. Well, sort of. Look carefully at the lines which match. Among them (if you had case folding back on) you'll see: .nf キリスト教 [キリストきょう] /Christianity/ 東京 [とうきょう] /Toukyou (pl)/Tokyo/current capital of Japan/ 凸鏡 [とっきょう] /convex lens/ .fi The first one has\&“\&ときょ\&”\&in it (as\&“\&トきょ\&”\&, where the katakana\&“\&ト\&”\&matches in a case-insensitive manner to the hiragana\&“\&と\&”\&), but you might consider the others unexpected, since they don't have\c .Q ときょ in them. They're close (\&“\&とうきょ\&”\&and\&“\&とっきょ\&”\&), but not exact. This is the result of .IR lookup "'s\c" .Q fuzzification "\&." Try the command\c .Q "\ fuzz" (again, don't forget the command-introduction space). You'll see that fuzzification is turned on. Turn it off with\c .Q "\ fuzz off" and try\c .Q /tokyo (which will convert as you type) again. This time you only get the lines which have\&“\&ときょ\&”\&exactly (well, case folding is still on, so it might match katakana as well). In a fuzzy search, length of vowels is ignored --\&“\&と\&”\&is considered the same as\&“\&とう\&”\&, for example. Also, the presence or absence of any\&“\&っ\&”\&character is ignored, and the pairs じ ぢ, ず づ, え ゑ, and お を are considered identical in a fuzzy search. It might be convenient to consider a fuzzy search to be a\c .Q "pronunciation search" ". " Special note: fuzzification will not be performed if a regular expression\c .Q "*" , .Q "+" , or\c .Q "?" 
modifies a non-ASCII character. This is not an issue when input patterns are filename-like wildcard patterns (discussed below). In addition to kana fuzziness, there's one special case for kanji when fuzziness is on. The kanji repeater mark\c .Q "々" will be recognized such that\c .Q "時々" and\c .Q "時時" will match each other. Turn fuzzification back on (\&“\&fuzz on\&”\&), and search for all .I "whole words" which sound like\&“\&tokyo\&”\&. That search would be specified as: .nf search [edict]> /<tokyo> .fi (again, the\c .Q tokyo will be converted to\c .Q ときょ as you type). My copy of .I edict has the three lines .nf 東京 [とうきょう] /Toukyou (pl)/Tokyo/current capital of Japan/ 特許 [とっきょ] /special permission/patent/ 凸鏡 [とっきょう] /convex lens/ .fi This kind of whole-word romaji-to-kana search is so common, there's a special short cut. Instead of typing\&“\&/<tokyo>\&”\&, you can type\c .Q [tokyo] ". " The leading\&‘\&[\&’\&means\&“\&start romaji\&”\&\c .I and\c .Q "start of word" ". " Were you to type\c .Q <tokyo> instead (without a leading\&‘\&/\&’\&or\&‘\&[\&’\&to indicate romaji-to-kana conversion), you would get all lines with the .I English whole-word\c .Q tokyo in them. That would be a reasonable request as well, but not what we want at the moment. Besides the kana conversion, you can use any cut-and-paste that your windowing system might provide to get Japanese text onto the search line. Cut\c .Q ときょ from somewhere and paste onto the search line. When hitting enter to run the search, you'll notice that it is done without fuzzification (even if the fuzzification flag was\c .Q on "). " That's because there's no leading\&‘\&/\&’\&. Not only does a leading\&‘\&/\&’\&indicate that you want the romaji-to-kana conversion, but that you want it done fuzzily. So, if you'd like fuzzy cut-and-paste, just type a leading\&‘\&/\&’\&before pasting (or go back and prepend one after pasting). These examples have all been pretty simple, but you can use all the power that regexes have to offer.
As a slightly more complex example, the search\c .Q would look for all lines with the words\c .Q grey or\c .Q gray in them. Since the\&‘\&[\&’\&isn't the first character of the line, it doesn't mean what was mentioned above (start-of-word romaji). In this case, it's just the regular-expression\c .Q class indicator. If you feel more comfortable using filename-like\c .Q "*.txt" wildcard patterns, you can use the\c .Q "wildcard on" command to have patterns be considered this way. This has been a quick introduction to the basics of .IR lookup . It can be very powerful and much more complex. Below is a detailed description of its various parts and features. .SH READLINE INPUT The actual keystrokes are read by a readline-ish package that is pretty standard. In addition to just typing away, the following keystrokes are available: .nf ^B / ^F move left/right one character on the line ^A / ^E move to the start/end of the line ^H / ^G delete one character to the left/right of the cursor ^U / ^K delete all characters to the left/right of the cursor ^P / ^N previous/next lines on the history list ^L or ^R redraw the line ^D delete char under the cursor, or EOF if line is empty ^space force romaji conversion (^@ on some systems) .fi If automatic romaji-to-kana conversion is turned on (as it is by default), there are certain situations where the conversion will be done, as we saw above. Lower-case romaji will be converted to hiragana, while upper-case romaji to katakana. This usually won't matter, though, as case folding will treat hiragana and katakana the same in the searches. In exactly what situations the automatic conversion will be done is intended to be rather intuitive once the basic idea is learned. However, at .IR "any time" , one can use control-space to convert the ASCII to the left of the cursor to kana. 
This can be particularly useful when needing to enter kana on a command line (where auto conversion is never done; see below) .SH ROMAJI FLAVOR Most flavors of romaji are recognized. Special or non-obvious items are mentioned below. Lowercase are converted to hiragana, uppercase to katakana. Long vowels can be entered by repeating the vowel, or with\&‘\&-\&’\&or\&‘\&^\&’\&. In situations where an\&“\&n\&”\&could be vague, as in\&“\&na\&”\&being な or んあ\&, use a single quote to force ん\&. Therefore,「\&kenichi\&」→けにち while「\&ken'ichi\&」→けんいち\&. The romaji has been richly extended with many non-standard combinations such as ふぁ or ちぇ\&, which are represented in intuitive ways:「\&fa\&」→ふぁ\&,「\&che\&」→ちぇ\&. etc. Various other mappings of interest: .nf wo →を we→ゑ wi→ゐ VA →ヴァ VI→ヴィ VU→ヴ VE→ヴェ VO→ヴォ di →ぢ dzi→ぢ dya→ぢゃ dyu→ぢゅ dyo→ぢょ du →づ tzu→づ dzu→づ (the following kana are all smaller versions of the regular kana) xa →ぁ xi→ぃ xu→ぅ xe→ぇ xo→ぉ xu →ぅ xtu→っ xwa→ゎ xka→ヵ xke→ヶ xya→ゃ xyu→ゅ xyo→ょ .fi .SH INPUT SYNTAX Any input line beginning with a space (or whichever character is set as the command-introduction character) is processed as a command to .I lookup rather than a search spec. .I Automatic kana conversion is never done on these lines (but .I forced conversion with control-space may be done at any time). Other lines are taken as search regular expressions, with the following special cases: .TP ? A line consisting of a single question mark will report the current command-introduction character (the default is a space, but can be changed with the\c .Q cmdchar command). .TP = If a line begins with\&‘\&=\&’\&, the line (without the\&‘\&=\&’\&) is taken as a search regular expression, and no automatic (or internal -- see below) kana conversion is done anywhere on the line (although again, conversion can always be forced with control-space). 
This can be used to initiate a search where the beginning of the regex is the command-introduction character, or in certain situations where automatic kana conversion is temporarily not desired. .TP / A line beginning with\&‘\&/\&’\&indicates romaji input for the whole line. If automatic kana conversion is turned on, the conversion will be done in real-time, as the romaji is typed. Otherwise it will be done internally once the line is entered. .IR Regardless , the presence of the leading\&‘\&/\&’\&indicates that any kana (either converted or cut-and-pasted in) should be\c .Q fuzzified if fuzzification is turned on. As an addition to the above, if the line doesn't begin with\&‘\&=\&’\&or the command-introduction character (and automatic conversion is turned on),\&‘\&/\&’\& .I anywhere on the line initiates automatic conversion for the following word. .TP [ A line beginning with\&‘\&[\&’\&is taken to be romaji (just as a line beginning with\&‘\&/\&’\&, and the converted romaji is subject to fuzzification (if turned on). However, if\&‘\&[\&’\&is used rather than\&‘\&/\&’\&, an implied\&‘\&<\&’\&\c .Q "beginning of word" is prepended to the resulting kana regex. Also, any ending\&‘\&]\&’\&on such a line is converted to the\c .Q "ending of word" specifier\&‘\&>\&’\&in the resulting regex. .PP In addition to the above, lines may have certain prefixes and suffixes to control aspects of the search or command: .TP ! Various flags can be toggled for the duration of a particular search by prepending a\c .Q !! sequence to the input line. Sequences are shown below, along with commands related to each: .nf !F! … Filtration is toggled for this line (filter) !M! … Modification is toggled for this line (modify) !w! … Word-preference mode is toggled for this line (word) !c! … Case folding is toggled for this line (fold) !f! … Fuzzification is toggled for this line (fuzz) !W! … Wildcard-pattern mode is toggled for this line (wildcard) !r! … Raw. 
Force fuzzification off for this line !h! … Highlighting is toggled for this line (highlight) !t! … Tagging is toggled for this line (tag) !d! … Displaying is on for this line (display) .fi The letters can be combined, as in\c .Q "!cf!" . The final\&‘\&!\&’\& can be omitted if the first character after the sequence is not an ASCII letter. If no letters are given (\c .Q !! ").\c" .Q !f! is the default. These last two points can be conveniently combined in the common case of\c .Q !/romaji which would be the same as\c .Q !f!/romaji ". " The special sequence\c .Q !? lists the above, as well as indicates which are currently turned on. Note that the letters accepted in a\c .Q !! sequence are many of the indicators shown by the\c .Q files command. .TP + A\&‘\&+\&’\&prepended to anything above will cause the final search regex to be printed. This can be useful to see when and what kind of fuzzification and/or internal kana conversion is happening. Consider: .nf search [edict]> +/わかる a match is“わ[ぁあー]*っ?か[ぁあー]*る[ぅうおぉー]*” .fi Due to the\c .Q leading "\&/\, " the kana is fuzzified, which explains the somewhat complex resulting regex. For comparison, note: .nf search [edict]> +わかる a match is“わかる” search [edict]> +!/わかる a match is“わかる” .fi As the\&‘\&+\&’\&shows, these are not fuzzified. The first one has no leading\&‘\&/\&’\&or\&‘\&[\&’\&to induce fuzzification, while the second has the\&‘\&!\&’\&line prefix (which is the default version of\c .Q !f! "), " which toggles fuzzification mode to\c .Q off for that line. .TP \&, The default of all searches and most commands is to work with the first file loaded (\fIedict\fP in these examples). One can change this default (see the\c .Q select command) or, by appending a comma+digit sequence at the end of an input line, force that line to work with another previously-loaded file. An appended\c .Q ,1 works with first extra file loaded (in these examples, \fIkanjidic\fP). An appended\c .Q ,2 works with the 2nd extra file loaded, etc. 
An appended\c .Q ,0 works with the original first file (and can be useful if the default file has been changed via the\c .Q select command). The following sequence shows a common usage: .nf search [edict]> [ときょと] 東京都 [とうきょうと] /Tokyo Metropolitan area/ .fi cutting and pasting the 都 from above, and adding a\c .Q ,1 to search .IR kanjidic : .nf search [edict]> 都,1 都 4554 N4769 S11 ..... ト ツ みやこ {metropolis} {capital} .fi .SH FILENAME-LIKE WILDCARD MATCHING When wildcard-pattern mode is selected, patterns are considered as extended\ .Q "*.txt" "-like" patterns. This is often more convenient for users not familiar with regular expressions. To have this mode selected by default, put .nf default wildcard on .fi into your\c .Q ".lookup" file (see\c .Q "STARTUP FILE" below). When wildcard mode is on, only \c .Q "*" , .Q "?" , .Q "+" , and\c .Q "." , are effected. See the entry for the .Q wildcard command below for details. Other features, such as the multiple-pattern searches (described below) and other regular-expression metacharacters are available. .SH MULTIPLE-PATTERN SEARCHES You can put multiple patterns in a single search specifier. For example consider .nf search [edict]> china||japan .fi The first part (\&“\&china\&”\&) will select all lines that have\c .Q china in them. Then, .IR "from among those lines" , the second part will select lines that have\c .Q japan in them. The\c .Q || is not part of any pattern -- it is .IR lookup "'s\c" .Q pipe mechanism. The above example is very different from the single pattern \&“\&china|japan\&”\&which would select any line that had either\&“\&china\&”\&\c .I or\c .Q japan ". " With\c .Q china||japan ", " you get lines that have\c .Q china .I "and then also" have\c .Q japan as well. Note that it is also different from the regular expression\c .Q china.*japan (or the wildcard pattern\c .Q china*japan ")" which would select lines having\c .Q "china, then maybe some stuff, then japan" ". 
" But consider the case when\c .Q japan comes on the line before\c .Q china . Just for your comparison, the multiple-pattern specifier\&“\&china||japan\&”\&is pretty much the same as the single regular expression\&“\&china.*japan|japan.*china\&”\&. If you use\&“\&|!|\&”\&instead of\&“\&||\&”\&, it will mean\&“\&...and then lines .I not matching...\&”\&. Consider a way to find all lines of .I kanjidic that do have a Halpern number, but don't have a Nelson number: .nf search [edict]> |!| .fi If you then wanted to restrict the listing to those that .I also had a\&“\&jinmeiyou\&”\&marking (\fIkanjidic\fP's\&“\&G9\&”\&field) and had a reading of あき, you could make it: .nf search [edict]> |!|||||<あき> A prepended‘+’would explain: a match is“” and not“” and“” and“<あき>” .fi The\&“\&|!|\&”\&and\&“\&||\&”\&can be used to make up to ten separate regular expressions in any one search specification. Again, it is important to stress that\&“\&||\&”\&does not mean\&“\&or\&”\&(as it does in a C program, or as\&‘\&|\&’\&does within a regular expression). You might find it convenient to read\&“\&||\&”\&as\&“\&\fIand\fP also\&”\&, while reading\&“\&|!|\&”\&as\&“\&but \fInot\fP\&”\&. It is also important to stress that any whitespace around the\c .Q || and\c .Q |!| construct is .I not ignored, but kept as part of the regex on either side. .SH COMBINATION SLOTS Each file, when loaded, is assigned to a\c .Q slot via which subsequent references to the file are then made. The slot may then be searched, have filters and flags set, etc. A special kind of slot, called a\c .Q "combination slot" , rather than representing a single file, can represent multiple previously-loaded slots. Searches against a combination slot (or\c .Q "combo slot" for short) search all those previously-loaded slots associated with it (called\c .Q "component slots" "). " Combo slots are set up with the .I combine command. 
A Combo slot has no filter or modify spec, but can have a local prompt and flags just like normal file slots. The flags, however, have special meanings with combo slots. Most combo-slot flags act as a mask against the component-slot flags; when acted upon as a member of the combo, a component-slot's flag will be disabled if the corresponding combo-slot's flag is disabled. Exceptions to this are the .IR autokana , .IR fuzz , and .I tag flags. The .I autokana and .I fuzz flags governs a combo slot exactly the same as a regular file slot. When a slot is searched as a component of a combination slot, the component slot's .I fuzz (and .IR autokana ) flags, or lack thereof, are ignored. The .I tag flag is quite different altogether; see the .I tag command for complete information. Consider the following output from the .I files command: .nf ┏━┳━━━━┯━━┳━━━┳━━━━━━━━━━━━━━ ┃ 0┃F wcfh d│a I ┃ 2762k┃/usr/jfriedl/lib/edict ┃ 1┃FM cf d│a I ┃ 705k┃/usr/jfriedl/lib/kanjidic ┃ 2┃F cfh@d│a ┃ 1k┃/usr/jfriedl/lib/local.words ┃*3┃FM cfhtd│a ┃ combo┃kotoba (#2, #0) ┗━┻━━━━┷━━┻━━━┻━━━━━━━━━━━━━━ .fi See the discussion of the .I files command below for basic explanation of the output. As can be seen, slot #3 is a .I "combination slot" with the name\c .Q kotoba with .I "component slots" two and zero. When a search is initiated on this slot, first slot #2\c .Q "local.words" will be searched, then slot #0\c .Q edict ". " Because the combo slot's .I filter flag is .IR on , the component slots' .I filter flag will remain on during the search. The combo slot's .I word flag is .IR off , however, so slot #0's .I word flag will be forced off during the search. See the .I combine command for information about creating combo slots. .SH PAGER .I Lookup has a built in pager (a'la \fImore\fP). Upon filling a screen with text, the string .nf --MORE [space,return,c,q]-- .fi is shown. A space will allow another screen of text; a return will allow one more line. 
A\&‘\&c\&’\& will allow output text to continue unpaged until the next command. A\&‘\&q\&’\& will flush output of the current command. If supported by the OS, .I lookup's idea of the screen size is automatically set upon startup and window resize. .I Lookup must know the width of the screen in doing both the horizontal input-line scrolling, and for knowing when a long line wraps on the screen. The pager parameters can be set manually with the\c .Q pager command. .SH COMMANDS Any line intended to be a command must begin with the command-introduction character (the default is a space, but can be set via the\&“\&cmdchar\&”\&command). However, that character is not part of the command itself and won't be shown in the following list of commands. There are a number of commands that work with the .I "selected file" or .I "selected slot" (both meaning the same thing). The selected file is the one indicated by an appended comma+digit, as mentioned above. If no such indication is given, the default .I "selected file" is used (usually the first file loaded, but can be changed with the\&“\&select\&”\&command). Some commands accept a .I boolean argument, such as to turn a flag on or off. In all such cases, a\&“\&1\&”\&or\&“\&on\&”\&means to turn the flag on, while a\&“\&0\&”\&or\&“\&off\&”\&is used to turn it off. Some flags are per-file (\&“\&fuzz\&”\&,\&“\&fold\&”\&, etc.), and a command to set such a flag normally sets the flag for the selected file only. However, the default value inherited by subsequently loaded files can be set by prepending\c .Q default to the command. This is particularly useful in the startup file before any files are loaded (see the section STARTUP FILE). Items separated by\&‘\&|\&’\&are mutually exclusive possibilities (i.e. a boolean argument is\&“\&1|on|0|off\&”\&). Items shown in brackets (\&‘[’\&and\&‘\&]\&’\&) are optional. 
All commands that accept a boolean argument to set a flag or mode do so optionally -- with no argument the command will report the current status of the mode or flag. Any command that allows an argument in quotes (such as load, etc.) allows the use of single or double quotes. .PP The commands: .br .so c_autokana.so .so c_clear.so .so c_cmdchar.so .so c_combine.so .so c_cmd_debug.so .so c_debug.so .so c_describe.so .so c_encoding.so .so c_files.so .so c_filter.so .so c_fold.so .so c_fuzz.so .so c_help.so .so c_highlight.so .so c_if.so .so c_in_code.so .so c_limit.so .so c_log.so .so c_load.so .so c_modify.so .so c_msg.so .so c_out_code.so .so c_pager.so .so c_prompt.so .so c_rdebug.so .so c_list_size.so .so c_select.so .so c_show.so .so c_source.so .so c_spinner.so .so c_stats.so .so c_tag.so .so c_verbose.so .so c_version.so .so c_wild.so .so c_word.so .so c_quit.so .SH STARTUP FILE If the file\c .Q ~/.lookup is present, commands are read from it during .I lookup startup. The file is read in the same way as the .I source command reads files (see that entry for more information on file format, etc.). However, if there had been files loaded via command-line arguments, commands within the startup file to load files (and their associated commands such as to set per-file flags) are ignored. Similarly, any use of the command-line flags -euc, -jis, or -sjis will disable in the startup file the commands dealing with setting the input and/or output encodings. The special treatment mentioned in the above two paragraphs only applies to commands within the startup file itself, and does not apply to commands in command-files that might be .IR source d from within the startup file.
The following is a reasonable example of a startup file: .nf ## turn verbose mode off during startup file processing verbose off prompt "%C([%#]%0)%!C(%w'*'%!f'raw '%n)> " spinner 200 pager on ## The filter for edict will hit for entries that ## have only one English part, and that English part ## having a pl or pn designation. load ~/lib/edict filter "name" #^[^/]+/[^/]*[^/]*/$# highlight on word on ## The filter for kanjidic will hit for entries without a ## frequency-of-use number. The modify spec will remove ## fields with the named initial code (U,N,Q,M,E, and Y) load ~/lib/kanjidic filter "uncommon" !// modify /( [UNQMEY]\S+)+//g ## Use the same filter for my local word file, ## but turn off by default. load ~/lib/local.words filter "name" #^[^/]+/[^/]*[^/]*/$# filter off highlight on word on ## Want a tag for my local words, but only when ## accessed via the combo below tag off "》" combine "words" 2 0 select words ## turn verbosity back on for interactive use. verbose on .fi .SH "COMMAND-LINE ARGUMENTS" With the use of a startup file, command-line arguments are rarely needed. In practical use, they are only needed to create an index file, as in: .nf lookup -write \fItextfile\fP .fi Any command line arguments that aren't flags are taken to be files which are loaded in turn during startup. In this case, any\&“\&load\&”\&,\&“\&filter\&”\&, etc. commands in the startup file are ignored. The following flags are supported: .TP \-help\ \ \ Reports a short help message and exits. .TP \-write\ \ \ Creates index files for the named files and exits. No .I "startup file" is read. .TP \-euc\ \ \ Sets the input and output encoding method to EUC (currently the default). Exactly the same as the\&“\&encoding euc\&”\&command. .TP \-jis\ \ \ Sets the input and output encoding method to JIS. Exactly the same as the\&“\&encoding jis\&”\&command. .TP \-sjis\ \ \ Sets the input and output encoding method to Shift-JIS. Exactly the same as the\&“\&encoding sjis\&”\&command. 
.TP \-v \-version Prints the version string and exits. .TP \-norc\ \ \ .br Indicates that the startup file should not be read. .TP \-rc \fIfile\fP The named file is used as the startup file, rather than the default\c .Q "~/.lookup" ". " It is an error for the file not to exist. .TP -percent \fInum\fP .br When an index is built, letters that appear on more than .I num percent (default 50) of the lines are elided from the index. The thought is that if a search will have to check most of the lines in a file anyway, one may as well save the large amount of space in the index file needed to represent that information, and the time/space tradeoff shifts, as the indexing of oft-occurring letters provides a diminishing return. Smaller indexes can be made by using a smaller number. .TP \-noindex .br Indicates that any files loaded via the command line should not be loaded with any precomputed index, but recalculated on the fly. .TP \-verbose .br Has metric tons of stats spewed whenever an index is created. .TP \-port ### For the (undocumented) server configuration only, tells which port to listen on. .SH OPERATING SYSTEM CONSIDERATIONS I/O primitives and behaviors vary with the operating system. On my operating system, I can\&“\&read\&”\&a file by mapping it into memory, which is a pretty much instant procedure regardless of the size of the file. When I later access that memory, the appropriate sections of the file are automatically read into memory by the operating system as needed. This results in .I lookup starting up and presenting a prompt very quickly, but causes the first few searches that need to check a lot of lines in the file to go more slowly (as lots of the file will need to be read in). However, once the bulk of the file is in, searches will go very fast. The win here is that the rather long file-load times are amortized over the first few (or few dozen, depending upon the situation) searches rather than always faced right at command startup time. 
On the other hand, on an operating system without the mapping ability, .I lookup would start up very slowly as all the files and indexes are read into memory, but would then search quickly from the beginning, all the file already having been read. To get around the slow startup, particularly when many files are loaded, .I lookup uses .I "lazy loading" if it can: a file is not actually read into memory at the time the .I load command is given. Rather, it will be read when first actually accessed. Furthermore, files are loaded while .I lookup is idle, such as when waiting for user input. See the .I files command for more information. .SH REGULAR EXPRESSIONS, A BRIEF TUTORIAL .so regex.so .SH BUGS Needs full support for half-width katakana and JIS X 0212-1990. .br Non-EUC (JIS & SJIS) items not tested well. .br Probably won't work on non-UNIX systems. .br Screen control codes (for clear and highlight commands) are hard-coded for ANSI/VT100/kterm. .SH AUTHOR Jeffrey Friedl (jfriedl@nff.ncl.omron.co.jp) .SH INFO Jim Breen's text files .I edict and .I kanjidic and their documentation can be found in\c .Q pub/nihongo on ftp.cc.monash.edu.au (130.194.1.106). Information on input and output encoding and codes can be found in Ken Lunde's .I "Understanding Japanese Information Processing" (\&日本語情報処理\&) published by O'Reilly and Associates. ISBN 1-56592-043-0. There is also a Japanese edition published by SoftBank. A program to convert files among the various encoding methods is Dr. Ken Lunde's\c .IR jconv , which can also be found on ftp.cc.monash.edu.au. .I Jconv is also useful for converting halfwidth katakana (which .I lookup doesn't yet support well) to full-width. lookup-1.08b.orig/doc/c_if.so0100644000014400001440000000210205554632617015520 0ustar nakaharastaff.TP if {\fIexpression\fP} \fIcommand...\fP If the evaluated .I expression is non-zero, the .I command will be executed. Note that {} rather than () surround the .IR expression .
.I Expression may be comprised of numbers, operators, parentheses, etc. In addition to the normal +, -, *, and /, are: .nf !\fIx\fP … yields 0 if \fIx\fP is non-zero, 1 if \fIx\fP is zero. \fIx\fP && \fIy\fP … !\fIx\fP …‘not’Yields 1 if \fIx\fP is zero, 0 if non-zero. \fIx\fP & \fIy\fP …‘and’Yields 1 if both \fIx\fP and \fIy\fP are non-zero, 0 otherwise. \fIx\fP | \fIy\fP …‘or’ Yields 1 if \fIx\fP or \fIy\fP (or both) is non-zero, 0 otherwise .fi There may also be the special tokens .IR true and .IR false which are 1 and 0 respectively. There are also .IR checked , .IR matched , .IR printed , .IR nonword , and .IR filtered which correspond to the values printed by the .I stats command. An example use might be the following kind of thing in a computer-generated script: .nf !d!expect this line if {!printed} msg Oops! couldn't find "expect this line" .fi lookup-1.08b.orig/doc/c_out_code.so0100644000014400001440000000427705535046505016734 0ustar nakaharastaff.TP output encoding [ euc | sjis | jis...] Used to set exactly what kind of encoding should be used for program output (also see the .I "input encoding" command). Used when the .I encoding command is not detailed enough for one's needs. If no argument is given, reports the current output encoding. Otherwise, arguments can usually be any reasonable dash-separated combination of: .RS 5n .TP euc\ \ \ \ Selects EUC for the output encoding. .TP sjis\ \ \ Selects Shift-JIS for the output encoding. .TP jis[78|83|90][-ascii|-roman] Selects JIS for the output encoding. If no year (78, 83, or 90) given, 78 is used. Can optionally specify that\c .Q English should be encoded as regular .I ASCII (the default when JIS selected) or as .IR "JIS-ROMAN" . .TP 212\ \ \ Indicates that JIS X0212-1990 should be supported (ignored for Shift-JIS output). .TP no212\ \ \ Indicates that JIS X0212-1990 should not be supported (default setting).
This places JIS X0212-1990 characters under the domain of .IR disp , .IR nodisp , .IR code , or .IR mark (described below). .TP hwk\ \ \ Indicates that \fIh\fPalf \fIw\fPidth \fIk\fPana should be left as-is (default setting). .TP nohwk\ \ \ Indicates that \fIh\fPalf \fIw\fPidth \fIk\fPana should be stripped from the output. .I "(not yet implemented)." .TP foldhwk\ \ \ Indicates that \fIh\fPalf \fIw\fPidth \fIk\fPana should be folded to their full-width counterparts. .I "(not yet implemented)." .TP disp\ \ \ Indicates that .I non-displayable characters (such as JIS X0212-1990 while the output encoding method is Shift-JIS) should be passed along anyway (most likely resulting in screen garbage). .TP nodisp\ \ \ Indicates that .I non-displayable characters should be quietly stripped from the output. .TP code\ \ \ Indicates that .I non-displayable characters should be printed as their octal codes (default setting). .TP mark\ \ \ Indicates that .I non-displayable characters should be printed as\c .Q ★ ". " .PP Of course, not all options make sense in all combinations, or at all times. When the current (or new) output encoding is reported, a complete and exact specifier representing the output encoding selected. An example might be\c .Q jis78-ascii-no212-hwk-code ". " .RS -5n lookup-1.08b.orig/doc/c_limit.so0100644000014400001440000000032105555424766016246 0ustar nakaharastaff.TP limit [\fIvalue\fP] .br Sets the number of lines to print during any search before aborting (or reports the current number if no value given). Default is 100. Output limiting is disabled if set to zero. lookup-1.08b.orig/doc/c_load.so0100644000014400001440000000170706076501201016033 0ustar nakaharastaff.TP load [-now|-whenneeded] "\fIfilename\fP" .br Loads the named file to the next available slot. If a precomputed index is found (as\c .Q "\fIfilename\fP.jin" ) it is loaded as well. Otherwise, an index is generated internally. 
The file to be loaded (and the index, if loaded) will be loaded during idle times. This allows a startup file to list many files to be loaded, but not have to wait for each of them to load in turn. Using the .Q "-now" flag causes the load to happen immediately, while using the .Q "-whenneeded" option (can be shortened to .Q "-wn" ) causes the load to happen only when the slot is first accessed. Invoke .I lookup as .nf % lookup -writeindex \fIfilename\fP .fi to generate and write an index file, which will then be automatically used in the future. If the file has already been loaded, the file is not re-read, but the previously-read file is shared. The new slot will, however, have its own separate flags, prompt, filter, etc. lookup-1.08b.orig/doc/c_log.so0100644000014400001440000000072605535566727015724 0ustar nakaharastaff.TP log [ to [+] \fIfile\fP ] .br Begins logging the program output to .I file (the Japanese encoding method being the same as for screen output). If\c .Q + is given, the log is appended to any text that might have previously been in .IR file , in which case a leading dashed line is inserted into the file. If no arguments are given, reports the current logging status. .TP log - | off If only\c .Q - or .I off is given, any currently-opened log file is closed. .TP lookup-1.08b.orig/doc/c_modify.so0100644000014400001440000000407505535037416016417 0ustar nakaharastaff.TP modify /\fIregex\fP/\fIreplace\fP/[ig] .br Sets the .I modify parameter for the .IR "selected file" . If a file has a modify parameter associated with it, each line selected during a search will have that part of the line which matches .I regex (if any) replaced by the .I replacement string before being printed. Like the .I filter command, the delimiter need not be\&‘\&/\&’\&; any non-space character is fine. If a final\&‘\&i\&’\&is given, the regex is applied in a case-insensitive manner. 
If a final\&‘\&g\&’\&is given, the replacement is done to all matches in the line, not just the first part that might match .IR regex . The .I replacement may have embedded\c .Q "\\1" ", " etc. in it to refer to parts of the matched text (see the tutorial on regular expressions). The modify parameter, once set, may be enabled or disabled with the other form of the modify command (described below). It may also be temporarily toggled via the\c .Q !m! line prefix. A silly example for the ultra-nationalist might be: .nf modify //Dainippon Teikoku/g .fi So that a line such as .nf 日銀 [にちぎん] /Bank of Japan/ would come out as 日銀 [にちぎん] /Bank of Dainippon Teikoku/ .fi As a real example of the modify command with .IR kanjidic , consider that it is likely that one is not interested in all the various fields each entry has. The following can be used to remove the info on the U, N, Q, M, E, B, C, and Y fields from the output: .nf modify /( [UNQMECBY]\\S+)+//g,1 .fi It's sort of complex, but works. Note that here the .I replacement part is empty, meaning to just remove those parts which matched. The result of such a search of 日 would normally print .nf 日 467c U65e5 N2097 B72 B73 S4 G1 H3027 F1 Q6010.0 MP5.0714 \ MN13733 E62 Yri4 P3-3-1 ニチ ジツ ひ -び -か {day} .fi but with the above modify spec, appears more simply as .nf 日 467c S4 G1 H3027 F1 P3-3-1 ニチ ジツ ひ -び -か {day} .fi .TP modify [\fIboolean\fP] .br Enables or disables the modify parameter for the .IR "selected file" , or report the current status if no argument is given. lookup-1.08b.orig/doc/c_msg.so0100644000014400001440000000017605554631611015712 0ustar nakaharastaff.TP msg \fIstring\fP The given .I string is printed. Most likely used in a script as the target command of an .I if command. lookup-1.08b.orig/doc/c_spinner.so0100644000014400001440000000044105523140331016563 0ustar nakaharastaff.TP spinner [\fIvalue\fP] Set the value of the spinner (A silly little feature). 
If set to a non-zero value, will cause a spinner to spin while a file is being checked, one increment per .I value lines in the file actually checked against the search specifier. Default is off (i.e. zero). lookup-1.08b.orig/doc/c_pager.so0100644000014400001440000000207405534351252016217 0ustar nakaharastaff.TP pager [ \fIboolean\fP | \fIsize\fP ] Turns on or off an output pager, sets its idea of the screen size, or reports the current status. .I Size can be a single number indicating the number of lines to be printed between\c .Q MORE? prompts (usually a few lines less than the total screen height, the default being 20 lines). It can also be two numbers in the form\c .Q #x# where the first number is the width (in half-width characters; default 80) and the second is the lines-per-page as above. If the pager is on, every page of output will result in a\c .Q MORE? prompt, at which there are four possible responses. A space will allow one more full page to print. A return will allow one more line. A\&‘\&c\&’\&(for\c .Q continue ") " will allow all the rest of the output (for the current command) to proceed without pause, while a\&‘\&q\&’\&(for\c .Q quit ") " will flush the output for the current command. If supported by the OS, the pager size parameters are set appropriately from the window size upon startup or window resize. The default pager status is\c .Q "off" ". " lookup-1.08b.orig/doc/c_prompt.so0100644000014400001440000000535605755560033016454 0ustar nakaharastaff.TP [local] prompt "\fIstring\fP" Sets the prompt string. If\c .Q local is indicated, sets the prompt string for the .I "selected slot" only. Otherwise, sets the global default prompt string. Prompt strings may have the special %-sequences shown below, with related commands given in parentheses: .nf %N … the \fIdefault slot\fP's file or combo name. %n … like %N, but any leading path is not shown if a filename. %# … the \fIdefault slot\fP's number.
%S … the\&“\&command-introduction\&”\&character (cmdchar) %0 … the running program's name %F='\fIstring\fP' … \fIstring\fP shown if filtering enabled (filter) %M='\fIstring\fP' … \fIstring\fP shown if modification enabled (modify) %w='\fIstring\fP' … \fIstring\fP shown if word mode on (word) %c='\fIstring\fP' … \fIstring\fP shown if case folding on (fold) %f='\fIstring\fP' … \fIstring\fP shown if fuzzification on (fuzz). %W='\fIstring\fP' … \fIstring\fP shown if wildcard-pat. mode on (wildcard). %d='\fIstring\fP' … \fIstring\fP shown if displaying on (display). %C='\fIstring\fP' … \fIstring\fP shown if currently entering a command. %l='\fIstring\fP' … \fIstring\fP shown if logging is on (log). %L … the name of the current output log, if any (log) .fi For the tests (%f, etc), you can put\&‘\&!\&’\&just after the\&‘\&%\&’\&to reverse the sense of the test (i.e. %!f="no fuzz"). The reverse of %F is if a filter is installed but disabled (i.e. .I string will never be shown if there is no filter for the default file). The modify %M works comparably. Also, you can use an alternative form for the items that take an argument string. Replacing the quotes with parentheses will treat .I string as a recursive prompt specifier. For example, the specifier .nf %C='command'%!C(%f='fuzzy 'search:) .fi would result in a\c .Q command prompt if entering a command, while it would result in either a\c .Q "fuzzy search:" or a\c .Q search: prompt if not entering a command. The parenthesized constructs may be nested. Note that the letters of the test constructs are the same as the letters for the\c .Q !! sequences described in INPUT SYNTAX. An example of a nice prompt command might be: .nf prompt "%C(%0 command)%!C(%w'*'%!f'raw '%n)> " .fi With this prompt specification, the prompt would normally appear as\c .Q "\fIfilename\fP>\ " but when fuzzification is turned off as\c .Q "raw \fIfilename\fP>\ " ". " And if word-preference mode is on, the whole thing has a\c .Q * prepended. 
However if a command is being entered, the prompt would then become\c .Q "\fIname\fP command" ", " where .I name was the program's name (system dependent, but most likely\c .Q lookup "). " The default prompt format string is\c .Q "%C(%0 command)%!C(search [%n])> " ". " lookup-1.08b.orig/doc/c_quit.so0100644000014400001440000000006605534100330016067 0ustar nakaharastaff.TP quit | leave | bye | exit .br Exits the program. lookup-1.08b.orig/doc/c_rdebug.so0100644000014400001440000000022305534077466016377 0ustar nakaharastaff.TP regex debug [\fIboolean\fP] .br Sets the internal regex debugging flag (turn on if you want billions of lines of stuff spewed to your screen). lookup-1.08b.orig/doc/c_select.so0100644000014400001440000000205205554741515016403 0ustar nakaharastaff.TP select [ \fInum\fP | \fIname\fP | . ] .br If .I num is given, sets the .I "default slot" to that slot number. If .I name is given, sets the .I "default slot" to the first slot found with a file (or combination) loaded with that name. The incantation\c .Q "select ." merely sets the default slot to itself, which can be useful in script files where you want to indicate that any subsequent flags changes should work with whatever file was the default at the time the script was .IR source d. If no argument is given, simply reports the current .I "default slot" (also see the .I files command). In command files loaded via the .I source command, or as the startup file, commands dealing with per-slot items (flags, local prompt, filters, etc.) work with the file or slot last .IR select ed. The last such selected slot remains selected once the load is complete. Interactively, the default slot will become the .I "selected slot" for subsequent searches and commands that aren't augmented with an appended\c .Q ",#" (as described in the INPUT SYNTAX section). 
lookup-1.08b.orig/doc/c_show.so0100644000014400001440000000067205554653014016106 0ustar nakaharastaff.TP show .br Shows any lines elided from the previous search (either due to a .I filter or .IR "word-preference mode" ). Will apply any modifications (see the\c .Q modify command) if modifications are enabled for the file. You can use the\c .Q !m! line prefix as well with this command (in this case, put the\c .Q !m! .I before the command-indicator character). The length of the list is controlled by the\c .Q "saved list size" command. lookup-1.08b.orig/doc/c_encoding.so0100644000014400001440000000053705535562646016725 0ustar nakaharastaff.TP encoding [euc|sjis|jis] The same as the -euc, -jis, and -sjis command-line options, sets the encoding method for interactive input and output (or reports the current status). More detail over the output encoding can be achieved with the .I "output encoding" command. A separate encoding for input can be set with the .I "input encoding" command. lookup-1.08b.orig/doc/c_source.so0100644000014400001440000000237705554741770016441 0ustar nakaharastaff.TP source "\fIfilename\fP" .br Commands are read from .IR filename and executed. In the file, all lines beginning with\c .Q # are ignored as comments (note that comments must appear on a line by themselves, as\c .Q # is a reasonable character to have within commands). Lines whose first non-blank characters is\c .Q = ",\c" .Q ! , or\c .Q + are considered searches, while all other non-blank lines are considered .I lookup commands. Therefore, there is no need for lines to begin with the command-introduction character. However, leading whitespace is always OK. For search lines, take care that any trailing whitespace is deleted if undesired, as trailing whitespace (like all non-leading whitespace) is kept as part of the regular expression. Within a command file, commands that modify per-file flags and such always work with the most-recently loaded (or selected) file. 
Therefore, something along the lines of .nf load "my.word.list" set word on load "my.kanji.list" set word off set local prompt "enter kanji> " .fi would work as might make intuitive sense. Since a script file must have a .IR load , or .IR select before any per-slot flag is set, one can use\c .Q "select ." to facilitate command scripts that are to work with\c .Q "the current slot" . lookup-1.08b.orig/doc/c_autokana.so0100644000014400001440000000056505554652541016736 0ustar nakaharastaff.TP [default] autokana [\fIboolean\fP] .br Automatic romaji → kana conversion for the .I "selected file" is turned on or off (default is on). However, if\c .Q default is specified, the value to be inherited as the default by subsequently-loaded files is set (or reported). Can be temporarily disabled by a prepended\&‘\&=\&’\&, as described in the INPUT SYNTAX section. lookup-1.08b.orig/doc/c_stats.so0100644000014400001440000000023705534077504016263 0ustar nakaharastaff.TP stats Shows information about how many lines of the text file were checked against the last search specifier, and how many lines matched and were printed. lookup-1.08b.orig/doc/c_tag.so0100644000014400001440000000327505554653105015704 0ustar nakaharastaff.TP tag [\fIboolean\fP] ["\fIstring\fP"] Enable, disable, or set the tag for the .IR "selected slot" . If the slot is not a combination slot, a tag .I string may be set (the quotes are required). If a tag string is set and enabled for a file, the string is prepended to each matching output line printed. Unlike the .I filter and .I modify commands which automatically enable the function when a parameter is set, a .I tag is not automatically enabled when set. It can be enabled while being set via\c .Q 'tag on \"text:\"' or could be enabled subsequently via just\c .Q "tag on" If the selected slot is a combination slot, only the enable/disable status may be changed (on by default). No tag string may be set.
The reason for the special treatment lies in the special nature of how tags work in conjunction with combination files. During a search when the selected slot is a combination slot, each file which is a member of the combination has its per-file flags disabled if their corresponding flag is disabled in the original combination slot. This allows the combination slot's flags to act as a\c .Q mask to blot out each component file's per-file flags. The tag flag, however, is special in that the component file's tag flag is turned .I on if the combination slot's tag flag is turned on (and, of course, the component file has a tag string registered). The intended use of this is that one might set a (disabled) tag to a file, yet .I direct searches against that file will have no prepended tag. However, if the file is searched as part of a combination slot (and the combination slot's tag flag is on), the tag .I will be prepended, allowing one to easily understand from which file an output line comes. lookup-1.08b.orig/doc/c_verbose.so0100644000014400001440000000025705523140331016557 0ustar nakaharastaff.TP verbose [\fIboolean\fP] .br Sets verbose mode on or off, or reports the current status (default on). Many commands reply with a confirmation if verbose mode is turned on. lookup-1.08b.orig/doc/c_version.so0100644000014400001440000000007005523627341016603 0ustar nakaharastaff.TP version Reports the current version of the program. lookup-1.08b.orig/doc/c_word.so0100644000014400001440000000323505554653117016103 0ustar nakaharastaff.TP [default] word|wordpreference [\fIboolean\fP] The selected file's word-preference mode is turned on or off (default is off), or reports the current setting if no argument is specified. However, if\c .Q default is specified, the value to be inherited as the default by subsequently-loaded files is set (or reported). 
In word-preference mode, entries are searched for .I "as if" the search regex had a leading\&‘\&<\&’\&and a trailing\&‘\&>\&’\&, resulting in a list of entries with a whole-word match of the regex. However, if there are none, but there .I are non-word entries, the non-word entries are shown (the\c .Q "saved list" is used for this -- see that command). This makes it an\&“\&if there are whole words like this, show me, otherwise show me whatever you've got\&”\&mode. If there are both word and non-word entries, the non-word entries are remembered in the saved list (rather than any possible filtered entries being remembered there). One caveat: if a search matches a line in more than one place, and the first is .I not a whole-word, while one of the others .IR is , the line will be considered non-whole-word. For example, the search\&「\&japan\&」\&with word-preference mode on will not list an entry such as\c .Q "/Japanese/language in Japan/" \&, as the first\c .Q "Japan" is part of\c .Q "Japanese" and not a whole word. If you really need just whole-word entries, use the\&‘\&<\&’\&and\&‘\&>\&’\&yourself. The mode may be temporarily toggled via the\c .Q "!w!" line prefix. The rules defining what lines are filtered, remembered, discarded, and shown for each permutation of search are rather complex, but the end result is rather intuitive. lookup-1.08b.orig/doc/c_cmd_debug.so0100644000014400001440000000016205523140327017023 0ustar nakaharastaff.TP command debug [\fIboolean\fP] .br Sets the internal command parser debugging flag on or off (default is off). lookup-1.08b.orig/doc/c_list_size.so0100644000014400001440000000051305535046732017127 0ustar nakaharastaff.TP saved list size [\fIvalue\fP] .br During a search, lines that match might be elided from the output due to filters or word-preference mode. This command sets the number of such lines to remember during any one search, such that they may be later displayed (before the next search) by the .I show command.
The default is 100. lookup-1.08b.orig/doc/c_in_code.so0100644000014400001440000000035505535562531016526 0ustar nakaharastaff.TP input encoding [ euc | sjis ] Used to set (or report) what encoding to use when 8-bit bytes are found in the interactive input (all flavors of JIS are always recognized). Also see the .I "encoding" and .I "output encoding" commands. lookup-1.08b.orig/doc/c_wild.so0100644000014400001440000000255405756251762016076 0ustar nakaharastaff.TP [default] wildcard [\fIboolean\fP] .br The .IR "selected slot" "'s" patterns are considered wildcard patterns if turned on, regular expressions if turned off. The current status is reported if no argument is given. However, if\c .Q default is specified, the pattern-type to be inherited as the default by subsequently-loaded files is set (or reported). Can be temporarily toggled by the\c .Q "!W!" line prefix. When wildcard patterns are selected, the changed metacharacters are:\c .Q "*" means\c .Q "any stuff" , .Q "?" means\c .Q "any one character" , while\c .Q "+" and\c .Q "." become unspecial. Other regex items such as\c .Q "|" , .Q "(" , .Q "[" , etc. are unchanged. What\c .Q "*" and\c .Q "?" will actually match depends upon the status of word-mode, as well as on the pattern itself. If word-mode is on, or if the pattern begins with the start-of-word\c .Q "<" or\c .Q "[" , only non-spaces will be matched. Otherwise, any character will be matched. In summary, when wildcard mode is on, the input pattern is affected in the following ways: .nf * is changed to the regular expression .* or \S* ? is changed to the regular expression . or \S + is changed to the regular expression \+ . is changed to the regular expression \. .fi Because filename patterns are often called\c .Q "filename globs" , the command\c .Q glob can be used in place of\c .Q wildcard .
lookup-1.08b.orig/doc/c_cmdchar.so0100644000014400001440000000102705534333663016525 0ustar nakaharastaff.TP cmdchar ['\fIone-byte-char\fP'] The default command-introduction character is a space, but it may be changed via this command. The single quotes surrounding the character are required. If no argument is given, the current value is printed. An input line consisting of a single question mark will also print the current value (useful for when you don't know the current value). Woe to the one that sets the command-introduction character to one of the other special input-line characters, such as\&‘\&+\&’\&,\&‘\&/\&’\&, etc. lookup-1.08b.orig/README0100600000014400001440000000532605555670102014363 0ustar nakaharastaff Jeffrey Friedl Omron Corp. Nagaokakyo, Japan April 22, 1994 LOOKUP provides a way to quickly and powerfully search text files. The author's prime use is to search "edict" (a Japanese-English word list), "kanjidic" (a database about Japanese characters), and "/usr/dict/words" (list of English words). However, one could easily be used to search for variables in huge programs, or most any other application of searching line-based text. From the manual page: Romaji-to-Kana Converter Lookup can convert romaji to kana for you, even "on the fly" as you type. Fuzzy Searching Searches can be a bit "vague" or "fuzzy" , so that you'll be able to find the Japanese word for Tokyo even if you try to search for "to kyo" (the proper Japanese "spelling" being "to u kyo u") Regular Expressions Uses the powerful and expressive regular expression for searching. One can easily specify complex searches that affect "I want lines that look like such-and-such, but not like this-and-that, but that also have this particular characteristic...." Filters You can have lookup not list certain lines that would otherwise match your search, yet can optionally save them for quick review. For example, you could have all nameonly entries from edict filtered from normal output. 
Automatic Modifications Similarly, you can do a standard search-and-replace on lines just before they print, perhaps to remove information you don't care to see on most searches. For example, if you're generally not interested in kanjidic's info on Chinese readings, you can have them removed from lines before printing. Smart Word-Preference Mode You can have lookup list only entries with whole words that match your search (as opposed to an embedded match, such as finding "the" inside "them" ), but if no wholeword matches exist, will go ahead and list any entry that matches the search. Handy Features Other handy features include a dynamically settable and parameterized prompt, automatic highlighting of that part of the line that matches your search, an output pager, readline-like input with horizontal scrolling for long input lines, a ".lookup" startup file, automated programability, and much more. The program just spits out encoded Japanese. If your terminal doesn't support JIS, Shift-JIS, or EUC, you may as well stop now. See the "BUILDING" file for sections COMPILING MAN PAGE RUNNING ----------------------------------------------------------------------------- Jeffrey jfriedl@nff.ncl.omron.co.jp lookup-1.08b.orig/README.JAP0100600000014400001440000002251405555670262015001 0ustar nakaharastaff lookup“ルーキャプ”【検索】 by フリードル・ジェフリー Friedl, Jeffrey jfriedl@omron.co.jp □━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━□ ┃ ┃ ┃ ★ 目的 : テキストファイルの中の言葉を簡単に速く探すこと。★ ┃ ┃ ‥‥ ‥‥ ┃ □━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━□ ┌──────────────────────────┐ │私の日本語を理解する力が足りないのでお許して下さい。│ ├──────────────────────────┴───────┐ │Lookup の機能はとてもpowerfulですが、日本語の説明書は書きに │ │くい(ジェフリーにとって). 
英語の説明書は詳しいので、英語を読める人 │ │はそれを見てください。 │ └──────────────────────────────────┘ △ △ △ △ △ △ △ △ △ △ △ △ △ △ △ △ △ △ △ △ ▽ ▽ ▽ ▽ ▽ ▽ ▽ ▽ ▽ ▽ ▽ ▽ ▽ ▽ ▽ ▽ ▽ ▽ ▽ ━━━━━━━━━━━━━━━━━━━━━━━━┓ 応用: edict や /usr/dict/words の中の言葉を引く ┃ ━━━━━━━━━━━━━━━━━━━━━━━━┛ edict はフリー(無料)の英和・和英辞典です。 "edict" というファイルの行は全て以下のフォーマットです。 漢字 [読み方] /英語/英語/.../ とか 仮名 /英語/英語/.../ 例えば: アメリカ /America/ 京 [けい] /10,000,000,000,000,000/ten quadrillion/ 元気 [げんき] /health(y)/robust/vigor/energy/vitality/vim/stamina/ 御飯 [ごはん] /rice (cooked)/meal/ 車 [くるま] /car/vehicle/wheel/ 日本語 [にほんご] /Japanese language/ 現在は約80,000行が入っているので、なかなか便利だと思います。 ftp.cc.monash.edu.au (130.194.1.106) の pub/nihongo にある、 Jim Breen先生が提供したものです。英語の edict.doc もあります。 edict の 80,000行が約3メガバイトあるので、 普通の grep などは遅過ぎるはずです。 ━━━━━━━━━━━━━━━━━━━━━━━━┓ コンパイル ┃ ━━━━━━━━━━━━━━━━━━━━━━━━┛ gmake と gcc あれば、"gmake" だけで出来ます。 普通の make でも大丈夫ですが、lookupのMakefile を理解出来ない make も あるので、その場合には "sh make.sh" と、してみてください。 ━━━━━━━━━━━━━━━━━━━━━━━━┓ 準備 ┃ ━━━━━━━━━━━━━━━━━━━━━━━━┛ テキストファイルのインデクスを作って保存する。 % lookup -write edict "edict.jin" と言うインデクスファイルが作成される。 ━━━━━━━━━━━━━━━━━━━━━━━━┓ JISとEUCとShift-JIS に対して ┃ ━━━━━━━━━━━━━━━━━━━━━━━━┛ 探す対象のファイルの日本語の文字コードは、EUCでなければいけません。ただし、 ディスプレイの入出力はJISとEUCとShift-JISいずれも出来ます。"-jis","-sjis", "-euc"のコマンドラインアーギュメントがあります. (アーギュメントが指定されな い場合には"-euc"になります。) ━━━━━━━━━━━━━━━━━━━━━━━━┓ 基本的な使用 ┃ ━━━━━━━━━━━━━━━━━━━━━━━━┛ % lookup -jis edict ^^^^─────jisの場合 起動されれば、『search [edict]> 』のプロンプトが表示されます。 lookupのインタラクティブ入力は二つの種類があります: 第一: 検索させる命令 (正規表現 "regular expression") 第二: パラメータ等に対してのコマンド (“コマンド”) 入力行の最初の文字が半角スペース(' ')の場合には、コマンドとして解釈され ます。他の入力行は正規表現として解釈されます。 ファイルの中の言葉を検索する例: v── 注意:このスペースはプロンプトのスペースです。 search [edict]> Japan ぼけ /Japanese quince/ やくざ /Japanese mafia/Yakuza/ カルピス /Japanese milk-based soft drink/sperm/semen/cum (col)/ 活弁 [かつべん] /narrator in Japanese silent cinema/ 漢和 [かんわ] /Chinese Character-Japanese (e.g. 
dictionary)/ 弓道 [きゅうどう] /(Japanese) archery/ など 停止する命令 (コマンドの例): v── このスペースはプロンプトのスペースです。 search [edict]> quit ^──── このスペースは大切ですよ。 ━━━━━━━━━━━━━━━━━━━━━━━━┓ ローマ字 → 仮名 ┃ ━━━━━━━━━━━━━━━━━━━━━━━━┛ 入力行の最初の文字が‘/’ならば、それに続くローマ字は仮名に変換され ます。他のところで仮名に変換したい場合には、ローマ字で入力した後 ^space (コントロールスペース)を入力することで、ローマ字は仮名に変換されます。 変換する時に、小文字はひらがなに、大文字はカタカナになります。 ━━━━━━━━━━━━━━━━━━━━━━━━┓ 正規表現 ┃ ━━━━━━━━━━━━━━━━━━━━━━━━┛ grep や nemacs や mifes や perlで使われている正規表現が扱えます。正規表現の文 法の内容はプログラムに依存していますが、lookup の場合を以下の表に示します。 記号 意味 ━━━━━━━━━━━━━━━━━━━━━━━ . いずれかの一文字に一致 […] []の中の文字だけに一致 ‐‐‐‐‐‐‐‐‐‐‐‐‐‐“character class” [^…] []の中の文字以外だけに一致 \d いずれかの数字に一致 (“[0123456789]”と同じ)‐‐‐‐‐‐‐‐“digit” \D \d の逆 (“[0123456789]”以外に一致、“[^0123456789]”と同じ) \w 言葉のローマ字に一致 (“[0-9a-zA-Z_]”と同じ) ‐‐‐“word element” \W \w の逆 \s スペースかタブに一致 (“[ \t]”と同じ) ‐‐‐‐‐‐‐‐‐‐‐“space” \S \s の逆 \a 半角文字に一致 ‐‐‐‐‐‐‐‐‐‐‐‐‐‐‐‐‐‐“ascii character” \A 全角文字に一致 \k 全角カタカナ文字に一致‐‐‐‐‐‐‐‐‐‐‐‐‐“katakana character” \K 全角カタカナ文字以外に一致 \h 全角ひらがな文字に一致‐‐‐‐‐‐‐‐‐‐‐‐‐“hiragana character” \H 全角ひらがな文字以外に一致 \c 漢字だけに一致 ‐‐‐‐‐‐‐‐‐“chineese character”(国字も含む :-) \C 漢字以外に一致 □? 
□に一致する場合もない場合も一致‐‐‐‐‐‐‐‐‐‐‐‐ “maybe one” □+ □に一致する必要があるが、何回でも一致‐‐‐‐‐‐‐‐“at least one” □* □に一致する必要がないけれど、何回でも(なしでも)一致‐‐“any number” (…) 中の記号が一つのグループにする。 ‐‐‐‐‐‐‐‐‐‐‐‐‐‐“group” < 言葉の始まりに一致 ‐‐‐‐‐‐‐‐‐‐‐‐‐‐‐‐‐“start of word” > 言葉の終りに一致‐‐‐‐‐‐‐‐‐‐‐‐‐‐‐‐‐‐‐ “end of word” □|○ □かあるいは○に一致‐‐‐‐‐‐‐‐‐‐‐‐‐‐‐‐‐‐‐‐‐‐“or” ^ 行の始まりに一致‐‐‐‐‐‐‐‐‐‐‐‐‐‐‐‐‐‐ “start of line” $ 行の終りに一致 ‐‐‐‐‐‐‐‐‐‐‐‐‐‐‐‐‐‐‐‐“end of line” 正規表現の例: “Japan”のある行に一致するが、 その“Japan”の文字列が含まれる言葉の場合には、一致しません。 ^日本 “日本”が行頭にある行に一致する。 \$7$$$N$G!"1Q8l$rFI$a$k?MH K("H K("$O$=$l$r8+$F$/$@$5$$!#H K("H K(&(!(!(!(!(!(!(!(!(!(!(!(!(!(!(!(!(!(!(!(!(!(!(!(!(!(!(!(!(!(!(!(!(!(!(%H K"$H K"$H K"$H K"$H K"$H K"$H K"$H K"$H K"$H K"$H K"$H K"$H K"$H K"$H K"$H K"$H K"$H K"$H K"$H K"$H K"&H K"&H K"&H K"&H K"&H K"&H K"&H K"&H K"&H K"&H K"&H K"&H K"&H K"&H K"&H K"&H K"&H K"&H K"&H K(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(/H K1~MQH: edict K$dH /usr/dict/words K$NCf$N8@MU$r0z$/H K(-H K(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(0H edict K$O%U%j!<H(KL5NAH)K$N1QOB!&OB1Q<-E5$G$9!#H "edict" K$H$$$&%U%!%$%k$N9T$OA4$F0J2<$N%U%)!<%^%C%H$G$9!#H K4A;zH [KFI$_J}H] /K1Q8lH/K1Q8lH/.../ K$H$+H K2>L>H /K1Q8lH/K1Q8lH/.../ KNc$($PH: K%"%a%j%+H /America/ K5~H [K$1$$H] /10,000,000,000,000,000/ten quadrillion/ K855$H [K$2$s$-H] /health(y)/robust/vigor/energy/vitality/vim/stamina/ K8fHSH [K$4$O$sH] /rice (cooked)/meal/ KfIW$G$9$,!"HlookupK$NHMakefile K$rM}2r=PMh$J$$H make K$bH K$"$k$N$G!"$=$N>l9g$K$OH "sh make.sh" K$H!"$7$F$_$F$/$@$5$$!#H K(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(/H K=`HwH K(-H K(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(0H K%F%-%9%H%U%!%$%k$N%$%s%G%/%9$r:n$C$FJ]B8$9$k!#H % lookup -write edict "edict.jin" K$H8@$&%$%s%G%/%9%U%!%$%k$,:n@.$5$l$k!#H K(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(/H JISK$HHEUCK$HHShift-JIS K$KBP$7$FH K(-H K(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(0H KC5$9BP>]$N%U%!%$%k$NF|K\8l$NJ8;z%3!<%I$O!"HEUCK$G$J$1$l$P$$$1$^$;$s!#$?$@$7!"H 
K%G%#%9%W%l%$$NF~=PNO$OHJISK$HHEUCK$HHShift-JISK$$$:$l$b=PMh$^$9!#H"-jis","-sjis", "-euc"K$N%3%^%s%I%i%$%s%"!<%.%e%a%s%H$,$"$j$^$9H. (K%"!<%.%e%a%s%H$,;XDj$5$l$JH K$$>l9g$K$OH"-euc"K$K$J$j$^$9!#H) K(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(/H K4pK\E*$J;HMQH K(-H K(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(0H % lookup -jis edict ^^^^K(!(!(!(!(!HjisK$N>l9gH K5/F0$5$l$l$P!"!XHsearch [edict]> K!Y$N%W%m%s%W%H$,I=<($5$l$^$9!#H lookupK$N%$%s%?%i%/%F%#%VF~NO$OFs$D$N3Q%9%Z!<%9H(' ')K$N>l9g$K$O!"%3%^%s%I$H$7$F2r$NF~NO9T$O@55,I=8=$H$7$F2r Japan K$\$1H /Japanese quince/ K$d$/$6H /Japanese mafia/Yakuza/ K%+%k%T%9H /Japanese milk-based soft drink/sperm/semen/cum (col)/ K3hJ[H [K$+$D$Y$sH] /narrator in Japanese silent cinema/ K4AOBH [K$+$s$oH] /Chinese Character-Japanese (e.g. dictionary)/ K5]F;H [K$-$e$&$I$&H] /(Japanese) archery/ K$J$IH KDd;_$9$kL?NaH (K%3%^%s%I$NNcH)K!'H vK(!(!H K$3$N%9%Z!<%9$O%W%m%s%W%H$N%9%Z!<%9$G$9!#H search [edict]> quit ^K(!(!(!(!H K$3$N%9%Z!<%9$OBg@Z$G$9$h!#H K(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(/H K%m!<%^;zH K"*H K2>L>H K(-H K(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(0H KF~NO9T$N:G=i$NJ8;z$,!FH/K!G$J$i$P!"$=$l$KB3$/%m!<%^;z$O2>L>$KJQ49$5$lH K$^$9!#B>$N$H$3$m$G2>L>$KJQ49$7$?$$>l9g$K$O!"%m!<%^;z$GF~NO$7$?8eH ^space (K%3%s%H%m!<%k%9%Z!<%9H)K$rF~NO$9$k$3$H$G!"%m!<%^;z$O2>L>$KJQ49$5$l$^$9!#H KJQ49$9$k;~$K!">.J8;z$O$R$i$,$J$K!"BgJ8;z$O%+%?%+%J$K$J$j$^$9!#H K(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(/H K@55,I=8=H K(-H K(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(0H grep K$dH nemacs K$dH mifes K$dH perlK$G;H$o$l$F$$$k@55,I=8=$,07$($^$9!#@55,I=8=$NJ8H KK!$NFbMF$O%W%m%0%i%`$K0MB8$7$F$$$^$9$,!"Hlookup K$N>l9g$r0J2<$NI=$K<($7$^$9!#H K5-9fH K0UL#H K(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,H . 
K$$$:$l$+$N0lJ8;z$K0lCWH [K!DH] []K$NCf$NJ8;z$@$1$K0lCWH K!>!>!>!>!>!>!>!>!>!>!>!>!>!>!HHcharacter classK!IH [^K!DH] []K$NCf$NJ8;z0J30$@$1$K0lCWH \d K$$$:$l$+$N?t;z$K0lCWH (K!HH[0123456789]K!I$HF1$8H)K!>!>!>!>!>!>!>!>!HHdigitK!IH \D \d K$N5UH (K!HH[0123456789]K!I0J30$K0lCW!"!HH[^0123456789]K!I$HF1$8H) \w K8@MU$N%m!<%^;z$K0lCWH (K!HH[0-9a-zA-Z_]K!I$HF1$8H) K!>!>!>!HHword elementK!IH \W \w K$N5UH \s K%9%Z!<%9$+%?%V$K0lCWH (K!HH[ \t]K!I$HF1$8H) K!>!>!>!>!>!>!>!>!>!>!>!HHspaceK!IH \S \s K$N5UH \a KH>3QJ8;z$K0lCWH K!>!>!>!>!>!>!>!>!>!>!>!>!>!>!>!>!>!>!HHascii characterK!IH \A KA43QJ8;z$K0lCWH \k KA43Q%+%?%+%JJ8;z$K0lCW!>!>!>!>!>!>!>!>!>!>!>!>!>!HHkatakana characterK!IH \K KA43Q%+%?%+%JJ8;z0J30$K0lCWH \h KA43Q$R$i$,$JJ8;z$K0lCW!>!>!>!>!>!>!>!>!>!>!>!>!>!HHhiragana characterK!IH \H KA43Q$R$i$,$JJ8;z0J30$K0lCWH \c K4A;z$@$1$K0lCWH K!>!>!>!>!>!>!>!>!>!HHchineese characterK!IH(K9q;z$b4^$`H :-) \C K4A;z0J30$K0lCWH K""H? K""$K0lCW$9$k>l9g$b$J$$>l9g$b0lCW!>!>!>!>!>!>!>!>!>!>!>!>H K!HHmaybe oneK!IH K""H+ K""$K0lCW$9$kI,MW$,$"$k$,!"2?2s$G$b0lCW!>!>!>!>!>!>!>!>!HHat least oneK!IH K""H* K""$K0lCW$9$kI,MW$,$J$$$1$l$I!"2?2s$G$bH(K$J$7$G$bH)K0lCW!>!>!HHany numberK!IH (K!DH) KCf$N5-9f$,0l$D$N%0%k!<%W$K$9$k!#H K!>!>!>!>!>!>!>!>!>!>!>!>!>!>!HHgroupK!IH < K8@MU$N;O$^$j$K0lCWH K!>!>!>!>!>!>!>!>!>!>!>!>!>!>!>!>!>!HHstart of wordK!IH > K8@MU$N=*$j$K0lCW!>!>!>!>!>!>!>!>!>!>!>!>!>!>!>!>!>!>!>H K!HHend of wordK!IH K""H|K!{H K""$+$"$k$$$O!{$K0lCW!>!>!>!>!>!>!>!>!>!>!>!>!>!>!>!>!>!>!>!>!>!>!HHorK!IH ^ K9T$N;O$^$j$K0lCW!>!>!>!>!>!>!>!>!>!>!>!>!>!>!>!>!>!>H K!HHstart of lineK!IH $ K9T$N=*$j$K0lCWH K!>!>!>!>!>!>!>!>!>!>!>!>!>!>!>!>!>!>!>!>!HHend of lineK!IH K@55,I=8=$NNc!'H K!HHJapanK!I$N$"$k9T$K0lCW$9$k$,!"H K$=$N!HHJapanK!I$NJ8;zNs$,4^$^$l$k8@MU$N>l9g$K$O!"0lCW$7$^$;$s!#H ^KF|K\H K!HF|K\!I$,9TF,$K$"$k9T$K0lCW$9$k!#H H) K$7$+$b!"!HHJapanK!I$,$"$k$N$G!"F1$8L?Na$GH K9q;zH [K$3$/$8H] /native script/kana/kanji made in Japan/ K$J$I$N9T$O8+$D$1$k$o$1$G$9$M!#H 
K$D$^$j!"!VF|K\H|JapanK!W$O!X!HF|K\!IHorK!HHJapanK!I$,$"$l$P!"<($7$F2<$5$$!YH K$H$$$&L?Na$G$9$M!#H K$=$l$G$O!"!HF|K\H||JapanK!I$O$I$&$$$&0UL#$G$7$g$&$+!#H ^K(!(!(!(!H K$3$N!FH|K!G$@$1IU2C$5$l$k!#H K$3$l$OFs$D$N@55,I=8=$NJ#9g$G$9!'H K!HH||K!I$N:8$K$O!HF|K\!I$H$$$&@55,I=8=!#H K!HH||K!I$N1&$K$O!HHJapanK!I$H$$$&@55,I=8=!#H K$3$l$i$O6&$K0l$D$NL?Na$r:n@.$7$^$9!#H K2re$NNc$G$O!"H KKLF|K\H [K$-$?$K$[$sH] /Kitanihon (pl)/ K$O!HHJapanK!I$J$7$G$9$N$G!"BLL\$G$9!#F1$8$h$&$K!"H K9q;zH [K$3$/$8H] /native script/kana/kanji made in Japan/ K$O!HF|K\!I$,$J$$$N$G!"BLL\$G$9!#H K$?$@$7!"H KF|K\H [K$K$[$sH] /Japan/ KF|K\8lH [K$K$[$s$4H] /Japanese language/ KN"F|K\H [K$&$i$K$C$]$sH] /Japan Sea coast areas/ KF|K\J|Aw6(2qH [K$K$C$]$s$[$&$=$&$-$g$&$+$$H] /NHK/Japan national TV/ K$J$IH K$O!"0lCW$7$^$9!#H K!HF|K\H||K$K$C$]$s!I$H!HF|K\H||K$K$[$s!I$O$I$&$G$9$+!#J,$+$j$^$9$+!#H K(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(/H KB>$KH K(-H K(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(,(0H K?'!9%3%^%s%I$"$j$^$9$,!"F|K\8l$G@bL@$O$A$g$C$H!#H K #include "lib/config.h" #include "lib/output.h" #include "lib/assert.h" #include "lib/index.h" #include "lib/loadfile.h" #include "lib/jregex.h" #include "lib/replace.h" #include "lookup.h" /* TODO: Optimize (code size/time) for when APPLY_REGEX_MAX_CHAR_COUNT is 1. */ void output_line(const struct slot_info *slot, String *line, unsigned length) { kibishii_assert(!COMBO(slot)); if (slot->current_flag.modify) { static string *new = 0; int retval; if (new) free(new); retval = apply_substitution(&slot->modify_spec.regex, &new, 0, line, length, slot->modify_spec.replacement, slot->modify_spec.global ? 
1000 : 1); if (retval != APP_SUB_SUCCESS) new = 0; else length = str_len(line = new); } if (slot->current_flag.tag) { kibishii_assert(slot->tag_string); output((const char *)slot->tag_string); } #ifdef HAVE_SPINNER /* output an initial space if the first char is a tab, so that it will be sure to erase any spinner */ if (line[0] == '\t' && lookup.spinner.interval) { #ifndef LOG_FILE_SUPPORT outchar(' '); #else /* * But we don't want to output the space to the log file, * (if there is one), so we have to be careful about that. */ int log_fd = set_extra_output_file(JUST_CHECKING_OUTPUT_FILE); if (current_log_file == 0) outchar(' '); else { flush_output(); set_extra_output_file(NO_OUTPUT_FILE); /* turn logging off */ outchar(' '); flush_output(); set_extra_output_file(log_fd); } #endif /* LOG_FILE_SUPPORT */ } #endif /* HAVE_SPINNER */ #ifdef SERVER_CONFIG while (length--) outchar(*line++); #else if (!slot->current_flag.highlight || !regexec(&lookup.search[0].regex, line, length) || regexec_match_start == regexec_match_end) { while (length--) outchar(*line++); } else { String *lineend = line + length; static DECL_STYLES; while (line < regexec_match_start) outchar(*line++); (void)output_pager_transparent(1); output((lookup.flag.hl_style == HL_STYLE_HTML) ? 
(const char *)lookup.slot->highlight_tag : styles[lookup.flag.hl_style]); (void)output_pager_transparent(0); while (regexec_match_start < regexec_match_end) outchar(*regexec_match_start++); (void)output_pager_transparent(1); if (lookup.flag.hl_style != HL_STYLE_HTML) output("\033[0m"); else outputf("highlight_tag + 1); (void)output_pager_transparent(0); while (regexec_match_end < lineend) outchar(*regexec_match_end++); } #endif /* SERVER_CONFIG */ outchar('\n'); } /* * Save the given line to lookup.list[] */ static __inline__ void save_to_list(fileloc line, const struct slot_info *slot) { if (lookup.list.used >= lookup.list.size) lookup.list.overflow++; else { lookup.list.array[lookup.list.used].slot = slot; lookup.list.array[lookup.list.used++].line = line; } } #define CONTINUE_SEARCH 0 #define ABORT_SEARCH 1 /* * Return true if we should stop searching. */ static int matchedline(const struct slot_info *slot, String *line, fileloc loc, unsigned length) { lookup.count.matched++; /* If were're filtering and should filter this line, do so. 
*/ if (slot->current_flag.filter && slot->filter_spec.negative != regexec(&slot->filter_spec.regex, line, length)) { lookup.count.filtered++; if (!lookup.count.nonword) /* only save if not saving nonwords */ { #ifdef HAVE_SPINNER if (lookup.count.filtered == 1) { lookup.spinner.chars = (String *)"─/│\○"; lookup.spinner.char_count = 5; } #endif /* HAVE_SPINNER */ save_to_list(loc, slot); } return CONTINUE_SEARCH; } /* If skipping non-words, do so if this one counts */ if (slot->current_flag.word && !(regexec_match_at_start_of_word && regexec_match_at_end_of_word)) { if (lookup.count.nonword++ == 0) { /* if was using for filtered lines, switched to non-word ones */ lookup.list.overflow = lookup.list.used = 0; #ifdef HAVE_SPINNER lookup.spinner.chars = (String *)"─/│\◎"; lookup.spinner.char_count = 5; #endif /* HAVE_SPINNER */ } save_to_list(loc, slot); return CONTINUE_SEARCH; } if (lookup.lines_to_print_this_time && !--lookup.lines_to_print_this_time) { if (slot->current_flag.display) output("≪search aborted: matched-line count over limit≫\n"); return ABORT_SEARCH; } lookup.count.printed++; if (slot->current_flag.display) output_line(slot, line, length); return CONTINUE_SEARCH; } #define _local_label(name, tag) name ## tag #define local_label(name,tag) _local_label(name, tag) #ifdef LOG_FILE_SUPPORT # define LOG_FILE_STUFF(stuff) stuff #else # define LOG_FILE_STUFF(stuff) /*nothing*/ #endif #ifndef HAVE_SPINNER # define SPINNER_STUFF /* nothing */ #else # define SPINNER_STUFF \ if (lookup.spinner.interval && --till_next_spinner < 0) \ { \ LOG_FILE_STUFF( \ int log_fd = 0; /* initialize only to shut up warnings */ \ if (current_log_file != 0) { \ log_fd = set_extra_output_file(JUST_CHECKING_OUTPUT_FILE); \ flush_output(); \ set_extra_output_file(NO_OUTPUT_FILE); /* turn logging off */ \ }) \ \ if (++spinner_position >= lookup.spinner.char_count) \ spinner_position = 0; \ outchar(lookup.spinner.chars[spinner_position*2]); \ 
outchar(lookup.spinner.chars[spinner_position*2+1]); \ outchar('\r'); \ flush_output(); \ LOG_FILE_STUFF( \ if (current_log_file != 0) \ set_extra_output_file(log_fd); \ ) \ till_next_spinner = lookup.spinner.interval; \ } #endif #define do_actual_check(V, POS, LENDEST) \ { \ unsigned _length_; \ VirtFile *v = (V); \ fileloc pos = (POS); \ String *_text_ = VirtPos2Str(v, pos, &_length_); \ int _i_; \ \ (LENDEST) = _length_; \ \ lookup.count.checked++; \ SPINNER_STUFF; \ \ for (_i_ = 0; _i_ < lookup.patterns; _i_++) \ if (!regexec(&lookup.search[_i_].regex, _text_, _length_) != \ lookup.search[_i_].not) \ { \ if (apply_regex_abort) \ return ABORT_SEARCH; \ else \ goto local_label(fail, __LINE__); \ } \ \ /* if we get here, the regex(es) matched */ \ if (apply_regex_abort) \ return ABORT_SEARCH; \ if (matchedline(slot, _text_, pos, _length_) == ABORT_SEARCH) \ return CONTINUE_SEARCH; \ local_label(fail, __LINE__) : \ (void)1; /* this here because ANSI requires something after the lable */ \ } /* * We'll consider up to this many different required characters. */ #ifndef APPLY_REGEX_MAX_CHAR_COUNT #define APPLY_REGEX_MAX_CHAR_COUNT 10 #endif /* * Can be set externally to cause the search to abort. 
*/ volatile unsigned apply_regex_abort = 0; static int apply_regex_to_file(const struct slot_info *slot) { /* for char #n, charcount[n] will have #lines with that char */ unsigned charcount[APPLY_REGEX_MAX_CHAR_COUNT]; #ifdef HAVE_SPINNER unsigned spinner_position = 0; int till_next_spinner = 0; #endif /* HAVE_SPINNER */ /* pointer into index's packed list for char #n */ String *packed_list_ptr[APPLY_REGEX_MAX_CHAR_COUNT]; static MemItem mem_packed_list_ptr[APPLY_REGEX_MAX_CHAR_COUNT]; /* pointer to char #n's current line in the text */ fileloc current_line[APPLY_REGEX_MAX_CHAR_COUNT]; int used = 0; /* how many of each array are used */ int ret; int i; unsigned char HI[APPLY_REGEX_MAX_CHAR_COUNT]; unsigned char LO[APPLY_REGEX_MAX_CHAR_COUNT]; unsigned int pat_index = 0; const unsigned char *list = 0; #define KID_HIRA_HI 0244 /* high byte for hiragana EUC */ #define KID_KATA_HI 0245 /* high byte for katakana EUC */ unsigned first_pattern_flags, nonfirst_pattern_flags; unsigned generic_flags = 0; int MemIndex = IsMemIndex(slot->file->index); kibishii_assert(lookup.patterns != 0); kibishii_assert(!COMBO(slot)); kibishii_assert(!slot->current_flag.filter || slot->filter_spec.pattern); kibishii_assert(!slot->current_flag.modify || slot->modify_spec.pattern); if (slot->current_flag.fuzz) generic_flags |= REGCOMP_FUZZY_KANA_REPETITION; if (slot->current_flag.fold) generic_flags |= REGCOMP_IGNORE_ALPHA_CASE | REGCOMP_IGNORE_KANA_CASE; if (lookup.flag.regex_debug) generic_flags |= REGCOMP_DEBUG; first_pattern_flags = REGCOMP_CALC_MUSTHAVE | generic_flags | (slot->current_flag.word ? REGCOMP_WANT_WORD_MATCH_INFO : REGCOMP_JUST_MATCH); nonfirst_pattern_flags = generic_flags | REGCOMP_JUST_MATCH; /* make sure any previous regexes were freed */ for (i = 0; i < MAX_PATS_ON_ONE_LINE; i++) regfree(&lookup.search[i].regex); for (i = 0; i < lookup.patterns; i++) { unsigned these_flags = i==0 ? 
first_pattern_flags : nonfirst_pattern_flags; if (!lookup.search[i].not) these_flags |= REGCOMP_CALC_MUSTHAVE; ret = regcomp(&lookup.search[i].regex, lookup.search[i].pattern, these_flags); if (ret == REGCOMP_NEED_SAVE_PAREN_INFO) ret = regcomp(&lookup.search[i].regex, lookup.search[i].pattern, (these_flags| REGCOMP_SAVE_MATCHED_PAREN_INFO)); if (lookup.flag.debug) { outputf("pattern: %s>> %s <<\n", lookup.search[i].not ? "NOT ":"", lookup.search[i].pattern); showregex(&lookup.search[i].regex); } if (ret != REGCOMP_SUCCESS) { if (lookup.flag.debug) outputf("[bad regcomp returns %d]\n", ret); output((const char *)regcomp_error_report()); return ABORT_SEARCH; } } /* * Return the next line in the list for char #I, which will either * be zero (if the list is now exhausted) or the current line * bumped up by the amount according to the packed list. */ #define NULL_POINTER -1 #define ORIG_NEXTLINE(I) \ (charcount[I] == 0 ? NULL_POINTER : \ (charcount[I]--, \ current_line[I] + read_packed_value(&packed_list_ptr[I]))) #define MEM_NEXTLINE(I) \ (charcount[I] == 0 ? NULL_POINTER : \ (charcount[I]--, \ current_line[I] + mem_read_packed_value(&mem_packed_list_ptr[I]))) #define NEXTLINE(I) (MemIndex ? MEM_NEXTLINE(I) : ORIG_NEXTLINE(I)) /* * Find the APPLY_REGEX_MAX_CHAR_COUNT least common characters * that *must* be in the pattern. 
*/ while (used < APPLY_REGEX_MAX_CHAR_COUNT) { unsigned char hi, lo; elementcount count; int res; while (list == 0 || list[0] == 0) { if (pat_index >= lookup.patterns) { list = 0; break; } list = regmusthave(&lookup.search[pat_index++].regex); } if (list == 0) break; /* since the index *always* folds case, make sure to do that here*/ if (list[0] & 0x80) { hi = (*list++) & 0x7f; lo = (*list++) & 0x7f; if (hi == (KID_KATA_HI & 0x7f)) hi = (KID_HIRA_HI & 0x7f); } else { hi = 0; lo = *list++; if (isupper(lo)) lo = tolower(lo); } /* if we've already seen this character, ignore */ for (i = 0; i < used; i++) if (LO[i] == lo && HI[i] == hi) break; if (i < used) { /* outputf("already have %x %x\n", hi, lo); */ continue; } if (MemIndex) res = mem_get_index_count(slot->file->index, hi, lo, &count); else res = get_index_count(slot->file->index, hi, lo, &count); if (!res || count == 0) { if (hi != 0 || isalnum(lo)) { if (lookup.flag.debug) { outputf("Character [%c%c] not in index; " "apparently no lines in the file contain it.\n", hi ? (hi|0x80) : hi, hi ? (lo|0x80) : lo); } return CONTINUE_SEARCH; } continue; /* just ignore if non-alphanumeric ASCII */ } if (count == SKIPPED_COUNT) continue; /* get the pointer to the packed_list for the character */ if (MemIndex) { IndexOffset io; res = mem_get_index_list(slot->file->index, hi, lo, &io); SetMem(&mem_packed_list_ptr[used], slot->file->index->FileP, io); } else { res = get_index_list(slot->file->index, hi, lo, &packed_list_ptr[used]); } if (!res) kibishii_assert(0); HI[used] = hi; LO[used] = lo; charcount[used] = count; /* * Prime CURRENT_LINE to point to the beginning of the * file, then use NEXTLINE to get what's really the first * line in the list for char #used. 
*/ current_line[used] = 0; current_line[used] = NEXTLINE(used); used++; } if (used && lookup.flag.debug) { output("looking for lines with characters ["); for (i = 0; i < used; i++) { if (HI[i] == 0) outchar(LO[i]); else { outchar(HI[i]|0x80); outchar(LO[i]|0x80); } } output("]\n"); } if (used == 0) { /* must search all lines... do that now */ fileloc line = 0; fileloc end = slot->file->v->length; if (lookup.flag.debug) output("[checking all lines]\n"); while (line < end) { unsigned len; do_actual_check(slot->file->v, line, len); line += len + 1; } } else for (;;) { /* * Now we have info on USED characters: * current_line[i] holds a pointer to the text file for char #i */ int holdsmax = 0; fileloc max, startmax; for (i = 1; i < used; i++) { if (current_line[i] > current_line[holdsmax]) holdsmax = i; } startmax = max = current_line[holdsmax]; for (i = 0; i < used; i++) { if (apply_regex_abort) return ABORT_SEARCH; if (i == holdsmax) continue; while (current_line[i] < max) { if (current_line[i] = NEXTLINE(i), current_line[i] == NULL_POINTER) { return CONTINUE_SEARCH; } } if (current_line[i] > max) { max = current_line[holdsmax = i]; } } if (startmax == max) { /* "log_fd may be used uninitialized" below. */ unsigned len; do_actual_check(slot->file->v, max, len); /* skip passed one just done */ for (i = 0; i < used; i++) { soft_assert(current_line[i] == max); if (current_line[i] = NEXTLINE(i), current_line[i] == NULL_POINTER) return CONTINUE_SEARCH; } } } return CONTINUE_SEARCH; } void apply_regex(void) { unsigned old_regexec_flags = regexec_setflags(lookup.flag.regex_debug ? REGEXEC_DEBUG : 0); /* * If setting lookup.lines_to_print_this_time to anything nonzero, add * one extra. This is because the number internally means "the line * to abort on", so if we want to print 10 lines, we must abort on #11. */ lookup.lines_to_print_this_time = lookup.max_lines_to_print ? 
(lookup.max_lines_to_print + 1) : 0; #ifdef HAVE_SPINNER lookup.spinner.chars = (String *)"─/│\"; lookup.spinner.char_count = 4; #endif /* HAVE_SPINNER */ lookup.count.filtered = 0; lookup.count.checked = 0; lookup.count.matched = 0; lookup.count.printed = 0; lookup.count.nonword = 0; lookup.list.overflow = 0; apply_regex_abort = 0; lookup.list.used = 0; if (!COMBO(lookup.slot)) apply_regex_to_file(lookup.slot); else { unsigned int i; for (i = 0; i < lookup.slot->combo.entries; i++) { struct slot_info *slot = lookup.slot_info[lookup.slot->combo.entry[i]]; kibishii_assert(!COMBO(slot)); #define setflag(FLAG) slot->current_flag.FLAG = \ (lookup.slot->current_flag.FLAG & slot->default_flag.FLAG); setflag(word); setflag(fold); setflag(highlight); setflag(filter); setflag(modify); setflag(display); /* special case for tag... goes on if either flag on */ kibishii_assert(!slot->default_flag.tag || slot->tag_string); slot->current_flag.tag = slot->tag_string == 0 ? 0 : (lookup.slot->current_flag.tag || slot->default_flag.tag); kibishii_assert(!slot->current_flag.tag || slot->tag_string); apply_regex_to_file(slot); } } regexec_setflags(old_regexec_flags); if (apply_regex_abort) { output_pager_reset_more(); output("<>\n"); } } lookup-1.08b.orig/cmds.master0100600000014400001440000002445506173451754015661 0ustar nakaharastaff#!/usr/local/bin/perl -w # # Jeffrey Friedl 富利取ジェフリー # Omron Corporation オムロン(株) # Nagaokakyoshi, Japan 〒617長岡京下海印寺 # # jfriedl@nff.ncl.omron.co.jp # # This work is placed under the terms of the GNU General Purpose Licence # (the "GNU Copyleft"). # print <) { ++$count; ($flag, $help, $usage, $regex, $execution, $empty, $other) = split(/\s*\n\s*/); ## ## We expect exactly $flag .. $execution. We expect $empty to be ## defined but null. If $empty is undefined or non-null, or ## or $other is defined, the count out of split was wrong. 
if (!defined($empty) || $empty ne '' || defined($other)) { print qq/\n#error was bad record in "$0" at line $linenum.\n/; die qq/$0:$linenum: bad command record.\n/; } $help =~ s/["\\]/\\$&/g; $help = ($help eq "#") ? "0" : qq/"$help"/; $usage =~ s/["\\]/\\$&/g; $usage = ($usage eq "#") ? "0" : qq/"$usage"/; $regex =~ s/\s+/\\s*/g; # turn whitespace into a regex $regex = '^\\s*'.$regex.'\\s*$' if $regex !~ m/^\^/ && $regex !~ m/\$\$/; $regex =~ s/["\\]/\\$&/g; # protect quotes and backslashes if ($execution eq "skip") { $fun = "0"; } elsif ($execution =~ m/^(\w+)\(\);?$/) { $fun = $1; } else { $execution =~ s/\\(\d)/cmd_paren[$1-1]/g; $fun = "_func${count}_"; print "\n/* generated from \"$0\", record at line $linenum */\n"; print "static int $fun(void)\n{\n return $execution;\n}\n"; } push(@entries, join("\n", qq#\n /* generated from "$0" record at line $linenum*/#, qq# {#, qq# $flag,#, qq# (S)$usage,#, qq# (S)$help,#, qq# (S)"$regex",#, qq# $fun,#, qq# }#)); $linenum += tr/\n/\n/; } print "\nstatic struct command command[] = {\n", join(",\n",@entries), "};\n"; sub linenumbr_of_data { __LINE__ + 2; } ## make sure I'm just before __END__ __END__ CMD_FILE_ONLY /* comment line */ # # ^\s*# skip CMD_FILE_ONLY /* blank line */ # # ^\s*$ skip CMD_FILE_ONLY /* search */ # # ^\s*([+!=].*) cmd_do_search(\1) CMD_GENERAL|CMD_LOAD_RELATED|CMD_NEEDS_SLOT Turns automatic romaji conversion on or off, or reports current status. [default] autokana [on|1|off|0] default autokana> (on|1|off|0)? cmd_set_local_autokana_flag(\1) CMD_GENERAL # # default autokana> (on|1|off|0)? cmd_set_default_autokana_flag(\1) CMD_GENERAL clear the screen clear|cls (clear|cls) cmd_clear() CMD_GENERAL set the this-is-a-command character cmdchar ['bytechar'] cmd(char)? ('(\a)')? cmd_cmdchar(\3) CMD_GENERAL Turns command debugging on or off, or reports current status. command debug [on|1|off|0] c(om(mand)?)?> debug> (on|1|off|0)? 
cmd_set_default_cmd_debug_flag(\3) CMD_GENERAL|CMD_LOAD_RELATED Combines previously-loaded slots to one new slot. combine> ["name"] [num =] num [num....] comb(o|ine) ((["'])(.*)\3 =?)? (\d+ \+?=)? ((#? \d+ ,? )+) cmd_combine(\4, \5 ? atoi(\5) : -1, \6) CMD_GENERAL Turns debugging on or off, or reports current status. debug [on|1|off|0] debug> (on|1|off|0)? cmd_set_default_debug_flag(\1) CMD_GENERAL describes the encodings for the character(s) indicated describe "string"|character|[kuten|euc|jis|sjis|ascii]code describe> (['"])(.*)\1 cmd_describe_raw(\2); CMD_GENERAL # # describe> (\A+|[!-~]) cmd_describe_raw(\1); CMD_GENERAL # # describe> (kuten> )?(\d\d\d\d) cmd_describe_kuten(\2); CMD_GENERAL # # describe ascii (\\|0)([0-7]+) cmd_describe_ascii(8, \2); CMD_GENERAL # # describe ascii (\d+) cmd_describe_ascii(10, \1); CMD_GENERAL # # describe ascii (0[xX])?([0-9a-f][0-9a-f]) cmd_describe_ascii(16, \1); CMD_GENERAL # # describe> (sjis>|jis>|euc>)? (0[xX])?(<[0-9a-f][0-9a-f][0-9a-f][0-9a-f]>) cmd_describe_encoding(\1, \3); CMD_GENERAL # # describe> (\$@|\$B|\$@\$B)?(([!-~][!-~])+)(\(\$@|\$B|\$@\$B|\$\(D|\(J|\(H|\(B|\(I)? cmd_describe_jis_string(\2) CMD_GENERAL|CMD_LOAD_RELATED|CMD_NEEDS_SLOT Turns display of matching lines on or off. [default] display [on|1|off|0] display> (on|1|off|0)? cmd_set_local_display_flag(\1) CMD_GENERAL # # default display> (on|1|off|0)? cmd_set_default_display_flag(\1) CMD_GENERAL|CMD_ENCODING_RELATED set the input/output encoding-method encoding (euc|jis|sjis) encod(e|ing)> (euc|jis|sjis)? cmd_encoding(\2) CMD_GENERAL list what files are loaded into what slots. files|slots [-] (slot|file)s?> (-|-?(help|long))? cmd_list_files(\2 ? 1 : 0) CMD_GENERAL|CMD_LOAD_RELATED set the filter for the selected file. filter ["name"] [!] /regex/[i] filter> ("([^"]*)")? (!?) (\S)(.+)\4(i?) 
cmd_filter(\5, \2, \3[0] == '!', \6[0] == 'i') CMD_GENERAL|CMD_LOAD_RELATED|CMD_NEEDS_SLOT turn the filter (for the selected file) on or off, or report its status filter [on|1|off|0] filter> (on|1|off|0)? cmd_toggle_filter(\1) CMD_GENERAL|CMD_LOAD_RELATED|CMD_NEEDS_SLOT Turns case folding on or off, or reports current status. [default] fold [on|1|off|0] fold> (on|1|off|0)? cmd_set_local_fold_flag(\1) CMD_GENERAL # # default fold> (on|1|off|0)? cmd_set_default_fold_flag(\1) CMD_GENERAL|CMD_LOAD_RELATED|CMD_NEEDS_SLOT Turns fuzzification on or off, or reports current status. [default] fuzz [on|1|off|0] fuzz> (on|1|off|0)? cmd_set_local_fuzz_flag(\1) CMD_GENERAL # # default fuzz> (on|1|off|0)? cmd_set_default_fuzz_flag(\1) CMD_GENERAL list help help [string] help (.*\S)? cmd_help(\1) CMD_GENERAL|CMD_LOAD_RELATED|CMD_NEEDS_SLOT Turns highlighting on or off, or reports current status. [default] highlight [on|1|off|0] h(igh)?l(ight)?> (on|1|off|0)? cmd_set_local_highlight_flag(\3) CMD_GENERAL Sets the highlighting style (bold, inverse[standout], or to a given HTML tag) highlight [bold|inverse|blink|under|<___>|black|red|green|yellow|blue|purple|cyan|white] h(igh)?l(ight)?> (style)? (red|green|yellow|blue|purple|cyan|white|bold|inverse|standout|blink|under|\<([a-zA-Z]+)\>)? cmd_set_highlighting_style(\4) CMD_GENERAL if {expr} command Runs command only if EXPR is true. if {([^}]*)} (.*) cmd_if(\1,\2) CMD_GENERAL # # default h(igh)?l(ight)?> (on|1|off|0)? cmd_set_default_highlight_flag(\3) CMD_GENERAL|CMD_ENCODING_RELATED report or set the input encoding-method for 8-bit bytes (JIS always OK) input encoding [euc|sjis] input (encoding)?> (euc|sjis)? cmd_input_encoding(\2) CMD_GENERAL set the maximum number of lines to print during any one command. limit [ ] limit (=? (\d+))? cmd_set_limit(\2) CMD_GENERAL|CMD_LOAD_RELATED load a file (and read or compute its index as needed) load "file" (fast)?load> (-?(now)|-?(w(hen)?n(eeded)?))? (['"]?)(\S+)\7 ((\3||\4) ? 
warn("flags no longer need for 'load' command\n") : 0), cmd_load((const char *)\8); CMD_GENERAL log output to a file log [- | [+]"file"] log> ((-|)|(()? (\+)? (['"]?)(.+)\6))? cmd_log(\2 ? 1 : 0, \5 ? 1 : 0, \7) CMD_GENERAL|CMD_LOAD_RELATED|CMD_NEEDS_SLOT sets the modify regular expression and replacement for the selected file. modify /regex/replace/[ig] modify> (\S)(.+)\1(.*)\1([ig]?) cmd_modify(\2,\3, \4[0] == 'i' || \4[1] == 'i', \4[0] == 'g' || \4[1] == 'g') CMD_GENERAL|CMD_LOAD_RELATED|CMD_NEEDS_SLOT sets the modify filter on or off (for the indicated file) modify [on|1|off|0] modify> (on|1|off|0)? cmd_toggle_modify(\1) CMD_GENERAL prints a message to the screen msg .... msg (.*) cmd_msg(\1) CMD_GENERAL|CMD_ENCODING_RELATED report or set the output encoding-method output encoding [euc|sjis|jis|...] output> (encoding>)? (euc|sjis|jis-?(78|83|90)?(-(ascii|roman))?)?((<(212|no212|hwk|nohwk|foldhwk|disp|nodisp|code|mark)>|[-,\s]+)*) cmd_output_encoding(\2, \3, \5, \6) CMD_GENERAL configure (width x height) or toggle the output pager pager [boolean | [W x] H] pager> ((on|1|off|0)|(([1-9]\d* [,x])? ([1-9]\d*)))? cmd_pager(\2, \4, \5) CMD_GENERAL|CMD_NEEDS_SLOT set the prompt format string [local] prompt "string" local prompt> ((['"])(.+)\2)? cmd_set_prompt(1, \3) CMD_GENERAL # # prompt> ((['"])(.+)\2)? cmd_set_prompt(0, \3) CMD_GENERAL Turns regex debugging on or off, or reports current status. regex debug [on|1|off|0] r(egex)? debug> (on|1|off|0)? cmd_set_default_regex_debug_flag(\2) CMD_GENERAL set the number of elided lines to remember for the "show" command. saved list size [ ] saved? (list)? (size|len(gth)?)? (=? (\d+))? cmd_list_size(\5) CMD_GENERAL|CMD_LOAD_RELATED sets the default file select [ num | name | . ] select> ((\.)|(#? (\d+))|((["']?)(.+)\6))? cmd_select(\4, (const char *)\7, \2 ? 
1 : 0) CMD_GENERAL show the lines filtered by the last search, if any show show cmd_show() CMD_GENERAL load commands from a file source "filename" source> (['"]?)(.+)\1 cmd_source((const char *)\2) CMD_GENERAL sets the spinner to move each lines checked (0 to disable) spinner [ ] spinner (=? (\d+))? cmd_set_spinner(\2) CMD_GENERAL reports stats about the last search stats stats? cmd_stats() CMD_GENERAL|CMD_LOAD_RELATED|CMD_NEEDS_SLOT set, toggle, or report the tag for the slot. tag [boolean] ["string"] tag> (on|1|off|0)? ((['"])(.*)\3)? cmd_tag(\1, \4) CMD_GENERAL Turns verbosity on or off, or reports current status. verbose [on|1|off|0] verbose> (on|1|off|0)? set_verbose_flag(\1) CMD_GENERAL report the version number version version cmd_version() CMD_GENERAL|CMD_LOAD_RELATED|CMD_NEEDS_SLOT Turns word-preference mode on or off, or reports current status. [default] word [on|1|off|0] word> (on|1|off|0)? cmd_set_local_word_flag(\1) CMD_GENERAL # # default word> (on|1|off|0)? cmd_set_default_word_flag(\1) CMD_GENERAL|CMD_LOAD_RELATED|CMD_NEEDS_SLOT Selects wildcard "glob" patterns instead of regular expression patterns. [default] wildcard [on|1|off|0] (glob|wild(card)?)> (on|1|off|0)? cmd_set_local_glob_flag(\3) CMD_GENERAL # # default (glob|wild(card)?)> (on|1|off|0)? cmd_set_default_glob_flag(\3) CMD_GENERAL Exits the program exit|quit|bye|leave|done (exit|quit|bye|leave|done) cmd_exit() CMD_GENERAL # # (load|describe|source) cmd_error("expecting argument", \1) CMD_GENERAL # # (default|local)? (autokana|cmdchar|debug|describe|encoding|files?|filter|fold|fuzz|help|highlight (style)?|input (encoding)?|limit|load|modify|output (encoding)?|pager|prompt|r(egex)? debug|saved? (list)? 
(size|len(gth)?)?|select|show|source|spinner|stats?|verbose|version|word|exit|quit|bye|leave|done|combine|combo|tag).* cmd_error("argument error", \2) lookup-1.08b.orig/commands.c0100600000014400001440000015116506271560252015453 0ustar nakaharastaff/* * Jeffrey Friedl * Omron Corporation オムロン(株) * Nagaokakyoshi, Japan 〒617長岡京下海印寺 * * jfriedl@nff.ncl.omron.co.jp * * This work is placed under the terms of the GNU General Purpose License * (the "GNU Copyleft"). * * December 1993. */ #include #include #include "lib/system.h" #if defined(_HAVE_SYS_FCNTL_H_) # include #elif defined(_HAVE_FCNTL_H_) # include #endif #include "lib/config.h" #include "lib/assert.h" #include "lib/jreadline.h" #include "lib/jregex.h" #include "lib/loadfile.h" #include "lib/output.h" #include "lib/replace.h" #include "lib/strsave.h" #include "lib/xmalloc.h" #include "lib/euc.h" #include "lookup.h" #include "eval.h" /* * The input string is "0", "1", "on", or "off". * Return 0 for "0" or "off", 1 otherwise. */ static unsigned interpret_boolean(String *str) { if (str[0] == '0' || str[1] == 'f' || str[1] == 'F') return 0; if (str[0] == '1' || str[1] == 'n' || str[1] == 'N') return 1; die("\n", str); return 0; /* in case assert isn't on */ } /* * Returned by each command. */ #define COMMAND_RUNS_OK 0 #define COMMAND_HAS_ERROR 1 #define SIMPLE_GLOBAL_FLAG(VAR, DESC) \ static int cmd_set_default_ ## VAR ## _flag(String *bool) \ { \ String *when = (String*)"is"; \ if (bool) \ { \ lookup.flag.VAR = interpret_boolean(bool); \ if (!lookup.flag.verbose) \ return COMMAND_RUNS_OK; \ when = (String*)"now"; \ } \ outputf("%s %s %s.\n", DESC, when, lookup.flag.VAR ? 
"on" : "off"); \ return COMMAND_RUNS_OK; \ } #define SIMPLE_LOCAL_FLAG(VAR, DESC) \ static int cmd_set_local_ ## VAR ## _flag(String *bool) \ { \ String *when = (String*)"is"; \ kibishii_assert(lookup.slot != 0); \ if (bool) { \ lookup.slot->default_flag.VAR = interpret_boolean(bool); \ if (!lookup.flag.verbose) \ return COMMAND_RUNS_OK; \ when = (String*)"now"; \ } \ outputf("slot #%d's %s %s %s.\n", slot_num(lookup.slot), \ DESC, when, (lookup.slot->default_flag.VAR) ? "on" : "off"); \ return COMMAND_RUNS_OK; \ } #define GLOBAL_AND_LOCAL_FLAG(VAR, DESC) \ SIMPLE_GLOBAL_FLAG(VAR, DESC) \ SIMPLE_LOCAL_FLAG(VAR, DESC) SIMPLE_GLOBAL_FLAG(debug, "general debugging") SIMPLE_GLOBAL_FLAG(regex_debug, "regex debugging") SIMPLE_GLOBAL_FLAG(cmd_debug, "command debugging") GLOBAL_AND_LOCAL_FLAG(word, "word preference mode") GLOBAL_AND_LOCAL_FLAG(glob, "wildcard-pattern mode") GLOBAL_AND_LOCAL_FLAG(fold, "case folding") GLOBAL_AND_LOCAL_FLAG(display, "matching-line display") GLOBAL_AND_LOCAL_FLAG(fuzz, "fuzzification") GLOBAL_AND_LOCAL_FLAG(highlight, "highlighting") GLOBAL_AND_LOCAL_FLAG(autokana, "auto-kana") /* * Used to turn verbosity on or off, or report its value. * Specialized version of command_flag() so that we don't say "verbosity on" * when we turn it on. */ static int set_verbose_flag(String *bool) { if (bool) { lookup.flag.verbose = interpret_boolean(bool); return COMMAND_RUNS_OK; } outputf("verbosity is %s.\n", lookup.flag.verbose ? "on" : "off"); return COMMAND_RUNS_OK; } /* * Given a filename, load it. 
*/ static int cmd_load(const char *filename) { int slotnum = load_file(expand_filename_tilde(filename), LOADFILE_READifPRESENT); if (slotnum >= 0) { lookup.slot = lookup.slot_info[slotnum]; return COMMAND_RUNS_OK; } return COMMAND_HAS_ERROR; } int cmd_log(int closelog, int append, String *File) { #ifndef LOG_FILE_SUPPORT outputf("%s: log file support not compiled in\n", lookup.where); #else int fd = set_extra_output_file(JUST_CHECKING_OUTPUT_FILE); kibishii_assert((!closelog && !append && !File) || /* no args ok */ (closelog && !append && !File) || /* or just close ok */ !closelog); /* or not close ok */ kibishii_assert(!append || File); /* APPEND only allowed with FILE */ kibishii_assert(!output_fd_valid(fd) == !current_log_file); if (closelog) { /* close any current log */ if (!output_fd_valid(fd)) { if (lookup.flag.verbose) outputf("%sno log file to close\n", lookup.where); } else { flush_output(); close(fd); set_extra_output_file(NO_OUTPUT_FILE); if (lookup.flag.verbose) outputf("%swrote log file" quote(%s) "\n", lookup.where, current_log_file); free(current_log_file); current_log_file = 0; } return COMMAND_RUNS_OK; /* close-failure should not stop cmd loads */ } if (File) { /* open a new file */ const char *file = (const char *)File; int flags = O_WRONLY|O_CREAT | (append ? O_APPEND : O_TRUNC); if (output_fd_valid(fd)) { outputf("%salready logging to" quote(%s) "\n", lookup.where, current_log_file); return COMMAND_HAS_ERROR; } file = expand_filename_tilde(file); if (fd = open(file, flags, 0644), fd <0) { outputf("%sbad open of" quote(%s) ": %n", lookup.where, file); return COMMAND_HAS_ERROR; } if (lookup.flag.verbose) outputf("%soutput now %sing to" quote(%s) "\n", lookup.where, append ? "append" : "logg", file); flush_output(); /* * If we're appending, we'll add a line to the log, but not to * the normal output. So we'll leave the log unset, set the normal * output temporarily to the log file, print, then put the normal * output back. 
*/ if (append) { int normal_fd = set_normal_output_file(fd); output("\n---------------------------------------------------\n"); flush_output(); set_normal_output_file(normal_fd); } set_extra_output_file(fd); /* set the log */ current_log_file = (const unsigned char *)strsave((String *)file); return COMMAND_RUNS_OK; } /* just asking about the current status */ if (!output_fd_valid(fd)) output("output not currently logged.\n"); else outputf("output currently logged to" quote(%s) "\n", current_log_file); #endif /* LOG_FILE_SUPPORT */ return COMMAND_RUNS_OK; } /* * Set the prompt, or show the current one. */ static int cmd_set_prompt(int local, String *new_prompt) { kibishii_assert(!local || lookup.slot); if (new_prompt == 0) { if (!local) outputf("prompt format is" quote(%s) "\n", lookup.prompt_format); else if (lookup.slot->prompt_format) outputf("local prompt format for selected slot is" quote(%s) "\n", lookup.slot->prompt_format); else outputf("no local prompt for selected slot.\n"); } else { if (local) { if (lookup.slot->prompt_format) free(lookup.slot->prompt_format); lookup.slot->prompt_format = strsave(new_prompt); } else { if (lookup.prompt_format) free(lookup.prompt_format); lookup.prompt_format = strsave(new_prompt); } } return COMMAND_RUNS_OK; } /* * Clear the screen. 
*/ static int cmd_clear(void) { static enum { unchecked, yes, no} is_xterm = unchecked; if (is_xterm == unchecked) { extern const char *getenv(const char *); String *term = (String *)getenv("TERM"); if (term && (strNcmp(term, "kterm", 5) == 0 || strNcmp(term, "xterm", 5) == 0 || strNcmp(term, "vt100", 5) == 0)) { is_xterm = yes; } else { is_xterm = no; } } if (is_xterm == yes) { (void)output_pager_transparent(1); output("\033[H\33[2J"); (void)output_pager_transparent(0); output_pager_reset_more(); flush_output(); } else { flush_output(); system("clear"); } return COMMAND_RUNS_OK; } static int cmd_toggle_filter(String *bool) { const char *when = ""; kibishii_assert(lookup.slot); if (!COMBO(lookup.slot) && lookup.slot->filter_spec.pattern == 0) { outputf("%sno filter installed.\n", lookup.where); return bool ? COMMAND_HAS_ERROR : COMMAND_RUNS_OK; } if (bool) { lookup.slot->default_flag.filter = interpret_boolean(bool); if (!lookup.flag.verbose) return COMMAND_RUNS_OK; when = "now "; } output("filter "); if (COMBO(lookup.slot)) { outputf("flag is %s for combo slot #%d.\n", lookup.slot->default_flag.filter ? "on" : "off", slot_num(lookup.slot)); } else { if (lookup.slot->filter_spec.name) outputf("(" quote(%s) ") ", lookup.slot->filter_spec.name); outputf("(%s%sabled): %s" quote(%s) "\n", when, lookup.slot->default_flag.filter ? "en" : "dis", lookup.slot->filter_spec.negative ? "!" 
: "", lookup.slot->filter_spec.pattern); if (lookup.slot->default_flag.filter) { if (lookup.list.size == 0) outputf("%snote: no filtered lines will be saved (change with " "\"set filter list\").\n", lookup.where); else outputf("(first %d filtered lines will be saved each time).\n", lookup.list.size); } } return COMMAND_RUNS_OK; } /* set the filter for a file */ static int cmd_filter(String *filter_regex, String *filter_name, int is_negative, int is_case_insensitive) { unsigned flags = REGCOMP_SAVE_MATCHED_PAREN_INFO; kibishii_assert(lookup.slot); if (COMBO(lookup.slot)) { outputf("%sslot is a combo; no filter allowed.\n", lookup.where); return COMMAND_HAS_ERROR; } if (is_case_insensitive) flags |= REGCOMP_IGNORE_CASE; if (lookup.slot->filter_spec.name) { free(lookup.slot->filter_spec.name); lookup.slot->filter_spec.name = 0; } if (lookup.slot->filter_spec.pattern) { free(lookup.slot->filter_spec.pattern); lookup.slot->filter_spec.pattern = 0; regfree(&lookup.slot->filter_spec.regex); } if (regcomp(&lookup.slot->filter_spec.regex, filter_regex, flags) != REGCOMP_SUCCESS) { outputf("%s%s.\n", lookup.where, (string*)regcomp_error_report()); return COMMAND_HAS_ERROR; } lookup.slot->filter_spec.name = filter_name ? 
strsave(filter_name) : 0; lookup.slot->filter_spec.pattern = strsave(filter_regex); lookup.slot->filter_spec.negative = is_negative; lookup.slot->default_flag.filter = 1; return COMMAND_RUNS_OK; } /* set the size of the filter, etc, list */ int cmd_list_size(String *spec) { const char *when = "is"; if (spec) { int length = atoi(spec); if (length != lookup.list.size) /* no work if already that size */ { void *mem = 0; if (length) { mem = xmalloc(length * sizeof(lookup.list.array[0])); if (mem == 0) { outputf("%scan't allocate memory for filter " "list of length %d.\n", lookup.where, length); return COMMAND_HAS_ERROR; } } if (lookup.list.array) free(lookup.list.array); lookup.list.array = mem; lookup.list.size = length; lookup.list.used = 0; } if (!lookup.flag.verbose) return COMMAND_RUNS_OK; when = "now"; } outputf("maximum filter list size %s: %d.\n", when, lookup.list.size); if (lookup.list.used) outputf("current lines held: %d.\n", lookup.list.used); return COMMAND_RUNS_OK; } static int cmd_set_limit(String *spec) { const char *when = "is"; if (spec) { int value = atoi(spec); if (value < 0) { outputf("%slimit of %d makes no sense.\n", lookup.where, value); return COMMAND_HAS_ERROR; } lookup.max_lines_to_print = value; if (!lookup.flag.verbose) return COMMAND_RUNS_OK; when = "now"; } if (lookup.max_lines_to_print == 0) outputf("line-per-search limit %s disabled.\n", when); else outputf("line-per-search limit %s %ld.\n", when, lookup.max_lines_to_print); return COMMAND_RUNS_OK; } static int cmd_do_search(String *search) { string *copy; int clear_slot_when_done = 0; if (lookup.slot == 0) { if (lookup.default_slot) { lookup.slot = lookup.default_slot; clear_slot_when_done = 0; } else { outputf("%sno files loaded.\n", lookup.where); return COMMAND_HAS_ERROR; } } copy = strsave(search); (void)process_input_line(copy, 1); free(copy); if (clear_slot_when_done) lookup.slot = 0; return COMMAND_RUNS_OK; } static int cmd_if(String *expr, String *command) { int value = 
eval(expr); if (eval_error_val != EVAL_OK) { outputf("ERROR: %s\nAT-----", expr); while (expr++ < eval_error_loc) outchar('-'); outputf("^\n%sexpression: [%s]\n", lookup.where, eval_errstr[eval_error_val]); return COMMAND_HAS_ERROR; } if (value) { int length = str_len(command)+1; string *writable = xmalloc(length); MOVE_MEMORY(command, writable, length); parse_command(writable, str_len(writable), CMD_GENERAL|CMD_FILE_ONLY, 0); free(writable); } return COMMAND_RUNS_OK; } static int cmd_msg(String *msg) { #ifdef HAVE_SPINNER /* output an initial space if the first char is a tab, so that it will be sure to erase any spinner */ if (msg[0] == '\t' && lookup.spinner.interval) { #ifndef LOG_FILE_SUPPORT outchar(' '); #else /* * But we don't want to output the space to the log file, * (if there is one), so we have to be careful about that. */ int log_fd = set_extra_output_file(JUST_CHECKING_OUTPUT_FILE); if (current_log_file == 0) outchar(' '); else { flush_output(); set_extra_output_file(NO_OUTPUT_FILE); /* turn logging off */ outchar(' '); flush_output(); set_extra_output_file(log_fd); } #endif /* LOG_FILE_SUPPORT */ } #endif /* HAVE_SPINNER */ output((const char *)msg); outchar('\n'); return COMMAND_RUNS_OK; } static int cmd_exit(void) { exit_program_now = 1; return COMMAND_RUNS_OK; } static int cmd_tag(String *boolean, String *tag) { const char *when = "is"; kibishii_assert(lookup.slot); if (tag) { if (COMBO(lookup.slot)) { outputf("%sMay not set a tag string for a combo slot.\n", lookup.where); return COMMAND_HAS_ERROR; } else { if (lookup.slot->tag_string) free(lookup.slot->tag_string); lookup.slot->tag_string = strsave(tag); when = "now"; } } if (boolean) { if (!COMBO(lookup.slot) && lookup.slot->tag_string == 0) { outputf("%sno tag string to toggle.\n", lookup.where); return COMMAND_RUNS_OK; /* make a non-fatal error in cmd files */ } lookup.slot->default_flag.tag = interpret_boolean(boolean); when = "now"; } if ((tag || boolean) && !lookup.flag.verbose) return 
COMMAND_RUNS_OK; if (COMBO(lookup.slot)) outputf("tag mode for combo #%d %s %s.\n", slot_num(lookup.slot), when, lookup.slot->default_flag.tag ? "on" : "off"); else if (lookup.slot->tag_string == 0) outputf("there is no tag for file #%d.\n", slot_num(lookup.slot)); else outputf("tag for file #%d %s" quote(%s) "(%s).\n", slot_num(lookup.slot), when, lookup.slot->tag_string, lookup.slot->default_flag.tag ? "enabled" : "disabled"); return COMMAND_RUNS_OK; } /* set the spinner value */ static int cmd_set_spinner(String *spec) { #ifdef HAVE_SPINNER const char *when = "is"; if (spec) { lookup.spinner.interval = atoi(spec); if (!lookup.flag.verbose) return COMMAND_RUNS_OK; when = "now"; } if (lookup.spinner.interval == 0) outputf("spinner %s disabled.\n", when); else outputf("spinner %s set to: %d.\n", when, lookup.spinner.interval); #else /* HAVE_SPINNER */ outputf("no spinner support.\n"); #endif /* HAVE_SPINNER */ return COMMAND_RUNS_OK; } int cmd_show(void) { int i; if (lookup.list.size == 0) output("the save list is disabled (its size is zero).\n"); else if (lookup.list.used == 0) output("there are currently no saved lines.\n"); else { unsigned old_regexec_flags = regexec_setflags(lookup.flag.regex_debug ? 
REGEXEC_DEBUG : 0); for (i = 0; i < lookup.list.used; i++) { const struct slot_info *slot = lookup.list.array[i].slot; String *line; unsigned len; line = VirtPos2Str(slot->file->v, lookup.list.array[i].line, &len); output_line(slot, line, len); } regexec_setflags(old_regexec_flags); } return COMMAND_RUNS_OK; } static void add_combo_slot(struct slot_info *parent, unsigned parent_slot_num, unsigned new_slot_num) { unsigned i; kibishii_assert(parent_slot_num == lookup.slots || parent_slot_num == slot_num(parent)); kibishii_assert(new_slot_num < lookup.slots); kibishii_assert(!COMBO(lookup.slot_info[new_slot_num])); for (i = 0; i < parent->combo.entries; i++) if (parent->combo.entry[i] == new_slot_num) { if (lookup.flag.verbose) outputf(" (%d)", new_slot_num); return; } /* * I don't see how the following could be possible if we get this * far, so I make it a kibishii assert. */ kibishii_assert(parent->combo.entries < MAX_LOADED_FILES); parent->combo.entry[parent->combo.entries++] = new_slot_num; if (lookup.flag.verbose) outputf(" %d", new_slot_num); } static int cmd_combine(String *name, int num, String *new_nums) { int parent_slot_num; struct slot_info *slot; unsigned new_num_list[MAX_LOADED_FILES]; unsigned new_num_next = 0; unsigned new_num_seen[MAX_LOADED_FILES]; const char *what; int i; kibishii_assert(new_nums != 0); for (i = 0; i < MAX_LOADED_FILES; i++) new_num_seen[i] = 0; /* check new num list to make sure all nums are valid */ do { int new_num = 0; /* skip to first digit (past a possible '#' or whitespace, etc.) 
*/ while (*new_nums && !isdigit(*new_nums)) new_nums++; kibishii_assert(isdigit(*new_nums)); while (isdigit(*new_nums)) new_num = new_num * 10 + (*new_nums++ - '0'); if (new_num >= lookup.slots) { outputf("%sno such slot #%d\n", lookup.where, new_num); return COMMAND_HAS_ERROR; } if (!new_num_seen[new_num]) { kibishii_assert(new_num_next < MAX_LOADED_FILES); new_num_seen[new_num] = 0; new_num_list[new_num_next++] = new_num; } /* skip along until end of line or another number */ while (*new_nums && !isdigit(*new_nums)) new_nums++; } while (*new_nums); if (num >= 0) { /* use previous combo */ what = "adding to"; if (num > lookup.slots || !COMBO(lookup.slot_info[num])) { outputf("%sno previous combo in slot %d!\n", lookup.where, num); return COMMAND_HAS_ERROR; } slot = lookup.slot_info[num]; lookup.slot = slot; /* needed during startup */ parent_slot_num = slot_num(slot); } else { /* need a new slot */ what = "creating"; if (lookup.slots >= MAX_LOADED_FILES) { outputf("%scan't combine (too many loaded slots, max is %d)\n", lookup.where, MAX_LOADED_FILES); return COMMAND_HAS_ERROR; } slot = xmalloc(sizeof(struct slot_info)); bzero(slot, sizeof(struct slot_info)); #ifdef KIBISHII_DEBUG { char *ptr = (char*)&slot->onefile_or_combo; char *end = ((char*)slot) + sizeof(struct slot_info); while (ptr < end) *ptr++ = 0x7f; /* fill with some garbage */ slot->combo.name = 0; /* but leave these null */ slot->combo.entries = 0;/* but leave these null */ } #endif if (!name) name = (String *)"combo"; parent_slot_num = lookup.slots; lookup.slot_info[lookup.slots++] = slot; /* install new slot */ slot->default_flag.word = 1; slot->default_flag.fuzz = 1; slot->default_flag.fold = 1; slot->default_flag.highlight = 1; slot->default_flag.filter = 1; slot->default_flag.modify = 1; slot->default_flag.autokana = lookup.flag.autokana; slot->default_flag.tag = 1; slot->default_flag.display = 1; lookup.slot = slot; /* needed during startup */ } kibishii_assert(COMBO(slot)); if 
(lookup.flag.verbose) outputf("%s combo slot #%d", what, parent_slot_num); if (name) { if (slot->combo.name) free(slot->combo.name); slot->combo.name = strsave(name); outputf("(%s)", name); } outchar(':'); /* add new numbs */ for (i = 0; i < new_num_next; i++) { int new_num = new_num_list[i]; if (!COMBO(lookup.slot_info[new_num])) add_combo_slot(slot, parent_slot_num, new_num); else { /* bring in entries from that slot */ unsigned j; for (j = 0; j < lookup.slot_info[new_num]->combo.entries; j++) { int new_new_num = lookup.slot_info[new_num]->combo.entry[j]; kibishii_assert(!COMBO(lookup.slot_info[new_new_num])); if (!new_num_seen[new_new_num]) add_combo_slot(slot, parent_slot_num, new_new_num); } } } if (lookup.flag.verbose) outchar('\n'); return COMMAND_RUNS_OK; } static int cmd_toggle_modify(String *bool) { const char *when = ""; kibishii_assert(lookup.slot); if (!COMBO(lookup.slot) && lookup.slot->modify_spec.pattern == 0) { outputf("%sno modification installed.\n", lookup.where); return bool ? COMMAND_HAS_ERROR : COMMAND_RUNS_OK; } if (bool) { lookup.slot->default_flag.modify = interpret_boolean(bool); if (!lookup.flag.verbose) return COMMAND_RUNS_OK; when = "now "; } output("modify "); if (COMBO(lookup.slot)) { outputf("flag is %s for combo slot #%d.\n", lookup.slot->default_flag.modify ? "on" : "off", slot_num(lookup.slot)); } else { outputf("(%s%sabled):" quote(%s) "with ", when, lookup.slot->default_flag.modify ? 
"en" : "dis", lookup.slot->modify_spec.pattern); if (lookup.slot->modify_spec.replacement[0] == 0) output("nothing\n"); else outputf(quote(%s) "\n", lookup.slot->modify_spec.replacement); } return COMMAND_RUNS_OK; } static int cmd_modify(String *pattern, String *replacement, int is_case_insensitive, int is_global) { int i; kibishii_assert(lookup.slot); if (COMBO(lookup.slot)) { outputf("%sslot is a combo; no modify allowed.\n", lookup.where); return COMMAND_HAS_ERROR; } assert(pattern); assert(replacement); if (lookup.slots == 0) { outputf("%sno loaded file for modification.\n", lookup.where); return COMMAND_HAS_ERROR; } if (lookup.slot->modify_spec.pattern) { free(lookup.slot->modify_spec.pattern); regfree(&lookup.slot->modify_spec.regex); free(lookup.slot->modify_spec.replacement); } lookup.slot->modify_spec.pattern = lookup.slot->modify_spec.replacement = 0; i = regcomp(&lookup.slot->modify_spec.regex, pattern, REGCOMP_SAVE_MATCHED_PAREN_INFO | (is_case_insensitive ? REGCOMP_IGNORE_CASE : 0)); if (i != REGCOMP_SUCCESS) { outputf("%s%s.\n", lookup.where, (string *)regcomp_error_report()); return COMMAND_HAS_ERROR; } lookup.slot->modify_spec.pattern = strsave(pattern); lookup.slot->modify_spec.replacement = strsave(replacement); lookup.slot->modify_spec.global = is_global; lookup.slot->default_flag.modify = 1; return COMMAND_RUNS_OK; } /* * At most one of NUMBER, NAME, or USE_DEFAULT will be non-null. 
*/ static int cmd_select(String *number, const char *name, int use_default) { const char *when = "is"; struct slot_info *selected = 0; if (name) { int i; name = expand_filename_tilde(name); for (i = 0; i < lookup.slots && !selected; i++) if ((lookup.slot_info[i]->file && (str_eql(lookup.slot_info[i]->file->short_filename, name)|| str_eql(lookup.slot_info[i]->file->v->filename, name))) || (COMBO(lookup.slot_info[i]) && str_eql(lookup.slot_info[i]->combo.name, name))) { selected = lookup.slot_info[i]; break; } if (selected == 0) { outputf("%sno" quote(%s) "slot loaded.\n", lookup.where, name); return COMMAND_HAS_ERROR; } } else if (number) { unsigned value = atoi(number); if (value >= lookup.slots) { outputf("%sno such slot #%d loaded.\n", lookup.where, value); return COMMAND_HAS_ERROR; } selected = lookup.slot_info[value]; } else if (use_default) { if (lookup.default_slot == 0) { outputf("%sno default file to use\n", lookup.where); return COMMAND_HAS_ERROR; } selected = lookup.default_slot; } if (selected) { lookup.slot = selected; /* needed during startup */ if (lookup.default_slot == selected) when = "already"; else { lookup.default_slot = selected; when = "now"; } if (!lookup.flag.verbose) return COMMAND_RUNS_OK; } if (lookup.slot == 0) { outputf("%sno file %s.\n", lookup.where, lookup.default_slot ? "selected" : "loaded"); return COMMAND_HAS_ERROR; } outputf("default slot %s #%d (%s).\n", when, slot_num(lookup.default_slot), COMBO(lookup.default_slot) ? (const char *)lookup.default_slot->combo.name : (const char *)lookup.default_slot->file->v->filename); return COMMAND_RUNS_OK; } static int cmd_list_files(int help) { #define ALTTOP "┏━━━━━━━━━━━━━━" #define TOP "━┳━━━━━┯━━┳━━━┳" #define BOTTOM "┗━┻━━━━━┷━━┻━━━┻" static char LINE[] = "━━━━━━━━━━━━━━━━━━━━━━━━" "━━━━━━━━━━━━━━━━━━━━━━━━━━━━"; #define min(A,B) ((A)<(B)?(A):(B)) int i; int extra = output_pager_columns(0) ? 
output_pager_columns(0) - sizeof(TOP) : 0; if (lookup.slots == 0) { outputf("%sno files loaded\n", lookup.where); return COMMAND_HAS_ERROR; } if (extra <= 0) extra = 0; else { extra = min(extra, sizeof(LINE)-1); extra &= ~1; /* round down to an even number of bytes */ } if (!help) { outputf(ALTTOP "%.*s\n", extra, LINE); outputf("┃%s's \"Lookup\", %s (%s).\n┣", author_name, version_string,version_date); } else { output( " +-------------F: has filter; #: but disabled (!F!, filter)\n" " |+------------M: has modify; %: but disabled (!M!, modify)\n" " ||+-----------w: word-preference mode (!w!, word)\n" " |||+----------c: case folding (!c!, fold)\n" " ||||+---------f: fuzz mode (!f!, fuzz)\n" " |||||+--------h: highlight mode (!h!, highlight)\n" " ||||||+-------t: has tag; @:but disabled (!T!, tag)\n" " |||||||+------d: will display (!d!, display)\n" " |||||||| +---a: automatic kana conversion (=, autokana)\n" " |||||||| |+--P: slot has local prompt (local prompt)\n" " |||||||| ||+-I: file loaded with precomputed-index\n" " FMwcfhtd aPI\n" "┏" ); } outputf(TOP "%.*s\n", extra, LINE); for (i = 0; i < lookup.slots; i++) { char buf1[12], buf2[10]; buf1[0] = lookup.slot_info[i]->default_flag.filter ? 'F' : (!COMBO(lookup.slot_info[i]) && lookup.slot_info[i]->filter_spec.pattern) ? '#' : ' '; buf1[1] = lookup.slot_info[i]->default_flag.modify ? 'M' : (!COMBO(lookup.slot_info[i]) && lookup.slot_info[i]->modify_spec.pattern) ? '%' : ' '; buf1[2] = lookup.slot_info[i]->default_flag.word ? 'w' : ' '; buf1[3] = lookup.slot_info[i]->default_flag.fold ? 'c' : ' '; buf1[4] = lookup.slot_info[i]->default_flag.fuzz ? 'f' : ' '; buf1[5] = lookup.slot_info[i]->default_flag.glob ? 'W' : ' '; buf1[6] = lookup.slot_info[i]->default_flag.highlight ? 'h' : ' '; buf1[7] = lookup.slot_info[i]->default_flag.tag ? 't' : (!COMBO(lookup.slot_info[i]) && lookup.slot_info[i]->tag_string) ? '@' : ' '; buf1[8] = lookup.slot_info[i]->default_flag.display ? 
'd' : ' '; buf1[9] = ' '; buf1[10] = '\0'; buf2[0] = lookup.slot_info[i]->default_flag.autokana ? 'a' : ' '; buf2[1] = lookup.slot_info[i]->prompt_format ? 'P' : ' '; buf2[2] =(!COMBO(lookup.slot_info[i]) && lookup.slot_info[i]->file->indexfile) ? (IsMemIndex(lookup.slot_info[i]->file->index) ? 'i' : 'I') : ' '; buf2[3] = ' '; if (buf2[3] == ':') buf2[3] = ' '; buf2[4] = '\0'; outputf("┃%c%d┃%s│%s┃", lookup.slot_info[i] == lookup.default_slot ? '*' : ' ', i, buf1, buf2); if (!COMBO(lookup.slot_info[i])) outputf("%5dk┃%s\n", (lookup.slot_info[i]->file->v->length+1023)/1024, lookup.slot_info[i]->file->v->filename); else { unsigned int j; outputf(" combo┃%s (", lookup.slot_info[i]->combo.name); for (j = 0; j < lookup.slot_info[i]->combo.entries; j++) outputf("%s#%d", j ? ", " : "", lookup.slot_info[i]->combo.entry[j]); output(")\n"); } } outputf(BOTTOM "%.*s\n\n", extra, LINE); return COMMAND_RUNS_OK; } static int cmd_source(const char *filename) { filename = expand_filename_tilde(filename); switch (read_commands_from_file(filename, CMD_GENERAL|CMD_FILE_ONLY, 0)) { case FILE_READ_OK: return COMMAND_RUNS_OK; case FILE_NOT_FOUND: outputf("%sfile" quote(%s) "not found.\n", lookup.where, filename); break; } return COMMAND_HAS_ERROR; } static int cmd_stats(void) { outputf("%ld line%s checked, %ld matched", lookup.count.checked, lookup.count.checked == 1 ? 
"" : "s", lookup.count.matched); if (lookup.count.matched) { if (lookup.count.filtered) outputf(", %ld filtered", lookup.count.filtered); if (lookup.count.nonword) outputf(", %ld non-word", lookup.count.nonword); outputf(", %ld printed", lookup.count.printed); } output(".\n"); return COMMAND_RUNS_OK; } #define X208kuten2euc_byte1(R, C) ((R) + 160) #define X208kuten2euc_byte2(R, C) ((C) + 160) #define X208kuten2euc(R, C) X208kuten2euc_byte1(R, C),\ X208kuten2euc_byte2(R, C) static int describe(unsigned char c1, unsigned char c2) { unsigned char k1, k2, j1, j2; unsigned char C1 = c1 | 0x80; unsigned char C2 = c2 | 0x80; c1 &= 0x7f; c2 &= 0x7f; outputf(quote(%c%c) "as EUC is 0x%02x%02x (%3d %3d; \\%03o \\%03o)\n", C1, C2, C1, C2, C1, C2, C1, C2); outputf(" as JIS is 0x%02x%02x (%3d %3d; \\%03o \\%03o \"%c%c\")\n", c1, c2, c1, c2, c1, c2, c1 ,c2); k1 = c1 - 32; k2 = c2 - 32; outputf(" as KUTEN is %02d%02d ( 0x%02x%02x; \\%03o \\%03o)\n", k1, k2, k1, k2, k1, k2); j1 = ((c1 + 1) >> 1U) + (c1 < 95 ? 112 : 176); j2 = c2 + ((c1 & 1) ? (c2 > 95 ? 32 : 31) : 126); outputf(" as S-JIS is 0x%02x%02x (%3d %3d; \\%03o \\%03o)\n", j1, j2, j1, j2, j1, j2); return COMMAND_RUNS_OK; } static int cmd_describe_ascii(unsigned base, String *text) { unsigned num = 0; unsigned char c; while (c = *text++, c) { if (c >= 'a' && c <= 'f') num = num * base + c - 'a' + 10; else if (c >= 'A' && c <= 'F') num = num * base + c - 'A' + 10; else num = num * base + c - '0'; } outputf("ASCII \\%03o %d 0x%02x", num, num, num); if (isascii(num) && isprint(num)) outputf(": \"%c\"", num); outputf("\n"); return COMMAND_RUNS_OK; } /* * CHARACTER points to either a single ascii byte to describe, or * a string of double-byte characters. 
*/ static int cmd_describe_raw(String *text) { while (*text) { unsigned c1, c2; switch (EUC_CHAR_LENGTH(*text)) { default: { int len = EUC_CHAR_LENGTH(*text); outputf("%d-byte character: ", len); while (len--) outputf("0x%02x", *text++); continue; } case 2: /* "normal" EUC */ c1 = text[0]; c2 = text[1]; text += 2; break; case 1: /* ascii */ c1 = text[0]; text += 1; /* * attempt to convert to wide. */ if (c1 >= 'A' && c1 <= 'Z') { c2 = X208kuten2euc_byte2(3,33) + (c1 - 'A'); c1 = X208kuten2euc_byte1(3,33); } else if (c1 >= 'a' && c1 <= 'z') { c2 = X208kuten2euc_byte2(3,65)+ (c1 - 'a'); c1 = X208kuten2euc_byte1(3,65); } else if (c1 >= '0' && c1 <= '9') { c2 = X208kuten2euc_byte2(3,16)+ (c1 - '0'); c1 = X208kuten2euc_byte1(3,16); } else if (c1 >= ' ' && c1 <= '/') { unsigned char byte2[] = { X208kuten2euc_byte2(1, 1), /* space */ X208kuten2euc_byte2(1,10), /* ! */ X208kuten2euc_byte2(1,41), /* " */ X208kuten2euc_byte2(1,84), /* # */ X208kuten2euc_byte2(1,80), /* $ */ X208kuten2euc_byte2(1,83), /* % */ X208kuten2euc_byte2(1,85), /* & */ X208kuten2euc_byte2(1,39), /* ' */ X208kuten2euc_byte2(1,42), /* ( */ X208kuten2euc_byte2(1,43), /* ) */ X208kuten2euc_byte2(1,86), /* * */ X208kuten2euc_byte2(1,60), /* + */ X208kuten2euc_byte2(1, 4), /* , */ X208kuten2euc_byte2(1,61), /* - */ X208kuten2euc_byte2(1,03), /* . */ X208kuten2euc_byte2(1,31), /* / */ }; c2 = byte2[c1 - ' ']; c1 = X208kuten2euc_byte1(1, 0); } else if (c1 >= ':' && c1 <= '@') { unsigned char byte2[] = { X208kuten2euc_byte2(1, 7), /* : */ X208kuten2euc_byte2(1, 8), /* ; */ X208kuten2euc_byte2(1,67), /* < */ X208kuten2euc_byte2(1,65), /* = */ X208kuten2euc_byte2(1,68), /* > */ X208kuten2euc_byte2(1, 9), /* ? 
*/ X208kuten2euc_byte2(1,87), /* @ */ }; c2 = byte2[c1 - ':']; c1 = X208kuten2euc_byte1(1, 0); } else if (c1 >= '[' && c1 <= '`') { unsigned char byte2[] = { X208kuten2euc_byte2(1,46), /* [ */ X208kuten2euc_byte2(1,79), /* \ */ X208kuten2euc_byte2(1,47), /* ] */ X208kuten2euc_byte2(1,16), /* ^ */ X208kuten2euc_byte2(1,18), /* _ */ X208kuten2euc_byte2(1,38), /* ` */ }; c2 = byte2[c1 - '[']; c1 = X208kuten2euc_byte1(1, 0); } else if (c1 >= '{' && c1 <= '~') { unsigned char byte2[] = { X208kuten2euc_byte2(1,48), /* { */ X208kuten2euc_byte2(1,35), /* | */ X208kuten2euc_byte2(1,49), /* } */ X208kuten2euc_byte2(1,33), /* ~ */ }; c2 = byte2[c1 - '{']; c1 = X208kuten2euc_byte1(1, 0); } else { outputf("ascii character %d\n", c1); continue; } break; } describe(c1, c2); } return COMMAND_RUNS_OK; } /* * CODE is a four-digit kuten to describe. */ static int cmd_describe_kuten(String *code) { /* kuten will appear as "####". Interpret as decimal. */ unsigned c1 = (code[0]-'0') * 10 + (code[1]-'0'); unsigned c2 = (code[2]-'0') * 10 + (code[3]-'0'); if (c1 > 85 || c2 > 94 || (c1 >= 9 && c1 <= 15)) { warn("invalid KUTEN value %02d%02d\n", c1, c2); return COMMAND_HAS_ERROR; } c1 = (c1 | 0x80) + 32; c2 = (c2 | 0x80) + 32; return describe(c1, c2); } /* * Either CODE or ASCII will tell which character to describe... * if it's CODE, it'll be a pointer to a four-(hex)-digit string to be * taken as a JIS number. If it's ASCII, it'll be a string of 4, 8, 12, etc. * characters that are taken to be the ascii that JIS appears as, without * the escape characters to make the ascii be interpreted as JIS. * * In either case of CODE or ASCII, one the TYPE might be non-null and * be a pointer to "jis", "sjis", or "euc" to force the interpretation * to a different encoding. */ static int cmd_describe_encoding(String *type, String *code) { unsigned c1, c2; #define hexval(c) (c >= 'a' ? 10 + c - 'a' :\ c >= 'A' ? 
10 + c - 'A' : c - '0') c1 = (hexval(code[0])<<4)|hexval(code[1]); c2 = (hexval(code[2])<<4)|hexval(code[3]); if (!type) type = (String *)"JIS"; /* default type is JIS */ switch(type[0]) { default: soft_assert(0); break; case '\0': /* no type? it's JIS */ case 'j': /* jis */ case 'J': /* jis */ if (code && (c1 < 31 || c1 > 126 || c2 < 32 || c2 > 126)) { type = (String *)"JIS"; invalid: warn("invalid %s value 0x%02x%02x\n", type, c1, c2); return COMMAND_HAS_ERROR; } /* we'll just use c1 & c2 as-is */ break; case 'e': /* euc */ case 'E': /* euc */ if (code && (c1 < 161 || c1 > 254 || c2 < 161 || c2 > 254)) { type = (String *)"EUC"; goto invalid; } /* we can use c1 and c2 as-is */ break; case 's': /* sjis */ case 'S': /* sjis */ if (code && (c1 < 129 || c1 > 239 || (c1 > 159 && c1 < 224) || c2 < 64 || c2 > 252 || (c2 > 126 && c2 < 128))) { type = (String *)"SJIS"; goto invalid; } c1 = (((c1 - (c1<160 ? 112:176))<<1) - (c2<159)); c2 = (c2 - (c2<159 ? (c2>127?32:31) : 126)); break; } describe(c1, c2); return COMMAND_RUNS_OK; } static int cmd_describe_jis_string(String *str) { unsigned c1, c2; while (*str) { c1 = *(str++); c2 = *(str++); if (c1 < 31 || c1 > 126 || c2 < 32 || c2 > 126) { warn("invalid JIS value 0x%02x%02x\n", c1, c2); return COMMAND_HAS_ERROR; } describe(c1, c2); } return COMMAND_RUNS_OK; } #ifdef USE_LOCAL_OUTPUT static void show_output_style_spec(void) { unsigned output_style = select_output_style(INQUIRE_ONLY); switch(output_style & _BASIC_OUTPUT_TYPE) { default: soft_assert(0); break; case SJIS_OUTPUT: output("sjis"); break; case EUC_OUTPUT: output("euc"); break; case JIS_OUTPUT: switch(output_style & _JIS_KANJI_STYLE) { default: soft_assert(0); break; case JIS_1978_OUTPUT: output("jis78"); break; case JIS_1983_OUTPUT: output("jis83"); break; case JIS_1990_OUTPUT: output("jis90"); break; } switch(output_style & _JIS_ENGLISH_STYLE) { default: soft_assert(0); break; case JIS_ROMAN: output("-roman"); break; case JIS_ASCII: output("-ascii"); break; } 
break; } switch (output_style & _0212_1990) { default: soft_assert(0); break; case SUPPORT_0212_1990: outputf("-212"); break; case NO_0212_1990: outputf("-no212"); break; } switch (output_style & _KATAKANA) { default: soft_assert(0); break; case PASS_HW_KATANANA: output("-hwk"); break; case ELIDE_HW_KATAKANA: output("-nohwk"); break; case FOLD_HW_KATAKANA_TO_FULL: output("-foldhwk"); break; } switch(output_style & _NONDISPLAYABLE) { default: soft_assert(0); break; case OUTPUT_NONDISPLAYABLE: output("-disp"); break; case ELIDE_NONDISPLAYABLE: output("-nodisp"); break; case SHOW_NONDISPLAYABLE_CODES: output("-code"); break; case MARK_NONDISPLAYABLE: output("-mark"); } } #endif /* USE_LOCAL_OUTPUT */ int cmd_output_encoding(String *main_style, String *jis_year, String *jis_english, String *other) { #ifndef USE_LOCAL_OUTPUT output(""); #else const char *when = "is"; if (main_style) { if (main_style[0]=='e' || main_style[0]=='E') (void)select_output_style(EUC_OUTPUT); else if (main_style[0]=='s' || main_style[0]=='S') (void)select_output_style(SJIS_OUTPUT); else if (main_style[0]=='j' || main_style[0]=='j') { if (!jis_year) (void)select_output_style(JIS_OUTPUT); else if (jis_year[0] == '7') (void)select_output_style(JIS_1978_OUTPUT); else if (jis_year[0] == '8') (void)select_output_style(JIS_1983_OUTPUT); else if (jis_year[0] == '9') (void)select_output_style(JIS_1990_OUTPUT); else { assert(0); } if (jis_english) { if (jis_english[0] == 'a' || jis_english[0] == 'A') (void)select_output_style(JIS_ASCII); else if (jis_english[0] == 'r' || jis_english[0] == 'R') (void)select_output_style(JIS_ROMAN); else { assert(0); } } } else { assert(0); } } if (other) { unsigned char c; while (c = *other, c && !isalnum(c)) other++; do { String *start = other; unsigned len; while (c = *other, c && isalnum(c)) other++; len = other - start; if (!strNcmp(start, "", len)) { } else if (!strNcmp(start, "212", len)) { (void)select_output_style(SUPPORT_0212_1990); } else if (!strNcmp(start, 
"no212", len)) { (void)select_output_style(NO_0212_1990); } else if (!strNcmp(start, "hwk", len)) { (void)select_output_style(PASS_HW_KATANANA); } else if (!strNcmp(start, "nohwk", len)) { (void)select_output_style(ELIDE_HW_KATAKANA); } else if (!strNcmp(start, "foldhwk", len)) { (void)select_output_style(FOLD_HW_KATAKANA_TO_FULL); } else if (!strNcmp(start, "disp", len)) { (void)select_output_style(OUTPUT_NONDISPLAYABLE); } else if (!strNcmp(start, "nodisp", len)) { (void)select_output_style(ELIDE_NONDISPLAYABLE); } else if (!strNcmp(start, "code", len)) { (void)select_output_style(SHOW_NONDISPLAYABLE_CODES); } else if (!strNcmp(start, "mark", len)) { (void)select_output_style(MARK_NONDISPLAYABLE); } else { soft_assert(0); } while (c = *other, c && !isalnum(c)) other++; } while (c != 0); } if (main_style || other) { if (!lookup.flag.verbose) return COMMAND_RUNS_OK; when = "now"; } outputf("Output encoding %s ", when); show_output_style(); output(".\nAn exact specifier string would be ``"); show_output_style_spec(); output("''.\n"); #endif /* USE_LOCAL_OUTPUT */ return COMMAND_RUNS_OK; } static int cmd_input_encoding(String *arg) { #if !defined(USE_LOCAL_OUTPUT) || defined(SERVER_CONFIG) output(""); #else const char *when = "is"; if (arg) { if (arg[0] == 'e' || arg[0] == 'E') jreadline_highbit_input(JREADLINE_EUC); else jreadline_highbit_input(JREADLINE_SJIS); if (!lookup.flag.verbose) return COMMAND_RUNS_OK; when = "now"; } outputf("High-bit input encoding %s %s.\n", when, jreadline_highbit_input(JREADLINE_INQUIRE) == JREADLINE_EUC ? 
"EUC" : "SJIS"); #endif /* !defined(USE_LOCAL_OUTPUT) || defined(SERVER_CONFIG) */ return COMMAND_RUNS_OK; } static int cmd_encoding(String *arg) { #ifndef USE_LOCAL_OUTPUT output(""); #else if (arg) { if (arg[0] == 'e' || arg[0] == 'E') { cmd_input_encoding((String *)"euc"); cmd_output_encoding((String *)"euc", 0, 0, 0); } else if (arg[0] == 's' || arg[0] == 'S') { cmd_input_encoding((String *)"sjis"); cmd_output_encoding((String *)"sjis", 0, 0, 0); } else { if (lookup.flag.verbose) cmd_input_encoding(0); cmd_output_encoding((String *)"jis", 0, 0, 0); } } else { cmd_input_encoding(0); cmd_output_encoding(0, 0, 0, 0); } #endif /* USE_LOCAL_OUTPUT */ return COMMAND_RUNS_OK; } static int cmd_cmdchar(String *arg) { const char *when = "is"; if (arg) { lookup.cmdstart_char = *arg; if (!lookup.flag.verbose) return COMMAND_RUNS_OK; when = "now"; } outputf("command start char %s" quote(%c) ".\n", when, lookup.cmdstart_char); return COMMAND_RUNS_OK; } int cmd_version(void) { outputf("%s: %s, %s (compiled %s)\n" #ifdef SERVER_CONFIG "-- server configuration --\n" #endif /* SERVER_CONFIG */ ,(string *)lookup.prog,version_string,version_date,compile_date); outputf("Author: %s\nComments and questions to: %s\n", author_name, contact_addr); return COMMAND_RUNS_OK; } static int cmd_pager(String *boolean, String *width_str, String *height_str) { #if OUTPUT_PAGER const char *when = "is"; if (boolean) { (void)output_pager_status(interpret_boolean(boolean)); if (!lookup.flag.verbose) return COMMAND_RUNS_OK; when = "now"; } else if (height_str) { int height = atoi(height_str); (void)output_pager_lines(height); if (width_str) { int width = atoi(width_str); (void)set_jreadline_width(width); (void)output_pager_columns(width); } if (!lookup.flag.verbose) return COMMAND_RUNS_OK; when = "now"; } outputf("pager %s %s (screen registered as %dx%d).\n", when, output_pager_status(-1) ? 
"on" : "off", output_pager_columns(0), output_pager_lines(0)); #else warn("%soutput pager not compiled in, ignoring.\n", lookup.where); #endif return COMMAND_RUNS_OK; } /* * If string is in the form <....>, such as "", then set highlighting to * wrap with and . */ static int cmd_set_highlighting_style(String *style) { const char *when = "is"; if (style) { if (strcmp((const char*)style, "black") == 0) lookup.flag.hl_style = HL_STYLE_BLACK; else if (strcmp((const char*)style, "red") == 0) lookup.flag.hl_style = HL_STYLE_RED; else if (strcmp((const char*)style, "green") == 0) lookup.flag.hl_style = HL_STYLE_GREEN; else if (strcmp((const char*)style, "yellow") == 0) lookup.flag.hl_style = HL_STYLE_YELLOW; else if (strcmp((const char*)style, "blue") == 0) lookup.flag.hl_style = HL_STYLE_BLUE; else if (strcmp((const char*)style, "purple") == 0) lookup.flag.hl_style = HL_STYLE_PURPLE; else if (strcmp((const char*)style, "cyan") == 0) lookup.flag.hl_style = HL_STYLE_CYAN; else if (strcmp((const char*)style, "white") == 0) lookup.flag.hl_style = HL_STYLE_WHITE; else if (strcmp((const char*)style, "bold") == 0) lookup.flag.hl_style = HL_STYLE_BOLD; else if (strcmp((const char*)style, "blink") == 0) lookup.flag.hl_style = HL_STYLE_FLASH; else if (strcmp((const char*)style, "under") == 0) lookup.flag.hl_style = HL_STYLE_UNDERLINE; else if (style[0] != '<') lookup.flag.hl_style = HL_STYLE_INVERSE; else { lookup.flag.hl_style = HL_STYLE_HTML; /* free any previous tag before saving new one */ if (lookup.slot->highlight_tag) free(lookup.slot->highlight_tag); lookup.slot->highlight_tag = strsave(style); } if (!lookup.flag.verbose) return COMMAND_RUNS_OK; when = "now"; } outputf("highlighting (which is %s for the current slot) %s %s.\n", lookup.slot->default_flag.highlight ? "on" : "off", when, (lookup.flag.hl_style == HL_STYLE_HTML) ? (const char *)lookup.slot->highlight_tag : (lookup.flag.hl_style == HL_STYLE_BOLD ? 
"bold":"inverse")); return COMMAND_RUNS_OK; } static int cmd_error(const char *error, String *command) { outputf("%s%s to" quote(%s) "command.\n", lookup.where, error, command); return COMMAND_HAS_ERROR; } /**********************************************************************/ /**********************************************************************/ struct command { unsigned flags; String *usage; String *help; String *pattern; int (*function)(void); regex_t compiled; }; #define MAX_PARENS_NEEDED_FOR_COMMANDS 9 static matched_paren_t cmd_paren_info[MAX_PARENS_NEEDED_FOR_COMMANDS]; static string *cmd_paren[MAX_PARENS_NEEDED_FOR_COMMANDS]; static int cmd_help(String *); /* forward */ #include "commands.h" #define command_count array_elements(command) static int cmd_help(String *str) { regex_t regex; int cmd, count; if (str == 0) { for (cmd = 0; cmd < command_count; cmd++) if (command[cmd].usage && command[cmd].help) outputf("%s\n %s\n", command[cmd].usage, command[cmd].help); return COMMAND_RUNS_OK; } /* list only items that match a particular regex */ if (regcomp(®ex, str, REGCOMP_IGNORE_CASE) != REGCOMP_SUCCESS) { outputf("%s%s.\n",lookup.where, (string *)regcomp_error_report()); return COMMAND_HAS_ERROR; } for (count = cmd = 0; cmd < command_count; cmd++) { if (command[cmd].usage && command[cmd].help && (regexec(®ex,command[cmd].usage,str_len(command[cmd].usage)) || regexec(®ex,command[cmd].help,str_len(command[cmd].help)))) { if (count++ == 0) { int len = output_pager_columns(0); while (len -=2, len > 0) output("─"); outchar('\n'); } outputf("%s\n %s\n", command[cmd].usage, command[cmd].help); } } regfree(®ex); if (count == 0) outputf("%snothing appropriate\n", lookup.where); return COMMAND_RUNS_OK; } /* * Given a text line, see if it's a command and if so, execute the command. * Return one of the defines from lookup.h. * ACCEPT are flags of accepted commands, while SKIP are flags of commands * to be ignored. 
*/ int parse_command(String *line, unsigned len, unsigned accept, unsigned skip) { int cmd; if (lookup.flag.cmd_debug) outputf("parsing" quote(%.*s) "accept=%x, skip=%x.\n", (int)len, line, accept, skip); for (cmd = 0; cmd < command_count; cmd++) { int matches; matched_paren_t *prev_regexec_paren_info; unsigned prev_regexec_paren_info_size; /* skip commands that we haven't asked for */ if ((command[cmd].flags & accept) == 0) continue; /* if we've never compiled this pattern, do so now */ if ((command[cmd].flags & _IS_COMPILED_) == 0) { int i = regcomp(&command[cmd].compiled, command[cmd].pattern, REGCOMP_IGNORE_CASE|REGCOMP_SAVE_MATCHED_PAREN_INFO); if (i != REGCOMP_SUCCESS) { outputf("%sbad compile" quote(%s) "at" quote(%s) "%d.\n", lookup.where, command[cmd].pattern, regcomp_eptr, i); } assert(i == REGCOMP_SUCCESS); if (lookup.flag.cmd_debug) outputf("compiled" quote(%s) "\n", command[cmd].pattern); command[cmd].flags |= _IS_COMPILED_; } /* see if it matches */ prev_regexec_paren_info = regexec_paren_info; prev_regexec_paren_info_size = regexec_paren_info_size; regexec_paren_info = cmd_paren_info; regexec_paren_info_size = array_elements(cmd_paren_info); matches = regexec(&command[cmd].compiled, line, len); regexec_paren_info = prev_regexec_paren_info; regexec_paren_info_size = prev_regexec_paren_info_size; if (matches) { int paren, cmd_result; if (lookup.flag.cmd_debug) outputf("matches regex" quote(%s) "\n", command[cmd].pattern); if (command[cmd].flags & skip) { if (lookup.flag.debug) warn("skipping command [%.*s]\n", (int)len, line); return COMMAND_SKIPPED; } if (command[cmd].function == 0) return COMMAND_EXECUTED_OK; if (lookup.slot == 0 && (command[cmd].flags & CMD_NEEDS_SLOT)) { outputf("%sno file %s.\n", lookup.where, lookup.default_slot ? 
"selected" : "loaded"); return COMMAND_HAS_ERROR; } for (paren = 0; paren < array_elements(cmd_paren_info); paren++) { if (paren >= regexec_paren_info_used || cmd_paren_info[paren].match_start == 0 || cmd_paren_info[paren].match_end == 0) cmd_paren[paren] = 0; else { String *start = cmd_paren_info[paren].match_start; String *end = cmd_paren_info[paren].match_end; string *dest = xmalloc(end - start + 1); cmd_paren[paren] = dest; while (start < end) *dest++ = *start++; *dest = '\0'; } if (lookup.flag.cmd_debug) { if (cmd_paren[paren]) outputf("paren \\%d is [%s]\n", paren+1, cmd_paren[paren]); else outputf("paren \\%d is empty\n", paren+1); } } /* call the function */ cmd_result = (command[cmd].function)(); for (paren=0; paren < array_elements(cmd_paren_info); paren++) if (cmd_paren[paren]) { free(cmd_paren[paren]); cmd_paren[paren] = 0; } if (cmd_result != COMMAND_RUNS_OK) return COMMAND_EXECUTED_WITH_ERROR; return COMMAND_EXECUTED_OK; } } return COMMAND_NOT_FOUND; } int quick_command(String *str) { int retval, old_verbose = lookup.flag.verbose; lookup.flag.verbose = 0; retval = parse_command(str, str_len(str), CMD_GENERAL, 0); lookup.flag.verbose = old_verbose; return retval; } int read_commands_from_file(const char *file, unsigned accept, unsigned skip) { FILE *fp; #define FILE_INPUT_LINE_LEN (512 * 40) unsigned char line[FILE_INPUT_LINE_LEN]; int len, linenum = 0; String *old_where = lookup.where; struct slot_info *old_slot; int retval = FILE_READ_OK; /* open the file, return error if can't do so */ if (fp = fopen(file, "r"), fp == 0) return FILE_NOT_FOUND; lookup.where = xmalloc(str_len(file) + 20); /* * To enforce that while reading a file, the local-flag, etc. commands * only work with just-loaded files, we'll start out with "no files * loaded (from this command file)" by setting `lookup.slot' to null. */ old_slot = lookup.slot; lookup.slot = 0; /* for each line... 
*/ while (fgets((char *)line, sizeof(line), fp)) { int i; linenum++; sprintf((void*)lookup.where, "\"%s\" line %d: ", file, linenum); /* strip traling newline */ len = str_len(line); if (len > 0 && line[len-1] == '\n') line[--len] = '\0'; else { if (len == FILE_INPUT_LINE_LEN - 1) { outputf("%sline too long for internal buffer.\n", lookup.where); retval = FILE_HAS_UNKNOWN_COMMAND; break; } } i = parse_command(line, len, accept, skip); if (i == COMMAND_NOT_FOUND) { outputf("%sunknown command:\n " quote(%s) "\n", lookup.where, line); retval = FILE_HAS_UNKNOWN_COMMAND; break; } else if (i == COMMAND_EXECUTED_WITH_ERROR) { retval = FILE_HAS_BAD_COMMAND; break; } else if (exit_program_now) { exit_program_now = 0; /* exit only the source file */ break; } else if (apply_regex_abort) break; } fclose(fp); free(lookup.where); lookup.where = old_where; lookup.slot = old_slot; return FILE_READ_OK; } lookup-1.08b.orig/lookup.h0100600000014400001440000002606506173452772015200 0ustar nakaharastaff/* * Jeffrey Friedl * Omron Corporation オムロン(株) * Nagaokakyoshi, Japan 〒617長岡京下海印寺 * * jfriedl@nff.ncl.omron.co.jp * * This work is placed under the terms of the GNU General Purpose License * (the "GNU Copyleft"). */ /* * This program contains EUC-encoded Japanese text. * If some 8bit-unfriendly program were to strip it, things won't compile. * If this box -->[]<-- is empty, this file is OK. If you see a comment- * close there, it's been stripped and you're screwed. */ #ifndef MAX_LOADED_FILES # define MAX_LOADED_FILES 10 #endif #ifndef MAX_PATS_ON_ONE_LINE # define MAX_PATS_ON_ONE_LINE 10 #endif /* set defaults */ #define HAVE_SPINNER #define LOG_FILE_SUPPORT #include "lib/virtfile.h" /* * SERVER_CONFIG * See the comments in lookup.c at service_socket() for info about * the lookup server. 
*/ #ifdef SERVER_CONFIG # undef LOG_FILE_SUPPORT # undef HAVE_SPINNER # define SERV_TCP_PORT 9827 /* some random number */ #else # ifndef USE_LOCAL_OUTPUT # undef LOG_FILE_SUPPORT # endif #endif #define string unsigned char #define String const unsigned char struct flags { unsigned word:1; /* if true, whole-word matches take precidence */ unsigned fuzz:1; /* if true, fuzzification done when appropriate */ unsigned fold:1; /* if true, case folding on */ unsigned highlight:1;/* if true, matched part of line highlighted */ unsigned filter:1; /* if true, FILTER_SPEC exists and is enabled */ unsigned modify:1; /* if true, MODIFY_SPEC exists and is enabled */ unsigned autokana:1; /* if true, automatic kana conversion on */ unsigned tag:1; /* if true, TAG_STRING prepended to matched lines */ unsigned display:1; /* if true, matching lines are shown */ unsigned glob:1; /* if true, patterns are wildcard globs, not regexes */ /* the following only used in lookup.flags */ unsigned debug:1; unsigned regex_debug:1; unsigned cmd_debug:1; unsigned verbose:1; unsigned hl_style:4; #define HL_STYLE_INVERSE 0 #define HL_STYLE_BOLD 1 #define HL_STYLE_UNDERLINE 2 #define HL_STYLE_FLASH 3 #define HL_STYLE_RED 4 #define HL_STYLE_GREEN 5 #define HL_STYLE_YELLOW 6 #define HL_STYLE_BLUE 7 #define HL_STYLE_PURPLE 8 #define HL_STYLE_CYAN 9 #define HL_STYLE_WHITE 10 #define HL_STYLE_BLACK 11 #define HL_STYLE_HTML 12 /* text in highlight_tag below */ #define DECL_STYLES const char *styles[] = \ { \ "\033[7m" , /* inverse */ \ "\033[1m" , /* bold */ \ "\033[4m" , /* underline */ \ "\033[5m" , /* flashing */ \ "\033[31m", /* red */ \ "\033[32m", /* green */ \ "\033[33m", /* yellow */ \ "\033[34m", /* blue */ \ "\033[35m", /* purple */ \ "\033[36m", /* cyan */ \ "\033[37m", /* white */ \ "\033[30m", /* black */ \ }; }; /* * Stuff about a file loaded in a slot. */ struct slot_info { struct flags default_flag; /* Flags as they apply to this file. 
*/ struct flags current_flag; /* Flags as they apply during a command */ String *prompt_format; /* prompt string specific for this file */ struct fileinfo *file; /* File and index (null if a COMBO) */ union { struct { struct { /* The modify info, if any. */ String *pattern; /* Regex to match (null if no modify) */ regex_t regex; /* The compiled regex itself. */ String *replacement; /* String to replace. */ unsigned global; /* True if it's a global replacement. */ } _modify_; #define modify_spec onefile_or_combo.onefile._modify_ struct { /* The filter info, if any. */ String *pattern; /* Regex to match (null if no filte) */ regex_t regex; /* The compiled regex itself. */ String *name; /* Optional name of the filter. */ unsigned negative:1; /* True if "filter if no match". */ } _filter_; #define filter_spec onefile_or_combo.onefile._filter_ String *_tag_; /* string to print before each output line */ #define tag_string onefile_or_combo.onefile._tag_ String *_highlight_tag_; /* for HTML highlighting */ #define highlight_tag onefile_or_combo.onefile._highlight_tag_ } onefile; struct { String *name; /* name of combo */ unsigned char entries; unsigned char entry[MAX_LOADED_FILES]; } _combo_; #define combo onefile_or_combo._combo_ } onefile_or_combo; }; #define COMBO(SLOT_INFO) ((SLOT_INFO)->file == 0) /* * General global variables. 
*/ struct lookup { struct flags flag; unsigned char cmdstart_char; unsigned char percent; long int max_lines_to_print; long int lines_to_print_this_time; String *prompt_format; /* current prompt format string */ struct slot_info *slot_info[MAX_LOADED_FILES]; unsigned slots; /* number of used entries in slot[] */ struct slot_info *default_slot; /* the default selected slot */ struct slot_info *slot; /* slot to use for this command */ String *where; /* for error messages, indicates source of command */ const char *prog; /* argv[0] */ const char *prog_short; /* prog, but with any leading path stripped */ struct /* various line counts, grouped for clarity. */ { unsigned long checked; /* lines checked during search. */ unsigned long matched; /* lines that matched during search. */ unsigned long printed; /* lines printed during search. */ unsigned long nonword; /* nonword lines elided during search. */ unsigned long filtered; /* filtered lines elided during search. */ } count; struct /* the saved-line info */ { struct { const struct slot_info *slot; fileloc line; } *array; unsigned size; /* Size of array[]. */ unsigned used; /* Elements of array[] currently used. */ unsigned overflow; /* Number of elements that couldn't fit. */ } list; struct /* regexes we need to check. */ { String *pattern; /* Regex string */ regex_t regex; /* Actual compiled regex. */ unsigned char not; /* True if we want nonmatching lines. */ } search[MAX_PATS_ON_ONE_LINE]; int patterns; /* valid elements in search[] */ #ifdef HAVE_SPINNER struct /* stuff to do with the spinner */ { String *chars; unsigned short interval; unsigned char char_count; } spinner; #endif /* HAVE_SPINNER */ }; /* * Flags for read_command_from_file() and parse_command(). 
*/ #define CMD_GENERAL 0x0001 #define CMD_FILE_ONLY 0x0010 /* command allowed only in a script */ #define CMD_INTERACTIVE 0x0020 /* command not allowed in a file */ #define CMD_LOAD_RELATED 0x0040 /* skip in .lookup if cmdline load */ #define CMD_ENCODING_RELATED 0x0080 /* skip in .lookup if cmdline -jis */ #define CMD_NEEDS_SLOT 0x0100 /* only allow if lookup.slot defined */ #define _IS_COMPILED_ 0x1000 /* -- private -- */ /* * Possible return values of parse_command() */ #define COMMAND_EXECUTED_OK 10 #define COMMAND_EXECUTED_WITH_ERROR 11 #define COMMAND_NOT_FOUND 12 #define COMMAND_SKIPPED 13 /* * Return value of read_commands_from_file() */ #define FILE_READ_OK 0 #define FILE_NOT_FOUND 1 #define FILE_HAS_UNKNOWN_COMMAND 2 #define FILE_HAS_BAD_COMMAND 3 /* defined in lookup.c */ extern struct lookup lookup; extern const char *expand_filename_tilde(const char *filename); extern String *gen_prompt(String *template, int showerror); extern int slot_num(struct slot_info *s); extern int load_file(const char *filename, unsigned flag); extern void process_input_line(string *input, int forced_search); extern int exit_program_now; extern String *current_log_file; /* defined in command.c */ extern int cmd_list_size(String *spec); extern int cmd_show(void); extern int cmd_output_encoding(String *, String *, String *, String *); extern int cmd_version(void); extern int parse_command(String *, unsigned, unsigned, unsigned); extern int quick_command(String *str); extern int read_commands_from_file(const char *, unsigned, unsigned); extern int cmd_log(int closelog, int append, String *file); /* defined in apply_regex.c */ extern unsigned linelength(const struct fileinfo *fileinfo, String *line); extern void output_line(const struct slot_info *, String *, unsigned); extern void apply_regex(void); extern volatile unsigned apply_regex_abort; /* defined in version.c */ extern const char version_string[]; extern const char version_date[]; extern const char compile_date[]; extern 
const char author_name[]; extern const char contact_addr[]; /* * General cruft. */ #define array_elements(array) (sizeof(array)/sizeof(array[0])) #define generic(ptr) ((void *)(string *)(ptr)) #define Generic(ptr) ((void *)(String *)(ptr)) #define str_len(X) strlen(Generic(X)) #define strNcmp(X, Y, LEN) strncmp(Generic(X), Generic(Y), (LEN)) #define str_cmp(X, Y) strcmp(Generic(X), Generic(Y)) #define str_cpy(X, Y) strcpy(generic(X), Generic(Y)) #define str_cat(X, Y) strcat(generic(X), Generic(Y)) #define str_eql(X, Y) (!str_cmp((X), (Y))) #if !defined(__GNUC__) # if !defined(__volatile__) # define __volatile__ /*nothing; for use with volatile functions */ # endif # if !defined(__inline__) # define __inline__ /*nothing; for use with volatile functions */ # endif #endif #if 1 # define quote(stuff) "“" #stuff "”" /* do it like “this” */ #else # define quote(stuff) " \"" #stuff "\" " /* do it like "this" */ # define quote(stuff) " ``" #stuff "'' " /* do it like ``this'' */ # define quote(stuff) " `" #stuff "' " /* do it like `this' */ # define quote(stuff) "〔" #stuff "〕" /* do it like 〔this〕 */ # define quote(stuff) "【" #stuff "】" /* do it like 【this】 */ # define quote(stuff) "『" #stuff "』" /* do it like 『this』 */ # define quote(stuff) "「" #stuff "」" /* do it like 「this」 */ # define quote(stuff) "《" #stuff "》" /* do it like 《this》 */ # define quote(stuff) "‘" #stuff "’" /* do it like ‘this’ */ # define quote(stuff) "→" #stuff "←" /* do it like →this← */ #endif /* some general system-V stuff */ #if defined(__svr4__) || defined(__DGUX__) # define index strchr # define rindex strrchr # define bcopy(FROM, TO, LENGTH) memcpy(TO, FROM, LENGTH) # if !defined(__DGUX__) /* DGUX memset is broken */ # define bzero(ADDR, LENGTH) memset(ADDR, LENGTH, 0) # endif #endif lookup-1.08b.orig/dot-lookup0100600000014400001440000000175505554656442015537 0ustar nakaharastaff## ## This is an example ~/.lookup file. 
## ## turn verbose mode off during startup file processing verbose off prompt "%C([%#]%0)%!C(%w'*'%!f'raw '%n)> " spinner 200 pager on ## The filter for edict will hit for entries that ## have only one English part, and that English part ## having a pl or pn designation. load ~/lib/edict filter "name" #^[^/]+/[^/]*[^/]*/$# highlight on word on ## The filter for kanjidic will hit for entries without a ## frequency-of-use number. The modify spec will remove ## fields with the named initial code (U,N,Q,M,E, and Y) load ~/lib/kanjidic filter "uncommon" !// modify /( [UNQMEY]\S+)+//g ## Use the same filter for my local word file, ## but turn off by default. load ~/lib/local.words filter "name" #^[^/]+/[^/]*[^/]*/$# filter off highlight on word on ## Want a tag for my local words, but only when ## accessed via the combo below tag off "》" combine "words" 2 0 select words ## turn verbosity back on for interactive use. verbose on lookup-1.08b.orig/eval.c0100600000014400001440000001416706076513001014572 0ustar nakaharastaff/* * Jeffrey Friedl * Omron Corporation オムロン(株) * Nagaokakyoshi, Japan 〒617長岡京下海印寺 * * jfriedl@nff.ncl.omron.co.jp * * This work is placed under the terms of the GNU General Purpose License * (the "GNU Copyleft"). 
* * April 1994 * */ #ifndef TEST # include "lib/config.h" # include "lib/jregex.h" # include "lib/output.h" # include "lookup.h" # include "lib/assert.h" #else # include # define outputf printf #endif #include #include #include "eval.h" #ifndef array_elements # define array_elements(array) (sizeof(array)/sizeof(array[0])) #endif static jmp_buf top_level; const unsigned char *eval_errstr[] = { (const unsigned char *)"ok", (const unsigned char *)"syntax error", (const unsigned char *)"division by zero", (const unsigned char *)"end of string unexpected", (const unsigned char *)"unknown symbol", (const unsigned char *)"unmatched close paren", (const unsigned char *)"unmatched open paren", }; #define ERROR(VAL) longjmp(top_level, (VAL)) int eval_error_val = EVAL_OK; const unsigned char *eval_error_loc; static const unsigned char *str; static unsigned long true = 1; static unsigned long false = 0; static struct { const unsigned char *name; unsigned char namelen; #define NAME(STRING) (const unsigned char *)(STRING), (sizeof(STRING)-1) enum { UNSIGNED_LONG, SIGNED_LONG } type; void *pointer; } symtab[] = { #ifndef TEST { NAME("checked"), UNSIGNED_LONG, &lookup.count.checked }, { NAME("matched"), UNSIGNED_LONG, &lookup.count.matched }, { NAME("printed"), UNSIGNED_LONG, &lookup.count.printed }, { NAME("nonword"), UNSIGNED_LONG, &lookup.count.nonword }, { NAME("filtered"), UNSIGNED_LONG, &lookup.count.filtered }, #endif { NAME("true"), UNSIGNED_LONG, &true }, { NAME("false"), UNSIGNED_LONG, &false }, }; enum prec { P_EQUAL, P_BOOL, P_PLUS, P_MULT, P_HIGH }; #define P_LOWEST P_EQUAL static int evaluate(int level /*recursion*/, enum prec plevel, int doit) { int val = 0; /* skip any whitespace */ while (str[0] == ' ' || str[0] == '\t') str++; #ifndef NDEBUG if (lookup.flag.debug) outputf("EVALU: level %d, plevel %d, val %2d, doit %d: {%s}\n", level, plevel, val, doit, str); #endif switch (str[0]) { case 0: /* end of string */ ERROR(EVAL_EOS); case '(': str++; val = 
evaluate(level+1, P_LOWEST, doit); if (*str != ')') ERROR(EVAL_UNMATCHED_OPEN); str++; break; case ')': ERROR(EVAL_UNMATCHED_CLOSE); /* case '(' should nab all ')' */ case '!': /* unary bang */ str++; val = ! evaluate(level+1, P_HIGH, doit); break; case '-': /* unary minus */ str++; val = - evaluate(level+1, P_HIGH, doit); break; case '+': /* unary plus */ str++; val = evaluate(level+1, P_HIGH, doit); break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': val = str[0] - '0'; while (++str, isascii(str[0]) && isdigit(str[0])) val = val * 10 + (str[0] - '0'); break; default: if (!isascii(str[0]) || !isalpha(str[0])) ERROR(EVAL_SYNTAX); else { int i; for (i = 0; i < array_elements(symtab); i++) { if (!isalnum(str[symtab[i].namelen]) && strncmp(str, symtab[i].name, symtab[i].namelen) == 0) break; } if (i >= array_elements(symtab)) ERROR(EVAL_NOSYM); str += symtab[i].namelen; switch(symtab[i].type) { default: assert(0); case UNSIGNED_LONG: val = *(unsigned long *)(symtab[i].pointer); break; case SIGNED_LONG: val = *(long *)(symtab[i].pointer); break; } } break; } again: /* skip any whitespace */ while (str[0] == ' ' || str[0] == '\t') str++; #define threshold(PLEVEL) (plevel < PLEVEL ? 
PLEVEL : (PLEVEL + 1)) #ifndef NDEBUG if (lookup.flag.debug) outputf("EVALU: level %d, plevel %d, val %2d, doit %d: {%s}\n", level, plevel, val, doit, str); #endif switch (str[0]) { int val2; /* used as a temp by some cases */ default: syntax_error: ERROR(EVAL_SYNTAX); case ')': if (level == 0) ERROR(EVAL_UNMATCHED_OPEN); /*FALLTHROUGH*/ case 0: return val; case '*': if (plevel > P_MULT) return val; str++; val *= evaluate(level+1, threshold(P_MULT), doit); break; case '/': if (plevel > P_MULT) return val; str++; val2 = evaluate(level+1, threshold(P_MULT), doit); if (doit) { if (val2 == 0) ERROR(EVAL_DIVZERO); val /= val2; } break; case '+': if (plevel > P_PLUS) return val; str++; val += evaluate(level+1, threshold(P_PLUS), doit); break; case '-': if (plevel > P_PLUS) return val; str++; val -= evaluate(level+1, threshold(P_PLUS), doit); break; case '&': if (plevel > P_BOOL) return val; str++; if (str[0] == '&') /* allow "&&" as well as "&" */ str++; val2 = evaluate(level+1, threshold(P_BOOL), doit && val); val = val && val2; break; case '|': if (plevel > P_BOOL) return val; str++; if (str[0] == '|') /* allow "||" as well as "|" */ str++; val2 = evaluate(level+1, threshold(P_BOOL), doit && !val); val = val || val2; break; case '=': if (plevel > P_EQUAL) return val; str++; if (str[0] == '=') /* allow "==" as well as "=" */ str++; val = (val == evaluate(level+1, P_EQUAL, doit)); break; case '!': if (str[1] != '=') goto syntax_error; if (plevel > P_EQUAL) return val; str += 2; val = (val != evaluate(level+1, P_EQUAL, doit)); break; } goto again; } int eval(const unsigned char *expression_string) { str = expression_string; eval_error_val = setjmp(top_level); if (eval_error_val) { eval_error_loc = str; return 0; } return evaluate(/*level*/0, P_LOWEST, /*doit*/1); } #ifdef TEST int main(int argc, const unsigned char *argv[0]) { const unsigned char *s = argv[1]; int val = eval(s); if (eval_error_val != EVAL_OK) { printf("ERROR: %s\n%s\n", eval_errstr[eval_error_val], s); 
while (s++ < eval_error_loc) putchar('-'); printf("^\n"); } else { printf("value is %d\n", val); } return 0; } #endif lookup-1.08b.orig/eval.h0100600000014400001440000000134406076513006014575 0ustar nakaharastaff/* * Jeffrey Friedl * Omron Corporation オムロン(株) * Nagaokakyoshi, Japan 〒617長岡京下海印寺 * * jfriedl@nff.ncl.omron.co.jp * * This work is placed under the terms of the GNU General Purpose License * (the "GNU Copyleft"). * * April 1994 * */ extern int eval(const unsigned char *expression_string); #define EVAL_OK 0 #define EVAL_SYNTAX 1 #define EVAL_DIVZERO 2 #define EVAL_EOS 3 #define EVAL_NOSYM 4 #define EVAL_UNMATCHED_CLOSE 5 #define EVAL_UNMATCHED_OPEN 6 #define EVAL_MAX_ERROR EVAL_UNMATCHED_OPEN extern int eval_error_val; extern const unsigned char *eval_errstr[]; extern const unsigned char *eval_error_loc; lookup-1.08b.orig/jmake0100711000014400001440000000153705754601426014524 0ustar nakaharastaff: ## ## Can have Make-style "var=val" arguments to override the variables ## below. Also "make=options" to add additioal options to make. 
## ## To make the server, try ## jmake EXTRA=-DSERVER_CONFIG ## CC=gcc2 CFLAGS_EXTRA=-I/usr/local/lib/gcc-lib/m88k-omron-luna/latest/include #OTHERLIBS=/mach1/jfriedl/dbmalloc/libdbmalloc.a -lmach #OTHERLIBS=-lmach KIBISHII=-DKIBHISHII_DEBUG gcc=gcc2 warn=1 kibishii=1 pedantic=1 debug=1 #md=1 MANPAGE_WIDTH=-r195 ############################################################################# ## allow to be changed on jmake command line eval $@ export OTHERLIBS export KIBISHII export CFLAGS_EXTRA export MANPAGE_WIDTH export EXTRA gmake -k $make CC="$CC" gcc="$gcc" warn="$warn" debug="$debug" md="$md"\ kibishii="$kibishii" pedantic="$pedantic" 2>&1| sed \ -e '/^\//d' \ -e '/ included from /d' \ -e '/^ *from /d' lookup-1.08b.orig/jmake20100711000014400001440000000206705653571326014610 0ustar nakaharastaff: ## ## (this copy just to make the regex package test program) ## ## Can have Make-style "var=val" arguments to override the variables ## below. Also "make=options" to add additioal options to make. 
## CC=gcc2 CFLAGS_EXTRA=-I/usr/local/lib/gcc-lib/m88k-omron-luna/latest/include #OTHERLIBS=/mach1/jfriedl/dbmalloc/libdbmalloc.a -lmach #OTHERLIBS=-lmach KIBISHII=-DKIBHISHII_DEBUG gcc=gcc2 warn=1 kibishii=1 optimize=0 pedantic=1 debug=1 #md=1 MANPAGE_WIDTH=-r195 ############################################################################# ## allow to be changed on jmake command line eval $@ export OTHERLIBS export KIBISHII export CFLAGS_EXTRA export MANPAGE_WIDTH export EXTRA gmake -k $make CC="$CC" gcc="$gcc" warn="$warn" debug="$debug" md="$md" \ kibishii="$kibishii" optimize="$optimize" pedantic="$pedantic" \ regextest 2>&1| sed \ -e '/^\//d' \ -e '/ included from /d' \ -e '/^ *from /d' lookup-1.08b.orig/lookup.c0100600000014400001440000011321406300560223015143 0ustar nakaharastaff/* * Jeffrey Friedl * Omron Corporation オムロン(株) * Nagaokakyoshi, Japan 〒617長岡京下海印寺 * * jfriedl@nff.ncl.omron.co.jp * * This work is placed under the terms of the GNU General Purpose License * (the "GNU Copyleft"). * * October 1993 * */ #include "lib/config.h" #include "lib/assert.h" #include #include #include #include "lib/output.h" #include "lib/fuzzkana.h" #include "lib/loadfile.h" #include "lib/romaji2kana.h" #include "lib/jregex.h" #include "lib/strsave.h" #include "lib/replace.h" #include "lib/xmalloc.h" #include "lib/input.h" #include "lookup.h" struct lookup lookup; int UseNoMemIndex = 0; const char *expand_filename_tilde(const char *filename) { extern const char *getenv(const char *); String *home = (String*)getenv("HOME"); if (home == 0 || filename[0] != '~' || filename[1] != '/') return filename; else { static const char *new = 0; if (new) free(new); /* * Note: in calculating the length to malloc, we realize that we'll * need an extra byte for the final null, but since we won't be * using the '~' in FILENAME, it all works out. 
*/ new = xmalloc(str_len(home) + str_len(filename)); str_cpy(new, home); str_cat(new, &filename[1]); return new; } } #ifdef LOG_FILE_SUPPORT String *current_log_file = 0; #endif #ifdef SERVER_CONFIG static unsigned int serv_tcp_port = SERV_TCP_PORT; /* default port */ static int verbose_server = 0; #endif #ifndef SERVER_CONFIG #define DEFAULT_PROMPT "%C(%0 command)%!C(search [%n])> " static int in_command; # ifdef NOREADLINE # include # else # include "lib/jreadline.h" # endif /* * Getline: get a line of user input. */ static __inline__ string * getline(String *prompt) { #ifdef NOREADLINE static string linebuf[200]; String *line; unsigned len; output(prompt); flush_output(); line = fgets(linebuf, sizeof(linebuf), stdin); if (line == 0) return 0; len = str_len(linebuf); if (linebuf[len-1] == '\n') ilnebuf[len-1] = 0; return linebuf; #else /* NOREADLINE */ static string *lastline = 0, *line = 0; #ifdef LOG_FILE_SUPPORT int log_fd=0; #endif if (line) free(line); /* free the previously read line */ jreadline_auto_romaji = lookup.default_slot->default_flag.autokana; apply_regex_abort = 0; #ifdef LOG_FILE_SUPPORT if (current_log_file != 0) { log_fd = set_extra_output_file(JUST_CHECKING_OUTPUT_FILE); flush_output(); set_extra_output_file(NO_OUTPUT_FILE); /* turn logging off */ } #endif /* LOG_FILE_SUPPORT */ output_pager_reset_more(); line = readline(prompt); /* Get a line from the user. */ output_pager_reset_more(); #ifdef LOG_FILE_SUPPORT if (current_log_file != 0) { int normal_fd = set_normal_output_file(log_fd); outputf("%s%s\n", jreadline_last_prompt, line); flush_output(); set_normal_output_file(normal_fd); set_extra_output_file(log_fd); } #endif /* LOG_FILE_SUPPORT */ /* If the line has any text in it, save it on the history. 
*/ if (line && *line && !(lastline && str_eql(lastline, line))) { if (lastline) free(lastline); lastline = strsave(line); add_history(line); } return line; #endif /* NOREADLINE */ } /* If we get a kill or something, tell the apply_regex routine to abort */ static int sighandler(int sig) { if (apply_regex_abort++ > 2) exit(1); /* exit immediately if three aborts in a row. */ return 0; } static void romaji_converter(string *start_of_line, String *bufend, string **dot_p, String **eol_p, int force) { string *dot = *dot_p; string *slwp = start_of_line; /*Start of Line Without Prefixes*/ int eat_leading_slash = !force; /* if forcing, don't eat slash */ static const char *allowed = 0; /* * This works with the main input processor... both must know the * form of an input line to at least some extent. * * Skip past any line prefixes. If the first character is then * cmdstart_char, we're on a command line, so we'll just return. */ if (slwp[0] == '+') slwp++; if (slwp[0] == '!') { unsigned char c; while (c = (++slwp)[0], slwp < dot && isascii(c) && isalpha(c)) ; if (c == '!') slwp++; } if (slwp >= dot) return; if (!force) { /* * If we're on a multiple-regex line (with || or |!|) figure * the start of the regex we're on. */ string *this_regex = slwp; string *ptr; /* line "begins" with cmdstart_char no conversion */ if (this_regex[0] == lookup.cmdstart_char) return; for (ptr = dot - 1; ptr > &slwp[3]; ptr--) { if (ptr[-1] != '|') continue; if (ptr[-2] == '|' && ptr[-3] != '\\') { this_regex = ptr; break; } if (ptr[-2] == '!' && ptr[-3] == '|' && (&ptr[-4] >= start_of_line) && ptr[-4] != '\\') { this_regex = ptr; break; } } /* * If first character of regex is '=', don't do any * automatic conversion. 
*/ if (this_regex[0] == '=') return; if (this_regex[0] == '[' || this_regex[0] == '/') { allowed = std_romaji_allowed_nonletters("-^'*?+.[]<>\\$ \t"); if (this_regex[0] == '[') { this_regex[0] = '/'; std_romaji_converter(start_of_line,bufend,dot_p,eol_p,force,0); this_regex[0] = '['; goto done; } /* else this_regex[0] == '/' */ while (--dot > this_regex) if (*dot == '/') break; if (dot == this_regex) eat_leading_slash = 0; } } std_romaji_converter(start_of_line, bufend, dot_p, eol_p, force, eat_leading_slash); done: if (allowed) std_romaji_allowed_nonletters(allowed); } static int check_commandline_status(string *line, string **p_dot, string **p_eol) { int old_in_command = in_command; String *eol = *p_eol; if (line[0] == '+') line++; if (line[0] == '!') { unsigned char c; while (c = (++line)[0], line < eol && isascii(c) && isalpha(c)) ; if (c == '!') line++; } if (line >= eol) in_command = 0; else in_command = (line[0] == lookup.cmdstart_char); if (in_command != old_in_command) { (void)jreadline_mod_prompt(gen_prompt(lookup.prompt_format, 0)); return 1; } return 0; } #define PROMPT_OK 0 #define PROMPT_UNMATCHED_OPEN -1 #define PROMPT_UNMATCHED_CLOSE -2 #define PROMPT_UNEXPECTED_EOL -3 #define PROMPT_EXPECTED_PAREN_OR_QUOTE -4 #define PROMPT_OVERFLOW -5 static String *prompt_errors[] = { (String *)"ok", (String *)"unmatched open paren", (String *)"unmatched close paren", (String *)"unexpected end-of-string", (String *)"expected open paren or quote", (String *)"prompt too large" }; /* * Given a pointer into a string just beyond an open paren, * and a pointer to the end of the string, return the number of * characters to the matching close paren. If the return value is * negative, it's one of the PROMPT_* errors. 
*/ static int skip_parens(String *template, String *tend) { String *orig = template; unsigned char c; int level; for (level = 0; template < tend; template++) { if (c = *template, c == '\\') template++; else if (c == '(') level++; else if (c == ')') { if (level == 0) return template - orig; if (level == 0) return PROMPT_UNMATCHED_CLOSE; --level; } } return PROMPT_UNMATCHED_OPEN; } static string *do_prompt_buf; static int do_prompt_len; static int do_prompt_use_command; static int do_create_prompt(String *template, String *tend) { unsigned char c; String *orig_buf = do_prompt_buf; while (template < tend) { int not; String *temp; /* deal with non-special and escaped characters */ if (c = *template, c != '%') { if (c == '\\') { if (++template >= tend) return PROMPT_UNEXPECTED_EOL; c = *template; } if (do_prompt_len-- == 0) return PROMPT_OVERFLOW; *do_prompt_buf++ = c; ++template; continue; } temp = template; c = *++template; not = 0; while (c == '!') { not = !not; c = *++template; } if (template >= tend) return PROMPT_UNEXPECTED_EOL; switch(c) { int len, doit; /* "may be used uninitialized" warning OK here */ unsigned char temp_buf[3]; do_flag: kibishii_assert(doit == 0 || doit == 1); /* expect an open paren */ if (template[1] == '\'' || template[1] == '"') { unsigned char match = template[1]; temp = (template += 2); while (template < tend && *template != match) template++; if (template++ >= tend) return PROMPT_UNEXPECTED_EOL; if (doit) { len = template - temp - 1; goto dump_temp_with_len; } } else if (template[1] != '(') /* ) */ return PROMPT_EXPECTED_PAREN_OR_QUOTE; else if (len = skip_parens(&template[2], tend), len < 0) return len; else { if (doit) { int retval = do_create_prompt(&template[2], &template[2+len]); if (retval < 0) return retval; } template += 2 + len +1; } break; dump_temp: len = str_len(temp); dump_temp_with_len: kibishii_assert(len < 1234); /* some random sanity check */ if (do_prompt_len -= len, do_prompt_len < 0) return PROMPT_OVERFLOW; while 
(len-- > 0) *do_prompt_buf++ = *temp++; break; default: len = ++template - temp; goto dump_temp_with_len; case '#': ++template; temp_buf[0] = '0' + slot_num(lookup.default_slot); temp = temp_buf; len = 1; goto dump_temp_with_len; case 'n': ++template; if (COMBO(lookup.default_slot)) { kibishii_assert(lookup.default_slot->combo.name); temp = lookup.default_slot->combo.name; } else { kibishii_assert(lookup.default_slot->file->short_filename); temp = (String *)lookup.default_slot->file->short_filename; } goto dump_temp; case 'N': /*... */ ++template; if (COMBO(lookup.default_slot)) { kibishii_assert(lookup.default_slot->combo.name); temp = lookup.default_slot->combo.name; } else { kibishii_assert(lookup.default_slot->file->v->filename); temp = (String *)lookup.default_slot->file->v->filename; } goto dump_temp; case 'w': doit = lookup.default_slot->default_flag.word != not; goto do_flag; case 'W': doit = lookup.default_slot->default_flag.glob != not; goto do_flag; case 'd': doit = lookup.default_slot->default_flag.display != not; goto do_flag; case 'f': doit = lookup.default_slot->default_flag.fuzz != not; goto do_flag; case 'F': doit = (lookup.default_slot->default_flag.filter != not && (COMBO(lookup.default_slot) || lookup.default_slot->filter_spec.pattern)); goto do_flag; case 'M': doit = (lookup.default_slot->default_flag.modify != not && (COMBO(lookup.default_slot) || lookup.default_slot->modify_spec.pattern)); goto do_flag; case 'C': doit = (in_command != not); do_prompt_use_command = 1; goto do_flag; case 'c': doit = lookup.default_slot->default_flag.fold != not; goto do_flag; case 'S': ++template; temp = &lookup.cmdstart_char; len = 1; goto dump_temp_with_len; case '0': ++template; temp = (String *)lookup.prog_short; goto dump_temp; case 'l': /* is logging */ #ifdef LOG_FILE_SUPPORT doit = !!(current_log_file) != not; #else doit = 0; #endif goto do_flag; case 'L': /* logging file name, if any */ ++template; #ifdef LOG_FILE_SUPPORT if (current_log_file) { temp = 
current_log_file; goto dump_temp; } #endif break; } } return do_prompt_buf - orig_buf; } String *gen_prompt(String *template, int showerror) { static unsigned char buffer[100]; int i; do_prompt_use_command = 0; do_prompt_buf = buffer; do_prompt_len = sizeof(buffer); i = do_create_prompt(template, template + str_len(template)); if (i < 0) { if (showerror) warn("(prompt spec error: %s)\n", (unsigned)-i > array_elements(prompt_errors) ? (String*)"???" : prompt_errors[-i]); jreadline_access = 0; return template; } buffer[i] = '\0'; jreadline_access = do_prompt_use_command ? check_commandline_status: 0; return buffer; } #endif /* if not SERVER_CONFIG */ static char **temp_memory = 0; /* used in temp_malloc(), et. al. */ /* * Allocate temporary memory that will be freed when free_temp_memory * is later called. Don't use for allocating double floating point * values (due to possible alignment problems) */ static void *temp_malloc(unsigned size) { char **mem = xmalloc(size + /*enough for hidden link*/ sizeof(mem)); *(char ***)mem = temp_memory; /* Link the chain to the new memory */ temp_memory = mem; /* and the new memory to the chain. */ return &mem[1]; /* Return the requested non-hidden memory */ } /* * Free whatever memory has been allocated via temp_malloc. */ static void free_temp_memory(void) { while (temp_memory) { char *tmp = (char *)temp_memory; temp_memory = (char **)*temp_memory; free(tmp); } } /* * return true if OK, zero on error. 
*/ static __inline__ int prepare_line(int print_only, string *line, unsigned len, int not) { int convert_from_romaji = 0; if (lookup.patterns >= MAX_PATS_ON_ONE_LINE) { outputf("maximum of %d regexes per line\n", MAX_PATS_ON_ONE_LINE); return /*BAD*/0; } /* if pattern begins with '=', skip that character */ if (line[0] == '=') { line++; len--; } else if (line[0] == '[') { line[0] = '<'; if (line[len-1] == ']') line[len-1] = '>'; convert_from_romaji = 1; } else if (line[0] == '/') { convert_from_romaji = 1; line++; len--; } if (line[0] == '\0') { output("≪no pattern!≫\n"); return /*BAD*/0; } if (lookup.slot->current_flag.glob) { unsigned char *new = temp_malloc(len * 2); unsigned char *dest = new, c; int is_word = lookup.slot->current_flag.word || line[0] == '<' || line[0] == '['; while (c = *line++, c) { if (c == '.' || c == '+') { *dest++ = '\\'; *dest++ = c; } else if (c == '*') { /* * If in word mode or pattern begins with < or [, * a wildcard '*' will become "\S*". Otherwise, ".*" */ if (!is_word) { *dest++ = '.'; } else { *dest++ = '\\'; *dest++ = 'S'; } *dest++ = '*'; /* * Just in case there were multiple *** in a row, eat them. */ while (*line == '*') line++; } else if (c == '?') { /* similar to '*' above, will become "." or "\S" */ if (!is_word) { *dest++ = '.'; } else { *dest++ = '\\'; *dest++ = 'S'; } } else { *dest++ = c; } } *dest = '\0'; line = new; } if (convert_from_romaji) { struct romaji2kana_info info; string *kana; int error = romaji2kana(line, line + len, 0, 0, &info); if (error < 0) { outputf("≪romaji2kana returns %d≫\n", error); return /*BAD*/0; } if (info.modified) { /* get memory and do conversion */ kana = temp_malloc(info.k_buf_used); error = romaji2kana(line, line+len, kana , info.k_buf_used, 0); if (error) { kibishii_assert(error > 0); /* < 0 case taken care of above */ outputf("≪bad kana conversion %d" quote(%s) "≫\n", error, kana); return /*BAD*/0; } line = kana; } if (lookup.slot->current_flag.fuzz) { /* * No fuzz if *, +, or ? 
is used to modify a non-ASCII */ unsigned const char *ptr = &line[1]; while (*ptr) { if (ptr[-1] >= 0x80 && (*ptr=='*' || *ptr=='+' || *ptr=='?')) break; ptr++; } if (!*ptr) { unsigned buflen = fuzzkana(line, 0, 0, FUZZ_ALL); string *Fuzz; kibishii_assert(buflen != 0); fuzzkana(line, Fuzz = temp_malloc(buflen), buflen, FUZZ_ALL); line = Fuzz; } } } if (print_only) { outputf("%s%s" quote(%s) "\n", lookup.patterns ? "and" : "a match is", not ? " not" : "", line); } else { lookup.search[lookup.patterns].pattern = line; lookup.search[lookup.patterns].not = not; } lookup.patterns++; return /* OK */1; } static __volatile__ void usage(void) { warn("usage: %s [FLAGS] [FILES...]\n", lookup.prog); output( " -help :: report this list.\n" " -version :: show program version string and exit.\n" #ifdef SERVER_CONFIG " -verbose :: have server report verbosely to stdout.\n" #else " -verbose :: turn index-creation verbosity on\n" #endif " -debug :: turn on general debugging flag\n" " -writeindex :: create indices for FILES and exit.\n" " -percent # :: parameter for created indices.\n" " -jis | -euc | -sjis :: select encoding.\n" #ifndef SERVER_CONFIG " -norc :: (this is the default for a server)\n" #else " -norc :: don't read '~/.lookup'.\n" #endif " -rc FILE :: use FILE as the rc file.\n" " -noindex :: don't load FILE's precomputed index\n" " -cmddebug :: turn on command debugging flag\n" " -regexdebug :: turn on regex debugging flag\n" #ifdef SERVER_CONFIG " -port # :: use given port number.\n" #endif ); #ifdef SERVER_CONFIG outputf("Default port number is %d.\n", serv_tcp_port); #endif exit(1); } #ifndef SERVER_CONFIG #if OUTPUT_PAGER || !NOREADLINE /* if we need the window size */ /* * Taken from GNU emacs source -- Define the 4.3 names in terms of the * Sun names if the latter exist and the former do not. 
*/ #ifdef TIOCGSIZE # ifndef TIOCGWINSZ # define TIOCGWINSZ TIOCGSIZE # define winsize ttysize # define ws_row ts_lines # define ws_col ts_cols #endif #endif #ifdef TIOCGWINSZ #define GET_WINDOW_SIZE get_window_size_bsd int GET_WINDOW_SIZE(/*unused*/int signum) { struct winsize w; if (ioctl(2, TIOCGWINSZ, &w) == 0 && w.ws_row > 0) { #if OUTPUT_PAGER (void)output_pager_columns(w.ws_col); (void)output_pager_lines(w.ws_row < 5 ? w.ws_row : w.ws_row - 2); #endif #if !NOREADLINE (void)set_jreadline_width(w.ws_col); #endif } #ifdef SIGWINCH signal(SIGWINCH, GET_WINDOW_SIZE); #endif return 0; } #endif /* ifdef TIOCGWINSZ */ #endif /* OUTPUT_PAGER || !NOREADLINE */ #endif /* SERVER_CONFIG */ int slot_num(struct slot_info *s) { unsigned i; for (i = 0; i < lookup.slots; i++) if (lookup.slot_info[i] == s) return (int)i; kibishii_assert(0); return -1; } int exit_program_now = 0; void process_input_line(string *input, int forced_search) { int left_side_is_not = 0; int print_only = 0; int len; kibishii_assert(!exit_program_now); kibishii_assert(input && input[0]); if (input[0] == '?' && input[1] == '\0') /* special request for help */ { outputf("Command indicator is" quote(%c) "; use" quote(%chelp) "for help.\n", lookup.cmdstart_char, lookup.cmdstart_char); return; } /* * If the line starts with '+', then we'll only print the pattern * we would otherwise search for. 
*/ if (input[0] == '+') { input++; print_only = 1; } if (len = str_len(input), len == 0) return; lookup.slot = lookup.default_slot; /* default file if no ,# */ /* ending with "," means "apply command to th file" */ if (len>2 && input[len-2]==',' && input[len-1]>='0' && input[len-1]<='9') { int value = input[len-1] - '0'; if (value >= lookup.slots) { outputf("Nothing loaded to slot #%d.\n", value); return; } len -= 2; lookup.slot = lookup.slot_info[value]; kibishii_assert(lookup.slot != 0); } else { /* allow an embedded ",1||" type thing */ int i; for (i = 2; i < len; i++) { if (input[i] == ',' && input[i+1] >= '0' && input[i+1] <= '9' && input[i+2] == '|' && (input[i+3] == '|' || (input[i+3] == '!' && input[i+4] == '|'))) { int value = input[i+1] - '0'; if (value >= lookup.slots) continue; /* quietly ignore ones that don't match */ lookup.slot = lookup.slot_info[value]; len -= 2; MOVE_MEMORY(&input[i+2], &input[i], len-i+1/*+1 for null*/); break; } } } lookup.slot->current_flag = lookup.slot->default_flag; /* * If the line starts with a '!', read search modifiers. * Stop after the first non-letter. Eat a following '!' if there. */ if (input[0] == '!') { unsigned char c; int count = 0; int error = 0; while (len--, c = (++input)[0], !error && isascii(c)&&isalpha(c)) { count++; switch(c) { default: outputf("unknown !-code " quote(%c) "; use !? 
for list.\n",c); error = 1; break; case 'w': lookup.slot->current_flag.word ^= 1; break; case 'W': lookup.slot->current_flag.glob ^= 1; break; case 'M': if (COMBO(lookup.slot) || lookup.slot->modify_spec.pattern) lookup.slot->current_flag.modify ^= 1; break; case 'F': if (COMBO(lookup.slot) || lookup.slot->filter_spec.pattern) lookup.slot->current_flag.filter ^= 1; break; case 't': if (COMBO(lookup.slot) || lookup.slot->tag_string) lookup.slot->current_flag.tag ^= 1; break; case 'c': lookup.slot->current_flag.fold ^= 1; break; case 'd': lookup.slot->current_flag.display ^= 1; break; case 'f': lookup.slot->current_flag.fuzz ^= 1; break; case 'r': /* special "raw" (no fuzz) */ lookup.slot->current_flag.fuzz = 0; break; case 'h': lookup.slot->current_flag.highlight ^= 1; break; } } if (c == '?') { outputf("┏━┯━━━━━━━━━━━━━━┯━┓\n" "┃ F│toggle filtration │%s┃\n" "┃ M│toggle modification │%s┃\n" "┃ w│toggle word-preference mode │%s┃\n", lookup.slot->default_flag.filter ? "Х" : " ", lookup.slot->default_flag.modify ? "Х" : " ", lookup.slot->default_flag.word ? "Х" : " "); outputf("┃ c│toggle case folding │%s┃\n" "┃ f│toggle fuzzification │%s┃\n" "┃ r│raw (force fuzzification off) ┃\n" "┃ W│toggle wildcard-pattern mode│%s┃\n", lookup.slot->default_flag.fold ? "Х" : " ", lookup.slot->default_flag.fuzz ? "Х" : " ", lookup.slot->default_flag.glob ? "Х" : " "); outputf( "┃ h│toggle match highlighting │%s┃\n" "┃ t│toggle tagging │%s┃\n" "┃ d│toggle displaying │%s┃\n" "┗━┷━━━━━━━━━━━━━━┷━┛\n", lookup.slot->default_flag.highlight ? "Х" : " ", lookup.slot->default_flag.display ? "Х" : " ", lookup.slot->default_flag.tag ? "Х" : " "); return; } if (error) return; if (count == 0) { /* was simply a "!!", so do default "!", which is "!f" */ lookup.slot->current_flag.fuzz ^= 1; } if (c == '!') /* eat a trailing '!' 
if there */ { input++; len--; } } kibishii_assert(len >= 0); if (len == 0) return; /* * beginning with cmdstart_char makes it a command, unless we were * told that it's a search regardless */ if (!forced_search && input[0] == lookup.cmdstart_char) { /* skip the beginning-of-command character */ while (*input == lookup.cmdstart_char) { input++; len--; } /* also skip spaces */ while (*input == ' ') { input++; len--; } if (len && parse_command(input,len,CMD_GENERAL,0) == COMMAND_NOT_FOUND) outputf("unknown command" quote(%s) "\n", input); return; } input[len] = '\0'; /* ensure a null-terminated line */ lookup.patterns = 0; left_side_is_not = 0; while (input) { string *ptr = input; int right_side_is_not = 0; string *nextstart = 0; kibishii_assert(*input != 0); while (ptr[0] != '\0') { if (ptr[0] == '\\' && ptr[1] != 0) ptr += 2; else if (ptr[0] == '|' && ptr[1] == '|') { nextstart = &ptr[2]; ptr[0] = '\0'; break; } else if (ptr[0] == '|' && ptr[1] == '!' && ptr[2] == '|') { nextstart = &ptr[3]; right_side_is_not = 1; ptr[0] = '\0'; break; } else { ptr++; } } if (input == ptr) { outputf("empty regex clause before" quote(%s) ".\n", input); return; } if (nextstart && *nextstart == 0) { outputf("expected regex after final" quote(|%s) ", aborting\n", &ptr[1]); return; } /* have a line to deal with */ if (!prepare_line(print_only, input, ptr-input, left_side_is_not)) return; input = nextstart; left_side_is_not = right_side_is_not; } if (lookup.patterns == 0) { output("no patterns!\n"); return; } if (print_only) return; apply_regex(); free_temp_memory(); if (!apply_regex_abort && lookup.slot->current_flag.display) { if (!lookup.count.printed && lookup.count.nonword) { output("≪no whole words found, non-words shown here≫\n"); cmd_show(); lookup.count.printed += lookup.list.used; if (lookup.count.filtered) { outputf("≪additionally, %ld ", lookup.count.filtered); if (!COMBO(lookup.slot) && lookup.slot->filter_spec.name != 0) outputf(quote(%s), lookup.slot->filter_spec.name); 
output("filtered and discarded≫\n"); } return; } if (lookup.list.used == 1 && (lookup.count.filtered + lookup.count.nonword) == 1) { cmd_show(); /* aw, just show */ lookup.count.printed += lookup.list.used; return; } } if (lookup.count.filtered || lookup.count.nonword) { if (lookup.flag.debug) outputf("[%ld checked; %ld matched; %ld filtered; " "%ld nonword; %ld printed]\n", lookup.count.checked, lookup.count.matched, lookup.count.filtered, lookup.count.nonword, lookup.count.printed); output("≪elided: "); if (lookup.count.filtered) { if (COMBO(lookup.slot) || lookup.slot->filter_spec.name == 0) outputf("%ld filtered", lookup.count.filtered); else outputf("%ld" quote(%s) "filtered", lookup.count.filtered, lookup.slot->filter_spec.name); } if (lookup.count.nonword) outputf("%s%ld nonword", lookup.count.filtered ? "; " : "", lookup.count.nonword); if (!lookup.list.used) outputf("≫\n"); else { outputf(" (use" quote(%cshow) "to display", lookup.cmdstart_char); if (lookup.list.overflow) outputf(" first %d", lookup.list.used); outputf(")≫\n"); } } } /* * Given a filename and some flags to describe how, load. 
* Returns the slot number, or a negative number on error; */ int load_file(const char *filename, unsigned flag) { const char *action = "load"; struct fileinfo *fileinfo = 0; int i; for (i = 0; i < lookup.slots; i++) if (!COMBO(lookup.slot_info[i]) && str_eql(filename, lookup.slot_info[i]->file->v->filename)) { fileinfo = lookup.slot_info[i]->file; action = "link"; break; } if (fileinfo == 0) { if (lookup.slots >= MAX_LOADED_FILES) { outputf("%scan't load [%s] (too many loaded files, max is %d)\n", lookup.where, filename, MAX_LOADED_FILES); return /*ERROR*/-1; } if (UseNoMemIndex) flag |= LOADFILE_NO_MEM_INDEX; if (fileinfo = loadfile(filename, lookup.percent, flag), fileinfo == 0) return /*ERROR*/-1; if (flag & LOADFILE_WRITEINDEX) return 0; /* no need to save to slot if just writing and exiting */ if (lookup.flag.verbose) outputf("loaded \"%s\".\n", filename); } lookup.slot_info[lookup.slots] = xmalloc(sizeof(struct slot_info)); bzero(lookup.slot_info[lookup.slots], sizeof(struct slot_info)); lookup.slot_info[lookup.slots]->file = fileinfo; lookup.slot_info[lookup.slots]->default_flag = lookup.flag; lookup.slot_info[lookup.slots]->default_flag.filter = 0; /* just 2 b sure*/ lookup.slot_info[lookup.slots]->default_flag.modify = 0; /* just 2 b sure*/ if (lookup.flag.verbose) outputf("\r%sed \"%s\" to slot %d\n", action, filename, lookup.slots); return lookup.slots++; } static __inline__ void startup(int argc, const char *argv[]) { unsigned cmd_line_loadfile_flags = 0; const char *rc_file_name = 0; String *encoding = 0; unsigned read_rc_skip_mask = 0; #ifndef SERVER_CONFIG int read_rc = 1; #else /* SERVER_CONFIG */ int read_rc = 0; #endif /* SERVER_CONFIG */ if (argv[0] == 0) lookup.prog = lookup.prog_short = "lookup"; else { const char *ptr; lookup.prog = argv[0]; for (ptr = lookup.prog_short = lookup.prog; *ptr; ptr++) if (*ptr == '/') lookup.prog_short = ptr+1; } lookup.where = (String *)"commandline: "; while (++argv, --argc > 0&& argv[0][0] == '-') { if 
(str_eql(*argv, "-help")) usage(); else if (str_eql(*argv, "-verbose")) { cmd_line_loadfile_flags |= INDEX_REPORT_STATS; #ifdef SERVER_CONFIG verbose_server = 1; /* additionally, report this stuff */ #endif } else if (str_eql(*argv, "-debug")) lookup.flag.debug = 1; else if (str_eql(*argv, "-v") || str_eql(*argv, "-version")) { (void)cmd_version(); exit(0); } else if (str_eql(*argv, "-writeindex") || str_eql(*argv, "-write")) { cmd_line_loadfile_flags |= LOADFILE_WRITEINDEX; lookup.flag.verbose = 1; } else if (str_eql(*argv, "-rc")) { if (--argc <= 0) { warn("%s: expected filename arg to %s\n",lookup.prog, argv[0]); usage(); } rc_file_name = (++argv)[0]; read_rc = 1; } else if (str_eql(*argv, "-percent")) { if (--argc > 0) lookup.percent = atoi((++argv)[0]); else { warn("%s: expected arg to -percent\n", lookup.prog); usage(); } } else if (str_eql(*argv, "-jis")) encoding = (String*)"encode jis"; else if (str_eql(*argv, "-euc")) encoding = (String*)"encode euc"; else if (str_eql(*argv, "-sjis")) encoding = (String*)"encode sjis"; else if (str_eql(*argv, "-noindex")) cmd_line_loadfile_flags |= LOADFILE_READINDEX; else if (str_eql(*argv, "-regexdebug")) lookup.flag.regex_debug = 1; else if (str_eql(*argv, "-cmddebug")) lookup.flag.cmd_debug = 1; else if (str_eql(*argv, "-nomem")) UseNoMemIndex = 1; else if (str_eql(*argv, "-norc")) read_rc = 0; #ifdef SERVER_CONFIG else if (str_eql(*argv, "-port")) { if (--argc > 0) serv_tcp_port = atoi((++argv)[0]); else { warn("%s: expected arg to -port\n", lookup.prog); usage(); } } #endif /* SERVER_CONFIG */ else { warn("%s: unknown flag \"%s\".\n", lookup.prog, *argv); usage(); } } if (encoding) { quick_command(encoding); read_rc_skip_mask = CMD_ENCODING_RELATED; } /* any remaining command-line arguments are files to load */ if (argc) { /* * Set default load flags if none had been set. 
*/ if ((cmd_line_loadfile_flags & (LOADFILE_WRITEINDEX|LOADFILE_READINDEX)) == 0) { cmd_line_loadfile_flags |= LOADFILE_READifPRESENT; } while (argc-- > 0) if (load_file(argv++[0], cmd_line_loadfile_flags) < 0) die("exiting"); } /* if just writing the index, exit now */ if (cmd_line_loadfile_flags & LOADFILE_WRITEINDEX) exit(0); /* * Some more defaults for interactive use. */ cmd_list_size((String*)"100"); lookup.flag.verbose = 1; /* * If we haven't been told not to load the startup file * "~/.lookup", load it. However, if we've loaded any files via * the command line, we'll not load any files or set any filters, * etc., from the startup file. */ if (read_rc) { int i; int file_must_be_found = 1; if (rc_file_name == 0) { rc_file_name = expand_filename_tilde("~/.lookup"); if (rc_file_name[0] == '~') rc_file_name += 2; /* skip the "~/" that didn't get expanded */ file_must_be_found = 0; } if (lookup.slots != 0) read_rc_skip_mask |= CMD_LOAD_RELATED; i = read_commands_from_file(rc_file_name, CMD_GENERAL|CMD_FILE_ONLY, read_rc_skip_mask); if (file_must_be_found && i == FILE_NOT_FOUND) die("%s: startup file \"%s\" not found.\n", lookup.prog, rc_file_name); if (i != FILE_READ_OK && i != FILE_NOT_FOUND) die("aborting"); } /* if none loaded anywhere, nothing to do.... */ if (lookup.slots == 0) die("%s: no files specified\n", lookup.prog); #ifndef SERVER_CONFIG /* set default prompt if none had been set in startup file */ if (lookup.prompt_format == 0) lookup.prompt_format = strsave((String*)DEFAULT_PROMPT); #endif /* SERVER_CONFIG */ /* set default slot if non had ben set in startup file */ if (lookup.default_slot == 0) lookup.default_slot = lookup.slot_info[0]; kibishii_assert(lookup.default_slot != 0); kibishii_assert(slot_num(lookup.default_slot) < MAX_LOADED_FILES); #ifndef SERVER_CONFIG (void)set_romaji_converter(romaji_converter); signal(SIGINT, sighandler); #endif /* SERVER_CONFIG */ lookup.where = (String *)""; } #ifdef SERVER_CONFIG /* * Experimental server. 
* * Compiling 'lookup' (and its library) with SERVER_CONFIG defined will * result in a server which will listen on a socket (at port SERV_TCP_PORT) * for command lines (which would normally come from the user via the * readline interface) and spit back the results to the socket. * * The idea is that some commonly-accessed takes-time-to-load file would be * loaded via a server-configured lookup, and then a simple client program * would access it when needed. * * I grafted this in (with, as it turns out, pretty minimal fudging) to for * use by my World Wide Web Japanese/English/Japanese dictionary server as * of October 94, at: * http://www.wg.omron.co.jp/cgi-bin/j-e * * Others, however, might find it useful for other purposes. */ #include #include #include #include #define heinous_death(SYSCALL) \ { \ perror(SYSCALL); \ die("%s: heinous death at %s line %d\n", lookup.prog,__FILE__,__LINE__); \ } static void service_socket(void) { struct sockaddr_in cli_addr, serv_addr; struct protoent *proto; int socket_fd; if (proto = getprotobyname("tcp"), proto == 0) heinous_death("getprotobyname"); if (socket_fd = socket(AF_INET, SOCK_STREAM, proto->p_proto), socket_fd<0) heinous_death("socket"); bzero((void*)&serv_addr, sizeof(serv_addr)); serv_addr.sin_family = AF_INET; serv_addr.sin_addr.s_addr = htonl(INADDR_ANY); serv_addr.sin_port = htons(serv_tcp_port); if (bind(socket_fd, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) < 0) heinous_death("bind"); if (listen(socket_fd, 5) < 0) heinous_death("listen"); if (verbose_server) printf("server waiting...\n"); for (;;) { int clilen, newsockfd, i, old_output = NO_OUTPUT_FILE; int continuous = 0, first = 1; string buffer[500]; clilen = sizeof(cli_addr); newsockfd = accept(socket_fd, (struct sockaddr *)&cli_addr, &clilen); if (newsockfd < 0) heinous_death("accept"); while (i = read(newsockfd, buffer, sizeof(buffer)), i > 0) { buffer[i] = '\0'; /* ensure capped off */ if (str_eql(buffer, "-- -exit- --")) { output("--exit--\n"); 
exit(1); } if (first) { old_output = set_normal_output_file(newsockfd); first = 0; if (str_eql(buffer, "--continuous--")) { continuous = 1; output("--ok--\n"); flush_output(); continue; } } if (continuous && str_eql(buffer, "--bye--")) { output("--bye--\n"); break; } process_input_line(buffer, 0); /* Report to the server's TTY */ if (verbose_server) { printf("%s: %ld checked; %ld matched; %ld filtered; " "%ld nonword; %ld printed\n", buffer, lookup.count.checked, lookup.count.matched, lookup.count.filtered, lookup.count.nonword, lookup.count.printed); } /* if not continuous, always exit after the first line */ if (!continuous) break; output("--done--\n"); /* tell that no more output coming */ flush_output(); } flush_output(); if (old_output != NO_OUTPUT_FILE) { set_normal_output_file(old_output); } close(newsockfd); } } #endif /* SERVER_CONFIG */ int main(int argc, const char *argv[]) { #ifdef GET_WINDOW_SIZE GET_WINDOW_SIZE(0); #endif /* set some basic defaults */ lookup.flag.fuzz = 1; lookup.flag.fold = 1; lookup.flag.display = 1; lookup.flag.autokana = 1; lookup.cmdstart_char = ' '; #ifdef HAVE_SPINNER lookup.spinner.interval = 200; #endif lookup.max_lines_to_print = 100; lookup.percent = 100; startup(argc, argv); kibishii_assert(lookup.default_slot); #ifdef SERVER_CONFIG { extern void service_socket(void); extern int std_romaji_toggled_force; lookup.flag.verbose = 0; std_romaji_toggled_force = 0; cmd_output_encoding((String *)"euc", 0, 0, 0); service_socket(); } #else /* SERVER_CONFIG */ do { static int abort_count = 0; String *prompt_format = lookup.default_slot->prompt_format ? 
lookup.default_slot->prompt_format : lookup.prompt_format; string *input; in_command = 0; kibishii_assert(prompt_format); input = getline(gen_prompt(prompt_format, 1)); /* if they're pounding on the break key, let them eventually leave */ if (apply_regex_abort) { if (abort_count++ == 2) output(" → ※one more to abort※ ←\n"); else if (abort_count >= 3) { output(" ※abort※\n"); break; } continue; } abort_count = 0; if (input == 0) break; else if (input[0] != '\0') process_input_line(input, 0); } while (!exit_program_now); #ifdef LOG_FILE_SUPPORT /* * If there's a log open, close it verbosely. */ if (output_fd_valid(set_extra_output_file(JUST_CHECKING_OUTPUT_FILE))) { lookup.flag.verbose = 1; cmd_log(1,0,0); } #endif /* LOG_FILE_SUPPORT */ #endif /* SERVER_CONFIG */ return 0; } lookup-1.08b.orig/version.c0100600000014400001440000000104706173474062015334 0ustar nakaharastaff/* * Jeffrey Friedl * Omron Corporation オムロン(株) * Nagaokakyoshi, Japan 〒617長岡京下海印寺 * * jfriedl@nff.ncl.omron.co.jp * * This work is placed under the terms of the GNU General Purpose License * (the "GNU Copyleft"). */ #define PATCHLEVEL 7 /* patchlevel off the major version number */ const char version_string[] = "Version 1.08beta"; const char version_date [] = "July 19, 1996"; const char compile_date [] = __DATE__; const char author_name [] = "Jeffrey Friedl"; const char contact_addr [] = "jfriedl@omron.co.jp"; lookup-1.08b.orig/net.announcement0100644000014400001440000001010105555701576016712 0ustar nakaharastaffNewsgroups: sci.lang.japan,soc.culture.japan,fj.life.in-japan,fj.sources.d,fj.sci.lang Subject: [ENGLISH] dictionary program release announcement Distribution: world Reply-to: jfriedl@nff.ncl.omron.co.jp FCC: ~/News/Posts --text follows this line-- [nihongo no anaunsu ha betsu desu ] [Japanese-language announcement is in a seperate post.] After much preparation, I am finally releasing my "lookup" program. One-line summary is that it's a fast, interactive grep program that can handle Japanese. 
I use it primarily for looking up things in edict, kanjidic, and /usr/dict/words. But for a "grep", it's _very_ powerful. The manual is 33 pages printed, if that's any indication. I have found it absolutely indispensable for study and reading help. It should run on most UNIX boxes. I'd like to port it to DOS some time, but the DOS world has Jim Breen's jdic already, so I'm in no rush. Jim Breen will kindly make it available on his archive: ftp.cc.monash.edu.au (130.194.1.106) in pub/nihongo/ Until then, for a short time, you should be able to find it on omrongw.wg.omron.co.jp (133.210.4.4) in .tmp/jfriedl/lookup_v1.0_tar.Z I have put a lot of hard work into this, so I hope it can be enjoyed by many. I even tried writing a mini-manual in Japanese to make it more accessible. yoroshiku onegaishimasu. *jeffrey* ------------------------------------------------------------------------- Jeffrey E.F. Friedl Omron Corporation, Nagaokakyo (Kyoto), Japan jfriedl@nff.ncl.omron.co.jp [ DoD##4 N8XBK 92 CB-1 400 ] Below is from the English README ------------------------------------------------------------------------- LOOKUP provides a way to quickly and powerfully search text files. The author's prime use is to search "edict" (a Japanese-English word list), "kanjidic" (a database about Japanese characters), and "/usr/dict/words" (list of English words). However, it could easily be used to search for variables in huge programs, or most any other application of searching line-based text. From the manual page: Romaji-to-Kana Converter Lookup can convert romaji to kana for you, even "on the fly" as you type. Fuzzy Searching Searches can be a bit "vague" or "fuzzy", so that you'll be able to find the Japanese word for Tokyo even if you try to search for "to kyo" (the proper Japanese "spelling" being "to u kyo u"). Regular Expressions Uses powerful and expressive regular expressions for searching. 
One can easily specify complex searches that express "I want lines that look like such-and-such, but not like this-and-that, but that also have this particular characteristic...." Filters You can have lookup not list certain lines that would otherwise match your search, yet can optionally save them for quick review. For example, you could have all name-only entries from edict filtered from normal output. Automatic Modifications Similarly, you can do a standard search-and-replace on lines just before they print, perhaps to remove information you don't care to see on most searches. For example, if you're generally not interested in kanjidic's info on Chinese readings, you can have them removed from lines before printing. Smart Word-Preference Mode You can have lookup list only entries with whole words that match your search (as opposed to an embedded match, such as finding "the" inside "them"), but if no whole-word matches exist, it will go ahead and list any entry that matches the search. Handy Features Other handy features include a dynamically settable and parameterized prompt, automatic highlighting of that part of the line that matches your search, an output pager, readline-like input with horizontal scrolling for long input lines, a ".lookup" startup file, automated programmability, and much more.
----------------------------------------------------------------------------- Jeffrey jfriedl@nff.ncl.omron.co.jp lookup-1.08b.orig/regextest.c0100644000014400001440000007160306176251456015701 0ustar nakaharastaff#include "lib/config.h" #include "lib/assert.h" #include "lib/jregex.h" #include "lib/output.h" #include "lib/xmalloc.h" #if 0 #include "lib/euc.h" #endif #ifdef NO_REGEX_STATS # define STATS(stuff) /* nothing */ #else # define STATS(stuff) { stuff;} struct regex_stats regex_stats; #endif volatile unsigned apply_regex_abort = 0; /* just for linking */ #define sizeof_array(array) (sizeof(array)/sizeof(array[0])) #define PASS 1 #define FAIL (!PASS) #define IA REGCOMP_IGNORE_ALPHA_CASE #define IK REGCOMP_IGNORE_KANA_CASE #define IC REGCOMP_IGNORE_CASE #define X (const unsigned char *) #define L __LINE__, struct { const unsigned short line; const unsigned char *pattern; const unsigned char *string; const unsigned char *musthave; const unsigned flags; int val; } tests[] = { {L X"jeff", X"foo", X"jef", 0, FAIL }, {L X"a", X"a", X"a", 0, PASS }, {L X"a", X"z", X"a", 0, FAIL }, {L X"3", X"123", X"3", 0, PASS }, {L X"3", X"456", X"3", 0, FAIL }, {L X"3", X"123", X"3", 0, PASS }, {L X"[a]", X"a", X"a", 0, PASS }, {L X"[a]", X"z", X"a", 0, FAIL }, {L X"[3]", X"123", X"3", 0, PASS }, {L X"[3]", X"456", X"3", 0, FAIL }, {L X"[3]", X"123", X"3", 0, PASS }, {L X"x37", X"x39xxx373x37", X"x37", 0, PASS }, {L X"x37", X"x39xxx3.73x7", X"x37", 0, FAIL }, {L X"^wow", X"swownow", X"wo", 0, FAIL }, {L X"^wow", X"wownwos", X"wo", 0, PASS }, {L X"xxx$", X"xxxx ", X"x", 0, FAIL }, {L X"xxx$", X"xxx 3xxx", X"x", 0, PASS }, {L X">jeffrey<<", X"jef", 0, PASS }, {L X"", X"jeffrey", X"jef", 0, FAIL }, {L X"jeff>", X" jeffrey", X"jef", 0, FAIL }, {L X"jeff>", X" jeff-rey", X"jef", 0, PASS }, {L X"jeff>", X" sljeffrey", X"jef", 0, FAIL }, {L X"jeff>", X" sljeff", X"jef", 0, PASS }, {L X"", X"jeff", X"jef", 0, PASS }, {L X"", X"jeff,wow", X"jef", 0, PASS }, {L X"", X"->jeff", X"jef", 0, 
PASS }, {L X"", X" jeff ", X"jef", 0, PASS }, {L X"", X" jeffrey ", X"jef", 0, FAIL }, {L X"ab+c", X"abc", X"abc", 0, PASS }, {L X"ab+c", X"ac", X"abc", 0, FAIL }, {L X"ab+c", X"abbbbabbc", X"abc", 0, PASS }, {L X"ab+c", X"abbbbabbdc", X"abc", 0, FAIL }, {L X"ab+c", X"abbbbbbc", X"abc", 0, PASS }, {L X"ab*c", X"ab", X"ac", 0, FAIL }, {L X"ab*c", X"abac", X"ac", 0, PASS }, {L X"ab*c", X"abbc", X"ac", 0, PASS }, {L X"ab?c", X"ac", X"ac", 0, PASS }, {L X"ab?c", X"abc", X"ac", 0, PASS }, {L X"ab?c", X"abbc", X"ac", 0, FAIL }, {L X"abaa", X"aababaa", X"ab", 0, PASS }, {L X"abaa", X"aababab", X"ab", 0, FAIL }, {L X"弔", X"弔", X"弔", 0, PASS }, {L X"弔", X"x弔", X"弔", 0, PASS }, {L X"x弔", X"弔", X"x弔", 0, FAIL }, {L X"x弔", X"x弔", X"弔x", 0, PASS }, {L X"弔", X"つだ", X"弔", 0, FAIL }, {L X"弔", X"xつだ", X"弔", 0, FAIL }, {L X"x弔", X"xつだ", X"弔x", 0, FAIL }, {L X"<弔", X"xつだ", X"弔", 0, FAIL }, {L X"<弔", X"x弔", X"弔", 0, PASS }, {L X"弔>", X"x弔", X"弔", 0, PASS }, {L X"弔>", X"x弔x", X"弔", 0, PASS }, {L X"[弔]", X"弔", X"弔", 0, PASS }, {L X"[弔]", X"x弔", X"弔", 0, PASS }, {L X"x[弔]", X"弔", X"x弔", 0, FAIL }, {L X"x[弔]", X"x弔", X"弔x", 0, PASS }, {L X"[弔]", X"つだ", X"弔", 0, FAIL }, {L X"[弔]", X"xつだ", X"弔", 0, FAIL }, {L X"x[弔]", X"xつだ", X"弔x", 0, FAIL }, {L X"<[弔]", X"xつだ", X"弔", 0, FAIL }, {L X"<[弔]", X"x弔", X"弔", 0, PASS }, {L X"[弔]>", X"x弔", X"弔", 0, PASS }, {L X"[弔]>", X"x弔x", X"弔", 0, PASS }, {L X"a.c", X"ac", X"ac", 0, FAIL }, {L X"a.c", X"abc", X"ac", 0, PASS }, {L X"a.c", X"abac", X"ac", 0, FAIL }, {L X"a.c", X"abaxc", X"ac", 0, PASS }, {L X"a.c", X"aaaxxc", X"ac", 0, FAIL }, {L X"a.c", X"aacac", X"ac", 0, PASS }, {L X"ab*bc", X"abc", X"abc", 0, PASS }, {L X"a[b]*bc", X"abc", X"abc", 0, PASS }, {L X"a.*b", X"ab", X"ab", 0, PASS }, {L X"a.*b", X"aab", X"ab", 0, PASS }, {L X"a.*b", X"axxxxb", X"ab", 0, PASS }, {L X"a.*b", X"aabcdefbcb", X"ab", 0, PASS }, {L X"", X"jdakj", X"a", 0, FAIL }, {L X".*a>", X"jda", X"a", 0, PASS }, {L X"a.*", X"jjfkd", X"a", 0, FAIL }, {L X"a.*", X"djakd", X"a", 0, 
PASS }, {L X"", X"akd", X"a", 0, PASS }, {L X"x(ab(cd)?)+y", X"xy", X"xaby", 0, FAIL }, {L X"x(ab(cd)?)+y", X"xaby", X"xaby", 0, PASS }, {L X"x(ab(cd)?)+y", X"xabcdy", X"xaby", 0, PASS }, {L X"x(ab(cd)?)+y", X"xababy", X"xaby", 0, PASS }, {L X"x(ab(cd)?)+y", X"xabcdaby", X"xaby", 0, PASS }, {L X"x(ab(cd)?)+y", X"xabay", X"xaby", 0, FAIL }, {L X"x(ab(cd)?)+y", X"xabcday", X"xaby", 0, FAIL }, {L X"x(ab(cd)?)+y", X"xababay", X"xaby", 0, FAIL }, {L X"x(ab(cd)?)+y", X"xabcdabay", X"xaby", 0, FAIL }, {L X"^(a(ab)*((ab)+b)*)*y", X"y", X"y", 0, PASS }, {L X"^(a(ab)*((ab)+b)*)*y", X"ay", X"y", 0, PASS }, {L X"^(a(ab)*((ab)+b)*)*y", X"aaby", X"y", 0, PASS }, {L X"^(a(ab)*((ab)+b)*)*y", X"aababy", X"y", 0, PASS }, {L X"^(a(ab)*((ab)+b)*)*y", X"abby", X"y", 0, FAIL }, {L X"^(a(ab)*((ab)+b)*)*y", X"ababy", X"y", 0, FAIL }, {L X"^(a(ab)*((ab)+b)*)*y", X"aabby", X"y", 0, PASS }, {L X"^(a(ab)*((ab)+b)*)*y", X"aabababy", X"y", 0, PASS }, {L X"^(a(ab)*((ab)+b)*)*y", X"aabababby", X"y", 0, PASS }, {L X"^(a(ab)*((ab)+b)*)*y", X"", X"y", 0, FAIL }, {L X"^(a(ab)*((ab)+b)*)*y", X"aaay", X"y", 0, PASS }, {L X"^(a(ab)*((ab)+b)*)*y", X"aababaay", X"y", 0, PASS }, {L X"^(a(ab)*((ab)+b)*)*y", X"aby", X"y", 0, FAIL }, {L X"^(a(ab)*((ab)+b)*)*y", X"abby", X"y", 0, FAIL }, {L X"^(a(ab)*((ab)+b)*)*y", X"aabby", X"y", 0, PASS }, {L X"^(a(ab)*((ab)+b)*)*y", X"aaababaabby", X"y", 0, PASS }, {L X"a|b", X"a", X"", 0, PASS }, {L X"a|b", X"b", X"", 0, PASS }, {L X"a|b", X"b", X"", 0, PASS }, {L X"^(a|b)$", X"abb", X"", 0, FAIL }, {L X"^a|b$", X"abb", X"", 0, PASS }, {L X"^a|(b$)", X"abb", X"", 0, PASS }, {L X"^(b|a)+$", X"abb", X"", 0, PASS }, {L X"^xyz|abb|123$", X"abb", X"", 0, PASS }, {L X"^(xyz|abb|123)$", X"abb", X"", 0, PASS }, {L X"^xyz|bb|(123$)", X"abb", X"", 0, PASS }, {L X"^(xyz|bb|(123$))",X"abb", X"", 0, FAIL }, {L X"^a+|b$", X"aa", X"", 0, PASS }, {L X"^a+|b$", X"aab", X"", 0, PASS }, {L X"^(a+|b)$", X"aab", X"", 0, FAIL }, {L X"(^a)|(b$)", X"ba", X"", 0, FAIL }, {L X"^(a|b)+$", X"ba", 
X"", 0, PASS }, {L X"<((a*b+)+|(a+b*))>", X"b", X"", 0, PASS }, {L X"<((a*b+)+|(a+b*))>", X"a", X"", 0, PASS }, {L X"<((a*b+)+|(a+b*))>", X"aba", X"", 0, FAIL }, {L X"<((a*b+)+|(a+b*))>", X"bb", X"", 0, PASS }, {L X"<((a*b+)+|(a+b*))>", X"aab", X"", 0, PASS }, {L X"<((a*b+)+|(a+b*))>", X"aaba", X"", 0, FAIL }, {L X"<((a*b+)+|(a+b*))>", X"aabab", X"", 0, PASS }, {L X"<((a*b+)+|a+b*)>",X"b", X"", 0, PASS }, {L X"<((a*b+)+|a+b*)>",X"a", X"", 0, PASS }, {L X"<((a*b+)+|a+b*)>",X"aba", X"", 0, FAIL }, {L X"<((a*b+)+|a+b*)>",X"bb", X"", 0, PASS }, {L X"<((a*b+)+|a+b*)>",X"aab", X"", 0, PASS }, {L X"<((a*b+)+|a+b*)>",X"aaba", X"", 0, FAIL }, {L X"<((a*b+)+|a+b*)>",X"aabab", X"", 0, PASS }, {L X"ab|cd|ef|gh", X"ab", X"", 0, PASS }, {L X"ab|cd|ef|gh", X"cd", X"", 0, PASS }, {L X"ab|cd|ef|gh", X"ef", X"", 0, PASS }, {L X"ab|cd|ef|gh", X"gh", X"", 0, PASS }, {L X"ab|cd|ef|gh", X"ac", X"", 0, FAIL }, {L X"ab|cd|ef|gh", X"ah", X"", 0, FAIL }, {L X"ab|cd|ef|gh", X"abdfh", X"", 0, PASS }, {L X"a(b|cd|ef|g)h", X"ah", X"ah", 0, FAIL }, {L X"a(b|cd|ef|g)h", X"abh", X"ah", 0, PASS }, {L X"a(b|cd|ef|g)h", X"acdh", X"ah", 0, PASS }, {L X"a(b|cd|ef|g)h", X"aefh", X"ah", 0, PASS }, {L X"a(b|cd|ef|g)h", X"agh", X"ah", 0, PASS }, {L X"a(b|cd|ef|g)*h", X"ah", X"ah", 0, PASS }, {L X"a(b|cd|ef|g)*h", X"abh", X"ah", 0, PASS }, {L X"a(b|cd|ef|g)*h", X"acdh", X"ah", 0, PASS }, {L X"a(b|cd|ef|g)*h", X"aefh", X"ah", 0, PASS }, {L X"a(b|cd|ef|g)*h", X"agh", X"ah", 0, PASS }, {L X"a(b|cd|ef|g)*h", X"abcdefgh", X"ah", 0, PASS }, {L X"a(b|cd|ef|g)*h", X"agcdefh", X"ah", 0, PASS }, {L X"a(b|cd|ef|g)*h", X"acdh", X"ah", 0, PASS }, {L X"a(b|cd|ef|g)*h", X"aefh", X"ah", 0, PASS }, {L X"a(b|cd|ef|g)*h", X"agh", X"ah", 0, PASS }, {L X"a(b|cd|ef|g)*h", X"agxh", X"ah", 0, FAIL }, {L X"ab|ac|ad|ae", X"aaaa", X"a", 0, FAIL }, {L X"a", X"A", X"a", 0, FAIL }, {L X"a", X"A", X"a", IA, PASS }, {L X"[a]", X"A", X"a", 0, FAIL }, {L X"[a]", X"A", X"a", IA, PASS }, {L X"A", X"a", X"A", 0, FAIL }, {L X"A", X"a", X"a", 
IA, PASS }, {L X"[A]", X"a", X"A", 0, FAIL }, {L X"[A]", X"a", X"a", IA, PASS }, {L X"A", X"A", X"A", 0, PASS }, {L X"A", X"A", X"a", IA, PASS }, {L X"[a-z]", X"S", X"", 0, FAIL }, {L X"[a-z]", X"T", X"", IA, PASS }, {L X"わかる", X"わかる", X"わかる", 0, PASS }, {L X"わかる", X"ワカル", X"わかる", 0, FAIL }, {L X"わか[る]", X"わかる", X"わかる", 0, PASS }, {L X"わか[る]", X"ワカル", X"わかる", 0, FAIL }, {L X"わかる", X"わかる", X"わかる", IK, PASS }, {L X"わかる", X"わカる", X"わかる", IK, PASS }, {L X"わかる", X"ワカル", X"わかる", IK, PASS }, {L X"わ[か]る", X"わかる", X"わかる", IK, PASS }, {L X"わ[か]る", X"わカる", X"わかる", IK, PASS }, {L X"わ[か]る", X"ワカル", X"わかる", IK, PASS }, {L X"[わ]か[る]", X"ワカル", X"わかる", IK, PASS }, {L X"[わ][か][る]", X"ワカル", X"わかる", IK, PASS }, {L X"[ワ][カ][ル]", X"ワカル", X"わかる", IK, PASS }, {L X"[ワ][カ][ル]", X"わかる", X"わかる", IK, PASS }, {L X"[わかる]", X"カ", X"", 0, FAIL }, {L X"[わかる]", X"カ", X"", IK, PASS }, {L X"[わかる]", X"ぴ", X"", IK, FAIL }, {L X"[abcわかる]", X"A", X"", IC, PASS }, {L X"[abcわかる]", X"Z", X"", IC, FAIL }, {L X"<[abcわかる]>", X"Cわ", X"", IC, PASS }, {L X"<[abcわかる]>", X"Zわ", X"", IC, PASS }, {L X"[^a-z]", X"a", X"", 0, FAIL }, {L X"[^a-z]", X"1", X"", 0, PASS }, {L X"[^a-z]", X"め", X"", 0, PASS }, {L X"[a-z]", X"め", X"", 0, FAIL }, {L X"[わかる]", X"a", X"", 0, FAIL }, {L X"[わかる]", X"ふ", X"", 0, FAIL }, {L X"[わかる]", X"か", X"", 0, PASS }, {L X"[^わかる]", X"a", X"", 0, PASS }, {L X"[^わかる]", X"ふ", X"", 0, PASS }, {L X"[^わかる]", X"か", X"", 0, FAIL }, {L X"[abcわかる]", X"か", X"", 0, PASS }, {L X"[abcわかる]", X"b", X"", 0, PASS }, {L X"[abcわかる]", X"ほ", X"", 0, FAIL }, {L X"[abcわかる]", X"x", X"", 0, FAIL }, {L X"[^abcわかる]", X"か", X"", 0, FAIL }, {L X"[^abcわかる]", X"b", X"", 0, FAIL }, {L X"[^abcわかる]", X"ほ", X"", 0, PASS }, {L X"[^abcわかる]", X"x", X"", 0, PASS }, {L X"\\d", X"d", X"", 0, FAIL }, {L X"\\d", X"\\", X"", 0, FAIL }, {L X"\\d", X"3", X"", 0, PASS }, {L X"\\d", X"a", X"", 0, FAIL }, {L X"\\w+", X"@", X"", 0, FAIL }, {L X"\\w+", X"a", X"", 0, PASS }, {L X"[\\d]", X"d", X"", 0, FAIL }, {L X"[\\d]", X"\\", X"", 0, FAIL }, {L 
X"[\\d]", X"3", X"", 0, PASS }, {L X"[\\d]", X"a", X"", 0, FAIL }, {L X"[\\w+]", X"@", X"", 0, FAIL }, {L X"[\\w+]", X"a", X"", 0, PASS }, {L X"ー", X"カー", X"ー", 0, PASS }, {L X"<ー", X"カー", X"ー", 0, FAIL }, {L X"ー>", X"カー", X"ー", 0, PASS }, {L X"<ー>", X"カー", X"ー", 0, FAIL }, {L X"ー", X"カーか", X"ー", 0, PASS }, {L X"<ー", X"カーか", X"ー", 0, FAIL }, {L X"ー>", X"カーか", X"ー", 0, FAIL }, {L X"<ー>", X"カーか", X"ー", 0, FAIL }, {L 0} }; static int lists_equal(const unsigned char *A, const unsigned char *B) { unsigned char *a = xmalloc(strlen((void*)A)+1); unsigned char *b = xmalloc(strlen((void*)B)+1); unsigned char *ina; unsigned char *bend = b + strlen((void*)B); unsigned retval = 1; strcpy((void*)a, (void*)A); strcpy((void*)b, (void*)B); for (ina = a; *ina; ina++) { unsigned char *inb; if (ina[0] & 0x80) { for (inb = b; inb < bend; inb++) { if (ina[0] == inb[0] && ina[1] == inb[1]) { inb[0] = inb[1] = 0; break; } else { if (inb[0] & 0x80) inb++; } } if (inb >= bend) { retval = 0; break; } ina++; } else { for (inb = b; inb < bend; inb++) { if (ina[0] == inb[0]) { inb[0] = 0; break; } else { if (inb[0] & 0x80) inb++; } } if (inb >= bend) { retval = 0; break; } } } if (retval) { unsigned char *inb; /* all chars in a in b. 
Now make sure b is clear */ for (inb = b; inb < bend; inb++) if (inb[0]) { retval = 0; break; } } free(a); free(b); return retval; } static int passed = 0, failed = 0; #ifndef NO_PAREN_INFO static void do_paren_tests(void) { regex_t buf; #define MAX_PARENS_TO_MATCH 10 matched_paren_t my_parens[MAX_PARENS_TO_MATCH]; static struct { const unsigned short line; const unsigned char *pattern; const unsigned char *string; const unsigned char *parens[MAX_PARENS_TO_MATCH]; } test[] = { {L X"a", X"a", }, {L X"(a)", X"a", {X"a"}}, {L X"(a?)", X"a", {X"a"}}, {L X"(a?)", X"b", {X""}}, {L X"(a)?", X"a", {X"a"}}, {L X"(a)?", X"b", }, {L X"(a*)", X"a", {X"a"}}, {L X"(a*)", X"aaa", {X"aaa"}}, {L X"(a*)", X"aaaX", {X"aaa"}}, {L X"(a*)", X"b", {X""}}, {L X"(a)*", X"a", {X"a"}}, {L X"(a)*", X"aaa", {X"a"}}, {L X"(a)*", X"aaaX", {X"a"}}, {L X"(a)*", X"b", }, {L X"(a+)", X"a", {X"a"}}, {L X"(a+)", X"aaa", {X"aaa"}}, {L X"(a+)", X"aaaX", {X"aaa"}}, {L X"(a)+", X"a", {X"a"}}, {L X"(a)+", X"aaa", {X"a"}}, {L X"(a)+", X"aaaX", {X"a"}}, {L X"(hi?)*", X"h", {X"h"}}, {L X"hi", X"hi", }, {L X"(hi)", X"hi", {X"hi"}}, {L X"(hi?)", X"hi", {X"hi"}}, {L X"(hi?)", X"h", {X"h"}}, {L X"(hi?)?", X"i", }, {L X"(hi)?", X"hi", {X"hi"}}, {L X"(hi*)", X"hi", {X"hi"}}, {L X"(hi*)", X"hii", {X"hii"}}, {L X"(hi*)", X"hiiX", {X"hii"}}, {L X"(hi*)*", X"b", }, {L X"(hi)*", X"hi", {X"hi"}}, {L X"(hi)*", X"hihihi", {X"hi"}}, {L X"(hi)*", X"hihihiX", {X"hi"}}, {L X"(hi+)", X"hi", {X"hi"}}, {L X"(hi+)", X"hii", {X"hii"}}, {L X"(hi+)", X"hiiX", {X"hii"}}, {L X"(hi)+", X"hi", {X"hi"}}, {L X"(hi)+", X"hihihi", {X"hi"}}, {L X"(hi)+", X"hihihiX", {X"hi"}}, {L X"a(b)(c)(d)e", X"abcde", {X"b",X"c",X"d"}}, {L X"a(b)(c)(d)e", X"abcdEabcde", {X"b",X"c",X"d"}}, {L X"((a)(b))", X"acab", {X"ab",X"a",X"b"}}, {L X"a(b?)(c)d", X"abcXacd", {X"", X"c"}}, {L X"(a(b?)(c)d)", X"abcXacd", {X"acd", X"", X"c"}}, {L X"(a(b)?(c)d)", X"abcXacd", {X"acd", 0, X"c"}}, {L X"((a(b)d)+)", X"abdaBx", {X"abd", X"abd", X"b"}}, {L X"((jeff)+)", 
X"mrjeffjeffrey",{X"jeffjeff", X"jeff"}}, {L X"(a)(b?)(c)", X"abc", {X"a", X"b", X"c"}}, {L X"(a)(b?)(c)", X"ac", {X"a", X"", X"c"}}, {L X"((a(b))+)", X"xaBaby", {X"aBab", X"ab", X"b"}}, {L X"((ab(cde))|(ab(cdX)))", X"abcde", {X"abcde", X"abcde", X"cde"}}, {L X"((ab(cde))|(ab(cdf)))", X"abcdf", {X"abcdf", 0,0, X"abcdf",X"cdf"}}, {L X"((((a)b)|((a)c))+)", X"abAC", {X"abAC",X"AC",0,X"A",X"AC",X"A"}}, {L X"(ab(cde)|ab(cdX))", X"abcde", {X"abcde", X"cde"}}, {L X"(ab(cde)|ab(cdf))", X"abcdf", {X"abcdf", 0, X"cdf"}}, {L X"(((a)b|(a)c)+)", X"abAC", {X"abAC",X"AC",X"A",X"A"}}, {L X"(a)\\1", X"aa", {X"a"}}, {L X"((a))\\1", X"aa", {X"a", X"a"}}, {L X"((a))\\2", X"aA", {X"a", X"a"}}, {L X"(((a))\\2)", X"aA", {X"aA", X"a", X"a"}}, {L X"(((a))\\3)", X"aA", {X"aA", X"a", X"a"}}, {L X"^([a-c]*)\\1$", X"abCABc", {X"abC"}}, {L X"(わかる)\\1", X"わかるワカル",{X"わかる"}}, {L X"((([a-z])\\3)+(\\2))",X"aAbBBbcC", {X"aAbBBb", X"bB", X"b", X"Bb"}}, }; int i, r, e; for (i = 0; i < sizeof_array(test); i++) { regexec_paren_info = my_parens; regexec_paren_info_size = MAX_PARENS_TO_MATCH; #if 0 outputf("「%s」「%s」\n", test[i].pattern, test[i].string); #endif r = regcomp(&buf, test[i].pattern, REGCOMP_SAVE_MATCHED_PAREN_INFO | REGCOMP_IGNORE_CASE); if (r != REGCOMP_SUCCESS) { outputf("paren test line %d recomp returns %d:\n", test[i].line, i, r); failed++; continue; } regexec_setflags(0); r = regexec(&buf, test[i].string, strlen((void*)test[i].string)); if (r == 0) { outputf("PAREN TEST LINE %d REGEXEC FAILED:\n", test[i].line); outputf("pattern 「%s」string 「%s」\n", test[i].pattern, test[i].string); regexec_setflags(REGEX_DEBUG); regexec(&buf, test[i].string, strlen((void*)test[i].string)); failed++; continue; } e = 0; for (r = 0; r < MAX_PARENS_TO_MATCH; r++) { const unsigned char *want, *got; int want_len, got_len; want = test[i].parens[r]; got = (r >= regexec_paren_info_used) ? 0 : (regexec_paren_info[r].match_start == 0 || regexec_paren_info[r].match_end == 0) ? 
0 : regexec_paren_info[r].match_start; if (want == 0 && got == 0) continue; want_len = strlen((void*)want); got_len = got == 0 ? 0 : (regexec_paren_info[r].match_end - regexec_paren_info[r].match_start); if (want == 0) { outputf("PAREN TEST LINE %d.%d: wanted undefined, got [%.*s]\n", test[i].line, r, got_len, got); e++; } else if (got == 0) { outputf("PAREN TEST LINE %d.%d: wanted [%s] got undefined\n", test[i].line, r, want); e++; } else if (want_len == got_len && strncmp(want, got, want_len)==0) { /* matches */ } else { outputf("PAREN TEST LINE %d.%d: wanted [%s] got [%.*s]\n", test[i].line, r, want, got_len, got); e++; } } if (e) { outputf("pattern 「%s」string 「%s」\n", test[i].pattern, test[i].string); regexec_setflags(REGEX_DEBUG); showregex(&buf); regexec(&buf, test[i].string, strlen((void*)test[i].string)); for (r = 0; r < regexec_paren_info_used; r++) { if (regexec_paren_info[r].match_start == 0&& regexec_paren_info[r].match_end == 0) outputf(" paren #%d undefined (s=0, e=0)\n", r); else if (regexec_paren_info[r].match_start == 0) outputf(" paren #%d undefined (s=0)\n", r); else if (regexec_paren_info[r].match_end == 0) outputf(" paren #%d undefined (e=0)\n", r); else outputf(" paren #%d [%.*s]\n", r, (int)(regexec_paren_info[r].match_end - regexec_paren_info[r].match_start), regexec_paren_info[r].match_start); } failed++; } else { passed++; } } } #endif /* NO_PAREN_INFO */ /* * If no args, run all the standard tests. * * Otherwise, usage is: * a.out PATTERN STRING [FLAGS] * (where FLAGS is a >decimal< number) * In this case, the PATTERN is compiled and matched against the STRING * with all debugging on. 
*/ int main(int argc, char *argv[]) { const unsigned char *pattern = X""; /* default */ const unsigned char *string = X"word"; /* default */ regex_t buf; unsigned flags = 0; unsigned regex_debug_flag = 0; int i; if (argc == 2 && strcmp(argv[1], "-help") == 0) { printf("%s -- run built-in tests\n", argv[0]); printf("%s pattern string [flags [special]]\n", argv[0]); printf("special are: 1-debug, 2-POSIXish, 3-both\n"); exit(0); } if (argc > 1) { #ifndef NO_PAREN_INFO matched_paren_t my_parens[MAX_PARENS_TO_MATCH]; regexec_paren_info = my_parens; regexec_paren_info_size = MAX_PARENS_TO_MATCH; #endif if (argc > 1) pattern = (const unsigned char *)argv[1]; if (argc > 2) string = (const unsigned char *)argv[2]; if (argc > 3) flags = atoi(argv[3]); if (argc > 4) { switch (atoi(argv[4])) { case 0: { break; } case 1: regex_debug_flag = REGEX_DEBUG; break; case 2: regex_debug_flag = REGEXEC_LLM; break; case 3: regex_debug_flag = REGEXEC_LLM|REGEX_DEBUG; break; } } flags |= REGCOMP_SAVE_MATCHED_PAREN_INFO; if (regcomp(&buf, pattern, flags|REGEX_DEBUG) == 0) { const unsigned char *orig_string = string; int times = 0; int overall_match = 0; int overall_fail = 0; showregex(&buf); regexec_setflags(flags|regex_debug_flag); retry: times++; regex_reset_stats(); i = regexec(&buf, string, strlen((void*)string)); if (!regex_debug_flag) { outputf(i ? "Regex Matches.\n" : "Regex Fails.\n"); } overall_match += regex_stats.matches; overall_fail += regex_stats.failures; STATS( outputf("\n%d Tests. 
Cycles %d (%dm+%df=%dc: success = %.1f%%)\n", regex_stats.tests, regex_stats.cycles, regex_stats.matches, regex_stats.failures, regex_stats.matches+ regex_stats.failures, regex_stats.matches * 100.0 / (regex_stats.matches+ regex_stats.failures)); outputf("States: %d pushed, %d popped (max depth %d)\n", regex_stats.states_pushed, regex_stats.states_popped, regex_stats.max_state_depth); if (regex_stats.parens_entered) outputf("Parens: %d entered, %d saved, %d pushed, %d popped.\n", regex_stats.parens_entered, regex_stats.parens_saved, regex_stats.parens_pushed, regex_stats.parens_popped); ); if (i) { const unsigned char *retry_point = 0; #ifndef NO_REGEXEC_MATCH_POINTS { /* show the string and where it matched */ const unsigned char *ptr = orig_string; outputf(">|%s|<\n>", orig_string); while (ptr < string) { outchar('>'); ptr++; } outchar('|'); while (ptr < regexec_match_start) { outchar(' '); ptr++; } if (regexec_match_start == regexec_match_end) outchar('\\'); else { while (ptr < regexec_match_end) { outchar('^'); ptr++; } } while (*(ptr++)) outchar(' '); output("|<\n"); /* retry from the end of the match if any string left*/ if (*regexec_match_end) { retry_point = regexec_match_end; if (regexec_match_start == regexec_match_end) retry_point++; } } #endif /* NO_REGEXEC_MATCH_POINTS */ #ifndef NO_PAREN_INFO { for (i = 0; i < regexec_paren_info_used; i++) { if (regexec_paren_info[i].match_start == 0&& regexec_paren_info[i].match_end == 0) outputf(" paren #%d undefined (s=0, e=0)\n", i); else if (regexec_paren_info[i].match_start == 0) outputf(" paren #%d undefined (s=0)\n", i); else if (regexec_paren_info[i].match_end == 0) outputf(" paren #%d undefined (e=0)\n", i); else outputf(" paren #%d [%.*s]\n", i, (int)(regexec_paren_info[i].match_end - regexec_paren_info[i].match_start), regexec_paren_info[i].match_start); } } #endif /* NO_PAREN_INFO */ if (retry_point) { string = retry_point; output("\n-------------------\n"); goto retry; } } if (times > 1) { 
printf("OVERALL: %d match + %d fail = %d tests\n", overall_match, overall_fail, overall_match + overall_fail); } } } else { int I, j; static unsigned extra_flags[] = { REGCOMP_CALC_MUSTHAVE, REGCOMP_CALC_MUSTHAVE|REGCOMP_JUST_MATCH, #ifndef NO_PAREN_INFO REGCOMP_CALC_MUSTHAVE|REGCOMP_SAVE_MATCHED_PAREN_INFO, #endif }; for (I = 0; I < sizeof_array(extra_flags); I++) { for (j = 0; tests[j].pattern; j++) { #if 0 outputf("「%s」「%s」 flags %d\n", tests[j].pattern, tests[j].string, tests[j].flags|extra_flags[I]); #endif i = regcomp(&buf, tests[j].pattern, tests[j].flags|extra_flags[I]); if (i != 0) { outputf("LINE d: \n", tests[j].line, i, tests[j].pattern, tests[j].flags|extra_flags[I]); regcomp(&buf, tests[j].pattern, tests[j].flags|REGEX_DEBUG|extra_flags[I]); failed++; } else if (!lists_equal(buf.musthave, tests[j].musthave)) { outputf("LINE %d: for _%s_ got musthave=[%s], expected [%s]\n", tests[j].line, tests[j].pattern, buf.musthave, tests[j].musthave); failed++; regfree(&buf); } else { regexec_setflags(tests[j].flags|extra_flags[I]); i = regexec(&buf, tests[j].string,strlen((void*)tests[j].string)); if (i == tests[j].val) passed++; else { failed++; outputf("\n\nTEST LINE %d, EXPECTED TO %s「%s」「%s」[%x]<<\n", tests[j].line, tests[j].val == PASS ? 
"PASS" : "FAIL", tests[j].pattern, tests[j].string, tests[j].flags|extra_flags[I]); regfree(&buf); regcomp(&buf, tests[j].pattern, extra_flags[I]|tests[j].flags|REGEX_DEBUG); regexec_setflags(tests[j].flags|REGEX_DEBUG| extra_flags[I]); regexec(&buf,tests[j].string,strlen((void*)tests[j].string)); } regfree(&buf); } } } #ifndef NO_PAREN_INFO do_paren_tests(); #endif outputf("\n%d tests passed, %d tests failed\n", passed, failed); } return 0; } lookup-1.08b.orig/net.announcement.japanese0100644000014400001440000000331105555701536020500 0ustar nakaharastaffNewsgroups: sci.lang.japan,soc.culture.japan,fj.life.in-japan,fj.sources.d,fj.sci.lang Subject: [JAPANESE] dictionary program release announcement Distribution: world Reply-to: jfriedl@nff.ncl.omron.co.jp FCC: ~/News/Posts --text follows this line-- 自分で作ったのプログラムをリリースします。 名前: lookup“ルーキャプ”【検索】 目的 : テキストファイルの中の言葉を簡単に速く探すこと (UNIXの上) ‥‥ ‥‥ 応用: edict や /usr/dict/words の中の言葉を引く edict はフリー(無料)の英和・和英辞典です。 "edict" というファイルの行は全て以下のフォーマットです。 漢字 [読み方] /英語/英語/.../ とか 仮名 /英語/英語/.../ 例えば: アメリカ /America/ 京 [けい] /10,000,000,000,000,000/ten quadrillion/ 元気 [げんき] /health(y)/robust/vigor/energy/vitality/vim/stamina/ 御飯 [ごはん] /rice (cooked)/meal/ 車 [くるま] /car/vehicle/wheel/ 日本語 [にほんご] /Japanese language/ 現在は約80,000行が入っているので、なかなか便利だと思います。 ftp.cc.monash.edu.au (130.194.1.106) の pub/nihongo にある、 Jim Breen先生が提供したものです。英語の edict.doc もあります。 edict の 80,000行が約3メガバイトあるので、 普通の grep などは遅過ぎるはずです。 4月26日頃 lookup も ftp.cc.monash.edu.au におきますが、それまでは、 omrongw.wg.omron.co.jp (133.210.4.4) の“.tmp/jfriedl/lookup_v1.0_tar.Z” にあります。 英語のマニュアルも日本語の説明書(README.JAP)もあります。 ENJOY してください! コメントとか jfriedl@omron.co.jp へお願いします。 *jeffrey* ------------------------------------------------------------------------- フリードル・ジェフリー オムロン株式会社 (京都府長岡京市) Jeffrey E.F. Friedl jfriedl@nff.ncl.omron.co.jp lookup-1.08b.orig/make.sh.gz0100755000014400001440000000211606123424321015370 0ustar nakaharastaff(N1make.shVmS6\\ZbOstps杢G怨D< -FPhW逖dL{ EpTl`vtAm("M59am! 
LmJ F09Fn4富 n g??/F|カnB@8叩F謎舶r~ `E43# {ρIS功k2Pw棧}Z?~緑&+`rfX\皋g{弄睨U7B1\bug鰆`n6%徭NL$dV匁l:mX踞r =[0c窶含艙菰QT4^]t飲b) 閘AyH)V-dFa7F ,餓藩扨a+a.4|F 1./j+/薛)jHl髯/袷'VjCば5嚆^/n5禧皆j+:d;rV*>v_mhtJx}N!3?\b:m-8頚,'s5Gzc\橇\10蒜 bQ韆=n B剌$∵\ FpZ钁褪 珥03oz聊鴻b:Pp4Q ~飲0-=v?es- lookup-1.08b.orig/makeregex0100755000014400001440000000007006053712121015370 0ustar nakaharastaffmake optimize=0 pedantic=1 kibishii=1 debug=1 regextest lookup-1.08b.orig/dot-lookup.jfriedl0100644000014400001440000000435506174615024017152 0ustar nakaharastaff## turn verbose mode off during startup file processing verbose off ## want all output to be via EUC encoding euc ## For slots ("files") that have highlighting on, I like cyan. ## Choices include the colors: ## black red green yellow blue purple white cyan ## or the effects ## bold inverse blink under(line) highlight cyan ## The prompt. See documentation for an explination of this monster. prompt "%C(命令[%#:%0])%!C(%!d'NONDISPLAYED '%l("%L" )%!f'raw '%w'word '%n)> " ## Just a cute effect -- when searching, show a spinner to indicat that hey, ## at least some progress is being made. spinner 200 ## want output paged pager on ######################################################################### ## Individual slots follow ############################################## ######################################################################### load /usr/jfriedl/lib/edict highlight on word on ######################################################################### ## The filter for kanjidic will hit for entries without a frequency-of-use ## number. The modify spec will remove fields with an initial code given ## in the [LIST] below, 'cause I normally don't want to see'em. 
load /usr/jfriedl/lib/kanjidic highlight on filter "uncommon" !// modify /( [NUQMKLYXZ]\S+)+//g tag on "*" ######################################################################### load /usr/jfriedl/lib/enamdict highlight on word on ######################################################################### load /usr/jfriedl/lib/jack highlight on modify /^[wsc] | (ON|KUN \[[^]]+]) ?\^?\d* ?:| KUN//g ######################################################################### load /usr/jfriedl/lib/kodansha highlight on tag on "*" word off ######################################################################### load /usr/jfriedl/lib/lawdict highlight on word on ######################################################################### load /usr/jfriedl/lib/lifedict highlight on word on ######################################################################### ######################################################################### # I like the initial default to be edict select edict ## turn verbosity back on for interactive use. verbose on lookup-1.08b.orig/lookup-ideas0100644000014400001440000000014206032563015016014 0ustar nakaharastaffload -when-needed file.... (sets a slot to be loaded only when first used) check ^Z with linux. lookup-1.08b.orig/commands.h0100644000014400001440000005663306174367707015510 0ustar nakaharastaff/* * * This file generated from (and by) the file "cmds.master". 
* */ #define S const unsigned char * /* generated from "cmds.master", record at line 92 */ static int _func3_(void) { return cmd_do_search(cmd_paren[1-1]); } /* generated from "cmds.master", record at line 98 */ static int _func4_(void) { return cmd_set_local_autokana_flag(cmd_paren[1-1]); } /* generated from "cmds.master", record at line 104 */ static int _func5_(void) { return cmd_set_default_autokana_flag(cmd_paren[1-1]); } /* generated from "cmds.master", record at line 116 */ static int _func7_(void) { return cmd_cmdchar(cmd_paren[3-1]); } /* generated from "cmds.master", record at line 122 */ static int _func8_(void) { return cmd_set_default_cmd_debug_flag(cmd_paren[3-1]); } /* generated from "cmds.master", record at line 128 */ static int _func9_(void) { return cmd_combine(cmd_paren[4-1], cmd_paren[5-1] ? atoi(cmd_paren[5-1]) : -1, cmd_paren[6-1]); } /* generated from "cmds.master", record at line 134 */ static int _func10_(void) { return cmd_set_default_debug_flag(cmd_paren[1-1]); } /* generated from "cmds.master", record at line 140 */ static int _func11_(void) { return cmd_describe_raw(cmd_paren[2-1]);; } /* generated from "cmds.master", record at line 146 */ static int _func12_(void) { return cmd_describe_raw(cmd_paren[1-1]);; } /* generated from "cmds.master", record at line 152 */ static int _func13_(void) { return cmd_describe_kuten(cmd_paren[2-1]);; } /* generated from "cmds.master", record at line 158 */ static int _func14_(void) { return cmd_describe_ascii(8, cmd_paren[2-1]);; } /* generated from "cmds.master", record at line 164 */ static int _func15_(void) { return cmd_describe_ascii(10, cmd_paren[1-1]);; } /* generated from "cmds.master", record at line 170 */ static int _func16_(void) { return cmd_describe_ascii(16, cmd_paren[1-1]);; } /* generated from "cmds.master", record at line 176 */ static int _func17_(void) { return cmd_describe_encoding(cmd_paren[1-1], cmd_paren[3-1]);; } /* generated from "cmds.master", record at line 182 */ static 
int _func18_(void) { return cmd_describe_jis_string(cmd_paren[2-1]); } /* generated from "cmds.master", record at line 188 */ static int _func19_(void) { return cmd_set_local_display_flag(cmd_paren[1-1]); } /* generated from "cmds.master", record at line 194 */ static int _func20_(void) { return cmd_set_default_display_flag(cmd_paren[1-1]); } /* generated from "cmds.master", record at line 200 */ static int _func21_(void) { return cmd_encoding(cmd_paren[2-1]); } /* generated from "cmds.master", record at line 206 */ static int _func22_(void) { return cmd_list_files(cmd_paren[2-1] ? 1 : 0); } /* generated from "cmds.master", record at line 212 */ static int _func23_(void) { return cmd_filter(cmd_paren[5-1], cmd_paren[2-1], cmd_paren[3-1][0] == '!', cmd_paren[6-1][0] == 'i'); } /* generated from "cmds.master", record at line 218 */ static int _func24_(void) { return cmd_toggle_filter(cmd_paren[1-1]); } /* generated from "cmds.master", record at line 224 */ static int _func25_(void) { return cmd_set_local_fold_flag(cmd_paren[1-1]); } /* generated from "cmds.master", record at line 230 */ static int _func26_(void) { return cmd_set_default_fold_flag(cmd_paren[1-1]); } /* generated from "cmds.master", record at line 236 */ static int _func27_(void) { return cmd_set_local_fuzz_flag(cmd_paren[1-1]); } /* generated from "cmds.master", record at line 242 */ static int _func28_(void) { return cmd_set_default_fuzz_flag(cmd_paren[1-1]); } /* generated from "cmds.master", record at line 248 */ static int _func29_(void) { return cmd_help(cmd_paren[1-1]); } /* generated from "cmds.master", record at line 254 */ static int _func30_(void) { return cmd_set_local_highlight_flag(cmd_paren[3-1]); } /* generated from "cmds.master", record at line 260 */ static int _func31_(void) { return cmd_set_highlighting_style(cmd_paren[4-1]); } /* generated from "cmds.master", record at line 266 */ static int _func32_(void) { return cmd_if(cmd_paren[1-1],cmd_paren[2-1]); } /* generated from 
"cmds.master", record at line 272 */ static int _func33_(void) { return cmd_set_default_highlight_flag(cmd_paren[3-1]); } /* generated from "cmds.master", record at line 278 */ static int _func34_(void) { return cmd_input_encoding(cmd_paren[2-1]); } /* generated from "cmds.master", record at line 284 */ static int _func35_(void) { return cmd_set_limit(cmd_paren[2-1]); } /* generated from "cmds.master", record at line 290 */ static int _func36_(void) { return ((cmd_paren[3-1]||cmd_paren[4-1]) ? warn("flags no longer need for 'load' command\n") : 0), cmd_load((const char *)cmd_paren[8-1]);; } /* generated from "cmds.master", record at line 296 */ static int _func37_(void) { return cmd_log(cmd_paren[2-1] ? 1 : 0, cmd_paren[5-1] ? 1 : 0, cmd_paren[7-1]); } /* generated from "cmds.master", record at line 302 */ static int _func38_(void) { return cmd_modify(cmd_paren[2-1],cmd_paren[3-1], cmd_paren[4-1][0] == 'i' || cmd_paren[4-1][1] == 'i', cmd_paren[4-1][0] == 'g' || cmd_paren[4-1][1] == 'g'); } /* generated from "cmds.master", record at line 308 */ static int _func39_(void) { return cmd_toggle_modify(cmd_paren[1-1]); } /* generated from "cmds.master", record at line 314 */ static int _func40_(void) { return cmd_msg(cmd_paren[1-1]); } /* generated from "cmds.master", record at line 320 */ static int _func41_(void) { return cmd_output_encoding(cmd_paren[2-1], cmd_paren[3-1], cmd_paren[5-1], cmd_paren[6-1]); } /* generated from "cmds.master", record at line 326 */ static int _func42_(void) { return cmd_pager(cmd_paren[2-1], cmd_paren[4-1], cmd_paren[5-1]); } /* generated from "cmds.master", record at line 332 */ static int _func43_(void) { return cmd_set_prompt(1, cmd_paren[3-1]); } /* generated from "cmds.master", record at line 338 */ static int _func44_(void) { return cmd_set_prompt(0, cmd_paren[3-1]); } /* generated from "cmds.master", record at line 344 */ static int _func45_(void) { return cmd_set_default_regex_debug_flag(cmd_paren[2-1]); } /* generated from 
"cmds.master", record at line 350 */ static int _func46_(void) { return cmd_list_size(cmd_paren[5-1]); } /* generated from "cmds.master", record at line 356 */ static int _func47_(void) { return cmd_select(cmd_paren[4-1], (const char *)cmd_paren[7-1], cmd_paren[2-1] ? 1 : 0); } /* generated from "cmds.master", record at line 368 */ static int _func49_(void) { return cmd_source((const char *)cmd_paren[2-1]); } /* generated from "cmds.master", record at line 374 */ static int _func50_(void) { return cmd_set_spinner(cmd_paren[2-1]); } /* generated from "cmds.master", record at line 386 */ static int _func52_(void) { return cmd_tag(cmd_paren[1-1], cmd_paren[4-1]); } /* generated from "cmds.master", record at line 392 */ static int _func53_(void) { return set_verbose_flag(cmd_paren[1-1]); } /* generated from "cmds.master", record at line 404 */ static int _func55_(void) { return cmd_set_local_word_flag(cmd_paren[1-1]); } /* generated from "cmds.master", record at line 410 */ static int _func56_(void) { return cmd_set_default_word_flag(cmd_paren[1-1]); } /* generated from "cmds.master", record at line 416 */ static int _func57_(void) { return cmd_set_local_glob_flag(cmd_paren[3-1]); } /* generated from "cmds.master", record at line 422 */ static int _func58_(void) { return cmd_set_default_glob_flag(cmd_paren[3-1]); } /* generated from "cmds.master", record at line 434 */ static int _func60_(void) { return cmd_error("expecting argument", cmd_paren[1-1]); } /* generated from "cmds.master", record at line 440 */ static int _func61_(void) { return cmd_error("argument error", cmd_paren[2-1]); } static struct command command[] = { /* generated from "cmds.master" record at line 80*/ { CMD_FILE_ONLY /* comment line */, (S)0, (S)0, (S)"^\\s*#", 0, }, /* generated from "cmds.master" record at line 86*/ { CMD_FILE_ONLY /* blank line */, (S)0, (S)0, (S)"^\\s*$", 0, }, /* generated from "cmds.master" record at line 92*/ { CMD_FILE_ONLY /* search */, (S)0, (S)0, (S)"^\\s*([+!=].*)", 
_func3_, }, /* generated from "cmds.master" record at line 98*/ { CMD_GENERAL|CMD_LOAD_RELATED|CMD_NEEDS_SLOT, (S)"[default] autokana [on|1|off|0]", (S)"Turns automatic romaji conversion on or off, or reports current status.", (S)"^\\s*default\\s*autokana>\\s*(on|1|off|0)?\\s*$", _func4_, }, /* generated from "cmds.master" record at line 104*/ { CMD_GENERAL, (S)0, (S)0, (S)"^\\s*default\\s*autokana>\\s*(on|1|off|0)?\\s*$", _func5_, }, /* generated from "cmds.master" record at line 110*/ { CMD_GENERAL, (S)"clear|cls", (S)"clear the screen", (S)"^\\s*(clear|cls)\\s*$", cmd_clear, }, /* generated from "cmds.master" record at line 116*/ { CMD_GENERAL, (S)"cmdchar ['bytechar']", (S)"set the this-is-a-command character", (S)"^\\s*cmd(char)?\\s*('(\\a)')?\\s*$", _func7_, }, /* generated from "cmds.master" record at line 122*/ { CMD_GENERAL, (S)"command debug [on|1|off|0]", (S)"Turns command debugging on or off, or reports current status.", (S)"^\\s*c(om(mand)?)?>\\s*debug>\\s*(on|1|off|0)?\\s*$", _func8_, }, /* generated from "cmds.master" record at line 128*/ { CMD_GENERAL|CMD_LOAD_RELATED, (S)"combine> [\"name\"] [num =] num [num....]", (S)"Combines previously-loaded slots to one new slot.", (S)"^\\s*comb(o|ine)\\s*(([\"'])(.*)\\3\\s*=?)?\\s*(\\d+\\s*\\+?=)?\\s*((#?\\s*\\d+\\s*,?\\s*)+)\\s*$", _func9_, }, /* generated from "cmds.master" record at line 134*/ { CMD_GENERAL, (S)"debug [on|1|off|0]", (S)"Turns debugging on or off, or reports current status.", (S)"^\\s*debug>\\s*(on|1|off|0)?\\s*$", _func10_, }, /* generated from "cmds.master" record at line 140*/ { CMD_GENERAL, (S)"describe \"string\"|character|[kuten|euc|jis|sjis|ascii]code", (S)"describes the encodings for the character(s) indicated", (S)"^\\s*describe>\\s*(['\"])(.*)\\1\\s*$", _func11_, }, /* generated from "cmds.master" record at line 146*/ { CMD_GENERAL, (S)0, (S)0, (S)"^\\s*describe>\\s*(\\A+|[!-~])\\s*$", _func12_, }, /* generated from "cmds.master" record at line 152*/ { CMD_GENERAL, (S)0, (S)0, 
(S)"^\\s*describe>\\s*(kuten>\\s*)?(\\d\\d\\d\\d)\\s*$", _func13_, }, /* generated from "cmds.master" record at line 158*/ { CMD_GENERAL, (S)0, (S)0, (S)"^\\s*describe\\s*ascii\\s*(\\\\|0)([0-7]+)\\s*$", _func14_, }, /* generated from "cmds.master" record at line 164*/ { CMD_GENERAL, (S)0, (S)0, (S)"^\\s*describe\\s*ascii\\s*(\\d+)\\s*$", _func15_, }, /* generated from "cmds.master" record at line 170*/ { CMD_GENERAL, (S)0, (S)0, (S)"^\\s*describe\\s*ascii\\s*(0[xX])?([0-9a-f][0-9a-f])\\s*$", _func16_, }, /* generated from "cmds.master" record at line 176*/ { CMD_GENERAL, (S)0, (S)0, (S)"^\\s*describe>\\s*(sjis>|jis>|euc>)?\\s*(0[xX])?(<[0-9a-f][0-9a-f][0-9a-f][0-9a-f]>)\\s*$", _func17_, }, /* generated from "cmds.master" record at line 182*/ { CMD_GENERAL, (S)0, (S)0, (S)"^\\s*describe>\\s*(\\$@|\\$B|\\$@\\$B)?(([!-~][!-~])+)(\\(\\$@|\\$B|\\$@\\$B|\\$\\(D|\\(J|\\(H|\\(B|\\(I)?\\s*$", _func18_, }, /* generated from "cmds.master" record at line 188*/ { CMD_GENERAL|CMD_LOAD_RELATED|CMD_NEEDS_SLOT, (S)"[default] display [on|1|off|0]", (S)"Turns display of matching lines on or off.", (S)"^\\s*display>\\s*(on|1|off|0)?\\s*$", _func19_, }, /* generated from "cmds.master" record at line 194*/ { CMD_GENERAL, (S)0, (S)0, (S)"^\\s*default\\s*display>\\s*(on|1|off|0)?\\s*$", _func20_, }, /* generated from "cmds.master" record at line 200*/ { CMD_GENERAL|CMD_ENCODING_RELATED, (S)"encoding (euc|jis|sjis)", (S)"set the input/output encoding-method", (S)"^\\s*encod(e|ing)>\\s*(euc|jis|sjis)?\\s*$", _func21_, }, /* generated from "cmds.master" record at line 206*/ { CMD_GENERAL, (S)"files|slots [-]", (S)"list what files are loaded into what slots.", (S)"^\\s*(slot|file)s?>\\s*(-|-?(help|long))?\\s*$", _func22_, }, /* generated from "cmds.master" record at line 212*/ { CMD_GENERAL|CMD_LOAD_RELATED, (S)"filter [\"name\"] [!] 
/regex/[i]", (S)"set the filter for the selected file.", (S)"^\\s*filter>\\s*(\"([^\"]*)\")?\\s*(!?)\\s*(\\S)(.+)\\4(i?)\\s*$", _func23_, }, /* generated from "cmds.master" record at line 218*/ { CMD_GENERAL|CMD_LOAD_RELATED|CMD_NEEDS_SLOT, (S)"filter [on|1|off|0]", (S)"turn the filter (for the selected file) on or off, or report its status", (S)"^\\s*filter>\\s*(on|1|off|0)?\\s*$", _func24_, }, /* generated from "cmds.master" record at line 224*/ { CMD_GENERAL|CMD_LOAD_RELATED|CMD_NEEDS_SLOT, (S)"[default] fold [on|1|off|0]", (S)"Turns case folding on or off, or reports current status.", (S)"^\\s*fold>\\s*(on|1|off|0)?\\s*$", _func25_, }, /* generated from "cmds.master" record at line 230*/ { CMD_GENERAL, (S)0, (S)0, (S)"^\\s*default\\s*fold>\\s*(on|1|off|0)?\\s*$", _func26_, }, /* generated from "cmds.master" record at line 236*/ { CMD_GENERAL|CMD_LOAD_RELATED|CMD_NEEDS_SLOT, (S)"[default] fuzz [on|1|off|0]", (S)"Turns fuzzification on or off, or reports current status.", (S)"^\\s*fuzz>\\s*(on|1|off|0)?\\s*$", _func27_, }, /* generated from "cmds.master" record at line 242*/ { CMD_GENERAL, (S)0, (S)0, (S)"^\\s*default\\s*fuzz>\\s*(on|1|off|0)?\\s*$", _func28_, }, /* generated from "cmds.master" record at line 248*/ { CMD_GENERAL, (S)"help [string]", (S)"list help", (S)"^\\s*help\\s*(.*\\S)?\\s*$", _func29_, }, /* generated from "cmds.master" record at line 254*/ { CMD_GENERAL|CMD_LOAD_RELATED|CMD_NEEDS_SLOT, (S)"[default] highlight [on|1|off|0]", (S)"Turns highlighting on or off, or reports current status.", (S)"^\\s*h(igh)?l(ight)?>\\s*(on|1|off|0)?\\s*$", _func30_, }, /* generated from "cmds.master" record at line 260*/ { CMD_GENERAL, (S)"highlight [bold|inverse|blink|under|<___>|black|red|green|yellow|blue|purple|cyan|white]", (S)"Sets the highlighting style (bold, inverse[standout], or to a given HTML tag)", (S)"^\\s*h(igh)?l(ight)?>\\s*(style)?\\s*(red|green|yellow|blue|purple|cyan|white|bold|inverse|standout|blink|under|\\<([a-zA-Z]+)\\>)?\\s*$", _func31_, 
}, /* generated from "cmds.master" record at line 266*/ { CMD_GENERAL, (S)"Runs command only if EXPR is true.", (S)"if {expr} command", (S)"^\\s*if\\s*{([^}]*)}\\s*(.*)\\s*$", _func32_, }, /* generated from "cmds.master" record at line 272*/ { CMD_GENERAL, (S)0, (S)0, (S)"^\\s*default\\s*h(igh)?l(ight)?>\\s*(on|1|off|0)?\\s*$", _func33_, }, /* generated from "cmds.master" record at line 278*/ { CMD_GENERAL|CMD_ENCODING_RELATED, (S)"input encoding [euc|sjis]", (S)"report or set the input encoding-method for 8-bit bytes (JIS always OK)", (S)"^\\s*input\\s*(encoding)?>\\s*(euc|sjis)?\\s*$", _func34_, }, /* generated from "cmds.master" record at line 284*/ { CMD_GENERAL, (S)"limit [ ]", (S)"set the maximum number of lines to print during any one command.", (S)"^\\s*limit\\s*(=?\\s*(\\d+))?\\s*$", _func35_, }, /* generated from "cmds.master" record at line 290*/ { CMD_GENERAL|CMD_LOAD_RELATED, (S)"load \"file\"", (S)"load a file (and read or compute its index as needed)", (S)"^\\s*(fast)?load>\\s*(-?(now)|-?(w(hen)?n(eeded)?))?\\s*(['\"]?)(\\S+)\\7\\s*$", _func36_, }, /* generated from "cmds.master" record at line 296*/ { CMD_GENERAL, (S)"log [- | [+]\"file\"]", (S)"log output to a file", (S)"^\\s*log>\\s*((-|)|(()?\\s*(\\+)?\\s*(['\"]?)(.+)\\6))?\\s*$", _func37_, }, /* generated from "cmds.master" record at line 302*/ { CMD_GENERAL|CMD_LOAD_RELATED|CMD_NEEDS_SLOT, (S)"modify /regex/replace/[ig]", (S)"sets the modify regular expression and replacement for the selected file.", (S)"^\\s*modify>\\s*(\\S)(.+)\\1(.*)\\1([ig]?)\\s*$", _func38_, }, /* generated from "cmds.master" record at line 308*/ { CMD_GENERAL|CMD_LOAD_RELATED|CMD_NEEDS_SLOT, (S)"modify [on|1|off|0]", (S)"sets the modify filter on or off (for the indicated file)", (S)"^\\s*modify>\\s*(on|1|off|0)?\\s*$", _func39_, }, /* generated from "cmds.master" record at line 314*/ { CMD_GENERAL, (S)"msg ....", (S)"prints a message to the screen", (S)"^\\s*msg\\s*(.*)\\s*$", _func40_, }, /* generated from "cmds.master" 
record at line 320*/ { CMD_GENERAL|CMD_ENCODING_RELATED, (S)"output encoding [euc|sjis|jis|...]", (S)"report or set the output encoding-method", (S)"^\\s*output>\\s*(encoding>)?\\s*(euc|sjis|jis-?(78|83|90)?(-(ascii|roman))?)?((<(212|no212|hwk|nohwk|foldhwk|disp|nodisp|code|mark)>|[-,\\s]+)*)\\s*$", _func41_, }, /* generated from "cmds.master" record at line 326*/ { CMD_GENERAL, (S)"pager [boolean | [W x] H]", (S)"configure (width x height) or toggle the output pager", (S)"^\\s*pager>\\s*((on|1|off|0)|(([1-9]\\d*\\s*[,x])?\\s*([1-9]\\d*)))?\\s*$", _func42_, }, /* generated from "cmds.master" record at line 332*/ { CMD_GENERAL|CMD_NEEDS_SLOT, (S)"[local] prompt \"string\"", (S)"set the prompt format string", (S)"^\\s*local\\s*prompt>\\s*((['\"])(.+)\\2)?\\s*$", _func43_, }, /* generated from "cmds.master" record at line 338*/ { CMD_GENERAL, (S)0, (S)0, (S)"^\\s*prompt>\\s*((['\"])(.+)\\2)?\\s*$", _func44_, }, /* generated from "cmds.master" record at line 344*/ { CMD_GENERAL, (S)"regex debug [on|1|off|0]", (S)"Turns regex debugging on or off, or reports current status.", (S)"^\\s*r(egex)?\\s*debug>\\s*(on|1|off|0)?\\s*$", _func45_, }, /* generated from "cmds.master" record at line 350*/ { CMD_GENERAL, (S)"saved list size [ ]", (S)"set the number of elided lines to remember for the \"show\" command.", (S)"^\\s*saved?\\s*(list)?\\s*(size|len(gth)?)?\\s*(=?\\s*(\\d+))?\\s*$", _func46_, }, /* generated from "cmds.master" record at line 356*/ { CMD_GENERAL|CMD_LOAD_RELATED, (S)"select [ num | name | . 
]", (S)"sets the default file", (S)"^\\s*select>\\s*((\\.)|(#?\\s*(\\d+))|(([\"']?)(.+)\\6))?\\s*$", _func47_, }, /* generated from "cmds.master" record at line 362*/ { CMD_GENERAL, (S)"show", (S)"show the lines filtered by the last search, if any", (S)"^\\s*show\\s*$", cmd_show, }, /* generated from "cmds.master" record at line 368*/ { CMD_GENERAL, (S)"source \"filename\"", (S)"load commands from a file", (S)"^\\s*source>\\s*(['\"]?)(.+)\\1\\s*$", _func49_, }, /* generated from "cmds.master" record at line 374*/ { CMD_GENERAL, (S)"spinner [ ]", (S)"sets the spinner to move each lines checked (0 to disable)", (S)"^\\s*spinner\\s*(=?\\s*(\\d+))?\\s*$", _func50_, }, /* generated from "cmds.master" record at line 380*/ { CMD_GENERAL, (S)"stats", (S)"reports stats about the last search", (S)"^\\s*stats?\\s*$", cmd_stats, }, /* generated from "cmds.master" record at line 386*/ { CMD_GENERAL|CMD_LOAD_RELATED|CMD_NEEDS_SLOT, (S)"tag [boolean] [\"string\"]", (S)"set, toggle, or report the tag for the slot.", (S)"^\\s*tag>\\s*(on|1|off|0)?\\s*((['\"])(.*)\\3)?\\s*$", _func52_, }, /* generated from "cmds.master" record at line 392*/ { CMD_GENERAL, (S)"verbose [on|1|off|0]", (S)"Turns verbosity on or off, or reports current status.", (S)"^\\s*verbose>\\s*(on|1|off|0)?\\s*$", _func53_, }, /* generated from "cmds.master" record at line 398*/ { CMD_GENERAL, (S)"version", (S)"report the version number", (S)"^\\s*version\\s*$", cmd_version, }, /* generated from "cmds.master" record at line 404*/ { CMD_GENERAL|CMD_LOAD_RELATED|CMD_NEEDS_SLOT, (S)"[default] word [on|1|off|0]", (S)"Turns word-preference mode on or off, or reports current status.", (S)"^\\s*word>\\s*(on|1|off|0)?\\s*$", _func55_, }, /* generated from "cmds.master" record at line 410*/ { CMD_GENERAL, (S)0, (S)0, (S)"^\\s*default\\s*word>\\s*(on|1|off|0)?\\s*$", _func56_, }, /* generated from "cmds.master" record at line 416*/ { CMD_GENERAL|CMD_LOAD_RELATED|CMD_NEEDS_SLOT, (S)"[default] wildcard [on|1|off|0]", 
(S)"Selects wildcard \"glob\" patterns instead of regular expression patterns.", (S)"^\\s*(glob|wild(card)?)>\\s*(on|1|off|0)?\\s*$", _func57_, }, /* generated from "cmds.master" record at line 422*/ { CMD_GENERAL, (S)0, (S)0, (S)"^\\s*default\\s*(glob|wild(card)?)>\\s*(on|1|off|0)?\\s*$", _func58_, }, /* generated from "cmds.master" record at line 428*/ { CMD_GENERAL, (S)"exit|quit|bye|leave|done", (S)"Exits the program", (S)"^\\s*(exit|quit|bye|leave|done)\\s*$", cmd_exit, }, /* generated from "cmds.master" record at line 434*/ { CMD_GENERAL, (S)0, (S)0, (S)"^\\s*(load|describe|source)\\s*$", _func60_, }, /* generated from "cmds.master" record at line 440*/ { CMD_GENERAL, (S)0, (S)0, (S)"^\\s*(default|local)?\\s*(autokana|cmdchar|debug|describe|encoding|files?|filter|fold|fuzz|help|highlight\\s*(style)?|input\\s*(encoding)?|limit|load|modify|output\\s*(encoding)?|pager|prompt|r(egex)?\\s*debug|saved?\\s*(list)?\\s*(size|len(gth)?)?|select|show|source|spinner|stats?|verbose|version|word|exit|quit|bye|leave|done|combine|combo|tag).*\\s*$", _func61_, }}; lookup-1.08b.orig/lookup.c.jin0100444000014400001440000004100606300550472015734 0ustar nakaharastaffxdjBr10{l Vcz5GQ'B^E^DPgU(/@x@X@`a@zr@@p@qJL H(e P*t D78 l "; p uvH ,=J $ul  ~` {|d 9:h @X@ A*@'@@b@b@'@X@@C @b@b@@X@b@0@@b@V@諦@'@@`a@ b@@y@諦@'@@`a@b@@J@諦@`a@b@頽@:`a@@@+0 p`@@+Cm< !$%k&-)*-02+368;A>?@@OAAAAAAAAAAAAAAAAAAAAAAAAABBBBBBT fI~-L'g\,  `=G@E5!0)z&B,"0^S/L2/S+![S=@v;H\oDZ*WN)AT1 ;I 4Gs/P*"l2`T^RuISc "- Ug 9F,%qW; / ON;Dr ;> Tl'A|DAD 4aZ}ID#% W ar%.N~ e <1P'!TW B!j% Bl2\;;zl&S+C5s:Y?!;EA6HF=;/)*5m3HVMe=Z1B$FBG+K !L EF,F=C>U6=M5~)#>', I2,,XY6%O-'<@B29%7^/8%#><=4<>l$CIF! . ;%0LD,)m5!Q =  '48>['48;X ; ; > ; @%3 @%3 !  
; P P-=62_g'Z.HFC3*II@C"gD&7D;Q.,4#N51(*\A70E~/D zy011'}_YS>IMC8J>sFH)OQh*()R@UFH!Q(1 ;t!C=m3j^ ' ' >- >- 5* ' * ' /& ,`3g668:33499I33g99y?.XBG2B/HTIG;~(LAAS&51 RB$I,:A9^:?5n-*Gb%b!I3@PPf?*<<02N/9)UY$k+!5!#B!(27IIK&4)"' <#-%14>=02A6'9<IZ%"-/`2{HLFCLK#LKOR#\3"O"1$30O:3#iF{gU co" ,@j&>Q!1G/'"$KB4KVmQ l!XL <F-Qt8i'48'48^QbFC4Y*\AH/D YS>Vn(1A'kk_ADA^FBA:PP~Y$)"'%/CK3OUF/5KXRG/'"$W"J.:;Ew3K'F';+SG! 8F,CHA65>d IH72,,,Y6%&)tV\0"Z8%#>y4!5$Q,%&1$T[[?\'4''4\\_\M%GM%G !4\P];g[2_gEa#HFv*II@C"gZ;.,Y3Uj>#(Ax0Ezy0bsIE0MC8Jzn]\V+tU;HIR@C!F&W00>6>653030//@3:33?.G2B/HTIyGx/':A9]=)FU^235C<='9xy5!VmHf?*"%k/4wV#T!(2n'"JN'\#@1I~2AN`<I7F,<8!z-F}0RC3HLFCLKoKm#,UO0g?6#NF?+F\kg56(?oE#@0(LRLe4i:i"}+7e;kD;C3d:?]:1B$&+q7L 80_Fj6:Y: #>7I2,e,,,,,c%&)T<V%8%#>'=p<92F! ?B$/O)]#(Ac+*=\E'48>E='48;B#;!; >!;!@%3!@%3!! !;21f46 RV!^2$ZEa#HFC3*AU@DG,04=  '5(*H`EEYG >IE0M7DJ>  /)E (&" )@<@HCh>034668499g99yG2B/IGTIf fG7A/'?=)F=9%23b"-/H2 C3FCLK#LAR#HG"O"1$30O"gu#NF8/56q%,E<; 9c+#@d*>QcTi"i"J+:-,? EA63FS * ;3  ,+3H!ME = !1B$& BGV!;8JEF,F=C >*+6,V?=956D&B%3 #>7n W& ^B,,,,,-6%&)-'<@B297!C#><,4<$5&2F-. ;%02!#,)>!5!A2  8' '4*>E'4*;B O O R O @%G @%G.4 O # )+ - (62$W'2(L#HFC3*IIAU@%"&)D&7D: !! .,@;!4O DH Q;5(*EA70E*T/D @:H"01'E8EN G >IE0B 7 8J><=A FH>  /2Z)E (&  )@V<6()R@>Fo(HIT!6 ($ 3 ;) ?!C=m"jA ' ' >- >- F* ' * ' /& ,`334668:33499I33g99: #?.BG2B/8'I7;. O0( C* 8B&U D.$$ G7R/':A9/f/=9%23C2 =`A. HQ:?VzCA*G1%I!I3HE5@PPH?*"%`/9)4!C$@+!5!;B!=& 7"'"'#(&4)"' $#:% 4!E2Y6'9<<.77#%"-/H2 %HLFCLK#LK,AOR#,3"O"1$30O"4_#NF/,%?q5%& 8/56(U q%,E# %5+#&0E>b!1G8/B6 #" 4K6"i"J3?> YA6H3eSL _Y3 W3d!<E?]!11$& BG[l!X8JE=!x">:/%W-eT)z= ?>'"x"a,e,,,,,@B70"/r!V!Z>RU]X21+g8i'48>'48;e;!;!> ; @%3!@%3b!;G >9!^QfbkF@&7]:+11!2VysB!qR#.;O?*EAx0]4 @  >Iu 8J>  zA ;<" )4'x<R@UF.A6HIuQX5`C=NhAA>6>G5DA30/@@:334668:3g99;99J?.wBI4aS&f.O1G7':A9?=hC<='AH*'yV/'$CQ*GN%b!I3HE5@PP*a,0x:,)4!C$kL5!#T6(IIK&4)"' <5%)UdA6'9PU.IZ%"-/HjC3FCLnBUq1g=!? 
#N9F/W2 [1\K 8d^?`%,E3">9c+?&hE>b!1GLwa 4K6";SJ3C-'m*, ;,[H!ME = !&Vy80E>*6,T)z&Y%@aon[#%&)-'<@B29[>SF!EB#;!;!>!;!!b!;c=)S>;(62C3*].,u4'}E0><TwF/KA)H )@mAG5DAD_@334668:f499I33g99.Gtky5,:?%b!I@PP?L%,0JP$/9)4z$k+!5!#B6(27"'"'#(&4)"' <5%14)=2A6`<<.#MQ0}:J %3nLa,3 )ME?&IE~C1G" 4K";SJ3C:-,`;EA63f<{FV7EF,C>*+6,T)z&Y,%@ao,%@B8<={F-?lll>l1=6HIIAUe",x}4D*y08_FH;E()R@UFy1=m3TmXBG2B/`.5 A^:?5i8*P$/9iXHLFCLK#La,Aqw3:Fg^A RB4&1"}+g,?!EA6H3F; * ;3 ,3d!ME?]!1B$& BGK !L 80EF,F=CM!H"h6i 3$-9,5)6D&B,% #>', I% H2,e,,,,,-#%&)-'<@B29G0"r#>'" 4!VeB@;F-N;?B>/#U]HALlE'l>'l;e\\_\eTeTb=?6 >9!(6&G$B72(/a#HFC3*IIAU@%~)D&7]:!!V.,@B!4=xD6#;O(*EAb0E*]4 @ H1'E8EsG >IE0 8J><=A FH>  /2A)E (&" )$'*<()RF. 6HIT!Q(1+,P ;5=NT&>G>Gto@L3g6n:3333g9m4$#G2B/*#IGT 4aZUS&51 R.OJG/':A9?fF=9W3C<='9A7HQ:?5!# '$C+&*GN%b!I3HE5@PPf?*"%:/9,)4!C@@+!V#T6(~"'"'#(&4KG$5%z~2A6'9<<.77H"-/H2 %HLFCLK#La,AR#H3"O"1$30O"4? #NF/,2 +F\K 8d6(?q%G)V[# Z+#@0(~!1GLR%#" 4"81v*E;#.N 9P"p# a@;S4^e sQt'48>'48;e\\_\@%T@%mnIyN*\A7+/D zy01A'3(p(Xt!N''>->-5*'*'8&,669999ByB/HG/':AxfF=^235C<='9A7H'$C+-qMHE5@PPHJP$\2IrmaYbs}#"-/XHCLR#HiOgN9{g56}6,%5GIi^>bRFKi"J3.?JEA6H3<S)* '3 ,3H!<E?]!11$& BGV!X8JEFr=CM 0W9h+6:n%\#>'7I% 0W& 2,,X|t927!Z=z=4<$5$&2gZ;%02!#U>!5AA2B ,N4'4*>q'4*;e;!;!>!;!@%3!@%3b!;!#,?+(-$Z.HFC3*Iej7 !2; V.K@\UdQD]:6g*u*D @ H"011'EN^>KB C8J>sU)OQhE4'x<_R@UFW6HIT!F ($ 3 ;) `C=jhAA>G>G5DAD0/@334668:f499I33g99y?.B#G2q8';Oa8yf f.O,G7R/':A9?fF=9%23C2 ='A. HQ:?5!/'$C+&*Gb%b!I3HE5@PPH?*c-P$/9,)UY$k+!5!;T6=& LIIK&4)"' "-/>=HLFCLK#LAOu$"gu#ui?[1\k8/56(^A%,"6,%5j&hE>1Gg#" 4KW"g[g,?\63;q=Z1BXF6~v7I2,e,,,,,-6%|c@B29B7^/8%#S',E<$$C2F{;%02!#,)a3A217,8$4'48>E,'48;Bl @%T!1,)-H;(h$g'2?.H3*IIAU@%"&&7;4#.NO(*zJnH@p>G3jyB/IGA^:?$*GN%3m*<wT' $2`PI7FK8!B8-FaB#`LnLa,3"gcob@>Q!1&1"J3+:-,?! EA63.F; * ;3  E3}!ME?v!1BJ BGK 7;80EF,F=C$>*bVe5:DY#>, I% W2,B,,,,,-#%&)-'<@B29/8%#>'" p<F8+2F[[?B2!O)>/KAA2N=N'E48>[N48;e\\_\eTeT!A=?;G [9!(6& ;$g'2(.L#HFC3*IIAU)D&7D: !! 
.,@}D#N5(*\A70E4 @ H101AE8EYG >IE0M7 8J><=AFHP  /2U)E (&  )$7,<()R@>t (HIT@6 Y+,P=;) `C=NF4hAA>G>G5DAD0/@R:33:33333S.4yB/*(I7;tUB&5  D.$6G7R/':A9/.f/%235C<='Y':?5!#3$C+&GG1%I!c3HE5@PmH?*"%k4!CV@+LT6=2nIJ(&$5I)U2Y6'9P<.I7H# %`#_I(~21b/'+#B4"&1"JW2g0:-,`;EA63H'F';EEWC(3H!<E = !11$& +l!L 80EF,Fj>U6:He5~Y: ?>7I%mj2,e,,,,,-#%&)-'<@BkB/8%#SI<,8B2g .$/O#A21U=\E'48>E='48;B#;!;!>!;!@%3!@%3!! !;21=)6>;(h$W'2ma#HFC3*IIAU@%">?D&7;Q7.K4=x6#.;O(*EA7A0E`H101'9 8EYS>IE0M{J><T]FH>;+/KA)OQH )4']<&QR@UFWNHQ(13;5`C=F4A'AA>G>G5DAD0/@R:334668:33499;33g99J?G2B/HTIIky51 f.$+1G7A/':A9?=)F=9%23C<='AH*':?5!/'$C+&*G1%I!I3HE5@PPH?*"?,038P$)4!}@w5\2nIJ(&]I $5:1I)U2Y6'9PU.IZ%"-/HR%3HLFCLK#LK,XCOR#,3"O"U30O"9?6#NF[=?+5'%&X/56(?A%,)+"6,%5+#&0 >Q!1G/'+L#" 4K"&1"Jc>2.HTEqA3H:<E = :11$&l0F= 0W9hE:/%3$-9nm^e,,,,,-6%&)-'<@B29B7^/8%#S',4<$$Cx[ ;%02!#,)a3X217,8$4'48>E,'48;B#\\_\@%T@%uA\1,)47;(h$g'2?.HFC3sI@%"&Wj7]7.Kxu*Mv`y01X9 8_{ns)y']Z&()R@Ut 6HIT!Q(1P,P X5`NRzhAA>G>G5DAD0/@R4668499;tg99y?bB/*#IG;4K ( 0C0(y5QRB$+G7R/':A9?fF=9%23C<='AH*'$Cn*G3HE5@PPH8:$hUU}\=2I:1r%<YbsL1HB#X>"-/=HnLa,AO03"4id[=f'%F8/56A% ,%9>@"!1G8VP#" 4K6 R y,@k+wIIKZKG*6,/%3-956D&B,%3 ?>87n[^B,,,,,-#%&)-'<@B29%/8%#>S<2F-.[)>/57 1A21l>l;eeun]E(62$W'2(L#HFC]IIAU@%"&?D&7D;Q.,4=x6#;51REA700E*@ Fy01AE8EYS>IuMC8J><T]FHA)O5)4FZ&()R@>FW(Cdp?!C=m"zA ''>->-5*'*' /&,`f4l3 b.X1G2B/;L ( 0 CB&5 1D.$ G7R/':A9/.f/U9W35C<='xHQ:?5!/K1VI!IMHPPHi"%8:$/9,)4!Y@@++!5!#T!(2L"'"'#(&4)"G<#%1I)2Au9<U.I7#%@"!1G/'"$(#" 4K6";S+w:-}A633'F';+0*, 'Qe,A3d:<E = :11$& BGFK 7L 80EF,F=CM Ww*6:Q9,5)6D&B,%@#>8, n[ #%&)-'<@B29"Z]#>SF-UL1'4*>[''4*;X O O R O @%G @%G.4 OP?H<>62;$W'2(Ea#C3*U@j7]e|.,!4= j6 H]f(*\Ax0E4j011'E8EN IE0BN8T]FH> +/2U)b;HI$'3E6>653030//334668:f499I33g99y?BG2B/HTIG;3o01C38S&R.$u~AA9?[)u =B_. HQ:?5!/'$Cn*GNI3HE5@PPH?*"%,03- *P$h)4!Y@@++!5!;BHc 7"'"'#(&4KG$5:% 4!=J2Y6`PU.I7#%"-/H2 %HLFCLK#LACOR#,G"O"1$30O"4? #NF/W2 +01\%& ?`6,)+V; ,@+#&0(>@"E 4K6"yJ/'K,:-k!;EA63*6,/ 3$-9,56D&B% #>8, I%^B,,,,,-#%&)-'<@B2927^/8%#S'" 4<$$F-. 
;%0$!#,)>53A21+ ,8$4'48>E,'48;B#;!;!>!;!@%3!@%3!! !;!1,)46A;(6& $g'2?.L#HFC3sIAU@%"&)D&7D;Q.,04O H '%#N51(AH00E*MF `H101` 8_YS>IEKMC8J>VnM&FH> +/2A)bQ3 E4'<()R@>F. (HIT!6(1+%,P ;5`C=N1FhAA>G>65DAD_@R:334668:33499;33g99:?.4$1G2B/*6#I7; ( 0C(B&5  D.$G7R/':A9/=)/U9%23C<='AH*':?5!# '$C+C*xVI!c3HE5@PmHi"V0H8*:$h,)UY@Z+!V;=2L"'"'#(ZKG"-/`2>=HLFCLK#LK,AR#,3"OS$30O"34? 9i,2 +5'%F8/56(?A%c)+s6,%G#@0A>@"!1G/'+#" 4?J3\EA<f)q2F  WA3d!<E = !11$&,F$,V:^B>,,,%&)T<@/8%8a[,2FSOgI(1+*=8$E'48>E='48;B#\\_\@%T@%uAm1=)4;^2qL8C3*?j7D.,|5Y*H00E4 @:y01'9YSKMC8J>VnsFH2Z)5E$',-)R@>]."(T!6(,P ;5`!F$hAA>G>G5DAD0/@R668:3g99;99:?bUyq*6#I7;4KB7U1DB$$6G7R/':A9/=)/=^23C<='AH*a?V/'$Cn*G19I:I3HE5@PPH?fJP$Y$\B27^")"'D4>=0LYwZ%<K8 01HB#X>"-/XHLFCLK#LAu,cqO:g? 9iF/,=?+5'/56(?A%"6,%9q>@C1b/'+ETK6?-F' |B{$/6H;3@;AEYT])b!Q X5!p:I,0`9,]!Y@@++V!;T6=~IIKZKG<5:z)Z%<}:RCHL#1$30&lEIi4KX EA63,M 0W9>*+p ={l OIe,Y}4=xDB6} +/2U)b;HF=T&W03`7A=oWxHp! /+!rui.*a,T)z&Y,%@,) 7(tS&,*GN! L&4).7Y,#v8c'{Fl!  eHIAU@%"}Y p>  ELC3$5LG1 *040@  a/2H0/3$H M竪V;竪竪lLlL669999曷33:3333゛曷33:3333゛゛漸漸゛漸;;;;;;;lookup-1.08b.orig/BUILDING0100600000014400001440000000430106174615250014614 0ustar nakaharastaff Jeffrey Friedl Omron Corp. Nagaokakyo, Japan COMPILING It's written in ANSI C, so you'll need an appropriate compiler. If you have gcc and gmake, you should just be able to: gmake If you know the gcc is version 2, you can get better performance with gmake gcc=gcc2 ^^^^---- this is not a compiler name, just a string. If you don't have gcc, specify "gcc=" and tell what CC to use: gmake gcc= CC="cc -ansi" CC_TRAD=cc If you have a gcc with a weird name, try something like % gmake CC=/funky/gnu_compiler (with an added "gcc=gcc2" if it's gcc version 2) Check out "jmake" for a way to easily customize things. [ IF YOUR MAKE SUCKS AND CAN'T HANDLE THE MAKEFILE (some SysV makes are ] [ known to be have been developed before certain technilogical advances,] [ such as electricity), try "sh make.sh" after setting CC and CFLAGS at ] [ the top. 
] Note about CC_TRAD: If you don't have a traditional (K&R) compiler handy, you can do without it if your sys/include.h is ANSIfied. MAN PAGE ----------------------------------------------------------------------------- The formatted manual can be found in the files: lookup.man.jis lookup.man.euc lookup.man.sjs If you wish to rebuild with a different width (I like to use the full screen width, so I set the width to 79 on screen and 99 on my 100 column printer), set MANPAGE_WIDTH in the Makefile. The form of the value is "-r1#" where you replace # with your desired width, as in "-r179". You can then do a "make manual". Note that jnroff is required to rebuild the man pages, and that Ken Lunde's "jconv" is required to make the .jis and .sjs versions. RUNNING ----------------------------------------------------------------------------- Use the command-line arguments -jis -euc or -sjis to select which encoding method to use (using -jis with kterm should always be safe). Then read the manual and follow along on, trying things yourself. The example startup file ("~/.lookup") shown in the man page can be found in "dot-lookup". My personal version is in dot-lookup.jfriedl. Let me know how things go. Jeffrey jfriedl@nff.ncl.omron.co.jp lookup-1.08b.orig/CHANGES0100600000014400001440000002477406173476357014523 0ustar nakaharastaffVersion 1.08 [ ] Overall of internal file handling. Now does its own virtual-memory-like file access, no longer requiring the whole file or index to be loaded. Overall, much faster for most lookups with a zippy startup. [ ] Removed all options to "load" command. [ ] Added lots of color options to the "highlight" command. [ ] Output pager now understands that ANSI color escapes don't take up horizontal space. Version 1.07 [x] fixed a stoooopid octal->char mistake in jregex.c [x] added perl5's (?:....) [x] fixed resizing under Linux (used to die with "interrupted system call") [x] added -whenneeded option to 'load' command. 
Version 1.06 (Nov 8, 1995) [x] fixed an error with the 'describe' command. Wasn't working for 'jis',etc. Version 1.05 (Sep 18, 1995) [x] fixed cmds.master for perl5 [x] added server.info directory with small test program [x] added "-port #" arg for the server configuration. [x] can now describe longer EUC/SJIS (looks at each character) [x] redid how 'describe' worked internally, replaced old bugs with new ones. [x] made most flags usable for a server too. [x] added HTML highlight ability Version 1.04d (May 16, 1995) [x] added 'now' option to 'load' [x] updated documentation to '!', 'files', and 'prompt' to reflect reality. [x] added wildcard command (and associated stuff, such as !W!, etc.) [x] fuzzification now turned off if regex has *, +, or ? in it that modifies a non-ASCII character. [x] fixed odd regex error where きょ[ぅうおぉをー]*っ?か[ぁあー]*い[ぃいー]*> was getting turned into きょ alone. Not sure what I did to fix it, though /-: Fixed ^abc|xyz to be (^abc)|xyz, as God intended. Version 1.03 (March 8, 1995) [x] fixed transliteration of 'konnichiha' (was 'こっにちは', now 'こんにちは') Version 1.02 (Feb 15, 1995) [x] added fuzzy-々 to jregex.c, and turn it on during fuzzy-mode. [x] added EITHER_EXACT_2 to jregex.c as a reasonable optimization. [x] made を and ゑ fuzzy with お and え. [x] made the command part of an if {} "script mode" [x] added "continuous" ability to server. [x] fixed some things about the romaji->kana transliterator. [x] had jreadline flush any autokana changes before printing final line. Version 1.01 (Oct 27, 1994) [x] moved include of stdio.h above lookup.h in commands.c (helps linux) [x] grrr. fixed "defined(__GNUC__)" to "!defined(__GNUC__)" in lookup.h [x] if stdtypes.h exists, include it instead of types.h in index.h [x] added check for stdtypes.h to system.h (in Makefile) [x] changed signal() base return type to 'void' (jreadline.c) for non-GCC [x] added some more sysV stuff under the __svr4__ || DGUX defines [ ] added server-config ability. 
[x] added 'highlight style' command and abilities. Version 1.00 (April 22, 1994) [x] the kanji repeater character is now fuzziefied as well. [x] all filenames now <= 14 characters. [x] now ",1" can fall before a || or |!|. [x] now leading tabs in a file won't leave spinner chars hanging around. [x] lines beginning with + and ! can now be used for searches. [x] if {expr} cmd [x] if {expr} cmd can be used to get around "not in command file" restrictions. [x] expr has =(==), !=, &(&&), |(||), +, -, *, /, (, ), !, [x] expr has checked, matched, printed, nonword, filtered, true, false [x] new msg command [x] added display flag: [x] new "display" command [x] new FILES command output [x] new !d! prefix [x] new action with combos (no print if either display turned off) [x] new prompt %d [x] have じ&ぢ fuzzify together. Same for ず&づ [x] new l=logging? and L=logging filename prompt things. [x] can now "select ." [x] added \c and \C to regex. [x] fixed bug that had caused kuten row 84 to be treated as non-EUC. Version 0.08 ( March 6, 1994 ) [x] assert(is_reading == 0) failed "lib/input.c" line 115. on ^C [x] fixed: ^space at "末||(み|/bi" strange [x] search of 〆 doesn't show or report one name entry. [x] fixed 'describe' SJIS conversion problem. [x] Fixed __GNUC__ typo on lookup.h [x] spelled Ken Lunde's name right (-: [x] Search of 五輪 in WORDS checks all lines.... - fixed [x] outputf now has %N= errno string. [x] added input.c module. [x] Lazy-evaluation with loading. [x] "files" command now has a digit indicating what percent is loaded if not 100. [x] stuff will load automatically while idle. Version 0.07 ( March 4, 1994 ): Changes since Version 0.06 ---------------------------------------------------------------------------- [x] minor bug fixes [x] fixed single-quote not being allowed in romaji (to force N) [x] select now OK in comand file [x] added tag stuff [x] new !t! 
[x] new field in "files" command report [x] new "tag" command (note: not automatically enabled) [x] combo stuff [x] "combo" command [x] creating a new slot [x] adding to a previous slot [x] note that combo slots are "flag" lists. [x] in listing, (#) means "already there". [x] treatment of !xxx! stuff WRT combos [x] combo slot in "files" command report. [x] combo slots & filters, modify. [x] "slots" now synonymous with "files" [x] make sure "name" prints when names are filtered. [x] certain flags only matter during combo stuff... for example, the fuzz flag will be applied based upon only the combo slot status: the component slot's fuzz flag will be ignored. Same for autokana, obviously. [x] search of merely ^ dies. -- fixed [x] note no extra spaces around multiregex || [x] note that can't have [\k\h];use (\h|\k) [x] note input encoding only for interactive input, not file input. [x] note files sourced from startup don't have restriction about load, etc. [x] note startup file not read if -write given. [x] added '=' command (can now have regexes in command files) [x] added "log" command [x] have example "example.dot-lookup" [x] note T = have and enabled, '@' == have not enabled. [x] note M = have and enabled, '%' == have not enabled [x] note F = have and enabled, '#' == have not enabled [x] note: can set filter and modify and tag flags even for combos. [x] now have '%#' prompt format. Version 0.06 (Feb 28 '94): Changes since Version 0.04 ---------------------------------------------------------------------------- [x] flushed -readindex and -fastload options... now loads index _if_there_. [x] flushed "fastload" command. [x] added "-noindex" [x] now has a 100-line save-list by default. [x] modified initial prompt [x] revamped romaji converter... now faster and more powerful. [x] default size of screen now queried upon startup (if system supported) [x] screen size now changed upon signal (if system supported) [x] highlighting now done during show. [x] note about jconv. 
Also good for HW kana. [x] files list is better. [x] !m! now !M! [x] !? is help [X] now unified prompt/!/flags for FMwcfh [x] note prompt command-intro char '%c' --> '%S' [x] added prompt '%c'=case folding [x] now per-file: word, fuzz, fold, highlight, autokana [x] can now select by name [x] note that reloading a file uses the same internal file stuff, but allows different filters and flags (only major difference is that it won't re-read a possibly changed file). [x] "-rc filename" command-line arg [x] 'source' command [x] note in manual that !/foo would be same as !default!/foo [ ] can use !M! with show only if "selected file" is same as when search done. [x] note that -euc, etc will override settings in .lookup [x] manuals now generated in jis, euc, and sjis. Version 0.04 (Feb 2, 94): Changes from version dated Jan 10, 1994 ----------------------------------------------------------------------------- [x] Added describe command [x] Added 'output encoding' command [x] Added 'input encoding' command [x] Added 'encoding' command [x] Added output pager command [x] Added version command. [x] Added cmdchar command [x] Added -euc, -jis, -sjis command-line options. [x] Added (..) feature to prompt strings. [x] Added %C [if entering a command] to prompt. [x] Added %c [command char] to prompt. [x] Added %0 [command name] to prompt. [x] Added ? line [x] Added -v (-version) command line flag [x] Revamped jreadline output -- will now do horizontal scroll. Move man to "doc" directory. Lots of man-page stuff. Finally added a minimal REGEX intro. Changes from version dated Dec 28, 1993 ----------------------------------------------------------------------------- Sort of big changes: -------------------- "filter list size" command now called "saved list size". Fixed so you'll actually get a prompt if you don't have a ".lookup" (blush). Can now enter multiple regexes for one search with || and |!|. Added \a, \A, \k, \K, \h, and \H to regexes. Added "whole-word-only" stuff. 
See manual for new "word" command, as well as new "!w" prefix and new prompt control sequence. Other changes I can remember: ----------------------------- In Makefiles: "Make clean" will now enter the lib directory and make clean there. Will now make the man page. Now notes about warnings expected in "trad_set_tty.c". Note in README to set the column-width in lookup.man if you care to Made !m! and !F! to be "reverse the current status" a'la !c! and !f!, rather than simply "force off" Using an unsupported code in a !..! prefix now is an error. Man page much changed. FYI, fixed < & > WRT EOL & BOL. Added simplistic support for SJIS input... run with "-sjis" arg to enable. Noted in the README that one's kterm needs to be in "EUC Kanji Mode" Added the "spinner" command. Moved apply_regex out of the library... was silly to have there. Internally-converted kana line no longer put into history. /\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\ \/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/ TODO ------ Add mode where it will try to deinflect words. want to search for "/ときょ|tokyo" Add notions of userdefined ranges to 'describe', so it'll at least mention it when it runs into one. Have it try to open the index file for writing before constructing it, so it'll die right away if it can't.... Fix halfwidth kana and JIS-0212 support. make sure 'select' before 'show' doesn't botch clean up malloc use in jregex. loaded-file's #if stuff. (#if file loaded, #if host=X, etc.) be able to set favorite !! default. prompt %stuff-available-in-info have load check file inodes to make sure to not load the same file twice. allow processing to continue while "MORE?" is waiting. add a sample ".lookup" to distribution Allow [\k\h] as well as (\h|\k). make default output JIS can have end-of-line comments stripped..... auto "if Japanese, refuse/checkotherslot" "/wakaru?" won't work.... add Japanese-language error messages. 
allow the port number (for server config) to be given on the command line or in .lookup lookup-1.08b.orig/Makefile0100600000014400001440000002142506300555416015140 0ustar nakaharastaff## ## If you have gcc, you should just be able to ## make ## ## If you know the gcc is version 2, you can get better performance with ## make gcc=gcc2 ## ## If you don't have gcc, specify "gcc=" and tell what CC to use: ## make gcc= CC=/bin/cc ## ## You can add ## warn=1 ## to get compiler warnings. There are others as well... see below. ## CC=gcc CC_TRAD=$(CC) -traditional ## set to "gcc1", "gcc2" or leave blank gcc=gcc1 ## selecte exactly "0" (no) or "1" (yes) for the following optimize=0 pedantic=0 kibishii=0 debug=1 warn=0 md=0 #MANPAGE_WIDTH=-r179 ## ## If you know your include files (sys/ioctl.h specifically) are ANSIfied, ## you don't need to worry about COMPILE_WITH_TRAD, so you can comment ## it out if that's the case and you don't have a traditional compiler ## around. ## COMPILE_WITH_TRAD=termset_trad.o RANLIB=/usr/bin/ranlib ############################################################################## ############################################################################## ## ## in these defines, _* are for when GCC is not used. ## gcc1_* for when gcc version 1 used. ## gcc2_* for when gcc version 2 used. ## KIBISHII_1 = -DKIBISHII_DEBUG gcc1_WARN_1 = -Wall -Wshadow -Wwrite-strings gcc2_WARN_1 = -Wall -Wshadow -Wwrite-strings -Wno-implicit ## For generic non-GNU compilers. First digit is optimize, 2nd is debug _OPTIMIZE_1_0 = -O -DFAST_REGEXEC -DNDEBUG _OPTIMIZE_1_1 = -O -DFAST_REGEXEC _OPTIMIZE_0_0 = -DFAST_REGEXEC -DNDEBUG _OPTIMIZE_0_1 = -g ## For GNU Version 1.x compilers. First digit is optimize, 2nd is debug gcc1_OPTIMIZE_1_0 = -O -g -DFAST_REGEXEC -DNDEBUG gcc1_OPTIMIZE_1_1 = -O -g -DFAST_REGEXEC gcc1_OPTIMIZE_0_0 = -DFAST_REGEXEC -DNDEBUG gcc1_OPTIMIZE_0_1 = -g ## For GNU Version 2.x compilers. 
First digit is optimize, 2nd is debug gcc2_OPTIMIZE_1_0 = -O2 -funroll-loops -g -DFAST_REGEXEC -DNDEBUG gcc2_OPTIMIZE_1_1 = -O2 -g -DFAST_REGEXEC gcc2_OPTIMIZE_0_0 = -DFAST_REGEXEC -DNDEBUG gcc2_OPTIMIZE_0_1 = -g gcc1_PEDANTIC_1 = -pedantic gcc2_PEDANTIC_1 = -pedantic gcc2_MD_1 =-MMD ## now use the above PEDANTIC = $($(gcc)_PEDANTIC_$(pedantic)) WARNINGS = $($(gcc)_WARN_$(warn)) OPTIMIZE = $($(gcc)_OPTIMIZE_$(optimize)_$(debug)) KIBISHII = $(KIBISHII_$(kibishii)) MD = $($(gcc)_MD_$(md)) OPTIONS=$(PEDANTIC) $(WARNINGS) $(OPTIMIZE) $(MD) $(KIBISHII) ############################################################################## SHELL=/bin/sh LOCAL_LIB_SRC=lib LOCAL_LIB=jefflib.a #OTHERLIBS=-lmach LIBS= $(LOCAL_LIB) $(OTHERLIBS) CFLAGS= $(OPTIONS) $(EXTRA) $(CFLAGS_EXTRA) objs=lookup.o commands.o apply_regex.o eval.o #all: _note_ lookup manual all: lookup LocalLibObs = \ virtfile.o \ euc.o \ fuzzkana.o \ index.o \ input.o \ jreadline.o \ jregex.o \ kanaid.o \ loadfile.o \ output.o \ replace.o \ romaji2kana.o \ std_romaji.o \ strsave.o \ termset.o \ xmalloc.o $(LOCAL_LIB): $(LocalLibObs) $(COMPILE_WITH_TRAD) ar r $@ $? -if [ -f $(RANLIB) ]; then ranlib $@; else true; fi $(COMPILE_WITH_TRAD): $(CC_TRAD) -c -o $@ $(CFLAGS) -I$(LOCAL_LIB_SRC)/ $(LOCAL_LIB_SRC)/$*.c $(LocalLibObs): $(CC) -c -o $@ $(CFLAGS) -I$(LOCAL_LIB_SRC)/ $(LOCAL_LIB_SRC)/$*.c _note_: -@echo ' ' -@echo 'NOTE: "may be used uninitialized" warnings are OK in compile.' -@echo ' ' lookup: $(LOCAL_LIB) $(objs) version.c ## Some systems (DGUX) have SH bugs that cause the link to die. ## Can be fixed by wrapping in: csh -c "...." 
-@/bin/rm -f lookup.new $(CC) $(CFLAGS) -o lookup.new $(objs) version.c $(LIBS) /bin/mv -f lookup.new lookup regextest: $(LOCAL_LIB) regextest.o $(CC) -o $@ regextest.o $(LOCAL_LIB) commands.h: cmds.master perl -e 1 && perl cmds.master > temp.h && /bin/mv -f temp.h commands.h lib/system.h: Makefile /bin/rm -f lib/system.h echo '/* this file generated by Makefile */' > tmp; -echo '#ifndef __SYSTEM_H__ /*file wrapper*/' >> tmp; -echo '#define __SYSTEM_H__' >> tmp; if [ -f /usr/include/strings.h ]; then\ echo '#define _HAVE_STRINGS_H_' >> tmp; \ else true; fi if [ -f /usr/include/sys/termio.h ]; then\ echo '#define _HAVE_SYS_TERMIO_H_' >> tmp; \ else true; fi if [ -f /usr/include/sys/stdtypes.h ]; then\ echo '#define _HAVE_SYS_STDTYPES_H_' >> tmp; \ else true; fi if [ -f /usr/include/sys/fcntl.h ]; then\ echo '#define _HAVE_SYS_FCNTL_H_' >> tmp; \ else true; fi if [ -f /usr/include/fcntl.h ]; then\ echo '#define _HAVE_FCNTL_H_' >> tmp; \ else true; fi -echo '#endif /* file wrapper */' >> tmp; mv tmp lib/system.h make.sh: realclean @echo ':# script to make lookup' > tmp @echo '## Can set CC= and CFLAGS= on the command line, just as with make' >> tmp @echo 'CC="gcc"' >> tmp @echo 'CFLAGS="-O -DFAST_REGEXEC -DNDEBUG"' >> tmp @echo '#####################################################'>> tmp @echo 'set -ex' >> tmp @echo 'eval $$@' >> tmp $(MAKE) --silent -n gcc= CC='[ -s $$@ ]||$$$$CC' CFLAGS='$$$$CFLAGS' >> tmp /bin/mv tmp make.sh chmod +x make.sh ## ## We use man.tmp so that lookup.man won't be erased if jnroff or jconv ## fails (such as if it's not even found). ## manual: lookup.man.euc lookup.man.jis lookup.man.sjs lookup.man.euc: doc/lookup.man doc/*.so @echo @echo "NOTE : if jnroff isn't found, manuals won't be remade." 
@echo cd doc; jnroff $(MANPAGE_WIDTH) -man lookup.man > man.tmp mv doc/man.tmp lookup.man.euc lookup.man.jis: lookup.man.euc jconv -ie -oj < lookup.man.euc > man.tmp mv man.tmp lookup.man.jis lookup.man.sjs: lookup.man.euc jconv -ie -os < lookup.man.euc > man.tmp mv man.tmp lookup.man.sjs ## The following is JIS, but without any highlighting. lookup.man.raw: lookup.man.jis sed -e 's/.//g' < lookup.man.jis > man.tmp mv man.tmp lookup.man.raw tidy: -@echo dummy > dummy~ /bin/rm -f .*~ *~ */*~ core a.out man.tmp doc/*.bak clean: tidy -@echo dummy > dummy.o /bin/rm -f \#* *.o *.d doc/#* $(LOCAL_LIB) lib/system.h realclean: clean -@echo dummy > lookup.man.xxx /bin/rm -f lookup lookup.man.* commands.h make.sh ################################################################## apply_regex.o: apply_regex.c lib/config.h lib/output.h lib/assert.h \ lib/index.h lib/system.h lib/packed_list.h lib/MemItem.h \ lib/xmalloc.h lib/virtfile.h lib/loadfile.h lib/longlinenote.h \ lib/jregex.h lib/replace.h lookup.h commands.o: commands.c lib/system.h lib/config.h lib/assert.h \ lib/jreadline.h lib/jregex.h lib/loadfile.h lib/longlinenote.h \ lib/xmalloc.h lib/index.h lib/packed_list.h lib/MemItem.h \ lib/output.h lib/virtfile.h lib/replace.h lib/strsave.h lib/euc.h \ lookup.h eval.h commands.h euc.o: lib/euc.c lib/config.h lib/euc.h eval.o: eval.c lib/config.h lib/jregex.h lib/output.h lookup.h \ lib/virtfile.h lib/assert.h eval.h fuzzkana.o: lib/fuzzkana.c lib/config.h lib/assert.h lib/output.h \ lib/fuzzkana.h lib/kanaid.h index.o: lib/index.c lib/config.h lib/system.h lib/assert.h \ lib/output.h lib/index.h lib/packed_list.h lib/MemItem.h \ lib/xmalloc.h lib/virtfile.h input.o: lib/input.c lib/config.h jreadline.o: lib/jreadline.c lib/config.h lib/assert.h lib/output.h \ lib/jreadline.h lib/strsave.h lib/input.h lib/xmalloc.h lib/euc.h \ lib/system.h jregex.o: lib/jregex.c lib/config.h lib/assert.h lib/jregex.h \ lib/output.h lib/euc.h kanaid.o: lib/kanaid.c lib/config.h 
lib/kanaid.h loadfile.o: lib/loadfile.c lib/config.h lib/assert.h lib/xmalloc.h \ lib/strsave.h lib/system.h lib/loadfile.h lib/longlinenote.h \ lib/index.h lib/packed_list.h lib/MemItem.h lib/output.h \ lib/virtfile.h lookup.o: lookup.c lib/config.h lib/assert.h lib/output.h \ lib/fuzzkana.h lib/loadfile.h lib/longlinenote.h lib/xmalloc.h \ lib/index.h lib/system.h lib/packed_list.h lib/MemItem.h \ lib/virtfile.h lib/romaji2kana.h lib/jregex.h lib/strsave.h \ lib/replace.h lib/input.h lookup.h lib/jreadline.h output.o: lib/output.c lib/config.h lib/assert.h lib/input.h \ lib/output.h replace.o: lib/replace.c lib/config.h lib/assert.h lib/jregex.h \ lib/xmalloc.h lib/replace.h romaji2kana.o: lib/romaji2kana.c lib/config.h lib/assert.h \ lib/romaji2kana.h lib/kanaid.h std_romaji.o: lib/std_romaji.c lib/config.h lib/system.h \ lib/romaji2kana.h lib/jreadline.h lib/strsave.h lib/xmalloc.h strsave.o: lib/strsave.c lib/xmalloc.h lib/strsave.h termset.o: lib/termset.c lib/config.h lib/system.h termset_trad.o: lib/termset_trad.c lib/termset.c lib/config.h \ lib/system.h version.o: version.c virtfile.o: lib/virtfile.c lib/config.h lib/output.h lib/xmalloc.h \ lib/strsave.h lib/virtfile.h xmalloc.o: lib/xmalloc.c lib/xmalloc.h